From f8a907d25a9f319e67fcf005638adb52fa09dd8b Mon Sep 17 00:00:00 2001
From: Brent DeGraaf <bdegraaf@codeaurora.org>
Date: Wed, 2 Oct 2013 13:47:11 +0000
Subject: [AOSP Master] libc: krait: Use performance version of memcpy

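* This commit improves performance for small copies compared to the original
  CAF (Code Aurora Forum) version of the Krait memcpy.  It also cleans up
  memcpy, __memcpy_chk, __strcpy_chk and __strcat_chk: memcpy_base.S is now
  included inline using local labels instead of the MEMCPY_BASE entry points,
  the *_chk failure paths become local labels inside their parent functions,
  and the CFI directives are reworked to use relative CFA adjustments.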

Change-Id: Iaa52635240da8b8746693186b66b69778e833c32
---
 libc/arch-arm/krait/bionic/__strcat_chk.S |  19 ++---
 libc/arch-arm/krait/bionic/__strcpy_chk.S |  15 +---
 libc/arch-arm/krait/bionic/memcpy.S       |  17 ++--
 libc/arch-arm/krait/bionic/memcpy_base.S  | 124 ++++++++++++------------------
 4 files changed, 71 insertions(+), 104 deletions(-)

(limited to 'libc')

diff --git a/libc/arch-arm/krait/bionic/__strcat_chk.S b/libc/arch-arm/krait/bionic/__strcat_chk.S
index 246f159c0..1a39c5b88 100644
--- a/libc/arch-arm/krait/bionic/__strcat_chk.S
+++ b/libc/arch-arm/krait/bionic/__strcat_chk.S
@@ -40,7 +40,7 @@
 ENTRY(__strcat_chk)
     pld     [r0, #0]
     push    {r0, lr}
-    .cfi_def_cfa_offset 8
+    .cfi_adjust_cfa_offset 8
     .cfi_rel_offset r0, 0
     .cfi_rel_offset lr, 4
     push    {r4, r5}
@@ -177,7 +177,7 @@ ENTRY(__strcat_chk)
 .L_strlen_done:
     add     r2, r3, r4
     cmp     r2, lr
-    bhi     __strcat_chk_failed
+    bhi     .L_strcat_chk_failed
 
     // Set up the registers for the memcpy code.
     mov     r1, r5
@@ -185,20 +185,17 @@ ENTRY(__strcat_chk)
     mov     r2, r4
     add     r0, r0, r3
     pop     {r4, r5}
-END(__strcat_chk)
+    .cfi_adjust_cfa_offset -8
+    .cfi_restore r4
+    .cfi_restore r5
 
-#define MEMCPY_BASE         __strcat_chk_memcpy_base
-#define MEMCPY_BASE_ALIGNED __strcat_chk_memcpy_base_aligned
 #include "memcpy_base.S"
 
-ENTRY_PRIVATE(__strcat_chk_failed)
-    .cfi_def_cfa_offset 8
-    .cfi_rel_offset r0, 0
-    .cfi_rel_offset lr, 4
+    // Undo the above cfi directives.
     .cfi_adjust_cfa_offset 8
     .cfi_rel_offset r4, 0
     .cfi_rel_offset r5, 4
-
+.L_strcat_chk_failed:
     ldr     r0, error_message
     ldr     r1, error_code
 1:
@@ -208,7 +205,7 @@ error_code:
     .word   BIONIC_EVENT_STRCAT_BUFFER_OVERFLOW
 error_message:
     .word   error_string-(1b+4)
-END(__strcat_chk_failed)
+END(__strcat_chk)
 
     .data
 error_string:
diff --git a/libc/arch-arm/krait/bionic/__strcpy_chk.S b/libc/arch-arm/krait/bionic/__strcpy_chk.S
index db766863a..00202f3da 100644
--- a/libc/arch-arm/krait/bionic/__strcpy_chk.S
+++ b/libc/arch-arm/krait/bionic/__strcpy_chk.S
@@ -39,7 +39,7 @@
 ENTRY(__strcpy_chk)
     pld     [r0, #0]
     push    {r0, lr}
-    .cfi_def_cfa_offset 8
+    .cfi_adjust_cfa_offset 8
     .cfi_rel_offset r0, 0
     .cfi_rel_offset lr, 4
 
@@ -149,21 +149,14 @@ ENTRY(__strcpy_chk)
     pld     [r1, #64]
     ldr     r0, [sp]
     cmp     r3, lr
-    bhs     __strcpy_chk_failed
+    bhs     .L_strcpy_chk_failed
 
     // Add 1 for copy length to get the string terminator.
     add     r2, r3, #1
-END(__strcpy_chk)
 
-#define MEMCPY_BASE         __strcpy_chk_memcpy_base
-#define MEMCPY_BASE_ALIGNED __strcpy_chk_memcpy_base_aligned
 #include "memcpy_base.S"
 
-ENTRY_PRIVATE(__strcpy_chk_failed)
-    .cfi_def_cfa_offset 8
-    .cfi_rel_offset r0, 0
-    .cfi_rel_offset lr, 4
-
+.L_strcpy_chk_failed:
     ldr     r0, error_message
     ldr     r1, error_code
 1:
@@ -173,7 +166,7 @@ error_code:
     .word   BIONIC_EVENT_STRCPY_BUFFER_OVERFLOW
 error_message:
     .word   error_string-(1b+4)
-END(__strcpy_chk_failed)
+END(__strcpy_chk)
 
     .data
 error_string:
diff --git a/libc/arch-arm/krait/bionic/memcpy.S b/libc/arch-arm/krait/bionic/memcpy.S
index 9ff46a8ac..5d27b574f 100644
--- a/libc/arch-arm/krait/bionic/memcpy.S
+++ b/libc/arch-arm/krait/bionic/memcpy.S
@@ -45,7 +45,7 @@
 
 ENTRY(__memcpy_chk)
         cmp         r2, r3
-        bhi         __memcpy_chk_fail
+        bhi         .L_memcpy_chk_fail
 
         // Fall through to memcpy...
 END(__memcpy_chk)
@@ -53,19 +53,20 @@ END(__memcpy_chk)
 ENTRY(memcpy)
         pld     [r1, #64]
         stmfd   sp!, {r0, lr}
-        .cfi_def_cfa_offset 8
+        .cfi_adjust_cfa_offset 8
         .cfi_rel_offset r0, 0
         .cfi_rel_offset lr, 4
-END(memcpy)
 
-#define MEMCPY_BASE         __memcpy_base
-#define MEMCPY_BASE_ALIGNED __memcpy_base_aligned
 #include "memcpy_base.S"
 
-ENTRY_PRIVATE(__memcpy_chk_fail)
+        // Undo the cfi directives from above.
+        .cfi_adjust_cfa_offset -8
+        .cfi_restore r0
+        .cfi_restore lr
+.L_memcpy_chk_fail:
         // Preserve lr for backtrace.
         push    {lr}
-        .cfi_def_cfa_offset 4
+        .cfi_adjust_cfa_offset 4
         .cfi_rel_offset lr, 0
 
         ldr     r0, error_message
@@ -77,7 +78,7 @@ error_code:
         .word   BIONIC_EVENT_MEMCPY_BUFFER_OVERFLOW
 error_message:
         .word   error_string-(1b+4)
-END(__memcpy_chk_fail)
+END(memcpy)
 
         .data
 error_string:
diff --git a/libc/arch-arm/krait/bionic/memcpy_base.S b/libc/arch-arm/krait/bionic/memcpy_base.S
index 068f2f60c..76c5a8459 100644
--- a/libc/arch-arm/krait/bionic/memcpy_base.S
+++ b/libc/arch-arm/krait/bionic/memcpy_base.S
@@ -30,59 +30,35 @@
 #include <machine/cpu-features.h>
 #include <machine/asm.h>
 
-/*
- * These default settings are good for all Krait-based systems
- * as of this writing, but they can be overridden in:
- *   device/<vendor>/<board>/BoardConfig.mk
- * by setting the following:
- *   TARGET_USE_KRAIT_BIONIC_OPTIMIZATION := true
- *   TARGET_USE_KRAIT_PLD_SET := true
- *   TARGET_KRAIT_BIONIC_PLDOFFS := <pldoffset>
- *   TARGET_KRAIT_BIONIC_PLDSIZE := <pldsize>
- *   TARGET_KRAIT_BIONIC_PLDTHRESH := <pldthreshold>
- *   TARGET_KRAIT_BIONIC_BBTHRESH := <bbthreshold>
- */
-
-#ifndef PLDOFFS
 #define PLDOFFS	(10)
-#endif
-#ifndef PLDTHRESH
 #define PLDTHRESH (PLDOFFS)
-#endif
-#ifndef BBTHRESH
 #define BBTHRESH (4096/64)
-#endif
+#define PLDSIZE (64)
+
 #if (PLDOFFS < 1)
 #error Routine does not support offsets less than 1
 #endif
+
 #if (PLDTHRESH < PLDOFFS)
 #error PLD threshold must be greater than or equal to the PLD offset
 #endif
-#ifndef PLDSIZE
-#define PLDSIZE	(64)
-#endif
+
 	.text
 	.fpu    neon
 
-ENTRY(MEMCPY_BASE)
-MEMCPY_BASE_ALIGNED:
-       // .cfi_startproc
-	.save {r0, r9, r10, lr}
-       // .cfi_def_cfa_offset 8
-	//.cfi_rel_offset r0, 0
-	//.cfi_rel_offset lr, 4
+.L_memcpy_base:
 	cmp	r2, #4
-	blt	.Lneon_lt4
+	blt	.L_neon_lt4
 	cmp	r2, #16
-	blt	.Lneon_lt16
+	blt	.L_neon_lt16
 	cmp	r2, #32
-	blt	.Lneon_16
+	blt	.L_neon_16
 	cmp	r2, #64
-	blt	.Lneon_copy_32_a
+	blt	.L_neon_copy_32_a
 
 	mov	r12, r2, lsr #6
 	cmp	r12, #PLDTHRESH
-	ble	.Lneon_copy_64_loop_nopld
+	ble	.L_neon_copy_64_loop_nopld
 
 	push	{r9, r10}
 	.cfi_adjust_cfa_offset 8
@@ -90,7 +66,7 @@ MEMCPY_BASE_ALIGNED:
 	.cfi_rel_offset r10, 4
 
 	cmp	r12, #BBTHRESH
-	ble	.Lneon_prime_pump
+	ble	.L_neon_prime_pump
 
 	add	lr, r0, #0x400
 	add	r9, r1, #(PLDOFFS*PLDSIZE)
@@ -99,12 +75,12 @@ MEMCPY_BASE_ALIGNED:
 	lsr	lr, lr, #21
 	add	lr, lr, #(PLDOFFS*PLDSIZE)
 	cmp	r12, lr, lsr #6
-	ble	.Lneon_prime_pump
+	ble	.L_neon_prime_pump
 
 	itt	gt
 	movgt	r9, #(PLDOFFS)
 	rsbsgt	r9, r9, lr, lsr #6
-	ble	.Lneon_prime_pump
+	ble	.L_neon_prime_pump
 
 	add	r10, r1, lr
 	bic	r10, #0x3F
@@ -118,7 +94,7 @@ MEMCPY_BASE_ALIGNED:
 	movgt	r12, #0
 
 	pld	[r1, #((PLDOFFS-1)*PLDSIZE)]
-.Lneon_copy_64_loop_outer_doublepld:
+.L_neon_copy_64_loop_outer_doublepld:
 	pld	[r1, #((PLDOFFS)*PLDSIZE)]
 	vld1.32	{q0, q1}, [r1]!
 	vld1.32	{q2, q3}, [r1]!
@@ -127,14 +103,14 @@ MEMCPY_BASE_ALIGNED:
 	vst1.32	{q0, q1}, [r0]!
 	vst1.32	{q2, q3}, [r0]!
 	add	r10, #64
-	bne	.Lneon_copy_64_loop_outer_doublepld
+	bne	.L_neon_copy_64_loop_outer_doublepld
 	cmp	r12, #0
-	beq	.Lneon_pop_before_nopld
+	beq	.L_neon_pop_before_nopld
 
 	cmp	r12, #(512*1024/64)
-	blt	.Lneon_copy_64_loop_outer
+	blt	.L_neon_copy_64_loop_outer
 
-.Lneon_copy_64_loop_ddr:
+.L_neon_copy_64_loop_ddr:
 	vld1.32	{q0, q1}, [r1]!
 	vld1.32	{q2, q3}, [r1]!
 	pld	[r10]
@@ -142,16 +118,17 @@ MEMCPY_BASE_ALIGNED:
 	vst1.32	{q0, q1}, [r0]!
 	vst1.32	{q2, q3}, [r0]!
 	add	r10, #64
-	bne	.Lneon_copy_64_loop_ddr
-	b	.Lneon_pop_before_nopld
+	bne	.L_neon_copy_64_loop_ddr
+	b	.L_neon_pop_before_nopld
 
-.Lneon_prime_pump:
+.L_neon_prime_pump:
 	mov	lr, #(PLDOFFS*PLDSIZE)
 	add	r10, r1, #(PLDOFFS*PLDSIZE)
 	bic	r10, #0x3F
 	sub	r12, r12, #PLDOFFS
 	ldr	r3, [r10, #(-1*PLDSIZE)]
-.Lneon_copy_64_loop_outer:
+
+.L_neon_copy_64_loop_outer:
 	vld1.32	{q0, q1}, [r1]!
 	vld1.32	{q2, q3}, [r1]!
 	ldr	r3, [r10]
@@ -159,47 +136,49 @@ MEMCPY_BASE_ALIGNED:
 	vst1.32	{q0, q1}, [r0]!
 	vst1.32	{q2, q3}, [r0]!
 	add	r10, #64
-	bne	.Lneon_copy_64_loop_outer
-.Lneon_pop_before_nopld:
+	bne	.L_neon_copy_64_loop_outer
+
+.L_neon_pop_before_nopld:
 	mov	r12, lr, lsr #6
 	pop	{r9, r10}
+	.cfi_adjust_cfa_offset -8
 	.cfi_restore r9
 	.cfi_restore r10
-	.cfi_adjust_cfa_offset -8
 
-.Lneon_copy_64_loop_nopld:
+.L_neon_copy_64_loop_nopld:
 	vld1.32	{q8, q9}, [r1]!
 	vld1.32	{q10, q11}, [r1]!
 	subs	r12, r12, #1
 	vst1.32	{q8, q9}, [r0]!
 	vst1.32	{q10, q11}, [r0]!
-	bne	.Lneon_copy_64_loop_nopld
+	bne	.L_neon_copy_64_loop_nopld
 	ands	r2, r2, #0x3f
-	.cfi_restore r0
-	.cfi_adjust_cfa_offset -4
-	beq	.Lneon_exit
-.Lneon_copy_32_a:
+	beq	.L_neon_exit
+
+.L_neon_copy_32_a:
 	movs	r3, r2, lsl #27
-	bcc	.Lneon_16
+	bcc	.L_neon_16
 	vld1.32	{q0,q1}, [r1]!
 	vst1.32	{q0,q1}, [r0]!
-.Lneon_16:
-	bpl	.Lneon_lt16
+
+.L_neon_16:
+	bpl	.L_neon_lt16
 	vld1.32	{q8}, [r1]!
 	vst1.32	{q8}, [r0]!
 	ands	r2, r2, #0x0f
-	beq	.Lneon_exit
-.Lneon_lt16:
+	beq	.L_neon_exit
+
+.L_neon_lt16:
 	movs	r3, r2, lsl #29
-	itttt	cs
-	ldrcs	r3, [r1], #4
-	strcs	r3, [r0], #4
-	ldrcs	r3, [r1], #4
-	strcs	r3, [r0], #4
-	itt	mi
-	ldrmi	r3, [r1], #4
-	strmi	r3, [r0], #4
-.Lneon_lt4:
+	bcc	1f
+	vld1.8	{d0}, [r1]!
+	vst1.8	{d0}, [r0]!
+1:
+	bge	.L_neon_lt4
+	vld4.8	{d0[0], d1[0], d2[0], d3[0]}, [r1]!
+	vst4.8	{d0[0], d1[0], d2[0], d3[0]}, [r0]!
+
+.L_neon_lt4:
 	movs	r2, r2, lsl #31
 	itt	cs
 	ldrhcs	r3, [r1], #2
@@ -207,9 +186,6 @@ MEMCPY_BASE_ALIGNED:
 	itt	mi
 	ldrbmi	r3, [r1]
 	strbmi	r3, [r0]
-.Lneon_exit:
-	pop	{r0, lr}
-	bx	lr
-        //.cfi_endproc
-END(MEMCPY_BASE)
 
+.L_neon_exit:
+	pop	{r0, pc}
-- 
cgit v1.2.3