summary | refs | log | tree | commit | diff | stats
path: root/libc/arch-arm/krait/bionic/memcpy_base.S
diff options
context:
space:
mode:
Diffstat (limited to 'libc/arch-arm/krait/bionic/memcpy_base.S')
-rw-r--r--  libc/arch-arm/krait/bionic/memcpy_base.S  124
1 files changed, 50 insertions, 74 deletions
diff --git a/libc/arch-arm/krait/bionic/memcpy_base.S b/libc/arch-arm/krait/bionic/memcpy_base.S
index 068f2f60c..76c5a8459 100644
--- a/libc/arch-arm/krait/bionic/memcpy_base.S
+++ b/libc/arch-arm/krait/bionic/memcpy_base.S
@@ -30,59 +30,35 @@
#include <machine/cpu-features.h>
#include <machine/asm.h>
-/*
- * These default settings are good for all Krait-based systems
- * as of this writing, but they can be overridden in:
- * device/<vendor>/<board>/BoardConfig.mk
- * by setting the following:
- * TARGET_USE_KRAIT_BIONIC_OPTIMIZATION := true
- * TARGET_USE_KRAIT_PLD_SET := true
- * TARGET_KRAIT_BIONIC_PLDOFFS := <pldoffset>
- * TARGET_KRAIT_BIONIC_PLDSIZE := <pldsize>
- * TARGET_KRAIT_BIONIC_PLDTHRESH := <pldthreshold>
- * TARGET_KRAIT_BIONIC_BBTHRESH := <bbthreshold>
- */
-
-#ifndef PLDOFFS
#define PLDOFFS (10)
-#endif
-#ifndef PLDTHRESH
#define PLDTHRESH (PLDOFFS)
-#endif
-#ifndef BBTHRESH
#define BBTHRESH (4096/64)
-#endif
+#define PLDSIZE (64)
+
#if (PLDOFFS < 1)
#error Routine does not support offsets less than 1
#endif
+
#if (PLDTHRESH < PLDOFFS)
#error PLD threshold must be greater than or equal to the PLD offset
#endif
-#ifndef PLDSIZE
-#define PLDSIZE (64)
-#endif
+
.text
.fpu neon
-ENTRY(MEMCPY_BASE)
-MEMCPY_BASE_ALIGNED:
- // .cfi_startproc
- .save {r0, r9, r10, lr}
- // .cfi_def_cfa_offset 8
- //.cfi_rel_offset r0, 0
- //.cfi_rel_offset lr, 4
+.L_memcpy_base:
cmp r2, #4
- blt .Lneon_lt4
+ blt .L_neon_lt4
cmp r2, #16
- blt .Lneon_lt16
+ blt .L_neon_lt16
cmp r2, #32
- blt .Lneon_16
+ blt .L_neon_16
cmp r2, #64
- blt .Lneon_copy_32_a
+ blt .L_neon_copy_32_a
mov r12, r2, lsr #6
cmp r12, #PLDTHRESH
- ble .Lneon_copy_64_loop_nopld
+ ble .L_neon_copy_64_loop_nopld
push {r9, r10}
.cfi_adjust_cfa_offset 8
@@ -90,7 +66,7 @@ MEMCPY_BASE_ALIGNED:
.cfi_rel_offset r10, 4
cmp r12, #BBTHRESH
- ble .Lneon_prime_pump
+ ble .L_neon_prime_pump
add lr, r0, #0x400
add r9, r1, #(PLDOFFS*PLDSIZE)
@@ -99,12 +75,12 @@ MEMCPY_BASE_ALIGNED:
lsr lr, lr, #21
add lr, lr, #(PLDOFFS*PLDSIZE)
cmp r12, lr, lsr #6
- ble .Lneon_prime_pump
+ ble .L_neon_prime_pump
itt gt
movgt r9, #(PLDOFFS)
rsbsgt r9, r9, lr, lsr #6
- ble .Lneon_prime_pump
+ ble .L_neon_prime_pump
add r10, r1, lr
bic r10, #0x3F
@@ -118,7 +94,7 @@ MEMCPY_BASE_ALIGNED:
movgt r12, #0
pld [r1, #((PLDOFFS-1)*PLDSIZE)]
-.Lneon_copy_64_loop_outer_doublepld:
+.L_neon_copy_64_loop_outer_doublepld:
pld [r1, #((PLDOFFS)*PLDSIZE)]
vld1.32 {q0, q1}, [r1]!
vld1.32 {q2, q3}, [r1]!
@@ -127,14 +103,14 @@ MEMCPY_BASE_ALIGNED:
vst1.32 {q0, q1}, [r0]!
vst1.32 {q2, q3}, [r0]!
add r10, #64
- bne .Lneon_copy_64_loop_outer_doublepld
+ bne .L_neon_copy_64_loop_outer_doublepld
cmp r12, #0
- beq .Lneon_pop_before_nopld
+ beq .L_neon_pop_before_nopld
cmp r12, #(512*1024/64)
- blt .Lneon_copy_64_loop_outer
+ blt .L_neon_copy_64_loop_outer
-.Lneon_copy_64_loop_ddr:
+.L_neon_copy_64_loop_ddr:
vld1.32 {q0, q1}, [r1]!
vld1.32 {q2, q3}, [r1]!
pld [r10]
@@ -142,16 +118,17 @@ MEMCPY_BASE_ALIGNED:
vst1.32 {q0, q1}, [r0]!
vst1.32 {q2, q3}, [r0]!
add r10, #64
- bne .Lneon_copy_64_loop_ddr
- b .Lneon_pop_before_nopld
+ bne .L_neon_copy_64_loop_ddr
+ b .L_neon_pop_before_nopld
-.Lneon_prime_pump:
+.L_neon_prime_pump:
mov lr, #(PLDOFFS*PLDSIZE)
add r10, r1, #(PLDOFFS*PLDSIZE)
bic r10, #0x3F
sub r12, r12, #PLDOFFS
ldr r3, [r10, #(-1*PLDSIZE)]
-.Lneon_copy_64_loop_outer:
+
+.L_neon_copy_64_loop_outer:
vld1.32 {q0, q1}, [r1]!
vld1.32 {q2, q3}, [r1]!
ldr r3, [r10]
@@ -159,47 +136,49 @@ MEMCPY_BASE_ALIGNED:
vst1.32 {q0, q1}, [r0]!
vst1.32 {q2, q3}, [r0]!
add r10, #64
- bne .Lneon_copy_64_loop_outer
-.Lneon_pop_before_nopld:
+ bne .L_neon_copy_64_loop_outer
+
+.L_neon_pop_before_nopld:
mov r12, lr, lsr #6
pop {r9, r10}
+ .cfi_adjust_cfa_offset -8
.cfi_restore r9
.cfi_restore r10
- .cfi_adjust_cfa_offset -8
-.Lneon_copy_64_loop_nopld:
+.L_neon_copy_64_loop_nopld:
vld1.32 {q8, q9}, [r1]!
vld1.32 {q10, q11}, [r1]!
subs r12, r12, #1
vst1.32 {q8, q9}, [r0]!
vst1.32 {q10, q11}, [r0]!
- bne .Lneon_copy_64_loop_nopld
+ bne .L_neon_copy_64_loop_nopld
ands r2, r2, #0x3f
- .cfi_restore r0
- .cfi_adjust_cfa_offset -4
- beq .Lneon_exit
-.Lneon_copy_32_a:
+ beq .L_neon_exit
+
+.L_neon_copy_32_a:
movs r3, r2, lsl #27
- bcc .Lneon_16
+ bcc .L_neon_16
vld1.32 {q0,q1}, [r1]!
vst1.32 {q0,q1}, [r0]!
-.Lneon_16:
- bpl .Lneon_lt16
+
+.L_neon_16:
+ bpl .L_neon_lt16
vld1.32 {q8}, [r1]!
vst1.32 {q8}, [r0]!
ands r2, r2, #0x0f
- beq .Lneon_exit
-.Lneon_lt16:
+ beq .L_neon_exit
+
+.L_neon_lt16:
movs r3, r2, lsl #29
- itttt cs
- ldrcs r3, [r1], #4
- strcs r3, [r0], #4
- ldrcs r3, [r1], #4
- strcs r3, [r0], #4
- itt mi
- ldrmi r3, [r1], #4
- strmi r3, [r0], #4
-.Lneon_lt4:
+ bcc 1f
+ vld1.8 {d0}, [r1]!
+ vst1.8 {d0}, [r0]!
+1:
+ bge .L_neon_lt4
+ vld4.8 {d0[0], d1[0], d2[0], d3[0]}, [r1]!
+ vst4.8 {d0[0], d1[0], d2[0], d3[0]}, [r0]!
+
+.L_neon_lt4:
movs r2, r2, lsl #31
itt cs
ldrhcs r3, [r1], #2
@@ -207,9 +186,6 @@ MEMCPY_BASE_ALIGNED:
itt mi
ldrbmi r3, [r1]
strbmi r3, [r0]
-.Lneon_exit:
- pop {r0, lr}
- bx lr
- //.cfi_endproc
-END(MEMCPY_BASE)
+.L_neon_exit:
+ pop {r0, pc}