diff options
Diffstat (limited to 'libc/arch-arm/krait/bionic/memcpy_base.S')
-rw-r--r-- | libc/arch-arm/krait/bionic/memcpy_base.S | 124 |
1 files changed, 50 insertions, 74 deletions
diff --git a/libc/arch-arm/krait/bionic/memcpy_base.S b/libc/arch-arm/krait/bionic/memcpy_base.S index 068f2f60c..76c5a8459 100644 --- a/libc/arch-arm/krait/bionic/memcpy_base.S +++ b/libc/arch-arm/krait/bionic/memcpy_base.S @@ -30,59 +30,35 @@ #include <machine/cpu-features.h> #include <machine/asm.h> -/* - * These default settings are good for all Krait-based systems - * as of this writing, but they can be overridden in: - * device/<vendor>/<board>/BoardConfig.mk - * by setting the following: - * TARGET_USE_KRAIT_BIONIC_OPTIMIZATION := true - * TARGET_USE_KRAIT_PLD_SET := true - * TARGET_KRAIT_BIONIC_PLDOFFS := <pldoffset> - * TARGET_KRAIT_BIONIC_PLDSIZE := <pldsize> - * TARGET_KRAIT_BIONIC_PLDTHRESH := <pldthreshold> - * TARGET_KRAIT_BIONIC_BBTHRESH := <bbthreshold> - */ - -#ifndef PLDOFFS #define PLDOFFS (10) -#endif -#ifndef PLDTHRESH #define PLDTHRESH (PLDOFFS) -#endif -#ifndef BBTHRESH #define BBTHRESH (4096/64) -#endif +#define PLDSIZE (64) + #if (PLDOFFS < 1) #error Routine does not support offsets less than 1 #endif + #if (PLDTHRESH < PLDOFFS) #error PLD threshold must be greater than or equal to the PLD offset #endif -#ifndef PLDSIZE -#define PLDSIZE (64) -#endif + .text .fpu neon -ENTRY(MEMCPY_BASE) -MEMCPY_BASE_ALIGNED: - // .cfi_startproc - .save {r0, r9, r10, lr} - // .cfi_def_cfa_offset 8 - //.cfi_rel_offset r0, 0 - //.cfi_rel_offset lr, 4 +.L_memcpy_base: cmp r2, #4 - blt .Lneon_lt4 + blt .L_neon_lt4 cmp r2, #16 - blt .Lneon_lt16 + blt .L_neon_lt16 cmp r2, #32 - blt .Lneon_16 + blt .L_neon_16 cmp r2, #64 - blt .Lneon_copy_32_a + blt .L_neon_copy_32_a mov r12, r2, lsr #6 cmp r12, #PLDTHRESH - ble .Lneon_copy_64_loop_nopld + ble .L_neon_copy_64_loop_nopld push {r9, r10} .cfi_adjust_cfa_offset 8 @@ -90,7 +66,7 @@ MEMCPY_BASE_ALIGNED: .cfi_rel_offset r10, 4 cmp r12, #BBTHRESH - ble .Lneon_prime_pump + ble .L_neon_prime_pump add lr, r0, #0x400 add r9, r1, #(PLDOFFS*PLDSIZE) @@ -99,12 +75,12 @@ MEMCPY_BASE_ALIGNED: lsr lr, lr, #21 add lr, lr, #(PLDOFFS*PLDSIZE) cmp r12, lr, lsr #6 - ble .Lneon_prime_pump + ble .L_neon_prime_pump itt gt movgt r9, #(PLDOFFS) rsbsgt r9, r9, lr, lsr #6 - ble .Lneon_prime_pump + ble .L_neon_prime_pump add r10, r1, lr bic r10, #0x3F @@ -118,7 +94,7 @@ MEMCPY_BASE_ALIGNED: movgt r12, #0 pld [r1, #((PLDOFFS-1)*PLDSIZE)] -.Lneon_copy_64_loop_outer_doublepld: +.L_neon_copy_64_loop_outer_doublepld: pld [r1, #((PLDOFFS)*PLDSIZE)] vld1.32 {q0, q1}, [r1]! vld1.32 {q2, q3}, [r1]! @@ -127,14 +103,14 @@ MEMCPY_BASE_ALIGNED: vst1.32 {q0, q1}, [r0]! vst1.32 {q2, q3}, [r0]! add r10, #64 - bne .Lneon_copy_64_loop_outer_doublepld + bne .L_neon_copy_64_loop_outer_doublepld cmp r12, #0 - beq .Lneon_pop_before_nopld + beq .L_neon_pop_before_nopld cmp r12, #(512*1024/64) - blt .Lneon_copy_64_loop_outer + blt .L_neon_copy_64_loop_outer -.Lneon_copy_64_loop_ddr: +.L_neon_copy_64_loop_ddr: vld1.32 {q0, q1}, [r1]! vld1.32 {q2, q3}, [r1]! pld [r10] @@ -142,16 +118,17 @@ MEMCPY_BASE_ALIGNED: vst1.32 {q0, q1}, [r0]! vst1.32 {q2, q3}, [r0]! add r10, #64 - bne .Lneon_copy_64_loop_ddr - b .Lneon_pop_before_nopld + bne .L_neon_copy_64_loop_ddr + b .L_neon_pop_before_nopld -.Lneon_prime_pump: +.L_neon_prime_pump: mov lr, #(PLDOFFS*PLDSIZE) add r10, r1, #(PLDOFFS*PLDSIZE) bic r10, #0x3F sub r12, r12, #PLDOFFS ldr r3, [r10, #(-1*PLDSIZE)] -.Lneon_copy_64_loop_outer: + +.L_neon_copy_64_loop_outer: vld1.32 {q0, q1}, [r1]! vld1.32 {q2, q3}, [r1]! ldr r3, [r10] @@ -159,47 +136,49 @@ MEMCPY_BASE_ALIGNED: vst1.32 {q0, q1}, [r0]! vst1.32 {q2, q3}, [r0]! add r10, #64 - bne .Lneon_copy_64_loop_outer -.Lneon_pop_before_nopld: + bne .L_neon_copy_64_loop_outer + +.L_neon_pop_before_nopld: mov r12, lr, lsr #6 pop {r9, r10} + .cfi_adjust_cfa_offset -8 .cfi_restore r9 .cfi_restore r10 - .cfi_adjust_cfa_offset -8 -.Lneon_copy_64_loop_nopld: +.L_neon_copy_64_loop_nopld: vld1.32 {q8, q9}, [r1]! vld1.32 {q10, q11}, [r1]! subs r12, r12, #1 vst1.32 {q8, q9}, [r0]! vst1.32 {q10, q11}, [r0]! - bne .Lneon_copy_64_loop_nopld + bne .L_neon_copy_64_loop_nopld ands r2, r2, #0x3f - .cfi_restore r0 - .cfi_adjust_cfa_offset -4 - beq .Lneon_exit -.Lneon_copy_32_a: + beq .L_neon_exit + +.L_neon_copy_32_a: movs r3, r2, lsl #27 - bcc .Lneon_16 + bcc .L_neon_16 vld1.32 {q0,q1}, [r1]! vst1.32 {q0,q1}, [r0]! -.Lneon_16: - bpl .Lneon_lt16 + +.L_neon_16: + bpl .L_neon_lt16 vld1.32 {q8}, [r1]! vst1.32 {q8}, [r0]! ands r2, r2, #0x0f - beq .Lneon_exit -.Lneon_lt16: + beq .L_neon_exit + +.L_neon_lt16: movs r3, r2, lsl #29 - itttt cs - ldrcs r3, [r1], #4 - strcs r3, [r0], #4 - ldrcs r3, [r1], #4 - strcs r3, [r0], #4 - itt mi - ldrmi r3, [r1], #4 - strmi r3, [r0], #4 -.Lneon_lt4: + bcc 1f + vld1.8 {d0}, [r1]! + vst1.8 {d0}, [r0]! +1: + bge .L_neon_lt4 + vld4.8 {d0[0], d1[0], d2[0], d3[0]}, [r1]! + vst4.8 {d0[0], d1[0], d2[0], d3[0]}, [r0]! + +.L_neon_lt4: movs r2, r2, lsl #31 itt cs ldrhcs r3, [r1], #2 @@ -207,9 +186,6 @@ MEMCPY_BASE_ALIGNED: itt mi ldrbmi r3, [r1] strbmi r3, [r0] -.Lneon_exit: - pop {r0, lr} - bx lr - //.cfi_endproc -END(MEMCPY_BASE) +.L_neon_exit: + pop {r0, pc} |