diff options
author | Brent DeGraaf <bdegraaf@codeaurora.org> | 2013-10-03 16:47:18 -0400 |
---|---|---|
committer | Steve Kondik <shade@chemlab.org> | 2013-12-20 12:54:44 -0800 |
commit | 15e069e25b7d1818ce2fdac8e175ca4db9c2742b (patch) | |
tree | f1d228397350322506f345153fd14a1cfdf26c77 | |
parent | d55217e0a2ad539d09d9630005babfbe9fb8b9c3 (diff) | |
download | bionic-15e069e25b7d1818ce2fdac8e175ca4db9c2742b.tar.gz bionic-15e069e25b7d1818ce2fdac8e175ca4db9c2742b.tar.bz2 bionic-15e069e25b7d1818ce2fdac8e175ca4db9c2742b.zip |
libc: krait: Implement optimized versions of memmove and bcopy
Restore a Jelly Bean-era optimization for memmove and bcopy on Krait.
The code has been refactored to Thumb-2 for consistency with the rest of
bionic libc, as well as for performance and correctness.
Change-Id: I8f2f77dce4534fbce1bdf0188ab353bf34ab8637
-rw-r--r-- | libc/arch-arm/krait/bionic/memmove.S | 167 | ||||
-rw-r--r-- | libc/arch-arm/krait/krait.mk | 1 |
2 files changed, 79 insertions, 89 deletions
```diff
diff --git a/libc/arch-arm/krait/bionic/memmove.S b/libc/arch-arm/krait/bionic/memmove.S
index 349c8e300..cfa06cef4 100644
--- a/libc/arch-arm/krait/bionic/memmove.S
+++ b/libc/arch-arm/krait/bionic/memmove.S
@@ -36,7 +36,7 @@
  ***************************************************************************/
 #include <machine/cpu-features.h>
-
+#include <machine/asm.h>
 /*
  * These can be overridden in:
  * device/<vendor>/<board>/BoardConfig.mk
@@ -62,148 +62,137 @@
 #ifndef PLDSIZE
 #define PLDSIZE (64)
 #endif
-#define NOP_OPCODE (0xe320f000)
-
-    .code 32
-    .align 5
-    .global memmove
-    .type memmove, %function
-    .global _memmove_words
-    .type _memmove_words, %function
+    .text
+    .syntax unified
+    .fpu neon
+    .thumb
+    .thumb_func
-    .global bcopy
-    .type bcopy, %function
-
-bcopy:
+ENTRY(bcopy)
+    .cfi_startproc
     mov r12, r0
     mov r0, r1
     mov r1, r12
-    .balignl 64, NOP_OPCODE, 4*2
-memmove:
+    // Fall through to memmove
+    .cfi_endproc
+END(bcopy)
+
+ENTRY(memmove)
 _memmove_words:
-.Lneon_memmove_cmf:
-    subs r12, r0, r1
+    .cfi_startproc
+    .save {r0, lr}
+    cmp r2, #0
+    it ne
+    subsne r12, r0, r1
+    it eq
     bxeq lr
-    cmphi r2, r12
-    bls memcpy /* Use memcpy for non-overlapping areas */
-
-    push {r0}
-
-.Lneon_back_to_front_copy:
+// memmove only if r1 < r0 < r1+r2
+    cmp r0, r1
+    itt ge
+    addge r12, r1, r2
+    cmpge r12, r0
+    it le
+    ble memcpy
+    cmp r2, #4
+    it le
+    ble .Lneon_b2f_smallcopy_loop
+    push {r0, lr}
     add r0, r0, r2
     add r1, r1, r2
-    cmp r2, #4
-    bgt .Lneon_b2f_gt4
-    cmp r2, #0
-.Lneon_b2f_smallcopy_loop:
-    beq .Lneon_memmove_done
-    ldrb r12, [r1, #-1]!
-    subs r2, r2, #1
-    strb r12, [r0, #-1]!
-    b .Lneon_b2f_smallcopy_loop
-.Lneon_b2f_gt4:
-    sub r3, r0, r1
-    cmp r2, r3
-    movle r12, r2
-    movgt r12, r3
-    cmp r12, #64
+    cmp r2, #64
+    it ge
     bge .Lneon_b2f_copy_64
-    cmp r12, #32
+    cmp r2, #32
+    it ge
     bge .Lneon_b2f_copy_32
-    cmp r12, #8
+    cmp r2, #8
+    it ge
     bge .Lneon_b2f_copy_8
-    cmp r12, #4
-    bge .Lneon_b2f_copy_4
     b .Lneon_b2f_copy_1
 .Lneon_b2f_copy_64:
-    sub r1, r1, #64 /* Predecrement */
-    sub r0, r0, #64
-    movs r12, r2, lsr #6
+    mov r12, r2, lsr #6
+    add r1, r1, #32
+    add r0, r0, #32
     cmp r12, #PLDTHRESH
+    it le
     ble .Lneon_b2f_copy_64_loop_nopld
     sub r12, #PLDOFFS
-    pld [r1, #-(PLDOFFS-5)*PLDSIZE]
-    pld [r1, #-(PLDOFFS-4)*PLDSIZE]
-    pld [r1, #-(PLDOFFS-3)*PLDSIZE]
-    pld [r1, #-(PLDOFFS-2)*PLDSIZE]
-    pld [r1, #-(PLDOFFS-1)*PLDSIZE]
-    .balignl 64, NOP_OPCODE, 4*2
+    sub lr, r1, #(PLDOFFS)*PLDSIZE
 .Lneon_b2f_copy_64_loop_outer:
-    pld [r1, #-(PLDOFFS)*PLDSIZE]
+    pld [lr]
+    sub r1, r1, #96
+    sub r0, r0, #96
     vld1.32 {q0, q1}, [r1]!
     vld1.32 {q2, q3}, [r1]
+    sub lr, lr, #64
     subs r12, r12, #1
     vst1.32 {q0, q1}, [r0]!
-    sub r1, r1, #96 /* Post-fixup and predecrement */
     vst1.32 {q2, q3}, [r0]
-    sub r0, r0, #96
+    it ne
     bne .Lneon_b2f_copy_64_loop_outer
     mov r12, #PLDOFFS
-    .balignl 64, NOP_OPCODE, 4*2
 .Lneon_b2f_copy_64_loop_nopld:
+    sub r1, r1, #96
+    sub r0, r0, #96
     vld1.32 {q8, q9}, [r1]!
     vld1.32 {q10, q11}, [r1]
     subs r12, r12, #1
     vst1.32 {q8, q9}, [r0]!
-    sub r1, r1, #96 /* Post-fixup and predecrement */
     vst1.32 {q10, q11}, [r0]
-    sub r0, r0, #96
+    it ne
     bne .Lneon_b2f_copy_64_loop_nopld
     ands r2, r2, #0x3f
+    it eq
     beq .Lneon_memmove_done
-    add r1, r1, #64 /* Post-fixup */
-    add r0, r0, #64
+    sub r1, r1, #32
+    sub r0, r0, #32
     cmp r2, #32
-    blt .Lneon_b2f_copy_finish
+    it lt
+    blt .Lneon_b2f_copy_8
 .Lneon_b2f_copy_32:
-    mov r12, r2, lsr #5
-.Lneon_b2f_copy_32_loop:
-    sub r1, r1, #32 /* Predecrement */
+    sub r1, r1, #32
     sub r0, r0, #32
-    vld1.32 {q0,q1}, [r1]
-    subs r12, r12, #1
-    vst1.32 {q0,q1}, [r0]
-    bne .Lneon_b2f_copy_32_loop
+    vld1.32 {q0, q1}, [r1]
+    vst1.32 {q0, q1}, [r0]
     ands r2, r2, #0x1f
+    it eq
     beq .Lneon_memmove_done
 .Lneon_b2f_copy_finish:
 .Lneon_b2f_copy_8:
     movs r12, r2, lsr #0x3
-    beq .Lneon_b2f_copy_4
-    .balignl 64, NOP_OPCODE, 4*2
+    it eq
+    beq .Lneon_b2f_copy_1
 .Lneon_b2f_copy_8_loop:
-    sub r1, r1, #8 /* Predecrement */
+    sub r1, r1, #8
     sub r0, r0, #8
     vld1.32 {d0}, [r1]
     subs r12, r12, #1
     vst1.32 {d0}, [r0]
+    it ne
     bne .Lneon_b2f_copy_8_loop
-    ands r2, r2, #0x7
-    beq .Lneon_memmove_done
-.Lneon_b2f_copy_4:
-    movs r12, r2, lsr #0x2
-    beq .Lneon_b2f_copy_1
-.Lneon_b2f_copy_4_loop:
-    ldr r3, [r1, #-4]!
-    subs r12, r12, #1
-    str r3, [r0, #-4]!
-    bne .Lneon_b2f_copy_4_loop
-    ands r2, r2, #0x3
 .Lneon_b2f_copy_1:
-    cmp r2, #0
+    ands r2, r2, #0x7
+    it eq
     beq .Lneon_memmove_done
-    .balignl 64, NOP_OPCODE, 4*2
+    sub r1, r1, r2
+    sub r0, r0, r2
 .Lneon_b2f_copy_1_loop:
-    ldrb r12, [r1, #-1]!
     subs r2, r2, #1
-    strb r12, [r0, #-1]!
+    ldrb r3, [r1, r2]
+    strb r3, [r0, r2]
+    it ne
     bne .Lneon_b2f_copy_1_loop
-
 .Lneon_memmove_done:
-    pop {r0}
+    pop {r0, pc}
+.Lneon_b2f_smallcopy_loop:
+    subs r2, r2, #1
+    ldrb r3, [r1, r2]
+    strb r3, [r0, r2]
+    it ne
+    bne .Lneon_b2f_smallcopy_loop
     bx lr
-
-    .end
+    .cfi_endproc
+END(memmove)
diff --git a/libc/arch-arm/krait/krait.mk b/libc/arch-arm/krait/krait.mk
index 6a5a83952..970053dc2 100644
--- a/libc/arch-arm/krait/krait.mk
+++ b/libc/arch-arm/krait/krait.mk
@@ -2,6 +2,7 @@ $(call libc-add-cpu-variant-src,MEMCPY,arch-arm/krait/bionic/memcpy.S)
 $(call libc-add-cpu-variant-src,MEMSET,arch-arm/krait/bionic/memset.S)
 $(call libc-add-cpu-variant-src,STRCMP,arch-arm/krait/bionic/strcmp.S)
 $(call libc-add-cpu-variant-src,MEMMOVE,arch-arm/krait/bionic/memmove.S)
+$(call libc-add-cpu-variant-src,BCOPY,)
 $(call libc-add-cpu-variant-src,__STRCAT_CHK,arch-arm/krait/bionic/__strcat_chk.S)
 $(call libc-add-cpu-variant-src,__STRCPY_CHK,arch-arm/krait/bionic/__strcpy_chk.S)
 # Use cortex-a15 versions of strcat/strcpy/strlen.
```