summaryrefslogtreecommitdiffstats
diff options
context:
space:
mode:
author Brent DeGraaf <bdegraaf@codeaurora.org> 2013-10-03 16:47:18 -0400
committer Steve Kondik <shade@chemlab.org> 2013-12-20 12:54:44 -0800
commit 15e069e25b7d1818ce2fdac8e175ca4db9c2742b (patch)
tree f1d228397350322506f345153fd14a1cfdf26c77
parent d55217e0a2ad539d09d9630005babfbe9fb8b9c3 (diff)
download bionic-15e069e25b7d1818ce2fdac8e175ca4db9c2742b.tar.gz
bionic-15e069e25b7d1818ce2fdac8e175ca4db9c2742b.tar.bz2
bionic-15e069e25b7d1818ce2fdac8e175ca4db9c2742b.zip
libc: krait: Implement optimized versions of memmove and bcopy
Restore a Jelly Bean optimization for memmove and bcopy on Krait. The code has been refactored to Thumb-2 for consistency with the rest of bionic libc, as well as for performance and correctness.

Change-Id: I8f2f77dce4534fbce1bdf0188ab353bf34ab8637
-rw-r--r-- libc/arch-arm/krait/bionic/memmove.S 167
-rw-r--r-- libc/arch-arm/krait/krait.mk 1
2 files changed, 79 insertions, 89 deletions
diff --git a/libc/arch-arm/krait/bionic/memmove.S b/libc/arch-arm/krait/bionic/memmove.S
index 349c8e300..cfa06cef4 100644
--- a/libc/arch-arm/krait/bionic/memmove.S
+++ b/libc/arch-arm/krait/bionic/memmove.S
@@ -36,7 +36,7 @@
***************************************************************************/
#include <machine/cpu-features.h>
-
+#include <machine/asm.h>
/*
* These can be overridden in:
* device/<vendor>/<board>/BoardConfig.mk
@@ -62,148 +62,137 @@
#ifndef PLDSIZE
#define PLDSIZE (64)
#endif
-#define NOP_OPCODE (0xe320f000)
-
- .code 32
- .align 5
- .global memmove
- .type memmove, %function
- .global _memmove_words
- .type _memmove_words, %function
+ .text
+ .syntax unified
+ .fpu neon
+ .thumb
+ .thumb_func
- .global bcopy
- .type bcopy, %function
-
-bcopy:
+ENTRY(bcopy)
+ .cfi_startproc
mov r12, r0
mov r0, r1
mov r1, r12
- .balignl 64, NOP_OPCODE, 4*2
-memmove:
+ // Fall through to memmove (r0 and r1 have just been swapped via r12)
+ .cfi_endproc
+END(bcopy)
+
+ENTRY(memmove)
_memmove_words:
-.Lneon_memmove_cmf:
- subs r12, r0, r1
+ .cfi_startproc
+ .save {r0, lr}
+ cmp r2, #0 // Z set when the length is zero
+ it ne
+ subsne r12, r0, r1 // otherwise Z set when r0 == r1; keep the two "it" blocks separate
+ it eq
bxeq lr
- cmphi r2, r12
- bls memcpy /* Use memcpy for non-overlapping areas */
-
- push {r0}
-
-.Lneon_back_to_front_copy:
+// memmove only if r1 < r0 < r1+r2 (addresses are unsigned: use hs/ls, never ge/le)
+ cmp r0, r1
+ itt hs // r0 >= r1 as unsigned; signed ge misorders addresses >= 0x80000000
+ addhs r12, r1, r2 // r12 = r1 + r2
+ cmphs r12, r0
+ it ls // taken when r0 < r1 (carry clear) or r1 + r2 <= r0
+ bls memcpy // C leaves memcpy undefined for overlapping buffers, so only branch when this check passes
+ cmp r2, #4 // at most 4 bytes: byte loop that needs no stack frame
+ it le
+ ble .Lneon_b2f_smallcopy_loop
+ push {r0, lr} // r0 is popped back as the return value; lr becomes the pld pointer below
add r0, r0, r2
add r1, r1, r2
- cmp r2, #4
- bgt .Lneon_b2f_gt4
- cmp r2, #0
-.Lneon_b2f_smallcopy_loop:
- beq .Lneon_memmove_done
- ldrb r12, [r1, #-1]!
- subs r2, r2, #1
- strb r12, [r0, #-1]!
- b .Lneon_b2f_smallcopy_loop
-.Lneon_b2f_gt4:
- sub r3, r0, r1
- cmp r2, r3
- movle r12, r2
- movgt r12, r3
- cmp r12, #64
+ cmp r2, #64
+ it ge
bge .Lneon_b2f_copy_64
- cmp r12, #32
+ cmp r2, #32
+ it ge
bge .Lneon_b2f_copy_32
- cmp r12, #8
+ cmp r2, #8
+ it ge
bge .Lneon_b2f_copy_8
- cmp r12, #4
- bge .Lneon_b2f_copy_4
b .Lneon_b2f_copy_1
.Lneon_b2f_copy_64:
- sub r1, r1, #64 /* Predecrement */
- sub r0, r0, #64
- movs r12, r2, lsr #6
+ mov r12, r2, lsr #6 // r12 = number of 64-byte blocks
+ add r1, r1, #32 // bias by +32: each pass subtracts 96 and post-increments 32 (net -64)
+ add r0, r0, #32
cmp r12, #PLDTHRESH
+ it le
ble .Lneon_b2f_copy_64_loop_nopld
sub r12, #PLDOFFS
- pld [r1, #-(PLDOFFS-5)*PLDSIZE]
- pld [r1, #-(PLDOFFS-4)*PLDSIZE]
- pld [r1, #-(PLDOFFS-3)*PLDSIZE]
- pld [r1, #-(PLDOFFS-2)*PLDSIZE]
- pld [r1, #-(PLDOFFS-1)*PLDSIZE]
- .balignl 64, NOP_OPCODE, 4*2
+ sub lr, r1, #(PLDOFFS)*PLDSIZE // lr = prefetch address, PLDOFFS*PLDSIZE bytes below r1
.Lneon_b2f_copy_64_loop_outer:
- pld [r1, #-(PLDOFFS)*PLDSIZE]
+ pld [lr]
+ sub r1, r1, #96
+ sub r0, r0, #96
vld1.32 {q0, q1}, [r1]!
vld1.32 {q2, q3}, [r1]
+ sub lr, lr, #64 // move the prefetch address down by one 64-byte block
subs r12, r12, #1
vst1.32 {q0, q1}, [r0]!
- sub r1, r1, #96 /* Post-fixup and predecrement */
vst1.32 {q2, q3}, [r0]
- sub r0, r0, #96
+ it ne
bne .Lneon_b2f_copy_64_loop_outer
mov r12, #PLDOFFS
- .balignl 64, NOP_OPCODE, 4*2
.Lneon_b2f_copy_64_loop_nopld:
+ sub r1, r1, #96
+ sub r0, r0, #96
vld1.32 {q8, q9}, [r1]!
vld1.32 {q10, q11}, [r1]
subs r12, r12, #1
vst1.32 {q8, q9}, [r0]!
- sub r1, r1, #96 /* Post-fixup and predecrement */
vst1.32 {q10, q11}, [r0]
- sub r0, r0, #96
+ it ne
bne .Lneon_b2f_copy_64_loop_nopld
ands r2, r2, #0x3f
+ it eq
beq .Lneon_memmove_done
- add r1, r1, #64 /* Post-fixup */
- add r0, r0, #64
+ sub r1, r1, #32 // undo the +32 bias: r1/r0 now point at the lowest byte copied so far
+ sub r0, r0, #32
cmp r2, #32
- blt .Lneon_b2f_copy_finish
+ it lt
+ blt .Lneon_b2f_copy_8
.Lneon_b2f_copy_32:
- mov r12, r2, lsr #5
-.Lneon_b2f_copy_32_loop:
- sub r1, r1, #32 /* Predecrement */
+ sub r1, r1, #32
sub r0, r0, #32
- vld1.32 {q0,q1}, [r1]
- subs r12, r12, #1
- vst1.32 {q0,q1}, [r0]
- bne .Lneon_b2f_copy_32_loop
+ vld1.32 {q0, q1}, [r1]
+ vst1.32 {q0, q1}, [r0]
ands r2, r2, #0x1f
+ it eq
beq .Lneon_memmove_done
.Lneon_b2f_copy_finish:
.Lneon_b2f_copy_8:
movs r12, r2, lsr #0x3
- beq .Lneon_b2f_copy_4
- .balignl 64, NOP_OPCODE, 4*2
+ it eq
+ beq .Lneon_b2f_copy_1
.Lneon_b2f_copy_8_loop:
- sub r1, r1, #8 /* Predecrement */
+ sub r1, r1, #8
sub r0, r0, #8
vld1.32 {d0}, [r1]
subs r12, r12, #1
vst1.32 {d0}, [r0]
+ it ne
bne .Lneon_b2f_copy_8_loop
- ands r2, r2, #0x7
- beq .Lneon_memmove_done
-.Lneon_b2f_copy_4:
- movs r12, r2, lsr #0x2
- beq .Lneon_b2f_copy_1
-.Lneon_b2f_copy_4_loop:
- ldr r3, [r1, #-4]!
- subs r12, r12, #1
- str r3, [r0, #-4]!
- bne .Lneon_b2f_copy_4_loop
- ands r2, r2, #0x3
.Lneon_b2f_copy_1:
- cmp r2, #0
+ ands r2, r2, #0x7 // r2 = leftover byte count (0..7)
+ it eq
beq .Lneon_memmove_done
- .balignl 64, NOP_OPCODE, 4*2
+ sub r1, r1, r2 // rebase so [r1, r2] / [r0, r2] index the leftover bytes
+ sub r0, r0, r2
.Lneon_b2f_copy_1_loop:
- ldrb r12, [r1, #-1]!
subs r2, r2, #1
- strb r12, [r0, #-1]!
+ ldrb r3, [r1, r2]
+ strb r3, [r0, r2]
+ it ne
bne .Lneon_b2f_copy_1_loop
-
.Lneon_memmove_done:
- pop {r0}
+ pop {r0, pc} // reload the saved r0 and return through the saved lr
+.Lneon_b2f_smallcopy_loop:
+ subs r2, r2, #1 // copies index r2-1 down to 0; r0 and r1 are left unmodified
+ ldrb r3, [r1, r2]
+ strb r3, [r0, r2]
+ it ne
+ bne .Lneon_b2f_smallcopy_loop
bx lr
-
- .end
+ .cfi_endproc
+END(memmove)
diff --git a/libc/arch-arm/krait/krait.mk b/libc/arch-arm/krait/krait.mk
index 6a5a83952..970053dc2 100644
--- a/libc/arch-arm/krait/krait.mk
+++ b/libc/arch-arm/krait/krait.mk
@@ -2,6 +2,7 @@ $(call libc-add-cpu-variant-src,MEMCPY,arch-arm/krait/bionic/memcpy.S)
$(call libc-add-cpu-variant-src,MEMSET,arch-arm/krait/bionic/memset.S)
$(call libc-add-cpu-variant-src,STRCMP,arch-arm/krait/bionic/strcmp.S)
$(call libc-add-cpu-variant-src,MEMMOVE,arch-arm/krait/bionic/memmove.S)
+$(call libc-add-cpu-variant-src,BCOPY,)
$(call libc-add-cpu-variant-src,__STRCAT_CHK,arch-arm/krait/bionic/__strcat_chk.S)
$(call libc-add-cpu-variant-src,__STRCPY_CHK,arch-arm/krait/bionic/__strcpy_chk.S)
# Use cortex-a15 versions of strcat/strcpy/strlen.