Diffstat (limited to 'libc/arch-arm')
-rw-r--r--  libc/arch-arm/bionic/arm_memcpy.S        | 123
-rw-r--r--  libc/arch-arm/bionic/atexit.S            |  69
-rw-r--r--  libc/arch-arm/bionic/crtbegin_dynamic.S  |   1
-rw-r--r--  libc/arch-arm/bionic/crtbegin_so.S       |   6
-rw-r--r--  libc/arch-arm/bionic/crtbegin_static.S   |   1
-rw-r--r--  libc/arch-arm/bionic/ffs.S               |  35
-rw-r--r--  libc/arch-arm/bionic/memcmp.S            |  81
-rw-r--r--  libc/arch-arm/bionic/memcpy.S            | 126
-rw-r--r--  libc/arch-arm/bionic/memmove.S           | 356
-rw-r--r--  libc/arch-arm/bionic/memset.S            |  87
-rw-r--r--  libc/arch-arm/bionic/strlen-armv7.S      | 111
11 files changed, 950 insertions(+), 46 deletions(-)
diff --git a/libc/arch-arm/bionic/arm_memcpy.S b/libc/arch-arm/bionic/arm_memcpy.S
new file mode 100644
index 000000000..ae1cf1ad1
--- /dev/null
+++ b/libc/arch-arm/bionic/arm_memcpy.S
@@ -0,0 +1,123 @@
+/*
+ * Copyright (C) 2011 Texas Instruments
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * * Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ * * Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in
+ * the documentation and/or other materials provided with the
+ * distribution.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+ * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+ * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS
+ * FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE
+ * COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT,
+ * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING,
+ * BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS
+ * OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED
+ * AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+ * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT
+ * OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
+ * SUCH DAMAGE.
+ */
+
+ .text
+ .fpu neon
+ .code 32
+ .align 4
+
+
+/* r0 - dest */
+/* r1 - src */
+/* r2 - length */
+ .global memcpy
+memcpy:
+ .fnstart
+/* A cpp conditional cannot compare against the string "omap4"; assume the
+ * build system defines TARGET_BOARD_PLATFORM_OMAP4 (hypothetical flag) on
+ * OMAP4, where the cache line is 32 bytes (Cortex-A9). */
+#ifdef TARGET_BOARD_PLATFORM_OMAP4
+#define CACHE_LINE_SIZE 32
+#else
+#define CACHE_LINE_SIZE 64
+#endif
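+/* CACHE_LINE_SIZE sets the stride of the PLD prefetches in the aligned
+ * copy loop below, which keeps roughly five cache lines of lookahead. */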
+ CMP r2,#3
+ BLS _BMLIB_memcpy_lastbytes
+ ANDS r12,r0,#3
+ BEQ l1
+ LDRB r3,[r1],#1
+ CMP r12,#2
+ ADD r2,r2,r12
+ LDRLSB r12, [r1], #1
+ STRB r3,[r0],#1
+ LDRCCB r3,[r1],#1
+ STRLSB r12,[r0],#1
+ SUB r2,r2,#4
+ STRCCB r3,[r0],#1
+l1:
+ ANDS r3,r1,#3
+ BEQ _BMLIB_aeabi_memcpy4
+l3:
+ SUBS r2,r2,#8
+ BCC l2
+ LDR r3,[r1],#4
+ LDR r12,[r1],#4
+ STR r3,[r0],#4
+ STR r12,[r0],#4
+ B l3
+l2:
+ ADDS r2,r2,#4
+ LDRPL r3,[r1],#4
+ STRPL r3,[r0],#4
+ MOV r0,r0
+_BMLIB_memcpy_lastbytes:
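+/* Shifting the count left by 31 puts bit 1 of the count in C and bit 0
+ * in N: C set -> copy two bytes, N set -> copy one final byte. */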
+ LSLS r2,r2,#31
+ LDRCSB r3,[r1],#1
+ LDRCSB r12,[r1],#1
+ LDRMIB r2,[r1],#1
+ STRCSB r3,[r0],#1
+ STRCSB r12,[r0],#1
+ STRMIB r2,[r0],#1
+ BX lr
+
+_BMLIB_aeabi_memcpy4:
+ PUSH {r4-r8,lr}
+ SUBS r2,r2,#0x20
+ BCC l4
+ DSB
+#ifndef NOPLD
+ PLD [r1, #0]
+ PLD [r1, #(CACHE_LINE_SIZE*1)]
+ PLD [r1, #(CACHE_LINE_SIZE*2)]
+ PLD [r1, #(CACHE_LINE_SIZE*3)]
+ PLD [r1, #(CACHE_LINE_SIZE*4)]
+#endif
+l5:
+#ifndef NOPLD
+ PLD [r1, #(CACHE_LINE_SIZE*5)]
+#endif
+ LDMCS r1!,{r3-r8,r12,lr}
+ STMCS r0!,{r3-r8,r12,lr}
+ SUBS r2,r2,#0x20
+ BCS l5
+l4:
+ LSLS r12,r2,#28
+ LDMCS r1!,{r3,r4,r12,lr}
+ STMCS r0!,{r3,r4,r12,lr}
+ LDMMI r1!,{r3,r4}
+ STMMI r0!,{r3,r4}
+ POP {r4-r8,lr}
+ LSLS r12,r2,#30
+ LDRCS r3,[r1],#4
+ STRCS r3,[r0],#4
+ BXEQ lr
+_BMLIB_memcpy_lastbytes_aligned:
+ LSLS r2,r2,#31
+ LDRCSH r3,[r1],#2
+ LDRMIB r2,[r1],#1
+ STRCSH r3,[r0],#2
+ STRMIB r2,[r0],#1
+ BX lr
+ .fnend
diff --git a/libc/arch-arm/bionic/atexit.S b/libc/arch-arm/bionic/atexit.S
new file mode 100644
index 000000000..aa1e18d61
--- /dev/null
+++ b/libc/arch-arm/bionic/atexit.S
@@ -0,0 +1,69 @@
+/*
+ * Copyright (C) 2011 The Android Open Source Project
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * * Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ * * Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in
+ * the documentation and/or other materials provided with the
+ * distribution.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+ * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+ * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS
+ * FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE
+ * COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT,
+ * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING,
+ * BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS
+ * OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED
+ * AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+ * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT
+ * OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
+ * SUCH DAMAGE.
+ */
+
+#ifndef CRT_LEGACY_WORKAROUND
+ .arch armv5te
+ .fpu softvfp
+ .eabi_attribute 20, 1
+ .eabi_attribute 21, 1
+ .eabi_attribute 23, 3
+ .eabi_attribute 24, 1
+ .eabi_attribute 25, 1
+ .eabi_attribute 26, 2
+ .eabi_attribute 30, 4
+ .eabi_attribute 18, 4
+ .code 16
+ .section .text.atexit,"ax",%progbits
+ .align 2
+ .global atexit
+ .hidden atexit
+ .code 16
+ .thumb_func
+ .type atexit, %function
+atexit:
+ .fnstart
+.LFB0:
+ .save {r4, lr}
+ push {r4, lr}
+.LCFI0:
+ ldr r3, .L3
+ mov r1, #0
+ @ sp needed for prologue
+.LPIC0:
+ add r3, pc
+ ldr r2, [r3]
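+ @ r3 = .LPIC0+4 + (__dso_handle - (.LPIC0+4)) = &__dso_handle, since
+ @ reading pc in Thumb yields the instruction address plus 4; r2 then
+ @ holds the value of __dso_handle, passed as the third argument to
+ @ __cxa_atexit(func=r0, arg=r1=NULL, dso=r2) below.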
+ bl __cxa_atexit
+ pop {r4, pc}
+.L4:
+ .align 2
+.L3:
+ .word __dso_handle-(.LPIC0+4)
+.LFE0:
+ .fnend
+ .size atexit, .-atexit
+#endif
diff --git a/libc/arch-arm/bionic/crtbegin_dynamic.S b/libc/arch-arm/bionic/crtbegin_dynamic.S
index d18e715f5..099908444 100644
--- a/libc/arch-arm/bionic/crtbegin_dynamic.S
+++ b/libc/arch-arm/bionic/crtbegin_dynamic.S
@@ -85,3 +85,4 @@ __CTOR_LIST__:
.long -1
#include "__dso_handle.S"
+#include "atexit.S"
diff --git a/libc/arch-arm/bionic/crtbegin_so.S b/libc/arch-arm/bionic/crtbegin_so.S
index bb6b3e2c3..9275b1e01 100644
--- a/libc/arch-arm/bionic/crtbegin_so.S
+++ b/libc/arch-arm/bionic/crtbegin_so.S
@@ -52,4 +52,10 @@ __FINI_ARRAY__:
.long -1
.long __on_dlclose
+#ifdef CRT_LEGACY_WORKAROUND
#include "__dso_handle.S"
+#else
+#include "__dso_handle_so.S"
+#endif
+
+#include "atexit.S"
diff --git a/libc/arch-arm/bionic/crtbegin_static.S b/libc/arch-arm/bionic/crtbegin_static.S
index 6f9cf25dd..13b05b272 100644
--- a/libc/arch-arm/bionic/crtbegin_static.S
+++ b/libc/arch-arm/bionic/crtbegin_static.S
@@ -86,3 +86,4 @@ __CTOR_LIST__:
#include "__dso_handle.S"
+#include "atexit.S"
diff --git a/libc/arch-arm/bionic/ffs.S b/libc/arch-arm/bionic/ffs.S
index f11141c97..052b46a53 100644
--- a/libc/arch-arm/bionic/ffs.S
+++ b/libc/arch-arm/bionic/ffs.S
@@ -36,47 +36,14 @@
* 6 bits as an index into the table. This algorithm should be a win
* over checking each bit in turn, as in the compiled C version.
*
- * under ARMv5 there's an instruction called CLZ (count leading Zero's) that
- * could be used
- *
- * This is the ffs algorithm devised by d.seal and posted to comp.sys.arm on
- * 16 Feb 1994.
+ * since ARMv5 there's an instruction called CLZ (Count Leading Zeros)
*/
ENTRY(ffs)
/* Standard trick to isolate bottom bit in r0 or 0 if r0 = 0 on entry */
rsb r1, r0, #0
ands r0, r0, r1
-#ifndef __ARM_ARCH_5__
- /*
- * now r0 has at most one set bit, call this X
- * if X = 0, all further instructions are skipped
- */
- adrne r2, .L_ffs_table
- orrne r0, r0, r0, lsl #4 /* r0 = X * 0x11 */
- orrne r0, r0, r0, lsl #6 /* r0 = X * 0x451 */
- rsbne r0, r0, r0, lsl #16 /* r0 = X * 0x0450fbaf */
-
- /* now lookup in table indexed on top 6 bits of r0 */
- ldrneb r0, [ r2, r0, lsr #26 ]
-
- bx lr
-
-.text;
-.type .L_ffs_table, _ASM_TYPE_OBJECT;
-.L_ffs_table:
-/* 0 1 2 3 4 5 6 7 */
- .byte 0, 1, 2, 13, 3, 7, 0, 14 /* 0- 7 */
- .byte 4, 0, 8, 0, 0, 0, 0, 15 /* 8-15 */
- .byte 11, 5, 0, 0, 9, 0, 0, 26 /* 16-23 */
- .byte 0, 0, 0, 0, 0, 22, 28, 16 /* 24-31 */
- .byte 32, 12, 6, 0, 0, 0, 0, 0 /* 32-39 */
- .byte 10, 0, 0, 25, 0, 0, 21, 27 /* 40-47 */
- .byte 31, 0, 0, 0, 0, 24, 0, 20 /* 48-55 */
- .byte 30, 0, 23, 19, 29, 18, 17, 0 /* 56-63 */
-#else
clzne r0, r0
rsbne r0, r0, #32
bx lr
-#endif
diff --git a/libc/arch-arm/bionic/memcmp.S b/libc/arch-arm/bionic/memcmp.S
index 67dcddc1b..0fe26996c 100644
--- a/libc/arch-arm/bionic/memcmp.S
+++ b/libc/arch-arm/bionic/memcmp.S
@@ -43,36 +43,70 @@
* (2) The loads are scheduled in a way they won't stall
*/
+#if __ARM_ARCH__ >= 7
+#define __ARM_CORTEX
+
+#if defined(CORTEX_CACHE_LINE_32)
+#define CACHE_LINE_SIZE 32
+#else
+#define CACHE_LINE_SIZE 64
+#endif
+
+#endif /* __ARM_ARCH__ */
+
+
memcmp:
.fnstart
+
+#if defined(__ARM_CORTEX)
+ pld [r0, #(CACHE_LINE_SIZE * 0)]
+ pld [r0, #(CACHE_LINE_SIZE * 1)]
+#else
+
PLD (r0, #0)
PLD (r1, #0)
-
+#endif
/* take care of the case where length is 0 or the buffers are the same */
cmp r0, r1
+#if !defined(__ARM_CORTEX)
cmpne r2, #0
+#endif
moveq r0, #0
bxeq lr
+#if defined(__ARM_CORTEX)
+ pld [r1, #(CACHE_LINE_SIZE * 0)]
+ pld [r1, #(CACHE_LINE_SIZE * 1)]
+
+ /* make sure we have at least 8+4 bytes, this simplifies things below
+ * and avoids some overhead for small blocks
+ */
+ cmp r2, #(8+4)
+ bmi 10f
+#endif /* __ARM_CORTEX */
+
+
.save {r4, lr}
/* save registers */
stmfd sp!, {r4, lr}
-
+#if !defined(__ARM_CORTEX)
PLD (r0, #32)
PLD (r1, #32)
+#endif
/* since r0 hold the result, move the first source
* pointer somewhere else
*/
mov r4, r0
-
+
+#if !defined(__ARM_CORTEX)
/* make sure we have at least 8+4 bytes, this simplifies things below
* and avoids some overhead for small blocks
*/
cmp r2, #(8+4)
bmi 8f
-
+#endif
/* align first pointer to word boundary
* offset = -src & 3
*/
@@ -109,8 +143,14 @@ memcmp:
subs r2, r2, #(32 + 4)
bmi 1f
-0: PLD (r4, #64)
+0:
+#if defined(__ARM_CORTEX)
+ pld [r4, #(CACHE_LINE_SIZE * 2)]
+ pld [r1, #(CACHE_LINE_SIZE * 2)]
+#else
+ PLD (r4, #64)
PLD (r1, #64)
+#endif
ldr r0, [r4], #4
ldr lr, [r1, #4]!
eors r0, r0, ip
@@ -176,6 +216,21 @@ memcmp:
9: /* restore registers and return */
ldmfd sp!, {r4, lr}
bx lr
+#if defined(__ARM_CORTEX)
+10: /* process less than 12 bytes */
+ cmp r2, #0
+ moveq r0, #0
+ bxeq lr
+ mov r3, r0
+11:
+ ldrb r0, [r3], #1
+ ldrb ip, [r1], #1
+ subs r0, ip
+ bxne lr
+ subs r2, r2, #1
+ bne 11b
+ bx lr
+#endif /* __ARM_CORTEX */
.fnend
@@ -198,8 +253,14 @@ memcmp:
bic r1, r1, #3
ldr lr, [r1], #4
-6: PLD (r1, #64)
+6:
+#if defined(__ARM_CORTEX)
+ pld [r1, #(CACHE_LINE_SIZE * 2)]
+ pld [r4, #(CACHE_LINE_SIZE * 2)]
+#else
+ PLD (r1, #64)
PLD (r4, #64)
+#endif
mov ip, lr, lsr #16
ldr lr, [r1], #4
ldr r0, [r4], #4
@@ -240,13 +301,13 @@ memcmp:
4: /*************** offset is 1 or 3 (less optimized) ***************/
- stmfd sp!, {r5, r6, r7}
+ stmfd sp!, {r5, r6, r7}
// r5 = rhs
// r6 = lhs
// r7 = scratch
- mov r5, r0, lsl #3 /* r5 = right shift */
+ mov r5, r0, lsl #3 /* r5 = right shift */
rsb r6, r5, #32 /* r6 = left shift */
/* align the unaligned pointer */
@@ -269,7 +330,7 @@ memcmp:
bhs 6b
sub r1, r1, r6, lsr #3
- ldmfd sp!, {r5, r6, r7}
+ ldmfd sp!, {r5, r6, r7}
/* are we done? */
adds r2, r2, #8
@@ -284,5 +345,5 @@ memcmp:
sub r1, r1, r6, lsr #3
sub r4, r4, #4
mov r2, #4
- ldmfd sp!, {r5, r6, r7}
+ ldmfd sp!, {r5, r6, r7}
b 8b
diff --git a/libc/arch-arm/bionic/memcpy.S b/libc/arch-arm/bionic/memcpy.S
index ba55996ec..f2a4e3328 100644
--- a/libc/arch-arm/bionic/memcpy.S
+++ b/libc/arch-arm/bionic/memcpy.S
@@ -2,6 +2,8 @@
* Copyright (C) 2008 The Android Open Source Project
* All rights reserved.
*
+ * Copyright (c) 2009-2011, Code Aurora Forum. All rights reserved.
+ *
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions
* are met:
@@ -29,7 +31,114 @@
#include <machine/cpu-features.h>
#if defined(__ARM_NEON__)
-
+#if defined(SCORPION_NEON_OPTIMIZATION)
+ /*
+ * These can be overridden in:
+ * device/<vendor>/<board>/BoardConfig.mk
+ * by setting the following:
+ * TARGET_USE_SCORPION_BIONIC_OPTIMIZATION := true
+ * TARGET_USE_SCORPION_PLD_SET := true
+ * TARGET_SCORPION_BIONIC_PLDOFFS := <pldoffset>
+ * TARGET_SCORPION_BIONIC_PLDSIZE := <pldsize>
+ */
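+ /*
+ * Example BoardConfig.mk fragment (illustrative values only):
+ * TARGET_USE_SCORPION_BIONIC_OPTIMIZATION := true
+ * TARGET_USE_SCORPION_PLD_SET := true
+ * TARGET_SCORPION_BIONIC_PLDOFFS := 6
+ * TARGET_SCORPION_BIONIC_PLDSIZE := 128
+ */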
+#ifndef PLDOFFS
+#define PLDOFFS (6)
+#endif
+#ifndef PLDSIZE
+#define PLDSIZE (128) /* L2 cache line size */
+#endif
+ .code 32
+ .align 5
+ .globl memcpy
+ .func
+memcpy:
+ push {r0}
+ cmp r2, #4
+ blt .Lneon_lt4
+ cmp r2, #16
+ blt .Lneon_lt16
+ cmp r2, #32
+ blt .Lneon_16
+ cmp r2, #128
+ blt .Lneon_copy_32_a
+ /* Copy blocks of 128 bytes (word-aligned) at a time. */
+ /* The code below is optimized for PLDSIZE=128 only. */
+ mov r12, r2, lsr #7
+ cmp r12, #PLDOFFS
+ ble .Lneon_copy_128_loop_nopld
+ sub r12, #PLDOFFS
+ pld [r1, #(PLDOFFS-1)*PLDSIZE]
+.Lneon_copy_128_loop_outer:
+ pld [r1, #(PLDOFFS*PLDSIZE)]
+ vld1.32 {q0, q1}, [r1]!
+ vld1.32 {q2, q3}, [r1]!
+ vld1.32 {q8, q9}, [r1]!
+ vld1.32 {q10, q11}, [r1]!
+ subs r12, r12, #1
+ vst1.32 {q0, q1}, [r0]!
+ vst1.32 {q2, q3}, [r0]!
+ vst1.32 {q8, q9}, [r0]!
+ vst1.32 {q10, q11}, [r0]!
+ bne .Lneon_copy_128_loop_outer
+ mov r12, #PLDOFFS
+.Lneon_copy_128_loop_nopld:
+ vld1.32 {q0, q1}, [r1]!
+ vld1.32 {q2, q3}, [r1]!
+ vld1.32 {q8, q9}, [r1]!
+ vld1.32 {q10, q11}, [r1]!
+ subs r12, r12, #1
+ vst1.32 {q0, q1}, [r0]!
+ vst1.32 {q2, q3}, [r0]!
+ vst1.32 {q8, q9}, [r0]!
+ vst1.32 {q10, q11}, [r0]!
+ bne .Lneon_copy_128_loop_nopld
+ ands r2, r2, #0x7f
+ beq .Lneon_exit
+ cmp r2, #32
+ blt .Lneon_16
+ nop
+ /* Copy blocks of 32 bytes (word-aligned) at a time. */
+.Lneon_copy_32_a:
+ mov r12, r2, lsr #5
+.Lneon_copy_32_loop_a:
+ vld1.32 {q0,q1}, [r1]!
+ subs r12, r12, #1
+ vst1.32 {q0,q1}, [r0]!
+ bne .Lneon_copy_32_loop_a
+ ands r2, r2, #0x1f
+ beq .Lneon_exit
+.Lneon_16:
+ subs r2, r2, #16
+ blt .Lneon_lt16
+ vld1.32 {q8}, [r1]!
+ vst1.32 {q8}, [r0]!
+ beq .Lneon_exit
+.Lneon_lt16:
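+ /* lsl #29 moves bit 3 of the count into C and bit 2 into N:
+ * C set -> copy 8 bytes, N set -> copy 4 bytes. */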
+ movs r12, r2, lsl #29
+ bcc .Lneon_skip8
+ ldr r3, [r1], #4
+ ldr r12, [r1], #4
+ str r3, [r0], #4
+ str r12, [r0], #4
+.Lneon_skip8:
+ bpl .Lneon_lt4
+ ldr r3, [r1], #4
+ str r3, [r0], #4
+.Lneon_lt4:
+ movs r2, r2, lsl #31
+ bcc .Lneon_lt2
+ ldrh r3, [r1], #2
+ strh r3, [r0], #2
+.Lneon_lt2:
+ bpl .Lneon_exit
+ ldrb r12, [r1]
+ strb r12, [r0]
+.Lneon_exit:
+ pop {r0}
+ bx lr
+ .endfunc
+ .end
+#else /* !SCORPION_NEON_OPTIMIZATION */
.text
.fpu neon
@@ -141,11 +250,14 @@ memcpy:
strcsb ip, [r0], #1
strcsb lr, [r0], #1
+.ifdef NEEDS_ARM_ERRATA_754319_754320_ASM
+ VMOV s0,s0 @ NOP for ARM Errata
+.endif
ldmfd sp!, {r0, lr}
bx lr
.fnend
-
+#endif /* !SCORPION_NEON_OPTIMIZATION */
#else /* __ARM_ARCH__ < 7 */
@@ -260,20 +372,30 @@ cached_aligned32:
*
*/
+#if __ARM_ARCH__ == 5
// Align the preload register to a cache-line because the cpu does
// "critical word first" (the first word requested is loaded first).
bic r12, r1, #0x1F
add r12, r12, #64
+#endif
1: ldmia r1!, { r4-r11 }
+
+#if __ARM_ARCH__ == 5
PLD (r12, #64)
+#else
+ PLD (r1, #64)
+#endif
subs r2, r2, #32
+#if __ARM_ARCH__ == 5
// NOTE: if r12 is more than 64 ahead of r1, the following ldrhi
// for ARM9 preload will not be safely guarded by the preceding subs.
// When it is safely guarded the only possibility to have SIGSEGV here
// is because the caller overstates the length.
ldrhi r3, [r12], #32 /* cheap ARM9 preload */
+#endif
+
stmia r0!, { r4-r11 }
bhs 1b
diff --git a/libc/arch-arm/bionic/memmove.S b/libc/arch-arm/bionic/memmove.S
new file mode 100644
index 000000000..123419584
--- /dev/null
+++ b/libc/arch-arm/bionic/memmove.S
@@ -0,0 +1,356 @@
+/***************************************************************************
+ Copyright (c) 2009-2011 Code Aurora Forum. All rights reserved.
+
+ Redistribution and use in source and binary forms, with or without
+ modification, are permitted provided that the following conditions are met:
+ * Redistributions of source code must retain the above copyright
+ notice, this list of conditions and the following disclaimer.
+ * Redistributions in binary form must reproduce the above copyright
+ notice, this list of conditions and the following disclaimer in the
+ documentation and/or other materials provided with the distribution.
+ * Neither the name of Code Aurora nor the names of its contributors may
+ be used to endorse or promote products derived from this software
+ without specific prior written permission.
+
+ THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+ AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE
+ LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
+ CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
+ SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
+ INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
+ CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
+ ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
+ POSSIBILITY OF SUCH DAMAGE.
+ ***************************************************************************/
+
+/***************************************************************************
+ * Neon memmove: attempts to do a memmove with Neon registers if possible.
+ * Inputs:
+ * dest: the destination buffer
+ * src: the source buffer
+ * n: the number of bytes to transfer
+ * Outputs:
+ * returns dest (memmove's return value)
+ ***************************************************************************/
+
+#include <machine/cpu-features.h>
+
+#if defined(SCORPION_NEON_OPTIMIZATION)
+ /*
+ * These can be overridden in:
+ * device/<vendor>/<board>/BoardConfig.mk
+ * by setting the following:
+ * TARGET_USE_SCORPION_BIONIC_OPTIMIZATION := true
+ * TARGET_USE_SCORPION_PLD_SET := true
+ * TARGET_SCORPION_BIONIC_PLDOFFS := <pldoffset>
+ * TARGET_SCORPION_BIONIC_PLDSIZE := <pldsize>
+ */
+#ifndef PLDOFFS
+#define PLDOFFS (6)
+#endif
+#ifndef PLDSIZE
+#define PLDSIZE (128) /* L2 cache line size */
+#endif
+
+ .code 32
+ .align 5
+ .global memmove
+ .type memmove, %function
+
+ .global bcopy
+ .type bcopy, %function
+
+bcopy:
+ mov r12, r0
+ mov r0, r1
+ mov r1, r12
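+ /*
+ * bcopy(src, dst, n) is memmove(dst, src, n) with the first two
+ * arguments swapped; fall through into memmove.
+ */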
+memmove:
+ push {r0}
+
+ /*
+ * The requirements for memmove state that the function should
+ * operate as if data were being copied from the source to a
+ * buffer, then to the destination. This is to allow a user
+ * to copy data from a source and target that overlap.
+ *
+ * We can't unconditionally copy front-to-back, since the regions are
+ * likely to overlap (why else would someone intentionally use memmove?),
+ * and a forward copy corrupts the source when the destination is above it.
+ *
+ * We'll break this into two parts. Front-to-back, or back-to-front
+ * copies.
+ */
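+ /*
+ * Dispatch, in C terms (sketch):
+ * if (dest < src) copy front-to-back;
+ * else if (dest > src) copy back-to-front;
+ * else done (dest == src, nothing to move).
+ */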
+.Lneon_memmove_cmf:
+ cmp r0, r1
+ blt .Lneon_front_to_back_copy
+ bgt .Lneon_back_to_front_copy
+ b .Lneon_memmove_done
+
+ /* #############################################################
+ * Front to Back copy
+ */
+.Lneon_front_to_back_copy:
+ /*
+ * For small copies, just do a quick memcpy. We can do this for
+ * front-to-back copies, aligned or unaligned, since we're only
+ * doing 1 byte at a time...
+ */
+ cmp r2, #4
+ bgt .Lneon_f2b_gt4
+ cmp r2, #0
+.Lneon_f2b_smallcopy_loop:
+ beq .Lneon_memmove_done
+ ldrb r12, [r1], #1
+ subs r2, r2, #1
+ strb r12, [r0], #1
+ b .Lneon_f2b_smallcopy_loop
+.Lneon_f2b_gt4:
+ /* The window size is in r3. */
+ sub r3, r1, r0
+ /* #############################################################
+ * Front to Back copy
+ */
+ /*
+ * Note that we can't just route based on the size in r2. If that's
+ * larger than the overlap window in r3, we could potentially
+ * (and likely!) destroy data we're copying.
+ */
+ cmp r2, r3
+ movle r12, r2
+ movgt r12, r3
+ cmp r12, #256
+ bge .Lneon_f2b_copy_128
+ cmp r12, #64
+ bge .Lneon_f2b_copy_32
+ cmp r12, #16
+ bge .Lneon_f2b_copy_16
+ cmp r12, #8
+ bge .Lneon_f2b_copy_8
+ cmp r12, #4
+ bge .Lneon_f2b_copy_4
+ b .Lneon_f2b_copy_1
+ nop
+.Lneon_f2b_copy_128:
+ mov r12, r2, lsr #7
+ cmp r12, #PLDOFFS
+ ble .Lneon_f2b_copy_128_loop_nopld
+ sub r12, #PLDOFFS
+ pld [r1, #(PLDOFFS-1)*PLDSIZE]
+.Lneon_f2b_copy_128_loop_outer:
+ pld [r1, #(PLDOFFS*PLDSIZE)]
+ vld1.32 {q0,q1}, [r1]!
+ vld1.32 {q2,q3}, [r1]!
+ vld1.32 {q8,q9}, [r1]!
+ vld1.32 {q10,q11}, [r1]!
+ subs r12, r12, #1
+ vst1.32 {q0,q1}, [r0]!
+ vst1.32 {q2,q3}, [r0]!
+ vst1.32 {q8,q9}, [r0]!
+ vst1.32 {q10,q11}, [r0]!
+ bne .Lneon_f2b_copy_128_loop_outer
+ mov r12, #PLDOFFS
+.Lneon_f2b_copy_128_loop_nopld:
+ vld1.32 {q0,q1}, [r1]!
+ vld1.32 {q2,q3}, [r1]!
+ vld1.32 {q8,q9}, [r1]!
+ vld1.32 {q10,q11}, [r1]!
+ subs r12, r12, #1
+ vst1.32 {q0,q1}, [r0]!
+ vst1.32 {q2,q3}, [r0]!
+ vst1.32 {q8,q9}, [r0]!
+ vst1.32 {q10,q11}, [r0]!
+ bne .Lneon_f2b_copy_128_loop_nopld
+ ands r2, r2, #0x7f
+ beq .Lneon_memmove_done
+ cmp r2, #32
+ bge .Lneon_f2b_copy_32
+ b .Lneon_f2b_copy_finish
+.Lneon_f2b_copy_32:
+ mov r12, r2, lsr #5
+.Lneon_f2b_copy_32_loop:
+ vld1.32 {q0,q1}, [r1]!
+ subs r12, r12, #1
+ vst1.32 {q0,q1}, [r0]!
+ bne .Lneon_f2b_copy_32_loop
+ ands r2, r2, #0x1f
+ beq .Lneon_memmove_done
+.Lneon_f2b_copy_finish:
+.Lneon_f2b_copy_16:
+ movs r12, r2, lsr #4
+ beq .Lneon_f2b_copy_8
+.Lneon_f2b_copy_16_loop:
+ vld1.32 {q0}, [r1]!
+ subs r12, r12, #1
+ vst1.32 {q0}, [r0]!
+ bne .Lneon_f2b_copy_16_loop
+ ands r2, r2, #0xf
+ beq .Lneon_memmove_done
+.Lneon_f2b_copy_8:
+ movs r12, r2, lsr #3
+ beq .Lneon_f2b_copy_4
+.Lneon_f2b_copy_8_loop:
+ vld1.32 {d0}, [r1]!
+ subs r12, r12, #1
+ vst1.32 {d0}, [r0]!
+ bne .Lneon_f2b_copy_8_loop
+ ands r2, r2, #0x7
+ beq .Lneon_memmove_done
+.Lneon_f2b_copy_4:
+ movs r12, r2, lsr #2
+ beq .Lneon_f2b_copy_1
+.Lneon_f2b_copy_4_loop:
+ ldr r3, [r1], #4
+ subs r12, r12, #1
+ str r3, [r0], #4
+ bne .Lneon_f2b_copy_4_loop
+ ands r2, r2, #0x3
+ nop
+.Lneon_f2b_copy_1:
+ cmp r2, #0
+ beq .Lneon_memmove_done
+.Lneon_f2b_copy_1_loop:
+ ldrb r12, [r1], #1
+ subs r2, r2, #1
+ strb r12, [r0], #1
+ bne .Lneon_f2b_copy_1_loop
+.Lneon_f2b_finish:
+ b .Lneon_memmove_done
+
+ /* #############################################################
+ * Back to Front copy
+ */
+.Lneon_back_to_front_copy:
+ /*
+ * Here, we'll want to shift to the end of the buffers. This
+ * actually points us one past where we need to go, but since
+ * we'll pre-decrement throughout, this will be fine.
+ */
+ add r0, r0, r2
+ add r1, r1, r2
+ cmp r2, #4
+ bgt .Lneon_b2f_gt4
+ cmp r2, #0
+.Lneon_b2f_smallcopy_loop:
+ beq .Lneon_memmove_done
+ ldrb r12, [r1, #-1]!
+ subs r2, r2, #1
+ strb r12, [r0, #-1]!
+ b .Lneon_b2f_smallcopy_loop
+.Lneon_b2f_gt4:
+ /*
+ * The overlap window size (dest - src) goes in r3; the routing
+ * below uses min(copy size, window size), computed into r12.
+ */
+ sub r3, r0, r1
+ /*
+ * #############################################################
+ * Back to Front copy -
+ */
+ cmp r2, r3
+ movle r12, r2
+ movgt r12, r3
+ cmp r12, #256
+ bge .Lneon_b2f_copy_128
+ cmp r12, #64
+ bge .Lneon_b2f_copy_32
+ cmp r12, #8
+ bge .Lneon_b2f_copy_8
+ cmp r12, #4
+ bge .Lneon_b2f_copy_4
+ b .Lneon_b2f_copy_1
+ nop
+.Lneon_b2f_copy_128:
+ movs r12, r2, lsr #7
+ cmp r12, #PLDOFFS
+ ble .Lneon_b2f_copy_128_loop_nopld
+ sub r12, #PLDOFFS
+ pld [r1, #-(PLDOFFS-1)*PLDSIZE]
+.Lneon_b2f_copy_128_loop_outer:
+ pld [r1, #-(PLDOFFS*PLDSIZE)]
+ sub r1, r1, #128
+ sub r0, r0, #128
+ vld1.32 {q0, q1}, [r1]!
+ vld1.32 {q2, q3}, [r1]!
+ vld1.32 {q8, q9}, [r1]!
+ vld1.32 {q10, q11}, [r1]!
+ subs r12, r12, #1
+ vst1.32 {q0, q1}, [r0]!
+ vst1.32 {q2, q3}, [r0]!
+ vst1.32 {q8, q9}, [r0]!
+ vst1.32 {q10, q11}, [r0]!
+ sub r1, r1, #128
+ sub r0, r0, #128
+ bne .Lneon_b2f_copy_128_loop_outer
+ mov r12, #PLDOFFS
+.Lneon_b2f_copy_128_loop_nopld:
+ sub r1, r1, #128
+ sub r0, r0, #128
+ vld1.32 {q0, q1}, [r1]!
+ vld1.32 {q2, q3}, [r1]!
+ vld1.32 {q8, q9}, [r1]!
+ vld1.32 {q10, q11}, [r1]!
+ subs r12, r12, #1
+ vst1.32 {q0, q1}, [r0]!
+ vst1.32 {q2, q3}, [r0]!
+ vst1.32 {q8, q9}, [r0]!
+ vst1.32 {q10, q11}, [r0]!
+ sub r1, r1, #128
+ sub r0, r0, #128
+ bne .Lneon_b2f_copy_128_loop_nopld
+ ands r2, r2, #0x7f
+ beq .Lneon_memmove_done
+ cmp r2, #32
+ bge .Lneon_b2f_copy_32
+ b .Lneon_b2f_copy_finish
+.Lneon_b2f_copy_32:
+ mov r12, r2, lsr #5
+.Lneon_b2f_copy_32_loop:
+ sub r1, r1, #32
+ sub r0, r0, #32
+ vld1.32 {q0,q1}, [r1]
+ subs r12, r12, #1
+ vst1.32 {q0,q1}, [r0]
+ bne .Lneon_b2f_copy_32_loop
+ ands r2, r2, #0x1f
+ beq .Lneon_memmove_done
+.Lneon_b2f_copy_finish:
+.Lneon_b2f_copy_8:
+ movs r12, r2, lsr #0x3
+ beq .Lneon_b2f_copy_4
+.Lneon_b2f_copy_8_loop:
+ sub r1, r1, #8
+ sub r0, r0, #8
+ vld1.32 {d0}, [r1]
+ subs r12, r12, #1
+ vst1.32 {d0}, [r0]
+ bne .Lneon_b2f_copy_8_loop
+ ands r2, r2, #0x7
+ beq .Lneon_memmove_done
+.Lneon_b2f_copy_4:
+ movs r12, r2, lsr #0x2
+ beq .Lneon_b2f_copy_1
+.Lneon_b2f_copy_4_loop:
+ ldr r3, [r1, #-4]!
+ subs r12, r12, #1
+ str r3, [r0, #-4]!
+ bne .Lneon_b2f_copy_4_loop
+ ands r2, r2, #0x3
+ nop
+.Lneon_b2f_copy_1:
+ cmp r2, #0
+ beq .Lneon_memmove_done
+.Lneon_b2f_copy_1_loop:
+ ldrb r12, [r1, #-1]!
+ subs r2, r2, #1
+ strb r12, [r0, #-1]!
+ bne .Lneon_b2f_copy_1_loop
+
+.Lneon_memmove_done:
+ pop {r0}
+ bx lr
+
+ .end
+#endif /* SCORPION_NEON_OPTIMIZATION */
+
diff --git a/libc/arch-arm/bionic/memset.S b/libc/arch-arm/bionic/memset.S
index 93abe15a2..69abd4bdf 100644
--- a/libc/arch-arm/bionic/memset.S
+++ b/libc/arch-arm/bionic/memset.S
@@ -2,6 +2,8 @@
* Copyright (C) 2008 The Android Open Source Project
* All rights reserved.
*
+ * Copyright (c) 2009-2011, Code Aurora Forum. All rights reserved.
+ *
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions
* are met:
@@ -35,6 +37,90 @@
.align
+
+#if defined(SCORPION_NEON_OPTIMIZATION)
+ .code 32
+ .align 8
+ .global memset
+ .type memset, %function
+
+ .global bzero
+ .type bzero, %function
+
+bzero:
+ mov r2, r1
+ mov r1, #0
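+ /*
+ * bzero(b, n) is memset(b, 0, n): n moves into r2 and the fill byte
+ * is zero; fall through into memset.
+ */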
+memset:
+ push {r0}
+
+ cmp r2, #6
+ bgt .Lmemset_gt6
+ cmp r2, #0
+ beq .Lmemset_smallcopy_done
+.Lmemset_smallcopy_loop:
+ strb r1, [r0], #1
+ subs r2, r2, #1
+ bne .Lmemset_smallcopy_loop
+.Lmemset_smallcopy_done:
+ pop {r0}
+ bx lr
+
+.Lmemset_gt6:
+ vdup.8 q0, r1
+ vmov r1, s0
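+ /*
+ * q0 now holds the fill byte replicated across all 16 lanes; s0 is
+ * the low word of q0, so r1 becomes the byte replicated across
+ * 32 bits for the scalar stores in the tail paths.
+ */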
+
+ /*
+ * Decide where to route for the maximum copy sizes.
+ */
+ cmp r2, #4
+ blt .Lmemset_lt4
+ cmp r2, #16
+ blt .Lmemset_lt16
+ vmov q1, q0
+ cmp r2, #128
+ blt .Lmemset_32
+.Lmemset_128:
+ mov r12, r2, lsr #7
+.Lmemset_128_loop:
+ vst1.32 {q0, q1}, [r0]!
+ vst1.32 {q0, q1}, [r0]!
+ vst1.32 {q0, q1}, [r0]!
+ vst1.32 {q0, q1}, [r0]!
+ subs r12, r12, #1
+ bne .Lmemset_128_loop
+ ands r2, r2, #0x7f
+ beq .Lmemset_end
+.Lmemset_32:
+ movs r12, r2, lsr #5
+ beq .Lmemset_lt32
+.Lmemset_32_loop:
+ subs r12, r12, #1
+ vst1.32 {q0, q1}, [r0]!
+ bne .Lmemset_32_loop
+ ands r2, r2, #0x1f
+ beq .Lmemset_end
+.Lmemset_lt32:
+ cmp r2, #16
+ blt .Lmemset_lt16
+ vst1.64 {q0}, [r0]!
+ subs r2, r2, #16
+ beq .Lmemset_end
+.Lmemset_lt16:
+ movs r12, r2, lsl #29
+ strcs r1, [r0], #4
+ strcs r1, [r0], #4
+ strmi r1, [r0], #4
+.Lmemset_lt4:
+ movs r2, r2, lsl #31
+ strcsh r1, [r0], #2
+ strmib r1, [r0]
+.Lmemset_end:
+ pop {r0}
+ bx lr
+
+ .end
+#else /* !SCORPION_NEON_OPTIMIZATION */
+
/*
* Optimized memset() for ARM.
*
@@ -115,3 +201,4 @@ memset:
bx lr
.fnend
+#endif /* SCORPION_NEON_OPTIMIZATION */
diff --git a/libc/arch-arm/bionic/strlen-armv7.S b/libc/arch-arm/bionic/strlen-armv7.S
new file mode 100644
index 000000000..125e92fb8
--- /dev/null
+++ b/libc/arch-arm/bionic/strlen-armv7.S
@@ -0,0 +1,111 @@
+/* Copyright (c) 2010-2011, Linaro Limited
+ All rights reserved.
+
+ Redistribution and use in source and binary forms, with or without
+ modification, are permitted provided that the following conditions
+ are met:
+
+ * Redistributions of source code must retain the above copyright
+ notice, this list of conditions and the following disclaimer.
+
+ * Redistributions in binary form must reproduce the above copyright
+ notice, this list of conditions and the following disclaimer in the
+ documentation and/or other materials provided with the distribution.
+
+ * Neither the name of Linaro Limited nor the names of its
+ contributors may be used to endorse or promote products derived
+ from this software without specific prior written permission.
+
+ THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+ "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+ LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+ A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+ HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+ SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+ LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+ DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+ THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+
+ Written by Dave Gilbert <david.gilbert@linaro.org>
+
+ This strlen routine is optimised on a Cortex-A9 and should work on
+ all ARMv7 processors. This routine is reasonably fast for short
+ strings, but is probably slower than a simple implementation if all
+ your strings are very short */
+
+@ 2011-02-08 david.gilbert@linaro.org
+@ Extracted from local git 6848613a
+
+
+@ this lets us check a flag in a 00/ff byte easily in either endianness
+#ifdef __ARMEB__
+#define CHARTSTMASK(c) 1<<(31-(c*8))
+#else
+#define CHARTSTMASK(c) 1<<(c*8)
+#endif
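+@ e.g. little-endian: CHARTSTMASK(0) == 0x00000001, CHARTSTMASK(1) == 0x00000100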
+
+@-----------------------------------------------------------------------------------------------------------------------------
+ .syntax unified
+ .arch armv7-a
+
+ .thumb_func
+ .align 2
+ .p2align 4,,15
+ .global strlen
+ .type strlen,%function
+strlen:
+ @ r0 = string
+ @ returns count of bytes in string not including terminator
+ mov r1, r0
+ push { r4,r6 }
+ mvns r6, #0 @ all F
+ movs r4, #0
+ tst r0, #7
+ beq 2f
+
+1:
+ ldrb r2, [r1], #1
+ tst r1, #7 @ Hit alignment yet?
+ cbz r2, 10f @ Exit if we found the 0
+ bne 1b
+
+ @ So we're now aligned
+2:
+ ldmia r1!,{r2,r3}
+ uadd8 r2, r2, r6 @ Parallel add 0xff - sets the GE bits for anything that wasn't 0
+ sel r2, r4, r6 @ bytes are 00 for non-00 bytes, or ff for 00 bytes - NOTE INVERSION
+ uadd8 r3, r3, r6 @ Parallel add 0xff - sets the GE bits for anything that wasn't 0
+ sel r3, r2, r6 @ bytes are 00 for non-00 bytes, or ff for 00 bytes - NOTE INVERSION
+ cmp r3, #0
+ beq 2b
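+@ Worked example (little-endian): loading the word 0x00636261 ("abc\0"),
+@ uadd8 leaves GE clear only for the zero byte, so sel produces 0xff000000
+@ and the loop falls through with a nonzero mask.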
+
+strlenendtmp:
+ @ One (or more) of the bytes we loaded was 0 - but which one?
+ @ r2 has the mask corresponding to the first loaded word
+ @ r3 has a combined mask of the two words - but if r2 was all non-0
+ @ then it's just the 2nd word's mask
+ cmp r2, #0
+ itte eq
+ moveq r2, r3 @ the end is in the 2nd word
+ subeq r1,r1,#3
+ subne r1,r1,#7
+
+ @ r1 currently points to the 2nd byte of the word containing the 0
+ tst r2, # CHARTSTMASK(0) @ 1st character
+ bne 10f
+ adds r1,r1,#1
+ tst r2, # CHARTSTMASK(1) @ 2nd character
+ ittt eq
+ addeq r1,r1,#1
+ tsteq r2, # (3<<15) @ 2nd & 3rd character
+ @ If not the 3rd must be the last one
+ addeq r1,r1,#1
+
+10:
+ @ r0 is still at the beginning, r1 is pointing 1 byte after the terminator
+ sub r0, r1, r0
+ subs r0, r0, #1
+ pop { r4, r6 }
+ bx lr