Diffstat (limited to 'libc/arch-arm')
-rw-r--r-- | libc/arch-arm/bionic/arm_memcpy.S | 123
-rw-r--r-- | libc/arch-arm/bionic/atexit.S | 69
-rw-r--r-- | libc/arch-arm/bionic/crtbegin_dynamic.S | 1
-rw-r--r-- | libc/arch-arm/bionic/crtbegin_so.S | 6
-rw-r--r-- | libc/arch-arm/bionic/crtbegin_static.S | 1
-rw-r--r-- | libc/arch-arm/bionic/ffs.S | 35
-rw-r--r-- | libc/arch-arm/bionic/memcmp.S | 81
-rw-r--r-- | libc/arch-arm/bionic/memcpy.S | 126
-rw-r--r-- | libc/arch-arm/bionic/memmove.S | 356
-rw-r--r-- | libc/arch-arm/bionic/memset.S | 87
-rw-r--r-- | libc/arch-arm/bionic/strlen-armv7.S | 111
11 files changed, 950 insertions, 46 deletions
diff --git a/libc/arch-arm/bionic/arm_memcpy.S b/libc/arch-arm/bionic/arm_memcpy.S new file mode 100644 index 000000000..ae1cf1ad1 --- /dev/null +++ b/libc/arch-arm/bionic/arm_memcpy.S @@ -0,0 +1,123 @@ +/* + * Copyright (C) 2011 Texas Instruments + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * * Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * * Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in + * the documentation and/or other materials provided with the + * distribution. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS + * FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE + * COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, + * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, + * BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS + * OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED + * AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, + * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT + * OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. + */ + + .text + .fpu neon + .code 32 + .align 4 + + +/* r0 - dest */ +/* r1 - src */ +/* r2 - length */ + .global memcpy +memcpy: + .fnstart +#if defined TARGET_BOARD_PLATFORM == omap4 +#define CACHE_LINE_SIZE 32 +#else +#define CACHE_LINE_SIZE 64 +#endif + CMP r2,#3 + BLS _BMLIB_memcpy_lastbytes + ANDS r12,r0,#3 + BEQ l1; + LDRB r3,[r1],#1 + CMP r12,#2 + ADD r2,r2,r12 + LDRLSB r12, [r1], #1 + STRB r3,[r0],#1 + LDRCCB r3,[r1],#1 + STRLSB r12,[r0],#1 + SUB r2,r2,#4 + STRCCB r3,[r0],#1 +l1: + ANDS r3,r1,#3 + BEQ _BMLIB_aeabi_memcpy4 +l3: + SUBS r2,r2,#8 + BCC l2 + LDR r3,[r1],#4 + LDR r12,[r1],#4 + STR r3,[r0],#4 + STR r12,[r0],#4 + B l3 +l2: + ADDS r2,r2,#4 + LDRPL r3,[r1],#4 + STRPL r3,[r0],#4 + MOV r0,r0 +_BMLIB_memcpy_lastbytes: + LSLS r2,r2,#31 + LDRCSB r3,[r1],#1 + LDRCSB r12,[r1],#1 + LDRMIB r2,[r1],#1 + STRCSB r3,[r0],#1 + STRCSB r12,[r0],#1 + STRMIB r2,[r0],#1 + BX lr + +_BMLIB_aeabi_memcpy4: + PUSH {r4-r8,lr} + SUBS r2,r2,#0x20 + BCC l4 + DSB +#ifndef NOPLD + PLD [r1, #0] + PLD [r1, #(CACHE_LINE_SIZE*1)] + PLD [r1, #(CACHE_LINE_SIZE*2)] + PLD [r1, #(CACHE_LINE_SIZE*3)] + PLD [r1, #(CACHE_LINE_SIZE*4)] +#endif +l5: +#ifndef NOPLD + PLD [r1, #(CACHE_LINE_SIZE*5)] +#endif + LDMCS r1!,{r3-r8,r12,lr} + STMCS r0!,{r3-r8,r12,lr} + SUBS r2,r2,#0x20 + BCS l5 +l4: + LSLS r12,r2,#28 + LDMCS r1!,{r3,r4,r12,lr} + STMCS r0!,{r3,r4,r12,lr} + LDMMI r1!,{r3,r4} + STMMI r0!,{r3,r4} + POP {r4-r8,lr} + LSLS r12,r2,#30 + LDRCS r3,[r1],#4 + STRCS r3,[r0],#4 + BXEQ lr +_BMLIB_memcpy_lastbytes_aligned: + LSLS r2,r2,#31 + LDRCSH r3,[r1],#2 + LDRMIB r2,[r1],#1 + STRCSH r3,[r0],#2 + STRMIB r2,[r0],#1 + BX lr + .fnend diff --git a/libc/arch-arm/bionic/atexit.S b/libc/arch-arm/bionic/atexit.S new file mode 100644 index 000000000..aa1e18d61 --- /dev/null +++ b/libc/arch-arm/bionic/atexit.S @@ -0,0 +1,69 @@ +/* + * Copyright (C) 2011 The Android Open Source Project + * All rights reserved. 
+ * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * * Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * * Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in + * the documentation and/or other materials provided with the + * distribution. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS + * FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE + * COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, + * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, + * BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS + * OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED + * AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, + * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT + * OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. + */ + +#ifndef CRT_LEGACY_WORKAROUND + .arch armv5te + .fpu softvfp + .eabi_attribute 20, 1 + .eabi_attribute 21, 1 + .eabi_attribute 23, 3 + .eabi_attribute 24, 1 + .eabi_attribute 25, 1 + .eabi_attribute 26, 2 + .eabi_attribute 30, 4 + .eabi_attribute 18, 4 + .code 16 + .section .text.atexit,"ax",%progbits + .align 2 + .global atexit + .hidden atexit + .code 16 + .thumb_func + .type atexit, %function +atexit: + .fnstart +.LFB0: + .save {r4, lr} + push {r4, lr} +.LCFI0: + ldr r3, .L3 + mov r1, #0 + @ sp needed for prologue +.LPIC0: + add r3, pc + ldr r2, [r3] + bl __cxa_atexit + pop {r4, pc} +.L4: + .align 2 +.L3: + .word __dso_handle-(.LPIC0+4) +.LFE0: + .fnend + .size atexit, .-atexit +#endif diff --git a/libc/arch-arm/bionic/crtbegin_dynamic.S b/libc/arch-arm/bionic/crtbegin_dynamic.S index d18e715f5..099908444 100644 --- a/libc/arch-arm/bionic/crtbegin_dynamic.S +++ b/libc/arch-arm/bionic/crtbegin_dynamic.S @@ -85,3 +85,4 @@ __CTOR_LIST__: .long -1 #include "__dso_handle.S" +#include "atexit.S" diff --git a/libc/arch-arm/bionic/crtbegin_so.S b/libc/arch-arm/bionic/crtbegin_so.S index bb6b3e2c3..9275b1e01 100644 --- a/libc/arch-arm/bionic/crtbegin_so.S +++ b/libc/arch-arm/bionic/crtbegin_so.S @@ -52,4 +52,10 @@ __FINI_ARRAY__: .long -1 .long __on_dlclose +#ifdef CRT_LEGACY_WORKAROUND #include "__dso_handle.S" +#else +#include "__dso_handle_so.S" +#endif + +#include "atexit.S" diff --git a/libc/arch-arm/bionic/crtbegin_static.S b/libc/arch-arm/bionic/crtbegin_static.S index 6f9cf25dd..13b05b272 100644 --- a/libc/arch-arm/bionic/crtbegin_static.S +++ b/libc/arch-arm/bionic/crtbegin_static.S @@ -86,3 +86,4 @@ __CTOR_LIST__: #include "__dso_handle.S" +#include "atexit.S" diff --git a/libc/arch-arm/bionic/ffs.S b/libc/arch-arm/bionic/ffs.S index f11141c97..052b46a53 100644 --- a/libc/arch-arm/bionic/ffs.S +++ b/libc/arch-arm/bionic/ffs.S @@ -36,47 +36,14 @@ * 6 bits as an index into the table. This algorithm should be a win * over the checking each bit in turn as per the C compiled version. * - * under ARMv5 there's an instruction called CLZ (count leading Zero's) that - * could be used - * - * This is the ffs algorithm devised by d.seal and posted to comp.sys.arm on - * 16 Feb 1994. 
+ * since ARMv5 there's an instruction called CLZ (count leading Zero's) */ ENTRY(ffs) /* Standard trick to isolate bottom bit in r0 or 0 if r0 = 0 on entry */ rsb r1, r0, #0 ands r0, r0, r1 -#ifndef __ARM_ARCH_5__ - /* - * now r0 has at most one set bit, call this X - * if X = 0, all further instructions are skipped - */ - adrne r2, .L_ffs_table - orrne r0, r0, r0, lsl #4 /* r0 = X * 0x11 */ - orrne r0, r0, r0, lsl #6 /* r0 = X * 0x451 */ - rsbne r0, r0, r0, lsl #16 /* r0 = X * 0x0450fbaf */ - - /* now lookup in table indexed on top 6 bits of r0 */ - ldrneb r0, [ r2, r0, lsr #26 ] - - bx lr - -.text; -.type .L_ffs_table, _ASM_TYPE_OBJECT; -.L_ffs_table: -/* 0 1 2 3 4 5 6 7 */ - .byte 0, 1, 2, 13, 3, 7, 0, 14 /* 0- 7 */ - .byte 4, 0, 8, 0, 0, 0, 0, 15 /* 8-15 */ - .byte 11, 5, 0, 0, 9, 0, 0, 26 /* 16-23 */ - .byte 0, 0, 0, 0, 0, 22, 28, 16 /* 24-31 */ - .byte 32, 12, 6, 0, 0, 0, 0, 0 /* 32-39 */ - .byte 10, 0, 0, 25, 0, 0, 21, 27 /* 40-47 */ - .byte 31, 0, 0, 0, 0, 24, 0, 20 /* 48-55 */ - .byte 30, 0, 23, 19, 29, 18, 17, 0 /* 56-63 */ -#else clzne r0, r0 rsbne r0, r0, #32 bx lr -#endif diff --git a/libc/arch-arm/bionic/memcmp.S b/libc/arch-arm/bionic/memcmp.S index 67dcddc1b..0fe26996c 100644 --- a/libc/arch-arm/bionic/memcmp.S +++ b/libc/arch-arm/bionic/memcmp.S @@ -43,36 +43,70 @@ * (2) The loads are scheduled in a way they won't stall */ +#if __ARM_ARCH__ >= 7 +#define __ARM_CORTEX + +#if defined(CORTEX_CACHE_LINE_32) +#define CACHE_LINE_SIZE 32 +#else +#define CACHE_LINE_SIZE 64 +#endif + +#endif /* __ARM_ARCH__ */ + + memcmp: .fnstart + +#if defined(__ARM_CORTEX) + pld [r0, #(CACHE_LINE_SIZE * 0)] + pld [r0, #(CACHE_LINE_SIZE * 1)] +#else + PLD (r0, #0) PLD (r1, #0) - +#endif /* take of the case where length is 0 or the buffers are the same */ cmp r0, r1 +#if !defined(__ARM_CORTEX) cmpne r2, #0 +#endif moveq r0, #0 bxeq lr +#if defined(__ARM_CORTEX) + pld [r1, #(CACHE_LINE_SIZE * 0)] + pld [r1, #(CACHE_LINE_SIZE * 1)] + + /* make sure we have at least 8+4 bytes, this simplify things below + * and avoid some overhead for small blocks + */ + cmp r2, #(8+4) + bmi 10f +#endif /* __ARM_CORTEX */ + + .save {r4, lr} /* save registers */ stmfd sp!, {r4, lr} - +#if !defined(__ARM_CORTEX) PLD (r0, #32) PLD (r1, #32) +#endif /* since r0 hold the result, move the first source * pointer somewhere else */ mov r4, r0 - + +#if !defined(__ARM_CORTEX) /* make sure we have at least 8+4 bytes, this simplify things below * and avoid some overhead for small blocks */ cmp r2, #(8+4) bmi 8f - +#endif /* align first pointer to word boundary * offset = -src & 3 */ @@ -109,8 +143,14 @@ memcmp: subs r2, r2, #(32 + 4) bmi 1f -0: PLD (r4, #64) +0: +#if defined(__ARM_CORTEX) + pld [r4, #(CACHE_LINE_SIZE * 2)] + pld [r1, #(CACHE_LINE_SIZE * 2)] +#else + PLD (r4, #64) PLD (r1, #64) +#endif ldr r0, [r4], #4 ldr lr, [r1, #4]! 
eors r0, r0, ip @@ -176,6 +216,21 @@ memcmp: 9: /* restore registers and return */ ldmfd sp!, {r4, lr} bx lr +#if defined(__ARM_CORTEX) +10: /* process less than 12 bytes */ + cmp r2, #0 + moveq r0, #0 + bxeq lr + mov r3, r0 +11: + ldrb r0, [r3], #1 + ldrb ip, [r1], #1 + subs r0, ip + bxne lr + subs r2, r2, #1 + bne 11b + bx lr +#endif /* __ARM_CORTEX */ .fnend @@ -198,8 +253,14 @@ memcmp: bic r1, r1, #3 ldr lr, [r1], #4 -6: PLD (r1, #64) +6: +#if defined(__ARM_CORTEX) + pld [r1, #(CACHE_LINE_SIZE * 2)] + pld [r4, #(CACHE_LINE_SIZE * 2)] +#else + PLD (r1, #64) PLD (r4, #64) +#endif mov ip, lr, lsr #16 ldr lr, [r1], #4 ldr r0, [r4], #4 @@ -240,13 +301,13 @@ memcmp: 4: /*************** offset is 1 or 3 (less optimized) ***************/ - stmfd sp!, {r5, r6, r7} + stmfd sp!, {r5, r6, r7} // r5 = rhs // r6 = lhs // r7 = scratch - mov r5, r0, lsl #3 /* r5 = right shift */ + mov r5, r0, lsl #3 /* r5 = right shift */ rsb r6, r5, #32 /* r6 = left shift */ /* align the unaligned pointer */ @@ -269,7 +330,7 @@ memcmp: bhs 6b sub r1, r1, r6, lsr #3 - ldmfd sp!, {r5, r6, r7} + ldmfd sp!, {r5, r6, r7} /* are we done? */ adds r2, r2, #8 @@ -284,5 +345,5 @@ memcmp: sub r1, r1, r6, lsr #3 sub r4, r4, #4 mov r2, #4 - ldmfd sp!, {r5, r6, r7} + ldmfd sp!, {r5, r6, r7} b 8b diff --git a/libc/arch-arm/bionic/memcpy.S b/libc/arch-arm/bionic/memcpy.S index ba55996ec..f2a4e3328 100644 --- a/libc/arch-arm/bionic/memcpy.S +++ b/libc/arch-arm/bionic/memcpy.S @@ -2,6 +2,8 @@ * Copyright (C) 2008 The Android Open Source Project * All rights reserved. * + * Copyright (c) 2009-2011, Code Aurora Forum. All rights reserved. + * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: @@ -29,7 +31,114 @@ #include <machine/cpu-features.h> #if defined(__ARM_NEON__) - +#if defined(SCORPION_NEON_OPTIMIZATION) + /* + * These can be overridden in: + * device/<vendor>/<board>/BoardConfig.mk + * by setting the following: + * TARGET_USE_SCORPION_BIONIC_OPTIMIZATION := true + * TARGET_USE_SCORPION_PLD_SET := true + * TARGET_SCORPION_BIONIC_PLDOFFS := <pldoffset> + * TARGET_SCORPION_BIONIC_PLDSIZE := <pldsize> + */ +#ifndef PLDOFFS +#define PLDOFFS (6) +#endif +#ifndef PLDSIZE +#define PLDSIZE (128) /* L2 cache line size */ +#endif + .code 32 + .align 5 + .globl memcpy + .func +memcpy: + push {r0} + cmp r2, #4 + blt .Lneon_lt4 + cmp r2, #16 + blt .Lneon_lt16 + cmp r2, #32 + blt .Lneon_16 + cmp r2, #128 + blt .Lneon_copy_32_a + /* Copy blocks of 128-bytes (word-aligned) at a time*/ + /* Code below is optimized for PLDSIZE=128 only */ + mov r12, r2, lsr #7 + cmp r12, #PLDOFFS + ble .Lneon_copy_128_loop_nopld + sub r12, #PLDOFFS + pld [r1, #(PLDOFFS-1)*PLDSIZE] +.Lneon_copy_128_loop_outer: + pld [r1, #(PLDOFFS*PLDSIZE)] + vld1.32 {q0, q1}, [r1]! + vld1.32 {q2, q3}, [r1]! + vld1.32 {q8, q9}, [r1]! + vld1.32 {q10, q11}, [r1]! + subs r12, r12, #1 + vst1.32 {q0, q1}, [r0]! + vst1.32 {q2, q3}, [r0]! + vst1.32 {q8, q9}, [r0]! + vst1.32 {q10, q11}, [r0]! + bne .Lneon_copy_128_loop_outer + mov r12, #PLDOFFS +.Lneon_copy_128_loop_nopld: + vld1.32 {q0, q1}, [r1]! + vld1.32 {q2, q3}, [r1]! + vld1.32 {q8, q9}, [r1]! + vld1.32 {q10, q11}, [r1]! + subs r12, r12, #1 + vst1.32 {q0, q1}, [r0]! + vst1.32 {q2, q3}, [r0]! + vst1.32 {q8, q9}, [r0]! + vst1.32 {q10, q11}, [r0]! 
+ bne .Lneon_copy_128_loop_nopld + ands r2, r2, #0x7f + beq .Lneon_exit + cmp r2, #32 + blt .Lneon_16 + nop + /* Copy blocks of 32-bytes (word aligned) at a time*/ +.Lneon_copy_32_a: + mov r12, r2, lsr #5 +.Lneon_copy_32_loop_a: + vld1.32 {q0,q1}, [r1]! + subs r12, r12, #1 + vst1.32 {q0,q1}, [r0]! + bne .Lneon_copy_32_loop_a + ands r2, r2, #0x1f + beq .Lneon_exit +.Lneon_16: + subs r2, r2, #16 + blt .Lneon_lt16 + vld1.32 {q8}, [r1]! + vst1.32 {q8}, [r0]! + beq .Lneon_exit +.Lneon_lt16: + movs r12, r2, lsl #29 + bcc .Lneon_skip8 + ldr r3, [r1], #4 + ldr r12, [r1], #4 + str r3, [r0], #4 + str r12, [r0], #4 +.Lneon_skip8: + bpl .Lneon_lt4 + ldr r3, [r1], #4 + str r3, [r0], #4 +.Lneon_lt4: + movs r2, r2, lsl #31 + bcc .Lneon_lt2 + ldrh r3, [r1], #2 + strh r3, [r0], #2 +.Lneon_lt2: + bpl .Lneon_exit + ldrb r12, [r1] + strb r12, [r0] +.Lneon_exit: + pop {r0} + bx lr + .endfunc + .end +#else /* !SCORPION_NEON_OPTIMIZATION */ .text .fpu neon @@ -141,11 +250,14 @@ memcpy: strcsb ip, [r0], #1 strcsb lr, [r0], #1 +.ifdef NEEDS_ARM_ERRATA_754319_754320_ASM + VMOV s0,s0 @ NOP for ARM Errata +.endif ldmfd sp!, {r0, lr} bx lr .fnend - +#endif /* !SCORPION_NEON_OPTIMIZATION */ #else /* __ARM_ARCH__ < 7 */ @@ -260,20 +372,30 @@ cached_aligned32: * */ +#if __ARM_ARCH__ == 5 // Align the preload register to a cache-line because the cpu does // "critical word first" (the first word requested is loaded first). bic r12, r1, #0x1F add r12, r12, #64 +#endif 1: ldmia r1!, { r4-r11 } + +#if __ARM_ARCH__ == 5 PLD (r12, #64) +#else + PLD (r1, #64) +#endif subs r2, r2, #32 +#if __ARM_ARCH__ == 5 // NOTE: if r12 is more than 64 ahead of r1, the following ldrhi // for ARM9 preload will not be safely guarded by the preceding subs. // When it is safely guarded the only possibility to have SIGSEGV here // is because the caller overstates the length. ldrhi r3, [r12], #32 /* cheap ARM9 preload */ +#endif + stmia r0!, { r4-r11 } bhs 1b diff --git a/libc/arch-arm/bionic/memmove.S b/libc/arch-arm/bionic/memmove.S new file mode 100644 index 000000000..123419584 --- /dev/null +++ b/libc/arch-arm/bionic/memmove.S @@ -0,0 +1,356 @@ +/*************************************************************************** + Copyright (c) 2009-2011 Code Aurora Forum. All rights reserved. + + Redistribution and use in source and binary forms, with or without + modification, are permitted provided that the following conditions are met: + * Redistributions of source code must retain the above copyright + notice, this list of conditions and the following disclaimer. + * Redistributions in binary form must reproduce the above copyright + notice, this list of conditions and the following disclaimer in the + documentation and/or other materials provided with the distribution. + * Neither the name of Code Aurora nor the names of its contributors may + be used to endorse or promote products derived from this software + without specific prior written permission. + + THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" + AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + ARE DISCLAIMED. 
IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE + LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR + CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF + SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS + INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN + CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) + ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE + POSSIBILITY OF SUCH DAMAGE. + ***************************************************************************/ + +/*************************************************************************** + * Neon memmove: Attempts to do a memmove with Neon registers if possible, + * Inputs: + * dest: The destination buffer + * src: The source buffer + * n: The size of the buffer to transfer + * Outputs: + * + ***************************************************************************/ + +#include <machine/cpu-features.h> + +#if defined(SCORPION_NEON_OPTIMIZATION) + /* + * These can be overridden in: + * device/<vendor>/<board>/BoardConfig.mk + * by setting the following: + * TARGET_USE_SCORPION_BIONIC_OPTIMIZATION := true + * TARGET_USE_SCORPION_PLD_SET := true + * TARGET_SCORPION_BIONIC_PLDOFFS := <pldoffset> + * TARGET_SCORPION_BIONIC_PLDSIZE := <pldsize> + */ +#ifndef PLDOFFS +#define PLDOFFS (6) +#endif +#ifndef PLDSIZE +#define PLDSIZE (128) /* L2 cache line size */ +#endif + + .code 32 + .align 5 + .global memmove + .type memmove, %function + + .global bcopy + .type bcopy, %function + +bcopy: + mov r12, r0 + mov r0, r1 + mov r1, r12 +memmove: + push {r0} + + /* + * The requirements for memmove state that the function should + * operate as if data were being copied from the source to a + * buffer, then to the destination. This is to allow a user + * to copy data from a source and target that overlap. + * + * We can't just do byte copies front-to-back automatically, since + * there's a good chance we may have an overlap (why else would someone + * intentionally use memmove then?). + * + * We'll break this into two parts. Front-to-back, or back-to-front + * copies. + */ +.Lneon_memmove_cmf: + cmp r0, r1 + blt .Lneon_front_to_back_copy + bgt .Lneon_back_to_front_copy + b .Lneon_memmove_done + + /* ############################################################# + * Front to Back copy + */ +.Lneon_front_to_back_copy: + /* + * For small copies, just do a quick memcpy. We can do this for + * front-to-back copies, aligned or unaligned, since we're only + * doing 1 byte at a time... + */ + cmp r2, #4 + bgt .Lneon_f2b_gt4 + cmp r2, #0 +.Lneon_f2b_smallcopy_loop: + beq .Lneon_memmove_done + ldrb r12, [r1], #1 + subs r2, r2, #1 + strb r12, [r0], #1 + b .Lneon_f2b_smallcopy_loop +.Lneon_f2b_gt4: + /* The window size is in r3. */ + sub r3, r1, r0 + /* ############################################################# + * Front to Back copy + */ + /* + * Note that we can't just route based on the size in r2. If that's + * larger than the overlap window in r3, we could potentially + * (and likely!) destroy data we're copying. 
+ */ + cmp r2, r3 + movle r12, r2 + movgt r12, r3 + cmp r12, #256 + bge .Lneon_f2b_copy_128 + cmp r12, #64 + bge .Lneon_f2b_copy_32 + cmp r12, #16 + bge .Lneon_f2b_copy_16 + cmp r12, #8 + bge .Lneon_f2b_copy_8 + cmp r12, #4 + bge .Lneon_f2b_copy_4 + b .Lneon_f2b_copy_1 + nop +.Lneon_f2b_copy_128: + mov r12, r2, lsr #7 + cmp r12, #PLDOFFS + ble .Lneon_f2b_copy_128_loop_nopld + sub r12, #PLDOFFS + pld [r1, #(PLDOFFS-1)*PLDSIZE] +.Lneon_f2b_copy_128_loop_outer: + pld [r1, #(PLDOFFS*PLDSIZE)] + vld1.32 {q0,q1}, [r1]! + vld1.32 {q2,q3}, [r1]! + vld1.32 {q8,q9}, [r1]! + vld1.32 {q10,q11}, [r1]! + subs r12, r12, #1 + vst1.32 {q0,q1}, [r0]! + vst1.32 {q2,q3}, [r0]! + vst1.32 {q8,q9}, [r0]! + vst1.32 {q10,q11}, [r0]! + bne .Lneon_f2b_copy_128_loop_outer + mov r12, #PLDOFFS +.Lneon_f2b_copy_128_loop_nopld: + vld1.32 {q0,q1}, [r1]! + vld1.32 {q2,q3}, [r1]! + vld1.32 {q8,q9}, [r1]! + vld1.32 {q10,q11}, [r1]! + subs r12, r12, #1 + vst1.32 {q0,q1}, [r0]! + vst1.32 {q2,q3}, [r0]! + vst1.32 {q8,q9}, [r0]! + vst1.32 {q10,q11}, [r0]! + bne .Lneon_f2b_copy_128_loop_nopld + ands r2, r2, #0x7f + beq .Lneon_memmove_done + cmp r2, #32 + bge .Lneon_f2b_copy_32 + b .Lneon_f2b_copy_finish +.Lneon_f2b_copy_32: + mov r12, r2, lsr #5 +.Lneon_f2b_copy_32_loop: + vld1.32 {q0,q1}, [r1]! + subs r12, r12, #1 + vst1.32 {q0,q1}, [r0]! + bne .Lneon_f2b_copy_32_loop + ands r2, r2, #0x1f + beq .Lneon_memmove_done +.Lneon_f2b_copy_finish: +.Lneon_f2b_copy_16: + movs r12, r2, lsr #4 + beq .Lneon_f2b_copy_8 +.Lneon_f2b_copy_16_loop: + vld1.32 {q0}, [r1]! + subs r12, r12, #1 + vst1.32 {q0}, [r0]! + bne .Lneon_f2b_copy_16_loop + ands r2, r2, #0xf + beq .Lneon_memmove_done +.Lneon_f2b_copy_8: + movs r12, r2, lsr #3 + beq .Lneon_f2b_copy_4 +.Lneon_f2b_copy_8_loop: + vld1.32 {d0}, [r1]! + subs r12, r12, #1 + vst1.32 {d0}, [r0]! + bne .Lneon_f2b_copy_8_loop + ands r2, r2, #0x7 + beq .Lneon_memmove_done +.Lneon_f2b_copy_4: + movs r12, r2, lsr #2 + beq .Lneon_f2b_copy_1 +.Lneon_f2b_copy_4_loop: + ldr r3, [r1], #4 + subs r12, r12, #1 + str r3, [r0], #4 + bne .Lneon_f2b_copy_4_loop + ands r2, r2, #0x3 + nop +.Lneon_f2b_copy_1: + cmp r2, #0 + beq .Lneon_memmove_done +.Lneon_f2b_copy_1_loop: + ldrb r12, [r1], #1 + subs r2, r2, #1 + strb r12, [r0], #1 + bne .Lneon_f2b_copy_1_loop +.Lneon_f2b_finish: + b .Lneon_memmove_done + + /* ############################################################# + * Back to Front copy + */ +.Lneon_back_to_front_copy: + /* + * Here, we'll want to shift to the end of the buffers. This + * actually points us one past where we need to go, but since + * we'll pre-decrement throughout, this will be fine. + */ + add r0, r0, r2 + add r1, r1, r2 + cmp r2, #4 + bgt .Lneon_b2f_gt4 + cmp r2, #0 +.Lneon_b2f_smallcopy_loop: + beq .Lneon_memmove_done + ldrb r12, [r1, #-1]! + subs r2, r2, #1 + strb r12, [r0, #-1]! + b .Lneon_b2f_smallcopy_loop +.Lneon_b2f_gt4: + /* + * The minimum of the overlap window size and the copy size + * is in r3. 
+ */ + sub r3, r0, r1 + /* + * ############################################################# + * Back to Front copy - + */ + cmp r2, r3 + movle r12, r2 + movgt r12, r3 + cmp r12, #256 + bge .Lneon_b2f_copy_128 + cmp r12, #64 + bge .Lneon_b2f_copy_32 + cmp r12, #8 + bge .Lneon_b2f_copy_8 + cmp r12, #4 + bge .Lneon_b2f_copy_4 + b .Lneon_b2f_copy_1 + nop +.Lneon_b2f_copy_128: + movs r12, r2, lsr #7 + cmp r12, #PLDOFFS + ble .Lneon_b2f_copy_128_loop_nopld + sub r12, #PLDOFFS + pld [r1, #-(PLDOFFS-1)*PLDSIZE] +.Lneon_b2f_copy_128_loop_outer: + pld [r1, #-(PLDOFFS*PLDSIZE)] + sub r1, r1, #128 + sub r0, r0, #128 + vld1.32 {q0, q1}, [r1]! + vld1.32 {q2, q3}, [r1]! + vld1.32 {q8, q9}, [r1]! + vld1.32 {q10, q11}, [r1]! + subs r12, r12, #1 + vst1.32 {q0, q1}, [r0]! + vst1.32 {q2, q3}, [r0]! + vst1.32 {q8, q9}, [r0]! + vst1.32 {q10, q11}, [r0]! + sub r1, r1, #128 + sub r0, r0, #128 + bne .Lneon_b2f_copy_128_loop_outer + mov r12, #PLDOFFS +.Lneon_b2f_copy_128_loop_nopld: + sub r1, r1, #128 + sub r0, r0, #128 + vld1.32 {q0, q1}, [r1]! + vld1.32 {q2, q3}, [r1]! + vld1.32 {q8, q9}, [r1]! + vld1.32 {q10, q11}, [r1]! + subs r12, r12, #1 + vst1.32 {q0, q1}, [r0]! + vst1.32 {q2, q3}, [r0]! + vst1.32 {q8, q9}, [r0]! + vst1.32 {q10, q11}, [r0]! + sub r1, r1, #128 + sub r0, r0, #128 + bne .Lneon_b2f_copy_128_loop_nopld + ands r2, r2, #0x7f + beq .Lneon_memmove_done + cmp r2, #32 + bge .Lneon_b2f_copy_32 + b .Lneon_b2f_copy_finish +.Lneon_b2f_copy_32: + mov r12, r2, lsr #5 +.Lneon_b2f_copy_32_loop: + sub r1, r1, #32 + sub r0, r0, #32 + vld1.32 {q0,q1}, [r1] + subs r12, r12, #1 + vst1.32 {q0,q1}, [r0] + bne .Lneon_b2f_copy_32_loop + ands r2, r2, #0x1f + beq .Lneon_memmove_done +.Lneon_b2f_copy_finish: +.Lneon_b2f_copy_8: + movs r12, r2, lsr #0x3 + beq .Lneon_b2f_copy_4 +.Lneon_b2f_copy_8_loop: + sub r1, r1, #8 + sub r0, r0, #8 + vld1.32 {d0}, [r1] + subs r12, r12, #1 + vst1.32 {d0}, [r0] + bne .Lneon_b2f_copy_8_loop + ands r2, r2, #0x7 + beq .Lneon_memmove_done +.Lneon_b2f_copy_4: + movs r12, r2, lsr #0x2 + beq .Lneon_b2f_copy_1 +.Lneon_b2f_copy_4_loop: + ldr r3, [r1, #-4]! + subs r12, r12, #1 + str r3, [r0, #-4]! + bne .Lneon_b2f_copy_4_loop + ands r2, r2, #0x3 + nop +.Lneon_b2f_copy_1: + cmp r2, #0 + beq .Lneon_memmove_done +.Lneon_b2f_copy_1_loop: + ldrb r12, [r1, #-1]! + subs r2, r2, #1 + strb r12, [r0, #-1]! + bne .Lneon_b2f_copy_1_loop + +.Lneon_memmove_done: + pop {r0} + bx lr + + .end +#endif /* SCORPION_NEON_OPTIMIZATION */ + diff --git a/libc/arch-arm/bionic/memset.S b/libc/arch-arm/bionic/memset.S index 93abe15a2..69abd4bdf 100644 --- a/libc/arch-arm/bionic/memset.S +++ b/libc/arch-arm/bionic/memset.S @@ -2,6 +2,8 @@ * Copyright (C) 2008 The Android Open Source Project * All rights reserved. * + * Copyright (c) 2009-2011, Code Aurora Forum. All rights reserved. + * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: @@ -35,6 +37,90 @@ .align + +#if defined(SCORPION_NEON_OPTIMIZATION) + .code 32 + .align 8 + .global memset + .type memset, %function + + .global bzero + .type bzero, %function + +bzero: + mov r2, r1 + mov r1, #0 +memset: + push {r0} + + cmp r2, #6 + bgt .Lmemset_gt6 + cmp r2, #0 + beq .Lmemset_smallcopy_done +.Lmemset_smallcopy_loop: + strb r1, [r0], #1 + subs r2, r2, #1 + bne .Lmemset_smallcopy_loop +.Lmemset_smallcopy_done: + pop {r0} + bx lr + +.Lmemset_gt6: + vdup.8 q0, r1 + vmov r1, s0 + + /* + * Decide where to route for the maximum copy sizes. 
+ */ + cmp r2, #4 + blt .Lmemset_lt4 + cmp r2, #16 + blt .Lmemset_lt16 + vmov q1, q0 + cmp r2, #128 + blt .Lmemset_32 +.Lmemset_128: + mov r12, r2, lsr #7 +.Lmemset_128_loop: + vst1.32 {q0, q1}, [r0]! + vst1.32 {q0, q1}, [r0]! + vst1.32 {q0, q1}, [r0]! + vst1.32 {q0, q1}, [r0]! + subs r12, r12, #1 + bne .Lmemset_128_loop + ands r2, r2, #0x7f + beq .Lmemset_end +.Lmemset_32: + movs r12, r2, lsr #5 + beq .Lmemset_lt32 +.Lmemset_32_loop: + subs r12, r12, #1 + vst1.32 {q0, q1}, [r0]! + bne .Lmemset_32_loop + ands r2, r2, #0x1f + beq .Lmemset_end +.Lmemset_lt32: + cmp r2, #16 + blt .Lmemset_lt16 + vst1.64 {q0}, [r0]! + subs r2, r2, #16 + beq .Lmemset_end +.Lmemset_lt16: + movs r12, r2, lsl #29 + strcs r1, [r0], #4 + strcs r1, [r0], #4 + strmi r1, [r0], #4 +.Lmemset_lt4: + movs r2, r2, lsl #31 + strcsh r1, [r0], #2 + strmib r1, [r0] +.Lmemset_end: + pop {r0} + bx lr + + .end +#else /* !SCORPION_NEON_OPTIMIZATION */ + /* * Optimized memset() for ARM. * @@ -115,3 +201,4 @@ memset: bx lr .fnend +#endif /* SCORPION_NEON_OPTIMIZATION */ diff --git a/libc/arch-arm/bionic/strlen-armv7.S b/libc/arch-arm/bionic/strlen-armv7.S new file mode 100644 index 000000000..125e92fb8 --- /dev/null +++ b/libc/arch-arm/bionic/strlen-armv7.S @@ -0,0 +1,111 @@ +/* Copyright (c) 2010-2011, Linaro Limited + All rights reserved. + + Redistribution and use in source and binary forms, with or without + modification, are permitted provided that the following conditions + are met: + + * Redistributions of source code must retain the above copyright + notice, this list of conditions and the following disclaimer. + + * Redistributions in binary form must reproduce the above copyright + notice, this list of conditions and the following disclaimer in the + documentation and/or other materials provided with the distribution. + + * Neither the name of Linaro Limited nor the names of its + contributors may be used to endorse or promote products derived + from this software without specific prior written permission. + + THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR + A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT + HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, + SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT + LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY + THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + + Written by Dave Gilbert <david.gilbert@linaro.org> + + This strlen routine is optimised on a Cortex-A9 and should work on + all ARMv7 processors. 
This routine is reasonably fast for short + strings, but is probably slower than a simple implementation if all + your strings are very short */ + +@ 2011-02-08 david.gilbert@linaro.org +@ Extracted from local git 6848613a + + +@ this lets us check a flag in a 00/ff byte easily in either endianness +#ifdef __ARMEB__ +#define CHARTSTMASK(c) 1<<(31-(c*8)) +#else +#define CHARTSTMASK(c) 1<<(c*8) +#endif + +@----------------------------------------------------------------------------------------------------------------------------- + .syntax unified + .arch armv7-a + + .thumb_func + .align 2 + .p2align 4,,15 + .global strlen + .type strlen,%function +strlen: + @ r0 = string + @ returns count of bytes in string not including terminator + mov r1, r0 + push { r4,r6 } + mvns r6, #0 @ all F + movs r4, #0 + tst r0, #7 + beq 2f + +1: + ldrb r2, [r1], #1 + tst r1, #7 @ Hit alignment yet? + cbz r2, 10f @ Exit if we found the 0 + bne 1b + + @ So we're now aligned +2: + ldmia r1!,{r2,r3} + uadd8 r2, r2, r6 @ Parallel add 0xff - sets the GE bits for anything that wasn't 0 + sel r2, r4, r6 @ bytes are 00 for none-00 bytes, or ff for 00 bytes - NOTE INVERSION + uadd8 r3, r3, r6 @ Parallel add 0xff - sets the GE bits for anything that wasn't 0 + sel r3, r2, r6 @ bytes are 00 for none-00 bytes, or ff for 00 bytes - NOTE INVERSION + cmp r3, #0 + beq 2b + +strlenendtmp: + @ One (or more) of the bytes we loaded was 0 - but which one? + @ r2 has the mask corresponding to the first loaded word + @ r3 has a combined mask of the two words - but if r2 was all-non 0 + @ then it's just the 2nd words + cmp r2, #0 + itte eq + moveq r2, r3 @ the end is in the 2nd word + subeq r1,r1,#3 + subne r1,r1,#7 + + @ r1 currently points to the 2nd byte of the word containing the 0 + tst r2, # CHARTSTMASK(0) @ 1st character + bne 10f + adds r1,r1,#1 + tst r2, # CHARTSTMASK(1) @ 2nd character + ittt eq + addeq r1,r1,#1 + tsteq r2, # (3<<15) @ 2nd & 3rd character + @ If not the 3rd must be the last one + addeq r1,r1,#1 + +10: + @ r0 is still at the beginning, r1 is pointing 1 byte after the terminator + sub r0, r1, r0 + subs r0, r0, #1 + pop { r4, r6 } + bx lr |
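The strlen-armv7.S routine above scans the string a word at a time, using uadd8/sel to build a per-byte mask that is non-zero only where a source byte is 0x00. As a point of reference, the portable C sketch below shows the same word-at-a-time zero-byte test using the classic SWAR mask; it is an illustration of the idea, not part of the patch, and the names strlen_ref and zero_byte_mask are invented for this example.

#include <stddef.h>
#include <stdint.h>
#include <string.h>

/* A byte lane of the result is non-zero only when the corresponding
 * byte of w is 0x00 - the same property the uadd8/sel pair gives the
 * assembly version in one step. */
static uint32_t zero_byte_mask(uint32_t w)
{
    return (w - 0x01010101u) & ~w & 0x80808080u;
}

size_t strlen_ref(const char *s)
{
    const char *p = s;

    /* Byte-by-byte until the pointer is word aligned, mirroring the
     * alignment loop in the assembly routine. */
    while (((uintptr_t)p & 3) != 0) {
        if (*p == '\0')
            return (size_t)(p - s);
        p++;
    }

    /* Scan one 32-bit word per iteration until a word contains a NUL. */
    for (;;) {
        uint32_t w;
        memcpy(&w, p, sizeof w);   /* aligned word load */
        if (zero_byte_mask(w) != 0)
            break;
        p += 4;
    }

    /* Locate the terminator inside that word. */
    while (*p != '\0')
        p++;
    return (size_t)(p - s);
}

The assembly version instead decodes the sel-produced mask directly to find which of the loaded bytes was the terminator, avoiding the final byte loop.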