Diffstat (limited to 'libc/arch-arm')
-rw-r--r-- | libc/arch-arm/bionic/arm_memcpy.S | 123
-rw-r--r-- | libc/arch-arm/bionic/atexit.S | 69
-rw-r--r-- | libc/arch-arm/bionic/crtbegin_dynamic.S | 1
-rw-r--r-- | libc/arch-arm/bionic/crtbegin_so.S | 6
-rw-r--r-- | libc/arch-arm/bionic/crtbegin_static.S | 1
-rw-r--r-- | libc/arch-arm/bionic/ffs.S | 35
-rw-r--r-- | libc/arch-arm/bionic/memcmp.S | 81
-rw-r--r-- | libc/arch-arm/bionic/memcpy.S | 126
-rw-r--r-- | libc/arch-arm/bionic/memmove.S | 356
-rw-r--r-- | libc/arch-arm/bionic/memset.S | 87
-rw-r--r-- | libc/arch-arm/bionic/strlen-armv7.S | 111
11 files changed, 950 insertions, 46 deletions
diff --git a/libc/arch-arm/bionic/arm_memcpy.S b/libc/arch-arm/bionic/arm_memcpy.S new file mode 100644 index 000000000..ae1cf1ad1 --- /dev/null +++ b/libc/arch-arm/bionic/arm_memcpy.S @@ -0,0 +1,123 @@ +/* + * Copyright (C) 2011 Texas Instruments + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * * Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * * Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in + * the documentation and/or other materials provided with the + * distribution. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS + * FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE + * COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, + * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, + * BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS + * OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED + * AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, + * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT + * OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. + */ + + .text + .fpu neon + .code 32 + .align 4 + + +/* r0 - dest */ +/* r1 - src */ +/* r2 - length */ + .global memcpy +memcpy: + .fnstart +#if defined TARGET_BOARD_PLATFORM == omap4 +#define CACHE_LINE_SIZE 32 +#else +#define CACHE_LINE_SIZE 64 +#endif + CMP r2,#3 + BLS _BMLIB_memcpy_lastbytes + ANDS r12,r0,#3 + BEQ l1; + LDRB r3,[r1],#1 + CMP r12,#2 + ADD r2,r2,r12 + LDRLSB r12, [r1], #1 + STRB r3,[r0],#1 + LDRCCB r3,[r1],#1 + STRLSB r12,[r0],#1 + SUB r2,r2,#4 + STRCCB r3,[r0],#1 +l1: + ANDS r3,r1,#3 + BEQ _BMLIB_aeabi_memcpy4 +l3: + SUBS r2,r2,#8 + BCC l2 + LDR r3,[r1],#4 + LDR r12,[r1],#4 + STR r3,[r0],#4 + STR r12,[r0],#4 + B l3 +l2: + ADDS r2,r2,#4 + LDRPL r3,[r1],#4 + STRPL r3,[r0],#4 + MOV r0,r0 +_BMLIB_memcpy_lastbytes: + LSLS r2,r2,#31 + LDRCSB r3,[r1],#1 + LDRCSB r12,[r1],#1 + LDRMIB r2,[r1],#1 + STRCSB r3,[r0],#1 + STRCSB r12,[r0],#1 + STRMIB r2,[r0],#1 + BX lr + +_BMLIB_aeabi_memcpy4: + PUSH {r4-r8,lr} + SUBS r2,r2,#0x20 + BCC l4 + DSB +#ifndef NOPLD + PLD [r1, #0] + PLD [r1, #(CACHE_LINE_SIZE*1)] + PLD [r1, #(CACHE_LINE_SIZE*2)] + PLD [r1, #(CACHE_LINE_SIZE*3)] + PLD [r1, #(CACHE_LINE_SIZE*4)] +#endif +l5: +#ifndef NOPLD + PLD [r1, #(CACHE_LINE_SIZE*5)] +#endif + LDMCS r1!,{r3-r8,r12,lr} + STMCS r0!,{r3-r8,r12,lr} + SUBS r2,r2,#0x20 + BCS l5 +l4: + LSLS r12,r2,#28 + LDMCS r1!,{r3,r4,r12,lr} + STMCS r0!,{r3,r4,r12,lr} + LDMMI r1!,{r3,r4} + STMMI r0!,{r3,r4} + POP {r4-r8,lr} + LSLS r12,r2,#30 + LDRCS r3,[r1],#4 + STRCS r3,[r0],#4 + BXEQ lr +_BMLIB_memcpy_lastbytes_aligned: + LSLS r2,r2,#31 + LDRCSH r3,[r1],#2 + LDRMIB r2,[r1],#1 + STRCSH r3,[r0],#2 + STRMIB r2,[r0],#1 + BX lr + .fnend diff --git a/libc/arch-arm/bionic/atexit.S b/libc/arch-arm/bionic/atexit.S new file mode 100644 index 000000000..aa1e18d61 --- /dev/null +++ b/libc/arch-arm/bionic/atexit.S @@ -0,0 +1,69 @@ +/* + * Copyright (C) 2011 The Android Open Source Project + * All rights reserved. 
+ * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * * Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * * Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in + * the documentation and/or other materials provided with the + * distribution. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS + * FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE + * COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, + * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, + * BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS + * OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED + * AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, + * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT + * OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. + */ + +#ifndef CRT_LEGACY_WORKAROUND + .arch armv5te + .fpu softvfp + .eabi_attribute 20, 1 + .eabi_attribute 21, 1 + .eabi_attribute 23, 3 + .eabi_attribute 24, 1 + .eabi_attribute 25, 1 + .eabi_attribute 26, 2 + .eabi_attribute 30, 4 + .eabi_attribute 18, 4 + .code 16 + .section .text.atexit,"ax",%progbits + .align 2 + .global atexit + .hidden atexit + .code 16 + .thumb_func + .type atexit, %function +atexit: + .fnstart +.LFB0: + .save {r4, lr} + push {r4, lr} +.LCFI0: + ldr r3, .L3 + mov r1, #0 + @ sp needed for prologue +.LPIC0: + add r3, pc + ldr r2, [r3] + bl __cxa_atexit + pop {r4, pc} +.L4: + .align 2 +.L3: + .word __dso_handle-(.LPIC0+4) +.LFE0: + .fnend + .size atexit, .-atexit +#endif diff --git a/libc/arch-arm/bionic/crtbegin_dynamic.S b/libc/arch-arm/bionic/crtbegin_dynamic.S index d18e715f5..099908444 100644 --- a/libc/arch-arm/bionic/crtbegin_dynamic.S +++ b/libc/arch-arm/bionic/crtbegin_dynamic.S @@ -85,3 +85,4 @@ __CTOR_LIST__: .long -1 #include "__dso_handle.S" +#include "atexit.S" diff --git a/libc/arch-arm/bionic/crtbegin_so.S b/libc/arch-arm/bionic/crtbegin_so.S index bb6b3e2c3..9275b1e01 100644 --- a/libc/arch-arm/bionic/crtbegin_so.S +++ b/libc/arch-arm/bionic/crtbegin_so.S @@ -52,4 +52,10 @@ __FINI_ARRAY__: .long -1 .long __on_dlclose +#ifdef CRT_LEGACY_WORKAROUND #include "__dso_handle.S" +#else +#include "__dso_handle_so.S" +#endif + +#include "atexit.S" diff --git a/libc/arch-arm/bionic/crtbegin_static.S b/libc/arch-arm/bionic/crtbegin_static.S index 6f9cf25dd..13b05b272 100644 --- a/libc/arch-arm/bionic/crtbegin_static.S +++ b/libc/arch-arm/bionic/crtbegin_static.S @@ -86,3 +86,4 @@ __CTOR_LIST__: #include "__dso_handle.S" +#include "atexit.S" diff --git a/libc/arch-arm/bionic/ffs.S b/libc/arch-arm/bionic/ffs.S index f11141c97..052b46a53 100644 --- a/libc/arch-arm/bionic/ffs.S +++ b/libc/arch-arm/bionic/ffs.S @@ -36,47 +36,14 @@ * 6 bits as an index into the table. This algorithm should be a win * over the checking each bit in turn as per the C compiled version. * - * under ARMv5 there's an instruction called CLZ (count leading Zero's) that - * could be used - * - * This is the ffs algorithm devised by d.seal and posted to comp.sys.arm on - * 16 Feb 1994. 
+ * since ARMv5 there's an instruction called CLZ (count leading Zero's) */ ENTRY(ffs) /* Standard trick to isolate bottom bit in r0 or 0 if r0 = 0 on entry */ rsb r1, r0, #0 ands r0, r0, r1 -#ifndef __ARM_ARCH_5__ - /* - * now r0 has at most one set bit, call this X - * if X = 0, all further instructions are skipped - */ - adrne r2, .L_ffs_table - orrne r0, r0, r0, lsl #4 /* r0 = X * 0x11 */ - orrne r0, r0, r0, lsl #6 /* r0 = X * 0x451 */ - rsbne r0, r0, r0, lsl #16 /* r0 = X * 0x0450fbaf */ - - /* now lookup in table indexed on top 6 bits of r0 */ - ldrneb r0, [ r2, r0, lsr #26 ] - - bx lr - -.text; -.type .L_ffs_table, _ASM_TYPE_OBJECT; -.L_ffs_table: -/* 0 1 2 3 4 5 6 7 */ - .byte 0, 1, 2, 13, 3, 7, 0, 14 /* 0- 7 */ - .byte 4, 0, 8, 0, 0, 0, 0, 15 /* 8-15 */ - .byte 11, 5, 0, 0, 9, 0, 0, 26 /* 16-23 */ - .byte 0, 0, 0, 0, 0, 22, 28, 16 /* 24-31 */ - .byte 32, 12, 6, 0, 0, 0, 0, 0 /* 32-39 */ - .byte 10, 0, 0, 25, 0, 0, 21, 27 /* 40-47 */ - .byte 31, 0, 0, 0, 0, 24, 0, 20 /* 48-55 */ - .byte 30, 0, 23, 19, 29, 18, 17, 0 /* 56-63 */ -#else clzne r0, r0 rsbne r0, r0, #32 bx lr -#endif diff --git a/libc/arch-arm/bionic/memcmp.S b/libc/arch-arm/bionic/memcmp.S index 67dcddc1b..0fe26996c 100644 --- a/libc/arch-arm/bionic/memcmp.S +++ b/libc/arch-arm/bionic/memcmp.S @@ -43,36 +43,70 @@ * (2) The loads are scheduled in a way they won't stall */ +#if __ARM_ARCH__ >= 7 +#define __ARM_CORTEX + +#if defined(CORTEX_CACHE_LINE_32) +#define CACHE_LINE_SIZE 32 +#else +#define CACHE_LINE_SIZE 64 +#endif + +#endif /* __ARM_ARCH__ */ + + memcmp: .fnstart + +#if defined(__ARM_CORTEX) + pld [r0, #(CACHE_LINE_SIZE * 0)] + pld [r0, #(CACHE_LINE_SIZE * 1)] +#else + PLD (r0, #0) PLD (r1, #0) - +#endif /* take of the case where length is 0 or the buffers are the same */ cmp r0, r1 +#if !defined(__ARM_CORTEX) cmpne r2, #0 +#endif moveq r0, #0 bxeq lr +#if defined(__ARM_CORTEX) + pld [r1, #(CACHE_LINE_SIZE * 0)] + pld [r1, #(CACHE_LINE_SIZE * 1)] + + /* make sure we have at least 8+4 bytes, this simplify things below + * and avoid some overhead for small blocks + */ + cmp r2, #(8+4) + bmi 10f +#endif /* __ARM_CORTEX */ + + .save {r4, lr} /* save registers */ stmfd sp!, {r4, lr} - +#if !defined(__ARM_CORTEX) PLD (r0, #32) PLD (r1, #32) +#endif /* since r0 hold the result, move the first source * pointer somewhere else */ mov r4, r0 - + +#if !defined(__ARM_CORTEX) /* make sure we have at least 8+4 bytes, this simplify things below * and avoid some overhead for small blocks */ cmp r2, #(8+4) bmi 8f - +#endif /* align first pointer to word boundary * offset = -src & 3 */ @@ -109,8 +143,14 @@ memcmp: subs r2, r2, #(32 + 4) bmi 1f -0: PLD (r4, #64) +0: +#if defined(__ARM_CORTEX) + pld [r4, #(CACHE_LINE_SIZE * 2)] + pld [r1, #(CACHE_LINE_SIZE * 2)] +#else + PLD (r4, #64) PLD (r1, #64) +#endif ldr r0, [r4], #4 ldr lr, [r1, #4]! 
eors r0, r0, ip @@ -176,6 +216,21 @@ memcmp: 9: /* restore registers and return */ ldmfd sp!, {r4, lr} bx lr +#if defined(__ARM_CORTEX) +10: /* process less than 12 bytes */ + cmp r2, #0 + moveq r0, #0 + bxeq lr + mov r3, r0 +11: + ldrb r0, [r3], #1 + ldrb ip, [r1], #1 + subs r0, ip + bxne lr + subs r2, r2, #1 + bne 11b + bx lr +#endif /* __ARM_CORTEX */ .fnend @@ -198,8 +253,14 @@ memcmp: bic r1, r1, #3 ldr lr, [r1], #4 -6: PLD (r1, #64) +6: +#if defined(__ARM_CORTEX) + pld [r1, #(CACHE_LINE_SIZE * 2)] + pld [r4, #(CACHE_LINE_SIZE * 2)] +#else + PLD (r1, #64) PLD (r4, #64) +#endif mov ip, lr, lsr #16 ldr lr, [r1], #4 ldr r0, [r4], #4 @@ -240,13 +301,13 @@ memcmp: 4: /*************** offset is 1 or 3 (less optimized) ***************/ - stmfd sp!, {r5, r6, r7} + stmfd sp!, {r5, r6, r7} // r5 = rhs // r6 = lhs // r7 = scratch - mov r5, r0, lsl #3 /* r5 = right shift */ + mov r5, r0, lsl #3 /* r5 = right shift */ rsb r6, r5, #32 /* r6 = left shift */ /* align the unaligned pointer */ @@ -269,7 +330,7 @@ memcmp: bhs 6b sub r1, r1, r6, lsr #3 - ldmfd sp!, {r5, r6, r7} + ldmfd sp!, {r5, r6, r7} /* are we done? */ adds r2, r2, #8 @@ -284,5 +345,5 @@ memcmp: sub r1, r1, r6, lsr #3 sub r4, r4, #4 mov r2, #4 - ldmfd sp!, {r5, r6, r7} + ldmfd sp!, {r5, r6, r7} b 8b diff --git a/libc/arch-arm/bionic/memcpy.S b/libc/arch-arm/bionic/memcpy.S index ba55996ec..f2a4e3328 100644 --- a/libc/arch-arm/bionic/memcpy.S +++ b/libc/arch-arm/bionic/memcpy.S @@ -2,6 +2,8 @@ * Copyright (C) 2008 The Android Open Source Project * All rights reserved. * + * Copyright (c) 2009-2011, Code Aurora Forum. All rights reserved. + * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: @@ -29,7 +31,114 @@ #include <machine/cpu-features.h> #if defined(__ARM_NEON__) - +#if defined(SCORPION_NEON_OPTIMIZATION) + /* + * These can be overridden in: + * device/<vendor>/<board>/BoardConfig.mk + * by setting the following: + * TARGET_USE_SCORPION_BIONIC_OPTIMIZATION := true + * TARGET_USE_SCORPION_PLD_SET := true + * TARGET_SCORPION_BIONIC_PLDOFFS := <pldoffset> + * TARGET_SCORPION_BIONIC_PLDSIZE := <pldsize> + */ +#ifndef PLDOFFS +#define PLDOFFS (6) +#endif +#ifndef PLDSIZE +#define PLDSIZE (128) /* L2 cache line size */ +#endif + .code 32 + .align 5 + .globl memcpy + .func +memcpy: + push {r0} + cmp r2, #4 + blt .Lneon_lt4 + cmp r2, #16 + blt .Lneon_lt16 + cmp r2, #32 + blt .Lneon_16 + cmp r2, #128 + blt .Lneon_copy_32_a + /* Copy blocks of 128-bytes (word-aligned) at a time*/ + /* Code below is optimized for PLDSIZE=128 only */ + mov r12, r2, lsr #7 + cmp r12, #PLDOFFS + ble .Lneon_copy_128_loop_nopld + sub r12, #PLDOFFS + pld [r1, #(PLDOFFS-1)*PLDSIZE] +.Lneon_copy_128_loop_outer: + pld [r1, #(PLDOFFS*PLDSIZE)] + vld1.32 {q0, q1}, [r1]! + vld1.32 {q2, q3}, [r1]! + vld1.32 {q8, q9}, [r1]! + vld1.32 {q10, q11}, [r1]! + subs r12, r12, #1 + vst1.32 {q0, q1}, [r0]! + vst1.32 {q2, q3}, [r0]! + vst1.32 {q8, q9}, [r0]! + vst1.32 {q10, q11}, [r0]! + bne .Lneon_copy_128_loop_outer + mov r12, #PLDOFFS +.Lneon_copy_128_loop_nopld: + vld1.32 {q0, q1}, [r1]! + vld1.32 {q2, q3}, [r1]! + vld1.32 {q8, q9}, [r1]! + vld1.32 {q10, q11}, [r1]! + subs r12, r12, #1 + vst1.32 {q0, q1}, [r0]! + vst1.32 {q2, q3}, [r0]! + vst1.32 {q8, q9}, [r0]! + vst1.32 {q10, q11}, [r0]! 
+ bne .Lneon_copy_128_loop_nopld + ands r2, r2, #0x7f + beq .Lneon_exit + cmp r2, #32 + blt .Lneon_16 + nop + /* Copy blocks of 32-bytes (word aligned) at a time*/ +.Lneon_copy_32_a: + mov r12, r2, lsr #5 +.Lneon_copy_32_loop_a: + vld1.32 {q0,q1}, [r1]! + subs r12, r12, #1 + vst1.32 {q0,q1}, [r0]! + bne .Lneon_copy_32_loop_a + ands r2, r2, #0x1f + beq .Lneon_exit +.Lneon_16: + subs r2, r2, #16 + blt .Lneon_lt16 + vld1.32 {q8}, [r1]! + vst1.32 {q8}, [r0]! + beq .Lneon_exit +.Lneon_lt16: + movs r12, r2, lsl #29 + bcc .Lneon_skip8 + ldr r3, [r1], #4 + ldr r12, [r1], #4 + str r3, [r0], #4 + str r12, [r0], #4 +.Lneon_skip8: + bpl .Lneon_lt4 + ldr r3, [r1], #4 + str r3, [r0], #4 +.Lneon_lt4: + movs r2, r2, lsl #31 + bcc .Lneon_lt2 + ldrh r3, [r1], #2 + strh r3, [r0], #2 +.Lneon_lt2: + bpl .Lneon_exit + ldrb r12, [r1] + strb r12, [r0] +.Lneon_exit: + pop {r0} + bx lr + .endfunc + .end +#else /* !SCORPION_NEON_OPTIMIZATION */ .text .fpu neon @@ -141,11 +250,14 @@ memcpy: strcsb ip, [r0], #1 strcsb lr, [r0], #1 +.ifdef NEEDS_ARM_ERRATA_754319_754320_ASM + VMOV s0,s0 @ NOP for ARM Errata +.endif ldmfd sp!, {r0, lr} bx lr .fnend - +#endif /* !SCORPION_NEON_OPTIMIZATION */ #else /* __ARM_ARCH__ < 7 */ @@ -260,20 +372,30 @@ cached_aligned32: * */ +#if __ARM_ARCH__ == 5 // Align the preload register to a cache-line because the cpu does // "critical word first" (the first word requested is loaded first). bic r12, r1, #0x1F add r12, r12, #64 +#endif 1: ldmia r1!, { r4-r11 } + +#if __ARM_ARCH__ == 5 PLD (r12, #64) +#else + PLD (r1, #64) +#endif subs r2, r2, #32 +#if __ARM_ARCH__ == 5 // NOTE: if r12 is more than 64 ahead of r1, the following ldrhi // for ARM9 preload will not be safely guarded by the preceding subs. // When it is safely guarded the only possibility to have SIGSEGV here // is because the caller overstates the length. ldrhi r3, [r12], #32 /* cheap ARM9 preload */ +#endif + stmia r0!, { r4-r11 } bhs 1b diff --git a/libc/arch-arm/bionic/memmove.S b/libc/arch-arm/bionic/memmove.S new file mode 100644 index 000000000..123419584 --- /dev/null +++ b/libc/arch-arm/bionic/memmove.S @@ -0,0 +1,356 @@ +/*************************************************************************** + Copyright (c) 2009-2011 Code Aurora Forum. All rights reserved. + + Redistribution and use in source and binary forms, with or without + modification, are permitted provided that the following conditions are met: + * Redistributions of source code must retain the above copyright + notice, this list of conditions and the following disclaimer. + * Redistributions in binary form must reproduce the above copyright + notice, this list of conditions and the following disclaimer in the + documentation and/or other materials provided with the distribution. + * Neither the name of Code Aurora nor the names of its contributors may + be used to endorse or promote products derived from this software + without specific prior written permission. + + THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" + AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + ARE DISCLAIMED. 
IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE + LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR + CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF + SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS + INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN + CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) + ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE + POSSIBILITY OF SUCH DAMAGE. + ***************************************************************************/ + +/*************************************************************************** + * Neon memmove: Attempts to do a memmove with Neon registers if possible, + * Inputs: + * dest: The destination buffer + * src: The source buffer + * n: The size of the buffer to transfer + * Outputs: + * + ***************************************************************************/ + +#include <machine/cpu-features.h> + +#if defined(SCORPION_NEON_OPTIMIZATION) + /* + * These can be overridden in: + * device/<vendor>/<board>/BoardConfig.mk + * by setting the following: + * TARGET_USE_SCORPION_BIONIC_OPTIMIZATION := true + * TARGET_USE_SCORPION_PLD_SET := true + * TARGET_SCORPION_BIONIC_PLDOFFS := <pldoffset> + * TARGET_SCORPION_BIONIC_PLDSIZE := <pldsize> + */ +#ifndef PLDOFFS +#define PLDOFFS (6) +#endif +#ifndef PLDSIZE +#define PLDSIZE (128) /* L2 cache line size */ +#endif + + .code 32 + .align 5 + .global memmove + .type memmove, %function + + .global bcopy + .type bcopy, %function + +bcopy: + mov r12, r0 + mov r0, r1 + mov r1, r12 +memmove: + push {r0} + + /* + * The requirements for memmove state that the function should + * operate as if data were being copied from the source to a + * buffer, then to the destination. This is to allow a user + * to copy data from a source and target that overlap. + * + * We can't just do byte copies front-to-back automatically, since + * there's a good chance we may have an overlap (why else would someone + * intentionally use memmove then?). + * + * We'll break this into two parts. Front-to-back, or back-to-front + * copies. + */ +.Lneon_memmove_cmf: + cmp r0, r1 + blt .Lneon_front_to_back_copy + bgt .Lneon_back_to_front_copy + b .Lneon_memmove_done + + /* ############################################################# + * Front to Back copy + */ +.Lneon_front_to_back_copy: + /* + * For small copies, just do a quick memcpy. We can do this for + * front-to-back copies, aligned or unaligned, since we're only + * doing 1 byte at a time... + */ + cmp r2, #4 + bgt .Lneon_f2b_gt4 + cmp r2, #0 +.Lneon_f2b_smallcopy_loop: + beq .Lneon_memmove_done + ldrb r12, [r1], #1 + subs r2, r2, #1 + strb r12, [r0], #1 + b .Lneon_f2b_smallcopy_loop +.Lneon_f2b_gt4: + /* The window size is in r3. */ + sub r3, r1, r0 + /* ############################################################# + * Front to Back copy + */ + /* + * Note that we can't just route based on the size in r2. If that's + * larger than the overlap window in r3, we could potentially + * (and likely!) destroy data we're copying. 
+ */ + cmp r2, r3 + movle r12, r2 + movgt r12, r3 + cmp r12, #256 + bge .Lneon_f2b_copy_128 + cmp r12, #64 + bge .Lneon_f2b_copy_32 + cmp r12, #16 + bge .Lneon_f2b_copy_16 + cmp r12, #8 + bge .Lneon_f2b_copy_8 + cmp r12, #4 + bge .Lneon_f2b_copy_4 + b .Lneon_f2b_copy_1 + nop +.Lneon_f2b_copy_128: + mov r12, r2, lsr #7 + cmp r12, #PLDOFFS + ble .Lneon_f2b_copy_128_loop_nopld + sub r12, #PLDOFFS + pld [r1, #(PLDOFFS-1)*PLDSIZE] +.Lneon_f2b_copy_128_loop_outer: + pld [r1, #(PLDOFFS*PLDSIZE)] + vld1.32 {q0,q1}, [r1]! + vld1.32 {q2,q3}, [r1]! + vld1.32 {q8,q9}, [r1]! + vld1.32 {q10,q11}, [r1]! + subs r12, r12, #1 + vst1.32 {q0,q1}, [r0]! + vst1.32 {q2,q3}, [r0]! + vst1.32 {q8,q9}, [r0]! + vst1.32 {q10,q11}, [r0]! + bne .Lneon_f2b_copy_128_loop_outer + mov r12, #PLDOFFS +.Lneon_f2b_copy_128_loop_nopld: + vld1.32 {q0,q1}, [r1]! + vld1.32 {q2,q3}, [r1]! + vld1.32 {q8,q9}, [r1]! + vld1.32 {q10,q11}, [r1]! + subs r12, r12, #1 + vst1.32 {q0,q1}, [r0]! + vst1.32 {q2,q3}, [r0]! + vst1.32 {q8,q9}, [r0]! + vst1.32 {q10,q11}, [r0]! + bne .Lneon_f2b_copy_128_loop_nopld + ands r2, r2, #0x7f + beq .Lneon_memmove_done + cmp r2, #32 + bge .Lneon_f2b_copy_32 + b .Lneon_f2b_copy_finish +.Lneon_f2b_copy_32: + mov r12, r2, lsr #5 +.Lneon_f2b_copy_32_loop: + vld1.32 {q0,q1}, [r1]! + subs r12, r12, #1 + vst1.32 {q0,q1}, [r0]! + bne .Lneon_f2b_copy_32_loop + ands r2, r2, #0x1f + beq .Lneon_memmove_done +.Lneon_f2b_copy_finish: +.Lneon_f2b_copy_16: + movs r12, r2, lsr #4 + beq .Lneon_f2b_copy_8 +.Lneon_f2b_copy_16_loop: + vld1.32 {q0}, [r1]! + subs r12, r12, #1 + vst1.32 {q0}, [r0]! + bne .Lneon_f2b_copy_16_loop + ands r2, r2, #0xf + beq .Lneon_memmove_done +.Lneon_f2b_copy_8: + movs r12, r2, lsr #3 + beq .Lneon_f2b_copy_4 +.Lneon_f2b_copy_8_loop: + vld1.32 {d0}, [r1]! + subs r12, r12, #1 + vst1.32 {d0}, [r0]! + bne .Lneon_f2b_copy_8_loop + ands r2, r2, #0x7 + beq .Lneon_memmove_done +.Lneon_f2b_copy_4: + movs r12, r2, lsr #2 + beq .Lneon_f2b_copy_1 +.Lneon_f2b_copy_4_loop: + ldr r3, [r1], #4 + subs r12, r12, #1 + str r3, [r0], #4 + bne .Lneon_f2b_copy_4_loop + ands r2, r2, #0x3 + nop +.Lneon_f2b_copy_1: + cmp r2, #0 + beq .Lneon_memmove_done +.Lneon_f2b_copy_1_loop: + ldrb r12, [r1], #1 + subs r2, r2, #1 + strb r12, [r0], #1 + bne .Lneon_f2b_copy_1_loop +.Lneon_f2b_finish: + b .Lneon_memmove_done + + /* ############################################################# + * Back to Front copy + */ +.Lneon_back_to_front_copy: + /* + * Here, we'll want to shift to the end of the buffers. This + * actually points us one past where we need to go, but since + * we'll pre-decrement throughout, this will be fine. + */ + add r0, r0, r2 + add r1, r1, r2 + cmp r2, #4 + bgt .Lneon_b2f_gt4 + cmp r2, #0 +.Lneon_b2f_smallcopy_loop: + beq .Lneon_memmove_done + ldrb r12, [r1, #-1]! + subs r2, r2, #1 + strb r12, [r0, #-1]! + b .Lneon_b2f_smallcopy_loop +.Lneon_b2f_gt4: + /* + * The minimum of the overlap window size and the copy size + * is in r3. 
+ */ + sub r3, r0, r1 + /* + * ############################################################# + * Back to Front copy - + */ + cmp r2, r3 + movle r12, r2 + movgt r12, r3 + cmp r12, #256 + bge .Lneon_b2f_copy_128 + cmp r12, #64 + bge .Lneon_b2f_copy_32 + cmp r12, #8 + bge .Lneon_b2f_copy_8 + cmp r12, #4 + bge .Lneon_b2f_copy_4 + b .Lneon_b2f_copy_1 + nop +.Lneon_b2f_copy_128: + movs r12, r2, lsr #7 + cmp r12, #PLDOFFS + ble .Lneon_b2f_copy_128_loop_nopld + sub r12, #PLDOFFS + pld [r1, #-(PLDOFFS-1)*PLDSIZE] +.Lneon_b2f_copy_128_loop_outer: + pld [r1, #-(PLDOFFS*PLDSIZE)] + sub r1, r1, #128 + sub r0, r0, #128 + vld1.32 {q0, q1}, [r1]! + vld1.32 {q2, q3}, [r1]! + vld1.32 {q8, q9}, [r1]! + vld1.32 {q10, q11}, [r1]! + subs r12, r12, #1 + vst1.32 {q0, q1}, [r0]! + vst1.32 {q2, q3}, [r0]! + vst1.32 {q8, q9}, [r0]! + vst1.32 {q10, q11}, [r0]! + sub r1, r1, #128 + sub r0, r0, #128 + bne .Lneon_b2f_copy_128_loop_outer + mov r12, #PLDOFFS +.Lneon_b2f_copy_128_loop_nopld: + sub r1, r1, #128 + sub r0, r0, #128 + vld1.32 {q0, q1}, [r1]! + vld1.32 {q2, q3}, [r1]! + vld1.32 {q8, q9}, [r1]! + vld1.32 {q10, q11}, [r1]! + subs r12, r12, #1 + vst1.32 {q0, q1}, [r0]! + vst1.32 {q2, q3}, [r0]! + vst1.32 {q8, q9}, [r0]! + vst1.32 {q10, q11}, [r0]! + sub r1, r1, #128 + sub r0, r0, #128 + bne .Lneon_b2f_copy_128_loop_nopld + ands r2, r2, #0x7f + beq .Lneon_memmove_done + cmp r2, #32 + bge .Lneon_b2f_copy_32 + b .Lneon_b2f_copy_finish +.Lneon_b2f_copy_32: + mov r12, r2, lsr #5 +.Lneon_b2f_copy_32_loop: + sub r1, r1, #32 + sub r0, r0, #32 + vld1.32 {q0,q1}, [r1] + subs r12, r12, #1 + vst1.32 {q0,q1}, [r0] + bne .Lneon_b2f_copy_32_loop + ands r2, r2, #0x1f + beq .Lneon_memmove_done +.Lneon_b2f_copy_finish: +.Lneon_b2f_copy_8: + movs r12, r2, lsr #0x3 + beq .Lneon_b2f_copy_4 +.Lneon_b2f_copy_8_loop: + sub r1, r1, #8 + sub r0, r0, #8 + vld1.32 {d0}, [r1] + subs r12, r12, #1 + vst1.32 {d0}, [r0] + bne .Lneon_b2f_copy_8_loop + ands r2, r2, #0x7 + beq .Lneon_memmove_done +.Lneon_b2f_copy_4: + movs r12, r2, lsr #0x2 + beq .Lneon_b2f_copy_1 +.Lneon_b2f_copy_4_loop: + ldr r3, [r1, #-4]! + subs r12, r12, #1 + str r3, [r0, #-4]! + bne .Lneon_b2f_copy_4_loop + ands r2, r2, #0x3 + nop +.Lneon_b2f_copy_1: + cmp r2, #0 + beq .Lneon_memmove_done +.Lneon_b2f_copy_1_loop: + ldrb r12, [r1, #-1]! + subs r2, r2, #1 + strb r12, [r0, #-1]! + bne .Lneon_b2f_copy_1_loop + +.Lneon_memmove_done: + pop {r0} + bx lr + + .end +#endif /* SCORPION_NEON_OPTIMIZATION */ + diff --git a/libc/arch-arm/bionic/memset.S b/libc/arch-arm/bionic/memset.S index 93abe15a2..69abd4bdf 100644 --- a/libc/arch-arm/bionic/memset.S +++ b/libc/arch-arm/bionic/memset.S @@ -2,6 +2,8 @@ * Copyright (C) 2008 The Android Open Source Project * All rights reserved. * + * Copyright (c) 2009-2011, Code Aurora Forum. All rights reserved. + * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: @@ -35,6 +37,90 @@ .align + +#if defined(SCORPION_NEON_OPTIMIZATION) + .code 32 + .align 8 + .global memset + .type memset, %function + + .global bzero + .type bzero, %function + +bzero: + mov r2, r1 + mov r1, #0 +memset: + push {r0} + + cmp r2, #6 + bgt .Lmemset_gt6 + cmp r2, #0 + beq .Lmemset_smallcopy_done +.Lmemset_smallcopy_loop: + strb r1, [r0], #1 + subs r2, r2, #1 + bne .Lmemset_smallcopy_loop +.Lmemset_smallcopy_done: + pop {r0} + bx lr + +.Lmemset_gt6: + vdup.8 q0, r1 + vmov r1, s0 + + /* + * Decide where to route for the maximum copy sizes. 
+ */ + cmp r2, #4 + blt .Lmemset_lt4 + cmp r2, #16 + blt .Lmemset_lt16 + vmov q1, q0 + cmp r2, #128 + blt .Lmemset_32 +.Lmemset_128: + mov r12, r2, lsr #7 +.Lmemset_128_loop: + vst1.32 {q0, q1}, [r0]! + vst1.32 {q0, q1}, [r0]! + vst1.32 {q0, q1}, [r0]! + vst1.32 {q0, q1}, [r0]! + subs r12, r12, #1 + bne .Lmemset_128_loop + ands r2, r2, #0x7f + beq .Lmemset_end +.Lmemset_32: + movs r12, r2, lsr #5 + beq .Lmemset_lt32 +.Lmemset_32_loop: + subs r12, r12, #1 + vst1.32 {q0, q1}, [r0]! + bne .Lmemset_32_loop + ands r2, r2, #0x1f + beq .Lmemset_end +.Lmemset_lt32: + cmp r2, #16 + blt .Lmemset_lt16 + vst1.64 {q0}, [r0]! + subs r2, r2, #16 + beq .Lmemset_end +.Lmemset_lt16: + movs r12, r2, lsl #29 + strcs r1, [r0], #4 + strcs r1, [r0], #4 + strmi r1, [r0], #4 +.Lmemset_lt4: + movs r2, r2, lsl #31 + strcsh r1, [r0], #2 + strmib r1, [r0] +.Lmemset_end: + pop {r0} + bx lr + + .end +#else /* !SCORPION_NEON_OPTIMIZATION */ + /* * Optimized memset() for ARM. * @@ -115,3 +201,4 @@ memset: bx lr .fnend +#endif /* SCORPION_NEON_OPTIMIZATION */ diff --git a/libc/arch-arm/bionic/strlen-armv7.S b/libc/arch-arm/bionic/strlen-armv7.S new file mode 100644 index 000000000..125e92fb8 --- /dev/null +++ b/libc/arch-arm/bionic/strlen-armv7.S @@ -0,0 +1,111 @@ +/* Copyright (c) 2010-2011, Linaro Limited + All rights reserved. + + Redistribution and use in source and binary forms, with or without + modification, are permitted provided that the following conditions + are met: + + * Redistributions of source code must retain the above copyright + notice, this list of conditions and the following disclaimer. + + * Redistributions in binary form must reproduce the above copyright + notice, this list of conditions and the following disclaimer in the + documentation and/or other materials provided with the distribution. + + * Neither the name of Linaro Limited nor the names of its + contributors may be used to endorse or promote products derived + from this software without specific prior written permission. + + THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR + A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT + HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, + SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT + LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY + THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + + Written by Dave Gilbert <david.gilbert@linaro.org> + + This strlen routine is optimised on a Cortex-A9 and should work on + all ARMv7 processors. 
This routine is reasonably fast for short + strings, but is probably slower than a simple implementation if all + your strings are very short */ + +@ 2011-02-08 david.gilbert@linaro.org +@ Extracted from local git 6848613a + + +@ this lets us check a flag in a 00/ff byte easily in either endianness +#ifdef __ARMEB__ +#define CHARTSTMASK(c) 1<<(31-(c*8)) +#else +#define CHARTSTMASK(c) 1<<(c*8) +#endif + +@----------------------------------------------------------------------------------------------------------------------------- + .syntax unified + .arch armv7-a + + .thumb_func + .align 2 + .p2align 4,,15 + .global strlen + .type strlen,%function +strlen: + @ r0 = string + @ returns count of bytes in string not including terminator + mov r1, r0 + push { r4,r6 } + mvns r6, #0 @ all F + movs r4, #0 + tst r0, #7 + beq 2f + +1: + ldrb r2, [r1], #1 + tst r1, #7 @ Hit alignment yet? + cbz r2, 10f @ Exit if we found the 0 + bne 1b + + @ So we're now aligned +2: + ldmia r1!,{r2,r3} + uadd8 r2, r2, r6 @ Parallel add 0xff - sets the GE bits for anything that wasn't 0 + sel r2, r4, r6 @ bytes are 00 for none-00 bytes, or ff for 00 bytes - NOTE INVERSION + uadd8 r3, r3, r6 @ Parallel add 0xff - sets the GE bits for anything that wasn't 0 + sel r3, r2, r6 @ bytes are 00 for none-00 bytes, or ff for 00 bytes - NOTE INVERSION + cmp r3, #0 + beq 2b + +strlenendtmp: + @ One (or more) of the bytes we loaded was 0 - but which one? + @ r2 has the mask corresponding to the first loaded word + @ r3 has a combined mask of the two words - but if r2 was all-non 0 + @ then it's just the 2nd words + cmp r2, #0 + itte eq + moveq r2, r3 @ the end is in the 2nd word + subeq r1,r1,#3 + subne r1,r1,#7 + + @ r1 currently points to the 2nd byte of the word containing the 0 + tst r2, # CHARTSTMASK(0) @ 1st character + bne 10f + adds r1,r1,#1 + tst r2, # CHARTSTMASK(1) @ 2nd character + ittt eq + addeq r1,r1,#1 + tsteq r2, # (3<<15) @ 2nd & 3rd character + @ If not the 3rd must be the last one + addeq r1,r1,#1 + +10: + @ r0 is still at the beginning, r1 is pointing 1 byte after the terminator + sub r0, r1, r0 + subs r0, r0, #1 + pop { r4, r6 } + bx lr |
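The strlen-armv7.S routine above scans the string a word at a time, using uadd8/sel to build a per-byte mask that is non-zero only where a source byte is 0x00. As a point of reference, the portable C sketch below shows the same word-at-a-time zero-byte test using the classic SWAR mask; it is an illustration of the idea, not part of the patch, and the names strlen_ref and zero_byte_mask are invented for this example.

#include <stddef.h>
#include <stdint.h>
#include <string.h>

/* A byte lane of the result is non-zero only when the corresponding
 * byte of w is 0x00 - the same property the uadd8/sel pair gives the
 * assembly version in one step. */
static uint32_t zero_byte_mask(uint32_t w)
{
    return (w - 0x01010101u) & ~w & 0x80808080u;
}

size_t strlen_ref(const char *s)
{
    const char *p = s;

    /* Byte-by-byte until the pointer is word aligned, mirroring the
     * alignment loop in the assembly routine. */
    while (((uintptr_t)p & 3) != 0) {
        if (*p == '\0')
            return (size_t)(p - s);
        p++;
    }

    /* Scan one 32-bit word per iteration until a word contains a NUL. */
    for (;;) {
        uint32_t w;
        memcpy(&w, p, sizeof w);   /* aligned word load */
        if (zero_byte_mask(w) != 0)
            break;
        p += 4;
    }

    /* Locate the terminator inside that word. */
    while (*p != '\0')
        p++;
    return (size_t)(p - s);
}

The assembly version instead decodes the sel-produced mask directly to find which of the loaded bytes was the terminator, avoiding the final byte loop.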