author     Steve Kondik <shade@chemlab.org>  2010-04-11 15:22:53 -0400
committer  Steve Kondik <shade@chemlab.org>  2010-04-11 15:22:53 -0400
commit     d76be2dc139764e92294234b822ef3cbb7253cc8 (patch)
tree       1717117afdbc4f3592b95598d4b2f37b840209ca
parent     e7ccf8c1cdac8dfbe353ec3201f36fac398f19c6 (diff)
parent     bcbd70c3951bbc0e1b09132fe21c1cf04982909e (diff)
Merge branch 'eclair' of git@github.com:cyanogen/android_external_skia into donut

Conflicts:
	Android.mk
-rw-r--r--  Android.mk                                        |  27
-rw-r--r--  include/core/SkUtils.h                            |  16
-rw-r--r--  src/core/SkBitmapProcState.cpp                    |   4
-rw-r--r--  src/core/SkBitmapProcState_sample.h               |  11
-rw-r--r--  src/core/SkBlitRow_D16.cpp                        |   8
-rw-r--r--  src/core/SkXfermode.cpp                           |  15
-rw-r--r--  src/core/asm/S32A_Opaque_BlitRow32.S              | 320
-rw-r--r--  src/core/asm/S32_Opaque_D32_nofilter_DX_gether.S  |  85
-rw-r--r--  src/core/asm/memset16_neon.S                      | 158
-rw-r--r--  src/core/asm/memset32_neon.S                      | 146
-rw-r--r--  src/core/asm/t32cb16blend.S                       | 325
-rw-r--r--  src/core/asm/xfer.S                               | 136
-rw-r--r--  src/images/SkImageDecoder_libjpeg.cpp             |  52
-rw-r--r--  src/opts/SkBlitRow_opts_arm.cpp                   |  21
14 files changed, 1292 insertions, 32 deletions
diff --git a/Android.mk b/Android.mk
index 4b4605dda1..b1446d37e4 100644
--- a/Android.mk
+++ b/Android.mk
@@ -197,6 +197,33 @@ endif
LOCAL_SRC_FILES += \
emoji/EmojiFont.cpp
+# Include the optimized assembly code for the src-over operation
+ifeq ($(TARGET_ARCH),arm)
+ LOCAL_CFLAGS += -DUSE_T32CB16BLEND_ASM
+ LOCAL_SRC_FILES += \
+ src/core/asm/t32cb16blend.S \
+ src/core/asm/xfer.S \
+ src/core/asm/S32A_Opaque_BlitRow32.S
+endif
+
+ifeq ($(TARGET_ARCH_VARIANT),armv6)
+ ARCH_ARMV6_ARMV7 := true
+endif
+
+ifeq ($(TARGET_ARCH_VARIANT),armv7-a)
+ ARCH_ARMV6_ARMV7 := true
+endif
+
+ifeq ($(ARCH_ARMV6_ARMV7),true)
+ LOCAL_SRC_FILES += \
+ src/core/asm/S32_Opaque_D32_nofilter_DX_gether.S
+endif
+
+ifeq ($(ARCH_ARM_HAVE_NEON),true)
+ LOCAL_SRC_FILES += \
+ src/core/asm/memset16_neon.S \
+ src/core/asm/memset32_neon.S
+endif
LOCAL_SHARED_LIBRARIES := \
libcutils \
diff --git a/include/core/SkUtils.h b/include/core/SkUtils.h
index 9f3b1d6f36..deeebf00e7 100644
--- a/include/core/SkUtils.h
+++ b/include/core/SkUtils.h
@@ -36,10 +36,18 @@ void sk_memset16_portable(uint16_t dst[], uint16_t value, int count);
void sk_memset32_portable(uint32_t dst[], uint32_t value, int count);
#ifdef ANDROID
- #include "cutils/memory.h"
-
- #define sk_memset16(dst, value, count) android_memset16(dst, value, (count) << 1)
- #define sk_memset32(dst, value, count) android_memset32(dst, value, (count) << 2)
+ #if defined(__ARM_HAVE_NEON)
+ extern "C" void memset16_neon(uint16_t*, uint16_t, int);
+ extern "C" void memset32_neon(uint32_t*, uint32_t, int);
+
+ #define sk_memset16(dst, value, count) memset16_neon(dst, value, (count) << 1)
+ #define sk_memset32(dst, value, count) memset32_neon(dst, value, (count) << 2)
+ #else
+ #include "cutils/memory.h"
+
+ #define sk_memset16(dst, value, count) android_memset16(dst, value, (count) << 1)
+ #define sk_memset32(dst, value, count) android_memset32(dst, value, (count) << 2)
+ #endif
#endif
#ifndef sk_memset16
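
Note: in both branches of this hunk, the third macro argument is a byte count, not an element count -- that is why sk_memset16/sk_memset32 shift `count` left by 1 or 2 before calling android_memset16/android_memset32 or the new NEON entry points. A minimal C++ sketch of that contract (the *_ref names are illustrative, not part of the tree):

    #include <stdint.h>

    // Reference semantics assumed by the macros above: the last
    // parameter counts BYTES, hence (count) << 1 for 16-bit fills
    // and (count) << 2 for 32-bit fills.
    static void memset16_ref(uint16_t* dst, uint16_t value, int bytes) {
        for (; bytes > 0; bytes -= 2)
            *dst++ = value;
    }

    static void memset32_ref(uint32_t* dst, uint32_t value, int bytes) {
        for (; bytes > 0; bytes -= 4)
            *dst++ = value;
    }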
diff --git a/src/core/SkBitmapProcState.cpp b/src/core/SkBitmapProcState.cpp
index 600b963d9d..0fdc6c8e95 100644
--- a/src/core/SkBitmapProcState.cpp
+++ b/src/core/SkBitmapProcState.cpp
@@ -86,7 +86,11 @@ static inline U8CPU Filter_8(unsigned x, unsigned y,
SkASSERT(state.fAlphaScale == 256)
#define RETURNDST(src) src
#define SRC_TO_FILTER(src) src
+#if __ARM_ARCH__ >= 6 && !defined(SK_CPU_BENDIAN)
+ #define USE_GETHER32
+#endif
#include "SkBitmapProcState_sample.h"
+#undef USE_GETHER32
#undef FILTER_PROC
#define FILTER_PROC(x, y, a, b, c, d, dst) Filter_32_alpha(x, y, a, b, c, d, dst, alphaScale)
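
Note: SkBitmapProcState_sample.h is a template-by-inclusion header -- it is included several times under different macro definitions, and each inclusion stamps out one specialized sampler. Defining USE_GETHER32 around only this inclusion (and only for little-endian ARMv6+) means only the 32-bit opaque no-filter variant picks up the assembly gather routine. A self-contained toy showing the mechanism (all names made up):

    #include <stdio.h>

    // One "body", stamped out twice under different macros -- the same
    // idea as including SkBitmapProcState_sample.h with different
    // MAKENAME/RETURNDST definitions each time.
    #define SAMPLE_BODY(NAME, RETURNDST) \
        static unsigned NAME(unsigned src) { return RETURNDST(src); }

    #define IDENTITY(src) (src)
    #define HALVE(src)    ((src) >> 1)

    SAMPLE_BODY(sample_opaque, IDENTITY)  // like the opaque instantiation
    SAMPLE_BODY(sample_blend,  HALVE)     // like an alpha-scaled variant

    int main() {
        printf("%u %u\n", sample_opaque(100), sample_blend(100)); // 100 50
        return 0;
    }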
diff --git a/src/core/SkBitmapProcState_sample.h b/src/core/SkBitmapProcState_sample.h
index 4e1f1395e3..ae0750b20f 100644
--- a/src/core/SkBitmapProcState_sample.h
+++ b/src/core/SkBitmapProcState_sample.h
@@ -16,6 +16,13 @@
#error "unsupported DSTSIZE"
#endif
+#if defined(USE_GETHER32)
+ extern "C" void S32_Opaque_D32_nofilter_DX_gether(SkPMColor* SK_RESTRICT colors,
+ const SkPMColor* SK_RESTRICT srcAddr,
+ int count,
+ const uint32_t* SK_RESTRICT xy);
+#endif
+
static void MAKENAME(_nofilter_DXDY)(const SkBitmapProcState& s,
const uint32_t* SK_RESTRICT xy,
int count, DSTTYPE* SK_RESTRICT colors) {
@@ -85,6 +92,9 @@ static void MAKENAME(_nofilter_DX)(const SkBitmapProcState& s,
DSTTYPE dstValue = RETURNDST(src);
BITMAPPROC_MEMSET(colors, dstValue, count);
} else {
+#if defined(USE_GETHER32)
+ S32_Opaque_D32_nofilter_DX_gether(colors, srcAddr, count, xy);
+#else
int i;
for (i = (count >> 2); i > 0; --i) {
uint32_t xx0 = *xy++;
@@ -104,6 +114,7 @@ static void MAKENAME(_nofilter_DX)(const SkBitmapProcState& s,
SkASSERT(*xx < (unsigned)s.fBitmap->width());
src = srcAddr[*xx++]; *colors++ = RETURNDST(src);
}
+#endif
}
#ifdef POSTAMBLE
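
Note: the assembly replaces only the gather loop shown above, so its observable behavior can be stated in a few lines of C++. A scalar sketch of what S32_Opaque_D32_nofilter_DX_gether computes (xy packs two 16-bit x indices per word, low halfword first on little-endian -- which is why USE_GETHER32 is gated on !SK_CPU_BENDIAN):

    #include <stdint.h>

    static void gather32_ref(uint32_t* colors, const uint32_t* srcAddr,
                             int count, const uint32_t* xy) {
        const uint16_t* xx = (const uint16_t*)xy;  // two x indices per word
        for (int i = 0; i < count; ++i)
            *colors++ = srcAddr[*xx++];            // pure gather, no filter
    }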
diff --git a/src/core/SkBlitRow_D16.cpp b/src/core/SkBlitRow_D16.cpp
index 66ac90e29a..a0d4b99135 100644
--- a/src/core/SkBlitRow_D16.cpp
+++ b/src/core/SkBlitRow_D16.cpp
@@ -215,12 +215,20 @@ static void S32A_D565_Blend_Dither(uint16_t* SK_RESTRICT dst,
///////////////////////////////////////////////////////////////////////////////
///////////////////////////////////////////////////////////////////////////////
+#ifdef USE_T32CB16BLEND_ASM
+ extern "C" void scanline_t32cb16blend_arm(uint16_t*, uint32_t*, size_t);
+#endif
+
static const SkBlitRow::Proc gDefault_565_Procs[] = {
// no dither
S32_D565_Opaque,
S32_D565_Blend,
+#ifdef USE_T32CB16BLEND_ASM
+ (SkBlitRow::Proc)scanline_t32cb16blend_arm,
+#else
S32A_D565_Opaque,
+#endif
S32A_D565_Blend,
// dither
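
Note: scanline_t32cb16blend_arm takes three arguments, while SkBlitRow::Proc of this era also carries alpha and the dither coordinates x/y; the cast works because under the ARM AAPCS a callee simply never reads arguments it does not declare. A C++ illustration of that trick (formally undefined behavior in the language, tolerated here by the ABI; all names made up):

    #include <stdint.h>
    #include <stddef.h>

    typedef void (*Proc6)(uint16_t*, const uint32_t*, size_t,
                          unsigned, int, int);

    static void three_arg_impl(uint16_t* dst, const uint32_t* src,
                               size_t count) {
        for (size_t i = 0; i < count; ++i)
            dst[i] = (uint16_t)src[i];      // stand-in for the real blend
    }

    int main() {
        uint16_t d[2]; uint32_t s[2] = {1, 2};
        Proc6 p = (Proc6)three_arg_impl;    // same cast as the table entry
        p(d, s, 2, 0xFF, 0, 0);             // alpha/x/y silently ignored
        return (d[0] + d[1] == 3) ? 0 : 1;
    }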
diff --git a/src/core/SkXfermode.cpp b/src/core/SkXfermode.cpp
index 8d1531a0a0..0f56d866df 100644
--- a/src/core/SkXfermode.cpp
+++ b/src/core/SkXfermode.cpp
@@ -19,6 +19,9 @@
#define SkAlphaMulAlpha(a, b) SkMulDiv255Round(a, b)
+static SkPMColor src_modeproc(SkPMColor , SkPMColor );
+extern "C" void xfer16_arm(uint16_t*, uint32_t*, uint32_t);
+
static SkPMColor SkFourByteInterp(SkPMColor src, SkPMColor dst, U8CPU alpha) {
unsigned scale = SkAlpha255To256(alpha);
@@ -233,10 +236,14 @@ void SkProcXfermode::xfer16(SK_RESTRICT uint16_t dst[],
if (NULL != proc) {
if (NULL == aa) {
- for (int i = count - 1; i >= 0; --i) {
- SkPMColor dstC = SkPixel16ToPixel32(dst[i]);
- dst[i] = SkPixel32ToPixel16_ToU16(proc(src[i], dstC));
- }
+ if (proc == src_modeproc) {
+ xfer16_arm(dst, (uint32_t*)src, count);
+ } else {
+ for (int i = count - 1; i >= 0; --i) {
+ SkPMColor dstC = SkPixel16ToPixel32(dst[i]);
+ dst[i] = SkPixel32ToPixel16_ToU16(proc(src[i], dstC));
+ }
+ }
} else {
for (int i = count - 1; i >= 0; --i) {
unsigned a = aa[i];
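
Note: the fast path fires only when the resolved per-pixel proc is src_modeproc, i.e. the Src transfer mode, whose result is simply the source pixel; xfer16_arm can therefore convert 32-bit source pixels straight to RGB565 without reading the destination. A scalar sketch of the loop it replaces in that mode (source layout AABBGGRR, matching the asm in xfer.S below):

    #include <stdint.h>

    // Src mode on a 565 target: dst[i] = pack565(src[i]).
    static void xfer16_src_ref(uint16_t* dst, const uint32_t* src,
                               uint32_t count) {
        for (uint32_t i = 0; i < count; ++i) {
            uint32_t s = src[i];
            unsigned r = s & 0xFF;
            unsigned g = (s >> 8) & 0xFF;
            unsigned b = (s >> 16) & 0xFF;
            dst[i] = (uint16_t)(((r & 0xF8) << 8) |
                                ((g & 0xFC) << 3) |
                                 (b >> 3));
        }
    }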
diff --git a/src/core/asm/S32A_Opaque_BlitRow32.S b/src/core/asm/S32A_Opaque_BlitRow32.S
new file mode 100644
index 0000000000..1454dd19f6
--- /dev/null
+++ b/src/core/asm/S32A_Opaque_BlitRow32.S
@@ -0,0 +1,320 @@
+/*
+ * Copyright (c) 2005-2008, The Android Open Source Project
+ * Copyright (c) 2010, Code Aurora Forum. All rights reserved.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+ .text
+
+ .global S32A_Opaque_BlitRow32_asm
+ .func S32A_Opaque_BlitRow32_asm
+
+S32A_Opaque_BlitRow32_asm:
+
+#if __ARM_ARCH__ == 7 || defined(__ARM_NEON__)
+
+ push {r4-r11}
+ cmp r2,#24
+ blt .Lless_than_24
+
+ vpush {Q4-Q7}
+
+ vmov.i16 q14,#0xff //Q14.16 = 255
+//prefix
+ vld4.8 {d0, d1, d2, d3}, [r1]! //d0,d1,d2,d3 = source rgb(0,1,2,3) A(0,1,2,3)
+ //update source ptr but not dst ptr
+ vld4.8 {d4, d5, d6, d7}, [r0] //d4,d5,d6,d7 = dst rgb(0,1,2,3) A(0,1,2,3)
+ add r3, r0, #32 // second dst pointer, 8 pixels (32 bytes) ahead
+ mov r5, #64
+ sub r2,r2,#8
+.Lloop:
+ sub r2,r2,#16
+ vsubw.u8 q4,q14,d3 //Q4.16 = 255-d3
+ //update source ptr but not dst ptr
+
+ //Effectively a threshold of 24: 8 words are already pre-loaded for the next round
+ cmp r2,#16
+
+ vsra.u16 q4,q4,#7 //Q4 = Q4.16+Q4 >> 7 ; A(0,1,2,3)
+
+ vmovl.u8 q6,d4 //Q6 = vmovl.u8 d4
+ vmovl.u8 q7,d5 //Q7 = vmovl.u8 d5
+ vmovl.u8 q8,d6 //Q8 = vmovl.u8 d6
+ vmovl.u8 q9,d7 //Q9 = vmovl.u8 d7
+
+
+ vmul.i16 q6,q6,q4 //Q6 = Q6 * Q4
+ vmul.i16 q7,q7,q4 //Q7 = Q7 * Q4
+
+ vld4.8 {d20, d21, d22, d23}, [r1]! //d20,d21,d22,d23 = source rgb(0,1,2,3) A(0,1,2,3)
+
+ vmul.i16 q8,q8,q4 //Q8 = Q8 * Q4
+ vmul.i16 q9,q9,q4 //Q9 = Q9 * Q4
+
+ vld4.8 {d24, d25, d26, d27}, [r3] //d4,d5,d6,d7 = dst rgb(0,1,2,3) A(0,1,2,3)
+
+ vshrn.i16 d4,q6,#8 //d4 = Q6.16 shrn 8
+ vshrn.i16 d5,q7,#8 //d5 = Q7.16 shrn 8
+ vshrn.i16 d6,q8,#8 //d6 = Q8.16 shrn 8
+ vshrn.i16 d7,q9,#8 //d7 = Q9.16 shrn 8
+
+ vadd.i8 d4,d4,d0 //d4 = d4+d0
+ vadd.i8 d5,d5,d1 //d5 = d5+d1
+ vadd.i8 d6,d6,d2 //d6 = d6+d2
+ vadd.i8 d7,d7,d3 //d7 = d7+d3
+
+ vst4.8 {d4, d5, d6, d7}, [r0], r5 //dst rgb(0,1,2,3) A(0,1,2,3) = d4,d5,d6,d7
+ //add r0, r0, r5
+
+ //The next 4 words
+// vld4.8 {d20, d21, d22, d23}, [r1]! ;d0,d1,d2,d3 = sourc rgb(0,1,2,3) A(0,1,2,3)
+// ;update source ptr but not dst ptr
+// vld4.8 {d24, d25, d26, d27}, [r0] ;d4,d5,d6,d7 = dst rgb(0,1,2,3) A(0,1,2,3)
+
+ //update source ptr but not dst ptr
+ vsubw.u8 q4,q14,d23 //Q4.16 = 255-d23
+
+ vsra.u16 q4,q4,#7 //Q4 = Q4.16+Q4 >> 7 ; A(0,1,2,3)
+
+ vmovl.u8 q6,d24 //Q6 = vmovl.u8 d4
+ vmovl.u8 q7,d25 //Q7 = vmovl.u8 d5
+ vmovl.u8 q8,d26 //Q8 = vmovl.u8 d6
+ vmovl.u8 q9,d27 //Q9 = vmovl.u8 d7
+
+ vmul.i16 q6,q6,q4 //Q6 = Q6 * Q4
+ vmul.i16 q7,q7,q4 //Q7 = Q7 * Q4
+
+ vld4.8 {d0, d1, d2, d3}, [r1]! //d0,d1,d2,d3 = source rgb(0,1,2,3) A(0,1,2,3)
+
+ vmul.i16 q8,q8,q4 //Q8 = Q8 * Q4
+ vmul.i16 q9,q9,q4 //Q9 = Q9 * Q4
+
+ vld4.8 {d4, d5, d6, d7}, [r0] //d4,d5,d6,d7 = dst rgb(0,1,2,3) A(0,1,2,3)
+ vshrn.i16 d24,q6,#8 //d4 = Q6.16 shrn 8
+ vshrn.i16 d25,q7,#8 //d5 = Q7.16 shrn 8
+ vshrn.i16 d26,q8,#8 //d6 = Q8.16 shrn 8
+ vshrn.i16 d27,q9,#8 //d7 = Q9.16 shrn 8
+
+ vadd.i8 d24,d24,d20 //d4 = d4+d0
+ vadd.i8 d25,d25,d21 //d5 = d5+d1
+ vadd.i8 d26,d26,d22 //d6 = d6+d2
+ vadd.i8 d27,d27,d23 //d7 = d7+d3
+
+ vst4.8 {d24, d25, d26, d27}, [r3], r5 //dst rgb(0,1,2,3) A(0,1,2,3) = d4,d5,d6,d7
+ //add r3, r3, r5
+
+ bge .Lloop
+
+//postfix:
+//There are 8 words left unprocessed from previous round
+ vmov.i16 q4,#0xff //Q4.16 = 255
+ vsubw.u8 q4,q4,d3 //Q4.16 = 255-d3
+
+ cmp r2,#8
+
+ vshr.u16 q5,q4,#7 //Q5.16 = Q4 >> 7
+ vadd.i16 q4,q4,q5 //Q4 = Q4.16+Q5.16 ; A(0,1,2,3)
+
+ vmovl.u8 q6,d4 //Q6 = vmovl.u8 d4
+ vmovl.u8 q7,d5 //Q7 = vmovl.u8 d5
+ vmovl.u8 q8,d6 //Q8 = vmovl.u8 d6
+ vmovl.u8 q9,d7 //Q9 = vmovl.u8 d7
+
+ vmul.i16 q6,q6,q4 //Q6 = Q6 * Q4
+ vmul.i16 q7,q7,q4 //Q7 = Q7 * Q4
+ vmul.i16 q8,q8,q4 //Q8 = Q8 * Q4
+ vmul.i16 q9,q9,q4 //Q9 = Q9 * Q4
+
+ vshrn.i16 d4,q6,#8 //d4 = Q6.16 shrn 8
+ vshrn.i16 d5,q7,#8 //d5 = Q7.16 shrn 8
+ vshrn.i16 d6,q8,#8 //d6 = Q8.16 shrn 8
+ vshrn.i16 d7,q9,#8 //d7 = Q9.16 shrn 8
+
+ vadd.i8 d4,d4,d0 //d4 = d4+d0
+ vadd.i8 d5,d5,d1 //d5 = d5+d1
+ vadd.i8 d6,d6,d2 //d6 = d6+d2
+ vadd.i8 d7,d7,d3 //d7 = d7+d3
+
+ vst4.8 {d4, d5, d6, d7}, [r0]! //dst rgb(0,1,2,3) A(0,1,2,3) = d4,d5,d6,d7
+
+.Lless_than_16:
+ cmp r2,#8
+ blt .Lless_than_8
+
+ sub r2,r2,#8
+
+ vld4.8 {d0, d1, d2, d3}, [r1]! //d0,d1,d2,d3 = source rgb(0,1,2,3) A(0,1,2,3)
+ //update source ptr but not dst ptr
+ vld4.8 {d4, d5, d6, d7}, [r0] //d4,d5,d6,d7 = dst rgb(0,1,2,3) A(0,1,2,3)
+
+ vmov.i16 q4,#0xff //Q4.16 = 255
+ vsubw.u8 q4,q4,d3 //Q4.16 = 255-d3
+
+ cmp r2,#8
+
+ vshr.u16 q5,q4,#7 //Q5.16 = Q4 >> 7
+ vadd.i16 q4,q4,q5 //Q4 = Q4.16+Q5.16 ; A(0,1,2,3)
+
+ vmovl.u8 q6,d4 //Q6 = vmovl.u8 d4
+ vmovl.u8 q7,d5 //Q7 = vmovl.u8 d5
+ vmovl.u8 q8,d6 //Q8 = vmovl.u8 d6
+ vmovl.u8 q9,d7 //Q9 = vmovl.u8 d7
+
+ vmul.i16 q6,q6,q4 //Q6 = Q6 * Q4
+ vmul.i16 q7,q7,q4 //Q7 = Q7 * Q4
+ vmul.i16 q8,q8,q4 //Q8 = Q8 * Q4
+ vmul.i16 q9,q9,q4 //Q9 = Q9 * Q4
+
+ vshrn.i16 d4,q6,#8 //d4 = Q6.16 shrn 8
+ vshrn.i16 d5,q7,#8 //d5 = Q7.16 shrn 8
+ vshrn.i16 d6,q8,#8 //d6 = Q8.16 shrn 8
+ vshrn.i16 d7,q9,#8 //d7 = Q9.16 shrn 8
+
+ vadd.i8 d4,d4,d0 //d4 = d4+d0
+ vadd.i8 d5,d5,d1 //d5 = d5+d1
+ vadd.i8 d6,d6,d2 //d6 = d6+d2
+ vadd.i8 d7,d7,d3 //d7 = d7+d3
+
+ vst4.8 {d4, d5, d6, d7}, [r0]! //dst rgb(0,1,2,3) A(0,1,2,3) = d4,d5,d6,d7
+
+ //It will be guaranteed to be less than 8
+ //bge loop
+.Lless_than_8:
+ vpop {Q4-Q7}
+
+.Lless_than_4:
+ subs r4,r2,#1
+ bmi .Lto_exit // S32A_Opaque_BlitRow32_neon + 268
+ mov r8,#0xff
+ mvn r10,#0xff00
+ orr r9,r8,r8,lsl #16
+ lsl r11,r9,#8
+.Lresidual_loop:
+ ldr r3,[r1,#0]
+ ldr r12,[r0,#0]
+ add r1,r1,#4
+ sub r2,r8,r3,lsr #24
+ and r5,r12,r9
+ cmp r2,r2
+ add r2,r2,#1
+ and r12,r10,r12,lsr #8
+ strne r6,[r7,#0xeef]
+ mul r5,r5,r2
+ mul r2,r12,r2
+ strne r6,[r7,#0xeef]
+ subs r4,r4,#1
+ and r12,r9,r5,lsr #8
+ and r2,r2,r11
+ orr r2,r2,r12
+ add r2,r2,r3
+ str r2,[r0],#4
+ bpl .Lresidual_loop // S32A_Opaque_BlitRow32_neon + 192
+
+.Lto_exit:
+ pop {r4-r11}
+ bx lr
+
+.Lless_than_24:
+ cmp r2,#8
+ blt .Lless_than_4
+
+.Lloop_8:
+ sub r2,r2,#8
+ // We already read the 8 words from the previous pipe line
+ vld4.8 {d0, d1, d2, d3}, [r1]! //d0,d1,d2,d3 = source rgb(0,1,2,3) A(0,1,2,3)
+ //update source ptr but not dst ptr
+ vld4.8 {d4, d5, d6, d7}, [r0] //d4,d5,d6,d7 = dst rgb(0,1,2,3) A(0,1,2,3)
+
+ vmov.i16 q10,#0xff //Q10.16 = 255
+ vsubw.u8 q10,q10,d3 //Q10.16 = 255-d3
+
+ cmp r2,#8
+
+ vshr.u16 q11,q10,#7 //Q5.16 = Q4 >> 7
+ vadd.i16 q10,q10,q11 //Q4 = Q4.16+Q5.16 ; A(0,1,2,3)
+
+ vmovl.u8 q12,d4 //Q6 = vmovl.u8 d4
+ vmovl.u8 q13,d5 //Q7 = vmovl.u8 d5
+ vmovl.u8 q8,d6 //Q8 = vmovl.u8 d6
+ vmovl.u8 q9,d7 //Q9 = vmovl.u8 d7
+
+ vmul.i16 q12,q12,q10 //Q6 = Q6 * Q4
+ vmul.i16 q13,q13,q10 //Q7 = Q7 * Q4
+ vmul.i16 q8,q8,q10 //Q8 = Q8 * Q4
+ vmul.i16 q9,q9,q10 //Q9 = Q9 * Q4
+
+ vshrn.i16 d4,q12,#8 //d4 = Q6.16 shrn 8
+ vshrn.i16 d5,q13,#8 //d5 = Q7.16 shrn 8
+ vshrn.i16 d6,q8,#8 //d6 = Q8.16 shrn 8
+ vshrn.i16 d7,q9,#8 //d7 = Q9.16 shrn 8
+
+ vadd.i8 d4,d4,d0 //d4 = d4+d0
+ vadd.i8 d5,d5,d1 //d5 = d5+d1
+ vadd.i8 d6,d6,d2 //d6 = d6+d2
+ vadd.i8 d7,d7,d3 //d7 = d7+d3
+
+ vst4.8 {d4, d5, d6, d7}, [r0]! //dst rgb(0,1,2,3) A(0,1,2,3) = d4,d5,d6,d7
+
+ bge .Lloop_8
+ b .Lless_than_4
+
+#else
+
+/*
+ * r0 - dst
+ * r1 - src
+ * r2 - count
+ */
+ push {r4-r11}
+ mov r9, #0xFF
+ orr r10, r9, r9, lsl #16
+ mvn r11, r10
+
+.Lblitrow32_loop:
+ ldr r3, [r0]
+ ldr r4, [r1], #4
+
+ cmp r3, #0
+ streq r4, [r0], #4
+ beq .Lblitrow32_loop_cond
+
+ // r5 <- (255-alpha)+1
+ sub r5, r9, r4, lsr #24
+ and r6, r3, r10
+ add r5, r5, #1
+ and r7, r10, r3, lsr #8
+
+ mul r8, r6, r5
+ lsr r6, r8, #8
+ mul r8, r7, r5
+
+ // combine rb and ag
+ and r6, r6, r10
+ and r7, r8, r11
+ orr r6, r6, r7
+
+ // add src to combined value
+ add r6, r6, r4
+ str r6, [r0], #4
+
+.Lblitrow32_loop_cond:
+ subs r2, r2, #1
+ bhi .Lblitrow32_loop
+ pop {r4-r11}
+ bx lr
+
+#endif
+
+.endfunc
+.size S32A_Opaque_BlitRow32_asm, .-S32A_Opaque_BlitRow32_asm
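
Note: both the NEON pipeline and the scalar residual loop implement standard premultiplied src-over with weight 256 - srcAlpha. The residual loop builds it literally (sub r2,r8,r3,lsr #24 then add r2,r2,#1); the NEON path approximates the +1 with vsra.u16 q4,q4,#7. A per-pixel C++ reference of the same arithmetic, matching the residual loop's masks (0x00FF00FF / 0xFF00FF00):

    #include <stdint.h>

    static inline uint32_t srcover_ref(uint32_t src, uint32_t dst) {
        unsigned scale = 256 - (src >> 24);                 // 256 - srcA
        uint32_t rb = (((dst & 0x00FF00FF) * scale) >> 8) & 0x00FF00FF;
        uint32_t ag = (((dst >> 8) & 0x00FF00FF) * scale)  & 0xFF00FF00;
        return src + (rb | ag);
    }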
diff --git a/src/core/asm/S32_Opaque_D32_nofilter_DX_gether.S b/src/core/asm/S32_Opaque_D32_nofilter_DX_gether.S
new file mode 100644
index 0000000000..3467432826
--- /dev/null
+++ b/src/core/asm/S32_Opaque_D32_nofilter_DX_gether.S
@@ -0,0 +1,85 @@
+/*
+ * Copyright (c) 2005-2008, The Android Open Source Project
+ * Copyright (c) 2010, Code Aurora Forum. All rights reserved.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+ .text
+ .global S32_Opaque_D32_nofilter_DX_gether
+ .func S32_Opaque_D32_nofilter_DX_gether
+S32_Opaque_D32_nofilter_DX_gether:
+ push {r0-r11,lr}
+ asr r0,r2,#3
+ sub sp,sp,#4 //23
+ cmp r0,#0
+ str r0,[sp,#0] //r0 = count >> 3
+ ble .L1_140
+ ldr r4,[sp,#4] //r4 = r0 (dst)
+ mov r0,r3
+ add r12,r3,#4
+ asr r8,r2,#3
+.L1_52:
+ ldm r3!, {r0,r6,r9,r11}
+ lsr r5,r0,#16 //30
+ ldr r5,[r1,r5,lsl #2] //30
+ lsr r7,r6,#16 //32
+ ldr r7,[r1,r7,lsl #2] //31
+ uxth r0,r0 //34
+ ldr r0,[r1,r0,lsl #2] //34
+ uxth r6,r6 //31
+ ldr r6,[r1,r6,lsl #2] //32
+ //stm r4!, {r0,r5,r6,r7} ;35
+ lsr r10,r9,#16 //30
+ ldr r10,[r1,r10,lsl #2] //30
+ lsr lr,r11,#16 //32
+ ldr lr,[r1,lr,lsl #2] //31
+ uxth r9,r9 //34
+ ldr r9,[r1,r9,lsl #2] //34
+ uxth r11,r11 //31
+ ldr r11,[r1,r11,lsl #2] //32
+ subs r8,r8,#1
+ stm r4!, {r0,r5,r6,r7,r9,r10,r11,lr} //35
+
+ bne .L1_52
+
+ ldr r0,[sp,#0] // count >> 3
+ mov r12,r0
+ ldr r0,[sp,#4] //r0 = dst
+ add r0,r0,r12,lsl #5 //dst += count >>3 << 5
+ str r0,[sp,#4] //save r0 into stack again
+.L1_140:
+//;;39 const uint16_t* SK_RESTRICT xx = (const uint16_t*)(xy);
+//;;40 for (i = (count & 7); i > 0; --i) {
+ tst r2,#7
+ beq .L1_184
+ ldr r0,[sp,#4] //r0 = current dst
+ and r2,r2,#7
+.L1_156:
+//;;41 //SkASSERT(*xx < (unsigned)s.fBitmap->width());
+//;;42 src = srcAddr[*xx++]; *colors++ = RETURNDST(src);
+ ldrh r4,[r3],#2
+ add r12,r0,#4
+//;;43 }
+ subs r2,r2,#1
+ ldr r4,[r1,r4,lsl #2] //42
+ str r4,[r0,#0] //42
+ mov r0,r12 //42
+ bne .L1_156
+.L1_184:
+//;;44 }
+ add sp,sp,#0x14
+ pop {r4-r11,pc}
+
+.endfunc
+.size S32_Opaque_D32_nofilter_DX_gether, .-S32_Opaque_D32_nofilter_DX_gether
diff --git a/src/core/asm/memset16_neon.S b/src/core/asm/memset16_neon.S
new file mode 100644
index 0000000000..0f04b90bbc
--- /dev/null
+++ b/src/core/asm/memset16_neon.S
@@ -0,0 +1,158 @@
+/* Copyright (c) 2009, Code Aurora Forum. All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
+ * * Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ * * Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in the
+ * documentation and/or other materials provided with the distribution.
+ * * Neither the name of Code Aurora nor
+ * the names of its contributors may be used to endorse or promote
+ * products derived from this software without specific prior written
+ * permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+ * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
+ * NON-INFRINGEMENT ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR
+ * CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
+ * EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
+ * PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS;
+ * OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY,
+ * WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR
+ * OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF
+ * ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ *
+ */
+
+/***************************************************************************
+ Neon memset: Attempts to do a memset with Neon registers if possible,
+ Inputs:
+ s: The buffer to write to
+ c: The integer data to write to the buffer
+ n: The count of bytes to write (size_t).
+ Outputs:
+
+***************************************************************************/
+
+ .code 32
+ .align 4
+ .globl memset16_neon
+ .func
+
+memset16_neon:
+ cmp r2, #0
+ bxeq lr
+
+ push {r0}
+
+ /* If we have <= 8 bytes, just do a quick loop to handle that */
+ cmp r2, #8
+ bgt memset_gt4
+memset_smallcopy_loop:
+ strh r1, [r0], #2
+ subs r2, r2, #2
+ bne memset_smallcopy_loop
+memset_smallcopy_done:
+ pop {r0}
+ bx lr
+
+memset_gt4:
+ /*
+ * Duplicate the lowest 16 bits of r1 across the register, giving
+ * two copies of the 16-bit value to store at once. We do this by
+ * copying the low halfword of r1 into its upper halfword.
+ */
+ orr r1, r1, r1, lsl #16
+ /*
+ * If we're copying > 64 bytes, then we may want to get
+ * onto a 16-byte boundary to improve speed even more.
+ */
+ cmp r2, #64
+ blt memset_route
+ ands r12, r0, #0xf
+ beq memset_route
+ /*
+ * Determine the number of bytes to move forward to get to the 16-byte
+ * boundary. Note that this will be a multiple of 4, since we
+ * already are word-aligned.
+ */
+ rsb r12, r12, #16
+ sub r2, r2, r12
+ lsls r12, r12, #29
+ strmi r1, [r0], #4
+ strcs r1, [r0], #4
+ strcs r1, [r0], #4
+ lsls r12, r12, #2
+ strcsh r1, [r0], #2
+memset_route:
+ /*
+ * Decide where to route for the maximum copy sizes. Note that we
+ * build q0 and q1 depending on if we'll need it, so that's
+ * interwoven here as well.
+ */
+ vdup.u32 d0, r1
+ cmp r2, #16
+ blt memset_8
+ vmov d1, d0
+ cmp r2, #64
+ blt memset_16
+ vmov q1, q0
+ cmp r2, #128
+ blt memset_32
+memset_128:
+ mov r12, r2, lsr #7
+memset_128_loop:
+ vst1.64 {q0, q1}, [r0]!
+ vst1.64 {q0, q1}, [r0]!
+ vst1.64 {q0, q1}, [r0]!
+ vst1.64 {q0, q1}, [r0]!
+ subs r12, r12, #1
+ bne memset_128_loop
+ ands r2, r2, #0x7f
+ beq memset_end
+memset_32:
+ movs r12, r2, lsr #5
+ beq memset_16
+memset_32_loop:
+ subs r12, r12, #1
+ vst1.64 {q0, q1}, [r0]!
+ bne memset_32_loop
+ ands r2, r2, #0x1f
+ beq memset_end
+memset_16:
+ movs r12, r2, lsr #4
+ beq memset_8
+memset_16_loop:
+ subs r12, r12, #1
+ vst1.32 {q0}, [r0]!
+ bne memset_16_loop
+ ands r2, r2, #0xf
+ beq memset_end
+ /*
+ * memset_8 isn't a loop, since we try to do our loops at 16
+ * bytes and above. We should loop there, then drop down here
+ * to finish the <16-byte versions. Same for memset_4 and
+ * memset_1.
+ */
+memset_8:
+ cmp r2, #8
+ blt memset_4
+ subs r2, r2, #8
+ vst1.32 {d0}, [r0]!
+memset_4:
+ cmp r2, #4
+ blt memset_2
+ subs r2, r2, #4
+ str r1, [r0], #4
+memset_2:
+ cmp r2, #0
+ ble memset_end
+ strh r1, [r0], #2
+memset_end:
+ pop {r0}
+ bx lr
+
+ .endfunc
+ .end
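
Note: the routine operates on a byte count (matching the (count) << 1 at the call site), aligns large fills to a 16-byte boundary, then drains 128/32/16-byte vector blocks before a scalar tail; memset32_neon below shares the same structure minus the halfword cases. A compact C++ sketch of that shape (illustrative only, not the tree's code):

    #include <stdint.h>

    static void memset16_sketch(uint16_t* dst, uint16_t v, int bytes) {
        if (bytes <= 8) {                     // small fill: halfword loop
            for (; bytes > 0; bytes -= 2) *dst++ = v;
            return;
        }
        while (bytes >= 16) {                 // vector blocks (vst1 in asm)
            for (int i = 0; i < 8; ++i) dst[i] = v;
            dst += 8;
            bytes -= 16;
        }
        for (; bytes > 0; bytes -= 2)         // 8/4/2-byte tail
            *dst++ = v;
    }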
diff --git a/src/core/asm/memset32_neon.S b/src/core/asm/memset32_neon.S
new file mode 100644
index 0000000000..b611357b75
--- /dev/null
+++ b/src/core/asm/memset32_neon.S
@@ -0,0 +1,146 @@
+/* Copyright (c) 2009, Code Aurora Forum. All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
+ * * Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ * * Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in the
+ * documentation and/or other materials provided with the distribution.
+ * * Neither the name of Code Aurora nor
+ * the names of its contributors may be used to endorse or promote
+ * products derived from this software without specific prior written
+ * permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+ * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
+ * NON-INFRINGEMENT ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR
+ * CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
+ * EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
+ * PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS;
+ * OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY,
+ * WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR
+ * OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF
+ * ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ *
+ */
+
+/***************************************************************************
+ Neon memset: Attempts to do a memset with Neon registers if possible,
+ Inputs:
+ s: The buffer to write to
+ c: The integer data to write to the buffer
+ n: The count of bytes to write (size_t).
+ Outputs:
+
+***************************************************************************/
+
+ .code 32
+ .align 4
+ .globl memset32_neon
+ .func
+
+memset32_neon:
+ cmp r2, #0
+ bxeq lr
+
+ push {r0}
+
+ /* If we have <= 8 bytes, just do a quick loop to handle that */
+ cmp r2, #8
+ bgt memset_gt4
+memset_smallcopy_loop:
+ str r1, [r0], #4
+ subs r2, r2, #4
+ bne memset_smallcopy_loop
+memset_smallcopy_done:
+ pop {r0}
+ bx lr
+
+memset_gt4:
+ /*
+ * If we're copying > 64 bytes, then we may want to get
+ * onto a 16-byte boundary to improve speed even more.
+ */
+ cmp r2, #64
+ blt memset_route
+ ands r12, r0, #0xf
+ beq memset_route
+ /*
+ * Determine the number of bytes to move forward to get to the 16-byte
+ * boundary. Note that this will be a multiple of 4, since we
+ * already are word-aligned.
+ */
+ rsb r12, r12, #16
+ sub r2, r2, r12
+ lsls r12, r12, #29
+ strmi r1, [r0], #4
+ strcs r1, [r0], #4
+ strcs r1, [r0], #4
+memset_route:
+ /*
+ * Decide where to route for the maximum copy sizes. Note that we
+ * build q0 and q1 depending on if we'll need it, so that's
+ * interwoven here as well.
+ */
+ vdup.u32 d0, r1
+ cmp r2, #16
+ blt memset_8
+ vmov d1, d0
+ cmp r2, #64
+ blt memset_16
+ vmov q1, q0
+ cmp r2, #128
+ blt memset_32
+memset_128:
+ mov r12, r2, lsr #7
+memset_128_loop:
+ vst1.64 {q0, q1}, [r0]!
+ vst1.64 {q0, q1}, [r0]!
+ vst1.64 {q0, q1}, [r0]!
+ vst1.64 {q0, q1}, [r0]!
+ subs r12, r12, #1
+ bne memset_128_loop
+ ands r2, r2, #0x7f
+ beq memset_end
+memset_32:
+ movs r12, r2, lsr #5
+ beq memset_16
+memset_32_loop:
+ subs r12, r12, #1
+ vst1.64 {q0, q1}, [r0]!
+ bne memset_32_loop
+ ands r2, r2, #0x1f
+ beq memset_end
+memset_16:
+ movs r12, r2, lsr #4
+ beq memset_8
+memset_16_loop:
+ subs r12, r12, #1
+ vst1.32 {q0}, [r0]!
+ bne memset_16_loop
+ ands r2, r2, #0xf
+ beq memset_end
+ /*
+ * memset_8 isn't a loop, since we try to do our loops at 16
+ * bytes and above. We should loop there, then drop down here
+ * to finish the <16-byte versions. Same for memset_4 and
+ * memset_1.
+ */
+memset_8:
+ cmp r2, #8
+ blt memset_4
+ subs r2, r2, #8
+ vst1.32 {d0}, [r0]!
+memset_4:
+ cmp r2, #4
+ blt memset_end
+ subs r2, r2, #4
+ str r1, [r0], #4
+memset_end:
+ pop {r0}
+ bx lr
+
+ .endfunc
+ .end
diff --git a/src/core/asm/t32cb16blend.S b/src/core/asm/t32cb16blend.S
new file mode 100644
index 0000000000..f835dd3271
--- /dev/null
+++ b/src/core/asm/t32cb16blend.S
@@ -0,0 +1,325 @@
+/*
+ * Copyright 2006, The Android Open Source Project
+ * Copyright (c) 2009, Code Aurora Forum.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+
+/*
+ * This file is derived from the libpixelflinger version of the BLIT routine.
+ * The algorithm used here is equivalent to the C function S32A_D565_Opaque.
+ * NEON instructions process 16 pixels at a time on ARMv7. When fewer than
+ * 16 pixels remain, or on ARMv6 and below, regular ARM instructions are
+ * used instead. The regular ARM code combines two 16-bit writes into one
+ * 32-bit write to the destination, pre-loads both destination and source,
+ * and unrolls the main loop thrice.
+ */
+ .text
+ .align
+
+ .global scanline_t32cb16blend_arm
+
+// uses r6, r7, r8, r9, r10, lr
+
+.macro pixel, DREG, SRC, FB, OFFSET
+
+ // SRC = AABBGGRR
+ subs r7, r10, \SRC, lsr #24 // sAA = 255 - sAA
+ beq 1f
+
+.if \OFFSET
+
+ // red
+ mov lr, \DREG, lsr #(\OFFSET + 6 + 5)
+ smlabb lr, r7, lr, r8
+ and r6, \SRC, r10
+ add lr, lr, lr, lsr #5
+ add lr, r6, lr, lsr #5
+ lsr lr, #3
+ orr \FB, lr, lsl #(\OFFSET + 11)
+
+ // green
+ and r6, \DREG, #(0x3F<<(\OFFSET + 5))
+ lsr r6, #5
+ smlabt r6, r7, r6, r9
+ and lr, r10, \SRC, lsr #(8)
+ add r6, r6, r6, lsr #6
+ add r6, lr, r6, lsr #6
+ lsr r6, #2
+ orr \FB, \FB, r6, lsl #(\OFFSET + 5)
+
+ // blue
+ and lr, \DREG, #(0x1F << \OFFSET)
+ smlabt lr, r7, lr, r8
+ and r6, r10, \SRC, lsr #(8+8)
+ add lr, lr, lr, lsr #5
+ add lr, r6, lr, lsr #5
+ lsr lr, #3
+ orr \FB, \FB, lr, lsl #\OFFSET
+
+.else
+
+ // red
+ mov lr, \DREG, lsr #(6+5)
+ and lr, lr, #0x1F
+ smlabb lr, r7, lr, r8
+ and r6, \SRC, r10
+ add lr, lr, lr, lsr #5
+ add lr, r6, lr, lsr #5
+ lsr lr, #3
+ mov \FB, lr, lsl #11
+
+ // green
+ and r6, \DREG, #(0x3F<<5)
+ lsr r6, #5
+ smlabb r6, r7, r6, r9
+ and lr, r10, \SRC, lsr #(8)
+ add r6, r6, r6, lsr #6
+ add r6, lr, r6, lsr #6
+ lsr r6, #2
+ orr \FB, \FB, r6, lsl #5
+
+ // blue
+ and lr, \DREG, #0x1F
+ smlabb lr, r7, lr, r8
+ and r6, r10, \SRC, lsr #(8+8)
+ add lr, lr, lr, lsr #5
+ add lr, r6, lr, lsr #5
+ orr \FB, \FB, lr, lsr #3
+
+.endif
+ b 2f
+
+ /*
+ * When alpha = 255, down scale the source RGB pixel (24 bits)
+ * to 16 bits(RGB565)
+ */
+1:
+ lsl r6, \SRC, #8
+ lsr lr, \SRC, #5
+ and r7, r6, #0xf800
+ and lr, lr, #0x7e0
+ orr lr, lr, r7
+
+.if \OFFSET
+ orr lr, lr, r6, lsr #27
+ orr \FB, \FB, lr, lsl #(\OFFSET)
+.else
+ orr \FB, lr, r6, lsr #27
+.endif
+
+2:
+.endm
+
+
+// r0: dst ptr
+// r1: src ptr
+// r2: count
+// r3: d
+// r4: s0
+// r5: s1
+// r6: pixel
+// r7: pixel
+// r8: 0x10
+// r9: 0x20
+// r10: 0xFF
+// r11: free
+// r12: scratch
+// r14: free
+
+scanline_t32cb16blend_arm:
+ stmfd sp!, {r4-r10, lr}
+
+#if __ARM_ARCH__ == 7 || defined(__ARM_NEON__)
+ subs r2, r2, #16
+
+ blo blit_less_than_16_left
+
+ vmov.u16 q12, #0x80
+ vmov.u8 q13, #0xf8
+
+blit_neon_loop:
+ /*
+ * Load 64 bytes from source and 32 bytes from destination
+ * note that source pixels are 4 bytes wide and
+ * destination pixels are 2 bytes wide.
+ */
+ vld4.8 {d2, d4, d6, d8}, [r1]!
+ vld4.8 {d3, d5, d7, d9}, [r1]!
+
+ vand.8 d10, d8, d9
+ vmov r3, r4, d10
+
+ cmp r3, #0xffffffff
+ cmpeq r4, #0xffffffff
+ bne blit_alpha_not_255
+
+ // alpha equals 255 case
+
+ vshl.u8 q0, q2, #3
+
+ subs r2, r2, #16
+
+ vsri.u8 q1, q2, #5
+ vsri.u8 q0, q3, #3
+
+ // store the rgb destination values back to memory
+ vst2.8 {d0, d2}, [r0]!
+ vst2.8 {d1, d3}, [r0]!
+
+ blo blit_less_than_16_left
+ b blit_neon_loop
+
+blit_alpha_not_255:
+ // alpha = 255 - alpha
+ vmvn.u8 q0, q4
+
+ vld2.8 {q5, q6}, [r0]
+
+ vshl.u8 q7, q6, #3
+
+ subs r2, r2, #16
+
+ vand.u8 q6, q6, q13
+
+ vmov.16 q8, q12
+ vmov.16 q9, q12
+
+ vsri.u8 q7, q5, #5
+ vshl.u8 q5, q5, #3
+
+ vmlal.u8 q8, d0, d12
+ vmlal.u8 q9, d1, d13
+
+ vshl.u8 q7, q7, #2
+
+ vshr.u16 q10, q8, #5
+ vshr.u16 q11, q9, #5
+ vaddhn.u16 d12, q8, q10
+ vaddhn.u16 d13, q9, q11
+
+ vmov.16 q8, q12
+ vmov.16 q9, q12
+ vmlal.u8 q8, d0, d14
+ vmlal.u8 q9, d1, d15
+
+ vqadd.u8 q6, q6, q1
+
+ vshr.u16 q10, q8, #6
+ vshr.u16 q11, q9, #6
+ vaddhn.u16 d14, q8, q10
+ vaddhn.u16 d15, q9, q11
+
+ vmov.16 q8, q12
+ vmov.16 q9, q12
+ vmlal.u8 q8, d0, d10
+ vmlal.u8 q9, d1, d11
+
+ vqadd.u8 q7, q7, q2
+
+ vshl.u8 q5, q7, #3
+
+ vshr.u16 q10, q8, #5
+ vshr.u16 q11, q9, #5
+
+ vsri.u8 q6, q7, #5
+
+ vaddhn.u16 d16, q8, q10
+ vaddhn.u16 d17, q9, q11
+ vqadd.u8 q8, q8, q3
+
+ vsri.u8 q5, q8, #3
+
+ // store the rgb destination values back to memory
+ vst2.8 {d10, d12}, [r0]!
+ vst2.8 {d11, d13}, [r0]!
+
+ blo blit_less_than_16_left
+ b blit_neon_loop
+#endif
+
+blit_less_than_16_left:
+ pld [r1]
+
+ mov r8, #0x10
+ mov r9, #0x20
+ mov r10, #0xFF
+
+#if __ARM_ARCH__ == 7 || defined(__ARM_NEON__)
+ adds r2, r2, #14
+#else
+ subs r2, r2, #2
+#endif
+
+ pld [r0]
+ blo 9f
+
+ // The main loop is unrolled thrice and processes 6 pixels per pass
+8: ldmia r1!, {r4, r5}
+ // stream the source
+ pld [r1, #32]
+ add r0, r0, #4
+ // it's all zero, skip this pixel
+ orrs r3, r4, r5
+ beq 7f
+
+ // load the destination
+ ldr r3, [r0, #-4]
+ // stream the destination
+ pld [r0, #32]
+ pixel r3, r4, r12, 0
+ pixel r3, r5, r12, 16
+ // effectively, we're getting write-combining by virtue of the
+ // cpu's write-back cache.
+ str r12, [r0, #-4]
+
+ // 2nd iteration of the loop, don't stream anything
+ subs r2, r2, #2
+ blt 9f
+ ldmia r1!, {r4, r5}
+ add r0, r0, #4
+ orrs r3, r4, r5
+ beq 7f
+ ldr r3, [r0, #-4]
+ pixel r3, r4, r12, 0
+ pixel r3, r5, r12, 16
+ str r12, [r0, #-4]
+
+ // 3rd iteration of the loop, don't stream anything
+ subs r2, r2, #2
+ blt 9f
+ ldmia r1!, {r4, r5}
+ add r0, r0, #4
+ orrs r3, r4, r5
+ beq 7f
+ ldr r3, [r0, #-4]
+ pixel r3, r4, r12, 0
+ pixel r3, r5, r12, 16
+ str r12, [r0, #-4]
+
+7: subs r2, r2, #2
+ blo 9f
+ b 8b
+
+9: adds r2, r2, #1
+ ldmlofd sp!, {r4-r10, lr} // return
+ bxlo lr
+
+ // last pixel left
+ ldr r4, [r1], #4
+ ldrh r3, [r0]
+ pixel r3, r4, r12, 0
+ strh r12, [r0], #2
+ ldmfd sp!, {r4-r10, lr} // return
+ bx lr
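
Note: a per-pixel C++ reference for the `pixel` macro above, transcribed from its arithmetic (premultiplied AABBGGRR source over an RGB565 destination; the alpha == 255 branch is the pure 32-to-565 downscale at label 1:):

    #include <stdint.h>

    static inline uint16_t srcover_565_ref(uint32_t src, uint16_t dst) {
        unsigned a  = src >> 24;
        unsigned sr = src & 0xFF;            // AABBGGRR layout
        unsigned sg = (src >> 8) & 0xFF;
        unsigned sb = (src >> 16) & 0xFF;
        if (a == 255)                        // opaque fast path
            return (uint16_t)(((sr & 0xF8) << 8) |
                              ((sg & 0xFC) << 3) |
                               (sb >> 3));
        unsigned inv = 255 - a;
        unsigned r = inv * ((dst >> 11) & 0x1F) + 0x10;
        unsigned g = inv * ((dst >>  5) & 0x3F) + 0x20;
        unsigned b = inv * ( dst        & 0x1F) + 0x10;
        r = (sr + ((r + (r >> 5)) >> 5)) >> 3;  // same rounding as the asm
        g = (sg + ((g + (g >> 6)) >> 6)) >> 2;
        b = (sb + ((b + (b >> 5)) >> 5)) >> 3;
        return (uint16_t)((r << 11) | (g << 5) | b);
    }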
diff --git a/src/core/asm/xfer.S b/src/core/asm/xfer.S
new file mode 100644
index 0000000000..96d587333b
--- /dev/null
+++ b/src/core/asm/xfer.S
@@ -0,0 +1,136 @@
+/*
+ * Copyright 2006, The Android Open Source Project
+ * Copyright (c) 2009, Code Aurora Forum.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+
+ .text
+ .align
+
+ .global xfer16_arm
+
+.macro pixel, DREG, SRC, FB, OFFSET
+ lsl r6, \SRC, #8
+ lsr r8, \SRC, #5
+ and r7, r6, #0xf800
+ and r8, r8, #0x7e0
+ orr r8, r8, r7
+
+.if \OFFSET
+ orr r8, r8, r6, lsr #27
+ orr \FB, \FB, r8, lsl #(\OFFSET)
+.else
+ orr \FB, r8, r6, lsr #27
+.endif
+
+.endm
+
+xfer16_arm:
+ stmfd sp!, {r4-r8}
+
+#if __ARM_ARCH__ == 7 || defined(__ARM_NEON__)
+ subs r2, r2, #16
+
+ blo xfer16_less_than_16_left
+
+ vmov.u16 q12, #0x80
+ //pld [r1]
+ //pld [r1, #32]
+
+xfer16_neon_loop:
+ // load 64 bytes from source and 32 bytes from destination
+ // note that source pixels are 4 bytes wide and
+ // destination pixels are 2 bytes wide
+ vld4.8 {d2, d4, d6, d8}, [r1]!
+ vld4.8 {d3, d5, d7, d9}, [r1]!
+
+ vshl.u8 q0, q2, #3
+
+ subs r2, r2, #16
+
+ vsri.u8 q1, q2, #5
+ vsri.u8 q0, q3, #3
+
+ // store the rgb destination values back to memory
+ vst2.8 {d0, d2}, [r0]!
+ vst2.8 {d1, d3}, [r0]!
+
+ blo xfer16_less_than_16_left
+ b xfer16_neon_loop
+#endif
+
+xfer16_less_than_16_left:
+ pld [r1]
+
+#if __ARM_ARCH__ == 7 || defined(__ARM_NEON__)
+ adds r2, r2, #14
+#else
+ subs r2, r2, #2
+#endif
+
+ pld [r0]
+ blo 9f
+
+ // The main loop is unrolled thrice and processes 6 pixels per pass
+8: ldmia r1!, {r4, r5}
+ // stream the source
+ pld [r1, #32]
+ add r0, r0, #4
+
+ // load the destination
+ ldr r3, [r0, #-4]
+ // stream the destination
+ pld [r0, #32]
+ pixel r3, r4, r12, 0
+ pixel r3, r5, r12, 16
+ // effectively, we're getting write-combining by virtue of the
+ // cpu's write-back cache.
+ str r12, [r0, #-4]
+
+ // 2nd iteration of the loop, don't stream anything
+ subs r2, r2, #2
+ blt 9f
+ ldmia r1!, {r4, r5}
+ add r0, r0, #4
+ ldr r3, [r0, #-4]
+ pixel r3, r4, r12, 0
+ pixel r3, r5, r12, 16
+ str r12, [r0, #-4]
+
+ // 3rd iteration of the loop, don't stream anything
+ subs r2, r2, #2
+ blt 9f
+ ldmia r1!, {r4, r5}
+ add r0, r0, #4
+ ldr r3, [r0, #-4]
+ pixel r3, r4, r12, 0
+ pixel r3, r5, r12, 16
+ str r12, [r0, #-4]
+
+7: subs r2, r2, #2
+ blo 9f
+ b 8b
+
+9: adds r2, r2, #1
+ ldmlofd sp!, {r4-r8} // return
+ bxlo lr
+
+ // last pixel left
+ ldr r4, [r1], #4
+ ldrh r3, [r0]
+ pixel r3, r4, r12, 0
+ strh r12, [r0], #2
+ ldmfd sp!, {r4-r8} // return
+ bx lr
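
Note: the odd-looking adds r2, r2, #14 is loop-count re-biasing shared by xfer.S and t32cb16blend.S: the NEON loop runs with r2 biased by -16, the two-pixel ARM tail with r2 biased by -2, and +14 converts one bias into the other. A small C++ check of the bookkeeping (hypothetical helper, not in the tree):

    // Returns how many pixels the scalar tail still has to handle
    // after the 16-at-a-time NEON loop; always count % 16.
    static int tail_pixels_after_neon(int count) {
        int r2 = count - 16;           // entry bias (subs r2, r2, #16)
        while (r2 >= 0) r2 -= 16;      // one NEON pass per 16 pixels
        return (r2 + 14) + 2;          // +14 re-bias; tail bias is -2
    }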
diff --git a/src/images/SkImageDecoder_libjpeg.cpp b/src/images/SkImageDecoder_libjpeg.cpp
index 12fe76ab3b..279c8ab9c1 100644
--- a/src/images/SkImageDecoder_libjpeg.cpp
+++ b/src/images/SkImageDecoder_libjpeg.cpp
@@ -397,30 +397,40 @@ bool SkJPEGImageDecoder::onDecode(SkStream* stream, SkBitmap* bm,
/* image_width and image_height are the original dimensions, available
after jpeg_read_header(). To see the scaled dimensions, we have to call
- jpeg_start_decompress(), and then read output_width and output_height.
+ jpeg_calc_output_dimensions(), and then read output_width and output_height.
*/
+ jpeg_calc_output_dimensions(&cinfo);
+
+ /* We have enough information to return
+ to the caller if they just wanted (subsampled bounds). If sampleSize
+ was 1, then we would have already returned. Thus we just check if
+ we're in kDecodeBounds_Mode, and that we have valid output sizes.
+ */
+ if (SkImageDecoder::kDecodeBounds_Mode == mode &&
+ valid_output_dimensions(cinfo)) {
+ SkScaledBitmapSampler smpl(cinfo.output_width, cinfo.output_height,
+ recompute_sampleSize(sampleSize, cinfo));
+ bm->setConfig(config, smpl.scaledWidth(), smpl.scaledHeight());
+ bm->setIsOpaque(true);
+ return true;
+ }
+
+ sampleSize = recompute_sampleSize(sampleSize, cinfo);
+
+#ifdef ANDROID_RGB
+ if ((sampleSize != 1) && (cinfo.out_color_space == JCS_RGB_565)) {
+ /* Downscaling requires SkScaledBitmapSampler, which
+ can't handle RGB_565 yet, so don't even try.
+ Revert to the default format, JCS_RGB.
+ */
+ cinfo.out_color_space = JCS_RGB;
+ }
+#endif
+
if (!jpeg_start_decompress(&cinfo)) {
- /* If we failed here, we may still have enough information to return
- to the caller if they just wanted (subsampled bounds). If sampleSize
- was 1, then we would have already returned. Thus we just check if
- we're in kDecodeBounds_Mode, and that we have valid output sizes.
-
- One reason to fail here is that we have insufficient stream data
- to complete the setup. However, output dimensions seem to get
- computed very early, which is why this special check can pay off.
- */
- if (SkImageDecoder::kDecodeBounds_Mode == mode &&
- valid_output_dimensions(cinfo)) {
- SkScaledBitmapSampler smpl(cinfo.output_width, cinfo.output_height,
- recompute_sampleSize(sampleSize, cinfo));
- bm->setConfig(config, smpl.scaledWidth(), smpl.scaledHeight());
- bm->setIsOpaque(true);
- return true;
- } else {
- return return_false(cinfo, *bm, "start_decompress");
- }
+ return return_false(cinfo, *bm, "start_decompress");
}
- sampleSize = recompute_sampleSize(sampleSize, cinfo);
// should we allow the Chooser (if present) to pick a config for us???
if (!this->chooseFromOneChoice(config, cinfo.output_width,
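
Note: the restructure hinges on jpeg_calc_output_dimensions(), which fills output_width/output_height from the header alone, so bounds-only decodes can answer before jpeg_start_decompress() ever runs (and before its failure modes apply). A minimal sketch of the reordered flow (assumes an already-initialized decompress object and source manager; error handling elided):

    extern "C" {
    #include "jpeglib.h"
    }

    // Report the (scaled) output size without starting a full decode.
    static bool read_bounds_only(jpeg_decompress_struct* cinfo,
                                 int* w, int* h) {
        if (jpeg_read_header(cinfo, TRUE) != JPEG_HEADER_OK)
            return false;
        jpeg_calc_output_dimensions(cinfo);   // cheap: no scanline machinery
        *w = (int)cinfo->output_width;
        *h = (int)cinfo->output_height;
        return true;                          // jpeg_start_decompress skipped
    }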
diff --git a/src/opts/SkBlitRow_opts_arm.cpp b/src/opts/SkBlitRow_opts_arm.cpp
index 0c38113adf..fb530b42ec 100644
--- a/src/opts/SkBlitRow_opts_arm.cpp
+++ b/src/opts/SkBlitRow_opts_arm.cpp
@@ -20,6 +20,11 @@
#include "SkColorPriv.h"
#include "SkDither.h"
+extern "C" void S32A_Opaque_BlitRow32_asm(SkPMColor* SK_RESTRICT dst,
+ const SkPMColor* SK_RESTRICT src,
+ int count,
+ U8CPU alpha);
+
#if defined(__ARM_HAVE_NEON) && !defined(SK_CPU_BENDIAN)
static void S32A_D565_Opaque_neon(uint16_t* SK_RESTRICT dst,
const SkPMColor* SK_RESTRICT src, int count,
@@ -398,15 +403,25 @@ static void S32_D565_Blend_Dither_neon(uint16_t *dst, const SkPMColor *src,
}
}
-#define S32A_D565_Opaque_PROC S32A_D565_Opaque_neon
#define S32A_D565_Blend_PROC S32A_D565_Blend_neon
#define S32_D565_Blend_Dither_PROC S32_D565_Blend_Dither_neon
#else
-#define S32A_D565_Opaque_PROC NULL
#define S32A_D565_Blend_PROC NULL
#define S32_D565_Blend_Dither_PROC NULL
#endif
+/*
+ * Use the asm version of the BlitRow function. NEON instructions
+ * are used on ARMv7 targets.
+ */
+#define S32A_Opaque_BlitRow32_PROC S32A_Opaque_BlitRow32_asm
+
+/*
+ * Defer to the NEON BLIT assembly in t32cb16blend.S (wired into the
+ * default 565 proc table): it processes 16 pixels at a time and
+ * optimizes the alpha=255 case, so no platform override is installed here.
+ */
+#define S32A_D565_Opaque_PROC NULL
+
/* Don't have a special version that assumes each src is opaque, but our S32A
is still faster than the default, so use it here
*/
@@ -446,7 +461,7 @@ const SkBlitRow::Proc SkBlitRow::gPlatform_4444_Procs[] = {
const SkBlitRow::Proc32 SkBlitRow::gPlatform_Procs32[] = {
NULL, // S32_Opaque,
NULL, // S32_Blend,
- NULL, // S32A_Opaque,
+ S32A_Opaque_BlitRow32_PROC,
NULL, // S32A_Blend,
};
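
Note: a NULL slot in these platform tables means "fall back to the portable default", so replacing the NULL in the S32A_Opaque slot is what actually routes 32-bit src-over through S32A_Opaque_BlitRow32_asm. A hedged sketch of the selection shape (the real logic lives in SkBlitRow::Factory32; names and the simplified signature are assumptions):

    #include <stdint.h>

    typedef void (*Proc32)(uint32_t* dst, const uint32_t* src,
                           int count, unsigned alpha);

    static Proc32 factory32_sketch(const Proc32 platform[],
                                   const Proc32 defaults[],
                                   unsigned flags) {
        Proc32 proc = platform[flags];
        return proc ? proc : defaults[flags];  // NULL -> portable default
    }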