author     Steve Kondik <shade@chemlab.org>  2010-04-11 15:22:53 -0400
committer  Steve Kondik <shade@chemlab.org>  2010-04-11 15:22:53 -0400
commit     d76be2dc139764e92294234b822ef3cbb7253cc8 (patch)
tree       1717117afdbc4f3592b95598d4b2f37b840209ca
parent     e7ccf8c1cdac8dfbe353ec3201f36fac398f19c6 (diff)
parent     bcbd70c3951bbc0e1b09132fe21c1cf04982909e (diff)
Merge branch 'eclair' of git@github.com:cyanogen/android_external_skia into donut

Conflicts:
	Android.mk
-rw-r--r--  Android.mk                                        |  27
-rw-r--r--  include/core/SkUtils.h                            |  16
-rw-r--r--  src/core/SkBitmapProcState.cpp                    |   4
-rw-r--r--  src/core/SkBitmapProcState_sample.h               |  11
-rw-r--r--  src/core/SkBlitRow_D16.cpp                        |   8
-rw-r--r--  src/core/SkXfermode.cpp                           |  15
-rw-r--r--  src/core/asm/S32A_Opaque_BlitRow32.S              | 320
-rw-r--r--  src/core/asm/S32_Opaque_D32_nofilter_DX_gether.S  |  85
-rw-r--r--  src/core/asm/memset16_neon.S                      | 158
-rw-r--r--  src/core/asm/memset32_neon.S                      | 146
-rw-r--r--  src/core/asm/t32cb16blend.S                       | 325
-rw-r--r--  src/core/asm/xfer.S                               | 136
-rw-r--r--  src/images/SkImageDecoder_libjpeg.cpp             |  52
-rw-r--r--  src/opts/SkBlitRow_opts_arm.cpp                   |  21
14 files changed, 1292 insertions, 32 deletions
diff --git a/Android.mk b/Android.mk
index 4b4605dda1..b1446d37e4 100644
--- a/Android.mk
+++ b/Android.mk
@@ -197,6 +197,33 @@ endif
LOCAL_SRC_FILES += \
emoji/EmojiFont.cpp
+# Include the optimized assembly code for the src-over operation
+ifeq ($(TARGET_ARCH),arm)
+ LOCAL_CFLAGS += -DUSE_T32CB16BLEND_ASM
+ LOCAL_SRC_FILES += \
+ src/core/asm/t32cb16blend.S \
+ src/core/asm/xfer.S \
+ src/core/asm/S32A_Opaque_BlitRow32.S
+endif
+
+ifeq ($(TARGET_ARCH_VARIANT),armv6)
+ ARCH_ARMV6_ARMV7 := true
+endif
+
+ifeq ($(TARGET_ARCH_VARIANT),armv7-a)
+ ARCH_ARMV6_ARMV7 := true
+endif
+
+ifeq ($(ARCH_ARMV6_ARMV7),true)
+ LOCAL_SRC_FILES += \
+ src/core/asm/S32_Opaque_D32_nofilter_DX_gether.S
+endif
+
+ifeq ($(ARCH_ARM_HAVE_NEON),true)
+ LOCAL_SRC_FILES += \
+ src/core/asm/memset16_neon.S \
+ src/core/asm/memset32_neon.S
+endif
LOCAL_SHARED_LIBRARIES := \
libcutils \
diff --git a/include/core/SkUtils.h b/include/core/SkUtils.h
index 9f3b1d6f36..deeebf00e7 100644
--- a/include/core/SkUtils.h
+++ b/include/core/SkUtils.h
@@ -36,10 +36,18 @@ void sk_memset16_portable(uint16_t dst[], uint16_t value, int count);
void sk_memset32_portable(uint32_t dst[], uint32_t value, int count);
#ifdef ANDROID
- #include "cutils/memory.h"
-
- #define sk_memset16(dst, value, count) android_memset16(dst, value, (count) << 1)
- #define sk_memset32(dst, value, count) android_memset32(dst, value, (count) << 2)
+ #if defined(__ARM_HAVE_NEON)
+ extern "C" void memset16_neon(uint16_t*, uint16_t, int);
+ extern "C" void memset32_neon(uint32_t*, uint32_t, int);
+
+ #define sk_memset16(dst, value, count) memset16_neon(dst, value, (count) << 1)
+ #define sk_memset32(dst, value, count) memset32_neon(dst, value, (count) << 2)
+ #else
+ #include "cutils/memory.h"
+
+ #define sk_memset16(dst, value, count) android_memset16(dst, value, (count) << 1)
+ #define sk_memset32(dst, value, count) android_memset32(dst, value, (count) << 2)
+ #endif
#endif
#ifndef sk_memset16
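
Note: in both branches of this hunk, the third macro argument is a byte count, not an element count -- that is why sk_memset16/sk_memset32 shift `count` left by 1 or 2 before calling android_memset16/android_memset32 or the new NEON entry points. A minimal C++ sketch of that contract (the *_ref names are illustrative, not part of the tree):

    #include <stdint.h>

    // Reference semantics assumed by the macros above: the last
    // parameter counts BYTES, hence (count) << 1 for 16-bit fills
    // and (count) << 2 for 32-bit fills.
    static void memset16_ref(uint16_t* dst, uint16_t value, int bytes) {
        for (; bytes > 0; bytes -= 2)
            *dst++ = value;
    }

    static void memset32_ref(uint32_t* dst, uint32_t value, int bytes) {
        for (; bytes > 0; bytes -= 4)
            *dst++ = value;
    }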
diff --git a/src/core/SkBitmapProcState.cpp b/src/core/SkBitmapProcState.cpp
index 600b963d9d..0fdc6c8e95 100644
--- a/src/core/SkBitmapProcState.cpp
+++ b/src/core/SkBitmapProcState.cpp
@@ -86,7 +86,11 @@ static inline U8CPU Filter_8(unsigned x, unsigned y,
SkASSERT(state.fAlphaScale == 256)
#define RETURNDST(src) src
#define SRC_TO_FILTER(src) src
+#if __ARM_ARCH__ >= 6 && !defined(SK_CPU_BENDIAN)
+ #define USE_GETHER32
+#endif
#include "SkBitmapProcState_sample.h"
+#undef USE_GETHER32
#undef FILTER_PROC
#define FILTER_PROC(x, y, a, b, c, d, dst) Filter_32_alpha(x, y, a, b, c, d, dst, alphaScale)
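
Note: SkBitmapProcState_sample.h is a template-by-inclusion header -- it is included several times under different macro definitions, and each inclusion stamps out one specialized sampler. Defining USE_GETHER32 around only this inclusion (and only for little-endian ARMv6+) means only the 32-bit opaque no-filter variant picks up the assembly gather routine. A self-contained toy showing the mechanism (all names made up):

    #include <stdio.h>

    // One "body", stamped out twice under different macros -- the same
    // idea as including SkBitmapProcState_sample.h with different
    // MAKENAME/RETURNDST definitions each time.
    #define SAMPLE_BODY(NAME, RETURNDST) \
        static unsigned NAME(unsigned src) { return RETURNDST(src); }

    #define IDENTITY(src) (src)
    #define HALVE(src)    ((src) >> 1)

    SAMPLE_BODY(sample_opaque, IDENTITY)  // like the opaque instantiation
    SAMPLE_BODY(sample_blend,  HALVE)     // like an alpha-scaled variant

    int main() {
        printf("%u %u\n", sample_opaque(100), sample_blend(100)); // 100 50
        return 0;
    }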
diff --git a/src/core/SkBitmapProcState_sample.h b/src/core/SkBitmapProcState_sample.h
index 4e1f1395e3..ae0750b20f 100644
--- a/src/core/SkBitmapProcState_sample.h
+++ b/src/core/SkBitmapProcState_sample.h
@@ -16,6 +16,13 @@
#error "unsupported DSTSIZE"
#endif
+#if defined(USE_GETHER32)
+ extern "C" void S32_Opaque_D32_nofilter_DX_gether(SkPMColor* SK_RESTRICT colors,
+ const SkPMColor* SK_RESTRICT srcAddr,
+ int count,
+ const uint32_t* SK_RESTRICT xy);
+#endif
+
static void MAKENAME(_nofilter_DXDY)(const SkBitmapProcState& s,
const uint32_t* SK_RESTRICT xy,
int count, DSTTYPE* SK_RESTRICT colors) {
@@ -85,6 +92,9 @@ static void MAKENAME(_nofilter_DX)(const SkBitmapProcState& s,
DSTTYPE dstValue = RETURNDST(src);
BITMAPPROC_MEMSET(colors, dstValue, count);
} else {
+#if defined(USE_GETHER32)
+ S32_Opaque_D32_nofilter_DX_gether(colors, srcAddr, count, xy);
+#else
int i;
for (i = (count >> 2); i > 0; --i) {
uint32_t xx0 = *xy++;
@@ -104,6 +114,7 @@ static void MAKENAME(_nofilter_DX)(const SkBitmapProcState& s,
SkASSERT(*xx < (unsigned)s.fBitmap->width());
src = srcAddr[*xx++]; *colors++ = RETURNDST(src);
}
+#endif
}
#ifdef POSTAMBLE
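
Note: the assembly replaces only the gather loop shown above, so its observable behavior can be stated in a few lines of C++. A scalar sketch of what S32_Opaque_D32_nofilter_DX_gether computes (xy packs two 16-bit x indices per word, low halfword first on little-endian -- which is why USE_GETHER32 is gated on !SK_CPU_BENDIAN):

    #include <stdint.h>

    static void gather32_ref(uint32_t* colors, const uint32_t* srcAddr,
                             int count, const uint32_t* xy) {
        const uint16_t* xx = (const uint16_t*)xy;  // two x indices per word
        for (int i = 0; i < count; ++i)
            *colors++ = srcAddr[*xx++];            // pure gather, no filter
    }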
diff --git a/src/core/SkBlitRow_D16.cpp b/src/core/SkBlitRow_D16.cpp
index 66ac90e29a..a0d4b99135 100644
--- a/src/core/SkBlitRow_D16.cpp
+++ b/src/core/SkBlitRow_D16.cpp
@@ -215,12 +215,20 @@ static void S32A_D565_Blend_Dither(uint16_t* SK_RESTRICT dst,
///////////////////////////////////////////////////////////////////////////////
///////////////////////////////////////////////////////////////////////////////
+#ifdef USE_T32CB16BLEND_ASM
+ extern "C" void scanline_t32cb16blend_arm(uint16_t*, uint32_t*, size_t);
+#endif
+
static const SkBlitRow::Proc gDefault_565_Procs[] = {
// no dither
S32_D565_Opaque,
S32_D565_Blend,
+#ifdef USE_T32CB16BLEND_ASM
+ (SkBlitRow::Proc)scanline_t32cb16blend_arm,
+#else
S32A_D565_Opaque,
+#endif
S32A_D565_Blend,
// dither
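
Note: scanline_t32cb16blend_arm takes three arguments, while SkBlitRow::Proc of this era also carries alpha and the dither coordinates x/y; the cast works because under the ARM AAPCS a callee simply never reads arguments it does not declare. A C++ illustration of that trick (formally undefined behavior in the language, tolerated here by the ABI; all names made up):

    #include <stdint.h>
    #include <stddef.h>

    typedef void (*Proc6)(uint16_t*, const uint32_t*, size_t,
                          unsigned, int, int);

    static void three_arg_impl(uint16_t* dst, const uint32_t* src,
                               size_t count) {
        for (size_t i = 0; i < count; ++i)
            dst[i] = (uint16_t)src[i];      // stand-in for the real blend
    }

    int main() {
        uint16_t d[2]; uint32_t s[2] = {1, 2};
        Proc6 p = (Proc6)three_arg_impl;    // same cast as the table entry
        p(d, s, 2, 0xFF, 0, 0);             // alpha/x/y silently ignored
        return (d[0] + d[1] == 3) ? 0 : 1;
    }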
diff --git a/src/core/SkXfermode.cpp b/src/core/SkXfermode.cpp
index 8d1531a0a0..0f56d866df 100644
--- a/src/core/SkXfermode.cpp
+++ b/src/core/SkXfermode.cpp
@@ -19,6 +19,9 @@
#define SkAlphaMulAlpha(a, b) SkMulDiv255Round(a, b)
+static SkPMColor src_modeproc(SkPMColor , SkPMColor );
+extern "C" void xfer16_arm(uint16_t*, uint32_t*, uint32_t);
+
static SkPMColor SkFourByteInterp(SkPMColor src, SkPMColor dst, U8CPU alpha) {
unsigned scale = SkAlpha255To256(alpha);
@@ -233,10 +236,14 @@ void SkProcXfermode::xfer16(SK_RESTRICT uint16_t dst[],
if (NULL != proc) {
if (NULL == aa) {
- for (int i = count - 1; i >= 0; --i) {
- SkPMColor dstC = SkPixel16ToPixel32(dst[i]);
- dst[i] = SkPixel32ToPixel16_ToU16(proc(src[i], dstC));
- }
+ if (proc == src_modeproc) {
+ xfer16_arm(dst, (uint32_t*)src, count);
+ } else {
+ for (int i = count - 1; i >= 0; --i) {
+ SkPMColor dstC = SkPixel16ToPixel32(dst[i]);
+ dst[i] = SkPixel32ToPixel16_ToU16(proc(src[i], dstC));
+ }
+ }
} else {
for (int i = count - 1; i >= 0; --i) {
unsigned a = aa[i];
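
Note: the fast path fires only when the resolved per-pixel proc is src_modeproc, i.e. the Src transfer mode, whose result is simply the source pixel; xfer16_arm can therefore convert 32-bit source pixels straight to RGB565 without reading the destination. A scalar sketch of the loop it replaces in that mode (source layout AABBGGRR, matching the asm in xfer.S below):

    #include <stdint.h>

    // Src mode on a 565 target: dst[i] = pack565(src[i]).
    static void xfer16_src_ref(uint16_t* dst, const uint32_t* src,
                               uint32_t count) {
        for (uint32_t i = 0; i < count; ++i) {
            uint32_t s = src[i];
            unsigned r = s & 0xFF;
            unsigned g = (s >> 8) & 0xFF;
            unsigned b = (s >> 16) & 0xFF;
            dst[i] = (uint16_t)(((r & 0xF8) << 8) |
                                ((g & 0xFC) << 3) |
                                 (b >> 3));
        }
    }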
diff --git a/src/core/asm/S32A_Opaque_BlitRow32.S b/src/core/asm/S32A_Opaque_BlitRow32.S
new file mode 100644
index 0000000000..1454dd19f6
--- /dev/null
+++ b/src/core/asm/S32A_Opaque_BlitRow32.S
@@ -0,0 +1,320 @@
+/*
+ * Copyright (c) 2005-2008, The Android Open Source Project
+ * Copyright (c) 2010, Code Aurora Forum. All rights reserved.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+ .text
+
+ .global S32A_Opaque_BlitRow32_asm
+ .func S32A_Opaque_BlitRow32_asm
+
+S32A_Opaque_BlitRow32_asm:
+
+#if __ARM_ARCH__ == 7 || defined(__ARM_NEON__)
+
+ push {r4-r11}
+ cmp r2,#24
+ blt .Lless_than_24
+
+ vpush {Q4-Q7}
+
+ vmov.i16 q14,#0xff //Q14.16 = 255
+//prefix
+ vld4.8 {d0, d1, d2, d3}, [r1]! //d0,d1,d2,d3 = source rgb(0,1,2,3) A(0,1,2,3)
+ //update source ptr but not dst ptr
+ vld4.8 {d4, d5, d6, d7}, [r0] //d4,d5,d6,d7 = dst rgb(0,1,2,3) A(0,1,2,3)
+ add r3, r0, #32 // second dst pointer, 8 pixels (32 bytes) ahead
+ mov r5, #64
+ sub r2,r2,#8
+.Lloop:
+ sub r2,r2,#16
+ vsubw.u8 q4,q14,d3 //Q4.16 = 255-d3
+ //update source ptr but not dst ptr
+
+ //Effectively a threshold of 24: 8 words are already pre-loaded for the next round
+ cmp r2,#16
+
+ vsra.u16 q4,q4,#7 //Q4 = Q4.16+Q4 >> 7 ; A(0,1,2,3)
+
+ vmovl.u8 q6,d4 //Q6 = vmovl.u8 d4
+ vmovl.u8 q7,d5 //Q7 = vmovl.u8 d5
+ vmovl.u8 q8,d6 //Q8 = vmovl.u8 d6
+ vmovl.u8 q9,d7 //Q9 = vmovl.u8 d7
+
+
+ vmul.i16 q6,q6,q4 //Q6 = Q6 * Q4
+ vmul.i16 q7,q7,q4 //Q7 = Q7 * Q4
+
+ vld4.8 {d20, d21, d22, d23}, [r1]! //d20,d21,d22,d23 = source rgb(0,1,2,3) A(0,1,2,3)
+
+ vmul.i16 q8,q8,q4 //Q8 = Q8 * Q4
+ vmul.i16 q9,q9,q4 //Q9 = Q9 * Q4
+
+ vld4.8 {d24, d25, d26, d27}, [r3] //d4,d5,d6,d7 = dst rgb(0,1,2,3) A(0,1,2,3)
+
+ vshrn.i16 d4,q6,#8 //d4 = Q6.16 shrn 8
+ vshrn.i16 d5,q7,#8 //d5 = Q7.16 shrn 8
+ vshrn.i16 d6,q8,#8 //d6 = Q8.16 shrn 8
+ vshrn.i16 d7,q9,#8 //d7 = Q9.16 shrn 8
+
+ vadd.i8 d4,d4,d0 //d4 = d4+d0
+ vadd.i8 d5,d5,d1 //d5 = d5+d1
+ vadd.i8 d6,d6,d2 //d6 = d6+d2
+ vadd.i8 d7,d7,d3 //d7 = d7+d3
+
+ vst4.8 {d4, d5, d6, d7}, [r0], r5 //dst rgb(0,1,2,3) A(0,1,2,3) = d4,d5,d6,d7
+ //add r0, r0, r5
+
+ //The next 4 words
+// vld4.8 {d20, d21, d22, d23}, [r1]! ;d0,d1,d2,d3 = sourc rgb(0,1,2,3) A(0,1,2,3)
+// ;update source ptr but not dst ptr
+// vld4.8 {d24, d25, d26, d27}, [r0] ;d4,d5,d6,d7 = dst rgb(0,1,2,3) A(0,1,2,3)
+
+ //update source ptr but not dst ptr
+ vsubw.u8 q4,q14,d23 //Q4.16 = 255-d23
+
+ vsra.u16 q4,q4,#7 //Q4 = Q4.16+Q4 >> 7 ; A(0,1,2,3)
+
+ vmovl.u8 q6,d24 //Q6 = vmovl.u8 d4
+ vmovl.u8 q7,d25 //Q7 = vmovl.u8 d5
+ vmovl.u8 q8,d26 //Q8 = vmovl.u8 d6
+ vmovl.u8 q9,d27 //Q9 = vmovl.u8 d7
+
+ vmul.i16 q6,q6,q4 //Q6 = Q6 * Q4
+ vmul.i16 q7,q7,q4 //Q7 = Q7 * Q4
+
+ vld4.8 {d0, d1, d2, d3}, [r1]! //d0,d1,d2,d3 = source rgb(0,1,2,3) A(0,1,2,3)
+
+ vmul.i16 q8,q8,q4 //Q8 = Q8 * Q4
+ vmul.i16 q9,q9,q4 //Q9 = Q9 * Q4
+
+ vld4.8 {d4, d5, d6, d7}, [r0] //d4,d5,d6,d7 = dst rgb(0,1,2,3) A(0,1,2,3)
+ vshrn.i16 d24,q6,#8 //d4 = Q6.16 shrn 8
+ vshrn.i16 d25,q7,#8 //d5 = Q7.16 shrn 8
+ vshrn.i16 d26,q8,#8 //d6 = Q8.16 shrn 8
+ vshrn.i16 d27,q9,#8 //d7 = Q9.16 shrn 8
+
+ vadd.i8 d24,d24,d20 //d4 = d4+d0
+ vadd.i8 d25,d25,d21 //d5 = d5+d1
+ vadd.i8 d26,d26,d22 //d6 = d6+d2
+ vadd.i8 d27,d27,d23 //d7 = d7+d3
+
+ vst4.8 {d24, d25, d26, d27}, [r3], r5 //dst rgb(0,1,2,3) A(0,1,2,3) = d4,d5,d6,d7
+ //add r3, r3, r5
+
+ bge .Lloop
+
+//postfix:
+//There are 8 words left unprocessed from previous round
+ vmov.i16 q4,#0xff //Q4.16 = 255
+ vsubw.u8 q4,q4,d3 //Q4.16 = 255-d3
+
+ cmp r2,#8
+
+ vshr.u16 q5,q4,#7 //Q5.16 = Q4 >> 7
+ vadd.i16 q4,q4,q5 //Q4 = Q4.16+Q5.16 ; A(0,1,2,3)
+
+ vmovl.u8 q6,d4 //Q6 = vmovl.u8 d4
+ vmovl.u8 q7,d5 //Q7 = vmovl.u8 d5
+ vmovl.u8 q8,d6 //Q8 = vmovl.u8 d6
+ vmovl.u8 q9,d7 //Q9 = vmovl.u8 d7
+
+ vmul.i16 q6,q6,q4 //Q6 = Q6 * Q4
+ vmul.i16 q7,q7,q4 //Q7 = Q7 * Q4
+ vmul.i16 q8,q8,q4 //Q8 = Q8 * Q4
+ vmul.i16 q9,q9,q4 //Q9 = Q9 * Q4
+
+ vshrn.i16 d4,q6,#8 //d4 = Q6.16 shrn 8
+ vshrn.i16 d5,q7,#8 //d5 = Q7.16 shrn 8
+ vshrn.i16 d6,q8,#8 //d6 = Q8.16 shrn 8
+ vshrn.i16 d7,q9,#8 //d7 = Q9.16 shrn 8
+
+ vadd.i8 d4,d4,d0 //d4 = d4+d0
+ vadd.i8 d5,d5,d1 //d5 = d5+d1
+ vadd.i8 d6,d6,d2 //d6 = d6+d2
+ vadd.i8 d7,d7,d3 //d7 = d7+d3
+
+ vst4.8 {d4, d5, d6, d7}, [r0]! //dst rgb(0,1,2,3) A(0,1,2,3) = d4,d5,d6,d7
+
+.Lless_than_16:
+ cmp r2,#8
+ blt .Lless_than_8
+
+ sub r2,r2,#8
+
+ vld4.8 {d0, d1, d2, d3}, [r1]! //d0,d1,d2,d3 = source rgb(0,1,2,3) A(0,1,2,3)
+ //update source ptr but not dst ptr
+ vld4.8 {d4, d5, d6, d7}, [r0] //d4,d5,d6,d7 = dst rgb(0,1,2,3) A(0,1,2,3)
+
+ vmov.i16 q4,#0xff //Q4.16 = 255
+ vsubw.u8 q4,q4,d3 //Q4.16 = 255-d3
+
+ cmp r2,#8
+
+ vshr.u16 q5,q4,#7 //Q5.16 = Q4 >> 7
+ vadd.i16 q4,q4,q5 //Q4 = Q4.16+Q5.16 ; A(0,1,2,3)
+
+ vmovl.u8 q6,d4 //Q6 = vmovl.u8 d4
+ vmovl.u8 q7,d5 //Q7 = vmovl.u8 d5
+ vmovl.u8 q8,d6 //Q8 = vmovl.u8 d6
+ vmovl.u8 q9,d7 //Q9 = vmovl.u8 d7
+
+ vmul.i16 q6,q6,q4 //Q6 = Q6 * Q4
+ vmul.i16 q7,q7,q4 //Q7 = Q7 * Q4
+ vmul.i16 q8,q8,q4 //Q8 = Q8 * Q4
+ vmul.i16 q9,q9,q4 //Q9 = Q9 * Q4
+
+ vshrn.i16 d4,q6,#8 //d4 = Q6.16 shrn 8
+ vshrn.i16 d5,q7,#8 //d5 = Q7.16 shrn 8
+ vshrn.i16 d6,q8,#8 //d6 = Q8.16 shrn 8
+ vshrn.i16 d7,q9,#8 //d7 = Q9.16 shrn 8
+
+ vadd.i8 d4,d4,d0 //d4 = d4+d0
+ vadd.i8 d5,d5,d1 //d5 = d5+d1
+ vadd.i8 d6,d6,d2 //d6 = d6+d2
+ vadd.i8 d7,d7,d3 //d7 = d7+d3
+
+ vst4.8 {d4, d5, d6, d7}, [r0]! //dst rgb(0,1,2,3) A(0,1,2,3) = d4,d5,d6,d7
+
+ //It will be guaranteed to be less than 8
+ //bge loop
+.Lless_than_8:
+ vpop {Q4-Q7}
+
+.Lless_than_4:
+ subs r4,r2,#1
+ bmi .Lto_exit // S32A_Opaque_BlitRow32_neon + 268
+ mov r8,#0xff
+ mvn r10,#0xff00
+ orr r9,r8,r8,lsl #16
+ lsl r11,r9,#8
+.Lresidual_loop:
+ ldr r3,[r1,#0]
+ ldr r12,[r0,#0]
+ add r1,r1,#4
+ sub r2,r8,r3,lsr #24
+ and r5,r12,r9
+ cmp r2,r2
+ add r2,r2,#1
+ and r12,r10,r12,lsr #8
+ strne r6,[r7,#0xeef]
+ mul r5,r5,r2
+ mul r2,r12,r2
+ strne r6,[r7,#0xeef]
+ subs r4,r4,#1
+ and r12,r9,r5,lsr #8
+ and r2,r2,r11
+ orr r2,r2,r12
+ add r2,r2,r3
+ str r2,[r0],#4
+ bpl .Lresidual_loop // S32A_Opaque_BlitRow32_neon + 192
+
+.Lto_exit:
+ pop {r4-r11}
+ bx lr
+
+.Lless_than_24:
+ cmp r2,#8
+ blt .Lless_than_4
+
+.Lloop_8:
+ sub r2,r2,#8
+ // We already read the 8 words from the previous pipe line
+ vld4.8 {d0, d1, d2, d3}, [r1]! //d0,d1,d2,d3 = source rgb(0,1,2,3) A(0,1,2,3)
+ //update source ptr but not dst ptr
+ vld4.8 {d4, d5, d6, d7}, [r0] //d4,d5,d6,d7 = dst rgb(0,1,2,3) A(0,1,2,3)
+
+ vmov.i16 q10,#0xff //Q10.16 = 255
+ vsubw.u8 q10,q10,d3 //Q10.16 = 255-d3
+
+ cmp r2,#8
+
+ vshr.u16 q11,q10,#7 //Q5.16 = Q4 >> 7
+ vadd.i16 q10,q10,q11 //Q4 = Q4.16+Q5.16 ; A(0,1,2,3)
+
+ vmovl.u8 q12,d4 //Q6 = vmovl.u8 d4
+ vmovl.u8 q13,d5 //Q7 = vmovl.u8 d5
+ vmovl.u8 q8,d6 //Q8 = vmovl.u8 d6
+ vmovl.u8 q9,d7 //Q9 = vmovl.u8 d7
+
+ vmul.i16 q12,q12,q10 //Q6 = Q6 * Q4
+ vmul.i16 q13,q13,q10 //Q7 = Q7 * Q4
+ vmul.i16 q8,q8,q10 //Q8 = Q8 * Q4
+ vmul.i16 q9,q9,q10 //Q9 = Q9 * Q4
+
+ vshrn.i16 d4,q12,#8 //d4 = Q6.16 shrn 8
+ vshrn.i16 d5,q13,#8 //d5 = Q7.16 shrn 8
+ vshrn.i16 d6,q8,#8 //d6 = Q8.16 shrn 8
+ vshrn.i16 d7,q9,#8 //d7 = Q9.16 shrn 8
+
+ vadd.i8 d4,d4,d0 //d4 = d4+d0
+ vadd.i8 d5,d5,d1 //d5 = d5+d1
+ vadd.i8 d6,d6,d2 //d6 = d6+d2
+ vadd.i8 d7,d7,d3 //d7 = d7+d3
+
+ vst4.8 {d4, d5, d6, d7}, [r0]! //dst rgb(0,1,2,3) A(0,1,2,3) = d4,d5,d6,d7
+
+ bge .Lloop_8
+ b .Lless_than_4
+
+#else
+
+/*
+ * r0 - dst
+ * r1 - src
+ * r2 - count
+ */
+ push {r4-r11}
+ mov r9, #0xFF
+ orr r10, r9, r9, lsl #16
+ mvn r11, r10
+
+.Lblitrow32_loop:
+ ldr r3, [r0]
+ ldr r4, [r1], #4
+
+ cmp r3, #0
+ streq r4, [r0], #4
+ beq .Lblitrow32_loop_cond
+
+ // r5 <- (255-alpha)+1
+ sub r5, r9, r4, lsr #24
+ and r6, r3, r10
+ add r5, r5, #1
+ and r7, r10, r3, lsr #8
+
+ mul r8, r6, r5
+ lsr r6, r8, #8
+ mul r8, r7, r5
+
+ // combine rb and ag
+ and r6, r6, r10
+ and r7, r8, r11
+ orr r6, r6, r7
+
+ // add src to combined value
+ add r6, r6, r4
+ str r6, [r0], #4
+
+.Lblitrow32_loop_cond:
+ subs r2, r2, #1
+ bhi .Lblitrow32_loop
+ pop {r4-r11}
+ bx lr
+
+#endif
+
+.endfunc
+.size S32A_Opaque_BlitRow32_asm, .-S32A_Opaque_BlitRow32_asm
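
Note: both the NEON pipeline and the scalar residual loop implement standard premultiplied src-over with weight 256 - srcAlpha. The residual loop builds it literally (sub r2,r8,r3,lsr #24 then add r2,r2,#1); the NEON path approximates the +1 with vsra.u16 q4,q4,#7. A per-pixel C++ reference of the same arithmetic, matching the residual loop's masks (0x00FF00FF / 0xFF00FF00):

    #include <stdint.h>

    static inline uint32_t srcover_ref(uint32_t src, uint32_t dst) {
        unsigned scale = 256 - (src >> 24);                 // 256 - srcA
        uint32_t rb = (((dst & 0x00FF00FF) * scale) >> 8) & 0x00FF00FF;
        uint32_t ag = (((dst >> 8) & 0x00FF00FF) * scale)  & 0xFF00FF00;
        return src + (rb | ag);
    }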
diff --git a/src/core/asm/S32_Opaque_D32_nofilter_DX_gether.S b/src/core/asm/S32_Opaque_D32_nofilter_DX_gether.S
new file mode 100644
index 0000000000..3467432826
--- /dev/null
+++ b/src/core/asm/S32_Opaque_D32_nofilter_DX_gether.S
@@ -0,0 +1,85 @@
+/*
+ * Copyright (c) 2005-2008, The Android Open Source Project
+ * Copyright (c) 2010, Code Aurora Forum. All rights reserved.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+ .text
+ .global S32_Opaque_D32_nofilter_DX_gether
+ .func S32_Opaque_D32_nofilter_DX_gether
+S32_Opaque_D32_nofilter_DX_gether:
+ push {r0-r11,lr}
+ asr r0,r2,#3
+ sub sp,sp,#4 //23
+ cmp r0,#0
+ str r0,[sp,#0] //r0 = count >> 3
+ ble .L1_140
+ ldr r4,[sp,#4] //r4 = r0 (dst)
+ mov r0,r3
+ add r12,r3,#4
+ asr r8,r2,#3
+.L1_52:
+ ldm r3!, {r0,r6,r9,r11}
+ lsr r5,r0,#16 //30
+ ldr r5,[r1,r5,lsl #2] //30
+ lsr r7,r6,#16 //32
+ ldr r7,[r1,r7,lsl #2] //31
+ uxth r0,r0 //34
+ ldr r0,[r1,r0,lsl #2] //34
+ uxth r6,r6 //31
+ ldr r6,[r1,r6,lsl #2] //32
+ //stm r4!, {r0,r5,r6,r7} ;35
+ lsr r10,r9,#16 //30
+ ldr r10,[r1,r10,lsl #2] //30
+ lsr lr,r11,#16 //32
+ ldr lr,[r1,lr,lsl #2] //31
+ uxth r9,r9 //34
+ ldr r9,[r1,r9,lsl #2] //34
+ uxth r11,r11 //31
+ ldr r11,[r1,r11,lsl #2] //32
+ subs r8,r8,#1
+ stm r4!, {r0,r5,r6,r7,r9,r10,r11,lr} //35
+
+ bne .L1_52
+
+ ldr r0,[sp,#0] // count >> 3
+ mov r12,r0
+ ldr r0,[sp,#4] //r0 = dst
+ add r0,r0,r12,lsl #5 //dst += count >>3 << 5
+ str r0,[sp,#4] //save r0 into stack again
+.L1_140:
+//;;39 const uint16_t* SK_RESTRICT xx = (const uint16_t*)(xy);
+//;;40 for (i = (count & 7); i > 0; --i) {
+ tst r2,#7
+ beq .L1_184
+ ldr r0,[sp,#4] //r0 = current dst
+ and r2,r2,#7
+.L1_156:
+//;;41 //SkASSERT(*xx < (unsigned)s.fBitmap->width());
+//;;42 src = srcAddr[*xx++]; *colors++ = RETURNDST(src);
+ ldrh r4,[r3],#2
+ add r12,r0,#4
+//;;43 }
+ subs r2,r2,#1
+ ldr r4,[r1,r4,lsl #2] //42
+ str r4,[r0,#0] //42
+ mov r0,r12 //42
+ bne .L1_156
+.L1_184:
+//;;44 }
+ add sp,sp,#0x14
+ pop {r4-r11,pc}
+
+.endfunc
+.size S32_Opaque_D32_nofilter_DX_gether, .-S32_Opaque_D32_nofilter_DX_gether
diff --git a/src/core/asm/memset16_neon.S b/src/core/asm/memset16_neon.S
new file mode 100644
index 0000000000..0f04b90bbc
--- /dev/null
+++ b/src/core/asm/memset16_neon.S
@@ -0,0 +1,158 @@
+/* Copyright (c) 2009, Code Aurora Forum. All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
+ * * Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ * * Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in the
+ * documentation and/or other materials provided with the distribution.
+ * * Neither the name of Code Aurora nor
+ * the names of its contributors may be used to endorse or promote
+ * products derived from this software without specific prior written
+ * permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+ * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
+ * NON-INFRINGEMENT ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR
+ * CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
+ * EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
+ * PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS;
+ * OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY,
+ * WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR
+ * OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF
+ * ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ *
+ */
+
+/***************************************************************************
+ Neon memset: Attempts to do a memset with Neon registers if possible,
+ Inputs:
+ s: The buffer to write to
+ c: The integer data to write to the buffer
+ n: The count of bytes to write (size_t).
+ Outputs:
+
+***************************************************************************/
+
+ .code 32
+ .align 4
+ .globl memset16_neon
+ .func
+
+memset16_neon:
+ cmp r2, #0
+ bxeq lr
+
+ push {r0}
+
+ /* If we have <= 8 bytes, just do a quick loop to handle that */
+ cmp r2, #8
+ bgt memset_gt4
+memset_smallcopy_loop:
+ strh r1, [r0], #2
+ subs r2, r2, #2
+ bne memset_smallcopy_loop
+memset_smallcopy_done:
+ pop {r0}
+ bx lr
+
+memset_gt4:
+ /*
+ * Duplicate the lowest 16 bits of r1 across the register, giving
+ * two copies of the 16-bit value to store at once. We do this by
+ * copying the low halfword of r1 into its upper halfword.
+ */
+ orr r1, r1, r1, lsl #16
+ /*
+ * If we're copying > 64 bytes, then we may want to get
+ * onto a 16-byte boundary to improve speed even more.
+ */
+ cmp r2, #64
+ blt memset_route
+ ands r12, r0, #0xf
+ beq memset_route
+ /*
+ * Determine the number of bytes to move forward to get to the 16-byte
+ * boundary. Note that this will be a multiple of 4, since we
+ * already are word-aligned.
+ */
+ rsb r12, r12, #16
+ sub r2, r2, r12
+ lsls r12, r12, #29
+ strmi r1, [r0], #4
+ strcs r1, [r0], #4
+ strcs r1, [r0], #4
+ lsls r12, r12, #2
+ strcsh r1, [r0], #2
+memset_route:
+ /*
+ * Decide where to route for the maximum copy sizes. Note that we
+ * build q0 and q1 depending on if we'll need it, so that's
+ * interwoven here as well.
+ */
+ vdup.u32 d0, r1
+ cmp r2, #16
+ blt memset_8
+ vmov d1, d0
+ cmp r2, #64
+ blt memset_16
+ vmov q1, q0
+ cmp r2, #128
+ blt memset_32
+memset_128:
+ mov r12, r2, lsr #7
+memset_128_loop:
+ vst1.64 {q0, q1}, [r0]!
+ vst1.64 {q0, q1}, [r0]!
+ vst1.64 {q0, q1}, [r0]!
+ vst1.64 {q0, q1}, [r0]!
+ subs r12, r12, #1
+ bne memset_128_loop
+ ands r2, r2, #0x7f
+ beq memset_end
+memset_32:
+ movs r12, r2, lsr #5
+ beq memset_16
+memset_32_loop:
+ subs r12, r12, #1
+ vst1.64 {q0, q1}, [r0]!
+ bne memset_32_loop
+ ands r2, r2, #0x1f
+ beq memset_end
+memset_16:
+ movs r12, r2, lsr #4
+ beq memset_8
+memset_16_loop:
+ subs r12, r12, #1
+ vst1.32 {q0}, [r0]!
+ bne memset_16_loop
+ ands r2, r2, #0xf
+ beq memset_end
+ /*
+ * memset_8 isn't a loop, since we try to do our loops at 16
+ * bytes and above. We should loop there, then drop down here
+ * to finish the <16-byte versions. Same for memset_4 and
+ * memset_1.
+ */
+memset_8:
+ cmp r2, #8
+ blt memset_4
+ subs r2, r2, #8
+ vst1.32 {d0}, [r0]!
+memset_4:
+ cmp r2, #4
+ blt memset_2
+ subs r2, r2, #4
+ str r1, [r0], #4
+memset_2:
+ cmp r2, #0
+ ble memset_end
+ strh r1, [r0], #2
+memset_end:
+ pop {r0}
+ bx lr
+
+ .endfunc
+ .end
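
Note: the routine operates on a byte count (matching the (count) << 1 at the call site), aligns large fills to a 16-byte boundary, then drains 128/32/16-byte vector blocks before a scalar tail; memset32_neon below shares the same structure minus the halfword cases. A compact C++ sketch of that shape (illustrative only, not the tree's code):

    #include <stdint.h>

    static void memset16_sketch(uint16_t* dst, uint16_t v, int bytes) {
        if (bytes <= 8) {                     // small fill: halfword loop
            for (; bytes > 0; bytes -= 2) *dst++ = v;
            return;
        }
        while (bytes >= 16) {                 // vector blocks (vst1 in asm)
            for (int i = 0; i < 8; ++i) dst[i] = v;
            dst += 8;
            bytes -= 16;
        }
        for (; bytes > 0; bytes -= 2)         // 8/4/2-byte tail
            *dst++ = v;
    }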
diff --git a/src/core/asm/memset32_neon.S b/src/core/asm/memset32_neon.S
new file mode 100644
index 0000000000..b611357b75
--- /dev/null
+++ b/src/core/asm/memset32_neon.S
@@ -0,0 +1,146 @@
+/* Copyright (c) 2009, Code Aurora Forum. All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
+ * * Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ * * Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in the
+ * documentation and/or other materials provided with the distribution.
+ * * Neither the name of Code Aurora nor
+ * the names of its contributors may be used to endorse or promote
+ * products derived from this software without specific prior written
+ * permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+ * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
+ * NON-INFRINGEMENT ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR
+ * CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
+ * EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
+ * PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS;
+ * OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY,
+ * WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR
+ * OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF
+ * ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ *
+ */
+
+/***************************************************************************
+ Neon memset: Attempts to do a memset with Neon registers if possible,
+ Inputs:
+ s: The buffer to write to
+ c: The integer data to write to the buffer
+ n: The count of bytes to write (size_t).
+ Outputs:
+
+***************************************************************************/
+
+ .code 32
+ .align 4
+ .globl memset32_neon
+ .func
+
+memset32_neon:
+ cmp r2, #0
+ bxeq lr
+
+ push {r0}
+
+ /* If we have <= 8 bytes, just do a quick loop to handle that */
+ cmp r2, #8
+ bgt memset_gt4
+memset_smallcopy_loop:
+ str r1, [r0], #4
+ subs r2, r2, #4
+ bne memset_smallcopy_loop
+memset_smallcopy_done:
+ pop {r0}
+ bx lr
+
+memset_gt4:
+ /*
+ * If we're copying > 64 bytes, then we may want to get
+ * onto a 16-byte boundary to improve speed even more.
+ */
+ cmp r2, #64
+ blt memset_route
+ ands r12, r0, #0xf
+ beq memset_route
+ /*
+ * Determine the number of bytes to move forward to get to the 16-byte
+ * boundary. Note that this will be a multiple of 4, since we
+ * already are word-aligned.
+ */
+ rsb r12, r12, #16
+ sub r2, r2, r12
+ lsls r12, r12, #29
+ strmi r1, [r0], #4
+ strcs r1, [r0], #4
+ strcs r1, [r0], #4
+memset_route:
+ /*
+ * Decide where to route for the maximum copy sizes. Note that we
+ * build q0 and q1 depending on if we'll need it, so that's
+ * interwoven here as well.
+ */
+ vdup.u32 d0, r1
+ cmp r2, #16
+ blt memset_8
+ vmov d1, d0
+ cmp r2, #64
+ blt memset_16
+ vmov q1, q0
+ cmp r2, #128
+ blt memset_32
+memset_128:
+ mov r12, r2, lsr #7
+memset_128_loop:
+ vst1.64 {q0, q1}, [r0]!
+ vst1.64 {q0, q1}, [r0]!
+ vst1.64 {q0, q1}, [r0]!
+ vst1.64 {q0, q1}, [r0]!
+ subs r12, r12, #1
+ bne memset_128_loop
+ ands r2, r2, #0x7f
+ beq memset_end
+memset_32:
+ movs r12, r2, lsr #5
+ beq memset_16
+memset_32_loop:
+ subs r12, r12, #1
+ vst1.64 {q0, q1}, [r0]!
+ bne memset_32_loop
+ ands r2, r2, #0x1f
+ beq memset_end
+memset_16:
+ movs r12, r2, lsr #4
+ beq memset_8
+memset_16_loop:
+ subs r12, r12, #1
+ vst1.32 {q0}, [r0]!
+ bne memset_16_loop
+ ands r2, r2, #0xf
+ beq memset_end
+ /*
+ * memset_8 isn't a loop, since we try to do our loops at 16
+ * bytes and above. We should loop there, then drop down here
+ * to finish the <16-byte versions. Same for memset_4 and
+ * memset_1.
+ */
+memset_8:
+ cmp r2, #8
+ blt memset_4
+ subs r2, r2, #8
+ vst1.32 {d0}, [r0]!
+memset_4:
+ cmp r2, #4
+ blt memset_end
+ subs r2, r2, #4
+ str r1, [r0], #4
+memset_end:
+ pop {r0}
+ bx lr
+
+ .endfunc
+ .end
diff --git a/src/core/asm/t32cb16blend.S b/src/core/asm/t32cb16blend.S
new file mode 100644
index 0000000000..f835dd3271
--- /dev/null
+++ b/src/core/asm/t32cb16blend.S
@@ -0,0 +1,325 @@
+/*
+ * Copyright 2006, The Android Open Source Project
+ * Copyright (c) 2009, Code Aurora Forum.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+
+/*
+ * This file is derived from the libpixelflinger version of the BLIT routine.
+ * The algorithm used here is equivalent to the C function S32A_D565_Opaque.
+ * NEON instructions process 16 pixels at a time on ARMv7. When fewer than
+ * 16 pixels remain, or on ARMv6 and below, regular ARM instructions are
+ * used instead. The regular ARM code combines two 16-bit writes into one
+ * 32-bit write to the destination, pre-loads both destination and source,
+ * and unrolls the main loop thrice.
+ */
+ .text
+ .align
+
+ .global scanline_t32cb16blend_arm
+
+// uses r6, r7, r8, r9, r10, lr
+
+.macro pixel, DREG, SRC, FB, OFFSET
+
+ // SRC = AABBGGRR
+ subs r7, r10, \SRC, lsr #24 // sAA = 255 - sAA
+ beq 1f
+
+.if \OFFSET
+
+ // red
+ mov lr, \DREG, lsr #(\OFFSET + 6 + 5)
+ smlabb lr, r7, lr, r8
+ and r6, \SRC, r10
+ add lr, lr, lr, lsr #5
+ add lr, r6, lr, lsr #5
+ lsr lr, #3
+ orr \FB, lr, lsl #(\OFFSET + 11)
+
+ // green
+ and r6, \DREG, #(0x3F<<(\OFFSET + 5))
+ lsr r6, #5
+ smlabt r6, r7, r6, r9
+ and lr, r10, \SRC, lsr #(8)
+ add r6, r6, r6, lsr #6
+ add r6, lr, r6, lsr #6
+ lsr r6, #2
+ orr \FB, \FB, r6, lsl #(\OFFSET + 5)
+
+ // blue
+ and lr, \DREG, #(0x1F << \OFFSET)
+ smlabt lr, r7, lr, r8
+ and r6, r10, \SRC, lsr #(8+8)
+ add lr, lr, lr, lsr #5
+ add lr, r6, lr, lsr #5
+ lsr lr, #3
+ orr \FB, \FB, lr, lsl #\OFFSET
+
+.else
+
+ // red
+ mov lr, \DREG, lsr #(6+5)
+ and lr, lr, #0x1F
+ smlabb lr, r7, lr, r8
+ and r6, \SRC, r10
+ add lr, lr, lr, lsr #5
+ add lr, r6, lr, lsr #5
+ lsr lr, #3
+ mov \FB, lr, lsl #11
+
+ // green
+ and r6, \DREG, #(0x3F<<5)
+ lsr r6, #5
+ smlabb r6, r7, r6, r9
+ and lr, r10, \SRC, lsr #(8)
+ add r6, r6, r6, lsr #6
+ add r6, lr, r6, lsr #6
+ lsr r6, #2
+ orr \FB, \FB, r6, lsl #5
+
+ // blue
+ and lr, \DREG, #0x1F
+ smlabb lr, r7, lr, r8
+ and r6, r10, \SRC, lsr #(8+8)
+ add lr, lr, lr, lsr #5
+ add lr, r6, lr, lsr #5
+ orr \FB, \FB, lr, lsr #3
+
+.endif
+ b 2f
+
+ /*
+ * When alpha = 255, down scale the source RGB pixel (24 bits)
+ * to 16 bits(RGB565)
+ */
+1:
+ lsl r6, \SRC, #8
+ lsr lr, \SRC, #5
+ and r7, r6, #0xf800
+ and lr, lr, #0x7e0
+ orr lr, lr, r7
+
+.if \OFFSET
+ orr lr, lr, r6, lsr #27
+ orr \FB, \FB, lr, lsl #(\OFFSET)
+.else
+ orr \FB, lr, r6, lsr #27
+.endif
+
+2:
+.endm
+
+
+// r0: dst ptr
+// r1: src ptr
+// r2: count
+// r3: d
+// r4: s0
+// r5: s1
+// r6: pixel
+// r7: pixel
+// r8: 0x10
+// r9: 0x20
+// r10: 0xFF
+// r11: free
+// r12: scratch
+// r14: free
+
+scanline_t32cb16blend_arm:
+ stmfd sp!, {r4-r10, lr}
+
+#if __ARM_ARCH__ == 7 || defined(__ARM_NEON__)
+ subs r2, r2, #16
+
+ blo blit_less_than_16_left
+
+ vmov.u16 q12, #0x80
+ vmov.u8 q13, #0xf8
+
+blit_neon_loop:
+ /*
+ * Load 64 bytes from source and 32 bytes from destination
+ * note that source pixels are 4 bytes wide and
+ * destination pixels are 2 bytes wide.
+ */
+ vld4.8 {d2, d4, d6, d8}, [r1]!
+ vld4.8 {d3, d5, d7, d9}, [r1]!
+
+ vand.8 d10, d8, d9
+ vmov r3, r4, d10
+
+ cmp r3, #0xffffffff
+ cmpeq r4, #0xffffffff
+ bne blit_alpha_not_255
+
+ // alpha equals 255 case
+
+ vshl.u8 q0, q2, #3
+
+ subs r2, r2, #16
+
+ vsri.u8 q1, q2, #5
+ vsri.u8 q0, q3, #3
+
+ // store the rgb destination values back to memory
+ vst2.8 {d0, d2}, [r0]!
+ vst2.8 {d1, d3}, [r0]!
+
+ blo blit_less_than_16_left
+ b blit_neon_loop
+
+blit_alpha_not_255:
+ // alpha = 255 - alpha
+ vmvn.u8 q0, q4
+
+ vld2.8 {q5, q6}, [r0]
+
+ vshl.u8 q7, q6, #3
+
+ subs r2, r2, #16
+
+ vand.u8 q6, q6, q13
+
+ vmov.16 q8, q12
+ vmov.16 q9, q12
+
+ vsri.u8 q7, q5, #5
+ vshl.u8 q5, q5, #3
+
+ vmlal.u8 q8, d0, d12
+ vmlal.u8 q9, d1, d13
+
+ vshl.u8 q7, q7, #2
+
+ vshr.u16 q10, q8, #5
+ vshr.u16 q11, q9, #5
+ vaddhn.u16 d12, q8, q10
+ vaddhn.u16 d13, q9, q11
+
+ vmov.16 q8, q12
+ vmov.16 q9, q12
+ vmlal.u8 q8, d0, d14
+ vmlal.u8 q9, d1, d15
+
+ vqadd.u8 q6, q6, q1
+
+ vshr.u16 q10, q8, #6
+ vshr.u16 q11, q9, #6
+ vaddhn.u16 d14, q8, q10
+ vaddhn.u16 d15, q9, q11
+
+ vmov.16 q8, q12
+ vmov.16 q9, q12
+ vmlal.u8 q8, d0, d10
+ vmlal.u8 q9, d1, d11
+
+ vqadd.u8 q7, q7, q2
+
+ vshl.u8 q5, q7, #3
+
+ vshr.u16 q10, q8, #5
+ vshr.u16 q11, q9, #5
+
+ vsri.u8 q6, q7, #5
+
+ vaddhn.u16 d16, q8, q10
+ vaddhn.u16 d17, q9, q11
+ vqadd.u8 q8, q8, q3
+
+ vsri.u8 q5, q8, #3
+
+ // store the rgb destination values back to memory
+ vst2.8 {d10, d12}, [r0]!
+ vst2.8 {d11, d13}, [r0]!
+
+ blo blit_less_than_16_left
+ b blit_neon_loop
+#endif
+
+blit_less_than_16_left:
+ pld [r1]
+
+ mov r8, #0x10
+ mov r9, #0x20
+ mov r10, #0xFF
+
+#if __ARM_ARCH__ == 7 || defined(__ARM_NEON__)
+ adds r2, r2, #14
+#else
+ subs r2, r2, #2
+#endif
+
+ pld [r0]
+ blo 9f
+
+ // The main loop is unrolled thrice and processes 6 pixels per pass
+8: ldmia r1!, {r4, r5}
+ // stream the source
+ pld [r1, #32]
+ add r0, r0, #4
+ // it's all zero, skip this pixel
+ orrs r3, r4, r5
+ beq 7f
+
+ // load the destination
+ ldr r3, [r0, #-4]
+ // stream the destination
+ pld [r0, #32]
+ pixel r3, r4, r12, 0
+ pixel r3, r5, r12, 16
+ // effectively, we're getting write-combining by virtue of the
+ // cpu's write-back cache.
+ str r12, [r0, #-4]
+
+ // 2nd iteration of the loop, don't stream anything
+ subs r2, r2, #2
+ blt 9f
+ ldmia r1!, {r4, r5}
+ add r0, r0, #4
+ orrs r3, r4, r5
+ beq 7f
+ ldr r3, [r0, #-4]
+ pixel r3, r4, r12, 0
+ pixel r3, r5, r12, 16
+ str r12, [r0, #-4]
+
+ // 3rd iteration of the loop, don't stream anything
+ subs r2, r2, #2
+ blt 9f
+ ldmia r1!, {r4, r5}
+ add r0, r0, #4
+ orrs r3, r4, r5
+ beq 7f
+ ldr r3, [r0, #-4]
+ pixel r3, r4, r12, 0
+ pixel r3, r5, r12, 16
+ str r12, [r0, #-4]
+
+7: subs r2, r2, #2
+ blo 9f
+ b 8b
+
+9: adds r2, r2, #1
+ ldmlofd sp!, {r4-r10, lr} // return
+ bxlo lr
+
+ // last pixel left
+ ldr r4, [r1], #4
+ ldrh r3, [r0]
+ pixel r3, r4, r12, 0
+ strh r12, [r0], #2
+ ldmfd sp!, {r4-r10, lr} // return
+ bx lr
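
Note: a per-pixel C++ reference for the `pixel` macro above, transcribed from its arithmetic (premultiplied AABBGGRR source over an RGB565 destination; the alpha == 255 branch is the pure 32-to-565 downscale at label 1:):

    #include <stdint.h>

    static inline uint16_t srcover_565_ref(uint32_t src, uint16_t dst) {
        unsigned a  = src >> 24;
        unsigned sr = src & 0xFF;            // AABBGGRR layout
        unsigned sg = (src >> 8) & 0xFF;
        unsigned sb = (src >> 16) & 0xFF;
        if (a == 255)                        // opaque fast path
            return (uint16_t)(((sr & 0xF8) << 8) |
                              ((sg & 0xFC) << 3) |
                               (sb >> 3));
        unsigned inv = 255 - a;
        unsigned r = inv * ((dst >> 11) & 0x1F) + 0x10;
        unsigned g = inv * ((dst >>  5) & 0x3F) + 0x20;
        unsigned b = inv * ( dst        & 0x1F) + 0x10;
        r = (sr + ((r + (r >> 5)) >> 5)) >> 3;  // same rounding as the asm
        g = (sg + ((g + (g >> 6)) >> 6)) >> 2;
        b = (sb + ((b + (b >> 5)) >> 5)) >> 3;
        return (uint16_t)((r << 11) | (g << 5) | b);
    }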
diff --git a/src/core/asm/xfer.S b/src/core/asm/xfer.S
new file mode 100644
index 0000000000..96d587333b
--- /dev/null
+++ b/src/core/asm/xfer.S
@@ -0,0 +1,136 @@
+/*
+ * Copyright 2006, The Android Open Source Project
+ * Copyright (c) 2009, Code Aurora Forum.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+
+ .text
+ .align
+
+ .global xfer16_arm
+
+.macro pixel, DREG, SRC, FB, OFFSET
+ lsl r6, \SRC, #8
+ lsr r8, \SRC, #5
+ and r7, r6, #0xf800
+ and r8, r8, #0x7e0
+ orr r8, r8, r7
+
+.if \OFFSET
+ orr r8, r8, r6, lsr #27
+ orr \FB, \FB, r8, lsl #(\OFFSET)
+.else
+ orr \FB, r8, r6, lsr #27
+.endif
+
+.endm
+
+xfer16_arm:
+ stmfd sp!, {r4-r8}
+
+#if __ARM_ARCH__ == 7 || defined(__ARM_NEON__)
+ subs r2, r2, #16
+
+ blo xfer16_less_than_16_left
+
+ vmov.u16 q12, #0x80
+ //pld [r1]
+ //pld [r1, #32]
+
+xfer16_neon_loop:
+ // load 64 bytes from source and 32 bytes from destination
+ // note that source pixels are 4 bytes wide and
+ // destination pixels are 2 bytes wide
+ vld4.8 {d2, d4, d6, d8}, [r1]!
+ vld4.8 {d3, d5, d7, d9}, [r1]!
+
+ vshl.u8 q0, q2, #3
+
+ subs r2, r2, #16
+
+ vsri.u8 q1, q2, #5
+ vsri.u8 q0, q3, #3
+
+ // store the rgb destination values back to memory
+ vst2.8 {d0, d2}, [r0]!
+ vst2.8 {d1, d3}, [r0]!
+
+ blo xfer16_less_than_16_left
+ b xfer16_neon_loop
+#endif
+
+xfer16_less_than_16_left:
+ pld [r1]
+
+#if __ARM_ARCH__ == 7 || defined(__ARM_NEON__)
+ adds r2, r2, #14
+#else
+ subs r2, r2, #2
+#endif
+
+ pld [r0]
+ blo 9f
+
+ // The main loop is unrolled thrice and processes 6 pixels per pass
+8: ldmia r1!, {r4, r5}
+ // stream the source
+ pld [r1, #32]
+ add r0, r0, #4
+
+ // load the destination
+ ldr r3, [r0, #-4]
+ // stream the destination
+ pld [r0, #32]
+ pixel r3, r4, r12, 0
+ pixel r3, r5, r12, 16
+ // effectively, we're getting write-combining by virtue of the
+ // cpu's write-back cache.
+ str r12, [r0, #-4]
+
+ // 2nd iteration of the loop, don't stream anything
+ subs r2, r2, #2
+ blt 9f
+ ldmia r1!, {r4, r5}
+ add r0, r0, #4
+ ldr r3, [r0, #-4]
+ pixel r3, r4, r12, 0
+ pixel r3, r5, r12, 16
+ str r12, [r0, #-4]
+
+ // 3rd iteration of the loop, don't stream anything
+ subs r2, r2, #2
+ blt 9f
+ ldmia r1!, {r4, r5}
+ add r0, r0, #4
+ ldr r3, [r0, #-4]
+ pixel r3, r4, r12, 0
+ pixel r3, r5, r12, 16
+ str r12, [r0, #-4]
+
+7: subs r2, r2, #2
+ blo 9f
+ b 8b
+
+9: adds r2, r2, #1
+ ldmlofd sp!, {r4-r8} // return
+ bxlo lr
+
+ // last pixel left
+ ldr r4, [r1], #4
+ ldrh r3, [r0]
+ pixel r3, r4, r12, 0
+ strh r12, [r0], #2
+ ldmfd sp!, {r4-r8} // return
+ bx lr
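
Note: the odd-looking adds r2, r2, #14 is loop-count re-biasing shared by xfer.S and t32cb16blend.S: the NEON loop runs with r2 biased by -16, the two-pixel ARM tail with r2 biased by -2, and +14 converts one bias into the other. A small C++ check of the bookkeeping (hypothetical helper, not in the tree):

    // Returns how many pixels the scalar tail still has to handle
    // after the 16-at-a-time NEON loop; always count % 16.
    static int tail_pixels_after_neon(int count) {
        int r2 = count - 16;           // entry bias (subs r2, r2, #16)
        while (r2 >= 0) r2 -= 16;      // one NEON pass per 16 pixels
        return (r2 + 14) + 2;          // +14 re-bias; tail bias is -2
    }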
diff --git a/src/images/SkImageDecoder_libjpeg.cpp b/src/images/SkImageDecoder_libjpeg.cpp
index 12fe76ab3b..279c8ab9c1 100644
--- a/src/images/SkImageDecoder_libjpeg.cpp
+++ b/src/images/SkImageDecoder_libjpeg.cpp
@@ -397,30 +397,40 @@ bool SkJPEGImageDecoder::onDecode(SkStream* stream, SkBitmap* bm,
/* image_width and image_height are the original dimensions, available
after jpeg_read_header(). To see the scaled dimensions, we have to call
- jpeg_start_decompress(), and then read output_width and output_height.
+ jpeg_calc_output_dimensions(), and then read output_width and output_height.
*/
+ jpeg_calc_output_dimensions(&cinfo);
+
+ /* We have enough information to return
+ to the caller if they just wanted (subsampled bounds). If sampleSize
+ was 1, then we would have already returned. Thus we just check if
+ we're in kDecodeBounds_Mode, and that we have valid output sizes.
+ */
+ if (SkImageDecoder::kDecodeBounds_Mode == mode &&
+ valid_output_dimensions(cinfo)) {
+ SkScaledBitmapSampler smpl(cinfo.output_width, cinfo.output_height,
+ recompute_sampleSize(sampleSize, cinfo));
+ bm->setConfig(config, smpl.scaledWidth(), smpl.scaledHeight());
+ bm->setIsOpaque(true);
+ return true;
+ }
+
+ sampleSize = recompute_sampleSize(sampleSize, cinfo);
+
+#ifdef ANDROID_RGB
+ if ((sampleSize != 1) && (cinfo.out_color_space == JCS_RGB_565)) {
+ /* Downscaling requires SkScaledBitmapSampler, which
+ can't handle RGB_565 yet, so don't even try.
+ Revert to the default format, JCS_RGB.
+ */
+ cinfo.out_color_space = JCS_RGB;
+ }
+#endif
+
if (!jpeg_start_decompress(&cinfo)) {
- /* If we failed here, we may still have enough information to return
- to the caller if they just wanted (subsampled bounds). If sampleSize
- was 1, then we would have already returned. Thus we just check if
- we're in kDecodeBounds_Mode, and that we have valid output sizes.
-
- One reason to fail here is that we have insufficient stream data
- to complete the setup. However, output dimensions seem to get
- computed very early, which is why this special check can pay off.
- */
- if (SkImageDecoder::kDecodeBounds_Mode == mode &&
- valid_output_dimensions(cinfo)) {
- SkScaledBitmapSampler smpl(cinfo.output_width, cinfo.output_height,
- recompute_sampleSize(sampleSize, cinfo));
- bm->setConfig(config, smpl.scaledWidth(), smpl.scaledHeight());
- bm->setIsOpaque(true);
- return true;
- } else {
- return return_false(cinfo, *bm, "start_decompress");
- }
+ return return_false(cinfo, *bm, "start_decompress");
}
- sampleSize = recompute_sampleSize(sampleSize, cinfo);
// should we allow the Chooser (if present) to pick a config for us???
if (!this->chooseFromOneChoice(config, cinfo.output_width,
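
Note: the restructure hinges on jpeg_calc_output_dimensions(), which fills output_width/output_height from the header alone, so bounds-only decodes can answer before jpeg_start_decompress() ever runs (and before its failure modes apply). A minimal sketch of the reordered flow (assumes an already-initialized decompress object and source manager; error handling elided):

    extern "C" {
    #include "jpeglib.h"
    }

    // Report the (scaled) output size without starting a full decode.
    static bool read_bounds_only(jpeg_decompress_struct* cinfo,
                                 int* w, int* h) {
        if (jpeg_read_header(cinfo, TRUE) != JPEG_HEADER_OK)
            return false;
        jpeg_calc_output_dimensions(cinfo);   // cheap: no scanline machinery
        *w = (int)cinfo->output_width;
        *h = (int)cinfo->output_height;
        return true;                          // jpeg_start_decompress skipped
    }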
diff --git a/src/opts/SkBlitRow_opts_arm.cpp b/src/opts/SkBlitRow_opts_arm.cpp
index 0c38113adf..fb530b42ec 100644
--- a/src/opts/SkBlitRow_opts_arm.cpp
+++ b/src/opts/SkBlitRow_opts_arm.cpp
@@ -20,6 +20,11 @@
#include "SkColorPriv.h"
#include "SkDither.h"
+extern "C" void S32A_Opaque_BlitRow32_asm(SkPMColor* SK_RESTRICT dst,
+ const SkPMColor* SK_RESTRICT src,
+ int count,
+ U8CPU alpha);
+
#if defined(__ARM_HAVE_NEON) && !defined(SK_CPU_BENDIAN)
static void S32A_D565_Opaque_neon(uint16_t* SK_RESTRICT dst,
const SkPMColor* SK_RESTRICT src, int count,
@@ -398,15 +403,25 @@ static void S32_D565_Blend_Dither_neon(uint16_t *dst, const SkPMColor *src,
}
}
-#define S32A_D565_Opaque_PROC S32A_D565_Opaque_neon
#define S32A_D565_Blend_PROC S32A_D565_Blend_neon
#define S32_D565_Blend_Dither_PROC S32_D565_Blend_Dither_neon
#else
-#define S32A_D565_Opaque_PROC NULL
#define S32A_D565_Blend_PROC NULL
#define S32_D565_Blend_Dither_PROC NULL
#endif
+/*
+ * Use the asm version of the BlitRow function. NEON instructions
+ * are used on ARMv7 targets.
+ */
+#define S32A_Opaque_BlitRow32_PROC S32A_Opaque_BlitRow32_asm
+
+/*
+ * Defer to the NEON BLIT assembly in t32cb16blend.S (wired into the
+ * default 565 proc table): it processes 16 pixels at a time and
+ * optimizes the alpha=255 case, so no platform override is installed here.
+ */
+#define S32A_D565_Opaque_PROC NULL
+
/* Don't have a special version that assumes each src is opaque, but our S32A
is still faster than the default, so use it here
*/
@@ -446,7 +461,7 @@ const SkBlitRow::Proc SkBlitRow::gPlatform_4444_Procs[] = {
const SkBlitRow::Proc32 SkBlitRow::gPlatform_Procs32[] = {
NULL, // S32_Opaque,
NULL, // S32_Blend,
- NULL, // S32A_Opaque,
+ S32A_Opaque_BlitRow32_PROC,
NULL, // S32A_Blend,
};
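
Note: a NULL slot in these platform tables means "fall back to the portable default", so replacing the NULL in the S32A_Opaque slot is what actually routes 32-bit src-over through S32A_Opaque_BlitRow32_asm. A hedged sketch of the selection shape (the real logic lives in SkBlitRow::Factory32; names and the simplified signature are assumptions):

    #include <stdint.h>

    typedef void (*Proc32)(uint32_t* dst, const uint32_t* src,
                           int count, unsigned alpha);

    static Proc32 factory32_sketch(const Proc32 platform[],
                                   const Proc32 defaults[],
                                   unsigned flags) {
        Proc32 proc = platform[flags];
        return proc ? proc : defaults[flags];  // NULL -> portable default
    }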