diff options
author | Steve Kondik <shade@chemlab.org> | 2010-04-11 15:22:53 -0400 |
---|---|---|
committer | Steve Kondik <shade@chemlab.org> | 2010-04-11 15:22:53 -0400 |
commit | d76be2dc139764e92294234b822ef3cbb7253cc8 (patch) | |
tree | 1717117afdbc4f3592b95598d4b2f37b840209ca | |
parent | e7ccf8c1cdac8dfbe353ec3201f36fac398f19c6 (diff) | |
parent | bcbd70c3951bbc0e1b09132fe21c1cf04982909e (diff) | |
download | android_external_skia-donut.tar.gz android_external_skia-donut.tar.bz2 android_external_skia-donut.zip |
Merge branch 'eclair' of git@github.com:cyanogen/android_external_skia into donut
Conflicts:
Android.mk
-rw-r--r-- | Android.mk | 27 | ||||
-rw-r--r-- | include/core/SkUtils.h | 16 | ||||
-rw-r--r-- | src/core/SkBitmapProcState.cpp | 4 | ||||
-rw-r--r-- | src/core/SkBitmapProcState_sample.h | 11 | ||||
-rw-r--r-- | src/core/SkBlitRow_D16.cpp | 8 | ||||
-rw-r--r-- | src/core/SkXfermode.cpp | 15 | ||||
-rw-r--r-- | src/core/asm/S32A_Opaque_BlitRow32.S | 320 | ||||
-rw-r--r-- | src/core/asm/S32_Opaque_D32_nofilter_DX_gether.S | 85 | ||||
-rw-r--r-- | src/core/asm/memset16_neon.S | 158 | ||||
-rw-r--r-- | src/core/asm/memset32_neon.S | 146 | ||||
-rw-r--r-- | src/core/asm/t32cb16blend.S | 325 | ||||
-rw-r--r-- | src/core/asm/xfer.S | 136 | ||||
-rw-r--r-- | src/images/SkImageDecoder_libjpeg.cpp | 52 | ||||
-rw-r--r-- | src/opts/SkBlitRow_opts_arm.cpp | 21 |
14 files changed, 1292 insertions, 32 deletions
diff --git a/Android.mk b/Android.mk index 4b4605dda1..b1446d37e4 100644 --- a/Android.mk +++ b/Android.mk @@ -197,6 +197,33 @@ endif LOCAL_SRC_FILES += \ emoji/EmojiFont.cpp +# including the optimized assembly code for the src-overing operation +ifeq ($(TARGET_ARCH),arm) + LOCAL_CFLAGS += -DUSE_T32CB16BLEND_ASM + LOCAL_SRC_FILES += \ + src/core/asm/t32cb16blend.S \ + src/core/asm/xfer.S \ + src/core/asm/S32A_Opaque_BlitRow32.S +endif + +ifeq ($(TARGET_ARCH_VARIANT),armv6) + ARCH_ARMV6_ARMV7 := true +endif + +ifeq ($(TARGET_ARCH_VARIANT),armv7-a) + ARCH_ARMV6_ARMV7 := true +endif + +ifeq ($(ARCH_ARMV6_ARMV7),true) + LOCAL_SRC_FILES += \ + src/core/asm/S32_Opaque_D32_nofilter_DX_gether.S +endif + +ifeq ($(ARCH_ARM_HAVE_NEON),true) + LOCAL_SRC_FILES += \ + src/core/asm/memset16_neon.S \ + src/core/asm/memset32_neon.S +endif LOCAL_SHARED_LIBRARIES := \ libcutils \ diff --git a/include/core/SkUtils.h b/include/core/SkUtils.h index 9f3b1d6f36..deeebf00e7 100644 --- a/include/core/SkUtils.h +++ b/include/core/SkUtils.h @@ -36,10 +36,18 @@ void sk_memset16_portable(uint16_t dst[], uint16_t value, int count); void sk_memset32_portable(uint32_t dst[], uint32_t value, int count); #ifdef ANDROID - #include "cutils/memory.h" - - #define sk_memset16(dst, value, count) android_memset16(dst, value, (count) << 1) - #define sk_memset32(dst, value, count) android_memset32(dst, value, (count) << 2) + #if defined(__ARM_HAVE_NEON) + extern "C" void memset16_neon(uint16_t*, uint16_t, int); + extern "C" void memset32_neon(uint32_t*, uint32_t, int); + + #define sk_memset16(dst, value, count) memset16_neon(dst, value, (count) << 1) + #define sk_memset32(dst, value, count) memset32_neon(dst, value, (count) << 2) + #else + #include "cutils/memory.h" + + #define sk_memset16(dst, value, count) android_memset16(dst, value, (count) << 1) + #define sk_memset32(dst, value, count) android_memset32(dst, value, (count) << 2) + #endif #endif #ifndef sk_memset16 diff --git 
a/src/core/SkBitmapProcState.cpp b/src/core/SkBitmapProcState.cpp index 600b963d9d..0fdc6c8e95 100644 --- a/src/core/SkBitmapProcState.cpp +++ b/src/core/SkBitmapProcState.cpp @@ -86,7 +86,11 @@ static inline U8CPU Filter_8(unsigned x, unsigned y, SkASSERT(state.fAlphaScale == 256) #define RETURNDST(src) src #define SRC_TO_FILTER(src) src +#if __ARM_ARCH__ >= 6 && !defined(SK_CPU_BENDIAN) + #define USE_GETHER32 +#endif #include "SkBitmapProcState_sample.h" +#undef USE_GETHER32 #undef FILTER_PROC #define FILTER_PROC(x, y, a, b, c, d, dst) Filter_32_alpha(x, y, a, b, c, d, dst, alphaScale) diff --git a/src/core/SkBitmapProcState_sample.h b/src/core/SkBitmapProcState_sample.h index 4e1f1395e3..ae0750b20f 100644 --- a/src/core/SkBitmapProcState_sample.h +++ b/src/core/SkBitmapProcState_sample.h @@ -16,6 +16,13 @@ #error "unsupported DSTSIZE" #endif +#if defined(USE_GETHER32) + extern "C" void S32_Opaque_D32_nofilter_DX_gether(SkPMColor* SK_RESTRICT colors, + const SkPMColor* SK_RESTRICT srcAddr, + int count, + const uint32_t* SK_RESTRICT xy); +#endif + static void MAKENAME(_nofilter_DXDY)(const SkBitmapProcState& s, const uint32_t* SK_RESTRICT xy, int count, DSTTYPE* SK_RESTRICT colors) { @@ -85,6 +92,9 @@ static void MAKENAME(_nofilter_DX)(const SkBitmapProcState& s, DSTTYPE dstValue = RETURNDST(src); BITMAPPROC_MEMSET(colors, dstValue, count); } else { +#if defined(USE_GETHER32) + S32_Opaque_D32_nofilter_DX_gether(colors, srcAddr, count, xy); +#else int i; for (i = (count >> 2); i > 0; --i) { uint32_t xx0 = *xy++; @@ -104,6 +114,7 @@ static void MAKENAME(_nofilter_DX)(const SkBitmapProcState& s, SkASSERT(*xx < (unsigned)s.fBitmap->width()); src = srcAddr[*xx++]; *colors++ = RETURNDST(src); } +#endif } #ifdef POSTAMBLE diff --git a/src/core/SkBlitRow_D16.cpp b/src/core/SkBlitRow_D16.cpp index 66ac90e29a..a0d4b99135 100644 --- a/src/core/SkBlitRow_D16.cpp +++ b/src/core/SkBlitRow_D16.cpp @@ -215,12 +215,20 @@ static void S32A_D565_Blend_Dither(uint16_t* SK_RESTRICT 
dst, /////////////////////////////////////////////////////////////////////////////// /////////////////////////////////////////////////////////////////////////////// +#ifdef USE_T32CB16BLEND_ASM + extern "C" void scanline_t32cb16blend_arm(uint16_t*, uint32_t*, size_t); +#endif + static const SkBlitRow::Proc gDefault_565_Procs[] = { // no dither S32_D565_Opaque, S32_D565_Blend, +#ifdef USE_T32CB16BLEND_ASM + (SkBlitRow::Proc)scanline_t32cb16blend_arm, +#else S32A_D565_Opaque, +#endif S32A_D565_Blend, // dither diff --git a/src/core/SkXfermode.cpp b/src/core/SkXfermode.cpp index 8d1531a0a0..0f56d866df 100644 --- a/src/core/SkXfermode.cpp +++ b/src/core/SkXfermode.cpp @@ -19,6 +19,9 @@ #define SkAlphaMulAlpha(a, b) SkMulDiv255Round(a, b) +static SkPMColor src_modeproc(SkPMColor , SkPMColor ); +extern "C" void xfer16_arm(uint16_t*, uint32_t*, uint32_t); + static SkPMColor SkFourByteInterp(SkPMColor src, SkPMColor dst, U8CPU alpha) { unsigned scale = SkAlpha255To256(alpha); @@ -233,10 +236,14 @@ void SkProcXfermode::xfer16(SK_RESTRICT uint16_t dst[], if (NULL != proc) { if (NULL == aa) { - for (int i = count - 1; i >= 0; --i) { - SkPMColor dstC = SkPixel16ToPixel32(dst[i]); - dst[i] = SkPixel32ToPixel16_ToU16(proc(src[i], dstC)); - } + if (proc == src_modeproc) { + xfer16_arm(dst, (uint32_t*)src, count); + } else { + for (int i = count - 1; i >= 0; --i) { + SkPMColor dstC = SkPixel16ToPixel32(dst[i]); + dst[i] = SkPixel32ToPixel16_ToU16(proc(src[i], dstC)); + } + } } else { for (int i = count - 1; i >= 0; --i) { unsigned a = aa[i]; diff --git a/src/core/asm/S32A_Opaque_BlitRow32.S b/src/core/asm/S32A_Opaque_BlitRow32.S new file mode 100644 index 0000000000..1454dd19f6 --- /dev/null +++ b/src/core/asm/S32A_Opaque_BlitRow32.S @@ -0,0 +1,320 @@ +/* + * Copyright (c) 2005-2008, The Android Open Source Project + * Copyright (c) 2010, Code Aurora Forum. All rights reserved. 
+ * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + + .text + + .global S32A_Opaque_BlitRow32_asm + .func S32A_Opaque_BlitRow32_asm + +S32A_Opaque_BlitRow32_asm: + +#if __ARM_ARCH__ == 7 || defined(__ARM_NEON__) + + push {r4-r11} + cmp r2,#24 + blt .Lless_than_24 + + vpush {Q4-Q7} + + vmov.i16 q14,#0xff //;Q4.16 = 255 +//prefix + vld4.8 {d0, d1, d2, d3}, [r1]! //d0,d1,d2,d3 = sourc rgb(0,1,2,3) A(0,1,2,3) + //update source ptr but not dst ptr + vld4.8 {d4, d5, d6, d7}, [r0] //d4,d5,d6,d7 = dst rgb(0,1,2,3) A(0,1,2,3) + add r3, r0, #32 // minus 16 to pretend the last round + mov r5, #64 + sub r2,r2,#8 +.Lloop: + sub r2,r2,#16 + vsubw.u8 q4,q14,d3 //Q4.16 = 255-d3 + //update source ptr but not dst ptr + + //It has to be 24 since we pre-load 8 word for the next rounds + cmp r2,#16 + + vsra.u16 q4,q4,#7 //Q4 = Q4.16+Q4 >> 7 ; A(0,1,2,3) + + vmovl.u8 q6,d4 //Q6 = vmovl.u8 d4 + vmovl.u8 q7,d5 //Q7 = vmovl.u8 d5 + vmovl.u8 q8,d6 //Q8 = vmovl.u8 d6 + vmovl.u8 q9,d7 //Q9 = vmovl.u8 d7 + + + vmul.i16 q6,q6,q4 //Q6 = Q6 * Q4 + vmul.i16 q7,q7,q4 //Q7 = Q7 * Q4 + + vld4.8 {d20, d21, d22, d23}, [r1]! 
//d0,d1,d2,d3 = sourc rgb(0,1,2,3) A(0,1,2,3) + + vmul.i16 q8,q8,q4 //Q8 = Q8 * Q4 + vmul.i16 q9,q9,q4 //Q9 = Q9 * Q4 + + vld4.8 {d24, d25, d26, d27}, [r3] //d4,d5,d6,d7 = dst rgb(0,1,2,3) A(0,1,2,3) + + vshrn.i16 d4,q6,#8 //d4 = Q6.16 shrn 8 + vshrn.i16 d5,q7,#8 //d5 = Q7.16 shrn 8 + vshrn.i16 d6,q8,#8 //d6 = Q8.16 shrn 8 + vshrn.i16 d7,q9,#8 //d7 = Q9.16 shrn 8 + + vadd.i8 d4,d4,d0 //d4 = d4+d0 + vadd.i8 d5,d5,d1 //d5 = d5+d1 + vadd.i8 d6,d6,d2 //d6 = d6+d2 + vadd.i8 d7,d7,d3 //d7 = d7+d3 + + vst4.8 {d4, d5, d6, d7}, [r0], r5 //dst rgb(0,1,2,3) A(0,1,2,3) = d4,d5,d6,d7 + //add r0, r0, r5 + + //The next 4 words +// vld4.8 {d20, d21, d22, d23}, [r1]! ;d0,d1,d2,d3 = sourc rgb(0,1,2,3) A(0,1,2,3) +// ;update source ptr but not dst ptr +// vld4.8 {d24, d25, d26, d27}, [r0] ;d4,d5,d6,d7 = dst rgb(0,1,2,3) A(0,1,2,3) + + //update source ptr but not dst ptr + vsubW.u8 q4,q14,d23 //Q4.16 = 255-d3 + + vsra.u16 q4,q4,#7 //Q4 = Q4.16+Q4 >> 7 ; A(0,1,2,3) + + vmovl.u8 q6,d24 //Q6 = vmovl.u8 d4 + vmovl.u8 q7,d25 //Q7 = vmovl.u8 d5 + vmovl.u8 q8,d26 //Q8 = vmovl.u8 d6 + vmovl.u8 q9,d27 //Q9 = vmovl.u8 d7 + + vmul.i16 q6,q6,q4 //Q6 = Q6 * Q4 + vmul.i16 q7,q7,q4 //Q7 = Q7 * Q4 + + vld4.8 {d0, d1, d2, d3}, [r1]! 
//d0,d1,d2,d3 = sourc rgb(0,1,2,3) A(0,1,2,3) + + vmul.i16 q8,q8,q4 //Q8 = Q8 * Q4 + vmul.i16 q9,q9,q4 //Q9 = Q9 * Q4 + + vld4.8 {d4, d5, d6, d7}, [r0] //d4,d5,d6,d7 = dst rgb(0,1,2,3) A(0,1,2,3) + vshrn.i16 d24,q6,#8 //d4 = Q6.16 shrn 8 + vshrn.i16 d25,q7,#8 //d5 = Q7.16 shrn 8 + vshrn.i16 d26,q8,#8 //d6 = Q8.16 shrn 8 + vshrn.i16 d27,q9,#8 //d7 = Q9.16 shrn 8 + + vadd.i8 d24,d24,d20 //d4 = d4+d0 + vadd.i8 d25,d25,d21 //d5 = d5+d1 + vadd.i8 d26,d26,d22 //d6 = d6+d2 + vadd.i8 d27,d27,d23 //d7 = d7+d3 + + vst4.8 {d24, d25, d26, d27}, [r3], r5 //dst rgb(0,1,2,3) A(0,1,2,3) = d4,d5,d6,d7 + //add r3, r3, r5 + + bge .Lloop + +//postfix: +//There are 8 words left unprocessed from previous round + vmov.i16 q4,#0xff //Q4.16 = 255 + vsubw.u8 q4,q4,d3 //Q4.16 = 255-d3 + + cmp r2,#8 + + vshr.u16 q5,q4,#7 //Q5.16 = Q4 >> 7 + vadd.i16 q4,q4,q5 //Q4 = Q4.16+Q5.16 ; A(0,1,2,3) + + vmovl.u8 q6,d4 //Q6 = vmovl.u8 d4 + vmovl.u8 q7,d5 //Q7 = vmovl.u8 d5 + vmovl.u8 q8,d6 //Q8 = vmovl.u8 d6 + vmovl.u8 q9,d7 //Q9 = vmovl.u8 d7 + + vmul.i16 q6,q6,q4 //Q6 = Q6 * Q4 + vmul.i16 q7,q7,q4 //Q7 = Q7 * Q4 + vmul.i16 q8,q8,q4 //Q8 = Q8 * Q4 + vmul.i16 q9,q9,q4 //Q9 = Q9 * Q4 + + vshrn.i16 d4,q6,#8 //d4 = Q6.16 shrn 8 + vshrn.i16 d5,q7,#8 //d5 = Q7.16 shrn 8 + vshrn.i16 d6,q8,#8 //d6 = Q8.16 shrn 8 + vshrn.i16 d7,q9,#8 //d7 = Q9.16 shrn 8 + + vadd.i8 d4,d4,d0 //d4 = d4+d0 + vadd.i8 d5,d5,d1 //d5 = d5+d1 + vadd.i8 d6,d6,d2 //d6 = d6+d2 + vadd.i8 d7,d7,d3 //d7 = d7+d3 + + vst4.8 {d4, d5, d6, d7}, [r0]! //dst rgb(0,1,2,3) A(0,1,2,3) = d4,d5,d6,d7 + +.Lless_than_16: + cmp r2,#8 + blt .Lless_than_8 + + sub r2,r2,#8 + + vld4.8 {d0, d1, d2, d3}, [r1]! 
//d0,d1,d2,d3 = sourc rgb(0,1,2,3) A(0,1,2,3) + //update source ptr but not dst ptr + vld4.8 {d4, d5, d6, d7}, [r0] //d4,d5,d6,d7 = dst rgb(0,1,2,3) A(0,1,2,3) + + vmov.i16 q4,#0xff //Q4.16 = 255 + vsubw.u8 q4,q4,d3 //Q4.16 = 255-d3 + + cmp r2,#8 + + vshr.u16 q5,q4,#7 //Q5.16 = Q4 >> 7 + vadd.i16 q4,q4,q5 //Q4 = Q4.16+Q5.16 ; A(0,1,2,3) + + vmovl.u8 q6,d4 //Q6 = vmovl.u8 d4 + vmovl.u8 q7,d5 //Q7 = vmovl.u8 d5 + vmovl.u8 q8,d6 //Q8 = vmovl.u8 d6 + vmovl.u8 q9,d7 //Q9 = vmovl.u8 d7 + + vmul.i16 q6,q6,q4 //Q6 = Q6 * Q4 + vmul.i16 q7,q7,q4 //Q7 = Q7 * Q4 + vmul.i16 q8,q8,q4 //Q8 = Q8 * Q4 + vmul.i16 q9,q9,q4 //Q9 = Q9 * Q4 + + vshrn.i16 d4,q6,#8 //d4 = Q6.16 shrn 8 + vshrn.i16 d5,q7,#8 //d5 = Q7.16 shrn 8 + vshrn.i16 d6,q8,#8 //d6 = Q8.16 shrn 8 + vshrn.i16 d7,q9,#8 //d7 = Q9.16 shrn 8 + + vadd.i8 d4,d4,d0 //d4 = d4+d0 + vadd.i8 d5,d5,d1 //d5 = d5+d1 + vadd.i8 d6,d6,d2 //d6 = d6+d2 + vadd.i8 d7,d7,d3 //d7 = d7+d3 + + vst4.8 {d4, d5, d6, d7}, [r0]! //dst rgb(0,1,2,3) A(0,1,2,3) = d4,d5,d6,d7 + + //It will be guaranteed to be less than 8 + //bge loop +.Lless_than_8: + vpop {Q4-Q7} + +.Lless_than_4: + subs r4,r2,#1 + bmi .Lto_exit // S32A_Opaque_BlitRow32_neon + 268 + mov r8,#0xff + mvn r10,#0xff00 + orr r9,r8,r8,lsl #16 + lsl r11,r9,#8 +.Lresidual_loop: + ldr r3,[r1,#0] + ldr r12,[r0,#0] + add r1,r1,#4 + sub r2,r8,r3,lsr #24 + and r5,r12,r9 + cmp r2,r2 + add r2,r2,#1 + and r12,r10,r12,lsr #8 + strne r6,[r7,#0xeef] + mul r5,r5,r2 + mul r2,r12,r2 + strne r6,[r7,#0xeef] + subs r4,r4,#1 + and r12,r9,r5,lsr #8 + and r2,r2,r11 + orr r2,r2,r12 + add r2,r2,r3 + str r2,[r0],#4 + bpl .Lresidual_loop // S32A_Opaque_BlitRow32_neon + 192 + +.Lto_exit: + pop {r4-r11} + bx lr + +.Lless_than_24: + cmp r2,#8 + blt .Lless_than_4 + +.Lloop_8: + sub r2,r2,#8 + // We already read the 8 words from the previous pipe line + vld4.8 {d0, d1, d2, d3}, [r1]! 
//d0,d1,d2,d3 = sourc rgb(0,1,2,3) A(0,1,2,3) + //update source ptr but not dst ptr + vld4.8 {d4, d5, d6, d7}, [r0] //d4,d5,d6,d7 = dst rgb(0,1,2,3) A(0,1,2,3) + + vmov.i16 q10,#0xff //Q4.16 = 255 + vsubW.u8 q10,q10,d3 //Q4.16 = 255-d3 + + cmp r2,#8 + + vshr.u16 q11,q10,#7 //Q5.16 = Q4 >> 7 + vadd.i16 q10,q10,q11 //Q4 = Q4.16+Q5.16 ; A(0,1,2,3) + + vmovl.u8 q12,d4 //Q6 = vmovl.u8 d4 + vmovl.u8 q13,d5 //Q7 = vmovl.u8 d5 + vmovl.u8 q8,d6 //Q8 = vmovl.u8 d6 + vmovl.u8 q9,d7 //Q9 = vmovl.u8 d7 + + vmul.i16 q12,q12,q10 //Q6 = Q6 * Q4 + vmul.i16 q13,q13,q10 //Q7 = Q7 * Q4 + vmul.i16 q8,q8,q10 //Q8 = Q8 * Q4 + vmul.i16 q9,q9,q10 //Q9 = Q9 * Q4 + + vshrn.i16 d4,q12,#8 //d4 = Q6.16 shrn 8 + vshrn.i16 d5,q13,#8 //d5 = Q7.16 shrn 8 + vshrn.i16 d6,q8,#8 //d6 = Q8.16 shrn 8 + vshrn.i16 d7,q9,#8 //d7 = Q9.16 shrn 8 + + vadd.i8 d4,d4,d0 //d4 = d4+d0 + vadd.i8 d5,d5,d1 //d5 = d5+d1 + vadd.i8 d6,d6,d2 //d6 = d6+d2 + vadd.i8 d7,d7,d3 //d7 = d7+d3 + + vst4.8 {d4, d5, d6, d7}, [r0]! //dst rgb(0,1,2,3) A(0,1,2,3) = d4,d5,d6,d7 + + bge .Lloop_8 + b .Lless_than_4 + +#else + +/* + * r0 - dst + * r1 - src + * r2 - count + */ + push {r4-r11} + mov r9, #0xFF + orr r10, r9, r9, lsl #16 + mvn r11, r10 + +.Lblitrow32_loop: + ldr r3, [r0] + ldr r4, [r1], #4 + + cmp r3, #0 + streq r4, [r0], #4 + beq .Lblitrow32_loop_cond + + // r5 <- (255-alpha)+1 + sub r5, r9, r4, lsr #24 + and r6, r3, r10 + add r5, r5, #1 + and r7, r10, r3, lsr #8 + + mul r8, r6, r5 + lsr r6, r8, #8 + mul r8, r7, r5 + + // combine rb and ag + and r6, r6, r10 + and r7, r8, r11 + orr r6, r6, r7 + + // add src to combined value + add r6, r6, r4 + str r6, [r0], #4 + +.Lblitrow32_loop_cond: + subs r2, r2, #1 + bhi .Lblitrow32_loop + pop {r4-r11} + bx lr + +#endif + +.endfunc +.size S32A_Opaque_BlitRow32_asm, .-S32A_Opaque_BlitRow32_asm diff --git a/src/core/asm/S32_Opaque_D32_nofilter_DX_gether.S b/src/core/asm/S32_Opaque_D32_nofilter_DX_gether.S new file mode 100644 index 0000000000..3467432826 --- /dev/null +++ 
b/src/core/asm/S32_Opaque_D32_nofilter_DX_gether.S @@ -0,0 +1,85 @@ +/* + * Copyright (c) 2005-2008, The Android Open Source Project + * Copyright (c) 2010, Code Aurora Forum. All rights reserved. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + + .text + .global S32_Opaque_D32_nofilter_DX_gether + .func S32_Opaque_D32_nofilter_DX_gether +S32_Opaque_D32_nofilter_DX_gether: + push {r0-r11,lr} + asr r0,r2,#3 + sub sp,sp,#4 //23 + cmp r0,#0 + str r0,[sp,#0] //r0 = count >> 3 + ble .L1_140 + ldr r4,[sp,#4] //r4 = r0 (dst) + mov r0,r3 + add r12,r3,#4 + asr r8,r2,#3 +.L1_52: + ldm r3!, {r0,r6,r9,r11} + lsr r5,r0,#16 //30 + ldr r5,[r1,r5,lsl #2] //30 + lsr r7,r6,#16 //32 + ldr r7,[r1,r7,lsl #2] //31 + uxth r0,r0 //34 + ldr r0,[r1,r0,lsl #2] //34 + uxth r6,r6 //31 + ldr r6,[r1,r6,lsl #2] //32 + //stm r4!, {r0,r5,r6,r7} ;35 + lsr r10,r9,#16 //30 + ldr r10,[r1,r10,lsl #2] //30 + lsr lr,r11,#16 //32 + ldr lr,[r1,lr,lsl #2] //31 + uxth r9,r9 //34 + ldr r9,[r1,r9,lsl #2] //34 + uxth r11,r11 //31 + ldr r11,[r1,r11,lsl #2] //32 + subs r8,r8,#1 + stm r4!, {r0,r5,r6,r7,r9,r10,r11,lr} //35 + + bne .L1_52 + + ldr r0,[sp,#0] // count >> 3 + mov r12,r0 + ldr r0,[sp,#4] //r0 = dst + add r0,r0,r12,lsl #5 //dst += count >>3 << 5 + str r0,[sp,#4] //save r0 into stack again +.L1_140: +//;;39 const uint16_t* SK_RESTRICT xx = (const uint16_t*)(xy); +//;;40 for (i = (count & 7); i > 0; --i) { + tst r2,#7 + beq .L1_184 + ldr r0,[sp,#4] //r0 = currnt dst + and r2,r2,#7 +.L1_156: 
+//;;41 //SkASSERT(*xx < (unsigned)s.fBitmap->width()); +//;;42 src = srcAddr[*xx++]; *colors++ = RETURNDST(src); + ldrh r4,[r3],#2 + add r12,r0,#4 +//;;43 } + subs r2,r2,#1 + ldr r4,[r1,r4,lsl #2] //42 + str r4,[r0,#0] //42 + mov r0,r12 //42 + bne .L1_156 +.L1_184: +//;;44 } + add sp,sp,#0x14 + pop {r4-r11,pc} + +.endfunc +.size S32_Opaque_D32_nofilter_DX_gether, .-S32_Opaque_D32_nofilter_DX_gether diff --git a/src/core/asm/memset16_neon.S b/src/core/asm/memset16_neon.S new file mode 100644 index 0000000000..0f04b90bbc --- /dev/null +++ b/src/core/asm/memset16_neon.S @@ -0,0 +1,158 @@ +/* Copyright (c) 2009, Code Aurora Forum. All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are met: + * * Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * * Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * * Neither the name of Code Aurora nor + * the names of its contributors may be used to endorse or promote + * products derived from this software without specific prior written + * permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" + * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NON-INFRINGEMENT ARE DISCLAIMED. 
IN NO EVENT SHALL THE COPYRIGHT OWNER OR + * CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, + * EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, + * PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; + * OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, + * WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR + * OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF + * ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + * + */ + +/*************************************************************************** + Neon memset: Attempts to do a memset with Neon registers if possible, + Inputs: + s: The buffer to write to + c: The integer data to write to the buffer + n: The size_t count. + Outputs: + +***************************************************************************/ + + .code 32 + .align 4 + .globl memset16_neon + .func + +memset16_neon: + cmp r2, #0 + bxeq lr + + push {r0} + + /* If we have < 8 bytes, just do a quick loop to handle that */ + cmp r2, #8 + bgt memset_gt4 +memset_smallcopy_loop: + strh r1, [r0], #2 + subs r2, r2, #2 + bne memset_smallcopy_loop +memset_smallcopy_done: + pop {r0} + bx lr + +memset_gt4: + /* + * Duplicate the r1 lowest 16-bits across r1. The idea is to have + * a register with two 16-bit-values we can copy. We do this by + * duplicating lowest 16-bits of r1 to upper 16-bits. + */ + orr r1, r1, r1, lsl #16 + /* + * If we're copying > 64 bytes, then we may want to get + * onto a 16-byte boundary to improve speed even more. + */ + cmp r2, #64 + blt memset_route + ands r12, r0, #0xf + beq memset_route + /* + * Determine the number of bytes to move forward to get to the 16-byte + * boundary. Note that this will be a multiple of 4, since we + * already are word-aligned. 
+ */ + rsb r12, r12, #16 + sub r2, r2, r12 + lsls r12, r12, #29 + strmi r1, [r0], #4 + strcs r1, [r0], #4 + strcs r1, [r0], #4 + lsls r12, r12, #2 + strcsh r1, [r0], #2 +memset_route: + /* + * Decide where to route for the maximum copy sizes. Note that we + * build q0 and q1 depending on if we'll need it, so that's + * interwoven here as well. + */ + vdup.u32 d0, r1 + cmp r2, #16 + blt memset_8 + vmov d1, d0 + cmp r2, #64 + blt memset_16 + vmov q1, q0 + cmp r2, #128 + blt memset_32 +memset_128: + mov r12, r2, lsr #7 +memset_128_loop: + vst1.64 {q0, q1}, [r0]! + vst1.64 {q0, q1}, [r0]! + vst1.64 {q0, q1}, [r0]! + vst1.64 {q0, q1}, [r0]! + subs r12, r12, #1 + bne memset_128_loop + ands r2, r2, #0x7f + beq memset_end +memset_32: + movs r12, r2, lsr #5 + beq memset_16 +memset_32_loop: + subs r12, r12, #1 + vst1.64 {q0, q1}, [r0]! + bne memset_32_loop + ands r2, r2, #0x1f + beq memset_end +memset_16: + movs r12, r2, lsr #4 + beq memset_8 +memset_16_loop: + subs r12, r12, #1 + vst1.32 {q0}, [r0]! + bne memset_16_loop + ands r2, r2, #0xf + beq memset_end + /* + * memset_8 isn't a loop, since we try to do our loops at 16 + * bytes and above. We should loop there, then drop down here + * to finish the <16-byte versions. Same for memset_4 and + * memset_1. + */ +memset_8: + cmp r2, #8 + blt memset_4 + subs r2, r2, #8 + vst1.32 {d0}, [r0]! +memset_4: + cmp r2, #4 + blt memset_2 + subs r2, r2, #4 + str r1, [r0], #4 +memset_2: + cmp r2, #0 + ble memset_end + strh r1, [r0], #2 +memset_end: + pop {r0} + bx lr + + .endfunc + .end diff --git a/src/core/asm/memset32_neon.S b/src/core/asm/memset32_neon.S new file mode 100644 index 0000000000..b611357b75 --- /dev/null +++ b/src/core/asm/memset32_neon.S @@ -0,0 +1,146 @@ +/* Copyright (c) 2009, Code Aurora Forum. All rights reserved. 
+ * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are met: + * * Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * * Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * * Neither the name of Code Aurora nor + * the names of its contributors may be used to endorse or promote + * products derived from this software without specific prior written + * permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" + * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NON-INFRINGEMENT ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR + * CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, + * EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, + * PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; + * OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, + * WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR + * OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF + * ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + * + */ + +/*************************************************************************** + Neon memset: Attempts to do a memset with Neon registers if possible, + Inputs: + s: The buffer to write to + c: The integer data to write to the buffer + n: The size_t count. 
+ Outputs: + +***************************************************************************/ + + .code 32 + .align 4 + .globl memset32_neon + .func + +memset32_neon: + cmp r2, #0 + bxeq lr + + push {r0} + + /* If we have < 8 bytes, just do a quick loop to handle that */ + cmp r2, #8 + bgt memset_gt4 +memset_smallcopy_loop: + str r1, [r0], #4 + subs r2, r2, #4 + bne memset_smallcopy_loop +memset_smallcopy_done: + pop {r0} + bx lr + +memset_gt4: + /* + * If we're copying > 64 bytes, then we may want to get + * onto a 16-byte boundary to improve speed even more. + */ + cmp r2, #64 + blt memset_route + ands r12, r0, #0xf + beq memset_route + /* + * Determine the number of bytes to move forward to get to the 16-byte + * boundary. Note that this will be a multiple of 4, since we + * already are word-aligned. + */ + rsb r12, r12, #16 + sub r2, r2, r12 + lsls r12, r12, #29 + strmi r1, [r0], #4 + strcs r1, [r0], #4 + strcs r1, [r0], #4 +memset_route: + /* + * Decide where to route for the maximum copy sizes. Note that we + * build q0 and q1 depending on if we'll need it, so that's + * interwoven here as well. + */ + vdup.u32 d0, r1 + cmp r2, #16 + blt memset_8 + vmov d1, d0 + cmp r2, #64 + blt memset_16 + vmov q1, q0 + cmp r2, #128 + blt memset_32 +memset_128: + mov r12, r2, lsr #7 +memset_128_loop: + vst1.64 {q0, q1}, [r0]! + vst1.64 {q0, q1}, [r0]! + vst1.64 {q0, q1}, [r0]! + vst1.64 {q0, q1}, [r0]! + subs r12, r12, #1 + bne memset_128_loop + ands r2, r2, #0x7f + beq memset_end +memset_32: + movs r12, r2, lsr #5 + beq memset_16 +memset_32_loop: + subs r12, r12, #1 + vst1.64 {q0, q1}, [r0]! + bne memset_32_loop + ands r2, r2, #0x1f + beq memset_end +memset_16: + movs r12, r2, lsr #4 + beq memset_8 +memset_16_loop: + subs r12, r12, #1 + vst1.32 {q0}, [r0]! + bne memset_16_loop + ands r2, r2, #0xf + beq memset_end + /* + * memset_8 isn't a loop, since we try to do our loops at 16 + * bytes and above. 
We should loop there, then drop down here + * to finish the <16-byte versions. Same for memset_4 and + * memset_1. + */ +memset_8: + cmp r2, #8 + blt memset_4 + subs r2, r2, #8 + vst1.32 {d0}, [r0]! +memset_4: + cmp r2, #4 + blt memset_end + subs r2, r2, #4 + str r1, [r0], #4 +memset_end: + pop {r0} + bx lr + + .endfunc + .end diff --git a/src/core/asm/t32cb16blend.S b/src/core/asm/t32cb16blend.S new file mode 100644 index 0000000000..f835dd3271 --- /dev/null +++ b/src/core/asm/t32cb16blend.S @@ -0,0 +1,325 @@ +/* + * Copyright 2006, The Android Open Source Project + * Copyright (c) 2009, Code Aurora Forum. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + + +/* + * This file is derived from libpixelflinger version of BLIT routine. + * Algorithm used for BLIT operation here is equivalent to the one in + * C function, S32A_D565_Opaque. Use neon instructions to process 16 pixels + * at-a-time on armv7. If the number of pixels is less than 16 and/or the + * architecture is armv6 and below, use regular arm instructions. Regular + * arm code combines two 16-bit writes into one 32-bit write to destination, + * uses destination and source pre-loads, and unrolls the main loop thrice. 
+ */ + .text + .align + + .global scanline_t32cb16blend_arm + +// uses r6, r7, r8, r9, r10, lr + +.macro pixel, DREG, SRC, FB, OFFSET + + // SRC = AABBGGRR + subs r7, r10, \SRC, lsr #24 // sAA = 255 - sAA + beq 1f + +.if \OFFSET + + // red + mov lr, \DREG, lsr #(\OFFSET + 6 + 5) + smlabb lr, r7, lr, r8 + and r6, \SRC, r10 + add lr, lr, lr, lsr #5 + add lr, r6, lr, lsr #5 + lsr lr, #3 + orr \FB, lr, lsl #(\OFFSET + 11) + + // green + and r6, \DREG, #(0x3F<<(\OFFSET + 5)) + lsr r6, #5 + smlabt r6, r7, r6, r9 + and lr, r10, \SRC, lsr #(8) + add r6, r6, r6, lsr #6 + add r6, lr, r6, lsr #6 + lsr r6, #2 + orr \FB, \FB, r6, lsl #(\OFFSET + 5) + + // blue + and lr, \DREG, #(0x1F << \OFFSET) + smlabt lr, r7, lr, r8 + and r6, r10, \SRC, lsr #(8+8) + add lr, lr, lr, lsr #5 + add lr, r6, lr, lsr #5 + lsr lr, #3 + orr \FB, \FB, lr, lsl #\OFFSET + +.else + + // red + mov lr, \DREG, lsr #(6+5) + and lr, lr, #0x1F + smlabb lr, r7, lr, r8 + and r6, \SRC, r10 + add lr, lr, lr, lsr #5 + add lr, r6, lr, lsr #5 + lsr lr, #3 + mov \FB, lr, lsl #11 + + // green + and r6, \DREG, #(0x3F<<5) + lsr r6, #5 + smlabb r6, r7, r6, r9 + and lr, r10, \SRC, lsr #(8) + add r6, r6, r6, lsr #6 + add r6, lr, r6, lsr #6 + lsr r6, #2 + orr \FB, \FB, r6, lsl #5 + + // blue + and lr, \DREG, #0x1F + smlabb lr, r7, lr, r8 + and r6, r10, \SRC, lsr #(8+8) + add lr, lr, lr, lsr #5 + add lr, r6, lr, lsr #5 + orr \FB, \FB, lr, lsr #3 + +.endif + b 2f + + /* + * When alpha = 255, down scale the source RGB pixel (24 bits) + * to 16 bits(RGB565) + */ +1: + lsl r6, \SRC, #8 + lsr lr, \SRC, #5 + and r7, r6, #0xf800 + and lr, lr, #0x7e0 + orr lr, lr, r7 + +.if \OFFSET + orr lr, lr, r6, lsr #27 + orr \FB, \FB, lr, lsl #(\OFFSET) +.else + orr \FB, lr, r6, lsr #27 +.endif + +2: +.endm + + +// r0: dst ptr +// r1: src ptr +// r2: count +// r3: d +// r4: s0 +// r5: s1 +// r6: pixel +// r7: pixel +// r8: 0x10 +// r9: 0x20 +// r10: 0xFF +// r11: free +// r12: scratch +// r14: free + +scanline_t32cb16blend_arm: + stmfd sp!, 
{r4-r10, lr} + +#if __ARM_ARCH__ == 7 || defined(__ARM_NEON__) + subs r2, r2, #16 + + blo blit_less_than_16_left + + vmov.u16 q12, #0x80 + vmov.u8 q13, #0xf8 + +blit_neon_loop: + /* + * Load 64 bytes from source and 32 bytes from destination + * note that source pixels are 4 bytes wide and + * destination pixels are 2 bytes wide. + */ + vld4.8 {d2, d4, d6, d8}, [r1]! + vld4.8 {d3, d5, d7, d9}, [r1]! + + vand.8 d10, d8, d9 + vmov r3, r4, d10 + + cmp r3, #0xffffffff + cmpeq r4, #0xffffffff + bne blit_alpha_not_255 + + // alpha equals 255 case + + vshl.u8 q0, q2, #3 + + subs r2, r2, #16 + + vsri.u8 q1, q2, #5 + vsri.u8 q0, q3, #3 + + // store the rgb destination values back to memory + vst2.8 {d0, d2}, [r0]! + vst2.8 {d1, d3}, [r0]! + + blo blit_less_than_16_left + b blit_neon_loop + +blit_alpha_not_255: + // alpha = 255 - alpha + vmvn.u8 q0, q4 + + vld2.8 {q5, q6}, [r0] + + vshl.u8 q7, q6, #3 + + subs r2, r2, #16 + + vand.u8 q6, q6, q13 + + vmov.16 q8, q12 + vmov.16 q9, q12 + + vsri.u8 q7, q5, #5 + vshl.u8 q5, q5, #3 + + vmlal.u8 q8, d0, d12 + vmlal.u8 q9, d1, d13 + + vshl.u8 q7, q7, #2 + + vshr.u16 q10, q8, #5 + vshr.u16 q11, q9, #5 + vaddhn.u16 d12, q8, q10 + vaddhn.u16 d13, q9, q11 + + vmov.16 q8, q12 + vmov.16 q9, q12 + vmlal.u8 q8, d0, d14 + vmlal.u8 q9, d1, d15 + + vqadd.u8 q6, q6, q1 + + vshr.u16 q10, q8, #6 + vshr.u16 q11, q9, #6 + vaddhn.u16 d14, q8, q10 + vaddhn.u16 d15, q9, q11 + + vmov.16 q8, q12 + vmov.16 q9, q12 + vmlal.u8 q8, d0, d10 + vmlal.u8 q9, d1, d11 + + vqadd.u8 q7, q7, q2 + + vshl.u8 q5, q7, #3 + + vshr.u16 q10, q8, #5 + vshr.u16 q11, q9, #5 + + vsri.u8 q6, q7, #5 + + vaddhn.u16 d16, q8, q10 + vaddhn.u16 d17, q9, q11 + vqadd.u8 q8, q8, q3 + + vsri.u8 q5, q8, #3 + + // store the rgb destination values back to memory + vst2.8 {d10, d12}, [r0]! + vst2.8 {d11, d13}, [r0]! 
+ + blo blit_less_than_16_left + b blit_neon_loop +#endif + +blit_less_than_16_left: + pld [r1] + + mov r8, #0x10 + mov r9, #0x20 + mov r10, #0xFF + +#if __ARM_ARCH__ == 7 || defined(__ARM_NEON__) + adds r2, r2, #14 +#else + subs r2, r2, #2 +#endif + + pld [r0] + blo 9f + + // The main loop is unrolled thrice and process 6 pixels +8: ldmia r1!, {r4, r5} + // stream the source + pld [r1, #32] + add r0, r0, #4 + // it's all zero, skip this pixel + orrs r3, r4, r5 + beq 7f + + // load the destination + ldr r3, [r0, #-4] + // stream the destination + pld [r0, #32] + pixel r3, r4, r12, 0 + pixel r3, r5, r12, 16 + // effectively, we're getting write-combining by virtue of the + // cpu's write-back cache. + str r12, [r0, #-4] + + // 2nd iteration of the loop, don't stream anything + subs r2, r2, #2 + blt 9f + ldmia r1!, {r4, r5} + add r0, r0, #4 + orrs r3, r4, r5 + beq 7f + ldr r3, [r0, #-4] + pixel r3, r4, r12, 0 + pixel r3, r5, r12, 16 + str r12, [r0, #-4] + + // 3rd iteration of the loop, don't stream anything + subs r2, r2, #2 + blt 9f + ldmia r1!, {r4, r5} + add r0, r0, #4 + orrs r3, r4, r5 + beq 7f + ldr r3, [r0, #-4] + pixel r3, r4, r12, 0 + pixel r3, r5, r12, 16 + str r12, [r0, #-4] + +7: subs r2, r2, #2 + blo 9f + b 8b + +9: adds r2, r2, #1 + ldmlofd sp!, {r4-r10, lr} // return + bxlo lr + + // last pixel left + ldr r4, [r1], #4 + ldrh r3, [r0] + pixel r3, r4, r12, 0 + strh r12, [r0], #2 + ldmfd sp!, {r4-r10, lr} // return + bx lr diff --git a/src/core/asm/xfer.S b/src/core/asm/xfer.S new file mode 100644 index 0000000000..96d587333b --- /dev/null +++ b/src/core/asm/xfer.S @@ -0,0 +1,136 @@ +/* + * Copyright 2006, The Android Open Source Project + * Copyright (c) 2009, Code Aurora Forum. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. 
+ * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + + + .text + .align + + .global xfer16_arm + +.macro pixel, DREG, SRC, FB, OFFSET + lsl r6, \SRC, #8 + lsr r8, \SRC, #5 + and r7, r6, #0xf800 + and r8, r8, #0x7e0 + orr r8, r8, r7 + +.if \OFFSET + orr r8, r8, r6, lsr #27 + orr \FB, \FB, r8, lsl #(\OFFSET) +.else + orr \FB, r8, r6, lsr #27 +.endif + +.endm + +xfer16_arm: + stmfd sp!, {r4-r8} + +#if __ARM_ARCH__ == 7 || defined(__ARM_NEON__) + subs r2, r2, #16 + + blo xfer16_less_than_16_left + + vmov.u16 q12, #0x80 + //pld [r1] + //pld [r1, #32] + +xfer16_neon_loop: + // load 64 bytes from source and 32 bytes from destination + // note that source pixels are 4 bytes wide and + // destination pixels are 2 bytes wide + vld4.8 {d2, d4, d6, d8}, [r1]! + vld4.8 {d3, d5, d7, d9}, [r1]! + + vshl.u8 q0, q2, #3 + + subs r2, r2, #16 + + vsri.u8 q1, q2, #5 + vsri.u8 q0, q3, #3 + + // store the rgb destination values back to memory + vst2.8 {d0, d2}, [r0]! + vst2.8 {d1, d3}, [r0]! + + blo xfer16_less_than_16_left + b xfer16_neon_loop +#endif + +xfer16_less_than_16_left: + pld [r1] + +#if __ARM_ARCH__ == 7 || defined(__ARM_NEON__) + adds r2, r2, #14 +#else + subs r2, r2, #2 +#endif + + pld [r0] + blo 9f + + // The main loop is unrolled thrice and process 6 pixels +8: ldmia r1!, {r4, r5} + // stream the source + pld [r1, #32] + add r0, r0, #4 + + // load the destination + ldr r3, [r0, #-4] + // stream the destination + pld [r0, #32] + pixel r3, r4, r12, 0 + pixel r3, r5, r12, 16 + // effectively, we're getting write-combining by virtue of the + // cpu's write-back cache. 
+ str r12, [r0, #-4] + + // 2nd iteration of the loop, don't stream anything + subs r2, r2, #2 + blt 9f + ldmia r1!, {r4, r5} + add r0, r0, #4 + ldr r3, [r0, #-4] + pixel r3, r4, r12, 0 + pixel r3, r5, r12, 16 + str r12, [r0, #-4] + + // 3rd iteration of the loop, don't stream anything + subs r2, r2, #2 + blt 9f + ldmia r1!, {r4, r5} + add r0, r0, #4 + ldr r3, [r0, #-4] + pixel r3, r4, r12, 0 + pixel r3, r5, r12, 16 + str r12, [r0, #-4] + +7: subs r2, r2, #2 + blo 9f + b 8b + +9: adds r2, r2, #1 + ldmlofd sp!, {r4-r8} // return + bxlo lr + + // last pixel left + ldr r4, [r1], #4 + ldrh r3, [r0] + pixel r3, r4, r12, 0 + strh r12, [r0], #2 + ldmfd sp!, {r4-r8} // return + bx lr diff --git a/src/images/SkImageDecoder_libjpeg.cpp b/src/images/SkImageDecoder_libjpeg.cpp index 12fe76ab3b..279c8ab9c1 100644 --- a/src/images/SkImageDecoder_libjpeg.cpp +++ b/src/images/SkImageDecoder_libjpeg.cpp @@ -397,30 +397,40 @@ bool SkJPEGImageDecoder::onDecode(SkStream* stream, SkBitmap* bm, /* image_width and image_height are the original dimensions, available after jpeg_read_header(). To see the scaled dimensions, we have to call - jpeg_start_decompress(), and then read output_width and output_height. + jpeg_calc_output_dimensions(), and then read output_width and output_height. */ + jpeg_calc_output_dimensions(&cinfo); + + /* We have enough information to return + to the caller if they just wanted (subsampled bounds). If sampleSize + was 1, then we would have already returned. Thus we just check if + we're in kDecodeBounds_Mode, and that we have valid output sizes. 
+ */ + if (SkImageDecoder::kDecodeBounds_Mode == mode && + valid_output_dimensions(cinfo)) { + SkScaledBitmapSampler smpl(cinfo.output_width, cinfo.output_height, + recompute_sampleSize(sampleSize, cinfo)); + bm->setConfig(config, smpl.scaledWidth(), smpl.scaledHeight()); + bm->setIsOpaque(true); + return true; + } + + sampleSize = recompute_sampleSize(sampleSize, cinfo); + +#ifdef ANDROID_RGB + if ((sampleSize != 1) && (cinfo.out_color_space == JCS_RGB_565)) { + /* Requires SkScaledBitmapSampler, but since + SkScaledBitmapSampler can't handle RGB_565 yet, + don't even try. + Revert back to the default format JCS_RGB. + */ + cinfo.out_color_space = JCS_RGB; + } +#endif + if (!jpeg_start_decompress(&cinfo)) { - /* If we failed here, we may still have enough information to return - to the caller if they just wanted (subsampled bounds). If sampleSize - was 1, then we would have already returned. Thus we just check if - we're in kDecodeBounds_Mode, and that we have valid output sizes. - - One reason to fail here is that we have insufficient stream data - to complete the setup. However, output dimensions seem to get - computed very early, which is why this special check can pay off. - */ - if (SkImageDecoder::kDecodeBounds_Mode == mode && - valid_output_dimensions(cinfo)) { - SkScaledBitmapSampler smpl(cinfo.output_width, cinfo.output_height, - recompute_sampleSize(sampleSize, cinfo)); - bm->setConfig(config, smpl.scaledWidth(), smpl.scaledHeight()); - bm->setIsOpaque(true); - return true; - } else { - return return_false(cinfo, *bm, "start_decompress"); - } + return return_false(cinfo, *bm, "start_decompress"); } - sampleSize = recompute_sampleSize(sampleSize, cinfo); // should we allow the Chooser (if present) to pick a config for us??? 
if (!this->chooseFromOneChoice(config, cinfo.output_width, diff --git a/src/opts/SkBlitRow_opts_arm.cpp b/src/opts/SkBlitRow_opts_arm.cpp index 0c38113adf..fb530b42ec 100644 --- a/src/opts/SkBlitRow_opts_arm.cpp +++ b/src/opts/SkBlitRow_opts_arm.cpp @@ -20,6 +20,11 @@ #include "SkColorPriv.h" #include "SkDither.h" +extern "C" void S32A_Opaque_BlitRow32_asm(SkPMColor* SK_RESTRICT dst, + const SkPMColor* SK_RESTRICT src, + int count, + U8CPU alpha); + #if defined(__ARM_HAVE_NEON) && !defined(SK_CPU_BENDIAN) static void S32A_D565_Opaque_neon(uint16_t* SK_RESTRICT dst, const SkPMColor* SK_RESTRICT src, int count, @@ -398,15 +403,25 @@ static void S32_D565_Blend_Dither_neon(uint16_t *dst, const SkPMColor *src, } } -#define S32A_D565_Opaque_PROC S32A_D565_Opaque_neon #define S32A_D565_Blend_PROC S32A_D565_Blend_neon #define S32_D565_Blend_Dither_PROC S32_D565_Blend_Dither_neon #else -#define S32A_D565_Opaque_PROC NULL #define S32A_D565_Blend_PROC NULL #define S32_D565_Blend_Dither_PROC NULL #endif +/* + * Use asm version of BlitRow function. Neon instructions are + * used for armv7 targets. + */ +#define S32A_Opaque_BlitRow32_PROC S32A_Opaque_BlitRow32_asm + +/* + * Use neon version of BLIT assembly code from t32cb16blend.S, where we process + * 16 pixels at-a-time and also optimize for alpha=255 case. + */ +#define S32A_D565_Opaque_PROC NULL + /* Don't have a special version that assumes each src is opaque, but our S32A is still faster than the default, so use it here */ @@ -446,7 +461,7 @@ const SkBlitRow::Proc SkBlitRow::gPlatform_4444_Procs[] = { const SkBlitRow::Proc32 SkBlitRow::gPlatform_Procs32[] = { NULL, // S32_Opaque, NULL, // S32_Blend, - NULL, // S32A_Opaque, + S32A_Opaque_BlitRow32_PROC, NULL, // S32A_Blend, }; |