diff options
author | Xin Qi <xqi@codeaurora.org> | 2013-11-26 11:52:20 -0800 |
---|---|---|
committer | Steve Kondik <steve@cyngn.com> | 2014-12-11 20:42:18 -0800 |
commit | fe5bd66c43c1cff5ea84c1e88e05044ca19b3877 (patch) | |
tree | 1ac5cf28430c403c8e4877cfaf49d49ec3734e68 | |
parent | d9c9fb807b11d3f12e0f687bef40c58ad2ed7ae1 (diff) | |
download | android_external_skia-fe5bd66c43c1cff5ea84c1e88e05044ca19b3877.tar.gz android_external_skia-fe5bd66c43c1cff5ea84c1e88e05044ca19b3877.tar.bz2 android_external_skia-fe5bd66c43c1cff5ea84c1e88e05044ca19b3877.zip |
NEON optimized blitter S32_Opaque_D32_filter_DX
Re-worked for the latest code base, since the -Os compiler option was added.
Override the optimization level at the function level.
Change-Id: I732fea7f02f775a4b06884460dea2887d4108820
-rw-r--r-- | Android.mk | 4 | ||||
-rw-r--r-- | src/core/SkBitmapProcState.cpp | 9 | ||||
-rw-r--r-- | src/core/SkBitmapProcState_procs.h | 18 | ||||
-rw-r--r-- | src/core/SkBitmapProcState_shaderproc.h | 114 | ||||
-rw-r--r-- | src/opts/ext/S32_Opaque_D32_filter_DX_shaderproc_neon.cpp | 476 |
5 files changed, 618 insertions, 3 deletions
diff --git a/Android.mk b/Android.mk index cb19f6e3fd..87daedff0e 100644 --- a/Android.mk +++ b/Android.mk @@ -599,6 +599,7 @@ LOCAL_SRC_FILES_arm += \ src/opts/SkUtils_opts_arm.cpp \ src/opts/SkXfermode_opts_arm.cpp + ifeq ($(ARCH_ARM_HAVE_NEON), true) LOCAL_SRC_FILES_arm += \ src/opts/memset16_neon.S \ @@ -609,7 +610,8 @@ LOCAL_SRC_FILES_arm += \ src/opts/SkBlitRow_opts_arm_neon.cpp \ src/opts/SkBlurImage_opts_neon.cpp \ src/opts/SkMorphology_opts_neon.cpp \ - src/opts/SkXfermode_opts_arm_neon.cpp + src/opts/SkXfermode_opts_arm_neon.cpp \ + src/opts/ext/S32_Opaque_D32_filter_DX_shaderproc_neon.cpp LOCAL_CFLAGS_arm += \ -D__ARM_HAVE_NEON diff --git a/src/core/SkBitmapProcState.cpp b/src/core/SkBitmapProcState.cpp index 3605910309..751c2c1753 100644 --- a/src/core/SkBitmapProcState.cpp +++ b/src/core/SkBitmapProcState.cpp @@ -1,4 +1,3 @@ - /* * Copyright 2011 Google Inc. * @@ -27,6 +26,10 @@ extern void Repeat_S16_D16_filter_DX_shaderproc_neon(const SkBitmapProcState&, extern void SI8_opaque_D32_filter_DX_neon(const SkBitmapProcState&, const uint32_t*, int, SkPMColor*); extern void SI8_opaque_D32_filter_DX_shaderproc_neon(const SkBitmapProcState&, int, int, uint32_t*, int); extern void Clamp_SI8_opaque_D32_filter_DX_shaderproc_neon(const SkBitmapProcState&, int, int, uint32_t*, int); +#if !defined(__LP64__) +extern void S32_opaque_D32_filter_DX_neon(const SkBitmapProcState&, const uint32_t*, int, SkPMColor*); +extern void Clamp_S32_Opaque_D32_filter_DX_shaderproc_neon(const SkBitmapProcState&, int, int, uint32_t*, int); +#endif //#if !defined(__LP64__) #endif #define NAME_WRAP(x) x @@ -626,6 +629,10 @@ bool SkBitmapProcState::chooseProcs(const SkMatrix& inv, const SkPaint& paint) { } } else if (SK_ARM_NEON_WRAP(SI8_opaque_D32_filter_DX) == fSampleProc32 && clampClamp) { fShaderProc32 = SK_ARM_NEON_WRAP(Clamp_SI8_opaque_D32_filter_DX_shaderproc); +#if !defined(__LP64__) + } else if (SK_ARM_NEON_WRAP(S32_opaque_D32_filter_DX) == fSampleProc32 && clampClamp) { + 
fShaderProc32 = SK_ARM_NEON_WRAP(Clamp_S32_Opaque_D32_filter_DX_shaderproc); +#endif //#if !defined(__LP64__) } if (NULL == fShaderProc32) { diff --git a/src/core/SkBitmapProcState_procs.h b/src/core/SkBitmapProcState_procs.h index 0d3b723e67..9a44346c92 100644 --- a/src/core/SkBitmapProcState_procs.h +++ b/src/core/SkBitmapProcState_procs.h @@ -340,4 +340,22 @@ static inline U8CPU Filter_8(unsigned x, unsigned y, #define POSTAMBLE(state) state.fBitmap->getColorTable()->unlockColors() #include "SkBitmapProcState_shaderproc.h" +#if !defined(__LP64__) +#define TILEX_PROCF(fx, max) SkClampMax((fx) >> 16, max) +#define TILEY_PROCF(fy, max) SkClampMax((fy) >> 16, max) +#define TILEX_LOW_BITS(fx, max) (((fx) >> 12) & 0xF) +#define TILEY_LOW_BITS(fy, max) (((fy) >> 12) & 0xF) + +#undef FILTER_PROC +#define FILTER_PROC(x, y, a, b, c, d, dst) NAME_WRAP(Filter_32_opaque)(x, y, a, b, c, d, dst) +#define MAKENAME(suffix) NAME_WRAP(Clamp_S32_Opaque_D32 ## suffix) +#define SRCTYPE uint32_t +#define DSTTYPE uint32_t +#define CHECKSTATE(state) SkASSERT(state.fBitmap->config() == SkBitmap::kARGB_8888_Config) +#define SRC_TO_FILTER(src) src +#define S32_OPAQUE_D32_FILTER_DX_NEON (!SK_ARM_NEON_IS_NONE) +#include "SkBitmapProcState_shaderproc.h" +#undef S32_OPAQUE_D32_FILTER_DX_NEON +#endif //#if !defined(__LP64__) + #undef NAME_WRAP diff --git a/src/core/SkBitmapProcState_shaderproc.h b/src/core/SkBitmapProcState_shaderproc.h index 0014b4a526..877b8cfbd0 100644 --- a/src/core/SkBitmapProcState_shaderproc.h +++ b/src/core/SkBitmapProcState_shaderproc.h @@ -1,4 +1,3 @@ - /* * Copyright 2011 Google Inc. 
* @@ -8,6 +7,13 @@ #include "SkMathPriv.h" +#if S32_OPAQUE_D32_FILTER_DX_NEON +void S32_Opaque_D32_filter_DX_shaderproc_neon(const unsigned int* image0, const unsigned int* image1, + SkFixed fx, unsigned int maxX, unsigned int subY, + unsigned int* colors, + SkFixed dx, int count); +#endif + #define SCALE_FILTER_NAME MAKENAME(_filter_DX_shaderproc) // Can't be static in the general case because some of these implementations @@ -55,6 +61,110 @@ void SCALE_FILTER_NAME(const SkBitmapProcState& s, int x, int y, PREAMBLE(s); #endif +#if S32_OPAQUE_D32_FILTER_DX_NEON + int post_count; + SkFixed post_fx; + DSTTYPE* SK_RESTRICT post_colors; + int num; + post_count = count; + post_fx = fx; + post_colors = colors; + + + if (dx>=0) + { + int end = ((int)maxX-1)<<16; + num = dx?((end-fx)/dx):0; + if (num < 0) num = 0; + + if (num<count) + { + count = num; + post_count = post_count - count; + post_fx = fx + count*dx; + post_colors = post_colors + count; + } + else + post_count = 0; + + while (fx<0 && count) { + unsigned subX = TILEX_LOW_BITS(fx, maxX); + unsigned x0 = TILEX_PROCF(fx, maxX); + unsigned x1 = TILEX_PROCF((fx + oneX), maxX); + + FILTER_PROC(subX, subY, + SRC_TO_FILTER(row0[x0]), + SRC_TO_FILTER(row0[x1]), + SRC_TO_FILTER(row1[x0]), + SRC_TO_FILTER(row1[x1]), + colors); + colors += 1; + + fx += dx; + count--; + } + } + else + { + int end = 0; + int maxXFix = ((int)maxX-1)<<16; + num = (end-fx)/dx; + if (num < 0) num = 0; + + + if (num<count) + { + count = num; + post_count = post_count - count; + post_fx = fx + count*dx; + post_colors = post_colors + count; + } + else + post_count = 0; + + while (fx>=maxXFix && count) { + unsigned subX = TILEX_LOW_BITS(fx, maxX); + unsigned x0 = TILEX_PROCF(fx, maxX); + unsigned x1 = TILEX_PROCF((fx + oneX), maxX); + + FILTER_PROC(subX, subY, + SRC_TO_FILTER(row0[x0]), + SRC_TO_FILTER(row0[x1]), + SRC_TO_FILTER(row1[x0]), + SRC_TO_FILTER(row1[x1]), + colors); + colors += 1; + + fx += dx; + count--; + } + + } + + 
S32_Opaque_D32_filter_DX_shaderproc_neon(row0, row1, fx, maxX, subY, colors, dx, count); + + fx = post_fx; + colors = post_colors; + while (post_count) { + unsigned subX = TILEX_LOW_BITS(fx, maxX); + unsigned x0 = TILEX_PROCF(fx, maxX); + unsigned x1 = TILEX_PROCF((fx + oneX), maxX); + + FILTER_PROC(subX, subY, + SRC_TO_FILTER(row0[x0]), + SRC_TO_FILTER(row0[x1]), + SRC_TO_FILTER(row1[x0]), + SRC_TO_FILTER(row1[x1]), + colors); + colors += 1; + + fx += dx; + post_count--; + } + + +#else //S32_OPAQUE_D32_FILTER_DX_NEON + do { unsigned subX = TILEX_LOW_BITS(fx, maxX); unsigned x0 = TILEX_PROCF(fx, maxX); @@ -71,6 +181,8 @@ void SCALE_FILTER_NAME(const SkBitmapProcState& s, int x, int y, fx += dx; } while (--count != 0); +#endif //S32_OPAQUE_D32_FILTER_DX_NEON + #ifdef POSTAMBLE POSTAMBLE(s); #endif diff --git a/src/opts/ext/S32_Opaque_D32_filter_DX_shaderproc_neon.cpp b/src/opts/ext/S32_Opaque_D32_filter_DX_shaderproc_neon.cpp new file mode 100644 index 0000000000..909fd4354d --- /dev/null +++ b/src/opts/ext/S32_Opaque_D32_filter_DX_shaderproc_neon.cpp @@ -0,0 +1,476 @@ +/* + * Copyright (c) 2010,2013, The Linux Foundation. All rights reserved. + * * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are + * met: + * * Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * * Redistributions in binary form must reproduce the above + * copyright notice, this list of conditions and the following + * disclaimer in the documentation and/or other materials provided + * with the distribution. + * * Neither the name of The Linux Foundation nor the names of its + * contributors may be used to endorse or promote products derived + * from this software without specific prior written permission. 
+ * * + * THIS SOFTWARE IS PROVIDED "AS IS" AND ANY EXPRESS OR IMPLIED + * WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NON-INFRINGEMENT + * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS + * BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR + * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF + * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR + * BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, + * WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE + * OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN + * IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + * */ + +#include "SkFixed.h" +#include "SkUtilsArm.h" + +void S32_Opaque_D32_filter_DX_shaderproc_neon(const unsigned int* image0, const unsigned int* image1, + SkFixed fx, unsigned int maxX, unsigned int subY, + unsigned int* colors, + SkFixed dx, int count) __attribute__ ((optimize ("0"))) ; +void S32_Opaque_D32_filter_DX_shaderproc_neon(const unsigned int* image0, const unsigned int* image1, + SkFixed fx, unsigned int maxX, unsigned int subY, + unsigned int* colors, + SkFixed dx, int count) { + + asm volatile( + "mov r3, %[count] \n\t" //r3 = count + + "mov r5, %[fx] \n\t" //r5 = x = fx + "cmp r3, #0 \n\t" + "beq 12f \n\t" // branch forward to endloop if r3 == 0 + + "vdup.8 d17, %[subY] \n\t" // duplicate y into d17 + "vmov.u8 d16, #16 \n\t" // set up constant in d16 + "vsub.u8 d18, d16, d17 \n\t" // d18 = 16-y + + "vmov.u16 d16, #16 \n\t" // set up constant in d16,int 16bit + +#define UNROLL8 +#define UNROLL2 +#ifdef UNROLL8 + "cmp r3, #8 \n\t" + "blt 20f \n\t" // branch forward to initloop2 if r3 < 8 + ///////////////loop2 in x + "81: \n\t" // beginloop8: + + /////////////////pixel 1//////////////////////////////////// + //x0 = SkClampMax((fx) >> 16, max) + "asr r4, r5, #16 \n\t" + + "lsl r4, 
r4, #2 \n\t" + "add r6, r4, %[image0] \n\t" + "vldr.32 d4, [r6] \n\t" + "add r6, r4, %[image1] \n\t" + "vldr.32 d5, [r6] \n\t" + + //(((fx) >> 12) & 0xF) + "lsr r4, r5, #12 \n\t" + "and r4, r4, #15 \n\t" + "vdup.16 d19, r4 \n\t" // duplicate x into d19 + + + ////////////bilinear interp + + "vmull.u8 q3, d4, d18 \n\t" // q3 = [a01|a00] * (16-y) + "vmull.u8 q0, d5, d17 \n\t" // q0 = [a11|a10] * y + + "vsub.u16 d20, d16, d19 \n\t" // d20 = 16-x + + "vmul.i16 d22, d7, d19 \n\t" // d4 = a01 * x + "vmla.i16 d22, d1, d19 \n\t" // d4 += a11 * x + "vmla.i16 d22, d6, d20 \n\t" // d4 += a00 * (16-x) + "vmla.i16 d22, d0, d20 \n\t" // d4 += a10 * (16-x) + + //////////////// end bilinear interp + + "add r5, r5, %[dx] \n\t" //r5 = x += dx + + /////////////////pixel 2//////////////////////////////////// + //x0 = SkClampMax((fx) >> 16, max) + "asr r4, r5, #16 \n\t" + + "lsl r4, r4, #2 \n\t" + "add r6, r4, %[image0] \n\t" + "vldr.32 d4, [r6] \n\t" + "add r6, r4, %[image1] \n\t" + "vldr.32 d5, [r6] \n\t" + + //(((fx) >> 12) & 0xF) + "lsr r4, r5, #12 \n\t" + "and r4, r4, #15 \n\t" + "vdup.16 d19, r4 \n\t" // duplicate x into d19 + + + ////////////bilinear interp + + "vmull.u8 q3, d4, d18 \n\t" // q3 = [a01|a00] * (16-y) + "vmull.u8 q0, d5, d17 \n\t" // q0 = [a11|a10] * y + + "vsub.u16 d20, d16, d19 \n\t" // d20 = 16-x + + "vmul.i16 d24, d7, d19 \n\t" // d4 = a01 * x + "vmla.i16 d24, d1, d19 \n\t" // d4 += a11 * x + "vmla.i16 d24, d6, d20 \n\t" // d4 += a00 * (16-x) + "vmla.i16 d24, d0, d20 \n\t" // d4 += a10 * (16-x) + + //////////////// end bilinear interp + + "add r5, r5, %[dx] \n\t" //r5 = x += dx + + /////////////////pixel 3//////////////////////////////// + //x0 = SkClampMax((fx) >> 16, max) + "asr r4, r5, #16 \n\t" + + "lsl r4, r4, #2 \n\t" + "add r6, r4, %[image0] \n\t" + "vldr.32 d4, [r6] \n\t" + "add r6, r4, %[image1] \n\t" + "vldr.32 d5, [r6] \n\t" + + //(((fx) >> 12) & 0xF) + "lsr r4, r5, #12 \n\t" + "and r4, r4, #15 \n\t" + "vdup.16 d19, r4 \n\t" // duplicate x into d19 + 
+ + ////////////bilinear interp + + "vmull.u8 q3, d4, d18 \n\t" // q3 = [a01|a00] * (16-y) + "vmull.u8 q0, d5, d17 \n\t" // q0 = [a11|a10] * y + + "vsub.u16 d20, d16, d19 \n\t" // d20 = 16-x + + "vmul.i16 d26, d7, d19 \n\t" // d4 = a01 * x + "vmla.i16 d26, d1, d19 \n\t" // d4 += a11 * x + "vmla.i16 d26, d6, d20 \n\t" // d4 += a00 * (16-x) + "vmla.i16 d26, d0, d20 \n\t" // d4 += a10 * (16-x) + + //////////////// end bilinear interp + + "add r5, r5, %[dx] \n\t" //r5 = x += dx + + /////////////////pixel 4//////////////////////////////// + //x0 = SkClampMax((fx) >> 16, max) + "asr r4, r5, #16 \n\t" + + "lsl r4, r4, #2 \n\t" + "add r6, r4, %[image0] \n\t" + "vldr.32 d4, [r6] \n\t" + "add r6, r4, %[image1] \n\t" + "vldr.32 d5, [r6] \n\t" + + //(((fx) >> 12) & 0xF) + "lsr r4, r5, #12 \n\t" + "and r4, r4, #15 \n\t" + "vdup.16 d19, r4 \n\t" // duplicate x into d19 + + + ////////////bilinear interp + + "vmull.u8 q3, d4, d18 \n\t" // q3 = [a01|a00] * (16-y) + "vmull.u8 q0, d5, d17 \n\t" // q0 = [a11|a10] * y + + "vsub.u16 d20, d16, d19 \n\t" // d20 = 16-x + + "vmul.i16 d28, d7, d19 \n\t" // d4 = a01 * x + "vmla.i16 d28, d1, d19 \n\t" // d4 += a11 * x + "vmla.i16 d28, d6, d20 \n\t" // d4 += a00 * (16-x) + "vmla.i16 d28, d0, d20 \n\t" // d4 += a10 * (16-x) + + //////////////// end bilinear interp + + "add r5, r5, %[dx] \n\t" //r5 = x += dx + + /////////////////pixel 5//////////////////////////////////// + //x0 = SkClampMax((fx) >> 16, max) + "asr r4, r5, #16 \n\t" + + "lsl r4, r4, #2 \n\t" + "add r6, r4, %[image0] \n\t" + "vldr.32 d4, [r6] \n\t" + "add r6, r4, %[image1] \n\t" + "vldr.32 d5, [r6] \n\t" + + //(((fx) >> 12) & 0xF) + "lsr r4, r5, #12 \n\t" + "and r4, r4, #15 \n\t" + "vdup.16 d19, r4 \n\t" // duplicate x into d19 + + + ////////////bilinear interp + + "vmull.u8 q3, d4, d18 \n\t" // q3 = [a01|a00] * (16-y) + "vmull.u8 q0, d5, d17 \n\t" // q0 = [a11|a10] * y + + "vsub.u16 d20, d16, d19 \n\t" // d20 = 16-x + + "vmul.i16 d23, d7, d19 \n\t" // d4 = a01 * x + "vmla.i16 
d23, d1, d19 \n\t" // d4 += a11 * x + "vmla.i16 d23, d6, d20 \n\t" // d4 += a00 * (16-x) + "vmla.i16 d23, d0, d20 \n\t" // d4 += a10 * (16-x) + + //////////////// end bilinear interp + + "add r5, r5, %[dx] \n\t" //r5 = x += dx + + /////////////////pixel 6//////////////////////////////////// + //x0 = SkClampMax((fx) >> 16, max) + "asr r4, r5, #16 \n\t" + + "lsl r4, r4, #2 \n\t" + "add r6, r4, %[image0] \n\t" + "vldr.32 d4, [r6] \n\t" + "add r6, r4, %[image1] \n\t" + "vldr.32 d5, [r6] \n\t" + + //(((fx) >> 12) & 0xF) + "lsr r4, r5, #12 \n\t" + "and r4, r4, #15 \n\t" + "vdup.16 d19, r4 \n\t" // duplicate x into d19 + + + ////////////bilinear interp + + "vmull.u8 q3, d4, d18 \n\t" // q3 = [a01|a00] * (16-y) + "vmull.u8 q0, d5, d17 \n\t" // q0 = [a11|a10] * y + + "vsub.u16 d20, d16, d19 \n\t" // d20 = 16-x + + "vmul.i16 d25, d7, d19 \n\t" // d4 = a01 * x + "vmla.i16 d25, d1, d19 \n\t" // d4 += a11 * x + "vmla.i16 d25, d6, d20 \n\t" // d4 += a00 * (16-x) + "vmla.i16 d25, d0, d20 \n\t" // d4 += a10 * (16-x) + + //////////////// end bilinear interp + + "add r5, r5, %[dx] \n\t" //r5 = x += dx + + /////////////////pixel 7//////////////////////////////// + //x0 = SkClampMax((fx) >> 16, max) + "asr r4, r5, #16 \n\t" + + "lsl r4, r4, #2 \n\t" + "add r6, r4, %[image0] \n\t" + "vldr.32 d4, [r6] \n\t" + "add r6, r4, %[image1] \n\t" + "vldr.32 d5, [r6] \n\t" + + //(((fx) >> 12) & 0xF) + "lsr r4, r5, #12 \n\t" + "and r4, r4, #15 \n\t" + "vdup.16 d19, r4 \n\t" // duplicate x into d19 + + + ////////////bilinear interp + + "vmull.u8 q3, d4, d18 \n\t" // q3 = [a01|a00] * (16-y) + "vmull.u8 q0, d5, d17 \n\t" // q0 = [a11|a10] * y + + "vsub.u16 d20, d16, d19 \n\t" // d20 = 16-x + + "vmul.i16 d27, d7, d19 \n\t" // d4 = a01 * x + "vmla.i16 d27, d1, d19 \n\t" // d4 += a11 * x + "vmla.i16 d27, d6, d20 \n\t" // d4 += a00 * (16-x) + "vmla.i16 d27, d0, d20 \n\t" // d4 += a10 * (16-x) + + //////////////// end bilinear interp + + "add r5, r5, %[dx] \n\t" //r5 = x += dx + + /////////////////pixel 
8//////////////////////////////// + //x0 = SkClampMax((fx) >> 16, max) + "asr r4, r5, #16 \n\t" + + "lsl r4, r4, #2 \n\t" + "add r6, r4, %[image0] \n\t" + "vldr.32 d4, [r6] \n\t" + "add r6, r4, %[image1] \n\t" + "vldr.32 d5, [r6] \n\t" + + //(((fx) >> 12) & 0xF) + "lsr r4, r5, #12 \n\t" + "and r4, r4, #15 \n\t" + "vdup.16 d19, r4 \n\t" // duplicate x into d19 + + + ////////////bilinear interp + + "vmull.u8 q3, d4, d18 \n\t" // q3 = [a01|a00] * (16-y) + "vmull.u8 q0, d5, d17 \n\t" // q0 = [a11|a10] * y + + "vsub.u16 d20, d16, d19 \n\t" // d20 = 16-x + + "vmul.i16 d29, d7, d19 \n\t" // d4 = a01 * x + "vmla.i16 d29, d1, d19 \n\t" // d4 += a11 * x + "vmla.i16 d29, d6, d20 \n\t" // d4 += a00 * (16-x) + "vmla.i16 d29, d0, d20 \n\t" // d4 += a10 * (16-x) + + //////////////// Store results/////////////////// + + "vshrn.i16 d0, q11, #8 \n\t" // shift down result by 8 + "vshrn.i16 d1, q12, #8 \n\t" // shift down result by 8 + "vshrn.i16 d2, q13, #8 \n\t" // shift down result by 8 + "vshrn.i16 d3, q14, #8 \n\t" // shift down result by 8 + + "vst4.u32 {d0, d1, d2, d3}, [%[colors]]! 
\n\t" // store result + + //////////////// end bilinear interp + + "sub r3, r3, #8 \n\t" //num -=8 + "add r5, r5, %[dx] \n\t" //r5 = x += dx + "cmp r3, #7 \n\t" + + "bgt 81b \n\t" // branch backward to beginloop8 if r3 > 7 + + "82: \n\t" // endloop8: + ////////////////end loop in x +#endif //UNROLL8 + + + +#ifdef UNROLL2 + "20: \n\t" // initloop2: + "cmp r3, #2 \n\t" + "blt 10f \n\t" // branch forward to initloop if r3 < 2 + ///////////////loop2 in x + "21: \n\t" // beginloop2: + + + //x0 = SkClampMax((fx) >> 16, max) + "asr r4, r5, #16 \n\t" + + "lsl r4, r4, #2 \n\t" + "add r6, r4, %[image0] \n\t" + "vldr.32 d4, [r6] \n\t" + "add r6, r4, %[image1] \n\t" + "vldr.32 d5, [r6] \n\t" + + //(((fx) >> 12) & 0xF) + "lsr r4, r5, #12 \n\t" + "and r4, r4, #15 \n\t" + "vdup.16 d19, r4 \n\t" // duplicate x into d19 + + + ////////////bilinear interp + + "vmull.u8 q3, d4, d18 \n\t" // q3 = [a01|a00] * (16-y) + "vmull.u8 q0, d5, d17 \n\t" // q0 = [a11|a10] * y + + "vsub.u16 d20, d16, d19 \n\t" // d20 = 16-x + + "vmul.i16 d22, d7, d19 \n\t" // d4 = a01 * x + "vmla.i16 d22, d1, d19 \n\t" // d4 += a11 * x + "vmla.i16 d22, d6, d20 \n\t" // d4 += a00 * (16-x) + "vmla.i16 d22, d0, d20 \n\t" // d4 += a10 * (16-x) + + //////////////// end bilinear interp + + "add r5, r5, %[dx] \n\t" //r5 = x += dx + + /////////////////second half//////////////////////////////// + //x0 = SkClampMax((fx) >> 16, max) + "asr r4, r5, #16 \n\t" + + "lsl r4, r4, #2 \n\t" + "add r6, r4, %[image0] \n\t" + "vldr.32 d4, [r6] \n\t" + "add r6, r4, %[image1] \n\t" + "vldr.32 d5, [r6] \n\t" + + //(((fx) >> 12) & 0xF) + "lsr r4, r5, #12 \n\t" + "and r4, r4, #15 \n\t" + "vdup.16 d19, r4 \n\t" // duplicate x into d19 + + + ////////////bilinear interp + + "vmull.u8 q3, d4, d18 \n\t" // q3 = [a01|a00] * (16-y) + "vmull.u8 q0, d5, d17 \n\t" // q0 = [a11|a10] * y + + "vsub.u16 d20, d16, d19 \n\t" // d20 = 16-x + + "vmul.i16 d23, d7, d19 \n\t" // d4 = a01 * x + "vmla.i16 d23, d1, d19 \n\t" // d4 += a11 * x + "vmla.i16 d23, d6, 
d20 \n\t" // d4 += a00 * (16-x) + "vmla.i16 d23, d0, d20 \n\t" // d4 += a10 * (16-x) + "vshrn.i16 d0, q11, #8 \n\t" // shift down result by 8 + + "vst1.u32 {d0}, [%[colors]]! \n\t" // store result + + //////////////// end bilinear interp + + "sub r3, r3, #2 \n\t" //num -=2 + "add r5, r5, %[dx] \n\t" //r5 = x += dx + "cmp r3, #1 \n\t" + + "bgt 21b \n\t" // branch backward to beginloop2 if r3 > 1 + + "22: \n\t" // endloop2: + ////////////////end loop in x +#endif //UNROLL2 + +#if defined (UNROLL2) || defined (UNROLL8) + "10: \n\t" // initloop: + "cmp r3, #0 \n\t" + "ble 12f \n\t" // branch forward to endloop if r3 <= 0 +#endif //defined (UNROLL2) || defined (UNROLL8) + + ///////////////loop in x + "11: \n\t" // beginloop: + + + //x0 = SkClampMax((fx) >> 16, max) + "asr r4, r5, #16 \n\t" + + "lsl r4, r4, #2 \n\t" + "add r6, r4, %[image0] \n\t" + "vldr.32 d4, [r6] \n\t" + "add r6, r4, %[image1] \n\t" + "vldr.32 d5, [r6] \n\t" + + //(((fx) >> 12) & 0xF) + "lsr r4, r5, #12 \n\t" + "and r4, r4, #15 \n\t" + "vdup.16 d19, r4 \n\t" // duplicate x into d19 + + + ////////////bilinear interp + + "vmull.u8 q3, d4, d18 \n\t" // q3 = [a01|a00] * (16-y) + "vmull.u8 q0, d5, d17 \n\t" // q0 = [a11|a10] * y + + "vsub.u16 d20, d16, d19 \n\t" // d20 = 16-x + + "vmul.i16 d4, d7, d19 \n\t" // d4 = a01 * x + "vmla.i16 d4, d1, d19 \n\t" // d4 += a11 * x + "vmla.i16 d4, d6, d20 \n\t" // d4 += a00 * (16-x) + "vmla.i16 d4, d0, d20 \n\t" // d4 += a10 * (16-x) + "vshrn.i16 d0, q2, #8 \n\t" // shift down result by 8 + + "vst1.u32 {d0[0]}, [%[colors]]! 
\n\t" // store result + + //////////////// end bilinear interp + + "sub r3, r3, #1 \n\t" //num -=1 + "add r5, r5, %[dx] \n\t" //r5 = x += dx + "cmp r3, #0 \n\t" + "bgt 11b \n\t" // branch backward to beginloop if r3 > 0 + + "12: \n\t" // endloop: + ////////////////end loop in x + : [colors] "+r" (colors) + : [image0] "r" (image0), [image1] "r" (image1), [fx] "r" (fx), [maxX] "r" (maxX), [subY] "r" (subY), + [dx] "r" (dx), [count] "r" (count) + : "cc", "memory", "r3", "r4", "r5", "r6", "d0", "d1", "d2", "d3", "d4", "d5", "d6", "d7", "d16", "d17", "d18", "d19", "d20", "d21", "d22", "d23", "d24", "d25", "d26", "d27", "d28", "d29" + ); + + +} |