author     Xin Qi <xqi@codeaurora.org>       2013-11-26 11:52:20 -0800
committer  Steve Kondik <steve@cyngn.com>    2014-12-11 20:42:18 -0800
commit     fe5bd66c43c1cff5ea84c1e88e05044ca19b3877 (patch)
tree       1ac5cf28430c403c8e4877cfaf49d49ec3734e68
parent     d9c9fb807b11d3f12e0f687bef40c58ad2ed7ae1 (diff)
NEON optimized blitter S32_Opaque_D32_filter_DX
Re-worked for the latest code base, where the -Os compiler option is now added; the optimization level is overridden at the function level.

Change-Id: I732fea7f02f775a4b06884460dea2887d4108820
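For context, GCC's optimize function attribute is what lets a single function override the unit-wide -Os level. A minimal sketch of the mechanism (hypothetical function name; the new file below spells the argument as "0"):

    // Built at -O0 even when the rest of the file is compiled with -Os.
    __attribute__((optimize("O0")))
    void reference_blit_loop(void) {
        /* ... hand-scheduled inline asm the optimizer must not touch ... */
    }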
-rw-r--r--  Android.mk                                                |   4
-rw-r--r--  src/core/SkBitmapProcState.cpp                            |   9
-rw-r--r--  src/core/SkBitmapProcState_procs.h                        |  18
-rw-r--r--  src/core/SkBitmapProcState_shaderproc.h                   | 114
-rw-r--r--  src/opts/ext/S32_Opaque_D32_filter_DX_shaderproc_neon.cpp | 476
5 files changed, 618 insertions, 3 deletions
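The patch adds a NEON fast path for bilinear filtering of opaque 32-bit pixels. As a reader's model, here is the per-pixel math the assembly below implements, reconstructed from its own comments (a00/a01 are the two neighbors in row0, a10/a11 in row1, x and y are the 4-bit subpixel fractions, 0..15):

    #include <stdint.h>

    // The four weights (16-x)(16-y), x(16-y), (16-x)y, xy sum to 256,
    // so the final >> 8 renormalizes each 8-bit channel. The NEON code
    // does all four ARGB lanes at once with vmull.u8/vmla.i16/vshrn #8.
    static uint32_t bilinear_4x4(uint32_t a00, uint32_t a01,
                                 uint32_t a10, uint32_t a11,
                                 unsigned x, unsigned y) {
        uint32_t out = 0;
        for (int s = 0; s < 32; s += 8) {            // one byte lane at a time
            uint32_t c = (((a00 >> s) & 0xFF) * (16 - x) * (16 - y) +
                          ((a01 >> s) & 0xFF) * x        * (16 - y) +
                          ((a10 >> s) & 0xFF) * (16 - x) * y        +
                          ((a11 >> s) & 0xFF) * x        * y) >> 8;
            out |= c << s;
        }
        return out;
    }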
diff --git a/Android.mk b/Android.mk
index cb19f6e3fd..87daedff0e 100644
--- a/Android.mk
+++ b/Android.mk
@@ -599,6 +599,7 @@ LOCAL_SRC_FILES_arm += \
src/opts/SkUtils_opts_arm.cpp \
src/opts/SkXfermode_opts_arm.cpp
+
ifeq ($(ARCH_ARM_HAVE_NEON), true)
LOCAL_SRC_FILES_arm += \
src/opts/memset16_neon.S \
@@ -609,7 +610,8 @@ LOCAL_SRC_FILES_arm += \
src/opts/SkBlitRow_opts_arm_neon.cpp \
src/opts/SkBlurImage_opts_neon.cpp \
src/opts/SkMorphology_opts_neon.cpp \
- src/opts/SkXfermode_opts_arm_neon.cpp
+ src/opts/SkXfermode_opts_arm_neon.cpp \
+ src/opts/ext/S32_Opaque_D32_filter_DX_shaderproc_neon.cpp
LOCAL_CFLAGS_arm += \
-D__ARM_HAVE_NEON
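After this hunk, the new NEON-only source sits inside the existing ifeq ($(ARCH_ARM_HAVE_NEON), true) block, so non-NEON builds never compile it. Reconstructed from the context above (middle entries elided; the closing endif is assumed to follow):

    ifeq ($(ARCH_ARM_HAVE_NEON), true)
    LOCAL_SRC_FILES_arm += \
        src/opts/memset16_neon.S \
        ...
        src/opts/SkXfermode_opts_arm_neon.cpp \
        src/opts/ext/S32_Opaque_D32_filter_DX_shaderproc_neon.cpp

    LOCAL_CFLAGS_arm += \
        -D__ARM_HAVE_NEON
    endif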
diff --git a/src/core/SkBitmapProcState.cpp b/src/core/SkBitmapProcState.cpp
index 3605910309..751c2c1753 100644
--- a/src/core/SkBitmapProcState.cpp
+++ b/src/core/SkBitmapProcState.cpp
@@ -1,4 +1,3 @@
-
/*
* Copyright 2011 Google Inc.
*
@@ -27,6 +26,10 @@ extern void Repeat_S16_D16_filter_DX_shaderproc_neon(const SkBitmapProcState&,
extern void SI8_opaque_D32_filter_DX_neon(const SkBitmapProcState&, const uint32_t*, int, SkPMColor*);
extern void SI8_opaque_D32_filter_DX_shaderproc_neon(const SkBitmapProcState&, int, int, uint32_t*, int);
extern void Clamp_SI8_opaque_D32_filter_DX_shaderproc_neon(const SkBitmapProcState&, int, int, uint32_t*, int);
+#if !defined(__LP64__)
+extern void S32_opaque_D32_filter_DX_neon(const SkBitmapProcState&, const uint32_t*, int, SkPMColor*);
+extern void Clamp_S32_Opaque_D32_filter_DX_shaderproc_neon(const SkBitmapProcState&, int, int, uint32_t*, int);
+#endif //#if !defined(__LP64__)
#endif
#define NAME_WRAP(x) x
@@ -626,6 +629,10 @@ bool SkBitmapProcState::chooseProcs(const SkMatrix& inv, const SkPaint& paint) {
}
} else if (SK_ARM_NEON_WRAP(SI8_opaque_D32_filter_DX) == fSampleProc32 && clampClamp) {
fShaderProc32 = SK_ARM_NEON_WRAP(Clamp_SI8_opaque_D32_filter_DX_shaderproc);
+#if !defined(__LP64__)
+ } else if (SK_ARM_NEON_WRAP(S32_opaque_D32_filter_DX) == fSampleProc32 && clampClamp) {
+ fShaderProc32 = SK_ARM_NEON_WRAP(Clamp_S32_Opaque_D32_filter_DX_shaderproc);
+#endif //#if !defined(__LP64__)
}
if (NULL == fShaderProc32) {
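SK_ARM_NEON_WRAP resolves a proc name to its portable or _neon variant depending on how Skia was configured. Paraphrased from Skia's SkUtilsArm.h (exact shape may differ by version):

    #if SK_ARM_NEON_IS_NONE
        #define SK_ARM_NEON_WRAP(x) (x)
    #elif SK_ARM_NEON_IS_ALWAYS
        #define SK_ARM_NEON_WRAP(x) (x ## _neon)
    #else   // SK_ARM_NEON_IS_DYNAMIC: runtime CPU check
        #define SK_ARM_NEON_WRAP(x) (sk_cpu_arm_has_neon() ? x ## _neon : x)
    #endif

So the new else-if installs the clamp/clamp shader proc only when the sampler already chosen is the matching S32 filter proc and both tile modes are clamp.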
diff --git a/src/core/SkBitmapProcState_procs.h b/src/core/SkBitmapProcState_procs.h
index 0d3b723e67..9a44346c92 100644
--- a/src/core/SkBitmapProcState_procs.h
+++ b/src/core/SkBitmapProcState_procs.h
@@ -340,4 +340,22 @@ static inline U8CPU Filter_8(unsigned x, unsigned y,
#define POSTAMBLE(state) state.fBitmap->getColorTable()->unlockColors()
#include "SkBitmapProcState_shaderproc.h"
+#if !defined(__LP64__)
+#define TILEX_PROCF(fx, max) SkClampMax((fx) >> 16, max)
+#define TILEY_PROCF(fy, max) SkClampMax((fy) >> 16, max)
+#define TILEX_LOW_BITS(fx, max) (((fx) >> 12) & 0xF)
+#define TILEY_LOW_BITS(fy, max) (((fy) >> 12) & 0xF)
+
+#undef FILTER_PROC
+#define FILTER_PROC(x, y, a, b, c, d, dst) NAME_WRAP(Filter_32_opaque)(x, y, a, b, c, d, dst)
+#define MAKENAME(suffix) NAME_WRAP(Clamp_S32_Opaque_D32 ## suffix)
+#define SRCTYPE uint32_t
+#define DSTTYPE uint32_t
+#define CHECKSTATE(state) SkASSERT(state.fBitmap->config() == SkBitmap::kARGB_8888_Config)
+#define SRC_TO_FILTER(src) src
+#define S32_OPAQUE_D32_FILTER_DX_NEON (!SK_ARM_NEON_IS_NONE)
+#include "SkBitmapProcState_shaderproc.h"
+#undef S32_OPAQUE_D32_FILTER_DX_NEON
+#endif //#if !defined(__LP64__)
+
#undef NAME_WRAP
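The #define block added above drives an include-as-template idiom: SkBitmapProcState_shaderproc.h consumes MAKENAME, SRCTYPE, DSTTYPE and the TILE*/FILTER_PROC macros, and each inclusion stamps out one concrete shader proc. The idiom in miniature (hypothetical names):

    /* fill_template.h -- expects NAME and PIXEL_T; resets them afterwards. */
    static void NAME(PIXEL_T *dst, int n) {
        for (int i = 0; i < n; i++)
            dst[i] = (PIXEL_T)i;    /* placeholder body */
    }
    #undef NAME
    #undef PIXEL_T

    /* user.c -- each include emits another function. */
    #define NAME    fill_u32
    #define PIXEL_T unsigned int
    #include "fill_template.h"      /* emits fill_u32() */

    #define NAME    fill_u16
    #define PIXEL_T unsigned short
    #include "fill_template.h"      /* emits fill_u16() */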
diff --git a/src/core/SkBitmapProcState_shaderproc.h b/src/core/SkBitmapProcState_shaderproc.h
index 0014b4a526..877b8cfbd0 100644
--- a/src/core/SkBitmapProcState_shaderproc.h
+++ b/src/core/SkBitmapProcState_shaderproc.h
@@ -1,4 +1,3 @@
-
/*
* Copyright 2011 Google Inc.
*
@@ -8,6 +7,13 @@
#include "SkMathPriv.h"
+#if S32_OPAQUE_D32_FILTER_DX_NEON
+void S32_Opaque_D32_filter_DX_shaderproc_neon(const unsigned int* image0, const unsigned int* image1,
+ SkFixed fx, unsigned int maxX, unsigned int subY,
+ unsigned int* colors,
+ SkFixed dx, int count);
+#endif
+
#define SCALE_FILTER_NAME MAKENAME(_filter_DX_shaderproc)
// Can't be static in the general case because some of these implementations
@@ -55,6 +61,110 @@ void SCALE_FILTER_NAME(const SkBitmapProcState& s, int x, int y,
PREAMBLE(s);
#endif
+#if S32_OPAQUE_D32_FILTER_DX_NEON
+ int post_count;
+ SkFixed post_fx;
+ DSTTYPE* SK_RESTRICT post_colors;
+ int num;
+ post_count = count;
+ post_fx = fx;
+ post_colors = colors;
+
+
+ if (dx>=0)  // stepping forward: the clamp-free span ends once fx passes (maxX-1)<<16
+ {
+ int end = ((int)maxX-1)<<16;
+ num = dx?((end-fx)/dx):0;
+ if (num < 0) num = 0;
+
+ if (num<count)
+ {
+ count = num;
+ post_count = post_count - count;
+ post_fx = fx + count*dx;
+ post_colors = post_colors + count;
+ }
+ else
+ post_count = 0;
+
+ while (fx<0 && count) {  // scalar head: fx still clamps to column 0
+ unsigned subX = TILEX_LOW_BITS(fx, maxX);
+ unsigned x0 = TILEX_PROCF(fx, maxX);
+ unsigned x1 = TILEX_PROCF((fx + oneX), maxX);
+
+ FILTER_PROC(subX, subY,
+ SRC_TO_FILTER(row0[x0]),
+ SRC_TO_FILTER(row0[x1]),
+ SRC_TO_FILTER(row1[x0]),
+ SRC_TO_FILTER(row1[x1]),
+ colors);
+ colors += 1;
+
+ fx += dx;
+ count--;
+ }
+ }
+ else  // dx < 0: stepping backward; the clamp-free span ends once fx drops below 0
+ {
+ int end = 0;
+ int maxXFix = ((int)maxX-1)<<16;
+ num = (end-fx)/dx;
+ if (num < 0) num = 0;
+
+
+ if (num<count)
+ {
+ count = num;
+ post_count = post_count - count;
+ post_fx = fx + count*dx;
+ post_colors = post_colors + count;
+ }
+ else
+ post_count = 0;
+
+ while (fx>=maxXFix && count) {  // scalar head: fx still clamps to the last column
+ unsigned subX = TILEX_LOW_BITS(fx, maxX);
+ unsigned x0 = TILEX_PROCF(fx, maxX);
+ unsigned x1 = TILEX_PROCF((fx + oneX), maxX);
+
+ FILTER_PROC(subX, subY,
+ SRC_TO_FILTER(row0[x0]),
+ SRC_TO_FILTER(row0[x1]),
+ SRC_TO_FILTER(row1[x0]),
+ SRC_TO_FILTER(row1[x1]),
+ colors);
+ colors += 1;
+
+ fx += dx;
+ count--;
+ }
+
+ }
+
+ S32_Opaque_D32_filter_DX_shaderproc_neon(row0, row1, fx, maxX, subY, colors, dx, count);  // NEON body: every fx here maps inside [0, maxX-1]
+
+ fx = post_fx;
+ colors = post_colors;
+ while (post_count) {  // scalar tail: remaining pixels filtered with clamped indices
+ unsigned subX = TILEX_LOW_BITS(fx, maxX);
+ unsigned x0 = TILEX_PROCF(fx, maxX);
+ unsigned x1 = TILEX_PROCF((fx + oneX), maxX);
+
+ FILTER_PROC(subX, subY,
+ SRC_TO_FILTER(row0[x0]),
+ SRC_TO_FILTER(row0[x1]),
+ SRC_TO_FILTER(row1[x0]),
+ SRC_TO_FILTER(row1[x1]),
+ colors);
+ colors += 1;
+
+ fx += dx;
+ post_count--;
+ }
+
+
+#else //S32_OPAQUE_D32_FILTER_DX_NEON
+
do {
unsigned subX = TILEX_LOW_BITS(fx, maxX);
unsigned x0 = TILEX_PROCF(fx, maxX);
@@ -71,6 +181,8 @@ void SCALE_FILTER_NAME(const SkBitmapProcState& s, int x, int y,
fx += dx;
} while (--count != 0);
+#endif //S32_OPAQUE_D32_FILTER_DX_NEON
+
#ifdef POSTAMBLE
POSTAMBLE(s);
#endif
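The block added to this header splits each scanline into three spans: a scalar head while fx still clamps (fx < 0 going forward, or fx >= (maxX-1)<<16 going backward), a NEON body of count pixels whose source coordinates are guaranteed in range, and a scalar tail of post_count pixels. The body length comes from num = (end - fx) / dx. A worked example in 16.16 fixed point, with hypothetical values:

    // maxX = 100            -> end = 99 << 16
    // fx   = 0x00008000     (0.5)
    // dx   = 0x00018000     (step of 1.5)
    // num  = (end - fx) / dx = (99.0 - 0.5) / 1.5 -> 65 (integer division)
    // With count = 80: the NEON routine gets 65 pixels; the remaining
    // post_count = 15 pixels past the right edge use clamped indices.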
diff --git a/src/opts/ext/S32_Opaque_D32_filter_DX_shaderproc_neon.cpp b/src/opts/ext/S32_Opaque_D32_filter_DX_shaderproc_neon.cpp
new file mode 100644
index 0000000000..909fd4354d
--- /dev/null
+++ b/src/opts/ext/S32_Opaque_D32_filter_DX_shaderproc_neon.cpp
@@ -0,0 +1,476 @@
+/*
+ * Copyright (c) 2010,2013, The Linux Foundation. All rights reserved.
+ * *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are
+ * met:
+ * * Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ * * Redistributions in binary form must reproduce the above
+ * copyright notice, this list of conditions and the following
+ * disclaimer in the documentation and/or other materials provided
+ * with the distribution.
+ * * Neither the name of The Linux Foundation nor the names of its
+ * contributors may be used to endorse or promote products derived
+ * from this software without specific prior written permission.
+ * *
+ * THIS SOFTWARE IS PROVIDED "AS IS" AND ANY EXPRESS OR IMPLIED
+ * WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NON-INFRINGEMENT
+ * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS
+ * BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
+ * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
+ * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR
+ * BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY,
+ * WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE
+ * OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN
+ * IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ * */
+
+#include "SkFixed.h"
+#include "SkUtilsArm.h"
+
+void S32_Opaque_D32_filter_DX_shaderproc_neon(const unsigned int* image0, const unsigned int* image1,
+ SkFixed fx, unsigned int maxX, unsigned int subY,
+ unsigned int* colors,
+ SkFixed dx, int count) __attribute__ ((optimize ("0"))) ;
+void S32_Opaque_D32_filter_DX_shaderproc_neon(const unsigned int* image0, const unsigned int* image1,
+ SkFixed fx, unsigned int maxX, unsigned int subY,
+ unsigned int* colors,
+ SkFixed dx, int count) {
+
+ asm volatile(
+ "mov r3, %[count] \n\t" //r3 = count
+
+ "mov r5, %[fx] \n\t" //r5 = x = fx
+ "cmp r3, #0 \n\t"
+ "beq 12f \n\t" // branch forward to endloop if r3 == 0
+
+ "vdup.8 d17, %[subY] \n\t" // duplicate y into d17
+ "vmov.u8 d16, #16 \n\t" // set up constant in d16
+ "vsub.u8 d18, d16, d17 \n\t" // d18 = 16-y
+
+ "vmov.u16 d16, #16 \n\t" // set up constant in d16,int 16bit
+
+#define UNROLL8
+#define UNROLL2
+#ifdef UNROLL8
+ "cmp r3, #8 \n\t"
+ "blt 20f \n\t" // branch forward to initloop2 if r3 < 8
+ ///////////////loop8 in x
+ "81: \n\t" // beginloop8:
+
+ /////////////////pixel 1////////////////////////////////////
+ //x0 = SkClampMax((fx) >> 16, max)
+ "asr r4, r5, #16 \n\t"
+
+ "lsl r4, r4, #2 \n\t"
+ "add r6, r4, %[image0] \n\t"
+ "vldr.32 d4, [r6] \n\t"
+ "add r6, r4, %[image1] \n\t"
+ "vldr.32 d5, [r6] \n\t"
+
+ //(((fx) >> 12) & 0xF)
+ "lsr r4, r5, #12 \n\t"
+ "and r4, r4, #15 \n\t"
+ "vdup.16 d19, r4 \n\t" // duplicate x into d19
+
+
+ ////////////bilinear interp
+
+ "vmull.u8 q3, d4, d18 \n\t" // q3 = [a01|a00] * (16-y)
+ "vmull.u8 q0, d5, d17 \n\t" // q0 = [a11|a10] * y
+
+ "vsub.u16 d20, d16, d19 \n\t" // d20 = 16-x
+
+ "vmul.i16 d22, d7, d19 \n\t" // d4 = a01 * x
+ "vmla.i16 d22, d1, d19 \n\t" // d4 += a11 * x
+ "vmla.i16 d22, d6, d20 \n\t" // d4 += a00 * (16-x)
+ "vmla.i16 d22, d0, d20 \n\t" // d4 += a10 * (16-x)
+
+ //////////////// end bilinear interp
+
+ "add r5, r5, %[dx] \n\t" //r5 = x += dx
+
+ /////////////////pixel 2////////////////////////////////////
+ //x0 = SkClampMax((fx) >> 16, max)
+ "asr r4, r5, #16 \n\t"
+
+ "lsl r4, r4, #2 \n\t"
+ "add r6, r4, %[image0] \n\t"
+ "vldr.32 d4, [r6] \n\t"
+ "add r6, r4, %[image1] \n\t"
+ "vldr.32 d5, [r6] \n\t"
+
+ //(((fx) >> 12) & 0xF)
+ "lsr r4, r5, #12 \n\t"
+ "and r4, r4, #15 \n\t"
+ "vdup.16 d19, r4 \n\t" // duplicate x into d19
+
+
+ ////////////bilinear interp
+
+ "vmull.u8 q3, d4, d18 \n\t" // q3 = [a01|a00] * (16-y)
+ "vmull.u8 q0, d5, d17 \n\t" // q0 = [a11|a10] * y
+
+ "vsub.u16 d20, d16, d19 \n\t" // d20 = 16-x
+
+ "vmul.i16 d24, d7, d19 \n\t" // d4 = a01 * x
+ "vmla.i16 d24, d1, d19 \n\t" // d4 += a11 * x
+ "vmla.i16 d24, d6, d20 \n\t" // d4 += a00 * (16-x)
+ "vmla.i16 d24, d0, d20 \n\t" // d4 += a10 * (16-x)
+
+ //////////////// end bilinear interp
+
+ "add r5, r5, %[dx] \n\t" //r5 = x += dx
+
+ /////////////////pixel 3////////////////////////////////
+ //x0 = SkClampMax((fx) >> 16, max)
+ "asr r4, r5, #16 \n\t"
+
+ "lsl r4, r4, #2 \n\t"
+ "add r6, r4, %[image0] \n\t"
+ "vldr.32 d4, [r6] \n\t"
+ "add r6, r4, %[image1] \n\t"
+ "vldr.32 d5, [r6] \n\t"
+
+ //(((fx) >> 12) & 0xF)
+ "lsr r4, r5, #12 \n\t"
+ "and r4, r4, #15 \n\t"
+ "vdup.16 d19, r4 \n\t" // duplicate x into d19
+
+
+ ////////////bilinear interp
+
+ "vmull.u8 q3, d4, d18 \n\t" // q3 = [a01|a00] * (16-y)
+ "vmull.u8 q0, d5, d17 \n\t" // q0 = [a11|a10] * y
+
+ "vsub.u16 d20, d16, d19 \n\t" // d20 = 16-x
+
+ "vmul.i16 d26, d7, d19 \n\t" // d4 = a01 * x
+ "vmla.i16 d26, d1, d19 \n\t" // d4 += a11 * x
+ "vmla.i16 d26, d6, d20 \n\t" // d4 += a00 * (16-x)
+ "vmla.i16 d26, d0, d20 \n\t" // d4 += a10 * (16-x)
+
+ //////////////// end bilinear interp
+
+ "add r5, r5, %[dx] \n\t" //r5 = x += dx
+
+ /////////////////pixel 4////////////////////////////////
+ //x0 = SkClampMax((fx) >> 16, max)
+ "asr r4, r5, #16 \n\t"
+
+ "lsl r4, r4, #2 \n\t"
+ "add r6, r4, %[image0] \n\t"
+ "vldr.32 d4, [r6] \n\t"
+ "add r6, r4, %[image1] \n\t"
+ "vldr.32 d5, [r6] \n\t"
+
+ //(((fx) >> 12) & 0xF)
+ "lsr r4, r5, #12 \n\t"
+ "and r4, r4, #15 \n\t"
+ "vdup.16 d19, r4 \n\t" // duplicate x into d19
+
+
+ ////////////bilinear interp
+
+ "vmull.u8 q3, d4, d18 \n\t" // q3 = [a01|a00] * (16-y)
+ "vmull.u8 q0, d5, d17 \n\t" // q0 = [a11|a10] * y
+
+ "vsub.u16 d20, d16, d19 \n\t" // d20 = 16-x
+
+ "vmul.i16 d28, d7, d19 \n\t" // d4 = a01 * x
+ "vmla.i16 d28, d1, d19 \n\t" // d4 += a11 * x
+ "vmla.i16 d28, d6, d20 \n\t" // d4 += a00 * (16-x)
+ "vmla.i16 d28, d0, d20 \n\t" // d4 += a10 * (16-x)
+
+ //////////////// end bilinear interp
+
+ "add r5, r5, %[dx] \n\t" //r5 = x += dx
+
+ /////////////////pixel 5////////////////////////////////////
+ //x0 = SkClampMax((fx) >> 16, max)
+ "asr r4, r5, #16 \n\t"
+
+ "lsl r4, r4, #2 \n\t"
+ "add r6, r4, %[image0] \n\t"
+ "vldr.32 d4, [r6] \n\t"
+ "add r6, r4, %[image1] \n\t"
+ "vldr.32 d5, [r6] \n\t"
+
+ //(((fx) >> 12) & 0xF)
+ "lsr r4, r5, #12 \n\t"
+ "and r4, r4, #15 \n\t"
+ "vdup.16 d19, r4 \n\t" // duplicate x into d19
+
+
+ ////////////bilinear interp
+
+ "vmull.u8 q3, d4, d18 \n\t" // q3 = [a01|a00] * (16-y)
+ "vmull.u8 q0, d5, d17 \n\t" // q0 = [a11|a10] * y
+
+ "vsub.u16 d20, d16, d19 \n\t" // d20 = 16-x
+
+ "vmul.i16 d23, d7, d19 \n\t" // d4 = a01 * x
+ "vmla.i16 d23, d1, d19 \n\t" // d4 += a11 * x
+ "vmla.i16 d23, d6, d20 \n\t" // d4 += a00 * (16-x)
+ "vmla.i16 d23, d0, d20 \n\t" // d4 += a10 * (16-x)
+
+ //////////////// end bilinear interp
+
+ "add r5, r5, %[dx] \n\t" //r5 = x += dx
+
+ /////////////////pixel 6////////////////////////////////////
+ //x0 = SkClampMax((fx) >> 16, max)
+ "asr r4, r5, #16 \n\t"
+
+ "lsl r4, r4, #2 \n\t"
+ "add r6, r4, %[image0] \n\t"
+ "vldr.32 d4, [r6] \n\t"
+ "add r6, r4, %[image1] \n\t"
+ "vldr.32 d5, [r6] \n\t"
+
+ //(((fx) >> 12) & 0xF)
+ "lsr r4, r5, #12 \n\t"
+ "and r4, r4, #15 \n\t"
+ "vdup.16 d19, r4 \n\t" // duplicate x into d19
+
+
+ ////////////bilinear interp
+
+ "vmull.u8 q3, d4, d18 \n\t" // q3 = [a01|a00] * (16-y)
+ "vmull.u8 q0, d5, d17 \n\t" // q0 = [a11|a10] * y
+
+ "vsub.u16 d20, d16, d19 \n\t" // d20 = 16-x
+
+ "vmul.i16 d25, d7, d19 \n\t" // d4 = a01 * x
+ "vmla.i16 d25, d1, d19 \n\t" // d4 += a11 * x
+ "vmla.i16 d25, d6, d20 \n\t" // d4 += a00 * (16-x)
+ "vmla.i16 d25, d0, d20 \n\t" // d4 += a10 * (16-x)
+
+ //////////////// end bilinear interp
+
+ "add r5, r5, %[dx] \n\t" //r5 = x += dx
+
+ /////////////////pixel 7////////////////////////////////
+ //x0 = SkClampMax((fx) >> 16, max)
+ "asr r4, r5, #16 \n\t"
+
+ "lsl r4, r4, #2 \n\t"
+ "add r6, r4, %[image0] \n\t"
+ "vldr.32 d4, [r6] \n\t"
+ "add r6, r4, %[image1] \n\t"
+ "vldr.32 d5, [r6] \n\t"
+
+ //(((fx) >> 12) & 0xF)
+ "lsr r4, r5, #12 \n\t"
+ "and r4, r4, #15 \n\t"
+ "vdup.16 d19, r4 \n\t" // duplicate x into d19
+
+
+ ////////////bilinear interp
+
+ "vmull.u8 q3, d4, d18 \n\t" // q3 = [a01|a00] * (16-y)
+ "vmull.u8 q0, d5, d17 \n\t" // q0 = [a11|a10] * y
+
+ "vsub.u16 d20, d16, d19 \n\t" // d20 = 16-x
+
+ "vmul.i16 d27, d7, d19 \n\t" // d4 = a01 * x
+ "vmla.i16 d27, d1, d19 \n\t" // d4 += a11 * x
+ "vmla.i16 d27, d6, d20 \n\t" // d4 += a00 * (16-x)
+ "vmla.i16 d27, d0, d20 \n\t" // d4 += a10 * (16-x)
+
+ //////////////// end bilinear interp
+
+ "add r5, r5, %[dx] \n\t" //r5 = x += dx
+
+ /////////////////pixel 8////////////////////////////////
+ //x0 = SkClampMax((fx) >> 16, max)
+ "asr r4, r5, #16 \n\t"
+
+ "lsl r4, r4, #2 \n\t"
+ "add r6, r4, %[image0] \n\t"
+ "vldr.32 d4, [r6] \n\t"
+ "add r6, r4, %[image1] \n\t"
+ "vldr.32 d5, [r6] \n\t"
+
+ //(((fx) >> 12) & 0xF)
+ "lsr r4, r5, #12 \n\t"
+ "and r4, r4, #15 \n\t"
+ "vdup.16 d19, r4 \n\t" // duplicate x into d19
+
+
+ ////////////bilinear interp
+
+ "vmull.u8 q3, d4, d18 \n\t" // q3 = [a01|a00] * (16-y)
+ "vmull.u8 q0, d5, d17 \n\t" // q0 = [a11|a10] * y
+
+ "vsub.u16 d20, d16, d19 \n\t" // d20 = 16-x
+
+ "vmul.i16 d29, d7, d19 \n\t" // d4 = a01 * x
+ "vmla.i16 d29, d1, d19 \n\t" // d4 += a11 * x
+ "vmla.i16 d29, d6, d20 \n\t" // d4 += a00 * (16-x)
+ "vmla.i16 d29, d0, d20 \n\t" // d4 += a10 * (16-x)
+
+ //////////////// Store results///////////////////
+
+ "vshrn.i16 d0, q11, #8 \n\t" // shift down result by 8
+ "vshrn.i16 d1, q12, #8 \n\t" // shift down result by 8
+ "vshrn.i16 d2, q13, #8 \n\t" // shift down result by 8
+ "vshrn.i16 d3, q14, #8 \n\t" // shift down result by 8
+
+ "vst4.u32 {d0, d1, d2, d3}, [%[colors]]! \n\t" // store result
+
+ //////////////// end bilinear interp
+
+ "sub r3, r3, #8 \n\t" //num -=8
+ "add r5, r5, %[dx] \n\t" //r5 = x += dx
+ "cmp r3, #7 \n\t"
+
+ "bgt 81b \n\t" // branch backward to beginloop8 if r3 > 7
+
+ "82: \n\t" // endloop8:
+ ////////////////end loop in x
+#endif //UNROLL8
+
+
+
+#ifdef UNROLL2
+ "20: \n\t" // initloop2:
+ "cmp r3, #2 \n\t"
+ "blt 10f \n\t" // branch forward to initloop if r3 < 2
+ ///////////////loop2 in x
+ "21: \n\t" // beginloop2:
+
+
+ //x0 = SkClampMax((fx) >> 16, max)
+ "asr r4, r5, #16 \n\t"
+
+ "lsl r4, r4, #2 \n\t"
+ "add r6, r4, %[image0] \n\t"
+ "vldr.32 d4, [r6] \n\t"
+ "add r6, r4, %[image1] \n\t"
+ "vldr.32 d5, [r6] \n\t"
+
+ //(((fx) >> 12) & 0xF)
+ "lsr r4, r5, #12 \n\t"
+ "and r4, r4, #15 \n\t"
+ "vdup.16 d19, r4 \n\t" // duplicate x into d19
+
+
+ ////////////bilinear interp
+
+ "vmull.u8 q3, d4, d18 \n\t" // q3 = [a01|a00] * (16-y)
+ "vmull.u8 q0, d5, d17 \n\t" // q0 = [a11|a10] * y
+
+ "vsub.u16 d20, d16, d19 \n\t" // d20 = 16-x
+
+ "vmul.i16 d22, d7, d19 \n\t" // d4 = a01 * x
+ "vmla.i16 d22, d1, d19 \n\t" // d4 += a11 * x
+ "vmla.i16 d22, d6, d20 \n\t" // d4 += a00 * (16-x)
+ "vmla.i16 d22, d0, d20 \n\t" // d4 += a10 * (16-x)
+
+ //////////////// end bilinear interp
+
+ "add r5, r5, %[dx] \n\t" //r5 = x += dx
+
+ /////////////////second half////////////////////////////////
+ //x0 = SkClampMax((fx) >> 16, max)
+ "asr r4, r5, #16 \n\t"
+
+ "lsl r4, r4, #2 \n\t"
+ "add r6, r4, %[image0] \n\t"
+ "vldr.32 d4, [r6] \n\t"
+ "add r6, r4, %[image1] \n\t"
+ "vldr.32 d5, [r6] \n\t"
+
+ //(((fx) >> 12) & 0xF)
+ "lsr r4, r5, #12 \n\t"
+ "and r4, r4, #15 \n\t"
+ "vdup.16 d19, r4 \n\t" // duplicate x into d19
+
+
+ ////////////bilinear interp
+
+ "vmull.u8 q3, d4, d18 \n\t" // q3 = [a01|a00] * (16-y)
+ "vmull.u8 q0, d5, d17 \n\t" // q0 = [a11|a10] * y
+
+ "vsub.u16 d20, d16, d19 \n\t" // d20 = 16-x
+
+ "vmul.i16 d23, d7, d19 \n\t" // d4 = a01 * x
+ "vmla.i16 d23, d1, d19 \n\t" // d4 += a11 * x
+ "vmla.i16 d23, d6, d20 \n\t" // d4 += a00 * (16-x)
+ "vmla.i16 d23, d0, d20 \n\t" // d4 += a10 * (16-x)
+ "vshrn.i16 d0, q11, #8 \n\t" // shift down result by 8
+
+ "vst1.u32 {d0}, [%[colors]]! \n\t" // store result
+
+ //////////////// end bilinear interp
+
+ "sub r3, r3, #2 \n\t" //num -=2
+ "add r5, r5, %[dx] \n\t" //r5 = x += dx
+ "cmp r3, #1 \n\t"
+
+ "bgt 21b \n\t" // branch backward to beginloop2 if r3 > 1
+
+ "22: \n\t" // endloop2:
+ ////////////////end loop in x
+#endif //UNROLL2
+
+#if defined (UNROLL2) || defined (UNROLL8)
+ "10: \n\t" // initloop:
+ "cmp r3, #0 \n\t"
+ "ble 12f \n\t" // branch forward to endloop if r3 <= 0
+#endif //defined (UNROLL2) || defined (UNROLL8)
+
+ ///////////////loop in x
+ "11: \n\t" // beginloop:
+
+
+ //x0 = SkClampMax((fx) >> 16, max)
+ "asr r4, r5, #16 \n\t"
+
+ "lsl r4, r4, #2 \n\t"
+ "add r6, r4, %[image0] \n\t"
+ "vldr.32 d4, [r6] \n\t"
+ "add r6, r4, %[image1] \n\t"
+ "vldr.32 d5, [r6] \n\t"
+
+ //(((fx) >> 12) & 0xF)
+ "lsr r4, r5, #12 \n\t"
+ "and r4, r4, #15 \n\t"
+ "vdup.16 d19, r4 \n\t" // duplicate x into d19
+
+
+ ////////////bilinear interp
+
+ "vmull.u8 q3, d4, d18 \n\t" // q3 = [a01|a00] * (16-y)
+ "vmull.u8 q0, d5, d17 \n\t" // q0 = [a11|a10] * y
+
+ "vsub.u16 d20, d16, d19 \n\t" // d20 = 16-x
+
+ "vmul.i16 d4, d7, d19 \n\t" // d4 = a01 * x
+ "vmla.i16 d4, d1, d19 \n\t" // d4 += a11 * x
+ "vmla.i16 d4, d6, d20 \n\t" // d4 += a00 * (16-x)
+ "vmla.i16 d4, d0, d20 \n\t" // d4 += a10 * (16-x)
+ "vshrn.i16 d0, q2, #8 \n\t" // shift down result by 8
+
+ "vst1.u32 {d0[0]}, [%[colors]]! \n\t" // store result
+
+ //////////////// end bilinear interp
+
+ "sub r3, r3, #1 \n\t" //num -=1
+ "add r5, r5, %[dx] \n\t" //r5 = x += dx
+ "cmp r3, #0 \n\t"
+ "bgt 11b \n\t" // branch backward to beginloop if r3 > 0
+
+ "12: \n\t" // endloop:
+ ////////////////end loop in x
+ : [colors] "+r" (colors)
+ : [image0] "r" (image0), [image1] "r" (image1), [fx] "r" (fx), [maxX] "r" (maxX), [subY] "r" (subY),
+ [dx] "r" (dx), [count] "r" (count)
+ : "cc", "memory", "r3", "r4", "r5", "r6", "d0", "d1", "d2", "d3", "d4", "d5", "d6", "d7", "d16", "d17", "d18", "d19", "d20", "d21", "d22", "d23", "d24", "d25", "d26", "d27", "d28", "d29"
+ );
+
+
+}
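A note on the constraint section closing the asm statement above: colors is declared "+r" (read-write) because the post-incrementing stores advance it; the other operands are read-only inputs; and every core and NEON register the block scribbles on is named in the clobber list, together with "cc" and "memory", so the compiler neither allocates those registers nor caches memory values across the statement. The same skeleton in miniature (toy example, assuming an ARM32 target):

    static inline int add_flags(int a, int b) {
        int r;
        asm volatile(
            "adds %[r], %[a], %[b] \n\t"      // r = a + b, sets flags
            : [r] "=r" (r)                    // output
            : [a] "r" (a), [b] "r" (b)        // inputs
            : "cc");                          // condition codes clobbered
        return r;
    }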