aboutsummaryrefslogtreecommitdiffstats
diff options
context:
space:
mode:
author: Wei Wang <wangw@codeaurora.org> 2015-01-20 15:50:09 -0800
committer: Ricardo Cerqueira <cyanogenmod@cerqueira.org> 2015-02-15 03:45:25 +0000
commit: bf4617b9502f5293e75515779d2df846f670a3e3 (patch)
tree: f7c55c7e82d943c5001de5533fd4674ef452e651
parent: 64ae5e4e74f0c89e062aabae24d956131c46f2b3 (diff)
download: android_external_skia-stable/cm-12.0-YNG1T.tar.gz
          android_external_skia-stable/cm-12.0-YNG1T.tar.bz2
          android_external_skia-stable/cm-12.0-YNG1T.zip
Revert "NEON optimized blitter S32_Opaque_D32_filter_DX" [refs: stable/cm-12.0-YNG1TA, stable/cm-12.0-YNG1T, stable/cm-12.0-YNG1I]
This reverts commit cac2cc4d8af401b22b42d0dbec8f52f6e43516d1.

Change-Id: I83b9c83fa9363f27675f97cc13ce79d092a43089
-rw-r--r--  Android.mk  4
-rw-r--r--  src/core/SkBitmapProcState.cpp  9
-rw-r--r--  src/core/SkBitmapProcState_procs.h  18
-rw-r--r--  src/core/SkBitmapProcState_shaderproc.h  114
-rw-r--r--  src/opts/ext/S32_Opaque_D32_filter_DX_shaderproc_neon.cpp  476
5 files changed, 3 insertions, 618 deletions
diff --git a/Android.mk b/Android.mk
index 87daedff0e..cb19f6e3fd 100644
--- a/Android.mk
+++ b/Android.mk
@@ -599,7 +599,6 @@ LOCAL_SRC_FILES_arm += \
src/opts/SkUtils_opts_arm.cpp \
src/opts/SkXfermode_opts_arm.cpp
-
ifeq ($(ARCH_ARM_HAVE_NEON), true)
LOCAL_SRC_FILES_arm += \
src/opts/memset16_neon.S \
@@ -610,8 +609,7 @@ LOCAL_SRC_FILES_arm += \
src/opts/SkBlitRow_opts_arm_neon.cpp \
src/opts/SkBlurImage_opts_neon.cpp \
src/opts/SkMorphology_opts_neon.cpp \
- src/opts/SkXfermode_opts_arm_neon.cpp \
- src/opts/ext/S32_Opaque_D32_filter_DX_shaderproc_neon.cpp
+ src/opts/SkXfermode_opts_arm_neon.cpp
LOCAL_CFLAGS_arm += \
-D__ARM_HAVE_NEON
diff --git a/src/core/SkBitmapProcState.cpp b/src/core/SkBitmapProcState.cpp
index 751c2c1753..3605910309 100644
--- a/src/core/SkBitmapProcState.cpp
+++ b/src/core/SkBitmapProcState.cpp
@@ -1,3 +1,4 @@
+
/*
* Copyright 2011 Google Inc.
*
@@ -26,10 +27,6 @@ extern void Repeat_S16_D16_filter_DX_shaderproc_neon(const SkBitmapProcState&,
extern void SI8_opaque_D32_filter_DX_neon(const SkBitmapProcState&, const uint32_t*, int, SkPMColor*);
extern void SI8_opaque_D32_filter_DX_shaderproc_neon(const SkBitmapProcState&, int, int, uint32_t*, int);
extern void Clamp_SI8_opaque_D32_filter_DX_shaderproc_neon(const SkBitmapProcState&, int, int, uint32_t*, int);
-#if !defined(__LP64__)
-extern void S32_opaque_D32_filter_DX_neon(const SkBitmapProcState&, const uint32_t*, int, SkPMColor*);
-extern void Clamp_S32_Opaque_D32_filter_DX_shaderproc_neon(const SkBitmapProcState&, int, int, uint32_t*, int);
-#endif //#if !defined(__LP64__)
#endif
#define NAME_WRAP(x) x
@@ -629,10 +626,6 @@ bool SkBitmapProcState::chooseProcs(const SkMatrix& inv, const SkPaint& paint) {
}
} else if (SK_ARM_NEON_WRAP(SI8_opaque_D32_filter_DX) == fSampleProc32 && clampClamp) {
fShaderProc32 = SK_ARM_NEON_WRAP(Clamp_SI8_opaque_D32_filter_DX_shaderproc);
-#if !defined(__LP64__)
- } else if (SK_ARM_NEON_WRAP(S32_opaque_D32_filter_DX) == fSampleProc32 && clampClamp) {
- fShaderProc32 = SK_ARM_NEON_WRAP(Clamp_S32_Opaque_D32_filter_DX_shaderproc);
-#endif //#if !defined(__LP64__)
}
if (NULL == fShaderProc32) {
diff --git a/src/core/SkBitmapProcState_procs.h b/src/core/SkBitmapProcState_procs.h
index 9a44346c92..0d3b723e67 100644
--- a/src/core/SkBitmapProcState_procs.h
+++ b/src/core/SkBitmapProcState_procs.h
@@ -340,22 +340,4 @@ static inline U8CPU Filter_8(unsigned x, unsigned y,
#define POSTAMBLE(state) state.fBitmap->getColorTable()->unlockColors()
#include "SkBitmapProcState_shaderproc.h"
-#if !defined(__LP64__)
-#define TILEX_PROCF(fx, max) SkClampMax((fx) >> 16, max)
-#define TILEY_PROCF(fy, max) SkClampMax((fy) >> 16, max)
-#define TILEX_LOW_BITS(fx, max) (((fx) >> 12) & 0xF)
-#define TILEY_LOW_BITS(fy, max) (((fy) >> 12) & 0xF)
-
-#undef FILTER_PROC
-#define FILTER_PROC(x, y, a, b, c, d, dst) NAME_WRAP(Filter_32_opaque)(x, y, a, b, c, d, dst)
-#define MAKENAME(suffix) NAME_WRAP(Clamp_S32_Opaque_D32 ## suffix)
-#define SRCTYPE uint32_t
-#define DSTTYPE uint32_t
-#define CHECKSTATE(state) SkASSERT(state.fBitmap->config() == SkBitmap::kARGB_8888_Config)
-#define SRC_TO_FILTER(src) src
-#define S32_OPAQUE_D32_FILTER_DX_NEON (!SK_ARM_NEON_IS_NONE)
-#include "SkBitmapProcState_shaderproc.h"
-#undef S32_OPAQUE_D32_FILTER_DX_NEON
-#endif //#if !defined(__LP64__)
-
#undef NAME_WRAP
diff --git a/src/core/SkBitmapProcState_shaderproc.h b/src/core/SkBitmapProcState_shaderproc.h
index 877b8cfbd0..0014b4a526 100644
--- a/src/core/SkBitmapProcState_shaderproc.h
+++ b/src/core/SkBitmapProcState_shaderproc.h
@@ -1,3 +1,4 @@
+
/*
* Copyright 2011 Google Inc.
*
@@ -7,13 +8,6 @@
#include "SkMathPriv.h"
-#if S32_OPAQUE_D32_FILTER_DX_NEON
-void S32_Opaque_D32_filter_DX_shaderproc_neon(const unsigned int* image0, const unsigned int* image1,
- SkFixed fx, unsigned int maxX, unsigned int subY,
- unsigned int* colors,
- SkFixed dx, int count);
-#endif
-
#define SCALE_FILTER_NAME MAKENAME(_filter_DX_shaderproc)
// Can't be static in the general case because some of these implementations
@@ -61,110 +55,6 @@ void SCALE_FILTER_NAME(const SkBitmapProcState& s, int x, int y,
PREAMBLE(s);
#endif
-#if S32_OPAQUE_D32_FILTER_DX_NEON
- int post_count;
- SkFixed post_fx;
- DSTTYPE* SK_RESTRICT post_colors;
- int num;
- post_count = count;
- post_fx = fx;
- post_colors = colors;
-
-
- if (dx>=0)
- {
- int end = ((int)maxX-1)<<16;
- num = dx?((end-fx)/dx):0;
- if (num < 0) num = 0;
-
- if (num<count)
- {
- count = num;
- post_count = post_count - count;
- post_fx = fx + count*dx;
- post_colors = post_colors + count;
- }
- else
- post_count = 0;
-
- while (fx<0 && count) {
- unsigned subX = TILEX_LOW_BITS(fx, maxX);
- unsigned x0 = TILEX_PROCF(fx, maxX);
- unsigned x1 = TILEX_PROCF((fx + oneX), maxX);
-
- FILTER_PROC(subX, subY,
- SRC_TO_FILTER(row0[x0]),
- SRC_TO_FILTER(row0[x1]),
- SRC_TO_FILTER(row1[x0]),
- SRC_TO_FILTER(row1[x1]),
- colors);
- colors += 1;
-
- fx += dx;
- count--;
- }
- }
- else
- {
- int end = 0;
- int maxXFix = ((int)maxX-1)<<16;
- num = (end-fx)/dx;
- if (num < 0) num = 0;
-
-
- if (num<count)
- {
- count = num;
- post_count = post_count - count;
- post_fx = fx + count*dx;
- post_colors = post_colors + count;
- }
- else
- post_count = 0;
-
- while (fx>=maxXFix && count) {
- unsigned subX = TILEX_LOW_BITS(fx, maxX);
- unsigned x0 = TILEX_PROCF(fx, maxX);
- unsigned x1 = TILEX_PROCF((fx + oneX), maxX);
-
- FILTER_PROC(subX, subY,
- SRC_TO_FILTER(row0[x0]),
- SRC_TO_FILTER(row0[x1]),
- SRC_TO_FILTER(row1[x0]),
- SRC_TO_FILTER(row1[x1]),
- colors);
- colors += 1;
-
- fx += dx;
- count--;
- }
-
- }
-
- S32_Opaque_D32_filter_DX_shaderproc_neon(row0, row1, fx, maxX, subY, colors, dx, count);
-
- fx = post_fx;
- colors = post_colors;
- while (post_count) {
- unsigned subX = TILEX_LOW_BITS(fx, maxX);
- unsigned x0 = TILEX_PROCF(fx, maxX);
- unsigned x1 = TILEX_PROCF((fx + oneX), maxX);
-
- FILTER_PROC(subX, subY,
- SRC_TO_FILTER(row0[x0]),
- SRC_TO_FILTER(row0[x1]),
- SRC_TO_FILTER(row1[x0]),
- SRC_TO_FILTER(row1[x1]),
- colors);
- colors += 1;
-
- fx += dx;
- post_count--;
- }
-
-
-#else //S32_OPAQUE_D32_FILTER_DX_NEON
-
do {
unsigned subX = TILEX_LOW_BITS(fx, maxX);
unsigned x0 = TILEX_PROCF(fx, maxX);
@@ -181,8 +71,6 @@ void SCALE_FILTER_NAME(const SkBitmapProcState& s, int x, int y,
fx += dx;
} while (--count != 0);
-#endif //S32_OPAQUE_D32_FILTER_DX_NEON
-
#ifdef POSTAMBLE
POSTAMBLE(s);
#endif
diff --git a/src/opts/ext/S32_Opaque_D32_filter_DX_shaderproc_neon.cpp b/src/opts/ext/S32_Opaque_D32_filter_DX_shaderproc_neon.cpp
deleted file mode 100644
index 909fd4354d..0000000000
--- a/src/opts/ext/S32_Opaque_D32_filter_DX_shaderproc_neon.cpp
+++ /dev/null
@@ -1,476 +0,0 @@
-/*
- * Copyright (c) 2010,2013, The Linux Foundation. All rights reserved.
- * *
- * Redistribution and use in source and binary forms, with or without
- * modification, are permitted provided that the following conditions are
- * met:
- * * Redistributions of source code must retain the above copyright
- * notice, this list of conditions and the following disclaimer.
- * * Redistributions in binary form must reproduce the above
- * copyright notice, this list of conditions and the following
- * disclaimer in the documentation and/or other materials provided
- * with the distribution.
- * * Neither the name of The Linux Foundation nor the names of its
- * contributors may be used to endorse or promote products derived
- * from this software without specific prior written permission.
- * *
- * THIS SOFTWARE IS PROVIDED "AS IS" AND ANY EXPRESS OR IMPLIED
- * WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF
- * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NON-INFRINGEMENT
- * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS
- * BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
- * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
- * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR
- * BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY,
- * WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE
- * OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN
- * IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
- * */
-
-#include "SkFixed.h"
-#include "SkUtilsArm.h"
-
-void S32_Opaque_D32_filter_DX_shaderproc_neon(const unsigned int* image0, const unsigned int* image1,
- SkFixed fx, unsigned int maxX, unsigned int subY,
- unsigned int* colors,
- SkFixed dx, int count) __attribute__ ((optimize ("0"))) ;
-void S32_Opaque_D32_filter_DX_shaderproc_neon(const unsigned int* image0, const unsigned int* image1,
- SkFixed fx, unsigned int maxX, unsigned int subY,
- unsigned int* colors,
- SkFixed dx, int count) {
-
- asm volatile(
- "mov r3, %[count] \n\t" //r3 = count
-
- "mov r5, %[fx] \n\t" //r5 = x = fx
- "cmp r3, #0 \n\t"
- "beq 12f \n\t" // branch forward to endloop if r3 == 0
-
- "vdup.8 d17, %[subY] \n\t" // duplicate y into d17
- "vmov.u8 d16, #16 \n\t" // set up constant in d16
- "vsub.u8 d18, d16, d17 \n\t" // d18 = 16-y
-
- "vmov.u16 d16, #16 \n\t" // set up constant in d16,int 16bit
-
-#define UNROLL8
-#define UNROLL2
-#ifdef UNROLL8
- "cmp r3, #8 \n\t"
- "blt 20f \n\t" // branch forward to initloop2 if r3 < 8
- ///////////////loop2 in x
- "81: \n\t" // beginloop8:
-
- /////////////////pixel 1////////////////////////////////////
- //x0 = SkClampMax((fx) >> 16, max)
- "asr r4, r5, #16 \n\t"
-
- "lsl r4, r4, #2 \n\t"
- "add r6, r4, %[image0] \n\t"
- "vldr.32 d4, [r6] \n\t"
- "add r6, r4, %[image1] \n\t"
- "vldr.32 d5, [r6] \n\t"
-
- //(((fx) >> 12) & 0xF)
- "lsr r4, r5, #12 \n\t"
- "and r4, r4, #15 \n\t"
- "vdup.16 d19, r4 \n\t" // duplicate x into d19
-
-
- ////////////bilinear interp
-
- "vmull.u8 q3, d4, d18 \n\t" // q3 = [a01|a00] * (16-y)
- "vmull.u8 q0, d5, d17 \n\t" // q0 = [a11|a10] * y
-
- "vsub.u16 d20, d16, d19 \n\t" // d20 = 16-x
-
- "vmul.i16 d22, d7, d19 \n\t" // d4 = a01 * x
- "vmla.i16 d22, d1, d19 \n\t" // d4 += a11 * x
- "vmla.i16 d22, d6, d20 \n\t" // d4 += a00 * (16-x)
- "vmla.i16 d22, d0, d20 \n\t" // d4 += a10 * (16-x)
-
- //////////////// end bilinear interp
-
- "add r5, r5, %[dx] \n\t" //r5 = x += dx
-
- /////////////////pixel 2////////////////////////////////////
- //x0 = SkClampMax((fx) >> 16, max)
- "asr r4, r5, #16 \n\t"
-
- "lsl r4, r4, #2 \n\t"
- "add r6, r4, %[image0] \n\t"
- "vldr.32 d4, [r6] \n\t"
- "add r6, r4, %[image1] \n\t"
- "vldr.32 d5, [r6] \n\t"
-
- //(((fx) >> 12) & 0xF)
- "lsr r4, r5, #12 \n\t"
- "and r4, r4, #15 \n\t"
- "vdup.16 d19, r4 \n\t" // duplicate x into d19
-
-
- ////////////bilinear interp
-
- "vmull.u8 q3, d4, d18 \n\t" // q3 = [a01|a00] * (16-y)
- "vmull.u8 q0, d5, d17 \n\t" // q0 = [a11|a10] * y
-
- "vsub.u16 d20, d16, d19 \n\t" // d20 = 16-x
-
- "vmul.i16 d24, d7, d19 \n\t" // d4 = a01 * x
- "vmla.i16 d24, d1, d19 \n\t" // d4 += a11 * x
- "vmla.i16 d24, d6, d20 \n\t" // d4 += a00 * (16-x)
- "vmla.i16 d24, d0, d20 \n\t" // d4 += a10 * (16-x)
-
- //////////////// end bilinear interp
-
- "add r5, r5, %[dx] \n\t" //r5 = x += dx
-
- /////////////////pixel 3////////////////////////////////
- //x0 = SkClampMax((fx) >> 16, max)
- "asr r4, r5, #16 \n\t"
-
- "lsl r4, r4, #2 \n\t"
- "add r6, r4, %[image0] \n\t"
- "vldr.32 d4, [r6] \n\t"
- "add r6, r4, %[image1] \n\t"
- "vldr.32 d5, [r6] \n\t"
-
- //(((fx) >> 12) & 0xF)
- "lsr r4, r5, #12 \n\t"
- "and r4, r4, #15 \n\t"
- "vdup.16 d19, r4 \n\t" // duplicate x into d19
-
-
- ////////////bilinear interp
-
- "vmull.u8 q3, d4, d18 \n\t" // q3 = [a01|a00] * (16-y)
- "vmull.u8 q0, d5, d17 \n\t" // q0 = [a11|a10] * y
-
- "vsub.u16 d20, d16, d19 \n\t" // d20 = 16-x
-
- "vmul.i16 d26, d7, d19 \n\t" // d4 = a01 * x
- "vmla.i16 d26, d1, d19 \n\t" // d4 += a11 * x
- "vmla.i16 d26, d6, d20 \n\t" // d4 += a00 * (16-x)
- "vmla.i16 d26, d0, d20 \n\t" // d4 += a10 * (16-x)
-
- //////////////// end bilinear interp
-
- "add r5, r5, %[dx] \n\t" //r5 = x += dx
-
- /////////////////pixel 4////////////////////////////////
- //x0 = SkClampMax((fx) >> 16, max)
- "asr r4, r5, #16 \n\t"
-
- "lsl r4, r4, #2 \n\t"
- "add r6, r4, %[image0] \n\t"
- "vldr.32 d4, [r6] \n\t"
- "add r6, r4, %[image1] \n\t"
- "vldr.32 d5, [r6] \n\t"
-
- //(((fx) >> 12) & 0xF)
- "lsr r4, r5, #12 \n\t"
- "and r4, r4, #15 \n\t"
- "vdup.16 d19, r4 \n\t" // duplicate x into d19
-
-
- ////////////bilinear interp
-
- "vmull.u8 q3, d4, d18 \n\t" // q3 = [a01|a00] * (16-y)
- "vmull.u8 q0, d5, d17 \n\t" // q0 = [a11|a10] * y
-
- "vsub.u16 d20, d16, d19 \n\t" // d20 = 16-x
-
- "vmul.i16 d28, d7, d19 \n\t" // d4 = a01 * x
- "vmla.i16 d28, d1, d19 \n\t" // d4 += a11 * x
- "vmla.i16 d28, d6, d20 \n\t" // d4 += a00 * (16-x)
- "vmla.i16 d28, d0, d20 \n\t" // d4 += a10 * (16-x)
-
- //////////////// end bilinear interp
-
- "add r5, r5, %[dx] \n\t" //r5 = x += dx
-
- /////////////////pixel 5////////////////////////////////////
- //x0 = SkClampMax((fx) >> 16, max)
- "asr r4, r5, #16 \n\t"
-
- "lsl r4, r4, #2 \n\t"
- "add r6, r4, %[image0] \n\t"
- "vldr.32 d4, [r6] \n\t"
- "add r6, r4, %[image1] \n\t"
- "vldr.32 d5, [r6] \n\t"
-
- //(((fx) >> 12) & 0xF)
- "lsr r4, r5, #12 \n\t"
- "and r4, r4, #15 \n\t"
- "vdup.16 d19, r4 \n\t" // duplicate x into d19
-
-
- ////////////bilinear interp
-
- "vmull.u8 q3, d4, d18 \n\t" // q3 = [a01|a00] * (16-y)
- "vmull.u8 q0, d5, d17 \n\t" // q0 = [a11|a10] * y
-
- "vsub.u16 d20, d16, d19 \n\t" // d20 = 16-x
-
- "vmul.i16 d23, d7, d19 \n\t" // d4 = a01 * x
- "vmla.i16 d23, d1, d19 \n\t" // d4 += a11 * x
- "vmla.i16 d23, d6, d20 \n\t" // d4 += a00 * (16-x)
- "vmla.i16 d23, d0, d20 \n\t" // d4 += a10 * (16-x)
-
- //////////////// end bilinear interp
-
- "add r5, r5, %[dx] \n\t" //r5 = x += dx
-
- /////////////////pixel 6////////////////////////////////////
- //x0 = SkClampMax((fx) >> 16, max)
- "asr r4, r5, #16 \n\t"
-
- "lsl r4, r4, #2 \n\t"
- "add r6, r4, %[image0] \n\t"
- "vldr.32 d4, [r6] \n\t"
- "add r6, r4, %[image1] \n\t"
- "vldr.32 d5, [r6] \n\t"
-
- //(((fx) >> 12) & 0xF)
- "lsr r4, r5, #12 \n\t"
- "and r4, r4, #15 \n\t"
- "vdup.16 d19, r4 \n\t" // duplicate x into d19
-
-
- ////////////bilinear interp
-
- "vmull.u8 q3, d4, d18 \n\t" // q3 = [a01|a00] * (16-y)
- "vmull.u8 q0, d5, d17 \n\t" // q0 = [a11|a10] * y
-
- "vsub.u16 d20, d16, d19 \n\t" // d20 = 16-x
-
- "vmul.i16 d25, d7, d19 \n\t" // d4 = a01 * x
- "vmla.i16 d25, d1, d19 \n\t" // d4 += a11 * x
- "vmla.i16 d25, d6, d20 \n\t" // d4 += a00 * (16-x)
- "vmla.i16 d25, d0, d20 \n\t" // d4 += a10 * (16-x)
-
- //////////////// end bilinear interp
-
- "add r5, r5, %[dx] \n\t" //r5 = x += dx
-
- /////////////////pixel 7////////////////////////////////
- //x0 = SkClampMax((fx) >> 16, max)
- "asr r4, r5, #16 \n\t"
-
- "lsl r4, r4, #2 \n\t"
- "add r6, r4, %[image0] \n\t"
- "vldr.32 d4, [r6] \n\t"
- "add r6, r4, %[image1] \n\t"
- "vldr.32 d5, [r6] \n\t"
-
- //(((fx) >> 12) & 0xF)
- "lsr r4, r5, #12 \n\t"
- "and r4, r4, #15 \n\t"
- "vdup.16 d19, r4 \n\t" // duplicate x into d19
-
-
- ////////////bilinear interp
-
- "vmull.u8 q3, d4, d18 \n\t" // q3 = [a01|a00] * (16-y)
- "vmull.u8 q0, d5, d17 \n\t" // q0 = [a11|a10] * y
-
- "vsub.u16 d20, d16, d19 \n\t" // d20 = 16-x
-
- "vmul.i16 d27, d7, d19 \n\t" // d4 = a01 * x
- "vmla.i16 d27, d1, d19 \n\t" // d4 += a11 * x
- "vmla.i16 d27, d6, d20 \n\t" // d4 += a00 * (16-x)
- "vmla.i16 d27, d0, d20 \n\t" // d4 += a10 * (16-x)
-
- //////////////// end bilinear interp
-
- "add r5, r5, %[dx] \n\t" //r5 = x += dx
-
- /////////////////pixel 8////////////////////////////////
- //x0 = SkClampMax((fx) >> 16, max)
- "asr r4, r5, #16 \n\t"
-
- "lsl r4, r4, #2 \n\t"
- "add r6, r4, %[image0] \n\t"
- "vldr.32 d4, [r6] \n\t"
- "add r6, r4, %[image1] \n\t"
- "vldr.32 d5, [r6] \n\t"
-
- //(((fx) >> 12) & 0xF)
- "lsr r4, r5, #12 \n\t"
- "and r4, r4, #15 \n\t"
- "vdup.16 d19, r4 \n\t" // duplicate x into d19
-
-
- ////////////bilinear interp
-
- "vmull.u8 q3, d4, d18 \n\t" // q3 = [a01|a00] * (16-y)
- "vmull.u8 q0, d5, d17 \n\t" // q0 = [a11|a10] * y
-
- "vsub.u16 d20, d16, d19 \n\t" // d20 = 16-x
-
- "vmul.i16 d29, d7, d19 \n\t" // d4 = a01 * x
- "vmla.i16 d29, d1, d19 \n\t" // d4 += a11 * x
- "vmla.i16 d29, d6, d20 \n\t" // d4 += a00 * (16-x)
- "vmla.i16 d29, d0, d20 \n\t" // d4 += a10 * (16-x)
-
- //////////////// Store results///////////////////
-
- "vshrn.i16 d0, q11, #8 \n\t" // shift down result by 8
- "vshrn.i16 d1, q12, #8 \n\t" // shift down result by 8
- "vshrn.i16 d2, q13, #8 \n\t" // shift down result by 8
- "vshrn.i16 d3, q14, #8 \n\t" // shift down result by 8
-
- "vst4.u32 {d0, d1, d2, d3}, [%[colors]]! \n\t" // store result
-
- //////////////// end bilinear interp
-
- "sub r3, r3, #8 \n\t" //num -=8
- "add r5, r5, %[dx] \n\t" //r5 = x += dx
- "cmp r3, #7 \n\t"
-
- "bgt 81b \n\t" // branch backward to beginloop8 if r3 > 7
-
- "82: \n\t" // endloop8:
- ////////////////end loop in x
-#endif //UNROLL8
-
-
-
-#ifdef UNROLL2
- "20: \n\t" // initloop2:
- "cmp r3, #2 \n\t"
- "blt 10f \n\t" // branch forward to initloop if r3 < 2
- ///////////////loop2 in x
- "21: \n\t" // beginloop2:
-
-
- //x0 = SkClampMax((fx) >> 16, max)
- "asr r4, r5, #16 \n\t"
-
- "lsl r4, r4, #2 \n\t"
- "add r6, r4, %[image0] \n\t"
- "vldr.32 d4, [r6] \n\t"
- "add r6, r4, %[image1] \n\t"
- "vldr.32 d5, [r6] \n\t"
-
- //(((fx) >> 12) & 0xF)
- "lsr r4, r5, #12 \n\t"
- "and r4, r4, #15 \n\t"
- "vdup.16 d19, r4 \n\t" // duplicate x into d19
-
-
- ////////////bilinear interp
-
- "vmull.u8 q3, d4, d18 \n\t" // q3 = [a01|a00] * (16-y)
- "vmull.u8 q0, d5, d17 \n\t" // q0 = [a11|a10] * y
-
- "vsub.u16 d20, d16, d19 \n\t" // d20 = 16-x
-
- "vmul.i16 d22, d7, d19 \n\t" // d4 = a01 * x
- "vmla.i16 d22, d1, d19 \n\t" // d4 += a11 * x
- "vmla.i16 d22, d6, d20 \n\t" // d4 += a00 * (16-x)
- "vmla.i16 d22, d0, d20 \n\t" // d4 += a10 * (16-x)
-
- //////////////// end bilinear interp
-
- "add r5, r5, %[dx] \n\t" //r5 = x += dx
-
- /////////////////second half////////////////////////////////
- //x0 = SkClampMax((fx) >> 16, max)
- "asr r4, r5, #16 \n\t"
-
- "lsl r4, r4, #2 \n\t"
- "add r6, r4, %[image0] \n\t"
- "vldr.32 d4, [r6] \n\t"
- "add r6, r4, %[image1] \n\t"
- "vldr.32 d5, [r6] \n\t"
-
- //(((fx) >> 12) & 0xF)
- "lsr r4, r5, #12 \n\t"
- "and r4, r4, #15 \n\t"
- "vdup.16 d19, r4 \n\t" // duplicate x into d19
-
-
- ////////////bilinear interp
-
- "vmull.u8 q3, d4, d18 \n\t" // q3 = [a01|a00] * (16-y)
- "vmull.u8 q0, d5, d17 \n\t" // q0 = [a11|a10] * y
-
- "vsub.u16 d20, d16, d19 \n\t" // d20 = 16-x
-
- "vmul.i16 d23, d7, d19 \n\t" // d4 = a01 * x
- "vmla.i16 d23, d1, d19 \n\t" // d4 += a11 * x
- "vmla.i16 d23, d6, d20 \n\t" // d4 += a00 * (16-x)
- "vmla.i16 d23, d0, d20 \n\t" // d4 += a10 * (16-x)
- "vshrn.i16 d0, q11, #8 \n\t" // shift down result by 8
-
- "vst1.u32 {d0}, [%[colors]]! \n\t" // store result
-
- //////////////// end bilinear interp
-
- "sub r3, r3, #2 \n\t" //num -=2
- "add r5, r5, %[dx] \n\t" //r5 = x += dx
- "cmp r3, #1 \n\t"
-
- "bgt 21b \n\t" // branch backward to beginloop2 if r3 > 1
-
- "22: \n\t" // endloop2:
- ////////////////end loop in x
-#endif //UNROLL2
-
-#if defined (UNROLL2) || defined (UNROLL8)
- "10: \n\t" // initloop:
- "cmp r3, #0 \n\t"
- "ble 12f \n\t" // branch forward to endloop if r3 <= 0
-#endif //defined (UNROLL2) || defined (UNROLL8)
-
- ///////////////loop in x
- "11: \n\t" // beginloop:
-
-
- //x0 = SkClampMax((fx) >> 16, max)
- "asr r4, r5, #16 \n\t"
-
- "lsl r4, r4, #2 \n\t"
- "add r6, r4, %[image0] \n\t"
- "vldr.32 d4, [r6] \n\t"
- "add r6, r4, %[image1] \n\t"
- "vldr.32 d5, [r6] \n\t"
-
- //(((fx) >> 12) & 0xF)
- "lsr r4, r5, #12 \n\t"
- "and r4, r4, #15 \n\t"
- "vdup.16 d19, r4 \n\t" // duplicate x into d19
-
-
- ////////////bilinear interp
-
- "vmull.u8 q3, d4, d18 \n\t" // q3 = [a01|a00] * (16-y)
- "vmull.u8 q0, d5, d17 \n\t" // q0 = [a11|a10] * y
-
- "vsub.u16 d20, d16, d19 \n\t" // d20 = 16-x
-
- "vmul.i16 d4, d7, d19 \n\t" // d4 = a01 * x
- "vmla.i16 d4, d1, d19 \n\t" // d4 += a11 * x
- "vmla.i16 d4, d6, d20 \n\t" // d4 += a00 * (16-x)
- "vmla.i16 d4, d0, d20 \n\t" // d4 += a10 * (16-x)
- "vshrn.i16 d0, q2, #8 \n\t" // shift down result by 8
-
- "vst1.u32 {d0[0]}, [%[colors]]! \n\t" // store result
-
- //////////////// end bilinear interp
-
- "sub r3, r3, #1 \n\t" //num -=1
- "add r5, r5, %[dx] \n\t" //r5 = x += dx
- "cmp r3, #0 \n\t"
- "bgt 11b \n\t" // branch backward to beginloop if r3 > 0
-
- "12: \n\t" // endloop:
- ////////////////end loop in x
- : [colors] "+r" (colors)
- : [image0] "r" (image0), [image1] "r" (image1), [fx] "r" (fx), [maxX] "r" (maxX), [subY] "r" (subY),
- [dx] "r" (dx), [count] "r" (count)
- : "cc", "memory", "r3", "r4", "r5", "r6", "d0", "d1", "d2", "d3", "d4", "d5", "d6", "d7", "d16", "d17", "d18", "d19", "d20", "d21", "d22", "d23", "d24", "d25", "d26", "d27", "d28", "d29"
- );
-
-
-}