summaryrefslogtreecommitdiffstats
path: root/cpu_ref
diff options
context:
space:
mode:
author Ricardo Cerqueira <ricardo@cyngn.com> 2015-03-10 12:15:08 +0000
committer Ricardo Cerqueira <ricardo@cyngn.com> 2015-03-10 12:15:08 +0000
commit 4d9245fd508125ca177794b7eb740dbcf35a1366 (patch)
tree 016c49a6d577cdfe83fe317bdf30274a43923199 /cpu_ref
parent b0a4a7f403287d16ade9451722a50b4cb00723f1 (diff)
parent ad410d91de8f32e02f824c40b58db638dcafc1b4 (diff)
downloadandroid_frameworks_rs-cm-12.1.tar.gz
android_frameworks_rs-cm-12.1.tar.bz2
android_frameworks_rs-cm-12.1.zip
Android 5.1.0 release 1
Diffstat (limited to 'cpu_ref')
-rw-r--r-- cpu_ref/Android.mk 4
-rw-r--r-- cpu_ref/rsCpuCore.cpp 2
-rw-r--r-- cpu_ref/rsCpuIntrinsicBlur.cpp 6
-rw-r--r-- cpu_ref/rsCpuIntrinsicColorMatrix.cpp 20
-rw-r--r-- cpu_ref/rsCpuIntrinsicConvolve3x3.cpp 6
-rw-r--r-- cpu_ref/rsCpuIntrinsicConvolve5x5.cpp 6
-rw-r--r-- cpu_ref/rsCpuIntrinsicResize.cpp 30
-rw-r--r-- cpu_ref/rsCpuIntrinsicYuvToRGB.cpp 4
-rw-r--r-- cpu_ref/rsCpuIntrinsics_advsimd_Blur.S 141
-rw-r--r-- cpu_ref/rsCpuIntrinsics_advsimd_YuvToRGB.S 293
-rw-r--r-- cpu_ref/rsCpuIntrinsics_neon_Blur.S 130
-rw-r--r-- cpu_ref/rsCpuScript.cpp 5
12 files changed, 347 insertions, 300 deletions
diff --git a/cpu_ref/Android.mk b/cpu_ref/Android.mk
index bd276bf2..f041ad96 100644
--- a/cpu_ref/Android.mk
+++ b/cpu_ref/Android.mk
@@ -42,6 +42,10 @@ LOCAL_SRC_FILES:= \
LOCAL_CFLAGS_arm64 += -DARCH_ARM_USE_INTRINSICS -DARCH_ARM64_USE_INTRINSICS -DARCH_ARM64_HAVE_NEON
+ifeq ($(RS_DISABLE_A53_WORKAROUND),true)
+LOCAL_CFLAGS_arm64 += -DDISABLE_A53_WORKAROUND
+endif
+
LOCAL_SRC_FILES_arm64 += \
rsCpuIntrinsics_advsimd_3DLUT.S \
rsCpuIntrinsics_advsimd_Convolve.S \
diff --git a/cpu_ref/rsCpuCore.cpp b/cpu_ref/rsCpuCore.cpp
index f09e3342..4285dae5 100644
--- a/cpu_ref/rsCpuCore.cpp
+++ b/cpu_ref/rsCpuCore.cpp
@@ -266,7 +266,7 @@ bool RsdCpuReferenceImpl::init(uint32_t version_major, uint32_t version_minor,
GetCpuInfo();
- int cpu = sysconf(_SC_NPROCESSORS_ONLN);
+ int cpu = sysconf(_SC_NPROCESSORS_CONF);
if(mRSC->props.mDebugMaxThreads) {
cpu = mRSC->props.mDebugMaxThreads;
}
diff --git a/cpu_ref/rsCpuIntrinsicBlur.cpp b/cpu_ref/rsCpuIntrinsicBlur.cpp
index 123cc9f6..9dccd80d 100644
--- a/cpu_ref/rsCpuIntrinsicBlur.cpp
+++ b/cpu_ref/rsCpuIntrinsicBlur.cpp
@@ -297,7 +297,7 @@ void RsdCpuScriptIntrinsicBlur::kernelU4(const RsForEachStubParamStruct *p,
uint32_t x2 = xend;
#if defined(ARCH_ARM_USE_INTRINSICS)
- if (gArchUseSIMD && !xstart && (xend == p->dimX)) {
+ if (gArchUseSIMD) {
rsdIntrinsicBlurU4_K(out, (uchar4 const *)(pin + stride * p->y), p->dimX, p->dimY,
stride, x1, p->y, x2 - x1, cp->mIradius, cp->mIp + cp->mIradius);
return;
@@ -367,9 +367,9 @@ void RsdCpuScriptIntrinsicBlur::kernelU1(const RsForEachStubParamStruct *p,
uint32_t x2 = xend;
#if defined(ARCH_ARM_USE_INTRINSICS)
- if (gArchUseSIMD && !xstart && (xend == p->dimX)) {
+ if (gArchUseSIMD) {
rsdIntrinsicBlurU1_K(out, pin + stride * p->y, p->dimX, p->dimY,
- stride, 0, p->y, p->dimX, cp->mIradius, cp->mIp + cp->mIradius);
+ stride, x1, p->y, x2 - x1, cp->mIradius, cp->mIp + cp->mIradius);
return;
}
#endif
diff --git a/cpu_ref/rsCpuIntrinsicColorMatrix.cpp b/cpu_ref/rsCpuIntrinsicColorMatrix.cpp
index 8c852778..6a7808e7 100644
--- a/cpu_ref/rsCpuIntrinsicColorMatrix.cpp
+++ b/cpu_ref/rsCpuIntrinsicColorMatrix.cpp
@@ -126,7 +126,7 @@ typedef union {
} Key_t;
//Re-enable when intrinsic is fixed
-#if 0 && defined(ARCH_ARM64_USE_INTRINSICS)
+#if defined(ARCH_ARM64_USE_INTRINSICS)
typedef struct {
void (*column[4])(void);
void (*store)(void);
@@ -184,7 +184,7 @@ protected:
int ipa[4];
float tmpFp[16];
float tmpFpa[4];
-#if 0 && defined(ARCH_ARM64_USE_INTRINSICS)
+#if defined(ARCH_ARM64_USE_INTRINSICS)
FunctionTab_t mFnTab;
#endif
@@ -910,16 +910,20 @@ void RsdCpuScriptIntrinsicColorMatrix::kernel(const RsForEachStubParamStruct *p,
out += outstep * len;
in += instep * len;
}
-#if 0 && defined(ARCH_ARM64_USE_INTRINSICS)
+#if defined(ARCH_ARM64_USE_INTRINSICS)
else {
if (cp->mLastKey.u.inType == RS_TYPE_FLOAT_32 || cp->mLastKey.u.outType == RS_TYPE_FLOAT_32) {
- rsdIntrinsicColorMatrix_float_K(out, in, len, &cp->mFnTab, cp->tmpFp, cp->tmpFpa);
+ // Currently this generates off-by-one errors.
+ //rsdIntrinsicColorMatrix_float_K(out, in, len, &cp->mFnTab, cp->tmpFp, cp->tmpFpa);
+ //x1 += len;
+ //out += outstep * len;
+ //in += instep * len;
} else {
rsdIntrinsicColorMatrix_int_K(out, in, len, &cp->mFnTab, cp->ip, cp->ipa);
+ x1 += len;
+ out += outstep * len;
+ in += instep * len;
}
- x1 += len;
- out += outstep * len;
- in += instep * len;
}
#endif
}
@@ -971,7 +975,7 @@ void RsdCpuScriptIntrinsicColorMatrix::preLaunch(
if (build(key)) {
mOptKernel = (void (*)(void *, const void *, const short *, uint32_t)) mBuf;
}
-#if 0 && defined(ARCH_ARM64_USE_INTRINSICS)
+#if defined(ARCH_ARM64_USE_INTRINSICS)
else {
int dt = key.u.outVecSize + (key.u.outType == RS_TYPE_FLOAT_32 ? 4 : 0);
int st = key.u.inVecSize + (key.u.inType == RS_TYPE_FLOAT_32 ? 4 : 0);
diff --git a/cpu_ref/rsCpuIntrinsicConvolve3x3.cpp b/cpu_ref/rsCpuIntrinsicConvolve3x3.cpp
index 552a8353..e5953cf3 100644
--- a/cpu_ref/rsCpuIntrinsicConvolve3x3.cpp
+++ b/cpu_ref/rsCpuIntrinsicConvolve3x3.cpp
@@ -105,7 +105,7 @@ static void ConvolveOneU4(const RsForEachStubParamStruct *p, uint32_t x, uchar4
convert_float4(py2[x]) * coeff[7] +
convert_float4(py2[x2]) * coeff[8];
- px = clamp(px, 0.f, 255.f);
+ px = clamp(px + 0.5f, 0.f, 255.f);
uchar4 o = {(uchar)px.x, (uchar)px.y, (uchar)px.z, (uchar)px.w};
*out = o;
}
@@ -127,7 +127,7 @@ static void ConvolveOneU2(const RsForEachStubParamStruct *p, uint32_t x, uchar2
convert_float2(py2[x]) * coeff[7] +
convert_float2(py2[x2]) * coeff[8];
- px = clamp(px, 0.f, 255.f);
+ px = clamp(px + 0.5f, 0.f, 255.f);
*out = convert_uchar2(px);
}
@@ -147,7 +147,7 @@ static void ConvolveOneU1(const RsForEachStubParamStruct *p, uint32_t x, uchar *
((float)py2[x1]) * coeff[6] +
((float)py2[x]) * coeff[7] +
((float)py2[x2]) * coeff[8];
- *out = clamp(px, 0.f, 255.f);
+ *out = clamp(px + 0.5f, 0.f, 255.f);
}
static void ConvolveOneF4(const RsForEachStubParamStruct *p, uint32_t x, float4 *out,
diff --git a/cpu_ref/rsCpuIntrinsicConvolve5x5.cpp b/cpu_ref/rsCpuIntrinsicConvolve5x5.cpp
index e2a6b8b1..a2c29fd3 100644
--- a/cpu_ref/rsCpuIntrinsicConvolve5x5.cpp
+++ b/cpu_ref/rsCpuIntrinsicConvolve5x5.cpp
@@ -125,7 +125,7 @@ static void OneU4(const RsForEachStubParamStruct *p, uint32_t x, uchar4 *out,
convert_float4(py4[x2]) * coeff[22] +
convert_float4(py4[x3]) * coeff[23] +
convert_float4(py4[x4]) * coeff[24];
- px = clamp(px, 0.f, 255.f);
+ px = clamp(px + 0.5f, 0.f, 255.f);
*out = convert_uchar4(px);
}
@@ -168,7 +168,7 @@ static void OneU2(const RsForEachStubParamStruct *p, uint32_t x, uchar2 *out,
convert_float2(py4[x2]) * coeff[22] +
convert_float2(py4[x3]) * coeff[23] +
convert_float2(py4[x4]) * coeff[24];
- px = clamp(px, 0.f, 255.f);
+ px = clamp(px + 0.5f, 0.f, 255.f);
*out = convert_uchar2(px);
}
@@ -211,7 +211,7 @@ static void OneU1(const RsForEachStubParamStruct *p, uint32_t x, uchar *out,
(float)(py4[x2]) * coeff[22] +
(float)(py4[x3]) * coeff[23] +
(float)(py4[x4]) * coeff[24];
- px = clamp(px, 0.f, 255.f);
+ px = clamp(px + 0.5f, 0.f, 255.f);
*out = px;
}
diff --git a/cpu_ref/rsCpuIntrinsicResize.cpp b/cpu_ref/rsCpuIntrinsicResize.cpp
index 474f82d1..19607c97 100644
--- a/cpu_ref/rsCpuIntrinsicResize.cpp
+++ b/cpu_ref/rsCpuIntrinsicResize.cpp
@@ -83,7 +83,7 @@ static float cubicInterpolate(float p0,float p1,float p2,float p3 , float x) {
static uchar4 OneBiCubic(const uchar4 *yp0, const uchar4 *yp1, const uchar4 *yp2, const uchar4 *yp3,
float xf, float yf, int width) {
- int startx = (int) floor(xf - 2);
+ int startx = (int) floor(xf - 1);
xf = xf - floor(xf);
int maxx = width - 1;
int xs0 = rsMax(0, startx + 0);
@@ -112,13 +112,13 @@ static uchar4 OneBiCubic(const uchar4 *yp0, const uchar4 *yp1, const uchar4 *yp2
convert_float4(yp3[xs3]), xf);
float4 p = cubicInterpolate(p0, p1, p2, p3, yf);
- p = clamp(p, 0.f, 255.f);
+ p = clamp(p + 0.5f, 0.f, 255.f);
return convert_uchar4(p);
}
static uchar2 OneBiCubic(const uchar2 *yp0, const uchar2 *yp1, const uchar2 *yp2, const uchar2 *yp3,
float xf, float yf, int width) {
- int startx = (int) floor(xf - 2);
+ int startx = (int) floor(xf - 1);
xf = xf - floor(xf);
int maxx = width - 1;
int xs0 = rsMax(0, startx + 0);
@@ -147,13 +147,13 @@ static uchar2 OneBiCubic(const uchar2 *yp0, const uchar2 *yp1, const uchar2 *yp2
convert_float2(yp3[xs3]), xf);
float2 p = cubicInterpolate(p0, p1, p2, p3, yf);
- p = clamp(p, 0.f, 255.f);
+ p = clamp(p + 0.5f, 0.f, 255.f);
return convert_uchar2(p);
}
static uchar OneBiCubic(const uchar *yp0, const uchar *yp1, const uchar *yp2, const uchar *yp3,
float xf, float yf, int width) {
- int startx = (int) floor(xf - 2);
+ int startx = (int) floor(xf - 1);
xf = xf - floor(xf);
int maxx = width - 1;
int xs0 = rsMax(0, startx + 0);
@@ -171,7 +171,7 @@ static uchar OneBiCubic(const uchar *yp0, const uchar *yp1, const uchar *yp2, co
(float)yp3[xs2], (float)yp3[xs3], xf);
float p = cubicInterpolate(p0, p1, p2, p3, yf);
- p = clamp(p, 0.f, 255.f);
+ p = clamp(p + 0.5f, 0.f, 255.f);
return (uchar)p;
}
@@ -189,8 +189,8 @@ void RsdCpuScriptIntrinsicResize::kernelU4(const RsForEachStubParamStruct *p,
const int srcWidth = cp->mAlloc->mHal.drvState.lod[0].dimX;
const size_t stride = cp->mAlloc->mHal.drvState.lod[0].stride;
- float yf = p->y * cp->scaleY;
- int starty = (int) floor(yf - 2);
+ float yf = (p->y + 0.5f) * cp->scaleY - 0.5f;
+ int starty = (int) floor(yf - 1);
yf = yf - floor(yf);
int maxy = srcHeight - 1;
int ys0 = rsMax(0, starty + 0);
@@ -208,7 +208,7 @@ void RsdCpuScriptIntrinsicResize::kernelU4(const RsForEachStubParamStruct *p,
uint32_t x2 = xend;
while(x1 < x2) {
- float xf = x1 * cp->scaleX;
+ float xf = (x1 + 0.5f) * cp->scaleX - 0.5f;
*out = OneBiCubic(yp0, yp1, yp2, yp3, xf, yf, srcWidth);
out++;
x1++;
@@ -229,8 +229,8 @@ void RsdCpuScriptIntrinsicResize::kernelU2(const RsForEachStubParamStruct *p,
const int srcWidth = cp->mAlloc->mHal.drvState.lod[0].dimX;
const size_t stride = cp->mAlloc->mHal.drvState.lod[0].stride;
- float yf = p->y * cp->scaleY;
- int starty = (int) floor(yf - 2);
+ float yf = (p->y + 0.5f) * cp->scaleY - 0.5f;
+ int starty = (int) floor(yf - 1);
yf = yf - floor(yf);
int maxy = srcHeight - 1;
int ys0 = rsMax(0, starty + 0);
@@ -248,7 +248,7 @@ void RsdCpuScriptIntrinsicResize::kernelU2(const RsForEachStubParamStruct *p,
uint32_t x2 = xend;
while(x1 < x2) {
- float xf = x1 * cp->scaleX;
+ float xf = (x1 + 0.5f) * cp->scaleX - 0.5f;
*out = OneBiCubic(yp0, yp1, yp2, yp3, xf, yf, srcWidth);
out++;
x1++;
@@ -269,8 +269,8 @@ void RsdCpuScriptIntrinsicResize::kernelU1(const RsForEachStubParamStruct *p,
const int srcWidth = cp->mAlloc->mHal.drvState.lod[0].dimX;
const size_t stride = cp->mAlloc->mHal.drvState.lod[0].stride;
- float yf = p->y * cp->scaleY;
- int starty = (int) floor(yf - 2);
+ float yf = (p->y + 0.5f) * cp->scaleY - 0.5f;
+ int starty = (int) floor(yf - 1);
yf = yf - floor(yf);
int maxy = srcHeight - 1;
int ys0 = rsMax(0, starty + 0);
@@ -288,7 +288,7 @@ void RsdCpuScriptIntrinsicResize::kernelU1(const RsForEachStubParamStruct *p,
uint32_t x2 = xend;
while(x1 < x2) {
- float xf = x1 * cp->scaleX;
+ float xf = (x1 + 0.5f) * cp->scaleX - 0.5f;
*out = OneBiCubic(yp0, yp1, yp2, yp3, xf, yf, srcWidth);
out++;
x1++;
diff --git a/cpu_ref/rsCpuIntrinsicYuvToRGB.cpp b/cpu_ref/rsCpuIntrinsicYuvToRGB.cpp
index c53ef313..e191e25d 100644
--- a/cpu_ref/rsCpuIntrinsicYuvToRGB.cpp
+++ b/cpu_ref/rsCpuIntrinsicYuvToRGB.cpp
@@ -161,8 +161,8 @@ void RsdCpuScriptIntrinsicYuvToRGB::kernel(const RsForEachStubParamStruct *p,
out++;
x1++;
}
-// reenable for ARM64 when intrinsic is fixed
-#if defined(ARCH_ARM_USE_INTRINSICS) && !defined(ARCH_ARM64_USE_INTRINSICS)
+
+#if defined(ARCH_ARM_USE_INTRINSICS)
if((x2 > x1) && gArchUseSIMD) {
int32_t len = x2 - x1;
if (cstep == 1) {
diff --git a/cpu_ref/rsCpuIntrinsics_advsimd_Blur.S b/cpu_ref/rsCpuIntrinsics_advsimd_Blur.S
index 929f76f7..fc1eefee 100644
--- a/cpu_ref/rsCpuIntrinsics_advsimd_Blur.S
+++ b/cpu_ref/rsCpuIntrinsics_advsimd_Blur.S
@@ -52,17 +52,17 @@
* x6 -- rup
* x7 -- rdn
* x12 -- switch index
- * q0-q3 -- coefficient table
+ * v0-v3 -- coefficient table
* x13 = -pitch
* x15 = top-row in
* x19 = bottom-row in
* Output:
* x1 += 16
- * q10,q11 -- 16 convolved columns
+ * v10,v11 -- 16 convolved columns
* Modifies:
* x10 = upper row pointer
* x11 = lower row pointer
- * q12-q15 = temporary sums
+ * v12-v15 = temporary sums
*/
.macro fetch, max_r=MAX_R, labelc=1, labelnc=2, reg=x12 /*{{{*/
.ifc \reg,x12 ; .set cc, 1 ; .else ; .set cc, 0 ; .endif
@@ -146,15 +146,15 @@ nop
* When the buffer gets too big the buffer at [x9] is used.
*
* Input:
- * q4-q11 -- convoltion window
+ * v16-v31,v4-v11 -- convolution window
* x9 -- pointer to additional convolution window data
* Output:
* x9 -- updated buffer pointer (if used)
* d31 -- result to be stored
* Modifies:
* x12 -- temp buffer pointer
- * q12-q13 -- temporaries for load and vext operations.
- * q14-q15 -- intermediate sums
+ * v12-v13 -- temporaries for load and vext operations.
+ * v14-v15 -- intermediate sums
*/
#define TUNED_LIST1 8, 16
.macro hconv1_8/*{{{*/
@@ -407,7 +407,7 @@ nop
umlal2 v15.4s, v12.8h, v3.h[1]
umlal v14.4s, v13.4h, v3.h[1]
umlal2 v15.4s, v13.8h, v3.h[1]
- 124: ext v12.16b, v3.16b, v4.16b, #7*2
+ 124: ext v12.16b, v31.16b, v4.16b, #7*2
ext v13.16b, v9.16b, v10.16b, #7*2
umlal v14.4s, v12.4h, v3.h[0]
umlal2 v15.4s, v12.8h, v3.h[0]
@@ -1055,64 +1055,47 @@ PRIVATE(fetch_generic_asm)
ret
END(fetch_generic_asm)
-/* Given values in q10 and q11, and an index in x11, sweep the (x11&15)th value
+/* Given values in v10 and v11, and an index in x11, sweep the (x11&15)th value
* across to fill the rest of the register pair. Used for filling the right
* hand edge of the window when starting too close to the right hand edge of
* the image.
+ * Also returns a dup-ed copy of the last element in v12 for the tail-fill
+ * case (this happens incidentally in the common path, but must be done
+ * deliberately in the fast-out path).
*/
-PRIVATE(prefetch_clamp1)
- sub x11, xzr, x11
- sub x15, x15, x1
- sub x19, x19, x1
- tbz x11, #3, 1f
- mov v11.16b, v10.16b
- sub x1, x1, #16
-1: mov v12.16b, v11.16b
- movi v13.8b, #0xff
- tbz x11, #2, 1f
- ext v12.16b, v12.16b, v12.16b, #4*2
- sub x1, x1, #8
- shl v13.2d, v13.2d, #32
-1: tbz x11, #1, 1f
- ext v12.16b, v12.16b, v12.16b, #6*2
- sub x1, x1, #4
- shl v13.2d, v13.2d, #16
-1: tbz x11, #0, 1f
- ext v12.16b, v12.16b, v12.16b, #7*2
- sub x1, x1, #2
- shl v13.2d, v13.2d, #8
-1: dup v12.8h, v12.h[6]
- sxtl v13.8h, v13.8b
- bif v11.16b, v12.16b, v13.16b
-1: tbz x11, #3, 1f
- mov v10.16b, v11.16b
- mov v11.16b, v12.16b
-1: sub x11, xzr, x11
- add x15, x15, x1
- add x19, x19, x1
+PRIVATE(prefetch_clampright1)
+ ands x12, x11, #15
+ beq 1f
+ sub x12, x12, #1
+ sub sp, sp, #64
+ st1 {v10.8h,v11.8h}, [sp]
+ add x12, sp, x12, LSL #1
+ ld1r {v12.8h}, [x12]
+ st1 {v12.8h}, [x12], #16
+ st1 {v12.8h}, [x12]
+ ld1 {v10.8h,v11.8h}, [sp]
+ add sp, sp, #64
+ ret
+1: dup v12.8h, v11.h[7]
+ ret
+END(prefetch_clampright1)
+
+PRIVATE(prefetch_clampright4)
+ ands x12, x11, #15
+ beq 1f
+ sub x12, x12, #4
+ sub sp, sp, #64
+ st1 {v10.8h,v11.8h}, [sp]
+ add x12, sp, x12, LSL #1
+ ld1r {v12.2d}, [x12]
+ st1 {v12.8h}, [x12], #16
+ st1 {v12.8h}, [x12]
+ ld1 {v10.8h,v11.8h}, [sp]
+ add sp, sp, #64
ret
-END(prefetch_clamp1)
-
-PRIVATE(prefetch_clamp4)
- sub x11, xzr, x11
- sub x15, x15, x1
- sub x19, x19, x1
- tbz x11, #3, 1f
- sub x1, x1, #16 // what's this?
- mov v11.16b, v10.16b
1: dup v12.2d, v11.d[1]
- tbz x11, #2, 1f
- dup v12.2d, v11.d[0]
- sub x1, x1, #8
- dup v11.2d, v11.d[0]
-1: tbz x11, #3, 1f
- mov v10.16b, v11.16b
- mov v11.16b, v12.16b
-1: sub x11, xzr, x11
- add x15, x15, x1
- add x19, x19, x1
ret
-END(prefetch_clamp4)
+END(prefetch_clampright4)
/* Helpers for prefetch, below.
@@ -1147,10 +1130,10 @@ END(prefetch_clamp4)
prefetch_out \qa, \qb, \store, v10.16b, v11.16b, v11.d[1]
bl fetch_generic_asm
b 2f
-3: bl prefetch_clamp\step
+3: bl prefetch_clampright\step
prefetch_out \qa, \qb, \store, v10.16b, v11.16b, v11.d[1]
4: b 4f+4
- //v12 contains pad word from prefetch_clamp call
+ //v12 contains pad word from prefetch_clampright call
prefetch_out \qa, \qb, \store, v12.16b, v12.16b, v12.d[1]
.if \rem > 0
b 4f+4
@@ -1209,24 +1192,18 @@ END(prefetch_clamp4)
.else
dup v9.2d, v10.d[0]
.endif
- tst x10, #15
+ ands x12, x10, #15
beq 2f
- sub x12, xzr, x10
- tbz x10, #3, 1f
- mov v11.16b, v10.16b
- mov v10.16b, v9.16b
-1: tbz x12, #2, 1f
- ext v11.16b, v10.16b, v11.16b, #4*2
- ext v10.16b, v9.16b, v10.16b, #4*2
- .if \step == 1
- 1: tbz x12, #1, 1f
- ext v11.16b, v10.16b, v11.16b, #2*2
- ext v10.16b, v9.16b, v10.16b, #2*2
- 1: tbz x12, #0, 1f
- ext v11.16b, v10.16b, v11.16b, #1*2
- ext v10.16b, v9.16b, v10.16b, #1*2
- .endif
-1: sub x1, x1, x10
+ sub sp, sp, #32
+ st1 {v10.8h,v11.8h}, [sp]
+ sub x12, sp, x12, LSL #1
+ sub sp, sp, #16
+ st1 {v9.8h}, [sp]
+ sub sp, sp, #16
+ st1 {v9.8h}, [sp]
+ ld1 {v10.8h,v11.8h}, [x12]
+ add sp, sp, #64
+ sub x1, x1, x10
sub x15, x15, x10
sub x19, x19, x10
bic x10, x10, #15
@@ -1363,13 +1340,13 @@ END(prefetch_clamp4)
b 3b
4: tbz x3, #2, 1f
st1 {v15.s}[0], [x0], #4
- ext v15.16b, v15.16b, v15.16b, #4*2
+ ext v15.8b, v15.8b, v15.8b, #4
1: tbz x3, #1, 1f
st1 {v15.h}[0], [x0], #2
- ext v15.16b, v15.16b, v15.16b, #2*2
+ ext v15.8b, v15.8b, v15.8b, #2
1: tbz x3, #0, 5f
st1 {v15.b}[0], [x0], #1
- ext v15.16b, v15.16b, v15.16b, #1*2
+ ext v15.8b, v15.8b, v15.8b, #1
5: nop
.endm
@@ -1438,7 +1415,6 @@ ENTRY(rsdIntrinsicBlurU1_K)
ldr x12, [sp, #88] // tab
- add x0, x0, x8
add x1, x1, x8
cmp x6, x5
@@ -1448,7 +1424,7 @@ ENTRY(rsdIntrinsicBlurU1_K)
cmp x8, x5
csel x8, x5, x8, hs
cmp x9, x5
- csel x9, x5, x8, hs
+ csel x9, x5, x9, hs
add x4, x8, x9
add x4, x4, x3
@@ -1504,7 +1480,6 @@ ENTRY(rsdIntrinsicBlurU4_K)
ldr x12, [sp, #88]
- add x0, x0, x8, LSL #2
add x1, x1, x8, LSL #2
cmp x6, x5
diff --git a/cpu_ref/rsCpuIntrinsics_advsimd_YuvToRGB.S b/cpu_ref/rsCpuIntrinsics_advsimd_YuvToRGB.S
index 632ef7a4..bb4b7ae3 100644
--- a/cpu_ref/rsCpuIntrinsics_advsimd_YuvToRGB.S
+++ b/cpu_ref/rsCpuIntrinsics_advsimd_YuvToRGB.S
@@ -21,60 +21,127 @@
* register. This macro will be called from within several different wrapper
* variants for different data layouts. Y data starts with the even and odd
* bytes split into the low parts of v8 and v9 respectively. U and V are in
- * v16 and v17. Working constants are pre-loaded into v13-v15, and v3 is
- * pre-loaded with a constant 0xff alpha channel.
+ * v10 and v11. Working constants are pre-loaded into v24-v31, and v3 and v7
+ * are pre-loaded with a constant 0xff alpha channel.
*
* The complicated arithmetic is the result of refactoring the original
* equations to avoid 16-bit overflow without losing any precision.
*/
-.macro yuvkern
- movi v7.8b, #149
-
- umull v1.8h, v8.8b, v7.8b // g0 = y0 * 149
- umull v5.8h, v9.8b, v7.8b // g1 = y1 * 149
-
- movi v7.8b, #50
- movi v10.8b, #104
- umull v8.8h, v16.8b, v7.8b // g2 = u * 50 + v * 104
- umlal v8.8h, v17.8b, v10.8b
-
- ushr v7.8b, v17.8b, #1
- uaddw v0.8h, v1.8h, v7.8b // r0 = y0 * 149 + (v >> 1)
- uaddw v4.8h, v5.8h, v7.8b // r1 = y1 * 149 + (v >> 1)
-
- ushll v7.8h, v16.8b, #2
- add v2.8h, v1.8h, v7.8h // b0 = y0 * 149 + (u << 2)
- add v6.8h, v5.8h, v7.8h // b1 = y1 * 149 + (u << 2)
-
- movi v7.16b, #204
- movi v10.8b, #254
- umull v11.8h, v17.8b, v7.8b // r2 = v * 204
- umull v12.8h, v16.8b, v10.8b // b2 = u * 254
-
- uhadd v0.8h, v0.8h, v11.8h // r0 = (r0 + r2) >> 1
- uhadd v4.8h, v4.8h, v11.8h // r1 = (r1 + r2) >> 1
- uqadd v1.8h, v1.8h, v14.8h // g0 = satu16(g0 + (-16 * 149 + 128 * 50 + 128 * 104) >> 0)
- uqadd v5.8h, v5.8h, v14.8h // g1 = satu16(g1 + (-16 * 149 + 128 * 50 + 128 * 104) >> 0)
- uhadd v2.8h, v2.8h, v12.8h // b0 = (b0 + b2) >> 1
- uhadd v6.8h, v6.8h, v12.8h // b1 = (b1 + b2) >> 1
-
- uqsub v0.8h, v0.8h, v13.8h // r0 = satu16(r0 - (16 * 149 + (128 >> 1) + 128 * 204) >> 1)
- uqsub v4.8h, v4.8h, v13.8h // r1 = satu16(r1 - (16 * 149 + (128 >> 1) + 128 * 204) >> 1)
- uqsub v1.8h, v1.8h, v8.8h // g0 = satu16(g0 - g2)
- uqsub v5.8h, v5.8h, v8.8h // g1 = satu16(g1 - g2)
- uqsub v2.8h, v2.8h, v15.8h // b0 = satu16(b0 - (16 * 149 + (128 << 2) + 128 * 254) >> 1)
- uqsub v6.8h, v6.8h, v15.8h // b1 = satu16(b1 - (16 * 149 + (128 << 2) + 128 * 254) >> 1)
-
- uqrshrn v0.8b, v0.8h, #6
- uqrshrn v4.8b, v4.8h, #6
- uqrshrn v1.8b, v1.8h, #7
- uqrshrn v5.8b, v5.8h, #7
- uqrshrn v2.8b, v2.8h, #6
- uqrshrn v6.8b, v6.8h, #6
-
- zip1 v0.16b, v0.16b, v4.16b
- zip1 v1.16b, v1.16b, v5.16b
- zip1 v2.16b, v2.16b, v6.16b
+.macro yuvkern, regu=v10, regv=v11
+ /* v0 out R_lo / even R_lo accumulator
+ * v1 out G_lo / even G_lo accumulator
+ * v2 out B_lo / even B_lo accumulator
+ * v3 out A_lo / const 0xff*ff
+ * v4 out R_hi / even R_hi accumulator
+ * v5 out G_hi / even G_hi accumulator
+ * v6 out B_hi / even B_hi accumulator
+ * v7 out A_hi / const 0xff*ff
+ * v8 even Y / G_lo luma tmp
+ * v9 odd Y / G_lo luma tmp
+ * \regu in U
+ * \regv in V
+ * v12 R_lo luma tmp
+ * v13 B_lo luma tmp
+ * v14 R_hi luma tmp
+ * v15 B_hi luma tmp
+ * v16 odd R_lo accumulator
+ * v17 odd G_lo accumulator
+ * v18 odd B_lo accumulator
+ * v19 multiplier extra bits low
+ * v20 odd R_hi accumulator
+ * v21 odd G_hi accumulator
+ * v22 odd B_hi accumulator
+ * v23 multiplier extra bits high
+ * v24 constant 149
+ * v25 constant 50
+ * v26 constant 104
+ * v27 constant 204
+ * v28 constant 254
+ * v29 constant ((16 * 149 + (128 >> 1) + 128 * 204) >> 1)
+ * v30 constant ((-16 * 149 + 128 * 50 + 128 * 104) >> 0)
+ * v31 constant ((16 * 149 + (128 << 2) + 128 * 254) >> 1)
+ */
+
+ umull v1.8h, v8.8b, v24.8b // g0 = y0 * 149
+ umull v17.8h, v9.8b, v24.8b // g1 = y1 * 149
+ umull2 v5.8h, v8.16b, v24.16b // g0_hi = y0_hi * 149
+ umull2 v21.8h, v9.16b, v24.16b // g1_hi = y1_hi * 149
+
+ umull v8.8h, \regu\().8b, v25.8b // g2 = u * 50 + v * 104
+ umlal v8.8h, \regv\().8b, v26.8b
+ umull2 v9.8h, \regu\().16b, v25.16b // g2_hi = u_hi * 50 + v_hi * 104
+ umlal2 v9.8h, \regv\().16b, v26.16b
+
+ ushr v19.16b, \regv\().16b, #1
+ uaddw v0.8h, v1.8h, v19.8b // r0 = g0 + (v >> 1)
+ uaddw v16.8h, v17.8h, v19.8b // r1 = g1 + (v >> 1)
+
+ uaddw2 v4.8h, v5.8h, v19.16b // r0_hi = g0_hi + (v_hi >> 1)
+ uaddw2 v20.8h, v21.8h, v19.16b // r1_hi = g1_hi + (v_hi >> 1)
+
+ ushll v19.8h, \regu\().8b, #2
+ ushll2 v23.8h, \regu\().16b, #2
+ add v2.8h, v1.8h, v19.8h // b0 = g0 + (u << 2)
+ add v18.8h, v17.8h, v19.8h // b1 = g1 + (u << 2)
+
+ add v6.8h, v5.8h, v23.8h // b0_hi = g0_hi + (u_hi << 2)
+ add v22.8h, v21.8h, v23.8h // b1_hi = g1_hi + (u_hi << 2)
+
+ umull v12.8h, \regv\().8b, v27.8b // r2 = v * 204
+ umull v13.8h, \regu\().8b, v28.8b // b2 = u * 254
+
+ umull2 v14.8h, \regv\().16b, v27.16b // r2_hi = v_hi * 204
+ umull2 v15.8h, \regu\().16b, v28.16b // b2_hi = u_hi * 254
+
+ uhadd v0.8h, v0.8h, v12.8h // r0 = (r0 + r2) >> 1
+ uhadd v16.8h, v16.8h, v12.8h // r1 = (r1 + r2) >> 1
+ uqadd v1.8h, v1.8h, v30.8h // g0 = satu16(g0 + (-16 * 149 + 128 * 50 + 128 * 104) >> 0)
+ uqadd v17.8h, v17.8h, v30.8h // g1 = satu16(g1 + (-16 * 149 + 128 * 50 + 128 * 104) >> 0)
+ uhadd v2.8h, v2.8h, v13.8h // b0 = (b0 + b2) >> 1
+ uhadd v18.8h, v18.8h, v13.8h // b1 = (b1 + b2) >> 1
+
+ uhadd v4.8h, v4.8h, v14.8h // r0_hi = (r0_hi + r2_hi) >> 1
+ uhadd v20.8h, v20.8h, v14.8h // r1_hi = (r1_hi + r2_hi) >> 1
+ uqadd v5.8h, v5.8h, v30.8h // g0_hi = satu16(g0_hi + (-16 * 149 + 128 * 50 + 128 * 104) >> 0)
+ uqadd v21.8h, v21.8h, v30.8h // g1_hi = satu16(g1_hi + (-16 * 149 + 128 * 50 + 128 * 104) >> 0)
+ uhadd v6.8h, v6.8h, v15.8h // b0_hi = (b0_hi + b2_hi) >> 1
+ uhadd v22.8h, v22.8h, v15.8h // b1_hi = (b1_hi + b2_hi) >> 1
+
+ uqsub v0.8h, v0.8h, v29.8h // r0 = satu16(r0 - (16 * 149 + (128 >> 1) + 128 * 204) >> 1)
+ uqsub v16.8h, v16.8h, v29.8h // r1 = satu16(r1 - (16 * 149 + (128 >> 1) + 128 * 204) >> 1)
+ uqsub v1.8h, v1.8h, v8.8h // g0 = satu16(g0 - g2)
+ uqsub v17.8h, v17.8h, v8.8h // g1 = satu16(g1 - g2)
+ uqsub v2.8h, v2.8h, v31.8h // b0 = satu16(b0 - (16 * 149 + (128 << 2) + 128 * 254) >> 1)
+ uqsub v18.8h, v18.8h, v31.8h // b1 = satu16(b1 - (16 * 149 + (128 << 2) + 128 * 254) >> 1)
+
+ uqsub v4.8h, v4.8h, v29.8h // r0_hi = satu16(r0_hi - (16 * 149 + (128 >> 1) + 128 * 204) >> 1)
+ uqsub v20.8h, v20.8h, v29.8h // r1_hi = satu16(r1_hi - (16 * 149 + (128 >> 1) + 128 * 204) >> 1)
+ uqsub v5.8h, v5.8h, v9.8h // g0_hi = satu16(g0_hi - g2_hi)
+ uqsub v21.8h, v21.8h, v9.8h // g1_hi = satu16(g1_hi - g2_hi)
+ uqsub v6.8h, v6.8h, v31.8h // b0_hi = satu16(b0_hi - (16 * 149 + (128 << 2) + 128 * 254) >> 1)
+ uqsub v22.8h, v22.8h, v31.8h // b1_hi = satu16(b1_hi - (16 * 149 + (128 << 2) + 128 * 254) >> 1)
+
+ uqrshrn v0.8b, v0.8h, #6
+ uqrshrn v16.8b, v16.8h, #6
+ uqrshrn v1.8b, v1.8h, #7
+ uqrshrn v17.8b, v17.8h, #7
+ uqrshrn v2.8b, v2.8h, #6
+ uqrshrn v18.8b, v18.8h, #6
+
+ uqrshrn v4.8b, v4.8h, #6
+ uqrshrn v20.8b, v20.8h, #6
+ uqrshrn v5.8b, v5.8h, #7
+ uqrshrn v21.8b, v21.8h, #7
+ uqrshrn v6.8b, v6.8h, #6
+ uqrshrn v22.8b, v22.8h, #6
+
+ zip1 v0.16b, v0.16b, v16.16b
+ zip1 v1.16b, v1.16b, v17.16b
+ zip1 v2.16b, v2.16b, v18.16b
+
+ zip1 v4.16b, v4.16b, v20.16b
+ zip1 v5.16b, v5.16b, v21.16b
+ zip1 v6.16b, v6.16b, v22.16b
.endm
/* Define the wrapper code which will load and store the data, iterate the
@@ -83,50 +150,51 @@
* being handled.
*/
.macro wrap_line kernel, interleaved=0, swapuv=0
-
+ movi v24.16b, #149
+ movi v25.16b, #50
+ movi v26.16b, #104
+ movi v27.16b, #204
+ movi v28.16b, #254
mov w5, #((16 * 149 + (128 >> 1) + 128 * 204) >> 1)
- dup v13.8h, w5
+ dup v29.8h, w5
mov w5, #((-16 * 149 + 128 * 50 + 128 * 104) >> 0)
- dup v14.8h, w5
+ dup v30.8h, w5
mov w5, #((16 * 149 + (128 << 2) + 128 * 254) >> 1)
- dup v15.8h, w5
+ dup v31.8h, w5
movi v3.16b, #0xff
+ movi v7.16b, #0xff
- subs x2, x2, #16
+ subs x2, x2, #32
bhs 1f
b 2f
.align 4
-1: ld2 {v8.8b,v9.8b}, [x1], #16
-// prfm PLDL1STRM, [x1, #256]
+1: ld2 {v8.16b,v9.16b}, [x1], #32
.if \interleaved
- .if \swapuv
- ld2 {v17.8b,v18.8b}, [x3], #16
- mov v16.8b, v18.8b
- .else
- ld2 {v16.8b,v17.8b}, [x3], #16
- .endif
-// prfm PLD1STRM, [x3, #256]
+ ld2 {v10.16b,v11.16b}, [x3], #32
.else
- ld1 {v16.8b}, [x3], #8
- ld1 {v17.8b}, [x4], #8
-// prfm PLD1STRM, [x3, #128]
-// prfm PLD1STRM, [x4, #128]
+ ld1 {v10.16b}, [x3], #16
+ ld1 {v11.16b}, [x4], #16
.endif
+ .if \swapuv
+ \kernel regu=v11, regv=v10
+ .else
\kernel
+ .endif
- subs x2, x2, #16
+ subs x2, x2, #32
- st4 {v0.16b,v1.16b,v2.16b,v3.16b}, [x0], #64
+ st4 {v0.16b - v3.16b}, [x0], #64
+ st4 {v4.16b - v7.16b}, [x0], #64
bhs 1b
-2: adds x2, x2, #16
+2: adds x2, x2, #32
beq 2f
- /* To handle the tail portion of the data (something less than 16
+ /* To handle the tail portion of the data (something less than 32
* bytes) load small power-of-two chunks into working registers. It
* doesn't matter where they end up in the register; the same process
* will store them back out using the same positions and the
@@ -135,40 +203,48 @@
*/
movi v8.8b, #0
movi v9.8b, #0
- movi v16.8b, #0
- movi v17.8b, #0
+ movi v10.8b, #0
+ movi v11.8b, #0
- tbz x2, #3, 1f
- ld1 {v9.8b}, [x1], #8
+ tbz x2, #4, 1f
+ ld1 {v9.16b}, [x1], #16
+ .if \interleaved
+ ld1 {v11.16b}, [x3], #16
+ .else
+ ld1 {v10.d}[1], [x3], #8
+ ld1 {v11.d}[1], [x4], #8
+ .endif
+1: tbz x2, #3, 1f
+ ld1 {v8.d}[1], [x1], #8
.if \interleaved
- ld1 {v17.8b}, [x3], #8
+ ld1 {v10.d}[1], [x3], #8
.else
- ld1 {v16.s}[1], [x3], #4
- ld1 {v17.s}[1], [x4], #4
+ ld1 {v10.s}[1], [x3], #4
+ ld1 {v11.s}[1], [x4], #4
.endif
1: tbz x2, #2, 1f
ld1 {v8.s}[1], [x1], #4
.if \interleaved
- ld1 {v16.s}[1], [x3], #4
+ ld1 {v10.s}[1], [x3], #4
.else
- ld1 {v16.h}[1], [x3], #2
- ld1 {v17.h}[1], [x4], #2
+ ld1 {v10.h}[1], [x3], #2
+ ld1 {v11.h}[1], [x4], #2
.endif
1: tbz x2, #1, 1f
ld1 {v8.h}[1], [x1], #2
.if \interleaved
- ld1 {v16.h}[1], [x3], #2
+ ld1 {v10.h}[1], [x3], #2
.else
- ld1 {v16.b}[1], [x3], #1
- ld1 {v17.b}[1], [x4], #1
+ ld1 {v10.b}[1], [x3], #1
+ ld1 {v11.b}[1], [x4], #1
.endif
1: tbz x2, #0, 1f
ld1 {v8.b}[1], [x1], #1
.if \interleaved
- ld1 {v16.h}[0], [x3], #2
+ ld1 {v10.h}[0], [x3], #2
.else
- ld1 {v16.b}[0], [x3], #1
- ld1 {v17.b}[0], [x4], #1
+ ld1 {v10.b}[0], [x3], #1
+ ld1 {v11.b}[0], [x4], #1
.endif
/* One small impediment in the process above is that some of the load
@@ -176,29 +252,38 @@
* same time as loading only part of a register. So the data is loaded
* linearly and unpacked manually at this point if necessary.
*/
-1: uzp1 v8.16b, v8.16b, v9.16b
+1: mov v12.16b, v8.16b
+ uzp1 v8.16b, v12.16b, v9.16b
+ uzp2 v9.16b, v12.16b, v9.16b
.if \interleaved
- .if \swapuv
- uzp1 v16.16b, v17.16b, v16.16b
- .else
- uzp1 v16.16b, v16.16b, v17.16b
- .endif
+ mov v12.16b, v10.16b
+ uzp1 v10.16b, v12.16b, v11.16b
+ uzp2 v11.16b, v12.16b, v11.16b
.endif
+ .if \swapuv
+ \kernel regu=v11, regv=v10
+ .else
\kernel
+ .endif
/* As above but with the output; structured stores for partial vectors
* aren't available, so the data is re-packed first and stored linearly.
*/
- zip1 v4.16b, v0.16b, v2.16b
- zip2 v6.16b, v0.16b, v2.16b
- zip1 v5.16b, v1.16b, v3.16b
- zip2 v7.16b, v1.16b, v3.16b
- zip1 v0.16b, v4.16b, v5.16b
- zip2 v1.16b, v4.16b, v5.16b
- zip1 v2.16b, v6.16b, v7.16b
- zip2 v3.16b, v6.16b, v7.16b
-
+ zip1 v16.16b, v0.16b, v2.16b
+ zip2 v18.16b, v0.16b, v2.16b
+ zip1 v17.16b, v1.16b, v3.16b
+ zip2 v19.16b, v1.16b, v3.16b
+ zip1 v0.16b, v16.16b, v17.16b
+ zip2 v1.16b, v16.16b, v17.16b
+ zip1 v2.16b, v18.16b, v19.16b
+ zip2 v3.16b, v18.16b, v19.16b
+
+ /* Luckily v4-v7 don't need to be unzipped because they form a complete
+ * set of four and can be stored using st4. */
+
+ tbz x2, #4, 1f
+ st4 {v4.16b - v7.16b}, [x0], #64
1: tbz x2, #3, 1f
st1 {v2.16b,v3.16b}, [x0], #32
1: tbz x2, #2, 1f
@@ -225,7 +310,7 @@ ENTRY(rsdIntrinsicYuv2_K)
add x1, x1, x4
add x4, x3, x6
add x3, x2, x6
- sub x2, x5, x6, LSL #2
+ sub x2, x5, x6, LSL #1
sub x6, sp, #32
sub sp, sp, #64
diff --git a/cpu_ref/rsCpuIntrinsics_neon_Blur.S b/cpu_ref/rsCpuIntrinsics_neon_Blur.S
index 8fc47f5b..a7ae795c 100644
--- a/cpu_ref/rsCpuIntrinsics_neon_Blur.S
+++ b/cpu_ref/rsCpuIntrinsics_neon_Blur.S
@@ -15,6 +15,7 @@
*/
#define ENTRY(f) .text; .align 4; .globl f; .type f,#function; f: .fnstart
+#define PRIVATE(f) .text; .align 4; .type f,#function; f: .fnstart
#define END(f) .fnend; .size f, .-f;
.eabi_attribute 25,1 @Tag_ABI_align8_preserved
@@ -1049,7 +1050,7 @@
/* Dedicated function wrapper for the fetch macro, for the cases where
* performance isn't that important, to keep code size down.
*/
-ENTRY(fetch_generic_asm)
+PRIVATE(fetch_generic_asm)
push {r10,r11}
fetch
pop {r10,r11}
@@ -1060,61 +1061,46 @@ END(fetch_generic_asm)
* across to fill the rest of the register pair. Used for filling the right
* hand edge of the window when starting too close to the right hand edge of
* the image.
+ * Also returns a dup-ed copy of the last element in q12 for the tail-fill
+ * case (this happens incidentally in the common path, but must be done
+ * deliberately in the fast-out path).
*/
-ENTRY(prefetch_clamp1)
- rsb r11, r11, #0
- tst r11, #8
+PRIVATE(prefetch_clampright1)
+ ands r12, r11, #15
beq 1f
- vmov.u16 q11, q10
- sub r1, r1, #16
-1: vmov.u16 q12, q11
- vmov.i8 d26, #0xff
- tst r11, #4
- beq 1f
- vext.u16 q12, q12, q12, #4
- sub r1, r1, #8
- vshl.u64 d26, d26, #32
-1: tst r11, #2
- beq 1f
- vext.u16 q12, q12, q12, #6
- sub r1, r1, #4
- vshl.u64 d26, d26, #16
-1: tst r11, #1
- beq 1f
- vext.u16 q12, q12, q12, #7
- sub r1, r1, #2
- vshl.u64 d26, d26, #8
-1: vdup.u16 q12, d25[2]
- vmovl.s8 q13, d26
- vbif q11, q12, q13
-1: tst r11, #8
- beq 1f
- vmov q10, q11
- vmov q11, q12
-1: rsb r11, r11, #0
+ sub r12, r12, #1
+ sub sp, sp, #64
+ vst1.u16 {q10,q11}, [sp]
+ add r12, sp, r12, LSL #1
+ vld1.u16 {d24[]}, [r12]
+ vld1.u16 {d25[]}, [r12]
+ vst1.u16 {q12}, [r12]!
+ vst1.u16 {q12}, [r12]
+ vld1.u16 {q10,q11}, [sp]
+ add sp, sp, #64
+ bx lr
+1: vdup.u16 q12, d23[3]
bx lr
-END(prefetch_clamp1)
+END(prefetch_clampright1)
-ENTRY(prefetch_clamp4)
- rsb r11, r11, #0
- tst r11, #8
- beq 1f
- sub r1, r1, #16
- vmov.u16 q11, q10
-1: vmov d24, d23
- tst r11, #4
- beq 1f
- vmov d24, d22
- sub r1, r1, #8
- vmov d23, d22
-1: vmov d25, d24
- tst r11, #8
+PRIVATE(prefetch_clampright4)
+ ands r12, r11, #15
beq 1f
- vmov q10, q11
- vmov q11, q12
-1: rsb r11, r11, #0
+ sub r12, r12, #4
+ sub sp, sp, #64
+ vst1.u16 {q10,q11}, [sp]
+ add r12, sp, r12, LSL #1
+ vld1.u64 {d24}, [r12]
+ vld1.u64 {d25}, [r12]
+ vst1.u16 {q12}, [r12]!
+ vst1.u16 {q12}, [r12]
+ vld1.u16 {q10,q11}, [sp]
+ add sp, sp, #64
+ bx lr
+1: vmov.u16 d24, d23
+ vmov.u16 d25, d23
bx lr
-END(prefetch_clamp4)
+END(prefetch_clampright4)
/* Helpers for prefetch, below.
@@ -1147,10 +1133,10 @@ END(prefetch_clamp4)
prefetch_out \qa, \qb, \store, q10, q11, d23
bl fetch_generic_asm
b 2f
-3: bl prefetch_clamp\step
+3: bl prefetch_clampright\step
prefetch_out \qa, \qb, \store, q10, q11, d23
4: b 4f+4
- @q12 contains pad word from prefetch_clam call
+ @q12 contains pad word from prefetch_clampright call
prefetch_out \qa, \qb, \store, q12, q12, d25
.if \rem > 0
b 4f+4
@@ -1205,28 +1191,18 @@ END(prefetch_clamp4)
vmov.u16 d18, d20
vmov.u16 d19, d20
.endif
- tst r10, #15
+ ands r12, r10, #15
beq 2f
- rsb r12, r10, #0
- tst r10, #8
- beq 1f
- vmov.u16 q11, q10
- vmov.u16 q10, q9
-1: tst r12, #4
- beq 1f
- vext.u16 q11, q10, q11, #4
- vext.u16 q10, q9, q10, #4
- .if \step == 1
- 1: tst r12, #2
- beq 1f
- vext.u16 q11, q10, q11, #2
- vext.u16 q10, q9, q10, #2
- 1: tst r12, #1
- beq 1f
- vext.u16 q11, q10, q11, #1
- vext.u16 q10, q9, q10, #1
- .endif
-1: sub r1, r1, r10
+ sub sp, sp, #32
+ vst1.u16 {q10,q11}, [sp]
+ sub r12, sp, r12, LSL #1
+ sub sp, sp, #16
+ vst1.u16 {q9}, [sp]
+ sub sp, sp, #16
+ vst1.u16 {q9}, [sp]
+ vld1.u16 {q10,q11}, [r12]
+ add sp, sp, #64
+ sub r1, r1, r10
bic r10, r10, #15
add r1, r1, r10
2:
@@ -1383,7 +1359,7 @@ END(prefetch_clamp4)
.endm
.irep r, TUNED_LIST1, 25
-ENTRY(convolve1_\r)
+PRIVATE(convolve1_\r)
push {r12,lr}
sub r1, r1, r8
@@ -1397,7 +1373,7 @@ END(convolve1_\r)
.endr
.irep r, TUNED_LIST4, 25
-ENTRY(convolve4_\r)
+PRIVATE(convolve4_\r)
sub r12, sp, #0x200
bic r9, r12, #0x3fc
mov sp, r9
@@ -1447,8 +1423,7 @@ ENTRY(rsdIntrinsicBlurU1_K)
ldr r12, [sp,#124]
- add r0, r0, r8 @, LSL #2 /* for blur4 option */
- add r1, r1, r8 @, LSL #2 /* for blur4 option */
+ add r1, r1, r8
cmp r6, r5
movhi r6, r5
@@ -1503,7 +1478,6 @@ ENTRY(rsdIntrinsicBlurU4_K)
ldr r12, [sp,#124]
- add r0, r0, r8, LSL #2
add r1, r1, r8, LSL #2
cmp r6, r5
diff --git a/cpu_ref/rsCpuScript.cpp b/cpu_ref/rsCpuScript.cpp
index a11fda19..e8b3fb6d 100644
--- a/cpu_ref/rsCpuScript.cpp
+++ b/cpu_ref/rsCpuScript.cpp
@@ -230,6 +230,11 @@ static void setCompileArguments(std::vector<const char*>* args, const android::S
args->push_back("-mtriple");
args->push_back(DEFAULT_TARGET_TRIPLE_STRING);
+ // Enable workaround for A53 codegen by default.
+#if defined(__aarch64__) && !defined(DISABLE_A53_WORKAROUND)
+ args->push_back("-aarch64-fix-cortex-a53-835769");
+#endif
+
// Execute the bcc compiler.
if (useRSDebugContext) {
args->push_back("-rs-debug-ctx");