12 files changed, 347 insertions, 300 deletions
diff --git a/cpu_ref/Android.mk b/cpu_ref/Android.mk
index bd276bf2..f041ad96 100644
--- a/cpu_ref/Android.mk
+++ b/cpu_ref/Android.mk
@@ -42,6 +42,10 @@ LOCAL_SRC_FILES:= \
 
 LOCAL_CFLAGS_arm64 += -DARCH_ARM_USE_INTRINSICS -DARCH_ARM64_USE_INTRINSICS -DARCH_ARM64_HAVE_NEON
 
+ifeq ($(RS_DISABLE_A53_WORKAROUND),true)
+LOCAL_CFLAGS_arm64 += -DDISABLE_A53_WORKAROUND
+endif
+
 LOCAL_SRC_FILES_arm64 += \
     rsCpuIntrinsics_advsimd_3DLUT.S \
     rsCpuIntrinsics_advsimd_Convolve.S \
diff --git a/cpu_ref/rsCpuCore.cpp b/cpu_ref/rsCpuCore.cpp
index f09e3342..4285dae5 100644
--- a/cpu_ref/rsCpuCore.cpp
+++ b/cpu_ref/rsCpuCore.cpp
@@ -266,7 +266,7 @@ bool RsdCpuReferenceImpl::init(uint32_t version_major, uint32_t version_minor,
 
     GetCpuInfo();
 
-    int cpu = sysconf(_SC_NPROCESSORS_ONLN);
+    int cpu = sysconf(_SC_NPROCESSORS_CONF);
     if(mRSC->props.mDebugMaxThreads) {
         cpu = mRSC->props.mDebugMaxThreads;
     }
diff --git a/cpu_ref/rsCpuIntrinsicBlur.cpp b/cpu_ref/rsCpuIntrinsicBlur.cpp
index 123cc9f6..9dccd80d 100644
--- a/cpu_ref/rsCpuIntrinsicBlur.cpp
+++ b/cpu_ref/rsCpuIntrinsicBlur.cpp
@@ -297,7 +297,7 @@ void RsdCpuScriptIntrinsicBlur::kernelU4(const RsForEachStubParamStruct *p,
     uint32_t x2 = xend;
 
 #if defined(ARCH_ARM_USE_INTRINSICS)
-    if (gArchUseSIMD && !xstart && (xend == p->dimX)) {
+    if (gArchUseSIMD) {
         rsdIntrinsicBlurU4_K(out, (uchar4 const *)(pin + stride * p->y), p->dimX, p->dimY,
                  stride, x1, p->y, x2 - x1, cp->mIradius, cp->mIp + cp->mIradius);
         return;
@@ -367,9 +367,9 @@ void RsdCpuScriptIntrinsicBlur::kernelU1(const RsForEachStubParamStruct *p,
     uint32_t x2 = xend;
 
 #if defined(ARCH_ARM_USE_INTRINSICS)
-    if (gArchUseSIMD && !xstart && (xend == p->dimX)) {
+    if (gArchUseSIMD) {
         rsdIntrinsicBlurU1_K(out, pin + stride * p->y, p->dimX, p->dimY,
-                 stride, 0, p->y, p->dimX, cp->mIradius, cp->mIp + cp->mIradius);
+                 stride, x1, p->y, x2 - x1, cp->mIradius, cp->mIp + cp->mIradius);
         return;
     }
 #endif
diff --git a/cpu_ref/rsCpuIntrinsicColorMatrix.cpp b/cpu_ref/rsCpuIntrinsicColorMatrix.cpp
index 8c852778..6a7808e7 100644
--- a/cpu_ref/rsCpuIntrinsicColorMatrix.cpp
+++ b/cpu_ref/rsCpuIntrinsicColorMatrix.cpp
@@ -126,7 +126,7 @@ typedef union {
 } Key_t;
 
 //Re-enable when intrinsic is fixed
-#if 0 && defined(ARCH_ARM64_USE_INTRINSICS)
+#if defined(ARCH_ARM64_USE_INTRINSICS)
 typedef struct {
     void (*column[4])(void);
     void (*store)(void);
@@ -184,7 +184,7 @@ protected:
     int ipa[4];
     float tmpFp[16];
     float tmpFpa[4];
-#if 0 && defined(ARCH_ARM64_USE_INTRINSICS)
+#if defined(ARCH_ARM64_USE_INTRINSICS)
     FunctionTab_t mFnTab;
 #endif
 
@@ -910,16 +910,20 @@ void RsdCpuScriptIntrinsicColorMatrix::kernel(const RsForEachStubParamStruct *p,
                 out += outstep * len;
                 in += instep * len;
             }
-#if 0 && defined(ARCH_ARM64_USE_INTRINSICS)
+#if defined(ARCH_ARM64_USE_INTRINSICS)
             else {
                 if (cp->mLastKey.u.inType == RS_TYPE_FLOAT_32 || cp->mLastKey.u.outType == RS_TYPE_FLOAT_32) {
-                    rsdIntrinsicColorMatrix_float_K(out, in, len, &cp->mFnTab, cp->tmpFp, cp->tmpFpa);
+                    // Currently this generates off-by-one errors.
+                    //rsdIntrinsicColorMatrix_float_K(out, in, len, &cp->mFnTab, cp->tmpFp, cp->tmpFpa);
+                    //x1 += len;
+                    //out += outstep * len;
+                    //in += instep * len;
                 } else {
                     rsdIntrinsicColorMatrix_int_K(out, in, len, &cp->mFnTab, cp->ip, cp->ipa);
+                    x1 += len;
+                    out += outstep * len;
+                    in += instep * len;
                 }
-                x1 += len;
-                out += outstep * len;
-                in += instep * len;
             }
 #endif
         }
@@ -971,7 +975,7 @@ void RsdCpuScriptIntrinsicColorMatrix::preLaunch(
         if (build(key)) {
             mOptKernel = (void (*)(void *, const void *, const short *, uint32_t)) mBuf;
         }
-#if 0 && defined(ARCH_ARM64_USE_INTRINSICS)
+#if defined(ARCH_ARM64_USE_INTRINSICS)
         else {
             int dt = key.u.outVecSize + (key.u.outType == RS_TYPE_FLOAT_32 ? 4 : 0);
             int st = key.u.inVecSize + (key.u.inType == RS_TYPE_FLOAT_32 ? 4 : 0);
diff --git a/cpu_ref/rsCpuIntrinsicConvolve3x3.cpp b/cpu_ref/rsCpuIntrinsicConvolve3x3.cpp
index 552a8353..e5953cf3 100644
--- a/cpu_ref/rsCpuIntrinsicConvolve3x3.cpp
+++ b/cpu_ref/rsCpuIntrinsicConvolve3x3.cpp
@@ -105,7 +105,7 @@ static void ConvolveOneU4(const RsForEachStubParamStruct *p, uint32_t x, uchar4
                 convert_float4(py2[x]) * coeff[7] +
                 convert_float4(py2[x2]) * coeff[8];
 
-    px = clamp(px, 0.f, 255.f);
+    px = clamp(px + 0.5f, 0.f, 255.f);
     uchar4 o = {(uchar)px.x, (uchar)px.y, (uchar)px.z, (uchar)px.w};
     *out = o;
 }
@@ -127,7 +127,7 @@ static void ConvolveOneU2(const RsForEachStubParamStruct *p, uint32_t x, uchar2
                 convert_float2(py2[x]) * coeff[7] +
                 convert_float2(py2[x2]) * coeff[8];
 
-    px = clamp(px, 0.f, 255.f);
+    px = clamp(px + 0.5f, 0.f, 255.f);
     *out = convert_uchar2(px);
 }
 
@@ -147,7 +147,7 @@ static void ConvolveOneU1(const RsForEachStubParamStruct *p, uint32_t x, uchar *
                ((float)py2[x1]) * coeff[6] +
                ((float)py2[x]) * coeff[7] +
                ((float)py2[x2]) * coeff[8];
-    *out = clamp(px, 0.f, 255.f);
+    *out = clamp(px + 0.5f, 0.f, 255.f);
 }
 
 static void ConvolveOneF4(const RsForEachStubParamStruct *p, uint32_t x, float4 *out,
diff --git a/cpu_ref/rsCpuIntrinsicConvolve5x5.cpp b/cpu_ref/rsCpuIntrinsicConvolve5x5.cpp
index e2a6b8b1..a2c29fd3 100644
--- a/cpu_ref/rsCpuIntrinsicConvolve5x5.cpp
+++ b/cpu_ref/rsCpuIntrinsicConvolve5x5.cpp
@@ -125,7 +125,7 @@ static void OneU4(const RsForEachStubParamStruct *p, uint32_t x, uchar4 *out,
                 convert_float4(py4[x2]) * coeff[22] +
                 convert_float4(py4[x3]) * coeff[23] +
                 convert_float4(py4[x4]) * coeff[24];
-    px = clamp(px, 0.f, 255.f);
+    px = clamp(px + 0.5f, 0.f, 255.f);
     *out = convert_uchar4(px);
 }
 
@@ -168,7 +168,7 @@ static void OneU2(const RsForEachStubParamStruct *p, uint32_t x, uchar2 *out,
                 convert_float2(py4[x2]) * coeff[22] +
                 convert_float2(py4[x3]) * coeff[23] +
                 convert_float2(py4[x4]) * coeff[24];
-    px = clamp(px, 0.f, 255.f);
+    px = clamp(px + 0.5f, 0.f, 255.f);
     *out = convert_uchar2(px);
 }
 
@@ -211,7 +211,7 @@ static void OneU1(const RsForEachStubParamStruct *p, uint32_t x, uchar *out,
                (float)(py4[x2]) * coeff[22] +
                (float)(py4[x3]) * coeff[23] +
                (float)(py4[x4]) * coeff[24];
-    px = clamp(px, 0.f, 255.f);
+    px = clamp(px + 0.5f, 0.f, 255.f);
     *out = px;
 }
 
diff --git a/cpu_ref/rsCpuIntrinsicResize.cpp b/cpu_ref/rsCpuIntrinsicResize.cpp
index 474f82d1..19607c97 100644
--- a/cpu_ref/rsCpuIntrinsicResize.cpp
+++ b/cpu_ref/rsCpuIntrinsicResize.cpp
@@ -83,7 +83,7 @@ static float cubicInterpolate(float p0,float p1,float p2,float p3 , float x) {
 
 static uchar4 OneBiCubic(const uchar4 *yp0, const uchar4 *yp1, const uchar4 *yp2, const uchar4 *yp3,
                          float xf, float yf, int width) {
-    int startx = (int) floor(xf - 2);
+    int startx = (int) floor(xf - 1);
     xf = xf - floor(xf);
     int maxx = width - 1;
     int xs0 = rsMax(0, startx + 0);
@@ -112,13 +112,13 @@ static uchar4 OneBiCubic(const uchar4 *yp0, const uchar4 *yp1, const uchar4 *yp2
                                   convert_float4(yp3[xs3]), xf);
 
     float4 p  = cubicInterpolate(p0, p1, p2, p3, yf);
-    p = clamp(p, 0.f, 255.f);
+    p = clamp(p + 0.5f, 0.f, 255.f);
     return convert_uchar4(p);
 }
 
 static uchar2 OneBiCubic(const uchar2 *yp0, const uchar2 *yp1, const uchar2 *yp2, const uchar2 *yp3,
                          float xf, float yf, int width) {
-    int startx = (int) floor(xf - 2);
+    int startx = (int) floor(xf - 1);
     xf = xf - floor(xf);
     int maxx = width - 1;
     int xs0 = rsMax(0, startx + 0);
@@ -147,13 +147,13 @@ static uchar2 OneBiCubic(const uchar2 *yp0, const uchar2 *yp1, const uchar2 *yp2
                                   convert_float2(yp3[xs3]), xf);
 
     float2 p  = cubicInterpolate(p0, p1, p2, p3, yf);
-    p = clamp(p, 0.f, 255.f);
+    p = clamp(p + 0.5f, 0.f, 255.f);
     return convert_uchar2(p);
 }
 
 static uchar OneBiCubic(const uchar *yp0, const uchar *yp1, const uchar *yp2, const uchar *yp3,
                         float xf, float yf, int width) {
-    int startx = (int) floor(xf - 2);
+    int startx = (int) floor(xf - 1);
     xf = xf - floor(xf);
     int maxx = width - 1;
     int xs0 = rsMax(0, startx + 0);
@@ -171,7 +171,7 @@ static uchar OneBiCubic(const uchar *yp0, const uchar *yp1, const uchar *yp2, co
                                  (float)yp3[xs2], (float)yp3[xs3], xf);
 
     float p  = cubicInterpolate(p0, p1, p2, p3, yf);
-    p = clamp(p, 0.f, 255.f);
+    p = clamp(p + 0.5f, 0.f, 255.f);
     return (uchar)p;
 }
 
@@ -189,8 +189,8 @@ void RsdCpuScriptIntrinsicResize::kernelU4(const RsForEachStubParamStruct *p,
     const int srcWidth = cp->mAlloc->mHal.drvState.lod[0].dimX;
     const size_t stride = cp->mAlloc->mHal.drvState.lod[0].stride;
 
-    float yf = p->y * cp->scaleY;
-    int starty = (int) floor(yf - 2);
+    float yf = (p->y + 0.5f) * cp->scaleY - 0.5f;
+    int starty = (int) floor(yf - 1);
     yf = yf - floor(yf);
     int maxy = srcHeight - 1;
     int ys0 = rsMax(0, starty + 0);
@@ -208,7 +208,7 @@ void RsdCpuScriptIntrinsicResize::kernelU4(const RsForEachStubParamStruct *p,
     uint32_t x2 = xend;
 
     while(x1 < x2) {
-        float xf = x1 * cp->scaleX;
+        float xf = (x1 + 0.5f) * cp->scaleX - 0.5f;
         *out = OneBiCubic(yp0, yp1, yp2, yp3, xf, yf, srcWidth);
         out++;
         x1++;
@@ -229,8 +229,8 @@ void RsdCpuScriptIntrinsicResize::kernelU2(const RsForEachStubParamStruct *p,
     const int srcWidth = cp->mAlloc->mHal.drvState.lod[0].dimX;
     const size_t stride = cp->mAlloc->mHal.drvState.lod[0].stride;
 
-    float yf = p->y * cp->scaleY;
-    int starty = (int) floor(yf - 2);
+    float yf = (p->y + 0.5f) * cp->scaleY - 0.5f;
+    int starty = (int) floor(yf - 1);
     yf = yf - floor(yf);
     int maxy = srcHeight - 1;
     int ys0 = rsMax(0, starty + 0);
@@ -248,7 +248,7 @@ void RsdCpuScriptIntrinsicResize::kernelU2(const RsForEachStubParamStruct *p,
     uint32_t x2 = xend;
 
     while(x1 < x2) {
-        float xf = x1 * cp->scaleX;
+        float xf = (x1 + 0.5f) * cp->scaleX - 0.5f;
         *out = OneBiCubic(yp0, yp1, yp2, yp3, xf, yf, srcWidth);
         out++;
         x1++;
@@ -269,8 +269,8 @@ void RsdCpuScriptIntrinsicResize::kernelU1(const RsForEachStubParamStruct *p,
     const int srcWidth = cp->mAlloc->mHal.drvState.lod[0].dimX;
     const size_t stride = cp->mAlloc->mHal.drvState.lod[0].stride;
 
-    float yf = p->y * cp->scaleY;
-    int starty = (int) floor(yf - 2);
+    float yf = (p->y + 0.5f) * cp->scaleY - 0.5f;
+    int starty = (int) floor(yf - 1);
     yf = yf - floor(yf);
     int maxy = srcHeight - 1;
     int ys0 = rsMax(0, starty + 0);
@@ -288,7 +288,7 @@ void RsdCpuScriptIntrinsicResize::kernelU1(const RsForEachStubParamStruct *p,
     uint32_t x2 = xend;
 
     while(x1 < x2) {
-        float xf = x1 * cp->scaleX;
+        float xf = (x1 + 0.5f) * cp->scaleX - 0.5f;
         *out = OneBiCubic(yp0, yp1, yp2, yp3, xf, yf, srcWidth);
         out++;
         x1++;
diff --git a/cpu_ref/rsCpuIntrinsicYuvToRGB.cpp b/cpu_ref/rsCpuIntrinsicYuvToRGB.cpp
index c53ef313..e191e25d 100644
--- a/cpu_ref/rsCpuIntrinsicYuvToRGB.cpp
+++ b/cpu_ref/rsCpuIntrinsicYuvToRGB.cpp
@@ -161,8 +161,8 @@ void RsdCpuScriptIntrinsicYuvToRGB::kernel(const RsForEachStubParamStruct *p,
         out++;
         x1++;
     }
-// reenable for ARM64 when intrinsic is fixed
-#if defined(ARCH_ARM_USE_INTRINSICS) && !defined(ARCH_ARM64_USE_INTRINSICS)
+
+#if defined(ARCH_ARM_USE_INTRINSICS)
     if((x2 > x1) && gArchUseSIMD) {
         int32_t len = x2 - x1;
         if (cstep == 1) {
diff --git a/cpu_ref/rsCpuIntrinsics_advsimd_Blur.S b/cpu_ref/rsCpuIntrinsics_advsimd_Blur.S
index 929f76f7..fc1eefee 100644
--- a/cpu_ref/rsCpuIntrinsics_advsimd_Blur.S
+++ b/cpu_ref/rsCpuIntrinsics_advsimd_Blur.S
@@ -52,17 +52,17 @@
  *      x6 -- rup
  *      x7 -- rdn
  *      x12 -- switch index
- *      q0-q3 -- coefficient table
+ *      v0-v3 -- coefficient table
  *      x13 = -pitch
  *      x15 = top-row in
  *      x19 = bottom-row in
  * Output:
  *      x1 += 16
- *      q10,q11 -- 16 convolved columns
+ *      v10,v11 -- 16 convolved columns
  * Modifies:
  *      x10 = upper row pointer
  *      x11 = lower row pointer
- *      q12-q15 = temporary sums
+ *      v12-v15 = temporary sums
  */
 .macro fetch, max_r=MAX_R, labelc=1, labelnc=2, reg=x12 /*{{{*/
   .ifc \reg,x12 ; .set cc, 1 ; .else ; .set cc, 0 ; .endif
@@ -146,15 +146,15 @@ nop
  * When the buffer gets too big the buffer at [x9] is used.
  *
  * Input:
- *      q4-q11 -- convoltion window
+ *      v16-v31,v4-v11 -- convolution window
  *      x9 -- pointer to additional convolution window data
  * Output:
  *      x9 -- updated buffer pointer (if used)
  *      d31 -- result to be stored
  * Modifies:
  *      x12 -- temp buffer pointer
- *      q12-q13 -- temporaries for load and vext operations.
- *      q14-q15 -- intermediate sums
+ *      v12-v13 -- temporaries for load and vext operations.
+ *      v14-v15 -- intermediate sums
  */
 #define TUNED_LIST1 8, 16
 .macro hconv1_8/*{{{*/
@@ -407,7 +407,7 @@ nop
             umlal2      v15.4s, v12.8h, v3.h[1]
             umlal       v14.4s, v13.4h, v3.h[1]
             umlal2      v15.4s, v13.8h, v3.h[1]
-    124:    ext         v12.16b, v3.16b, v4.16b, #7*2
+    124:    ext         v12.16b, v31.16b, v4.16b, #7*2
             ext         v13.16b, v9.16b, v10.16b, #7*2
             umlal       v14.4s, v12.4h, v3.h[0]
             umlal2      v15.4s, v12.8h, v3.h[0]
@@ -1055,64 +1055,47 @@ PRIVATE(fetch_generic_asm)
             ret
 END(fetch_generic_asm)
 
-/* Given values in q10 and q11, and an index in x11, sweep the (x11&15)th value
+/* Given values in v10 and v11, and an index in x11, sweep the (x11&15)th value
  * across to fill the rest of the register pair.  Used for filling the right
  * hand edge of the window when starting too close to the right hand edge of
  * the image.
+ * Also returns a dup-ed copy of the last element in v12 for the tail-fill
+ * case (this happens incidentally in the common path, but must be done
+ * deliberately in the fast-out path).
  */
-PRIVATE(prefetch_clamp1)
-            sub         x11, xzr, x11
-            sub         x15, x15, x1
-            sub         x19, x19, x1
-            tbz         x11, #3, 1f
-            mov         v11.16b, v10.16b
-            sub         x1, x1, #16
-1:          mov         v12.16b, v11.16b
-            movi        v13.8b, #0xff
-            tbz         x11, #2, 1f
-            ext         v12.16b, v12.16b, v12.16b, #4*2
-            sub         x1, x1, #8
-            shl         v13.2d, v13.2d, #32
-1:          tbz         x11, #1, 1f
-            ext         v12.16b, v12.16b, v12.16b, #6*2
-            sub         x1, x1, #4
-            shl         v13.2d, v13.2d, #16
-1:          tbz         x11, #0, 1f
-            ext         v12.16b, v12.16b, v12.16b, #7*2
-            sub         x1, x1, #2
-            shl         v13.2d, v13.2d, #8
-1:          dup         v12.8h, v12.h[6]
-            sxtl        v13.8h, v13.8b
-            bif         v11.16b, v12.16b, v13.16b
-1:          tbz         x11, #3, 1f
-            mov         v10.16b, v11.16b
-            mov         v11.16b, v12.16b
-1:          sub         x11, xzr, x11
-            add         x15, x15, x1
-            add         x19, x19, x1
+PRIVATE(prefetch_clampright1)
+            ands        x12, x11, #15              // x12 = x11 & 15; Z flag set when it is zero
+            beq         1f                         // zero: nothing to sweep, take the fast-out path
+            sub         x12, x12, #1               // index of the halfword to replicate
+            sub         sp, sp, #64                // 64 bytes of stack scratch (32 used + 32 overrun room)
+            st1         {v10.8h,v11.8h}, [sp]      // spill the 16 halfwords of v10:v11
+            add         x12, sp, x12, LSL #1       // address of the selected halfword in the spill
+            ld1r        {v12.8h}, [x12]            // v12 = that halfword in all 8 lanes (the pad value)
+            st1         {v12.8h}, [x12], #16       // overwrite from the selected element onwards...
+            st1         {v12.8h}, [x12]            // ...32 bytes in total, running into the spare scratch
+            ld1         {v10.8h,v11.8h}, [sp]      // reload v10:v11, now with the tail filled
+            add         sp, sp, #64                // release the scratch area
+            ret
+1:          dup         v12.8h, v11.h[7]           // fast-out: v12 = last element of v11 in all lanes
+            ret
+END(prefetch_clampright1)
+
+PRIVATE(prefetch_clampright4)
+            ands        x12, x11, #15
+            beq         1f
+            sub         x12, x12, #4
+            sub         sp, sp, #64
+            st1         {v10.8h,v11.8h}, [sp]
+            add         x12, sp, x12, LSL #1
+            ld1r        {v12.2d}, [x12]
+            st1         {v12.8h}, [x12], #16
+            st1         {v12.8h}, [x12]
+            ld1         {v10.8h,v11.8h}, [sp]
+            add         sp, sp, #64
             ret
-END(prefetch_clamp1)
-
-PRIVATE(prefetch_clamp4)
-            sub         x11, xzr, x11
-            sub         x15, x15, x1
-            sub         x19, x19, x1
-            tbz         x11, #3, 1f
-            sub         x1, x1, #16     // what's this?
-            mov         v11.16b, v10.16b
 1:          dup         v12.2d, v11.d[1]
-            tbz         x11, #2, 1f
-            dup         v12.2d, v11.d[0]
-            sub         x1, x1, #8
-            dup         v11.2d, v11.d[0]
-1:          tbz         x11, #3, 1f
-            mov         v10.16b, v11.16b
-            mov         v11.16b, v12.16b
-1:          sub         x11, xzr, x11
-            add         x15, x15, x1
-            add         x19, x19, x1
             ret
-END(prefetch_clamp4)
+END(prefetch_clampright4)
 
 
 /* Helpers for prefetch, below.
@@ -1147,10 +1130,10 @@ END(prefetch_clamp4)
             prefetch_out \qa, \qb, \store, v10.16b, v11.16b, v11.d[1]
             bl          fetch_generic_asm
             b           2f
-3:          bl          prefetch_clamp\step
+3:          bl          prefetch_clampright\step
             prefetch_out \qa, \qb, \store, v10.16b, v11.16b, v11.d[1]
 4:          b           4f+4
-           //v12 contains pad word from prefetch_clamp call
+           //v12 contains pad word from prefetch_clampright call
             prefetch_out \qa, \qb, \store, v12.16b, v12.16b, v12.d[1]
   .if \rem > 0
             b           4f+4
@@ -1209,24 +1192,18 @@ END(prefetch_clamp4)
   .else
             dup         v9.2d, v10.d[0]
   .endif
-            tst         x10, #15
+            ands        x12, x10, #15
             beq         2f
-            sub         x12, xzr, x10
-            tbz         x10, #3, 1f
-            mov         v11.16b, v10.16b
-            mov         v10.16b, v9.16b
-1:          tbz         x12, #2, 1f
-            ext         v11.16b, v10.16b, v11.16b, #4*2
-            ext         v10.16b, v9.16b, v10.16b, #4*2
-  .if \step == 1
-  1:        tbz         x12, #1, 1f
-            ext         v11.16b, v10.16b, v11.16b, #2*2
-            ext         v10.16b, v9.16b, v10.16b, #2*2
-  1:        tbz         x12, #0, 1f
-            ext         v11.16b, v10.16b, v11.16b, #1*2
-            ext         v10.16b, v9.16b, v10.16b, #1*2
-  .endif
-1:          sub         x1, x1, x10
+            sub         sp, sp, #32
+            st1         {v10.8h,v11.8h}, [sp]
+            sub         x12, sp, x12, LSL #1
+            sub         sp, sp, #16
+            st1         {v9.8h}, [sp]
+            sub         sp, sp, #16
+            st1         {v9.8h}, [sp]
+            ld1         {v10.8h,v11.8h}, [x12]
+            add         sp, sp, #64
+            sub         x1, x1, x10
             sub         x15, x15, x10
             sub         x19, x19, x10
             bic         x10, x10, #15
@@ -1363,13 +1340,13 @@ END(prefetch_clamp4)
             b           3b
 4:          tbz         x3, #2, 1f
             st1         {v15.s}[0], [x0], #4
-            ext         v15.16b, v15.16b, v15.16b, #4*2
+            ext         v15.8b, v15.8b, v15.8b, #4
 1:          tbz         x3, #1, 1f
             st1         {v15.h}[0], [x0], #2
-            ext         v15.16b, v15.16b, v15.16b, #2*2
+            ext         v15.8b, v15.8b, v15.8b, #2
 1:          tbz         x3, #0, 5f
             st1         {v15.b}[0], [x0], #1
-            ext         v15.16b, v15.16b, v15.16b, #1*2
+            ext         v15.8b, v15.8b, v15.8b, #1
 5:          nop
 .endm
 
@@ -1438,7 +1415,6 @@ ENTRY(rsdIntrinsicBlurU1_K)
 
             ldr         x12, [sp, #88] // tab
 
-            add         x0, x0, x8
             add         x1, x1, x8
 
             cmp         x6, x5
@@ -1448,7 +1424,7 @@ ENTRY(rsdIntrinsicBlurU1_K)
             cmp         x8, x5
             csel        x8, x5, x8, hs
             cmp         x9, x5
-            csel        x9, x5, x8, hs
+            csel        x9, x5, x9, hs
 
             add         x4, x8, x9
             add         x4, x4, x3
@@ -1504,7 +1480,6 @@ ENTRY(rsdIntrinsicBlurU4_K)
 
             ldr         x12, [sp, #88]
 
-            add         x0, x0, x8, LSL #2
             add         x1, x1, x8, LSL #2
 
             cmp         x6, x5
diff --git a/cpu_ref/rsCpuIntrinsics_advsimd_YuvToRGB.S b/cpu_ref/rsCpuIntrinsics_advsimd_YuvToRGB.S
index 632ef7a4..bb4b7ae3 100644
--- a/cpu_ref/rsCpuIntrinsics_advsimd_YuvToRGB.S
+++ b/cpu_ref/rsCpuIntrinsics_advsimd_YuvToRGB.S
@@ -21,60 +21,127 @@
  * register.  This macro will be called from within several different wrapper
  * variants for different data layouts.  Y data starts with the even and odd
  * bytes split into the low parts of v8 and v9 respectively.  U and V are in
- * v16 and v17.  Working constants are pre-loaded into v13-v15, and v3 is
- * pre-loaded with a constant 0xff alpha channel.
+ * v10 and v11.  Working constants are pre-loaded into v24-v31, and v3 and v7
+ * are pre-loaded with a constant 0xff alpha channel.
  *
  * The complicated arithmetic is the result of refactoring the original
  * equations to avoid 16-bit overflow without losing any precision.
  */
-.macro yuvkern
-        movi        v7.8b, #149
-
-        umull       v1.8h, v8.8b, v7.8b        // g0 = y0 * 149
-        umull       v5.8h, v9.8b, v7.8b        // g1 = y1 * 149
-
-        movi        v7.8b, #50
-        movi        v10.8b, #104
-        umull       v8.8h, v16.8b, v7.8b       // g2 = u * 50 + v * 104
-        umlal       v8.8h, v17.8b, v10.8b
-
-        ushr        v7.8b, v17.8b, #1
-        uaddw       v0.8h, v1.8h, v7.8b        // r0 = y0 * 149 + (v >> 1)
-        uaddw       v4.8h, v5.8h, v7.8b        // r1 = y1 * 149 + (v >> 1)
-
-        ushll       v7.8h, v16.8b, #2
-        add         v2.8h, v1.8h, v7.8h        // b0 = y0 * 149 + (u << 2)
-        add         v6.8h, v5.8h, v7.8h        // b1 = y1 * 149 + (u << 2)
-
-        movi        v7.16b, #204
-        movi        v10.8b, #254
-        umull       v11.8h, v17.8b, v7.8b     // r2 = v * 204
-        umull       v12.8h, v16.8b, v10.8b      // b2 = u * 254
-
-        uhadd       v0.8h, v0.8h, v11.8h       // r0 = (r0 + r2) >> 1
-        uhadd       v4.8h, v4.8h, v11.8h       // r1 = (r1 + r2) >> 1
-        uqadd       v1.8h, v1.8h, v14.8h       // g0 = satu16(g0 + (-16 * 149 + 128 * 50 + 128 * 104) >> 0)
-        uqadd       v5.8h, v5.8h, v14.8h       // g1 = satu16(g1 + (-16 * 149 + 128 * 50 + 128 * 104) >> 0)
-        uhadd       v2.8h, v2.8h, v12.8h       // b0 = (b0 + b2) >> 1
-        uhadd       v6.8h, v6.8h, v12.8h       // b1 = (b1 + b2) >> 1
-
-        uqsub       v0.8h, v0.8h, v13.8h       // r0 = satu16(r0 - (16 * 149 + (128 >> 1) + 128 * 204) >> 1)
-        uqsub       v4.8h, v4.8h, v13.8h       // r1 = satu16(r1 - (16 * 149 + (128 >> 1) + 128 * 204) >> 1)
-        uqsub       v1.8h, v1.8h, v8.8h        // g0 = satu16(g0 - g2)
-        uqsub       v5.8h, v5.8h, v8.8h        // g1 = satu16(g1 - g2)
-        uqsub       v2.8h, v2.8h, v15.8h       // b0 = satu16(b0 - (16 * 149 + (128 << 2) + 128 * 254) >> 1)
-        uqsub       v6.8h, v6.8h, v15.8h       // b1 = satu16(b1 - (16 * 149 + (128 << 2) + 128 * 254) >> 1)
-
-        uqrshrn     v0.8b, v0.8h, #6
-        uqrshrn     v4.8b, v4.8h, #6
-        uqrshrn     v1.8b, v1.8h, #7
-        uqrshrn     v5.8b, v5.8h, #7
-        uqrshrn     v2.8b, v2.8h, #6
-        uqrshrn     v6.8b, v6.8h, #6
-
-        zip1        v0.16b, v0.16b, v4.16b
-        zip1        v1.16b, v1.16b, v5.16b
-        zip1        v2.16b, v2.16b, v6.16b
+.macro yuvkern, regu=v10, regv=v11
+        /* v0   out R_lo / even R_lo accumulator
+         * v1   out G_lo / even G_lo accumulator
+         * v2   out B_lo / even B_lo accumulator
+         * v3   out A_lo / const 0xff*ff
+         * v4   out R_hi / even R_hi accumulator
+         * v5   out G_hi / even G_hi accumulator
+         * v6   out B_hi / even B_hi accumulator
+         * v7   out A_hi / const 0xff*ff
+         * v8   even Y   / G_lo chroma tmp (g2)
+         * v9   odd Y    / G_hi chroma tmp (g2_hi)
+         * \regu in U
+         * \regv in V
+         * v12  R_lo chroma tmp (r2)
+         * v13  B_lo chroma tmp (b2)
+         * v14  R_hi chroma tmp (r2_hi)
+         * v15  B_hi chroma tmp (b2_hi)
+         * v16  odd R_lo accumulator
+         * v17  odd G_lo accumulator
+         * v18  odd B_lo accumulator
+         * v19  multiplier extra bits low (v >> 1, then u << 2)
+         * v20  odd R_hi accumulator
+         * v21  odd G_hi accumulator
+         * v22  odd B_hi accumulator
+         * v23  multiplier extra bits high (u_hi << 2)
+         * v24  constant 149
+         * v25  constant 50
+         * v26  constant 104
+         * v27  constant 204
+         * v28  constant 254
+         * v29  constant ((16 * 149 + (128 >> 1) + 128 * 204) >> 1)
+         * v30  constant ((-16 * 149 + 128 * 50 + 128 * 104) >> 0)
+         * v31  constant ((16 * 149 + (128 << 2) + 128 * 254) >> 1)
+         */
+
+        umull       v1.8h,  v8.8b,  v24.8b      // g0 = y0 * 149
+        umull       v17.8h, v9.8b,  v24.8b      // g1 = y1 * 149
+        umull2      v5.8h,  v8.16b, v24.16b     // g0_hi = y0_hi * 149
+        umull2      v21.8h, v9.16b, v24.16b     // g1_hi = y1_hi * 149
+
+        umull       v8.8h, \regu\().8b, v25.8b     // g2 = u * 50 + v * 104
+        umlal       v8.8h, \regv\().8b, v26.8b
+        umull2      v9.8h, \regu\().16b, v25.16b   // g2_hi = u_hi * 50 + v_hi * 104
+        umlal2      v9.8h, \regv\().16b, v26.16b
+
+        ushr        v19.16b, \regv\().16b, #1   // v >> 1 for all 16 bytes (low and high halves)
+        uaddw       v0.8h,  v1.8h,  v19.8b      // r0 = g0 + (v >> 1)
+        uaddw       v16.8h, v17.8h, v19.8b      // r1 = g1 + (v >> 1)
+
+        uaddw2      v4.8h,  v5.8h,  v19.16b     // r0_hi = g0_hi + (v_hi >> 1)
+        uaddw2      v20.8h, v21.8h, v19.16b     // r1_hi = g1_hi + (v_hi >> 1)
+
+        ushll       v19.8h, \regu\().8b,  #2    // u << 2, widened to 16 bits (v19 is reused)
+        ushll2      v23.8h, \regu\().16b, #2    // u_hi << 2, widened to 16 bits
+        add         v2.8h,  v1.8h,  v19.8h      // b0 = g0 + (u << 2)
+        add         v18.8h, v17.8h, v19.8h      // b1 = g1 + (u << 2)
+
+        add         v6.8h,  v5.8h,  v23.8h      // b0_hi = g0_hi + (u_hi << 2)
+        add         v22.8h, v21.8h, v23.8h      // b1_hi = g1_hi + (u_hi << 2)
+
+        umull       v12.8h, \regv\().8b, v27.8b    // r2 = v * 204
+        umull       v13.8h, \regu\().8b, v28.8b    // b2 = u * 254
+
+        umull2      v14.8h, \regv\().16b, v27.16b  // r2_hi = v_hi * 204
+        umull2      v15.8h, \regu\().16b, v28.16b  // b2_hi = u_hi * 254
+
+        uhadd       v0.8h,  v0.8h,  v12.8h      // r0 = (r0 + r2) >> 1
+        uhadd       v16.8h, v16.8h, v12.8h      // r1 = (r1 + r2) >> 1
+        uqadd       v1.8h,  v1.8h,  v30.8h      // g0 = satu16(g0 + (-16 * 149 + 128 * 50 + 128 * 104) >> 0)
+        uqadd       v17.8h, v17.8h, v30.8h      // g1 = satu16(g1 + (-16 * 149 + 128 * 50 + 128 * 104) >> 0)
+        uhadd       v2.8h,  v2.8h,  v13.8h      // b0 = (b0 + b2) >> 1
+        uhadd       v18.8h, v18.8h, v13.8h      // b1 = (b1 + b2) >> 1
+
+        uhadd       v4.8h,  v4.8h,  v14.8h      // r0_hi = (r0_hi + r2_hi) >> 1
+        uhadd       v20.8h, v20.8h, v14.8h      // r1_hi = (r1_hi + r2_hi) >> 1
+        uqadd       v5.8h,  v5.8h,  v30.8h      // g0_hi = satu16(g0_hi + (-16 * 149 + 128 * 50 + 128 * 104) >> 0)
+        uqadd       v21.8h, v21.8h, v30.8h      // g1_hi = satu16(g1_hi + (-16 * 149 + 128 * 50 + 128 * 104) >> 0)
+        uhadd       v6.8h,  v6.8h,  v15.8h      // b0_hi = (b0_hi + b2_hi) >> 1
+        uhadd       v22.8h, v22.8h, v15.8h      // b1_hi = (b1_hi + b2_hi) >> 1
+
+        uqsub       v0.8h,  v0.8h,  v29.8h      // r0 = satu16(r0 - (16 * 149 + (128 >> 1) + 128 * 204) >> 1)
+        uqsub       v16.8h, v16.8h, v29.8h      // r1 = satu16(r1 - (16 * 149 + (128 >> 1) + 128 * 204) >> 1)
+        uqsub       v1.8h,  v1.8h,  v8.8h       // g0 = satu16(g0 - g2)
+        uqsub       v17.8h, v17.8h, v8.8h       // g1 = satu16(g1 - g2)
+        uqsub       v2.8h,  v2.8h,  v31.8h      // b0 = satu16(b0 - (16 * 149 + (128 << 2) + 128 * 254) >> 1)
+        uqsub       v18.8h, v18.8h, v31.8h      // b1 = satu16(b1 - (16 * 149 + (128 << 2) + 128 * 254) >> 1)
+
+        uqsub       v4.8h,  v4.8h,  v29.8h      // r0_hi = satu16(r0_hi - (16 * 149 + (128 >> 1) + 128 * 204) >> 1)
+        uqsub       v20.8h, v20.8h, v29.8h      // r1_hi = satu16(r1_hi - (16 * 149 + (128 >> 1) + 128 * 204) >> 1)
+        uqsub       v5.8h,  v5.8h,  v9.8h       // g0_hi = satu16(g0_hi - g2_hi)
+        uqsub       v21.8h, v21.8h, v9.8h       // g1_hi = satu16(g1_hi - g2_hi)
+        uqsub       v6.8h,  v6.8h,  v31.8h      // b0_hi = satu16(b0_hi - (16 * 149 + (128 << 2) + 128 * 254) >> 1)
+        uqsub       v22.8h, v22.8h, v31.8h      // b1_hi = satu16(b1_hi - (16 * 149 + (128 << 2) + 128 * 254) >> 1)
+
+        uqrshrn     v0.8b,  v0.8h,  #6          // r0 = satu8((r0 + 32) >> 6)
+        uqrshrn     v16.8b, v16.8h, #6          // r1 = satu8((r1 + 32) >> 6)
+        uqrshrn     v1.8b,  v1.8h,  #7          // g0 = satu8((g0 + 64) >> 7)
+        uqrshrn     v17.8b, v17.8h, #7          // g1 = satu8((g1 + 64) >> 7)
+        uqrshrn     v2.8b,  v2.8h,  #6          // b0 = satu8((b0 + 32) >> 6)
+        uqrshrn     v18.8b, v18.8h, #6          // b1 = satu8((b1 + 32) >> 6)
+
+        uqrshrn     v4.8b,  v4.8h,  #6          // r0_hi = satu8((r0_hi + 32) >> 6)
+        uqrshrn     v20.8b, v20.8h, #6          // r1_hi = satu8((r1_hi + 32) >> 6)
+        uqrshrn     v5.8b,  v5.8h,  #7          // g0_hi = satu8((g0_hi + 64) >> 7)
+        uqrshrn     v21.8b, v21.8h, #7          // g1_hi = satu8((g1_hi + 64) >> 7)
+        uqrshrn     v6.8b,  v6.8h,  #6          // b0_hi = satu8((b0_hi + 32) >> 6)
+        uqrshrn     v22.8b, v22.8h, #6          // b1_hi = satu8((b1_hi + 32) >> 6)
+
+        zip1        v0.16b, v0.16b, v16.16b     // R_lo = even (r0) and odd (r1) bytes re-interleaved
+        zip1        v1.16b, v1.16b, v17.16b     // G_lo = g0/g1 re-interleaved
+        zip1        v2.16b, v2.16b, v18.16b     // B_lo = b0/b1 re-interleaved
+
+        zip1        v4.16b, v4.16b, v20.16b     // R_hi = r0_hi/r1_hi re-interleaved
+        zip1        v5.16b, v5.16b, v21.16b     // G_hi = g0_hi/g1_hi re-interleaved
+        zip1        v6.16b, v6.16b, v22.16b     // B_hi = b0_hi/b1_hi re-interleaved
+.endm
 
 /* Define the wrapper code which will load and store the data, iterate the
@@ -83,50 +150,51 @@
  * being handled.
  */
 .macro wrap_line kernel, interleaved=0, swapuv=0
-
+        movi        v24.16b, #149
+        movi        v25.16b, #50
+        movi        v26.16b, #104
+        movi        v27.16b, #204
+        movi        v28.16b, #254
         mov         w5, #((16 * 149 + (128 >> 1) + 128 * 204) >> 1)
-        dup         v13.8h, w5
+        dup         v29.8h, w5
         mov         w5, #((-16 * 149 + 128 * 50 + 128 * 104) >> 0)
-        dup         v14.8h, w5
+        dup         v30.8h, w5
         mov         w5, #((16 * 149 + (128 << 2) + 128 * 254) >> 1)
-        dup         v15.8h, w5
+        dup         v31.8h, w5
 
         movi        v3.16b, #0xff
+        movi        v7.16b, #0xff
 
-        subs        x2, x2, #16
+        subs        x2, x2, #32
         bhs         1f
         b           2f
 
         .align 4
-1:      ld2         {v8.8b,v9.8b}, [x1], #16
-//      prfm PLDL1STRM, [x1, #256]
+1:      ld2         {v8.16b,v9.16b}, [x1], #32
   .if \interleaved
-    .if \swapuv
-        ld2         {v17.8b,v18.8b}, [x3], #16
-        mov         v16.8b, v18.8b
-    .else
-        ld2         {v16.8b,v17.8b}, [x3], #16
-    .endif
-//      prfm PLD1STRM,  [x3, #256]
+        ld2         {v10.16b,v11.16b}, [x3], #32
   .else
-        ld1         {v16.8b}, [x3], #8
-        ld1         {v17.8b}, [x4], #8
-//      prfm PLD1STRM,  [x3, #128]
-//      prfm PLD1STRM,  [x4, #128]
+        ld1         {v10.16b}, [x3], #16
+        ld1         {v11.16b}, [x4], #16
   .endif
 
+  .if \swapuv
+        \kernel regu=v11, regv=v10
+  .else
         \kernel
+  .endif
 
-        subs        x2, x2, #16
+        subs        x2, x2, #32
 
-        st4         {v0.16b,v1.16b,v2.16b,v3.16b}, [x0], #64
+        st4         {v0.16b - v3.16b}, [x0], #64
+        st4         {v4.16b - v7.16b}, [x0], #64
 
         bhs         1b
 
-2:      adds        x2, x2, #16
+2:      adds        x2, x2, #32
         beq         2f
 
-        /* To handle the tail portion of the data (something less than 16
+        /* To handle the tail portion of the data (something less than 32
          * bytes) load small power-of-two chunks into working registers.  It
          * doesn't matter where they end up in the register; the same process
          * will store them back out using the same positions and the
@@ -135,40 +203,48 @@
          */
         movi        v8.8b, #0
         movi        v9.8b, #0
-        movi        v16.8b, #0
-        movi        v17.8b, #0
+        movi        v10.8b, #0
+        movi        v11.8b, #0
 
-        tbz         x2, #3, 1f
-        ld1         {v9.8b}, [x1], #8
+        tbz         x2, #4, 1f
+        ld1         {v9.16b}, [x1], #16
+  .if \interleaved
+        ld1         {v11.16b}, [x3], #16
+  .else
+        ld1         {v10.d}[1], [x3], #8
+        ld1         {v11.d}[1], [x4], #8
+  .endif
+1:      tbz         x2, #3, 1f
+        ld1         {v8.d}[1], [x1], #8
   .if \interleaved
-        ld1         {v17.8b}, [x3], #8
+        ld1         {v10.d}[1], [x3], #8
   .else
-        ld1         {v16.s}[1], [x3], #4
-        ld1         {v17.s}[1], [x4], #4
+        ld1         {v10.s}[1], [x3], #4
+        ld1         {v11.s}[1], [x4], #4
   .endif
 1:      tbz         x2, #2, 1f
         ld1         {v8.s}[1], [x1], #4
   .if \interleaved
-        ld1         {v16.s}[1], [x3], #4
+        ld1         {v10.s}[1], [x3], #4
   .else
-        ld1         {v16.h}[1], [x3], #2
-        ld1         {v17.h}[1], [x4], #2
+        ld1         {v10.h}[1], [x3], #2
+        ld1         {v11.h}[1], [x4], #2
   .endif
 1:      tbz         x2, #1, 1f
         ld1         {v8.h}[1], [x1], #2
   .if \interleaved
-        ld1         {v16.h}[1], [x3], #2
+        ld1         {v10.h}[1], [x3], #2
   .else
-        ld1         {v16.b}[1], [x3], #1
-        ld1         {v17.b}[1], [x4], #1
+        ld1         {v10.b}[1], [x3], #1
+        ld1         {v11.b}[1], [x4], #1
   .endif
 1:      tbz         x2, #0, 1f
         ld1         {v8.b}[1], [x1], #1
   .if \interleaved
-        ld1         {v16.h}[0], [x3], #2
+        ld1         {v10.h}[0], [x3], #2
   .else
-        ld1         {v16.b}[0], [x3], #1
-        ld1         {v17.b}[0], [x4], #1
+        ld1         {v10.b}[0], [x3], #1
+        ld1         {v11.b}[0], [x4], #1
   .endif
 
         /* One small impediment in the process above is that some of the load
@@ -176,29 +252,38 @@
          * same time as loading only part of a register.  So the data is loaded
          * linearly and unpacked manually at this point if necessary.
          */
-1:      uzp1        v8.16b, v8.16b, v9.16b
+1:      mov         v12.16b, v8.16b
+        uzp1        v8.16b, v12.16b, v9.16b
+        uzp2        v9.16b, v12.16b, v9.16b
   .if \interleaved
-    .if \swapuv
-        uzp1        v16.16b, v17.16b, v16.16b
-    .else
-        uzp1        v16.16b, v16.16b, v17.16b
-    .endif
+        mov         v12.16b, v10.16b
+        uzp1        v10.16b, v12.16b, v11.16b
+        uzp2        v11.16b, v12.16b, v11.16b
   .endif
 
+  .if \swapuv
+        \kernel regu=v11, regv=v10
+  .else
         \kernel
+  .endif
 
         /* As above but with the output; structured stores for partial vectors
          * aren't available, so the data is re-packed first and stored linearly.
          */
-        zip1        v4.16b, v0.16b, v2.16b
-        zip2        v6.16b, v0.16b, v2.16b
-        zip1        v5.16b, v1.16b, v3.16b
-        zip2        v7.16b, v1.16b, v3.16b
-        zip1        v0.16b, v4.16b, v5.16b
-        zip2        v1.16b, v4.16b, v5.16b
-        zip1        v2.16b, v6.16b, v7.16b
-        zip2        v3.16b, v6.16b, v7.16b
-
+        zip1        v16.16b, v0.16b, v2.16b
+        zip2        v18.16b, v0.16b, v2.16b
+        zip1        v17.16b, v1.16b, v3.16b
+        zip2        v19.16b, v1.16b, v3.16b
+        zip1        v0.16b, v16.16b, v17.16b
+        zip2        v1.16b, v16.16b, v17.16b
+        zip1        v2.16b, v18.16b, v19.16b
+        zip2        v3.16b, v18.16b, v19.16b
+
+        /* Luckily v4-v7 don't need to be unzipped because the complete set of
+         * four can be stored using st4. */
+
+        tbz         x2, #4, 1f
+        st4         {v4.16b - v7.16b}, [x0], #64
 1:      tbz         x2, #3, 1f
         st1         {v2.16b,v3.16b}, [x0], #32
 1:      tbz         x2, #2, 1f
@@ -225,7 +310,7 @@ ENTRY(rsdIntrinsicYuv2_K)
         add         x1, x1, x4
         add         x4, x3, x6
         add         x3, x2, x6
-        sub         x2, x5, x6, LSL #2
+        sub         x2, x5, x6, LSL #1
 
         sub         x6, sp, #32
         sub         sp, sp, #64
diff --git a/cpu_ref/rsCpuIntrinsics_neon_Blur.S b/cpu_ref/rsCpuIntrinsics_neon_Blur.S
index 8fc47f5b..a7ae795c 100644
--- a/cpu_ref/rsCpuIntrinsics_neon_Blur.S
+++ b/cpu_ref/rsCpuIntrinsics_neon_Blur.S
@@ -15,6 +15,7 @@
  */
 
 #define ENTRY(f) .text; .align 4; .globl f; .type f,#function; f: .fnstart
+#define PRIVATE(f) .text; .align 4; .type f,#function; f: .fnstart
 #define END(f) .fnend; .size f, .-f;
 
 .eabi_attribute 25,1 @Tag_ABI_align8_preserved
@@ -1049,7 +1050,7 @@
 /* Dedicated function wrapper for the fetch macro, for the cases where
  * performance isn't that important, to keep code size down.
  */
-ENTRY(fetch_generic_asm)
+PRIVATE(fetch_generic_asm)
             push        {r10,r11}
             fetch
             pop         {r10,r11}
@@ -1060,61 +1061,46 @@ END(fetch_generic_asm)
  * across to fill the rest of the register pair.  Used for filling the right
  * hand edge of the window when starting too close to the right hand edge of
  * the image.
+ * Also returns a dup-ed copy of the last element in q12 for the tail-fill
+ * case (this happens incidentally in the common path, but must be done
+ * deliberately in the fast-out path).
  */
-ENTRY(prefetch_clamp1)
-            rsb         r11, r11, #0
-            tst         r11, #8
+PRIVATE(prefetch_clampright1)
+            ands        r12, r11, #15           @ r12 = low four bits of r11; zero takes the fast-out at 1:
             beq         1f
-            vmov.u16    q11, q10
-            sub         r1, r1, #16
-1:          vmov.u16    q12, q11
-            vmov.i8     d26, #0xff
-            tst         r11, #4
-            beq         1f
-            vext.u16    q12, q12, q12, #4
-            sub         r1, r1, #8
-            vshl.u64    d26, d26, #32
-1:          tst         r11, #2
-            beq         1f
-            vext.u16    q12, q12, q12, #6
-            sub         r1, r1, #4
-            vshl.u64    d26, d26, #16
-1:          tst         r11, #1
-            beq         1f
-            vext.u16    q12, q12, q12, #7
-            sub         r1, r1, #2
-            vshl.u64    d26, d26, #8
-1:          vdup.u16    q12, d25[2]
-            vmovl.s8    q13, d26
-            vbif        q11, q12, q13
-1:          tst         r11, #8
-            beq         1f
-            vmov        q10, q11
-            vmov        q11, q12
-1:          rsb         r11, r11, #0
+            sub         r12, r12, #1            @ index of the u16 element to replicate
+            sub         sp, sp, #64             @ scratch: max offset 28 bytes + 32 bytes of fill fits in 64
+            vst1.u16    {q10,q11}, [sp]         @ spill the 16 elements of q10:q11
+            add         r12, sp, r12, LSL #1    @ address of that element within the spill
+            vld1.u16    {d24[]}, [r12]          @ broadcast it to every lane of d24 ...
+            vld1.u16    {d25[]}, [r12]          @ ... and of d25, so q12 is the pad value
+            vst1.u16    {q12}, [r12]!           @ overwrite from that element onwards with pad (r12 += 16)
+            vst1.u16    {q12}, [r12]            @ second 16 bytes of pad, always reaching past the end of q11
+            vld1.u16    {q10,q11}, [sp]         @ reload q10:q11 with the tail now filled
+            add         sp, sp, #64             @ release the scratch area
+            bx          lr
+1:          vdup.u16    q12, d23[3]             @ fast-out: q12 = last element of q11, q10:q11 unchanged
             bx          lr
-END(prefetch_clamp1)
+END(prefetch_clampright1)
 
-ENTRY(prefetch_clamp4)
-            rsb         r11, r11, #0
-            tst         r11, #8
-            beq         1f
-            sub         r1, r1, #16
-            vmov.u16    q11, q10
-1:          vmov        d24, d23
-            tst         r11, #4
-            beq         1f
-            vmov        d24, d22
-            sub         r1, r1, #8
-            vmov        d23, d22
-1:          vmov        d25, d24
-            tst         r11, #8
+PRIVATE(prefetch_clampright4)
+            ands        r12, r11, #15           @ r12 = low four bits of r11; zero takes the fast-out at 1:
             beq         1f
-            vmov        q10, q11
-            vmov        q11, q12
-1:          rsb         r11, r11, #0
+            sub         r12, r12, #4            @ index of the first u16 of the four-element group to replicate
+            sub         sp, sp, #64             @ NOTE(review): presumably r12 was a multiple of 4 - confirm
+            vst1.u16    {q10,q11}, [sp]         @ spill the 16 elements of q10:q11
+            add         r12, sp, r12, LSL #1    @ address of that group within the spill
+            vld1.u64    {d24}, [r12]            @ load the 64-bit group into d24 ...
+            vld1.u64    {d25}, [r12]            @ ... and into d25, so q12 holds it twice
+            vst1.u16    {q12}, [r12]!           @ overwrite from that group onwards with pad (r12 += 16)
+            vst1.u16    {q12}, [r12]            @ second 16 bytes of pad
+            vld1.u16    {q10,q11}, [sp]         @ reload q10:q11 with the tail now filled
+            add         sp, sp, #64             @ release the scratch area
+            bx          lr
+1:          vmov.u16    d24, d23                @ fast-out: q12 = top half of q11, twice;
+            vmov.u16    d25, d23                @ q10:q11 are left unchanged
             bx          lr
-END(prefetch_clamp4)
+END(prefetch_clampright4)
 
 
 /* Helpers for prefetch, below.
@@ -1147,10 +1133,10 @@ END(prefetch_clamp4)
             prefetch_out \qa, \qb, \store, q10, q11, d23
             bl          fetch_generic_asm
             b           2f
-3:          bl          prefetch_clamp\step
+3:          bl          prefetch_clampright\step
             prefetch_out \qa, \qb, \store, q10, q11, d23
 4:          b           4f+4
-            @q12 contains pad word from prefetch_clam call
+            @q12 contains pad word from prefetch_clampright call
             prefetch_out \qa, \qb, \store, q12, q12, d25
   .if \rem > 0
             b           4f+4
@@ -1205,28 +1191,18 @@ END(prefetch_clamp4)
             vmov.u16    d18, d20
             vmov.u16    d19, d20
   .endif
-            tst         r10, #15
+            ands        r12, r10, #15
             beq         2f
-            rsb         r12, r10, #0
-            tst         r10, #8
-            beq         1f
-            vmov.u16    q11, q10
-            vmov.u16    q10, q9
-1:          tst         r12, #4
-            beq         1f
-            vext.u16    q11, q10, q11, #4
-            vext.u16    q10, q9, q10, #4
-  .if \step == 1
-  1:        tst         r12, #2
-            beq         1f
-            vext.u16    q11, q10, q11, #2
-            vext.u16    q10, q9, q10, #2
-  1:        tst         r12, #1
-            beq         1f
-            vext.u16    q11, q10, q11, #1
-            vext.u16    q10, q9, q10, #1
-  .endif
-1:          sub         r1, r1, r10
+            sub         sp, sp, #32
+            vst1.u16    {q10,q11}, [sp]
+            sub         r12, sp, r12, LSL #1
+            sub         sp, sp, #16
+            vst1.u16    {q9}, [sp]
+            sub         sp, sp, #16
+            vst1.u16    {q9}, [sp]
+            vld1.u16    {q10,q11}, [r12]
+            add         sp, sp, #64
+            sub         r1, r1, r10
             bic         r10, r10, #15
             add         r1, r1, r10
 2:
@@ -1383,7 +1359,7 @@ END(prefetch_clamp4)
 .endm
 
 .irep r, TUNED_LIST1, 25
-ENTRY(convolve1_\r)
+PRIVATE(convolve1_\r)
             push        {r12,lr}
 
             sub         r1, r1, r8
@@ -1397,7 +1373,7 @@ END(convolve1_\r)
 .endr
 
 .irep r, TUNED_LIST4, 25
-ENTRY(convolve4_\r)
+PRIVATE(convolve4_\r)
             sub         r12, sp, #0x200
             bic         r9, r12, #0x3fc
             mov         sp, r9
@@ -1447,8 +1423,7 @@ ENTRY(rsdIntrinsicBlurU1_K)
 
             ldr         r12, [sp,#124]
 
-            add         r0, r0, r8 @, LSL #2 /* for blur4 option */
-            add         r1, r1, r8 @, LSL #2 /* for blur4 option */
+            add         r1, r1, r8
 
             cmp         r6, r5
             movhi       r6, r5
@@ -1503,7 +1478,6 @@ ENTRY(rsdIntrinsicBlurU4_K)
 
             ldr         r12, [sp,#124]
 
-            add         r0, r0, r8, LSL #2
             add         r1, r1, r8, LSL #2
 
             cmp         r6, r5
diff --git a/cpu_ref/rsCpuScript.cpp b/cpu_ref/rsCpuScript.cpp
index a11fda19..e8b3fb6d 100644
--- a/cpu_ref/rsCpuScript.cpp
+++ b/cpu_ref/rsCpuScript.cpp
@@ -230,6 +230,11 @@ static void setCompileArguments(std::vector<const char*>* args, const android::S
     args->push_back("-mtriple");
     args->push_back(DEFAULT_TARGET_TRIPLE_STRING);
 
+    // Enable workaround for A53 codegen by default.
+#if defined(__aarch64__) && !defined(DISABLE_A53_WORKAROUND)
+    args->push_back("-aarch64-fix-cortex-a53-835769");
+#endif
+
     // Execute the bcc compiler.
     if (useRSDebugContext) {
         args->push_back("-rs-debug-ctx");