diff options
Diffstat (limited to 'cpu_ref')
| -rw-r--r-- | cpu_ref/Android.mk | 4 | ||||
| -rw-r--r-- | cpu_ref/rsCpuCore.cpp | 2 | ||||
| -rw-r--r-- | cpu_ref/rsCpuIntrinsicBlur.cpp | 6 | ||||
| -rw-r--r-- | cpu_ref/rsCpuIntrinsicColorMatrix.cpp | 20 | ||||
| -rw-r--r-- | cpu_ref/rsCpuIntrinsicConvolve3x3.cpp | 6 | ||||
| -rw-r--r-- | cpu_ref/rsCpuIntrinsicConvolve5x5.cpp | 6 | ||||
| -rw-r--r-- | cpu_ref/rsCpuIntrinsicResize.cpp | 30 | ||||
| -rw-r--r-- | cpu_ref/rsCpuIntrinsicYuvToRGB.cpp | 4 | ||||
| -rw-r--r-- | cpu_ref/rsCpuIntrinsics_advsimd_Blur.S | 141 | ||||
| -rw-r--r-- | cpu_ref/rsCpuIntrinsics_advsimd_YuvToRGB.S | 293 | ||||
| -rw-r--r-- | cpu_ref/rsCpuIntrinsics_neon_Blur.S | 130 | ||||
| -rw-r--r-- | cpu_ref/rsCpuScript.cpp | 5 |
12 files changed, 347 insertions, 300 deletions
diff --git a/cpu_ref/Android.mk b/cpu_ref/Android.mk index bd276bf2..f041ad96 100644 --- a/cpu_ref/Android.mk +++ b/cpu_ref/Android.mk @@ -42,6 +42,10 @@ LOCAL_SRC_FILES:= \ LOCAL_CFLAGS_arm64 += -DARCH_ARM_USE_INTRINSICS -DARCH_ARM64_USE_INTRINSICS -DARCH_ARM64_HAVE_NEON +ifeq ($(RS_DISABLE_A53_WORKAROUND),true) +LOCAL_CFLAGS_arm64 += -DDISABLE_A53_WORKAROUND +endif + LOCAL_SRC_FILES_arm64 += \ rsCpuIntrinsics_advsimd_3DLUT.S \ rsCpuIntrinsics_advsimd_Convolve.S \ diff --git a/cpu_ref/rsCpuCore.cpp b/cpu_ref/rsCpuCore.cpp index f09e3342..4285dae5 100644 --- a/cpu_ref/rsCpuCore.cpp +++ b/cpu_ref/rsCpuCore.cpp @@ -266,7 +266,7 @@ bool RsdCpuReferenceImpl::init(uint32_t version_major, uint32_t version_minor, GetCpuInfo(); - int cpu = sysconf(_SC_NPROCESSORS_ONLN); + int cpu = sysconf(_SC_NPROCESSORS_CONF); if(mRSC->props.mDebugMaxThreads) { cpu = mRSC->props.mDebugMaxThreads; } diff --git a/cpu_ref/rsCpuIntrinsicBlur.cpp b/cpu_ref/rsCpuIntrinsicBlur.cpp index 123cc9f6..9dccd80d 100644 --- a/cpu_ref/rsCpuIntrinsicBlur.cpp +++ b/cpu_ref/rsCpuIntrinsicBlur.cpp @@ -297,7 +297,7 @@ void RsdCpuScriptIntrinsicBlur::kernelU4(const RsForEachStubParamStruct *p, uint32_t x2 = xend; #if defined(ARCH_ARM_USE_INTRINSICS) - if (gArchUseSIMD && !xstart && (xend == p->dimX)) { + if (gArchUseSIMD) { rsdIntrinsicBlurU4_K(out, (uchar4 const *)(pin + stride * p->y), p->dimX, p->dimY, stride, x1, p->y, x2 - x1, cp->mIradius, cp->mIp + cp->mIradius); return; @@ -367,9 +367,9 @@ void RsdCpuScriptIntrinsicBlur::kernelU1(const RsForEachStubParamStruct *p, uint32_t x2 = xend; #if defined(ARCH_ARM_USE_INTRINSICS) - if (gArchUseSIMD && !xstart && (xend == p->dimX)) { + if (gArchUseSIMD) { rsdIntrinsicBlurU1_K(out, pin + stride * p->y, p->dimX, p->dimY, - stride, 0, p->y, p->dimX, cp->mIradius, cp->mIp + cp->mIradius); + stride, x1, p->y, x2 - x1, cp->mIradius, cp->mIp + cp->mIradius); return; } #endif diff --git a/cpu_ref/rsCpuIntrinsicColorMatrix.cpp b/cpu_ref/rsCpuIntrinsicColorMatrix.cpp index 8c852778..6a7808e7 100644 --- a/cpu_ref/rsCpuIntrinsicColorMatrix.cpp +++ b/cpu_ref/rsCpuIntrinsicColorMatrix.cpp @@ -126,7 +126,7 @@ typedef union { } Key_t; //Re-enable when intrinsic is fixed -#if 0 && defined(ARCH_ARM64_USE_INTRINSICS) +#if defined(ARCH_ARM64_USE_INTRINSICS) typedef struct { void (*column[4])(void); void (*store)(void); @@ -184,7 +184,7 @@ protected: int ipa[4]; float tmpFp[16]; float tmpFpa[4]; -#if 0 && defined(ARCH_ARM64_USE_INTRINSICS) +#if defined(ARCH_ARM64_USE_INTRINSICS) FunctionTab_t mFnTab; #endif @@ -910,16 +910,20 @@ void RsdCpuScriptIntrinsicColorMatrix::kernel(const RsForEachStubParamStruct *p, out += outstep * len; in += instep * len; } -#if 0 && defined(ARCH_ARM64_USE_INTRINSICS) +#if defined(ARCH_ARM64_USE_INTRINSICS) else { if (cp->mLastKey.u.inType == RS_TYPE_FLOAT_32 || cp->mLastKey.u.outType == RS_TYPE_FLOAT_32) { - rsdIntrinsicColorMatrix_float_K(out, in, len, &cp->mFnTab, cp->tmpFp, cp->tmpFpa); + // Currently this generates off by one errors. + //rsdIntrinsicColorMatrix_float_K(out, in, len, &cp->mFnTab, cp->tmpFp, cp->tmpFpa); + //x1 += len; + //out += outstep * len; + //in += instep * len; } else { rsdIntrinsicColorMatrix_int_K(out, in, len, &cp->mFnTab, cp->ip, cp->ipa); + x1 += len; + out += outstep * len; + in += instep * len; } - x1 += len; - out += outstep * len; - in += instep * len; } #endif } @@ -971,7 +975,7 @@ void RsdCpuScriptIntrinsicColorMatrix::preLaunch( if (build(key)) { mOptKernel = (void (*)(void *, const void *, const short *, uint32_t)) mBuf; } -#if 0 && defined(ARCH_ARM64_USE_INTRINSICS) +#if defined(ARCH_ARM64_USE_INTRINSICS) else { int dt = key.u.outVecSize + (key.u.outType == RS_TYPE_FLOAT_32 ? 4 : 0); int st = key.u.inVecSize + (key.u.inType == RS_TYPE_FLOAT_32 ? 4 : 0); diff --git a/cpu_ref/rsCpuIntrinsicConvolve3x3.cpp b/cpu_ref/rsCpuIntrinsicConvolve3x3.cpp index 552a8353..e5953cf3 100644 --- a/cpu_ref/rsCpuIntrinsicConvolve3x3.cpp +++ b/cpu_ref/rsCpuIntrinsicConvolve3x3.cpp @@ -105,7 +105,7 @@ static void ConvolveOneU4(const RsForEachStubParamStruct *p, uint32_t x, uchar4 convert_float4(py2[x]) * coeff[7] + convert_float4(py2[x2]) * coeff[8]; - px = clamp(px, 0.f, 255.f); + px = clamp(px + 0.5f, 0.f, 255.f); uchar4 o = {(uchar)px.x, (uchar)px.y, (uchar)px.z, (uchar)px.w}; *out = o; } @@ -127,7 +127,7 @@ static void ConvolveOneU2(const RsForEachStubParamStruct *p, uint32_t x, uchar2 convert_float2(py2[x]) * coeff[7] + convert_float2(py2[x2]) * coeff[8]; - px = clamp(px, 0.f, 255.f); + px = clamp(px + 0.5f, 0.f, 255.f); *out = convert_uchar2(px); } @@ -147,7 +147,7 @@ static void ConvolveOneU1(const RsForEachStubParamStruct *p, uint32_t x, uchar * ((float)py2[x1]) * coeff[6] + ((float)py2[x]) * coeff[7] + ((float)py2[x2]) * coeff[8]; - *out = clamp(px, 0.f, 255.f); + *out = clamp(px + 0.5f, 0.f, 255.f); } static void ConvolveOneF4(const RsForEachStubParamStruct *p, uint32_t x, float4 *out, diff --git a/cpu_ref/rsCpuIntrinsicConvolve5x5.cpp b/cpu_ref/rsCpuIntrinsicConvolve5x5.cpp index e2a6b8b1..a2c29fd3 100644 --- a/cpu_ref/rsCpuIntrinsicConvolve5x5.cpp +++ b/cpu_ref/rsCpuIntrinsicConvolve5x5.cpp @@ -125,7 +125,7 @@ static void OneU4(const RsForEachStubParamStruct *p, uint32_t x, uchar4 *out, convert_float4(py4[x2]) * coeff[22] + convert_float4(py4[x3]) * coeff[23] + convert_float4(py4[x4]) * coeff[24]; - px = clamp(px, 0.f, 255.f); + px = clamp(px + 0.5f, 0.f, 255.f); *out = convert_uchar4(px); } @@ -168,7 +168,7 @@ static void OneU2(const RsForEachStubParamStruct *p, uint32_t x, uchar2 *out, convert_float2(py4[x2]) * coeff[22] + convert_float2(py4[x3]) * coeff[23] + convert_float2(py4[x4]) * coeff[24]; - px = clamp(px, 0.f, 255.f); + px = clamp(px + 0.5f, 0.f, 255.f); *out = convert_uchar2(px); } @@ -211,7 +211,7 @@ static void OneU1(const RsForEachStubParamStruct *p, uint32_t x, uchar *out, (float)(py4[x2]) * coeff[22] + (float)(py4[x3]) * coeff[23] + (float)(py4[x4]) * coeff[24]; - px = clamp(px, 0.f, 255.f); + px = clamp(px + 0.5f, 0.f, 255.f); *out = px; } diff --git a/cpu_ref/rsCpuIntrinsicResize.cpp b/cpu_ref/rsCpuIntrinsicResize.cpp index 474f82d1..19607c97 100644 --- a/cpu_ref/rsCpuIntrinsicResize.cpp +++ b/cpu_ref/rsCpuIntrinsicResize.cpp @@ -83,7 +83,7 @@ static float cubicInterpolate(float p0,float p1,float p2,float p3 , float x) { static uchar4 OneBiCubic(const uchar4 *yp0, const uchar4 *yp1, const uchar4 *yp2, const uchar4 *yp3, float xf, float yf, int width) { - int startx = (int) floor(xf - 2); + int startx = (int) floor(xf - 1); xf = xf - floor(xf); int maxx = width - 1; int xs0 = rsMax(0, startx + 0); @@ -112,13 +112,13 @@ static uchar4 OneBiCubic(const uchar4 *yp0, const uchar4 *yp1, const uchar4 *yp2 convert_float4(yp3[xs3]), xf); float4 p = cubicInterpolate(p0, p1, p2, p3, yf); - p = clamp(p, 0.f, 255.f); + p = clamp(p + 0.5f, 0.f, 255.f); return convert_uchar4(p); } static uchar2 OneBiCubic(const uchar2 *yp0, const uchar2 *yp1, const uchar2 *yp2, const uchar2 *yp3, float xf, float yf, int width) { - int startx = (int) floor(xf - 2); + int startx = (int) floor(xf - 1); xf = xf - floor(xf); int maxx = width - 1; int xs0 = rsMax(0, startx + 0); @@ -147,13 +147,13 @@ static uchar2 OneBiCubic(const uchar2 *yp0, const uchar2 *yp1, const uchar2 *yp2 convert_float2(yp3[xs3]), xf); float2 p = cubicInterpolate(p0, p1, p2, p3, yf); - p = clamp(p, 0.f, 255.f); + p = clamp(p + 0.5f, 0.f, 255.f); return convert_uchar2(p); } static uchar OneBiCubic(const uchar *yp0, const uchar *yp1, const uchar *yp2, const uchar *yp3, float xf, float yf, int width) { - int startx = (int) floor(xf - 2); + int startx = (int) floor(xf - 1); xf = xf - floor(xf); int maxx = width - 1; int xs0 = rsMax(0, startx + 0); @@ -171,7 +171,7 @@ static uchar OneBiCubic(const uchar *yp0, const uchar *yp1, const uchar *yp2, co (float)yp3[xs2], (float)yp3[xs3], xf); float p = cubicInterpolate(p0, p1, p2, p3, yf); - p = clamp(p, 0.f, 255.f); + p = clamp(p + 0.5f, 0.f, 255.f); return (uchar)p; } @@ -189,8 +189,8 @@ void RsdCpuScriptIntrinsicResize::kernelU4(const RsForEachStubParamStruct *p, const int srcWidth = cp->mAlloc->mHal.drvState.lod[0].dimX; const size_t stride = cp->mAlloc->mHal.drvState.lod[0].stride; - float yf = p->y * cp->scaleY; - int starty = (int) floor(yf - 2); + float yf = (p->y + 0.5f) * cp->scaleY - 0.5f; + int starty = (int) floor(yf - 1); yf = yf - floor(yf); int maxy = srcHeight - 1; int ys0 = rsMax(0, starty + 0); @@ -208,7 +208,7 @@ void RsdCpuScriptIntrinsicResize::kernelU4(const RsForEachStubParamStruct *p, uint32_t x2 = xend; while(x1 < x2) { - float xf = x1 * cp->scaleX; + float xf = (x1 + 0.5f) * cp->scaleX - 0.5f; *out = OneBiCubic(yp0, yp1, yp2, yp3, xf, yf, srcWidth); out++; x1++; @@ -229,8 +229,8 @@ void RsdCpuScriptIntrinsicResize::kernelU2(const RsForEachStubParamStruct *p, const int srcWidth = cp->mAlloc->mHal.drvState.lod[0].dimX; const size_t stride = cp->mAlloc->mHal.drvState.lod[0].stride; - float yf = p->y * cp->scaleY; - int starty = (int) floor(yf - 2); + float yf = (p->y + 0.5f) * cp->scaleY - 0.5f; + int starty = (int) floor(yf - 1); yf = yf - floor(yf); int maxy = srcHeight - 1; int ys0 = rsMax(0, starty + 0); @@ -248,7 +248,7 @@ void RsdCpuScriptIntrinsicResize::kernelU2(const RsForEachStubParamStruct *p, uint32_t x2 = xend; while(x1 < x2) { - float xf = x1 * cp->scaleX; + float xf = (x1 + 0.5f) * cp->scaleX - 0.5f; *out = OneBiCubic(yp0, yp1, yp2, yp3, xf, yf, srcWidth); out++; x1++; @@ -269,8 +269,8 @@ void RsdCpuScriptIntrinsicResize::kernelU1(const RsForEachStubParamStruct *p, const int srcWidth = cp->mAlloc->mHal.drvState.lod[0].dimX; const size_t stride = cp->mAlloc->mHal.drvState.lod[0].stride; - float yf = p->y * cp->scaleY; - int starty = (int) floor(yf - 2); + float yf = (p->y + 0.5f) * cp->scaleY - 0.5f; + int starty = (int) floor(yf - 1); yf = yf - floor(yf); int maxy = srcHeight - 1; int ys0 = rsMax(0, starty + 0); @@ -288,7 +288,7 @@ void RsdCpuScriptIntrinsicResize::kernelU1(const RsForEachStubParamStruct *p, uint32_t x2 = xend; while(x1 < x2) { - float xf = x1 * cp->scaleX; + float xf = (x1 + 0.5f) * cp->scaleX - 0.5f; *out = OneBiCubic(yp0, yp1, yp2, yp3, xf, yf, srcWidth); out++; x1++; diff --git a/cpu_ref/rsCpuIntrinsicYuvToRGB.cpp b/cpu_ref/rsCpuIntrinsicYuvToRGB.cpp index c53ef313..e191e25d 100644 --- a/cpu_ref/rsCpuIntrinsicYuvToRGB.cpp +++ b/cpu_ref/rsCpuIntrinsicYuvToRGB.cpp @@ -161,8 +161,8 @@ void RsdCpuScriptIntrinsicYuvToRGB::kernel(const RsForEachStubParamStruct *p, out++; x1++; } -// reenable for ARM64 when intrinsic is fixed -#if defined(ARCH_ARM_USE_INTRINSICS) && !defined(ARCH_ARM64_USE_INTRINSICS) + +#if defined(ARCH_ARM_USE_INTRINSICS) if((x2 > x1) && gArchUseSIMD) { int32_t len = x2 - x1; if (cstep == 1) { diff --git a/cpu_ref/rsCpuIntrinsics_advsimd_Blur.S b/cpu_ref/rsCpuIntrinsics_advsimd_Blur.S index 929f76f7..fc1eefee 100644 --- a/cpu_ref/rsCpuIntrinsics_advsimd_Blur.S +++ b/cpu_ref/rsCpuIntrinsics_advsimd_Blur.S @@ -52,17 +52,17 @@ * x6 -- rup * x7 -- rdn * x12 -- switch index - * q0-q3 -- coefficient table + * v0-v3 -- coefficient table * x13 = -pitch * x15 = top-row in * x19 = bottom-row in * Output: * x1 += 16 - * q10,q11 -- 16 convolved columns + * v10,v11 -- 16 convolved columns * Modifies: * x10 = upper row pointer * x11 = lower row pointer - * q12-q15 = temporary sums + * v12-v15 = temporary sums */ .macro fetch, max_r=MAX_R, labelc=1, labelnc=2, reg=x12 /*{{{*/ .ifc \reg,x12 ; .set cc, 1 ; .else ; .set cc, 0 ; .endif @@ -146,15 +146,15 @@ nop * When the buffer gets too big the buffer at [x9] is used. * * Input: - * q4-q11 -- convoltion window + * v16-v31,v4-v11 -- convoltion window * x9 -- pointer to additional convolution window data * Output: * x9 -- updated buffer pointer (if used) * d31 -- result to be stored * Modifies: * x12 -- temp buffer pointer - * q12-q13 -- temporaries for load and vext operations. - * q14-q15 -- intermediate sums + * v12-v13 -- temporaries for load and vext operations. + * v14-v15 -- intermediate sums */ #define TUNED_LIST1 8, 16 .macro hconv1_8/*{{{*/ @@ -407,7 +407,7 @@ nop umlal2 v15.4s, v12.8h, v3.h[1] umlal v14.4s, v13.4h, v3.h[1] umlal2 v15.4s, v13.8h, v3.h[1] - 124: ext v12.16b, v3.16b, v4.16b, #7*2 + 124: ext v12.16b, v31.16b, v4.16b, #7*2 ext v13.16b, v9.16b, v10.16b, #7*2 umlal v14.4s, v12.4h, v3.h[0] umlal2 v15.4s, v12.8h, v3.h[0] @@ -1055,64 +1055,47 @@ PRIVATE(fetch_generic_asm) ret END(fetch_generic_asm) -/* Given values in q10 and q11, and an index in x11, sweep the (x11&15)th value +/* Given values in v10 and v11, and an index in x11, sweep the (x11&15)th value * across to fill the rest of the register pair. Used for filling the right * hand edge of the window when starting too close to the right hand edge of * the image. + * Also returns a dup-ed copy of the last element in v12 for the tail-fill + * case (this happens incidentally in common path, but must be done + * deliberately in the fast-out path). */ -PRIVATE(prefetch_clamp1) - sub x11, xzr, x11 - sub x15, x15, x1 - sub x19, x19, x1 - tbz x11, #3, 1f - mov v11.16b, v10.16b - sub x1, x1, #16 -1: mov v12.16b, v11.16b - movi v13.8b, #0xff - tbz x11, #2, 1f - ext v12.16b, v12.16b, v12.16b, #4*2 - sub x1, x1, #8 - shl v13.2d, v13.2d, #32 -1: tbz x11, #1, 1f - ext v12.16b, v12.16b, v12.16b, #6*2 - sub x1, x1, #4 - shl v13.2d, v13.2d, #16 -1: tbz x11, #0, 1f - ext v12.16b, v12.16b, v12.16b, #7*2 - sub x1, x1, #2 - shl v13.2d, v13.2d, #8 -1: dup v12.8h, v12.h[6] - sxtl v13.8h, v13.8b - bif v11.16b, v12.16b, v13.16b -1: tbz x11, #3, 1f - mov v10.16b, v11.16b - mov v11.16b, v12.16b -1: sub x11, xzr, x11 - add x15, x15, x1 - add x19, x19, x1 +PRIVATE(prefetch_clampright1) + ands x12, x11, #15 + beq 1f + sub x12, x12, #1 + sub sp, sp, #64 + st1 {v10.8h,v11.8h}, [sp] + add x12, sp, x12, LSL #1 + ld1r {v12.8h}, [x12] + st1 {v12.8h}, [x12], #16 + st1 {v12.8h}, [x12] + ld1 {v10.8h,v11.8h}, [sp] + add sp, sp, #64 + ret +1: dup v12.8h, v11.h[7] + ret +END(prefetch_clampright1) + +PRIVATE(prefetch_clampright4) + ands x12, x11, #15 + beq 1f + sub x12, x12, #4 + sub sp, sp, #64 + st1 {v10.8h,v11.8h}, [sp] + add x12, sp, x12, LSL #1 + ld1r {v12.2d}, [x12] + st1 {v12.8h}, [x12], #16 + st1 {v12.8h}, [x12] + ld1 {v10.8h,v11.8h}, [sp] + add sp, sp, #64 ret -END(prefetch_clamp1) - -PRIVATE(prefetch_clamp4) - sub x11, xzr, x11 - sub x15, x15, x1 - sub x19, x19, x1 - tbz x11, #3, 1f - sub x1, x1, #16 // what's this? - mov v11.16b, v10.16b 1: dup v12.2d, v11.d[1] - tbz x11, #2, 1f - dup v12.2d, v11.d[0] - sub x1, x1, #8 - dup v11.2d, v11.d[0] -1: tbz x11, #3, 1f - mov v10.16b, v11.16b - mov v11.16b, v12.16b -1: sub x11, xzr, x11 - add x15, x15, x1 - add x19, x19, x1 ret -END(prefetch_clamp4) +END(prefetch_clampright4) /* Helpers for prefetch, below. @@ -1147,10 +1130,10 @@ END(prefetch_clamp4) prefetch_out \qa, \qb, \store, v10.16b, v11.16b, v11.d[1] bl fetch_generic_asm b 2f -3: bl prefetch_clamp\step +3: bl prefetch_clampright\step prefetch_out \qa, \qb, \store, v10.16b, v11.16b, v11.d[1] 4: b 4f+4 - //v12 contains pad word from prefetch_clamp call + //v12 contains pad word from prefetch_clampright call prefetch_out \qa, \qb, \store, v12.16b, v12.16b, v12.d[1] .if \rem > 0 b 4f+4 @@ -1209,24 +1192,18 @@ END(prefetch_clamp4) .else dup v9.2d, v10.d[0] .endif - tst x10, #15 + ands x12, x10, #15 beq 2f - sub x12, xzr, x10 - tbz x10, #3, 1f - mov v11.16b, v10.16b - mov v10.16b, v9.16b -1: tbz x12, #2, 1f - ext v11.16b, v10.16b, v11.16b, #4*2 - ext v10.16b, v9.16b, v10.16b, #4*2 - .if \step == 1 - 1: tbz x12, #1, 1f - ext v11.16b, v10.16b, v11.16b, #2*2 - ext v10.16b, v9.16b, v10.16b, #2*2 - 1: tbz x12, #0, 1f - ext v11.16b, v10.16b, v11.16b, #1*2 - ext v10.16b, v9.16b, v10.16b, #1*2 - .endif -1: sub x1, x1, x10 + sub sp, sp, #32 + st1 {v10.8h,v11.8h}, [sp] + sub x12, sp, x12, LSL #1 + sub sp, sp, #16 + st1 {v9.8h}, [sp] + sub sp, sp, #16 + st1 {v9.8h}, [sp] + ld1 {v10.8h,v11.8h}, [x12] + add sp, sp, #64 + sub x1, x1, x10 sub x15, x15, x10 sub x19, x19, x10 bic x10, x10, #15 @@ -1363,13 +1340,13 @@ END(prefetch_clamp4) b 3b 4: tbz x3, #2, 1f st1 {v15.s}[0], [x0], #4 - ext v15.16b, v15.16b, v15.16b, #4*2 + ext v15.8b, v15.8b, v15.8b, #4 1: tbz x3, #1, 1f st1 {v15.h}[0], [x0], #2 - ext v15.16b, v15.16b, v15.16b, #2*2 + ext v15.8b, v15.8b, v15.8b, #2 1: tbz x3, #0, 5f st1 {v15.b}[0], [x0], #1 - ext v15.16b, v15.16b, v15.16b, #1*2 + ext v15.8b, v15.8b, v15.8b, #1 5: nop .endm @@ -1438,7 +1415,6 @@ ENTRY(rsdIntrinsicBlurU1_K) ldr x12, [sp, #88] // tab - add x0, x0, x8 add x1, x1, x8 cmp x6, x5 @@ -1448,7 +1424,7 @@ ENTRY(rsdIntrinsicBlurU1_K) cmp x8, x5 csel x8, x5, x8, hs cmp x9, x5 - csel x9, x5, x8, hs + csel x9, x5, x9, hs add x4, x8, x9 add x4, x4, x3 @@ -1504,7 +1480,6 @@ ENTRY(rsdIntrinsicBlurU4_K) ldr x12, [sp, #88] - add x0, x0, x8, LSL #2 add x1, x1, x8, LSL #2 cmp x6, x5 diff --git a/cpu_ref/rsCpuIntrinsics_advsimd_YuvToRGB.S b/cpu_ref/rsCpuIntrinsics_advsimd_YuvToRGB.S index 632ef7a4..bb4b7ae3 100644 --- a/cpu_ref/rsCpuIntrinsics_advsimd_YuvToRGB.S +++ b/cpu_ref/rsCpuIntrinsics_advsimd_YuvToRGB.S @@ -21,60 +21,127 @@ * register. This macro will be called from within several different wrapper * variants for different data layouts. Y data starts with the even and odd * bytes split into the low parts of v8 and v9 respectively. U and V are in - * v16 and v17. Working constants are pre-loaded into v13-v15, and v3 is - * pre-loaded with a constant 0xff alpha channel. + * v10 and v11. Working constants are pre-loaded into v24-v31, and v3 and v7 + * are pre-loaded with a constant 0xff alpha channel. * * The complicated arithmetic is the result of refactoring the original * equations to avoid 16-bit overflow without losing any precision. */ -.macro yuvkern - movi v7.8b, #149 - - umull v1.8h, v8.8b, v7.8b // g0 = y0 * 149 - umull v5.8h, v9.8b, v7.8b // g1 = y1 * 149 - - movi v7.8b, #50 - movi v10.8b, #104 - umull v8.8h, v16.8b, v7.8b // g2 = u * 50 + v * 104 - umlal v8.8h, v17.8b, v10.8b - - ushr v7.8b, v17.8b, #1 - uaddw v0.8h, v1.8h, v7.8b // r0 = y0 * 149 + (v >> 1) - uaddw v4.8h, v5.8h, v7.8b // r1 = y1 * 149 + (v >> 1) - - ushll v7.8h, v16.8b, #2 - add v2.8h, v1.8h, v7.8h // b0 = y0 * 149 + (u << 2) - add v6.8h, v5.8h, v7.8h // b1 = y1 * 149 + (u << 2) - - movi v7.16b, #204 - movi v10.8b, #254 - umull v11.8h, v17.8b, v7.8b // r2 = v * 204 - umull v12.8h, v16.8b, v10.8b // b2 = u * 254 - - uhadd v0.8h, v0.8h, v11.8h // r0 = (r0 + r2) >> 1 - uhadd v4.8h, v4.8h, v11.8h // r1 = (r1 + r2) >> 1 - uqadd v1.8h, v1.8h, v14.8h // g0 = satu16(g0 + (-16 * 149 + 128 * 50 + 128 * 104) >> 0) - uqadd v5.8h, v5.8h, v14.8h // g1 = satu16(g1 + (-16 * 149 + 128 * 50 + 128 * 104) >> 0) - uhadd v2.8h, v2.8h, v12.8h // b0 = (b0 + b2) >> 1 - uhadd v6.8h, v6.8h, v12.8h // b1 = (b1 + b2) >> 1 - - uqsub v0.8h, v0.8h, v13.8h // r0 = satu16(r0 - (16 * 149 + (128 >> 1) + 128 * 204) >> 1) - uqsub v4.8h, v4.8h, v13.8h // r1 = satu16(r1 - (16 * 149 + (128 >> 1) + 128 * 204) >> 1) - uqsub v1.8h, v1.8h, v8.8h // g0 = satu16(g0 - g2) - uqsub v5.8h, v5.8h, v8.8h // g1 = satu16(g1 - g2) - uqsub v2.8h, v2.8h, v15.8h // b0 = satu16(b0 - (16 * 149 + (128 << 2) + 128 * 254) >> 1) - uqsub v6.8h, v6.8h, v15.8h // b1 = satu16(b1 - (16 * 149 + (128 << 2) + 128 * 254) >> 1) - - uqrshrn v0.8b, v0.8h, #6 - uqrshrn v4.8b, v4.8h, #6 - uqrshrn v1.8b, v1.8h, #7 - uqrshrn v5.8b, v5.8h, #7 - uqrshrn v2.8b, v2.8h, #6 - uqrshrn v6.8b, v6.8h, #6 - - zip1 v0.16b, v0.16b, v4.16b - zip1 v1.16b, v1.16b, v5.16b - zip1 v2.16b, v2.16b, v6.16b +.macro yuvkern, regu=v10, regv=v11 + /* v0 out R_lo / even R_lo accumulator + * v1 out G_lo / even G_lo accumulator + * v2 out B_lo / even B_lo accumulator + * v3 out A_lo / const 0xff*ff + * v4 out R_hi / even R_hi accumulator + * v5 out G_hi / even G_hi accumulator + * v6 out B_hi / even B_hi accumulator + * v7 out A_hi / const 0xff*ff + * v8 even Y / G_lo luma tmp + * v9 odd Y / G_lo luma tmp + * \regu in U + * \regv in V + * v12 R_lo luma tmp + * v13 B_lo luma tmp + * v14 R_hi luma tmp + * v15 B_hi luma tmp + * v16 odd R_lo accumulator + * v17 odd G_lo accumulator + * v18 odd B_lo accumulator + * v19 multiplier extra bits low + * v20 odd R_hi accumulator + * v21 odd G_hi accumulator + * v22 odd B_hi accumulator + * v23 multiplier extra bits high + * v24 constant 149 + * v25 constant 50 + * v26 constant 104 + * v27 constant 204 + * v28 constant 254 + * v29 constant ((16 * 149 + (128 >> 1) + 128 * 204) >> 1) + * v30 constant ((-16 * 149 + 128 * 50 + 128 * 104) >> 0) + * v31 constant ((16 * 149 + (128 << 2) + 128 * 254) >> 1) + */ + + umull v1.8h, v8.8b, v24.8b // g0 = y0 * 149 + umull v17.8h, v9.8b, v24.8b // g1 = y1 * 149 + umull2 v5.8h, v8.16b, v24.16b // g0_hi = y0_hi * 149 + umull2 v21.8h, v9.16b, v24.16b // g1_hi = y1_hi * 149 + + umull v8.8h, \regu\().8b, v25.8b // g2 = u * 50 + v * 104 + umlal v8.8h, \regv\().8b, v26.8b + umull2 v9.8h, \regu\().16b, v25.16b // g2_hi = u_hi * 50 + v_hi * 104 + umlal2 v9.8h, \regv\().16b, v26.16b + + ushr v19.16b, \regv\().16b, #1 + uaddw v0.8h, v1.8h, v19.8b // r0 = g0 + (v >> 1) + uaddw v16.8h, v17.8h, v19.8b // r1 = g1 + (v >> 1) + + uaddw2 v4.8h, v5.8h, v19.16b // r0_hi = g0_hi + (v_hi >> 1) + uaddw2 v20.8h, v21.8h, v19.16b // r1_hi = g1_hi + (v_hi >> 1) + + ushll v19.8h, \regu\().8b, #2 + ushll2 v23.8h, \regu\().16b, #2 + add v2.8h, v1.8h, v19.8h // b0 = g0 + (u << 2) + add v18.8h, v17.8h, v19.8h // b1 = g1 + (u << 2) + + add v6.8h, v5.8h, v23.8h // b0_hi = g0_hi + (u_hi << 2) + add v22.8h, v21.8h, v23.8h // b1_hi = g1_hi + (u_hi << 2) + + umull v12.8h, \regv\().8b, v27.8b // r2 = v * 204 + umull v13.8h, \regu\().8b, v28.8b // b2 = u * 254 + + umull2 v14.8h, \regv\().16b, v27.16b // r2_hi = v_hi * 204 + umull2 v15.8h, \regu\().16b, v28.16b // b2_hi = u_hi * 254 + + uhadd v0.8h, v0.8h, v12.8h // r0 = (r0 + r2) >> 1 + uhadd v16.8h, v16.8h, v12.8h // r1 = (r1 + r2) >> 1 + uqadd v1.8h, v1.8h, v30.8h // g0 = satu16(g0 + (-16 * 149 + 128 * 50 + 128 * 104) >> 0) + uqadd v17.8h, v17.8h, v30.8h // g1 = satu16(g1 + (-16 * 149 + 128 * 50 + 128 * 104) >> 0) + uhadd v2.8h, v2.8h, v13.8h // b0 = (b0 + b2) >> 1 + uhadd v18.8h, v18.8h, v13.8h // b1 = (b1 + b2) >> 1 + + uhadd v4.8h, v4.8h, v14.8h // r0_hi = (r0_hi + r2_hi) >> 1 + uhadd v20.8h, v20.8h, v14.8h // r1_hi = (r1_hi + r2_hi) >> 1 + uqadd v5.8h, v5.8h, v30.8h // g0_hi = satu16(g0_hi + (-16 * 149 + 128 * 50 + 128 * 104) >> 0) + uqadd v21.8h, v21.8h, v30.8h // g1_hi = satu16(g1_hi + (-16 * 149 + 128 * 50 + 128 * 104) >> 0) + uhadd v6.8h, v6.8h, v15.8h // b0_hi = (b0_hi + b2_hi) >> 1 + uhadd v22.8h, v22.8h, v15.8h // b1_hi = (b1_hi + b2_hi) >> 1 + + uqsub v0.8h, v0.8h, v29.8h // r0 = satu16(r0 - (16 * 149 + (128 >> 1) + 128 * 204) >> 1) + uqsub v16.8h, v16.8h, v29.8h // r1 = satu16(r1 - (16 * 149 + (128 >> 1) + 128 * 204) >> 1) + uqsub v1.8h, v1.8h, v8.8h // g0 = satu16(g0 - g2) + uqsub v17.8h, v17.8h, v8.8h // g1 = satu16(g1 - g2) + uqsub v2.8h, v2.8h, v31.8h // b0 = satu16(b0 - (16 * 149 + (128 << 2) + 128 * 254) >> 1) + uqsub v18.8h, v18.8h, v31.8h // b1 = satu16(b1 - (16 * 149 + (128 << 2) + 128 * 254) >> 1) + + uqsub v4.8h, v4.8h, v29.8h // r0_hi = satu16(r0_hi - (16 * 149 + (128 >> 1) + 128 * 204) >> 1) + uqsub v20.8h, v20.8h, v29.8h // r1_hi = satu16(r1_hi - (16 * 149 + (128 >> 1) + 128 * 204) >> 1) + uqsub v5.8h, v5.8h, v9.8h // g0_hi = satu16(g0_hi - g2_hi) + uqsub v21.8h, v21.8h, v9.8h // g1_hi = satu16(g1_hi - g2_hi) + uqsub v6.8h, v6.8h, v31.8h // b0_hi = satu16(b0_hi - (16 * 149 + (128 << 2) + 128 * 254) >> 1) + uqsub v22.8h, v22.8h, v31.8h // b1_hi = satu16(b1_hi - (16 * 149 + (128 << 2) + 128 * 254) >> 1) + + uqrshrn v0.8b, v0.8h, #6 + uqrshrn v16.8b, v16.8h, #6 + uqrshrn v1.8b, v1.8h, #7 + uqrshrn v17.8b, v17.8h, #7 + uqrshrn v2.8b, v2.8h, #6 + uqrshrn v18.8b, v18.8h, #6 + + uqrshrn v4.8b, v4.8h, #6 + uqrshrn v20.8b, v20.8h, #6 + uqrshrn v5.8b, v5.8h, #7 + uqrshrn v21.8b, v21.8h, #7 + uqrshrn v6.8b, v6.8h, #6 + uqrshrn v22.8b, v22.8h, #6 + + zip1 v0.16b, v0.16b, v16.16b + zip1 v1.16b, v1.16b, v17.16b + zip1 v2.16b, v2.16b, v18.16b + + zip1 v4.16b, v4.16b, v20.16b + zip1 v5.16b, v5.16b, v21.16b + zip1 v6.16b, v6.16b, v22.16b .endm /* Define the wrapper code which will load and store the data, iterate the @@ -83,50 +150,51 @@ * being handled. */ .macro wrap_line kernel, interleaved=0, swapuv=0 - + movi v24.16b, #149 + movi v25.16b, #50 + movi v26.16b, #104 + movi v27.16b, #204 + movi v28.16b, #254 mov w5, #((16 * 149 + (128 >> 1) + 128 * 204) >> 1) - dup v13.8h, w5 + dup v29.8h, w5 mov w5, #((-16 * 149 + 128 * 50 + 128 * 104) >> 0) - dup v14.8h, w5 + dup v30.8h, w5 mov w5, #((16 * 149 + (128 << 2) + 128 * 254) >> 1) - dup v15.8h, w5 + dup v31.8h, w5 movi v3.16b, #0xff + movi v7.16b, #0xff - subs x2, x2, #16 + subs x2, x2, #32 bhs 1f b 2f .align 4 -1: ld2 {v8.8b,v9.8b}, [x1], #16 -// prfm PLDL1STRM, [x1, #256] +1: ld2 {v8.16b,v9.16b}, [x1], #32 .if \interleaved - .if \swapuv - ld2 {v17.8b,v18.8b}, [x3], #16 - mov v16.8b, v18.8b - .else - ld2 {v16.8b,v17.8b}, [x3], #16 - .endif -// prfm PLD1STRM, [x3, #256] + ld2 {v10.16b,v11.16b}, [x3], #32 .else - ld1 {v16.8b}, [x3], #8 - ld1 {v17.8b}, [x4], #8 -// prfm PLD1STRM, [x3, #128] -// prfm PLD1STRM, [x4, #128] + ld1 {v10.16b}, [x3], #16 + ld1 {v11.16b}, [x4], #16 .endif + .if \swapuv + \kernel regu=v11, regv=v10 + .else \kernel + .endif - subs x2, x2, #16 + subs x2, x2, #32 - st4 {v0.16b,v1.16b,v2.16b,v3.16b}, [x0], #64 + st4 {v0.16b - v3.16b}, [x0], #64 + st4 {v4.16b - v7.16b}, [x0], #64 bhs 1b -2: adds x2, x2, #16 +2: adds x2, x2, #32 beq 2f - /* To handle the tail portion of the data (something less than 16 + /* To handle the tail portion of the data (something less than 32 * bytes) load small power-of-two chunks into working registers. It * doesn't matter where they end up in the register; the same process * will store them back out using the same positions and the @@ -135,40 +203,48 @@ */ movi v8.8b, #0 movi v9.8b, #0 - movi v16.8b, #0 - movi v17.8b, #0 + movi v10.8b, #0 + movi v11.8b, #0 - tbz x2, #3, 1f - ld1 {v9.8b}, [x1], #8 + tbz x2, #4, 1f + ld1 {v9.16b}, [x1], #16 + .if \interleaved + ld1 {v11.16b}, [x3], #16 + .else + ld1 {v10.d}[1], [x3], #8 + ld1 {v11.d}[1], [x4], #8 + .endif +1: tbz x2, #3, 1f + ld1 {v8.d}[1], [x1], #8 .if \interleaved - ld1 {v17.8b}, [x3], #8 + ld1 {v10.d}[1], [x3], #8 .else - ld1 {v16.s}[1], [x3], #4 - ld1 {v17.s}[1], [x4], #4 + ld1 {v10.s}[1], [x3], #4 + ld1 {v11.s}[1], [x4], #4 .endif 1: tbz x2, #2, 1f ld1 {v8.s}[1], [x1], #4 .if \interleaved - ld1 {v16.s}[1], [x3], #4 + ld1 {v10.s}[1], [x3], #4 .else - ld1 {v16.h}[1], [x3], #2 - ld1 {v17.h}[1], [x4], #2 + ld1 {v10.h}[1], [x3], #2 + ld1 {v11.h}[1], [x4], #2 .endif 1: tbz x2, #1, 1f ld1 {v8.h}[1], [x1], #2 .if \interleaved - ld1 {v16.h}[1], [x3], #2 + ld1 {v10.h}[1], [x3], #2 .else - ld1 {v16.b}[1], [x3], #1 - ld1 {v17.b}[1], [x4], #1 + ld1 {v10.b}[1], [x3], #1 + ld1 {v11.b}[1], [x4], #1 .endif 1: tbz x2, #0, 1f ld1 {v8.b}[1], [x1], #1 .if \interleaved - ld1 {v16.h}[0], [x3], #2 + ld1 {v10.h}[0], [x3], #2 .else - ld1 {v16.b}[0], [x3], #1 - ld1 {v17.b}[0], [x4], #1 + ld1 {v10.b}[0], [x3], #1 + ld1 {v11.b}[0], [x4], #1 .endif /* One small impediment in the process above is that some of the load @@ -176,29 +252,38 @@ * same time as loading only part of a register. So the data is loaded * linearly and unpacked manually at this point if necessary. */ -1: uzp1 v8.16b, v8.16b, v9.16b +1: mov v12.16b, v8.16b + uzp1 v8.16b, v12.16b, v9.16b + uzp2 v9.16b, v12.16b, v9.16b .if \interleaved - .if \swapuv - uzp1 v16.16b, v17.16b, v16.16b - .else - uzp1 v16.16b, v16.16b, v17.16b - .endif + mov v12.16b, v10.16b + uzp1 v10.16b, v12.16b, v11.16b + uzp2 v11.16b, v12.16b, v11.16b .endif + .if \swapuv + \kernel regu=v11, regv=v10 + .else \kernel + .endif /* As above but with the output; structured stores for partial vectors * aren't available, so the data is re-packed first and stored linearly. */ - zip1 v4.16b, v0.16b, v2.16b - zip2 v6.16b, v0.16b, v2.16b - zip1 v5.16b, v1.16b, v3.16b - zip2 v7.16b, v1.16b, v3.16b - zip1 v0.16b, v4.16b, v5.16b - zip2 v1.16b, v4.16b, v5.16b - zip1 v2.16b, v6.16b, v7.16b - zip2 v3.16b, v6.16b, v7.16b - + zip1 v16.16b, v0.16b, v2.16b + zip2 v18.16b, v0.16b, v2.16b + zip1 v17.16b, v1.16b, v3.16b + zip2 v19.16b, v1.16b, v3.16b + zip1 v0.16b, v16.16b, v17.16b + zip2 v1.16b, v16.16b, v17.16b + zip1 v2.16b, v18.16b, v19.16b + zip2 v3.16b, v18.16b, v19.16b + + /* Luckily v4-v7 don't need to be unzipped because the complete set of + * four and can be stored using st4. */ + + tbz x2, #4, 1f + st4 {v4.16b - v7.16b}, [x0], #64 1: tbz x2, #3, 1f st1 {v2.16b,v3.16b}, [x0], #32 1: tbz x2, #2, 1f @@ -225,7 +310,7 @@ ENTRY(rsdIntrinsicYuv2_K) add x1, x1, x4 add x4, x3, x6 add x3, x2, x6 - sub x2, x5, x6, LSL #2 + sub x2, x5, x6, LSL #1 sub x6, sp, #32 sub sp, sp, #64 diff --git a/cpu_ref/rsCpuIntrinsics_neon_Blur.S b/cpu_ref/rsCpuIntrinsics_neon_Blur.S index 8fc47f5b..a7ae795c 100644 --- a/cpu_ref/rsCpuIntrinsics_neon_Blur.S +++ b/cpu_ref/rsCpuIntrinsics_neon_Blur.S @@ -15,6 +15,7 @@ */ #define ENTRY(f) .text; .align 4; .globl f; .type f,#function; f: .fnstart +#define PRIVATE(f) .text; .align 4; .type f,#function; f: .fnstart #define END(f) .fnend; .size f, .-f; .eabi_attribute 25,1 @Tag_ABI_align8_preserved @@ -1049,7 +1050,7 @@ /* Dedicated function wrapper for the fetch macro, for the cases where * performance isn't that important, to keep code size down. */ -ENTRY(fetch_generic_asm) +PRIVATE(fetch_generic_asm) push {r10,r11} fetch pop {r10,r11} @@ -1060,61 +1061,46 @@ END(fetch_generic_asm) * across to fill the rest of the register pair. Used for filling the right * hand edge of the window when starting too close to the right hand edge of * the image. + * Also returns a dup-ed copy of the last element in q12 for the tail-fill + * case (this happens incidentally in common path, but must be done + * deliberately in the fast-out path). */ -ENTRY(prefetch_clamp1) - rsb r11, r11, #0 - tst r11, #8 +PRIVATE(prefetch_clampright1) + ands r12, r11, #15 beq 1f - vmov.u16 q11, q10 - sub r1, r1, #16 -1: vmov.u16 q12, q11 - vmov.i8 d26, #0xff - tst r11, #4 - beq 1f - vext.u16 q12, q12, q12, #4 - sub r1, r1, #8 - vshl.u64 d26, d26, #32 -1: tst r11, #2 - beq 1f - vext.u16 q12, q12, q12, #6 - sub r1, r1, #4 - vshl.u64 d26, d26, #16 -1: tst r11, #1 - beq 1f - vext.u16 q12, q12, q12, #7 - sub r1, r1, #2 - vshl.u64 d26, d26, #8 -1: vdup.u16 q12, d25[2] - vmovl.s8 q13, d26 - vbif q11, q12, q13 -1: tst r11, #8 - beq 1f - vmov q10, q11 - vmov q11, q12 -1: rsb r11, r11, #0 + sub r12, r12, #1 + sub sp, sp, #64 + vst1.u16 {q10,q11}, [sp] + add r12, sp, r12, LSL #1 + vld1.u16 {d24[]}, [r12] + vld1.u16 {d25[]}, [r12] + vst1.u16 {q12}, [r12]! + vst1.u16 {q12}, [r12] + vld1.u16 {q10,q11}, [sp] + add sp, sp, #64 + bx lr +1: vdup.u16 q12, d23[3] bx lr -END(prefetch_clamp1) +END(prefetch_clampright1) -ENTRY(prefetch_clamp4) - rsb r11, r11, #0 - tst r11, #8 - beq 1f - sub r1, r1, #16 - vmov.u16 q11, q10 -1: vmov d24, d23 - tst r11, #4 - beq 1f - vmov d24, d22 - sub r1, r1, #8 - vmov d23, d22 -1: vmov d25, d24 - tst r11, #8 +PRIVATE(prefetch_clampright4) + ands r12, r11, #15 beq 1f - vmov q10, q11 - vmov q11, q12 -1: rsb r11, r11, #0 + sub r12, r12, #4 + sub sp, sp, #64 + vst1.u16 {q10,q11}, [sp] + add r12, sp, r12, LSL #1 + vld1.u64 {d24}, [r12] + vld1.u64 {d25}, [r12] + vst1.u16 {q12}, [r12]! + vst1.u16 {q12}, [r12] + vld1.u16 {q10,q11}, [sp] + add sp, sp, #64 + bx lr +1: vmov.u16 d24, d23 + vmov.u16 d25, d23 bx lr -END(prefetch_clamp4) +END(prefetch_clampright4) /* Helpers for prefetch, below. @@ -1147,10 +1133,10 @@ END(prefetch_clamp4) prefetch_out \qa, \qb, \store, q10, q11, d23 bl fetch_generic_asm b 2f -3: bl prefetch_clamp\step +3: bl prefetch_clampright\step prefetch_out \qa, \qb, \store, q10, q11, d23 4: b 4f+4 - @q12 contains pad word from prefetch_clam call + @q12 contains pad word from prefetch_clampright call prefetch_out \qa, \qb, \store, q12, q12, d25 .if \rem > 0 b 4f+4 @@ -1205,28 +1191,18 @@ END(prefetch_clamp4) vmov.u16 d18, d20 vmov.u16 d19, d20 .endif - tst r10, #15 + ands r12, r10, #15 beq 2f - rsb r12, r10, #0 - tst r10, #8 - beq 1f - vmov.u16 q11, q10 - vmov.u16 q10, q9 -1: tst r12, #4 - beq 1f - vext.u16 q11, q10, q11, #4 - vext.u16 q10, q9, q10, #4 - .if \step == 1 - 1: tst r12, #2 - beq 1f - vext.u16 q11, q10, q11, #2 - vext.u16 q10, q9, q10, #2 - 1: tst r12, #1 - beq 1f - vext.u16 q11, q10, q11, #1 - vext.u16 q10, q9, q10, #1 - .endif -1: sub r1, r1, r10 + sub sp, sp, #32 + vst1.u16 {q10,q11}, [sp] + sub r12, sp, r12, LSL #1 + sub sp, sp, #16 + vst1.u16 {q9}, [sp] + sub sp, sp, #16 + vst1.u16 {q9}, [sp] + vld1.u16 {q10,q11}, [r12] + add sp, sp, #64 + sub r1, r1, r10 bic r10, r10, #15 add r1, r1, r10 2: @@ -1383,7 +1359,7 @@ END(prefetch_clamp4) .endm .irep r, TUNED_LIST1, 25 -ENTRY(convolve1_\r) +PRIVATE(convolve1_\r) push {r12,lr} sub r1, r1, r8 @@ -1397,7 +1373,7 @@ END(convolve1_\r) .endr .irep r, TUNED_LIST4, 25 -ENTRY(convolve4_\r) +PRIVATE(convolve4_\r) sub r12, sp, #0x200 bic r9, r12, #0x3fc mov sp, r9 @@ -1447,8 +1423,7 @@ ENTRY(rsdIntrinsicBlurU1_K) ldr r12, [sp,#124] - add r0, r0, r8 @, LSL #2 /* for blur4 option */ - add r1, r1, r8 @, LSL #2 /* for blur4 option */ + add r1, r1, r8 cmp r6, r5 movhi r6, r5 @@ -1503,7 +1478,6 @@ ENTRY(rsdIntrinsicBlurU4_K) ldr r12, [sp,#124] - add r0, r0, r8, LSL #2 add r1, r1, r8, LSL #2 cmp r6, r5 diff --git a/cpu_ref/rsCpuScript.cpp b/cpu_ref/rsCpuScript.cpp index a11fda19..e8b3fb6d 100644 --- a/cpu_ref/rsCpuScript.cpp +++ b/cpu_ref/rsCpuScript.cpp @@ -230,6 +230,11 @@ static void setCompileArguments(std::vector<const char*>* args, const android::S args->push_back("-mtriple"); args->push_back(DEFAULT_TARGET_TRIPLE_STRING); + // Enable workaround for A53 codegen by default. +#if defined(__aarch64__) && !defined(DISABLE_A53_WORKAROUND) + args->push_back("-aarch64-fix-cortex-a53-835769"); +#endif + // Execute the bcc compiler. if (useRSDebugContext) { args->push_back("-rs-debug-ctx"); |
