diff options
| -rw-r--r-- | driver/rsdBcc.cpp | 32 | ||||
| -rw-r--r-- | driver/rsdCore.cpp | 16 | ||||
| -rw-r--r-- | driver/rsdIntrinsicBlend.cpp | 3 | ||||
| -rw-r--r-- | driver/rsdIntrinsicBlur.cpp | 71 | ||||
| -rw-r--r-- | driver/rsdIntrinsicColorMatrix.cpp | 3 | ||||
| -rw-r--r-- | driver/rsdIntrinsicConvolve3x3.cpp | 2 | ||||
| -rw-r--r-- | driver/rsdIntrinsicConvolve5x5.cpp | 2 | ||||
| -rw-r--r-- | driver/rsdIntrinsicLUT.cpp | 3 | ||||
| -rw-r--r-- | rs_hal.h | 1 |
9 files changed, 110 insertions, 23 deletions
diff --git a/driver/rsdBcc.cpp b/driver/rsdBcc.cpp index c78508cd..e17c107a 100644 --- a/driver/rsdBcc.cpp +++ b/driver/rsdBcc.cpp @@ -176,6 +176,7 @@ static void wc_xy(void *usr, uint32_t idx) { MTLaunchStruct *mtls = (MTLaunchStruct *)usr; RsForEachStubParamStruct p; memcpy(&p, &mtls->fep, sizeof(p)); + p.lid = idx; RsdHal * dc = (RsdHal *)mtls->rsc->mHal.drv; uint32_t sig = mtls->sig; @@ -222,6 +223,7 @@ static void wc_x(void *usr, uint32_t idx) { MTLaunchStruct *mtls = (MTLaunchStruct *)usr; RsForEachStubParamStruct p; memcpy(&p, &mtls->fep, sizeof(p)); + p.lid = idx; RsdHal * dc = (RsdHal *)mtls->rsc->mHal.drv; uint32_t sig = mtls->sig; @@ -341,17 +343,40 @@ void rsdScriptLaunchThreads(const Context *rsc, Context *mrsc = (Context *)rsc; RsdHal * dc = (RsdHal *)mtls->rsc->mHal.drv; - if ((dc->mWorkers.mCount > 1) && s->mHal.info.isThreadable && !dc->mInForEach) { + if ((dc->mWorkers.mCount >= 1) && s->mHal.info.isThreadable && !dc->mInForEach) { + const size_t targetByteChunk = 16 * 1024; dc->mInForEach = true; if (mtls->fep.dimY > 1) { - mtls->mSliceSize = mtls->fep.dimY / (dc->mWorkers.mCount * 4); + uint32_t s1 = mtls->fep.dimY / ((dc->mWorkers.mCount + 1) * 4); + uint32_t s2 = 0; + + // This chooses our slice size to rate limit atomic ops to + // one per 16k bytes of reads/writes. + if (mtls->fep.yStrideOut) { + s2 = targetByteChunk / mtls->fep.yStrideOut; + } else { + s2 = targetByteChunk / mtls->fep.yStrideIn; + } + mtls->mSliceSize = rsMin(s1, s2); + if(mtls->mSliceSize < 1) { mtls->mSliceSize = 1; } rsdLaunchThreads(mrsc, wc_xy, mtls); } else { - mtls->mSliceSize = mtls->fep.dimX / (dc->mWorkers.mCount * 4); + uint32_t s1 = mtls->fep.dimX / ((dc->mWorkers.mCount + 1) * 4); + uint32_t s2 = 0; + + // This chooses our slice size to rate limit atomic ops to + // one per 16k bytes of reads/writes. + if (mtls->fep.eStrideOut) { + s2 = targetByteChunk / mtls->fep.eStrideOut; + } else { + s2 = targetByteChunk / mtls->fep.eStrideIn; + } + mtls->mSliceSize = rsMin(s1, s2); + if(mtls->mSliceSize < 1) { mtls->mSliceSize = 1; } @@ -364,6 +389,7 @@ void rsdScriptLaunchThreads(const Context *rsc, } else { RsForEachStubParamStruct p; memcpy(&p, &mtls->fep, sizeof(p)); + p.lid = 0; uint32_t sig = mtls->sig; //ALOGE("launch 3"); diff --git a/driver/rsdCore.cpp b/driver/rsdCore.cpp index cdfc600b..c8b8014c 100644 --- a/driver/rsdCore.cpp +++ b/driver/rsdCore.cpp @@ -189,7 +189,8 @@ static void * HelperThreadProc(void *vrsc) { while (!dc->mExit) { dc->mWorkers.mLaunchSignals[idx].wait(); if (dc->mWorkers.mLaunchCallback) { - dc->mWorkers.mLaunchCallback(dc->mWorkers.mLaunchData, idx); + // idx +1 is used because the calling thread is always worker 0. + dc->mWorkers.mLaunchCallback(dc->mWorkers.mLaunchData, idx+1); } android_atomic_dec(&dc->mWorkers.mRunningCount); dc->mWorkers.mCompleteSignal.set(); @@ -208,6 +209,13 @@ void rsdLaunchThreads(Context *rsc, WorkerCallback_t cbk, void *data) { for (uint32_t ct = 0; ct < dc->mWorkers.mCount; ct++) { dc->mWorkers.mLaunchSignals[ct].set(); } + + // We use the calling thread as one of the workers so we can start without + // the delay of the thread wakeup. + if (dc->mWorkers.mLaunchCallback) { + dc->mWorkers.mLaunchCallback(dc->mWorkers.mLaunchData, 0); + } + while (android_atomic_acquire_load(&dc->mWorkers.mRunningCount) != 0) { dc->mWorkers.mCompleteSignal.wait(); } @@ -250,11 +258,13 @@ extern "C" bool rsdHalInit(RsContext c, uint32_t version_major, cpu = rsc->props.mDebugMaxThreads; } if (cpu < 2) { - cpu = 0; + dc->mWorkers.mCount = 0; + return true; } ALOGV("%p Launching thread(s), CPUs %i", rsc, cpu); - dc->mWorkers.mCount = (uint32_t)cpu; + // Subtract one from the cpu count because we also use the command thread as a worker. + dc->mWorkers.mCount = (uint32_t)(cpu - 1); dc->mWorkers.mThreadId = (pthread_t *) calloc(dc->mWorkers.mCount, sizeof(pthread_t)); dc->mWorkers.mNativeThreadId = (pid_t *) calloc(dc->mWorkers.mCount, sizeof(pid_t)); dc->mWorkers.mLaunchSignals = new Signal[dc->mWorkers.mCount]; diff --git a/driver/rsdIntrinsicBlend.cpp b/driver/rsdIntrinsicBlend.cpp index 22ad108a..c35c3796 100644 --- a/driver/rsdIntrinsicBlend.cpp +++ b/driver/rsdIntrinsicBlend.cpp @@ -103,9 +103,6 @@ static void ColorMatrix_uchar4(const RsForEachStubParamStruct *p, uint32_t x1 = xstart; uint32_t x2 = xend; - in += xstart; - out += xstart; - switch (p->slot) { case BLEND_CLEAR: for (;x1 < x2; x1++, out++) { diff --git a/driver/rsdIntrinsicBlur.cpp b/driver/rsdIntrinsicBlur.cpp index 9c1fe685..5cd671e3 100644 --- a/driver/rsdIntrinsicBlur.cpp +++ b/driver/rsdIntrinsicBlur.cpp @@ -29,6 +29,8 @@ struct ConvolveParams { short ip[104]; float radius; int iradius; + void **scratch; + size_t *scratchSize; ObjectBaseRef<Allocation> alloc; }; @@ -139,6 +141,7 @@ static void OneVF(float4 *out, out->xyzw = blurredPixel; x1++; out++; + gPtr++; } } @@ -161,7 +164,8 @@ static void OneH(const RsForEachStubParamStruct *p, uchar4 *out, int32_t x, static void Blur_uchar4(const RsForEachStubParamStruct *p, uint32_t xstart, uint32_t xend, uint32_t instep, uint32_t outstep) { - float buf[4 * 2048]; + float stackbuf[4 * 2048]; + float *buf = &stackbuf[0]; ConvolveParams *cp = (ConvolveParams *)p->usr; if (!cp->alloc.get()) { ALOGE("Blur executed without input, skipping"); @@ -174,16 +178,37 @@ static void Blur_uchar4(const RsForEachStubParamStruct *p, uint32_t x1 = xstart; uint32_t x2 = xend; + if (p->dimX > 2048) { + if ((p->dimX > cp->scratchSize[p->lid]) || !cp->scratch[p->lid]) { + cp->scratch[p->lid] = realloc(cp->scratch[p->lid], p->dimX * 16); + cp->scratchSize[p->lid] = p->dimX; + } + buf = (float *)cp->scratch[p->lid]; + } float4 *fout = (float4 *)buf; + int y = p->y; + uint32_t vx1 = x1; + uint32_t vx2 = x2; + + if (vx1 > (uint32_t)cp->iradius) { + vx1 -= cp->iradius; + } else { + vx1 = 0; + } + vx2 += cp->iradius; + if (vx2 >= p->dimX) { + vx2 = p->dimX - 1; + } + if ((y > cp->iradius) && (y < ((int)p->dimY - cp->iradius))) { const uchar *pi = pin + (y - cp->iradius) * din->lod[0].stride; - OneVF(fout, pi, din->lod[0].stride, cp->fp, cp->iradius * 2 + 1, x1, x2); + OneVF(fout + vx1, pi, din->lod[0].stride, cp->fp, cp->iradius * 2 + 1, vx1, vx2); } else { - while(x2 > x1) { - OneV(p, fout, x1, y, pin, din->lod[0].stride, cp->fp, cp->iradius); + while(vx2 > vx1) { + OneV(p, fout, vx1, y, pin, din->lod[0].stride, cp->fp, cp->iradius); fout++; - x1++; + vx1++; } } @@ -208,17 +233,51 @@ static void Blur_uchar4(const RsForEachStubParamStruct *p, } -void * rsdIntrinsic_InitBlur(const android::renderscript::Context *dc, +static void Destroy(const Context *rsc, const Script *script, void * intrinsicData) { + RsdHal * dc = (RsdHal *)rsc->mHal.drv; + ConvolveParams *cp = (ConvolveParams *)intrinsicData; + + if (cp) { + if (cp->scratch) { + for (size_t i = 0; i < dc->mWorkers.mCount + 1; i++) { + if (cp->scratch[i]) { + free(cp->scratch[i]); + } + } + free(cp->scratch); + } + if (cp->scratchSize) { + free(cp->scratchSize); + } + free(cp); + } +} + +void * rsdIntrinsic_InitBlur(const android::renderscript::Context *rsc, android::renderscript::Script *script, RsdIntriniscFuncs_t *funcs) { + RsdHal * dc = (RsdHal *)rsc->mHal.drv; + script->mHal.info.exportedVariableCount = 2; funcs->setVarObj = Blur_Bind; funcs->setVar = Blur_SetVar; funcs->root = Blur_uchar4; + funcs->destroy = Destroy; ConvolveParams *cp = (ConvolveParams *)calloc(1, sizeof(ConvolveParams)); + if (!cp) { + return NULL; + } + cp->radius = 5; + cp->scratch = (void **)calloc(dc->mWorkers.mCount + 1, sizeof(void *)); + cp->scratchSize = (size_t *)calloc(dc->mWorkers.mCount + 1, sizeof(size_t)); + if (!cp->scratch || !cp->scratchSize) { + Destroy(rsc, script, cp); + return NULL; + } + ComputeGaussianWeights(cp); return cp; } diff --git a/driver/rsdIntrinsicColorMatrix.cpp b/driver/rsdIntrinsicColorMatrix.cpp index 8f6c70cc..cfe0333e 100644 --- a/driver/rsdIntrinsicColorMatrix.cpp +++ b/driver/rsdIntrinsicColorMatrix.cpp @@ -97,9 +97,6 @@ static void ColorMatrix_uchar4(const RsForEachStubParamStruct *p, uint32_t x1 = xstart; uint32_t x2 = xend; - in += xstart; - out += xstart; - if(x2 > x1) { #if defined(ARCH_ARM_HAVE_NEON) int32_t len = (x2 - x1) >> 2; diff --git a/driver/rsdIntrinsicConvolve3x3.cpp b/driver/rsdIntrinsicConvolve3x3.cpp index 55f4360e..dac2f24e 100644 --- a/driver/rsdIntrinsicConvolve3x3.cpp +++ b/driver/rsdIntrinsicConvolve3x3.cpp @@ -56,7 +56,7 @@ static void ConvolveOne(const RsForEachStubParamStruct *p, uint32_t x, uchar4 *o const float* coeff) { uint32_t x1 = rsMax((int32_t)x-1, 0); - uint32_t x2 = rsMin((int32_t)x+1, (int32_t)p->dimX); + uint32_t x2 = rsMin((int32_t)x+1, (int32_t)p->dimX-1); float4 px = convert_float4(py0[x1]) * coeff[0] + convert_float4(py0[x]) * coeff[1] + diff --git a/driver/rsdIntrinsicConvolve5x5.cpp b/driver/rsdIntrinsicConvolve5x5.cpp index fc6b029e..ac063040 100644 --- a/driver/rsdIntrinsicConvolve5x5.cpp +++ b/driver/rsdIntrinsicConvolve5x5.cpp @@ -134,7 +134,7 @@ static void Convolve5x5_uchar4(const RsForEachStubParamStruct *p, #if defined(ARCH_ARM_HAVE_NEON) if((x1 + 3) < x2) { uint32_t len = (x2 - x1 - 3) >> 1; - rsdIntrinsicConvolve5x5_K(out, py0, py1, py2, py3, py4, cp->ip, len); + rsdIntrinsicConvolve5x5_K(out, py0+x1-2, py1+x1-2, py2+x1-2, py3+x1-2, py4+x1-2, cp->ip, len); out += len << 1; x1 += len << 1; } diff --git a/driver/rsdIntrinsicLUT.cpp b/driver/rsdIntrinsicLUT.cpp index a75534ed..818a132d 100644 --- a/driver/rsdIntrinsicLUT.cpp +++ b/driver/rsdIntrinsicLUT.cpp @@ -44,9 +44,6 @@ static void LUT_uchar4(const RsForEachStubParamStruct *p, uint32_t x1 = xstart; uint32_t x2 = xend; - in += xstart; - out += xstart; - DrvAllocation *din = (DrvAllocation *)cp->lut->mHal.drv; const uchar *tr = (const uchar *)din->lod[0].mallocPtr; const uchar *tg = &tr[256]; @@ -58,6 +58,7 @@ typedef struct { uint32_t lod; RsAllocationCubemapFace face; uint32_t ar[16]; + uint32_t lid; uint32_t dimX; uint32_t dimY; |
