/*
 * Copyright (C) 2013 The Android Open Source Project
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 *      http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

#include "rsCpuIntrinsic.h"
#include "rsCpuIntrinsicInlines.h"

using namespace android;
using namespace android::renderscript;

namespace android {
namespace renderscript {

/**
 * CPU implementation of the histogram intrinsic.
 *
 * Two launch slots are handled (see preLaunch):
 *   - slot 0: one 256-bin histogram per channel (kernelP1U1..kernelP1U4);
 *   - slot 1: a single 256-bin histogram of the dot product between each
 *             input value and the coefficient vector mDot
 *             (kernelP1L1..kernelP1L4).
 *
 * Every kernel accumulates into a slice of mSums selected by info->lid;
 * postLaunch adds the slices together into the output allocation.
 */
class RsdCpuScriptIntrinsicHistogram : public RsdCpuScriptIntrinsic {
public:
    void populateScript(Script *) override;
    void invokeFreeChildren() override;

    void setGlobalVar(uint32_t slot, const void *data, size_t dataLength) override;
    void setGlobalObj(uint32_t slot, ObjectBase *data) override;

    ~RsdCpuScriptIntrinsicHistogram() override;
    RsdCpuScriptIntrinsicHistogram(RsdCpuReferenceImpl *ctx, const Script *s, const Element *e);

protected:
    void preLaunch(uint32_t slot, const Allocation ** ains,
                   uint32_t inLen, Allocation * aout,
                   const void * usr, uint32_t usrLen,
                   const RsScriptCall *sc);
    void postLaunch(uint32_t slot, const Allocation ** ains,
                    uint32_t inLen, Allocation * aout,
                    const void * usr, uint32_t usrLen,
                    const RsScriptCall *sc);

    // Dot-product coefficients as supplied through setGlobalVar (slot 0).
    float mDot[4];
    // mDot scaled by 256 and rounded, for integer math in the dot kernels.
    int mDotI[4];
    // Accumulation buffer: 256 * 4 ints per thread (allocated in the constructor).
    int *mSums;
    // Allocation that receives the merged histogram (set through slot 1).
    ObjectBaseRef<Allocation> mAllocOut;

    static void kernelP1U4(const RsExpandKernelDriverInfo *info,
                           uint32_t xstart, uint32_t xend,
                           uint32_t outstep);
    static void kernelP1U3(const RsExpandKernelDriverInfo *info,
                           uint32_t xstart, uint32_t xend,
                           uint32_t outstep);
    static void kernelP1U2(const RsExpandKernelDriverInfo *info,
                           uint32_t xstart, uint32_t xend,
                           uint32_t outstep);
    static void kernelP1U1(const RsExpandKernelDriverInfo *info,
                           uint32_t xstart, uint32_t xend,
                           uint32_t outstep);

    static void kernelP1L4(const RsExpandKernelDriverInfo *info,
                           uint32_t xstart, uint32_t xend,
                           uint32_t outstep);
    static void kernelP1L3(const RsExpandKernelDriverInfo *info,
                           uint32_t xstart, uint32_t xend,
                           uint32_t outstep);
    static void kernelP1L2(const RsExpandKernelDriverInfo *info,
                           uint32_t xstart, uint32_t xend,
                           uint32_t outstep);
    static void kernelP1L1(const RsExpandKernelDriverInfo *info,
                           uint32_t xstart, uint32_t xend,
                           uint32_t outstep);
};

}
}

/**
 * Stores the output allocation. Only slot 1 is expected.
 */
void RsdCpuScriptIntrinsicHistogram::setGlobalObj(uint32_t slot, ObjectBase *data) {
    rsAssert(slot == 1);
    mAllocOut.set(static_cast<Allocation *>(data));
}

/**
 * Stores the four dot-product coefficients (slot 0) and refreshes their
 * fixed-point copies.
 *
 * @param slot        expected to be 0
 * @param data        four floats
 * @param dataLength  size of data in bytes; must be 16
 */
void RsdCpuScriptIntrinsicHistogram::setGlobalVar(uint32_t slot, const void *data,
                                                  size_t dataLength) {
    rsAssert(slot == 0);
    rsAssert(dataLength == 16);

    // NOTE(review): rsAssert may only report the failure rather than stop
    // execution - confirm. Never copy from a buffer of unexpected size.
    if (dataLength != sizeof(mDot)) {
        return;
    }
    memcpy(mDot, data, sizeof(mDot));

    // Scale by 256 and round to nearest for the integer dot kernels.
    for (int i = 0; i < 4; i++) {
        mDotI[i] = (int)((mDot[i] * 256.f) + 0.5f);
    }
}

/**
 * Selects the kernel for this launch and clears the accumulation buffer.
 *
 * Slot 0 picks a per-channel kernel from the vector size of the output
 * element; slot 1 picks a dot-product kernel from the vector size of the
 * first input element.
 */
void RsdCpuScriptIntrinsicHistogram::preLaunch(uint32_t slot,
                                               const Allocation ** ains,
                                               uint32_t inLen, Allocation * aout,
                                               const void * usr, uint32_t usrLen,
                                               const RsScriptCall *sc) {

    const uint32_t threads = mCtx->getThreadCount();
    uint32_t vSize = mAllocOut->getType()->getElement()->getVectorSize();

    switch (slot) {
    case 0:
        switch(vSize) {
        case 1:
            mRootPtr = &kernelP1U1;
            break;
        case 2:
            mRootPtr = &kernelP1U2;
            break;
        case 3:
            mRootPtr = &kernelP1U3;
            // kernelP1U3 lays its bins out with a stride of 4 per value,
            // so four channels' worth of bins must be cleared.
            vSize = 4;
            break;
        case 4:
            mRootPtr = &kernelP1U4;
            break;
        }
        break;
    case 1:
        switch(ains[0]->getType()->getElement()->getVectorSize()) {
        case 1:
            mRootPtr = &kernelP1L1;
            break;
        case 2:
            mRootPtr = &kernelP1L2;
            break;
        case 3:
            mRootPtr = &kernelP1L3;
            break;
        case 4:
            mRootPtr = &kernelP1L4;
            break;
        }
        break;
    }
    memset(mSums, 0, 256 * sizeof(int32_t) * threads * vSize);
}

/**
 * Merges the per-thread partial histograms into the output allocation.
 */
void RsdCpuScriptIntrinsicHistogram::postLaunch(uint32_t slot,
                                                const Allocation ** ains,
                                                uint32_t inLen, Allocation * aout,
                                                const void * usr, uint32_t usrLen,
                                                const RsScriptCall *sc) {

    unsigned int *o = (unsigned int *)mAllocOut->mHal.drvState.lod[0].mallocPtr;
    uint32_t threads = mCtx->getThreadCount();
    uint32_t vSize = mAllocOut->getType()->getElement()->getVectorSize();

    // Same 3 -> 4 widening as preLaunch applies for the 3-channel kernel.
    if (vSize == 3) vSize = 4;

    for (uint32_t ct=0; ct < (256 * vSize); ct++) {
        // Thread 0's slice seeds the bin; the remaining slices are added.
        o[ct] = mSums[ct];
        for (uint32_t t=1; t < threads; t++) {
            o[ct] += mSums[ct + (256 * vSize * t)];
        }
    }
}

/**
 * Per-channel histogram of 4-component values: bins are interleaved, so
 * the bin for value v of channel c is sums[v * 4 + c].
 */
void RsdCpuScriptIntrinsicHistogram::kernelP1U4(const RsExpandKernelDriverInfo *info,
                                                uint32_t xstart, uint32_t xend,
                                                uint32_t outstep) {

    RsdCpuScriptIntrinsicHistogram *cp = (RsdCpuScriptIntrinsicHistogram *)info->usr;
    uchar *in = (uchar *)info->inPtr[0];
    int * sums = &cp->mSums[256 * 4 * info->lid];

    for (uint32_t x = xstart; x < xend; x++) {
        sums[(in[0] << 2)    ] ++;
        sums[(in[1] << 2) + 1] ++;
        sums[(in[2] << 2) + 2] ++;
        sums[(in[3] << 2) + 3] ++;
        in += info->inStride[0];
    }
}

/**
 * Per-channel histogram of 3-component values. Uses the same 4-wide bin
 * layout as kernelP1U4; the fourth column is never incremented.
 */
void RsdCpuScriptIntrinsicHistogram::kernelP1U3(const RsExpandKernelDriverInfo *info,
                                                uint32_t xstart, uint32_t xend,
                                                uint32_t outstep) {

    RsdCpuScriptIntrinsicHistogram *cp = (RsdCpuScriptIntrinsicHistogram *)info->usr;
    uchar *in = (uchar *)info->inPtr[0];
    int * sums = &cp->mSums[256 * 4 * info->lid];

    for (uint32_t x = xstart; x < xend; x++) {
        sums[(in[0] << 2)    ] ++;
        sums[(in[1] << 2) + 1] ++;
        sums[(in[2] << 2) + 2] ++;
        in += info->inStride[0];
    }
}

/**
 * Per-channel histogram of 2-component values: the bin for value v of
 * channel c is sums[v * 2 + c].
 */
void RsdCpuScriptIntrinsicHistogram::kernelP1U2(const RsExpandKernelDriverInfo *info,
                                                uint32_t xstart, uint32_t xend,
                                                uint32_t outstep) {

    RsdCpuScriptIntrinsicHistogram *cp = (RsdCpuScriptIntrinsicHistogram *)info->usr;
    uchar *in = (uchar *)info->inPtr[0];
    int * sums = &cp->mSums[256 * 2 * info->lid];

    for (uint32_t x = xstart; x < xend; x++) {
        sums[(in[0] << 1)    ] ++;
        sums[(in[1] << 1) + 1] ++;
        in += info->inStride[0];
    }
}

/**
 * Converts a fixed-point dot product (8 fractional bits) into a histogram
 * bin index, rounding the same way the kernels always have and saturating
 * to [0, 255].
 *
 * Without the clamp, coefficients whose rounded fixed-point values add up
 * to more than 256 (or that are negative) produce an index outside the
 * 256-entry slice owned by the calling thread.
 */
static inline int histogramDotBin(int t) {
    const int rounded = t + 0x7f;
    if (rounded < 0) {
        return 0;
    }
    const int bin = rounded >> 8;
    return (bin > 255) ? 255 : bin;
}

/**
 * Dot-product histogram of 4-component values.
 */
void RsdCpuScriptIntrinsicHistogram::kernelP1L4(const RsExpandKernelDriverInfo *info,
                                                uint32_t xstart, uint32_t xend,
                                                uint32_t outstep) {

    RsdCpuScriptIntrinsicHistogram *cp = (RsdCpuScriptIntrinsicHistogram *)info->usr;
    uchar *in = (uchar *)info->inPtr[0];
    int * sums = &cp->mSums[256 * info->lid];

    for (uint32_t x = xstart; x < xend; x++) {
        int t = (cp->mDotI[0] * in[0]) +
                (cp->mDotI[1] * in[1]) +
                (cp->mDotI[2] * in[2]) +
                (cp->mDotI[3] * in[3]);
        sums[histogramDotBin(t)] ++;
        in += info->inStride[0];
    }
}

/**
 * Dot-product histogram of 3-component values.
 */
void RsdCpuScriptIntrinsicHistogram::kernelP1L3(const RsExpandKernelDriverInfo *info,
                                                uint32_t xstart, uint32_t xend,
                                                uint32_t outstep) {

    RsdCpuScriptIntrinsicHistogram *cp = (RsdCpuScriptIntrinsicHistogram *)info->usr;
    uchar *in = (uchar *)info->inPtr[0];
    int * sums = &cp->mSums[256 * info->lid];

    for (uint32_t x = xstart; x < xend; x++) {
        int t = (cp->mDotI[0] * in[0]) +
                (cp->mDotI[1] * in[1]) +
                (cp->mDotI[2] * in[2]);
        sums[histogramDotBin(t)] ++;
        in += info->inStride[0];
    }
}

/**
 * Dot-product histogram of 2-component values.
 */
void RsdCpuScriptIntrinsicHistogram::kernelP1L2(const RsExpandKernelDriverInfo *info,
                                                uint32_t xstart, uint32_t xend,
                                                uint32_t outstep) {

    RsdCpuScriptIntrinsicHistogram *cp = (RsdCpuScriptIntrinsicHistogram *)info->usr;
    uchar *in = (uchar *)info->inPtr[0];
    int * sums = &cp->mSums[256 * info->lid];

    for (uint32_t x = xstart; x < xend; x++) {
        int t = (cp->mDotI[0] * in[0]) +
                (cp->mDotI[1] * in[1]);
        sums[histogramDotBin(t)] ++;
        in += info->inStride[0];
    }
}

/**
 * Dot-product histogram of single-component values (scaled by mDotI[0]).
 */
void RsdCpuScriptIntrinsicHistogram::kernelP1L1(const RsExpandKernelDriverInfo *info,
                                                uint32_t xstart, uint32_t xend,
                                                uint32_t outstep) {

    RsdCpuScriptIntrinsicHistogram *cp = (RsdCpuScriptIntrinsicHistogram *)info->usr;
    uchar *in = (uchar *)info->inPtr[0];
    int * sums = &cp->mSums[256 * info->lid];

    for (uint32_t x = xstart; x < xend; x++) {
        int t = (cp->mDotI[0] * in[0]);
        sums[histogramDotBin(t)] ++;
        in += info->inStride[0];
    }
}

/**
 * Plain histogram of single-component values.
 */
void RsdCpuScriptIntrinsicHistogram::kernelP1U1(const RsExpandKernelDriverInfo *info,
                                                uint32_t xstart, uint32_t xend,
                                                uint32_t outstep) {

    RsdCpuScriptIntrinsicHistogram *cp = (RsdCpuScriptIntrinsicHistogram *)info->usr;
    uchar *in = (uchar *)info->inPtr[0];
    int * sums = &cp->mSums[256 * info->lid];

    for (uint32_t x = xstart; x < xend; x++) {
        sums[in[0]] ++;
        in += info->inStride[0];
    }
}

RsdCpuScriptIntrinsicHistogram::RsdCpuScriptIntrinsicHistogram(RsdCpuReferenceImpl *ctx, const Script *s, const Element *e) : RsdCpuScriptIntrinsic(ctx, s, e, RS_SCRIPT_INTRINSIC_ID_HISTOGRAM) { mRootPtr = nullptr; mSums = new int[256 * 4 * mCtx->getThreadCount()]; mDot[0] = 0.299f; mDot[1] = 0.587f; mDot[2] = 0.114f; mDot[3] = 0; mDotI[0] = (int)((mDot[0] * 256.f) + 0.5f); mDotI[1] = (int)((mDot[1] * 256.f) + 0.5f); mDotI[2] = (int)((mDot[2] * 256.f) + 0.5f); mDotI[3] = (int)((mDot[3] * 256.f) + 0.5f); } RsdCpuScriptIntrinsicHistogram::~RsdCpuScriptIntrinsicHistogram() { if (mSums) { delete []mSums; } } void RsdCpuScriptIntrinsicHistogram::populateScript(Script *s) { s->mHal.info.exportedVariableCount = 2; } void RsdCpuScriptIntrinsicHistogram::invokeFreeChildren() { } RsdCpuScriptImpl * rsdIntrinsic_Histogram(RsdCpuReferenceImpl *ctx, const Script *s, const Element *e) { return new RsdCpuScriptIntrinsicHistogram(ctx, s, e); }