/* * Copyright (C) 2013 The Android Open Source Project * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. * You may obtain a copy of the License at * * http://www.apache.org/licenses/LICENSE-2.0 * * Unless required by applicable law or agreed to in writing, software * distributed under the License is distributed on an "AS IS" BASIS, * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. * See the License for the specific language governing permissions and * limitations under the License. */ #include <array> #include <cstdint> #include "RenderScriptToolkit.h" #include "TaskProcessor.h" #include "Utils.h" #define LOG_TAG "renderscript.toolkit.Histogram" namespace android { namespace renderscript { class HistogramTask : public Task { const uchar* mIn; std::vector<int> mSums; uint32_t mThreadCount; // Process a 2D tile of the overall work. threadIndex identifies which thread does the work. virtual void processData(int threadIndex, size_t startX, size_t startY, size_t endX, size_t endY) override; void kernelP1U4(const uchar* in, int* sums, uint32_t xstart, uint32_t xend); void kernelP1U3(const uchar* in, int* sums, uint32_t xstart, uint32_t xend); void kernelP1U2(const uchar* in, int* sums, uint32_t xstart, uint32_t xend); void kernelP1U1(const uchar* in, int* sums, uint32_t xstart, uint32_t xend); public: HistogramTask(const uint8_t* in, size_t sizeX, size_t sizeY, size_t vectorSize, uint32_t threadCount, const Restriction* restriction); void collateSums(int* out); }; class HistogramDotTask : public Task { const uchar* mIn; float mDot[4]; int mDotI[4]; std::vector<int> mSums; uint32_t mThreadCount; void kernelP1L4(const uchar* in, int* sums, uint32_t xstart, uint32_t xend); void kernelP1L3(const uchar* in, int* sums, uint32_t xstart, uint32_t xend); void kernelP1L2(const uchar* in, int* sums, uint32_t xstart, uint32_t xend); void kernelP1L1(const uchar* in, int* sums, uint32_t xstart, uint32_t xend); public: HistogramDotTask(const uint8_t* in, size_t sizeX, size_t sizeY, size_t vectorSize, uint32_t threadCount, const float* coefficients, const Restriction* restriction); void collateSums(int* out); virtual void processData(int threadIndex, size_t startX, size_t startY, size_t endX, size_t endY) override; }; HistogramTask::HistogramTask(const uchar* in, size_t sizeX, size_t sizeY, size_t vectorSize, uint32_t threadCount, const Restriction* restriction) : Task{sizeX, sizeY, vectorSize, true, restriction}, mIn{in}, mSums(256 * paddedSize(vectorSize) * threadCount) { mThreadCount = threadCount; } void HistogramTask::processData(int threadIndex, size_t startX, size_t startY, size_t endX, size_t endY) { typedef void (HistogramTask::*KernelFunction)(const uchar*, int*, uint32_t, uint32_t); KernelFunction kernel; switch (mVectorSize) { case 4: kernel = &HistogramTask::kernelP1U4; break; case 3: kernel = &HistogramTask::kernelP1U3; break; case 2: kernel = &HistogramTask::kernelP1U2; break; case 1: kernel = &HistogramTask::kernelP1U1; break; default: ALOGE("Bad vector size %zd", mVectorSize); return; } int* sums = &mSums[256 * paddedSize(mVectorSize) * threadIndex]; for (size_t y = startY; y < endY; y++) { const uchar* inPtr = mIn + (mSizeX * y + startX) * paddedSize(mVectorSize); std::invoke(kernel, this, inPtr, sums, startX, endX); } } void HistogramTask::kernelP1U4(const uchar* in, int* sums, uint32_t xstart, uint32_t xend) { for (uint32_t x = xstart; x < xend; x++) { sums[(in[0] << 2)]++; sums[(in[1] << 2) + 1]++; sums[(in[2] << 2) + 2]++; sums[(in[3] << 2) + 3]++; in += 4; } } void HistogramTask::kernelP1U3(const uchar* in, int* sums, uint32_t xstart, uint32_t xend) { for (uint32_t x = xstart; x < xend; x++) { sums[(in[0] << 2)]++; sums[(in[1] << 2) + 1]++; sums[(in[2] << 2) + 2]++; in += 4; } } void HistogramTask::kernelP1U2(const uchar* in, int* sums, uint32_t xstart, uint32_t xend) { for (uint32_t x = xstart; x < xend; x++) { sums[(in[0] << 1)]++; sums[(in[1] << 1) + 1]++; in += 2; } } void HistogramTask::kernelP1U1(const uchar* in, int* sums, uint32_t xstart, uint32_t xend) { for (uint32_t x = xstart; x < xend; x++) { sums[in[0]]++; in++; } } void HistogramTask::collateSums(int* out) { for (uint32_t ct = 0; ct < (256 * paddedSize(mVectorSize)); ct++) { out[ct] = mSums[ct]; for (uint32_t t = 1; t < mThreadCount; t++) { out[ct] += mSums[ct + (256 * paddedSize(mVectorSize) * t)]; } } } HistogramDotTask::HistogramDotTask(const uchar* in, size_t sizeX, size_t sizeY, size_t vectorSize, uint32_t threadCount, const float* coefficients, const Restriction* restriction) : Task{sizeX, sizeY, vectorSize, true, restriction}, mIn{in}, mSums(256 * threadCount, 0) { mThreadCount = threadCount; if (coefficients == nullptr) { mDot[0] = 0.299f; mDot[1] = 0.587f; mDot[2] = 0.114f; mDot[3] = 0; } else { memcpy(mDot, coefficients, 16); } mDotI[0] = (int)((mDot[0] * 256.f) + 0.5f); mDotI[1] = (int)((mDot[1] * 256.f) + 0.5f); mDotI[2] = (int)((mDot[2] * 256.f) + 0.5f); mDotI[3] = (int)((mDot[3] * 256.f) + 0.5f); } void HistogramDotTask::processData(int threadIndex, size_t startX, size_t startY, size_t endX, size_t endY) { typedef void (HistogramDotTask::*KernelFunction)(const uchar*, int*, uint32_t, uint32_t); KernelFunction kernel; switch (mVectorSize) { case 4: kernel = &HistogramDotTask::kernelP1L4; break; case 3: kernel = &HistogramDotTask::kernelP1L3; break; case 2: kernel = &HistogramDotTask::kernelP1L2; break; case 1: kernel = &HistogramDotTask::kernelP1L1; break; default: ALOGI("Bad vector size %zd", mVectorSize); return; } int* sums = &mSums[256 * threadIndex]; for (size_t y = startY; y < endY; y++) { const uchar* inPtr = mIn + (mSizeX * y + startX) * paddedSize(mVectorSize); std::invoke(kernel, this, inPtr, sums, startX, endX); } } void HistogramDotTask::kernelP1L4(const uchar* in, int* sums, uint32_t xstart, uint32_t xend) { for (uint32_t x = xstart; x < xend; x++) { int t = (mDotI[0] * in[0]) + (mDotI[1] * in[1]) + (mDotI[2] * in[2]) + (mDotI[3] * in[3]); sums[(t + 0x7f) >> 8]++; in += 4; } } void HistogramDotTask::kernelP1L3(const uchar* in, int* sums, uint32_t xstart, uint32_t xend) { for (uint32_t x = xstart; x < xend; x++) { int t = (mDotI[0] * in[0]) + (mDotI[1] * in[1]) + (mDotI[2] * in[2]); sums[(t + 0x7f) >> 8]++; in += 4; } } void HistogramDotTask::kernelP1L2(const uchar* in, int* sums, uint32_t xstart, uint32_t xend) { for (uint32_t x = xstart; x < xend; x++) { int t = (mDotI[0] * in[0]) + (mDotI[1] * in[1]); sums[(t + 0x7f) >> 8]++; in += 2; } } void HistogramDotTask::kernelP1L1(const uchar* in, int* sums, uint32_t xstart, uint32_t xend) { for (uint32_t x = xstart; x < xend; x++) { int t = (mDotI[0] * in[0]); sums[(t + 0x7f) >> 8]++; in++; } } void HistogramDotTask::collateSums(int* out) { for (uint32_t ct = 0; ct < 256; ct++) { out[ct] = mSums[ct]; for (uint32_t t = 1; t < mThreadCount; t++) { out[ct] += mSums[ct + (256 * t)]; } } } //////////////////////////////////////////////////////////////////////////// void RenderScriptToolkit::histogram(const uint8_t* in, int32_t* out, size_t sizeX, size_t sizeY, size_t vectorSize, const Restriction* restriction) { #ifdef ANDROID_RENDERSCRIPT_TOOLKIT_VALIDATE if (!validRestriction(LOG_TAG, sizeX, sizeY, restriction)) { return; } if (vectorSize < 1 || vectorSize > 4) { ALOGE("The vectorSize should be between 1 and 4. %zu provided.", vectorSize); return; } #endif HistogramTask task(in, sizeX, sizeY, vectorSize, processor->getNumberOfThreads(), restriction); processor->doTask(&task); task.collateSums(out); } void RenderScriptToolkit::histogramDot(const uint8_t* in, int32_t* out, size_t sizeX, size_t sizeY, size_t vectorSize, const float* coefficients, const Restriction* restriction) { #ifdef ANDROID_RENDERSCRIPT_TOOLKIT_VALIDATE if (!validRestriction(LOG_TAG, sizeX, sizeY, restriction)) { return; } if (vectorSize < 1 || vectorSize > 4) { ALOGE("The vectorSize should be between 1 and 4. %zu provided.", vectorSize); return; } if (coefficients != nullptr) { float sum = 0.0f; for (size_t i = 0; i < vectorSize; i++) { if (coefficients[i] < 0.0f) { ALOGE("histogramDot coefficients should not be negative. Coefficient %zu was %f.", i, coefficients[i]); return; } sum += coefficients[i]; } if (sum > 1.0f) { ALOGE("histogramDot coefficients should add to 1 or less. Their sum is %f.", sum); return; } } #endif HistogramDotTask task(in, sizeX, sizeY, vectorSize, processor->getNumberOfThreads(), coefficients, restriction); processor->doTask(&task); task.collateSums(out); } } // namespace renderscript } // namespace android