1 /*
2 * Copyright (C) 2012 The Android Open Source Project
3 *
4 * Licensed under the Apache License, Version 2.0 (the "License");
5 * you may not use this file except in compliance with the License.
6 * You may obtain a copy of the License at
7 *
8 * http://www.apache.org/licenses/LICENSE-2.0
9 *
10 * Unless required by applicable law or agreed to in writing, software
11 * distributed under the License is distributed on an "AS IS" BASIS,
12 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13 * See the License for the specific language governing permissions and
14 * limitations under the License.
15 */
16
17 #include <cstdint>
18
19 #include "RenderScriptToolkit.h"
20 #include "TaskProcessor.h"
21 #include "Utils.h"
22
23 #define LOG_TAG "renderscript.toolkit.Convolve3x3"
24
25 namespace android {
26 namespace renderscript {
27
28 extern "C" void rsdIntrinsicConvolve3x3_K(void* dst, const void* y0, const void* y1, const void* y2,
29 const int16_t* coef, uint32_t count);
30
31 class Convolve3x3Task : public Task {
32 const void* mIn;
33 void* mOut;
34 // Even though we have exactly 9 coefficients, store them in an array of size 16 so that
35 // the SIMD instructions can load them in chunks multiple of 8.
36 float mFp[16];
37 int16_t mIp[16];
38
39 void kernelU4(uchar* out, uint32_t xstart, uint32_t xend, const uchar* py0, const uchar* py1,
40 const uchar* py2);
41 void convolveU4(const uchar* pin, uchar* pout, size_t vectorSize, size_t sizeX, size_t sizeY,
42 size_t startX, size_t startY, size_t endX, size_t endY);
43
44 // Process a 2D tile of the overall work. threadIndex identifies which thread does the work.
45 virtual void processData(int threadIndex, size_t startX, size_t startY, size_t endX,
46 size_t endY) override;
47
48 public:
Convolve3x3Task(const void * in,void * out,size_t vectorSize,size_t sizeX,size_t sizeY,const float * coefficients,const Restriction * restriction)49 Convolve3x3Task(const void* in, void* out, size_t vectorSize, size_t sizeX, size_t sizeY,
50 const float* coefficients, const Restriction* restriction)
51 : Task{sizeX, sizeY, vectorSize, false, restriction}, mIn{in}, mOut{out} {
52 for (int ct = 0; ct < 9; ct++) {
53 mFp[ct] = coefficients[ct];
54 if (mFp[ct] >= 0) {
55 mIp[ct] = (int16_t)(mFp[ct] * 256.f + 0.5f);
56 } else {
57 mIp[ct] = (int16_t)(mFp[ct] * 256.f - 0.5f);
58 }
59 }
60 }
61 };
62
63 /**
64 * Computes one convolution and stores the result in the output. This is used for uchar, uchar2,
65 * uchar3, and uchar4 vectors.
66 *
67 * @tparam InputOutputType Type of the input and output arrays. A vector type, e.g. uchar4.
68 * @tparam ComputationType Type we use for the intermediate computations.
69 * @param x The index in the row of the value we'll convolve.
70 * @param out The location in the output array where we store the value.
71 * @param py0 The start of the top row.
72 * @param py1 The start of the middle row.
73 * @param py2 The start of the bottom row.
74 * @param coeff Pointer to the float coefficients, in row major format.
75 * @param sizeX The number of cells of one row.
76 */
77 template <typename InputOutputType, typename ComputationType>
convolveOneU(uint32_t x,InputOutputType * out,const InputOutputType * py0,const InputOutputType * py1,const InputOutputType * py2,const float * coeff,int32_t sizeX)78 static void convolveOneU(uint32_t x, InputOutputType* out, const InputOutputType* py0,
79 const InputOutputType* py1, const InputOutputType* py2, const float* coeff,
80 int32_t sizeX) {
81 uint32_t x1 = std::max((int32_t)x - 1, 0);
82 uint32_t x2 = std::min((int32_t)x + 1, sizeX - 1);
83
84 ComputationType px = convert<ComputationType>(py0[x1]) * coeff[0] +
85 convert<ComputationType>(py0[x]) * coeff[1] +
86 convert<ComputationType>(py0[x2]) * coeff[2] +
87 convert<ComputationType>(py1[x1]) * coeff[3] +
88 convert<ComputationType>(py1[x]) * coeff[4] +
89 convert<ComputationType>(py1[x2]) * coeff[5] +
90 convert<ComputationType>(py2[x1]) * coeff[6] +
91 convert<ComputationType>(py2[x]) * coeff[7] +
92 convert<ComputationType>(py2[x2]) * coeff[8];
93
94 px = clamp(px + 0.5f, 0.f, 255.f);
95 *out = convert<InputOutputType>(px);
96 }
97
98 #ifdef ANDROID_RENDERSCRIPT_TOOLKIT_SUPPORTS_FLOAT
/**
 * Convolves a single cell and writes the result to the output. Used for the float, float2,
 * float3, and float4 vector types.
 *
 * Neighbours that would fall outside the row are replaced by the nearest cell inside it. The
 * weighted sum is stored as is, with no rounding or clamping.
 *
 * @tparam InputOutputType Type of the input and output arrays. A vector type, e.g. float4.
 * @param x The index in the row of the value we'll convolve.
 * @param out The location in the output array where we store the value.
 * @param py0 The start of the top row.
 * @param py1 The start of the middle row.
 * @param py2 The start of the bottom row.
 * @param coeff Pointer to the float coefficients, in row major format.
 * @param sizeX The number of cells of one row.
 */
template <typename InputOutputType>
static void ConvolveOneF(uint32_t x, InputOutputType* out, const InputOutputType* py0,
                         const InputOutputType* py1, const InputOutputType* py2, const float* coeff,
                         int32_t sizeX) {
    // Column indices of the left and right neighbours, clamped to the row.
    const uint32_t left = std::max(static_cast<int32_t>(x) - 1, 0);
    const uint32_t right = std::min(static_cast<int32_t>(x) + 1, sizeX - 1);

    // Accumulate the nine products in row major order.
    InputOutputType acc = py0[left] * coeff[0];
    acc += py0[x] * coeff[1];
    acc += py0[right] * coeff[2];
    acc += py1[left] * coeff[3];
    acc += py1[x] * coeff[4];
    acc += py1[right] * coeff[5];
    acc += py2[left] * coeff[6];
    acc += py2[x] * coeff[7];
    acc += py2[right] * coeff[8];
    *out = acc;
}
122 #endif // ANDROID_RENDERSCRIPT_TOOLKIT_SUPPORTS_FLOAT
123
124 /**
125 * This function convolves one line.
126 *
127 * @param pout Where to place the next output.
128 * @param xstart Index in the X direction of where to start.
129 * @param xend End index
130 * @param ppy0 Points to the start of the previous line.
131 * @param ppy1 Points to the start of the current line.
132 * @param ppy2 Points to the start of the next line.
133 */
kernelU4(uchar * pout,uint32_t xstart,uint32_t xend,const uchar * ppy0,const uchar * ppy1,const uchar * ppy2)134 void Convolve3x3Task::kernelU4(uchar* pout, uint32_t xstart, uint32_t xend, const uchar* ppy0,
135 const uchar* ppy1, const uchar* ppy2) {
136 uchar4* out = (uchar4*)pout;
137 const uchar4* py0 = (const uchar4*)ppy0;
138 const uchar4* py1 = (const uchar4*)ppy1;
139 const uchar4* py2 = (const uchar4*)ppy2;
140
141 uint32_t x1 = xstart;
142 uint32_t x2 = xend;
143 if (x1 == 0) {
144 convolveOneU<uchar4, float4>(0, out, py0, py1, py2, mFp, mSizeX);
145 x1++;
146 out++;
147 }
148
149 if (x2 > x1) {
150 #if defined(ARCH_ARM_USE_INTRINSICS) || defined(ARCH_X86_HAVE_SSSE3)
151 if (mUsesSimd) {
152 int32_t len = (x2 - x1 - 1) >> 1;
153 if (len > 0) {
154 rsdIntrinsicConvolve3x3_K(out, &py0[x1 - 1], &py1[x1 - 1], &py2[x1 - 1], mIp, len);
155 x1 += len << 1;
156 out += len << 1;
157 }
158 }
159 #endif
160
161 while (x1 != x2) {
162 convolveOneU<uchar4, float4>(x1, out, py0, py1, py2, mFp, mSizeX);
163 out++;
164 x1++;
165 }
166 }
167 }
168
169 #ifdef ANDROID_RENDERSCRIPT_TOOLKIT_SUPPORTS_FLOAT
170 template <typename T>
RsdCpuScriptIntrinsicConvolve3x3_kernelF(void * in,T * out,uint32_t xstart,uint32_t xend,uint32_t currentY,size_t sizeX,size_t sizeY,size_t vectorSize,float * fp)171 void RsdCpuScriptIntrinsicConvolve3x3_kernelF(void* in, T* out, uint32_t xstart, uint32_t xend,
172 uint32_t currentY, size_t sizeX, size_t sizeY,
173 size_t vectorSize, float* fp) {
174 const uchar* pin = (const uchar*)in;
175 const size_t stride = sizeX * vectorSize * 4; // float takes 4 bytes
176
177 uint32_t y1 = std::min((int32_t)currentY + 1, (int32_t)(sizeY - 1));
178 uint32_t y2 = std::max((int32_t)currentY - 1, 0);
179 const T* py0 = (const T*)(pin + stride * y2);
180 const T* py1 = (const T*)(pin + stride * currentY);
181 const T* py2 = (const T*)(pin + stride * y1);
182
183 for (uint32_t x = xstart; x < xend; x++, out++) {
184 ConvolveOneF<T>(x, out, py0, py1, py2, fp, sizeX);
185 }
186 }
187 #endif // ANDROID_RENDERSCRIPT_TOOLKIT_SUPPORTS_FLOAT
188
189 template <typename InputOutputType, typename ComputationType>
convolveU(const uchar * pin,uchar * pout,size_t vectorSize,size_t sizeX,size_t sizeY,size_t startX,size_t startY,size_t endX,size_t endY,float * fp)190 static void convolveU(const uchar* pin, uchar* pout, size_t vectorSize, size_t sizeX, size_t sizeY,
191 size_t startX, size_t startY, size_t endX, size_t endY, float* fp) {
192 const size_t stride = vectorSize * sizeX;
193 for (size_t y = startY; y < endY; y++) {
194 uint32_t y1 = std::min((int32_t)y + 1, (int32_t)(sizeY - 1));
195 uint32_t y2 = std::max((int32_t)y - 1, 0);
196
197 size_t offset = (y * sizeX + startX) * vectorSize;
198 InputOutputType* px = (InputOutputType*)(pout + offset);
199 InputOutputType* py0 = (InputOutputType*)(pin + stride * y2);
200 InputOutputType* py1 = (InputOutputType*)(pin + stride * y);
201 InputOutputType* py2 = (InputOutputType*)(pin + stride * y1);
202 for (uint32_t x = startX; x < endX; x++, px++) {
203 convolveOneU<InputOutputType, ComputationType>(x, px, py0, py1, py2, fp, sizeX);
204 }
205 }
206 }
207
convolveU4(const uchar * pin,uchar * pout,size_t vectorSize,size_t sizeX,size_t sizeY,size_t startX,size_t startY,size_t endX,size_t endY)208 void Convolve3x3Task::convolveU4(const uchar* pin, uchar* pout, size_t vectorSize, size_t sizeX,
209 size_t sizeY, size_t startX, size_t startY, size_t endX,
210 size_t endY) {
211 const size_t stride = paddedSize(vectorSize) * sizeX;
212 for (size_t y = startY; y < endY; y++) {
213 uint32_t y1 = std::min((int32_t)y + 1, (int32_t)(sizeY - 1));
214 uint32_t y2 = std::max((int32_t)y - 1, 0);
215
216 size_t offset = (y * sizeX + startX) * paddedSize(vectorSize);
217 uchar* px = pout + offset;
218 const uchar* py0 = pin + stride * y2;
219 const uchar* py1 = pin + stride * y;
220 const uchar* py2 = pin + stride * y1;
221 kernelU4(px, startX, endX, py0, py1, py2);
222 }
223 }
224
processData(int,size_t startX,size_t startY,size_t endX,size_t endY)225 void Convolve3x3Task::processData(int /* threadIndex */, size_t startX, size_t startY, size_t endX,
226 size_t endY) {
227 // ALOGI("Thread %d start tile from (%zd, %zd) to (%zd, %zd)", threadIndex, startX, startY,
228 // endX, endY);
229 switch (mVectorSize) {
230 case 1:
231 convolveU<uchar, float>((const uchar*)mIn, (uchar*)mOut, mVectorSize, mSizeX, mSizeY,
232 startX, startY, endX, endY, mFp);
233 break;
234 case 2:
235 convolveU<uchar2, float2>((const uchar*)mIn, (uchar*)mOut, mVectorSize, mSizeX, mSizeY,
236 startX, startY, endX, endY, mFp);
237 break;
238 case 3:
239 case 4:
240 convolveU4((const uchar*)mIn, (uchar*)mOut, mVectorSize, mSizeX, mSizeY, startX, startY,
241 endX, endY);
242 break;
243 }
244 }
245
convolve3x3(const void * in,void * out,size_t vectorSize,size_t sizeX,size_t sizeY,const float * coefficients,const Restriction * restriction)246 void RenderScriptToolkit::convolve3x3(const void* in, void* out, size_t vectorSize, size_t sizeX,
247 size_t sizeY, const float* coefficients,
248 const Restriction* restriction) {
249 #ifdef ANDROID_RENDERSCRIPT_TOOLKIT_VALIDATE
250 if (!validRestriction(LOG_TAG, sizeX, sizeY, restriction)) {
251 return;
252 }
253 if (vectorSize < 1 || vectorSize > 4) {
254 ALOGE("The vectorSize should be between 1 and 4. %zu provided.", vectorSize);
255 return;
256 }
257 #endif
258
259 Convolve3x3Task task(in, out, vectorSize, sizeX, sizeY, coefficients, restriction);
260 processor->doTask(&task);
261 }
262
263 } // namespace renderscript
264 } // namespace android
265