1 /*
2 * Copyright (C) 2012 The Android Open Source Project
3 *
4 * Licensed under the Apache License, Version 2.0 (the "License");
5 * you may not use this file except in compliance with the License.
6 * You may obtain a copy of the License at
7 *
8 * http://www.apache.org/licenses/LICENSE-2.0
9 *
10 * Unless required by applicable law or agreed to in writing, software
11 * distributed under the License is distributed on an "AS IS" BASIS,
12 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13 * See the License for the specific language governing permissions and
14 * limitations under the License.
15 */
16
17 #include <math.h>
18
19 #include <cstdint>
20
21 #include "RenderScriptToolkit.h"
22 #include "TaskProcessor.h"
23 #include "Utils.h"
24
25 namespace android {
26 namespace renderscript {
27
28 #define LOG_TAG "renderscript.toolkit.Blur"
29
30 /**
31 * Blurs an image or a section of an image.
32 *
33 * Our algorithm does two passes: a vertical blur followed by an horizontal blur.
34 */
35 class BlurTask : public Task {
36 // The image we're blurring.
37 const uchar* mIn;
38 // Where we store the blurred image.
39 uchar* outArray;
40 // The size of the kernel radius is limited to 25 in ScriptIntrinsicBlur.java.
41 // So, the max kernel size is 51 (= 2 * 25 + 1).
42 // Considering SSSE3 case, which requires the size is multiple of 4,
43 // at least 52 words are necessary. Values outside of the kernel should be 0.
44 float mFp[104];
45 uint16_t mIp[104];
46
47 // Working area to store the result of the vertical blur, to be used by the horizontal pass.
48 // There's one area per thread. Since the needed working area may be too large to put on the
49 // stack, we are allocating it from the heap. To avoid paying the allocation cost for each
50 // tile, we cache the scratch area here.
51 std::vector<void*> mScratch; // Pointers to the scratch areas, one per thread.
52 std::vector<size_t> mScratchSize; // The size in bytes of the scratch areas, one per thread.
53
54 // The radius of the blur, in floating point and integer format.
55 float mRadius;
56 int mIradius;
57
58 void kernelU4(void* outPtr, uint32_t xstart, uint32_t xend, uint32_t currentY,
59 uint32_t threadIndex);
60 void kernelU1(void* outPtr, uint32_t xstart, uint32_t xend, uint32_t currentY);
61 void ComputeGaussianWeights();
62
63 // Process a 2D tile of the overall work. threadIndex identifies which thread does the work.
64 virtual void processData(int threadIndex, size_t startX, size_t startY, size_t endX,
65 size_t endY) override;
66
67 public:
BlurTask(const uint8_t * in,uint8_t * out,size_t sizeX,size_t sizeY,size_t vectorSize,uint32_t threadCount,float radius,const Restriction * restriction)68 BlurTask(const uint8_t* in, uint8_t* out, size_t sizeX, size_t sizeY, size_t vectorSize,
69 uint32_t threadCount, float radius, const Restriction* restriction)
70 : Task{sizeX, sizeY, vectorSize, false, restriction},
71 mIn{in},
72 outArray{out},
73 mScratch{threadCount},
74 mScratchSize{threadCount},
75 mRadius{std::min(25.0f, radius)} {
76 ComputeGaussianWeights();
77 }
78
~BlurTask()79 ~BlurTask() {
80 for (size_t i = 0; i < mScratch.size(); i++) {
81 if (mScratch[i]) {
82 free(mScratch[i]);
83 }
84 }
85 }
86 };
87
ComputeGaussianWeights()88 void BlurTask::ComputeGaussianWeights() {
89 memset(mFp, 0, sizeof(mFp));
90 memset(mIp, 0, sizeof(mIp));
91
92 // Compute gaussian weights for the blur
93 // e is the euler's number
94 float e = 2.718281828459045f;
95 float pi = 3.1415926535897932f;
96 // g(x) = (1 / (sqrt(2 * pi) * sigma)) * e ^ (-x^2 / (2 * sigma^2))
97 // x is of the form [-radius .. 0 .. radius]
98 // and sigma varies with the radius.
99 // Based on some experimental radius values and sigmas,
100 // we approximately fit sigma = f(radius) as
101 // sigma = radius * 0.4 + 0.6
102 // The larger the radius gets, the more our gaussian blur
103 // will resemble a box blur since with large sigma
104 // the gaussian curve begins to lose its shape
105 float sigma = 0.4f * mRadius + 0.6f;
106
107 // Now compute the coefficients. We will store some redundant values to save
108 // some math during the blur calculations precompute some values
109 float coeff1 = 1.0f / (sqrtf(2.0f * pi) * sigma);
110 float coeff2 = - 1.0f / (2.0f * sigma * sigma);
111
112 float normalizeFactor = 0.0f;
113 float floatR = 0.0f;
114 int r;
115 mIradius = (float)ceil(mRadius) + 0.5f;
116 for (r = -mIradius; r <= mIradius; r ++) {
117 floatR = (float)r;
118 mFp[r + mIradius] = coeff1 * powf(e, floatR * floatR * coeff2);
119 normalizeFactor += mFp[r + mIradius];
120 }
121
122 // Now we need to normalize the weights because all our coefficients need to add up to one
123 normalizeFactor = 1.0f / normalizeFactor;
124 for (r = -mIradius; r <= mIradius; r ++) {
125 mFp[r + mIradius] *= normalizeFactor;
126 mIp[r + mIradius] = (uint16_t)(mFp[r + mIradius] * 65536.0f + 0.5f);
127 }
128 }
129
130 /**
131 * Vertical blur of a uchar4 line.
132 *
133 * @param sizeY Number of cells of the input array in the vertical direction.
134 * @param out Where to place the computed value.
135 * @param x Coordinate of the point we're blurring.
136 * @param y Coordinate of the point we're blurring.
137 * @param ptrIn Start of the input array.
138 * @param iStride The size in byte of a row of the input array.
139 * @param gPtr The gaussian coefficients.
140 * @param iradius The radius of the blur.
141 */
OneVU4(uint32_t sizeY,float4 * out,int32_t x,int32_t y,const uchar * ptrIn,int iStride,const float * gPtr,int iradius)142 static void OneVU4(uint32_t sizeY, float4* out, int32_t x, int32_t y, const uchar* ptrIn,
143 int iStride, const float* gPtr, int iradius) {
144 const uchar *pi = ptrIn + x*4;
145
146 float4 blurredPixel = 0;
147 for (int r = -iradius; r <= iradius; r ++) {
148 int validY = std::max((y + r), 0);
149 validY = std::min(validY, (int)(sizeY - 1));
150 const uchar4 *pvy = (const uchar4 *)&pi[validY * iStride];
151 float4 pf = convert<float4>(pvy[0]);
152 blurredPixel += pf * gPtr[0];
153 gPtr++;
154 }
155
156 out[0] = blurredPixel;
157 }
158
159 /**
160 * Vertical blur of a uchar1 line.
161 *
162 * @param sizeY Number of cells of the input array in the vertical direction.
163 * @param out Where to place the computed value.
164 * @param x Coordinate of the point we're blurring.
165 * @param y Coordinate of the point we're blurring.
166 * @param ptrIn Start of the input array.
167 * @param iStride The size in byte of a row of the input array.
168 * @param gPtr The gaussian coefficients.
169 * @param iradius The radius of the blur.
170 */
OneVU1(uint32_t sizeY,float * out,int32_t x,int32_t y,const uchar * ptrIn,int iStride,const float * gPtr,int iradius)171 static void OneVU1(uint32_t sizeY, float *out, int32_t x, int32_t y,
172 const uchar *ptrIn, int iStride, const float* gPtr, int iradius) {
173
174 const uchar *pi = ptrIn + x;
175
176 float blurredPixel = 0;
177 for (int r = -iradius; r <= iradius; r ++) {
178 int validY = std::max((y + r), 0);
179 validY = std::min(validY, (int)(sizeY - 1));
180 float pf = (float)pi[validY * iStride];
181 blurredPixel += pf * gPtr[0];
182 gPtr++;
183 }
184
185 out[0] = blurredPixel;
186 }
187
188
189 extern "C" void rsdIntrinsicBlurU1_K(uchar *out, uchar const *in, size_t w, size_t h,
190 size_t p, size_t x, size_t y, size_t count, size_t r, uint16_t const *tab);
191 extern "C" void rsdIntrinsicBlurU4_K(uchar4 *out, uchar4 const *in, size_t w, size_t h,
192 size_t p, size_t x, size_t y, size_t count, size_t r, uint16_t const *tab);
193
194 #if defined(ARCH_X86_HAVE_SSSE3)
195 extern void rsdIntrinsicBlurVFU4_K(void *dst, const void *pin, int stride, const void *gptr,
196 int rct, int x1, int ct);
197 extern void rsdIntrinsicBlurHFU4_K(void *dst, const void *pin, const void *gptr, int rct, int x1,
198 int ct);
199 extern void rsdIntrinsicBlurHFU1_K(void *dst, const void *pin, const void *gptr, int rct, int x1,
200 int ct);
201 #endif
202
203 /**
204 * Vertical blur of a line of RGBA, knowing that there's enough rows above and below us to avoid
205 * dealing with boundary conditions.
206 *
207 * @param out Where to store the results. This is the input to the horizontal blur.
208 * @param ptrIn The input data for this line.
209 * @param iStride The width of the input.
210 * @param gPtr The gaussian coefficients.
211 * @param ct The diameter of the blur.
212 * @param len How many cells to blur.
213 * @param usesSimd Whether this processor supports SIMD.
214 */
OneVFU4(float4 * out,const uchar * ptrIn,int iStride,const float * gPtr,int ct,int x2,bool usesSimd)215 static void OneVFU4(float4 *out, const uchar *ptrIn, int iStride, const float* gPtr, int ct,
216 int x2, bool usesSimd) {
217 int x1 = 0;
218 #if defined(ARCH_X86_HAVE_SSSE3)
219 if (usesSimd) {
220 int t = (x2 - x1);
221 t &= ~1;
222 if (t) {
223 rsdIntrinsicBlurVFU4_K(out, ptrIn, iStride, gPtr, ct, x1, x1 + t);
224 }
225 x1 += t;
226 out += t;
227 ptrIn += t << 2;
228 }
229 #else
230 (void) usesSimd; // Avoid unused parameter warning.
231 #endif
232 while(x2 > x1) {
233 const uchar *pi = ptrIn;
234 float4 blurredPixel = 0;
235 const float* gp = gPtr;
236
237 for (int r = 0; r < ct; r++) {
238 float4 pf = convert<float4>(((const uchar4 *)pi)[0]);
239 blurredPixel += pf * gp[0];
240 pi += iStride;
241 gp++;
242 }
243 out->xyzw = blurredPixel;
244 x1++;
245 out++;
246 ptrIn+=4;
247 }
248 }
249
250 /**
251 * Vertical blur of a line of U_8, knowing that there's enough rows above and below us to avoid
252 * dealing with boundary conditions.
253 *
254 * @param out Where to store the results. This is the input to the horizontal blur.
255 * @param ptrIn The input data for this line.
256 * @param iStride The width of the input.
257 * @param gPtr The gaussian coefficients.
258 * @param ct The diameter of the blur.
259 * @param len How many cells to blur.
260 * @param usesSimd Whether this processor supports SIMD.
261 */
OneVFU1(float * out,const uchar * ptrIn,int iStride,const float * gPtr,int ct,int len,bool usesSimd)262 static void OneVFU1(float* out, const uchar* ptrIn, int iStride, const float* gPtr, int ct, int len,
263 bool usesSimd) {
264 int x1 = 0;
265
266 while((len > x1) && (((uintptr_t)ptrIn) & 0x3)) {
267 const uchar *pi = ptrIn;
268 float blurredPixel = 0;
269 const float* gp = gPtr;
270
271 for (int r = 0; r < ct; r++) {
272 float pf = (float)pi[0];
273 blurredPixel += pf * gp[0];
274 pi += iStride;
275 gp++;
276 }
277 out[0] = blurredPixel;
278 x1++;
279 out++;
280 ptrIn++;
281 len--;
282 }
283 #if defined(ARCH_X86_HAVE_SSSE3)
284 if (usesSimd && (len > x1)) {
285 int t = (len - x1) >> 2;
286 t &= ~1;
287 if (t) {
288 rsdIntrinsicBlurVFU4_K(out, ptrIn, iStride, gPtr, ct, 0, t );
289 len -= t << 2;
290 ptrIn += t << 2;
291 out += t << 2;
292 }
293 }
294 #else
295 (void) usesSimd; // Avoid unused parameter warning.
296 #endif
297 while(len > 0) {
298 const uchar *pi = ptrIn;
299 float blurredPixel = 0;
300 const float* gp = gPtr;
301
302 for (int r = 0; r < ct; r++) {
303 float pf = (float)pi[0];
304 blurredPixel += pf * gp[0];
305 pi += iStride;
306 gp++;
307 }
308 out[0] = blurredPixel;
309 len--;
310 out++;
311 ptrIn++;
312 }
313 }
314
315 /**
316 * Horizontal blur of a uchar4 line.
317 *
318 * @param sizeX Number of cells of the input array in the horizontal direction.
319 * @param out Where to place the computed value.
320 * @param x Coordinate of the point we're blurring.
321 * @param ptrIn The start of the input row from which we're indexing x.
322 * @param gPtr The gaussian coefficients.
323 * @param iradius The radius of the blur.
324 */
OneHU4(uint32_t sizeX,uchar4 * out,int32_t x,const float4 * ptrIn,const float * gPtr,int iradius)325 static void OneHU4(uint32_t sizeX, uchar4* out, int32_t x, const float4* ptrIn, const float* gPtr,
326 int iradius) {
327 float4 blurredPixel = 0;
328 for (int r = -iradius; r <= iradius; r ++) {
329 int validX = std::max((x + r), 0);
330 validX = std::min(validX, (int)(sizeX - 1));
331 float4 pf = ptrIn[validX];
332 blurredPixel += pf * gPtr[0];
333 gPtr++;
334 }
335
336 out->xyzw = convert<uchar4>(blurredPixel);
337 }
338
339 /**
340 * Horizontal blur of a uchar line.
341 *
342 * @param sizeX Number of cells of the input array in the horizontal direction.
343 * @param out Where to place the computed value.
344 * @param x Coordinate of the point we're blurring.
345 * @param ptrIn The start of the input row from which we're indexing x.
346 * @param gPtr The gaussian coefficients.
347 * @param iradius The radius of the blur.
348 */
OneHU1(uint32_t sizeX,uchar * out,int32_t x,const float * ptrIn,const float * gPtr,int iradius)349 static void OneHU1(uint32_t sizeX, uchar* out, int32_t x, const float* ptrIn, const float* gPtr,
350 int iradius) {
351 float blurredPixel = 0;
352 for (int r = -iradius; r <= iradius; r ++) {
353 int validX = std::max((x + r), 0);
354 validX = std::min(validX, (int)(sizeX - 1));
355 float pf = ptrIn[validX];
356 blurredPixel += pf * gPtr[0];
357 gPtr++;
358 }
359
360 out[0] = (uchar)blurredPixel;
361 }
362
363 /**
364 * Full blur of a line of RGBA data.
365 *
366 * @param outPtr Where to store the results
367 * @param xstart The index of the section we're starting to blur.
368 * @param xend The end index of the section.
369 * @param currentY The index of the line we're blurring.
370 * @param usesSimd Whether this processor supports SIMD.
371 */
kernelU4(void * outPtr,uint32_t xstart,uint32_t xend,uint32_t currentY,uint32_t threadIndex)372 void BlurTask::kernelU4(void *outPtr, uint32_t xstart, uint32_t xend, uint32_t currentY,
373 uint32_t threadIndex) {
374 float4 stackbuf[2048];
375 float4 *buf = &stackbuf[0];
376 const uint32_t stride = mSizeX * mVectorSize;
377
378 uchar4 *out = (uchar4 *)outPtr;
379 uint32_t x1 = xstart;
380 uint32_t x2 = xend;
381
382 #if defined(ARCH_ARM_USE_INTRINSICS)
383 if (mUsesSimd && mSizeX >= 4) {
384 rsdIntrinsicBlurU4_K(out, (uchar4 const *)(mIn + stride * currentY),
385 mSizeX, mSizeY,
386 stride, x1, currentY, x2 - x1, mIradius, mIp + mIradius);
387 return;
388 }
389 #endif
390
391 if (mSizeX > 2048) {
392 if ((mSizeX > mScratchSize[threadIndex]) || !mScratch[threadIndex]) {
393 // Pad the side of the allocation by one unit to allow alignment later
394 mScratch[threadIndex] = realloc(mScratch[threadIndex], (mSizeX + 1) * 16);
395 mScratchSize[threadIndex] = mSizeX;
396 }
397 // realloc only aligns to 8 bytes so we manually align to 16.
398 buf = (float4 *) ((((intptr_t)mScratch[threadIndex]) + 15) & ~0xf);
399 }
400 float4 *fout = (float4 *)buf;
401 int y = currentY;
402 if ((y > mIradius) && (y < ((int)mSizeY - mIradius))) {
403 const uchar *pi = mIn + (y - mIradius) * stride;
404 OneVFU4(fout, pi, stride, mFp, mIradius * 2 + 1, mSizeX, mUsesSimd);
405 } else {
406 x1 = 0;
407 while(mSizeX > x1) {
408 OneVU4(mSizeY, fout, x1, y, mIn, stride, mFp, mIradius);
409 fout++;
410 x1++;
411 }
412 }
413
414 x1 = xstart;
415 while ((x1 < (uint32_t)mIradius) && (x1 < x2)) {
416 OneHU4(mSizeX, out, x1, buf, mFp, mIradius);
417 out++;
418 x1++;
419 }
420 #if defined(ARCH_X86_HAVE_SSSE3)
421 if (mUsesSimd) {
422 if ((x1 + mIradius) < x2) {
423 rsdIntrinsicBlurHFU4_K(out, buf - mIradius, mFp,
424 mIradius * 2 + 1, x1, x2 - mIradius);
425 out += (x2 - mIradius) - x1;
426 x1 = x2 - mIradius;
427 }
428 }
429 #endif
430 while(x2 > x1) {
431 OneHU4(mSizeX, out, x1, buf, mFp, mIradius);
432 out++;
433 x1++;
434 }
435 }
436
437 /**
438 * Full blur of a line of U_8 data.
439 *
440 * @param outPtr Where to store the results
441 * @param xstart The index of the section we're starting to blur.
442 * @param xend The end index of the section.
443 * @param currentY The index of the line we're blurring.
444 */
kernelU1(void * outPtr,uint32_t xstart,uint32_t xend,uint32_t currentY)445 void BlurTask::kernelU1(void *outPtr, uint32_t xstart, uint32_t xend, uint32_t currentY) {
446 float buf[4 * 2048];
447 const uint32_t stride = mSizeX * mVectorSize;
448
449 uchar *out = (uchar *)outPtr;
450 uint32_t x1 = xstart;
451 uint32_t x2 = xend;
452
453 #if defined(ARCH_ARM_USE_INTRINSICS)
454 if (mUsesSimd && mSizeX >= 16) {
455 // The specialisation for r<=8 has an awkward prefill case, which is
456 // fiddly to resolve, where starting close to the right edge can cause
457 // a read beyond the end of input. So avoid that case here.
458 if (mIradius > 8 || (mSizeX - std::max(0, (int32_t)x1 - 8)) >= 16) {
459 rsdIntrinsicBlurU1_K(out, mIn + stride * currentY, mSizeX, mSizeY,
460 stride, x1, currentY, x2 - x1, mIradius, mIp + mIradius);
461 return;
462 }
463 }
464 #endif
465
466 float *fout = (float *)buf;
467 int y = currentY;
468 if ((y > mIradius) && (y < ((int)mSizeY - mIradius -1))) {
469 const uchar *pi = mIn + (y - mIradius) * stride;
470 OneVFU1(fout, pi, stride, mFp, mIradius * 2 + 1, mSizeX, mUsesSimd);
471 } else {
472 x1 = 0;
473 while(mSizeX > x1) {
474 OneVU1(mSizeY, fout, x1, y, mIn, stride, mFp, mIradius);
475 fout++;
476 x1++;
477 }
478 }
479
480 x1 = xstart;
481 while ((x1 < x2) &&
482 ((x1 < (uint32_t)mIradius) || (((uintptr_t)out) & 0x3))) {
483 OneHU1(mSizeX, out, x1, buf, mFp, mIradius);
484 out++;
485 x1++;
486 }
487 #if defined(ARCH_X86_HAVE_SSSE3)
488 if (mUsesSimd) {
489 if ((x1 + mIradius) < x2) {
490 uint32_t len = x2 - (x1 + mIradius);
491 len &= ~3;
492
493 // rsdIntrinsicBlurHFU1_K() processes each four float values in |buf| at once, so it
494 // nees to ensure four more values can be accessed in order to avoid accessing
495 // uninitialized buffer.
496 if (len > 4) {
497 len -= 4;
498 rsdIntrinsicBlurHFU1_K(out, ((float *)buf) - mIradius, mFp,
499 mIradius * 2 + 1, x1, x1 + len);
500 out += len;
501 x1 += len;
502 }
503 }
504 }
505 #endif
506 while(x2 > x1) {
507 OneHU1(mSizeX, out, x1, buf, mFp, mIradius);
508 out++;
509 x1++;
510 }
511 }
512
processData(int threadIndex,size_t startX,size_t startY,size_t endX,size_t endY)513 void BlurTask::processData(int threadIndex, size_t startX, size_t startY, size_t endX,
514 size_t endY) {
515 for (size_t y = startY; y < endY; y++) {
516 void* outPtr = outArray + (mSizeX * y + startX) * mVectorSize;
517 if (mVectorSize == 4) {
518 kernelU4(outPtr, startX, endX, y, threadIndex);
519 } else {
520 kernelU1(outPtr, startX, endX, y);
521 }
522 }
523 }
524
blur(const uint8_t * in,uint8_t * out,size_t sizeX,size_t sizeY,size_t vectorSize,int radius,const Restriction * restriction)525 void RenderScriptToolkit::blur(const uint8_t* in, uint8_t* out, size_t sizeX, size_t sizeY,
526 size_t vectorSize, int radius, const Restriction* restriction) {
527 #ifdef ANDROID_RENDERSCRIPT_TOOLKIT_VALIDATE
528 if (!validRestriction(LOG_TAG, sizeX, sizeY, restriction)) {
529 return;
530 }
531 if (radius <= 0 || radius > 25) {
532 ALOGE("The radius should be between 1 and 25. %d provided.", radius);
533 }
534 if (vectorSize != 1 && vectorSize != 4) {
535 ALOGE("The vectorSize should be 1 or 4. %zu provided.", vectorSize);
536 }
537 #endif
538
539 BlurTask task(in, out, sizeX, sizeY, vectorSize, processor->getNumberOfThreads(), radius,
540 restriction);
541 processor->doTask(&task);
542 }
543
544 } // namespace renderscript
545 } // namespace android
546