1 /*
2 * Copyright (C) 2012 The Android Open Source Project
3 *
4 * Licensed under the Apache License, Version 2.0 (the "License");
5 * you may not use this file except in compliance with the License.
6 * You may obtain a copy of the License at
7 *
8 * http://www.apache.org/licenses/LICENSE-2.0
9 *
10 * Unless required by applicable law or agreed to in writing, software
11 * distributed under the License is distributed on an "AS IS" BASIS,
12 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13 * See the License for the specific language governing permissions and
14 * limitations under the License.
15 */
16
17 #include "rsCpuIntrinsic.h"
18 #include "rsCpuIntrinsicInlines.h"
19
20 using namespace android;
21 using namespace android::renderscript;
22
23 namespace android {
24 namespace renderscript {
25
26
27 class RsdCpuScriptIntrinsicBlur : public RsdCpuScriptIntrinsic {
28 public:
29 virtual void populateScript(Script *);
30 virtual void invokeFreeChildren();
31
32 virtual void setGlobalVar(uint32_t slot, const void *data, size_t dataLength);
33 virtual void setGlobalObj(uint32_t slot, ObjectBase *data);
34
35 virtual ~RsdCpuScriptIntrinsicBlur();
36 RsdCpuScriptIntrinsicBlur(RsdCpuReferenceImpl *ctx, const Script *s, const Element *e);
37
38 protected:
39 float mFp[104];
40 short mIp[104];
41 void **mScratch;
42 size_t *mScratchSize;
43 float mRadius;
44 int mIradius;
45 ObjectBaseRef<Allocation> mAlloc;
46
47 static void kernelU4(const RsForEachStubParamStruct *p,
48 uint32_t xstart, uint32_t xend,
49 uint32_t instep, uint32_t outstep);
50 static void kernelU1(const RsForEachStubParamStruct *p,
51 uint32_t xstart, uint32_t xend,
52 uint32_t instep, uint32_t outstep);
53 void ComputeGaussianWeights();
54 };
55
56 }
57 }
58
59
ComputeGaussianWeights()60 void RsdCpuScriptIntrinsicBlur::ComputeGaussianWeights() {
61 memset(mFp, 0, sizeof(mFp));
62 memset(mIp, 0, sizeof(mIp));
63
64 // Compute gaussian weights for the blur
65 // e is the euler's number
66 float e = 2.718281828459045f;
67 float pi = 3.1415926535897932f;
68 // g(x) = ( 1 / sqrt( 2 * pi ) * sigma) * e ^ ( -x^2 / 2 * sigma^2 )
69 // x is of the form [-radius .. 0 .. radius]
70 // and sigma varies with radius.
71 // Based on some experimental radius values and sigma's
72 // we approximately fit sigma = f(radius) as
73 // sigma = radius * 0.4 + 0.6
74 // The larger the radius gets, the more our gaussian blur
75 // will resemble a box blur since with large sigma
76 // the gaussian curve begins to lose its shape
77 float sigma = 0.4f * mRadius + 0.6f;
78
79 // Now compute the coefficients. We will store some redundant values to save
80 // some math during the blur calculations precompute some values
81 float coeff1 = 1.0f / (sqrtf(2.0f * pi) * sigma);
82 float coeff2 = - 1.0f / (2.0f * sigma * sigma);
83
84 float normalizeFactor = 0.0f;
85 float floatR = 0.0f;
86 int r;
87 mIradius = (float)ceil(mRadius) + 0.5f;
88 for (r = -mIradius; r <= mIradius; r ++) {
89 floatR = (float)r;
90 mFp[r + mIradius] = coeff1 * powf(e, floatR * floatR * coeff2);
91 normalizeFactor += mFp[r + mIradius];
92 }
93
94 //Now we need to normalize the weights because all our coefficients need to add up to one
95 normalizeFactor = 1.0f / normalizeFactor;
96 for (r = -mIradius; r <= mIradius; r ++) {
97 mFp[r + mIradius] *= normalizeFactor;
98 mIp[r + mIradius] = (short)(mIp[r + mIradius] * 32768);
99 }
100 }
101
setGlobalObj(uint32_t slot,ObjectBase * data)102 void RsdCpuScriptIntrinsicBlur::setGlobalObj(uint32_t slot, ObjectBase *data) {
103 rsAssert(slot == 1);
104 mAlloc.set(static_cast<Allocation *>(data));
105 }
106
setGlobalVar(uint32_t slot,const void * data,size_t dataLength)107 void RsdCpuScriptIntrinsicBlur::setGlobalVar(uint32_t slot, const void *data, size_t dataLength) {
108 rsAssert(slot == 0);
109 mRadius = ((const float *)data)[0];
110 ComputeGaussianWeights();
111 }
112
113
114
OneVU4(const RsForEachStubParamStruct * p,float4 * out,int32_t x,int32_t y,const uchar * ptrIn,int iStride,const float * gPtr,int iradius)115 static void OneVU4(const RsForEachStubParamStruct *p, float4 *out, int32_t x, int32_t y,
116 const uchar *ptrIn, int iStride, const float* gPtr, int iradius) {
117
118 const uchar *pi = ptrIn + x*4;
119
120 float4 blurredPixel = 0;
121 for (int r = -iradius; r <= iradius; r ++) {
122 int validY = rsMax((y + r), 0);
123 validY = rsMin(validY, (int)(p->dimY - 1));
124 const uchar4 *pvy = (const uchar4 *)&pi[validY * iStride];
125 float4 pf = convert_float4(pvy[0]);
126 blurredPixel += pf * gPtr[0];
127 gPtr++;
128 }
129
130 out->xyzw = blurredPixel;
131 }
132
OneVU1(const RsForEachStubParamStruct * p,float * out,int32_t x,int32_t y,const uchar * ptrIn,int iStride,const float * gPtr,int iradius)133 static void OneVU1(const RsForEachStubParamStruct *p, float *out, int32_t x, int32_t y,
134 const uchar *ptrIn, int iStride, const float* gPtr, int iradius) {
135
136 const uchar *pi = ptrIn + x;
137
138 float blurredPixel = 0;
139 for (int r = -iradius; r <= iradius; r ++) {
140 int validY = rsMax((y + r), 0);
141 validY = rsMin(validY, (int)(p->dimY - 1));
142 float pf = (float)pi[validY * iStride];
143 blurredPixel += pf * gPtr[0];
144 gPtr++;
145 }
146
147 out[0] = blurredPixel;
148 }
149
150 extern "C" void rsdIntrinsicBlurVFU4_K(void *dst, const void *pin, int stride, const void *gptr, int rct, int x1, int ct);
151 extern "C" void rsdIntrinsicBlurHFU4_K(void *dst, const void *pin, const void *gptr, int rct, int x1, int ct);
152 extern "C" void rsdIntrinsicBlurHFU1_K(void *dst, const void *pin, const void *gptr, int rct, int x1, int ct);
153
OneVFU4(float4 * out,const uchar * ptrIn,int iStride,const float * gPtr,int ct,int x1,int x2)154 static void OneVFU4(float4 *out,
155 const uchar *ptrIn, int iStride, const float* gPtr, int ct,
156 int x1, int x2) {
157
158 #if defined(ARCH_ARM_HAVE_VFP)
159 if (gArchUseSIMD) {
160 int t = (x2 - x1);
161 t &= ~1;
162 if(t) {
163 rsdIntrinsicBlurVFU4_K(out, ptrIn, iStride, gPtr, ct, x1, x1 + t);
164 }
165 x1 += t;
166 }
167 #endif
168
169 while(x2 > x1) {
170 const uchar *pi = ptrIn;
171 float4 blurredPixel = 0;
172 const float* gp = gPtr;
173
174 for (int r = 0; r < ct; r++) {
175 float4 pf = convert_float4(((const uchar4 *)pi)[0]);
176 blurredPixel += pf * gp[0];
177 pi += iStride;
178 gp++;
179 }
180 out->xyzw = blurredPixel;
181 x1++;
182 out++;
183 ptrIn+=4;
184 }
185 }
186
/**
 * Accumulates ct bytes spaced iStride apart, starting at src, with the
 * weights gPtr[0 .. ct - 1] and returns the weighted sum.
 */
static inline float BlurColumnU1(const uchar *src, int iStride, const float *gPtr, int ct) {
    float sum = 0;
    for (int tap = 0; tap < ct; tap++) {
        sum += (float)src[0] * gPtr[tap];
        src += iStride;
    }
    return sum;
}

/**
 * Vertical blur of a run of one-channel pixels without edge clamping.
 *
 * Processes x2 - x1 consecutive pixels in three phases: a scalar lead-in that
 * runs until ptrIn is 4-byte aligned, an optional SIMD phase (ARM builds
 * only), and a scalar tail for whatever is left.
 *
 * @param out      Receives one float per pixel, starting at out[0].
 * @param ptrIn    First byte of the top-most source row for the first pixel.
 * @param iStride  Distance in bytes between consecutive rows.
 * @param gPtr     The ct filter weights.
 * @param ct       Number of rows (taps) to accumulate.
 * @param x1       Index of the first pixel.
 * @param x2       One past the index of the last pixel.
 */
static void OneVFU1(float *out,
                    const uchar *ptrIn, int iStride, const float* gPtr, int ct, int x1, int x2) {

    int remaining = x2 - x1;

    // Lead-in: one pixel at a time until the source pointer is 4-byte aligned.
    for (; (x2 > x1) && (((uintptr_t)ptrIn) & 0x3); x1++, remaining--) {
        out[0] = BlurColumnU1(ptrIn, iStride, gPtr, ct);
        out++;
        ptrIn++;
    }

#if defined(ARCH_ARM_HAVE_VFP)
    if (gArchUseSIMD && (x2 > x1)) {
        // The four-channel kernel is reused here: each group of four
        // consecutive bytes is handed over as one "pixel", and the kernel is
        // given an even number of such groups.
        const int groups = ((x2 - x1) >> 2) & ~1;
        if (groups) {
            rsdIntrinsicBlurVFU4_K(out, ptrIn, iStride, gPtr, ct, 0, groups);
            const int pixels = groups << 2;
            remaining -= pixels;
            ptrIn += pixels;
            out += pixels;
        }
    }
#endif

    // Tail: whatever the previous phases did not cover.
    for (; remaining > 0; remaining--) {
        out[0] = BlurColumnU1(ptrIn, iStride, gPtr, ct);
        out++;
        ptrIn++;
    }
}
240
OneHU4(const RsForEachStubParamStruct * p,uchar4 * out,int32_t x,const float4 * ptrIn,const float * gPtr,int iradius)241 static void OneHU4(const RsForEachStubParamStruct *p, uchar4 *out, int32_t x,
242 const float4 *ptrIn, const float* gPtr, int iradius) {
243
244 float4 blurredPixel = 0;
245 for (int r = -iradius; r <= iradius; r ++) {
246 int validX = rsMax((x + r), 0);
247 validX = rsMin(validX, (int)(p->dimX - 1));
248 float4 pf = ptrIn[validX];
249 blurredPixel += pf * gPtr[0];
250 gPtr++;
251 }
252
253 out->xyzw = convert_uchar4(blurredPixel);
254 }
255
OneHU1(const RsForEachStubParamStruct * p,uchar * out,int32_t x,const float * ptrIn,const float * gPtr,int iradius)256 static void OneHU1(const RsForEachStubParamStruct *p, uchar *out, int32_t x,
257 const float *ptrIn, const float* gPtr, int iradius) {
258
259 float blurredPixel = 0;
260 for (int r = -iradius; r <= iradius; r ++) {
261 int validX = rsMax((x + r), 0);
262 validX = rsMin(validX, (int)(p->dimX - 1));
263 float pf = ptrIn[validX];
264 blurredPixel += pf * gPtr[0];
265 gPtr++;
266 }
267
268 out[0] = (uchar)blurredPixel;
269 }
270
271
kernelU4(const RsForEachStubParamStruct * p,uint32_t xstart,uint32_t xend,uint32_t instep,uint32_t outstep)272 void RsdCpuScriptIntrinsicBlur::kernelU4(const RsForEachStubParamStruct *p,
273 uint32_t xstart, uint32_t xend,
274 uint32_t instep, uint32_t outstep) {
275
276 float4 stackbuf[2048];
277 float4 *buf = &stackbuf[0];
278 RsdCpuScriptIntrinsicBlur *cp = (RsdCpuScriptIntrinsicBlur *)p->usr;
279 if (!cp->mAlloc.get()) {
280 ALOGE("Blur executed without input, skipping");
281 return;
282 }
283 const uchar *pin = (const uchar *)cp->mAlloc->mHal.drvState.lod[0].mallocPtr;
284 const size_t stride = cp->mAlloc->mHal.drvState.lod[0].stride;
285
286 uchar4 *out = (uchar4 *)p->out;
287 uint32_t x1 = xstart;
288 uint32_t x2 = xend;
289
290 if (p->dimX > 2048) {
291 if ((p->dimX > cp->mScratchSize[p->lid]) || !cp->mScratch[p->lid]) {
292 // Pad the side of the allocation by one unit to allow alignment later
293 cp->mScratch[p->lid] = realloc(cp->mScratch[p->lid], (p->dimX + 1) * 16);
294 cp->mScratchSize[p->lid] = p->dimX;
295 }
296 // realloc only aligns to 8 bytes so we manually align to 16.
297 buf = (float4 *) ((((intptr_t)cp->mScratch[p->lid]) + 15) & ~0xf);
298 }
299 float4 *fout = (float4 *)buf;
300 int y = p->y;
301 if ((y > cp->mIradius) && (y < ((int)p->dimY - cp->mIradius))) {
302 const uchar *pi = pin + (y - cp->mIradius) * stride;
303 OneVFU4(fout, pi, stride, cp->mFp, cp->mIradius * 2 + 1, x1, x2);
304 } else {
305 while(x2 > x1) {
306 OneVU4(p, fout, x1, y, pin, stride, cp->mFp, cp->mIradius);
307 fout++;
308 x1++;
309 }
310 }
311
312 x1 = xstart;
313 while ((x1 < (uint32_t)cp->mIradius) && (x1 < x2)) {
314 OneHU4(p, out, x1, buf, cp->mFp, cp->mIradius);
315 out++;
316 x1++;
317 }
318 #if defined(ARCH_ARM_HAVE_VFP)
319 if (gArchUseSIMD) {
320 if ((x1 + cp->mIradius) < x2) {
321 rsdIntrinsicBlurHFU4_K(out, buf - cp->mIradius, cp->mFp,
322 cp->mIradius * 2 + 1, x1, x2 - cp->mIradius);
323 out += (x2 - cp->mIradius) - x1;
324 x1 = x2 - cp->mIradius;
325 }
326 }
327 #endif
328 while(x2 > x1) {
329 OneHU4(p, out, x1, buf, cp->mFp, cp->mIradius);
330 out++;
331 x1++;
332 }
333 }
334
kernelU1(const RsForEachStubParamStruct * p,uint32_t xstart,uint32_t xend,uint32_t instep,uint32_t outstep)335 void RsdCpuScriptIntrinsicBlur::kernelU1(const RsForEachStubParamStruct *p,
336 uint32_t xstart, uint32_t xend,
337 uint32_t instep, uint32_t outstep) {
338 float buf[4 * 2048];
339 RsdCpuScriptIntrinsicBlur *cp = (RsdCpuScriptIntrinsicBlur *)p->usr;
340 if (!cp->mAlloc.get()) {
341 ALOGE("Blur executed without input, skipping");
342 return;
343 }
344 const uchar *pin = (const uchar *)cp->mAlloc->mHal.drvState.lod[0].mallocPtr;
345 const size_t stride = cp->mAlloc->mHal.drvState.lod[0].stride;
346
347 uchar *out = (uchar *)p->out;
348 uint32_t x1 = xstart;
349 uint32_t x2 = xend;
350
351 float *fout = (float *)buf;
352 int y = p->y;
353 if ((y > cp->mIradius) && (y < ((int)p->dimY - cp->mIradius -1))) {
354 const uchar *pi = pin + (y - cp->mIradius) * stride;
355 OneVFU1(fout, pi, stride, cp->mFp, cp->mIradius * 2 + 1, x1, x2);
356 } else {
357 while(x2 > x1) {
358 OneVU1(p, fout, x1, y, pin, stride, cp->mFp, cp->mIradius);
359 fout++;
360 x1++;
361 }
362 }
363
364 x1 = xstart;
365 while ((x1 < x2) &&
366 ((x1 < (uint32_t)cp->mIradius) || (((uintptr_t)out) & 0x3))) {
367 OneHU1(p, out, x1, buf, cp->mFp, cp->mIradius);
368 out++;
369 x1++;
370 }
371 #if defined(ARCH_ARM_HAVE_VFP)
372 if (gArchUseSIMD) {
373 if ((x1 + cp->mIradius) < x2) {
374 uint32_t len = x2 - (x1 + cp->mIradius);
375 len &= ~3;
376 if (len > 0) {
377 rsdIntrinsicBlurHFU1_K(out, ((float *)buf) - cp->mIradius, cp->mFp,
378 cp->mIradius * 2 + 1, x1, x1 + len);
379 out += len;
380 x1 += len;
381 }
382 }
383 }
384 #endif
385 while(x2 > x1) {
386 OneHU1(p, out, x1, buf, cp->mFp, cp->mIradius);
387 out++;
388 x1++;
389 }
390 }
391
RsdCpuScriptIntrinsicBlur(RsdCpuReferenceImpl * ctx,const Script * s,const Element * e)392 RsdCpuScriptIntrinsicBlur::RsdCpuScriptIntrinsicBlur(RsdCpuReferenceImpl *ctx,
393 const Script *s, const Element *e)
394 : RsdCpuScriptIntrinsic(ctx, s, e, RS_SCRIPT_INTRINSIC_ID_BLUR) {
395
396 mRootPtr = NULL;
397 if (e->getType() == RS_TYPE_UNSIGNED_8) {
398 switch (e->getVectorSize()) {
399 case 1:
400 mRootPtr = &kernelU1;
401 break;
402 case 4:
403 mRootPtr = &kernelU4;
404 break;
405 }
406 }
407 rsAssert(mRootPtr);
408 mRadius = 5;
409
410 mScratch = new void *[mCtx->getThreadCount()];
411 mScratchSize = new size_t[mCtx->getThreadCount()];
412 memset(mScratch, 0, sizeof(void *) * mCtx->getThreadCount());
413 memset(mScratchSize, 0, sizeof(size_t) * mCtx->getThreadCount());
414
415 ComputeGaussianWeights();
416 }
417
~RsdCpuScriptIntrinsicBlur()418 RsdCpuScriptIntrinsicBlur::~RsdCpuScriptIntrinsicBlur() {
419 uint32_t threads = mCtx->getThreadCount();
420 if (mScratch) {
421 for (size_t i = 0; i < threads; i++) {
422 if (mScratch[i]) {
423 free(mScratch[i]);
424 }
425 }
426 delete []mScratch;
427 }
428 if (mScratchSize) {
429 delete []mScratchSize;
430 }
431 }
432
populateScript(Script * s)433 void RsdCpuScriptIntrinsicBlur::populateScript(Script *s) {
434 s->mHal.info.exportedVariableCount = 2;
435 }
436
invokeFreeChildren()437 void RsdCpuScriptIntrinsicBlur::invokeFreeChildren() {
438 mAlloc.clear();
439 }
440
441
rsdIntrinsic_Blur(RsdCpuReferenceImpl * ctx,const Script * s,const Element * e)442 RsdCpuScriptImpl * rsdIntrinsic_Blur(RsdCpuReferenceImpl *ctx, const Script *s, const Element *e) {
443
444 return new RsdCpuScriptIntrinsicBlur(ctx, s, e);
445 }
446
447
448