1 /*
2 * Copyright (C) 2012 The Android Open Source Project
3 *
4 * Licensed under the Apache License, Version 2.0 (the "License");
5 * you may not use this file except in compliance with the License.
6 * You may obtain a copy of the License at
7 *
8 * http://www.apache.org/licenses/LICENSE-2.0
9 *
10 * Unless required by applicable law or agreed to in writing, software
11 * distributed under the License is distributed on an "AS IS" BASIS,
12 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13 * See the License for the specific language governing permissions and
14 * limitations under the License.
15 */
16
17
18 #include "rsCpuIntrinsic.h"
19 #include "rsCpuIntrinsicInlines.h"
20
21 using namespace android;
22 using namespace android::renderscript;
23
24 namespace android {
25 namespace renderscript {
26
27
28 class RsdCpuScriptIntrinsicConvolve5x5 : public RsdCpuScriptIntrinsic {
29 public:
30 virtual void populateScript(Script *);
31 virtual void invokeFreeChildren();
32
33 virtual void setGlobalVar(uint32_t slot, const void *data, size_t dataLength);
34 virtual void setGlobalObj(uint32_t slot, ObjectBase *data);
35
36 virtual ~RsdCpuScriptIntrinsicConvolve5x5();
37 RsdCpuScriptIntrinsicConvolve5x5(RsdCpuReferenceImpl *ctx, const Script *s, const Element *e);
38
39 protected:
40 float mFp[28];
41 short mIp[28];
42 ObjectBaseRef<Allocation> alloc;
43
44
45 static void kernelU1(const RsForEachStubParamStruct *p,
46 uint32_t xstart, uint32_t xend,
47 uint32_t instep, uint32_t outstep);
48 static void kernelU2(const RsForEachStubParamStruct *p,
49 uint32_t xstart, uint32_t xend,
50 uint32_t instep, uint32_t outstep);
51 static void kernelU4(const RsForEachStubParamStruct *p,
52 uint32_t xstart, uint32_t xend,
53 uint32_t instep, uint32_t outstep);
54 static void kernelF1(const RsForEachStubParamStruct *p,
55 uint32_t xstart, uint32_t xend,
56 uint32_t instep, uint32_t outstep);
57 static void kernelF2(const RsForEachStubParamStruct *p,
58 uint32_t xstart, uint32_t xend,
59 uint32_t instep, uint32_t outstep);
60 static void kernelF4(const RsForEachStubParamStruct *p,
61 uint32_t xstart, uint32_t xend,
62 uint32_t instep, uint32_t outstep);
63
64
65 };
66
67 }
68 }
69
setGlobalObj(uint32_t slot,ObjectBase * data)70 void RsdCpuScriptIntrinsicConvolve5x5::setGlobalObj(uint32_t slot, ObjectBase *data) {
71 rsAssert(slot == 1);
72 alloc.set(static_cast<Allocation *>(data));
73 }
74
setGlobalVar(uint32_t slot,const void * data,size_t dataLength)75 void RsdCpuScriptIntrinsicConvolve5x5::setGlobalVar(uint32_t slot,
76 const void *data, size_t dataLength) {
77 rsAssert(slot == 0);
78 memcpy (&mFp, data, dataLength);
79 for(int ct=0; ct < 25; ct++) {
80 if (mFp[ct] >= 0) {
81 mIp[ct] = (short)(mFp[ct] * 256.f + 0.5f);
82 } else {
83 mIp[ct] = (short)(mFp[ct] * 256.f - 0.5f);
84 }
85 }
86 }
87
88
OneU4(const RsForEachStubParamStruct * p,uint32_t x,uchar4 * out,const uchar4 * py0,const uchar4 * py1,const uchar4 * py2,const uchar4 * py3,const uchar4 * py4,const float * coeff)89 static void OneU4(const RsForEachStubParamStruct *p, uint32_t x, uchar4 *out,
90 const uchar4 *py0, const uchar4 *py1, const uchar4 *py2, const uchar4 *py3, const uchar4 *py4,
91 const float* coeff) {
92
93 uint32_t x0 = rsMax((int32_t)x-2, 0);
94 uint32_t x1 = rsMax((int32_t)x-1, 0);
95 uint32_t x2 = x;
96 uint32_t x3 = rsMin((int32_t)x+1, (int32_t)(p->dimX-1));
97 uint32_t x4 = rsMin((int32_t)x+2, (int32_t)(p->dimX-1));
98
99 float4 px = convert_float4(py0[x0]) * coeff[0] +
100 convert_float4(py0[x1]) * coeff[1] +
101 convert_float4(py0[x2]) * coeff[2] +
102 convert_float4(py0[x3]) * coeff[3] +
103 convert_float4(py0[x4]) * coeff[4] +
104
105 convert_float4(py1[x0]) * coeff[5] +
106 convert_float4(py1[x1]) * coeff[6] +
107 convert_float4(py1[x2]) * coeff[7] +
108 convert_float4(py1[x3]) * coeff[8] +
109 convert_float4(py1[x4]) * coeff[9] +
110
111 convert_float4(py2[x0]) * coeff[10] +
112 convert_float4(py2[x1]) * coeff[11] +
113 convert_float4(py2[x2]) * coeff[12] +
114 convert_float4(py2[x3]) * coeff[13] +
115 convert_float4(py2[x4]) * coeff[14] +
116
117 convert_float4(py3[x0]) * coeff[15] +
118 convert_float4(py3[x1]) * coeff[16] +
119 convert_float4(py3[x2]) * coeff[17] +
120 convert_float4(py3[x3]) * coeff[18] +
121 convert_float4(py3[x4]) * coeff[19] +
122
123 convert_float4(py4[x0]) * coeff[20] +
124 convert_float4(py4[x1]) * coeff[21] +
125 convert_float4(py4[x2]) * coeff[22] +
126 convert_float4(py4[x3]) * coeff[23] +
127 convert_float4(py4[x4]) * coeff[24];
128 px = clamp(px, 0.f, 255.f);
129 *out = convert_uchar4(px);
130 }
131
OneU2(const RsForEachStubParamStruct * p,uint32_t x,uchar2 * out,const uchar2 * py0,const uchar2 * py1,const uchar2 * py2,const uchar2 * py3,const uchar2 * py4,const float * coeff)132 static void OneU2(const RsForEachStubParamStruct *p, uint32_t x, uchar2 *out,
133 const uchar2 *py0, const uchar2 *py1, const uchar2 *py2, const uchar2 *py3, const uchar2 *py4,
134 const float* coeff) {
135
136 uint32_t x0 = rsMax((int32_t)x-2, 0);
137 uint32_t x1 = rsMax((int32_t)x-1, 0);
138 uint32_t x2 = x;
139 uint32_t x3 = rsMin((int32_t)x+1, (int32_t)(p->dimX-1));
140 uint32_t x4 = rsMin((int32_t)x+2, (int32_t)(p->dimX-1));
141
142 float2 px = convert_float2(py0[x0]) * coeff[0] +
143 convert_float2(py0[x1]) * coeff[1] +
144 convert_float2(py0[x2]) * coeff[2] +
145 convert_float2(py0[x3]) * coeff[3] +
146 convert_float2(py0[x4]) * coeff[4] +
147
148 convert_float2(py1[x0]) * coeff[5] +
149 convert_float2(py1[x1]) * coeff[6] +
150 convert_float2(py1[x2]) * coeff[7] +
151 convert_float2(py1[x3]) * coeff[8] +
152 convert_float2(py1[x4]) * coeff[9] +
153
154 convert_float2(py2[x0]) * coeff[10] +
155 convert_float2(py2[x1]) * coeff[11] +
156 convert_float2(py2[x2]) * coeff[12] +
157 convert_float2(py2[x3]) * coeff[13] +
158 convert_float2(py2[x4]) * coeff[14] +
159
160 convert_float2(py3[x0]) * coeff[15] +
161 convert_float2(py3[x1]) * coeff[16] +
162 convert_float2(py3[x2]) * coeff[17] +
163 convert_float2(py3[x3]) * coeff[18] +
164 convert_float2(py3[x4]) * coeff[19] +
165
166 convert_float2(py4[x0]) * coeff[20] +
167 convert_float2(py4[x1]) * coeff[21] +
168 convert_float2(py4[x2]) * coeff[22] +
169 convert_float2(py4[x3]) * coeff[23] +
170 convert_float2(py4[x4]) * coeff[24];
171 px = clamp(px, 0.f, 255.f);
172 *out = convert_uchar2(px);
173 }
174
OneU1(const RsForEachStubParamStruct * p,uint32_t x,uchar * out,const uchar * py0,const uchar * py1,const uchar * py2,const uchar * py3,const uchar * py4,const float * coeff)175 static void OneU1(const RsForEachStubParamStruct *p, uint32_t x, uchar *out,
176 const uchar *py0, const uchar *py1, const uchar *py2, const uchar *py3, const uchar *py4,
177 const float* coeff) {
178
179 uint32_t x0 = rsMax((int32_t)x-2, 0);
180 uint32_t x1 = rsMax((int32_t)x-1, 0);
181 uint32_t x2 = x;
182 uint32_t x3 = rsMin((int32_t)x+1, (int32_t)(p->dimX-1));
183 uint32_t x4 = rsMin((int32_t)x+2, (int32_t)(p->dimX-1));
184
185 float px = (float)(py0[x0]) * coeff[0] +
186 (float)(py0[x1]) * coeff[1] +
187 (float)(py0[x2]) * coeff[2] +
188 (float)(py0[x3]) * coeff[3] +
189 (float)(py0[x4]) * coeff[4] +
190
191 (float)(py1[x0]) * coeff[5] +
192 (float)(py1[x1]) * coeff[6] +
193 (float)(py1[x2]) * coeff[7] +
194 (float)(py1[x3]) * coeff[8] +
195 (float)(py1[x4]) * coeff[9] +
196
197 (float)(py2[x0]) * coeff[10] +
198 (float)(py2[x1]) * coeff[11] +
199 (float)(py2[x2]) * coeff[12] +
200 (float)(py2[x3]) * coeff[13] +
201 (float)(py2[x4]) * coeff[14] +
202
203 (float)(py3[x0]) * coeff[15] +
204 (float)(py3[x1]) * coeff[16] +
205 (float)(py3[x2]) * coeff[17] +
206 (float)(py3[x3]) * coeff[18] +
207 (float)(py3[x4]) * coeff[19] +
208
209 (float)(py4[x0]) * coeff[20] +
210 (float)(py4[x1]) * coeff[21] +
211 (float)(py4[x2]) * coeff[22] +
212 (float)(py4[x3]) * coeff[23] +
213 (float)(py4[x4]) * coeff[24];
214 px = clamp(px, 0.f, 255.f);
215 *out = px;
216 }
217
OneF4(const RsForEachStubParamStruct * p,uint32_t x,float4 * out,const float4 * py0,const float4 * py1,const float4 * py2,const float4 * py3,const float4 * py4,const float * coeff)218 static void OneF4(const RsForEachStubParamStruct *p, uint32_t x, float4 *out,
219 const float4 *py0, const float4 *py1, const float4 *py2, const float4 *py3, const float4 *py4,
220 const float* coeff) {
221
222 uint32_t x0 = rsMax((int32_t)x-2, 0);
223 uint32_t x1 = rsMax((int32_t)x-1, 0);
224 uint32_t x2 = x;
225 uint32_t x3 = rsMin((int32_t)x+1, (int32_t)(p->dimX-1));
226 uint32_t x4 = rsMin((int32_t)x+2, (int32_t)(p->dimX-1));
227
228 float4 px = py0[x0] * coeff[0] +
229 py0[x1] * coeff[1] +
230 py0[x2] * coeff[2] +
231 py0[x3] * coeff[3] +
232 py0[x4] * coeff[4] +
233
234 py1[x0] * coeff[5] +
235 py1[x1] * coeff[6] +
236 py1[x2] * coeff[7] +
237 py1[x3] * coeff[8] +
238 py1[x4] * coeff[9] +
239
240 py2[x0] * coeff[10] +
241 py2[x1] * coeff[11] +
242 py2[x2] * coeff[12] +
243 py2[x3] * coeff[13] +
244 py2[x4] * coeff[14] +
245
246 py3[x0] * coeff[15] +
247 py3[x1] * coeff[16] +
248 py3[x2] * coeff[17] +
249 py3[x3] * coeff[18] +
250 py3[x4] * coeff[19] +
251
252 py4[x0] * coeff[20] +
253 py4[x1] * coeff[21] +
254 py4[x2] * coeff[22] +
255 py4[x3] * coeff[23] +
256 py4[x4] * coeff[24];
257 *out = px;
258 }
259
OneF2(const RsForEachStubParamStruct * p,uint32_t x,float2 * out,const float2 * py0,const float2 * py1,const float2 * py2,const float2 * py3,const float2 * py4,const float * coeff)260 static void OneF2(const RsForEachStubParamStruct *p, uint32_t x, float2 *out,
261 const float2 *py0, const float2 *py1, const float2 *py2, const float2 *py3, const float2 *py4,
262 const float* coeff) {
263
264 uint32_t x0 = rsMax((int32_t)x-2, 0);
265 uint32_t x1 = rsMax((int32_t)x-1, 0);
266 uint32_t x2 = x;
267 uint32_t x3 = rsMin((int32_t)x+1, (int32_t)(p->dimX-1));
268 uint32_t x4 = rsMin((int32_t)x+2, (int32_t)(p->dimX-1));
269
270 float2 px = py0[x0] * coeff[0] +
271 py0[x1] * coeff[1] +
272 py0[x2] * coeff[2] +
273 py0[x3] * coeff[3] +
274 py0[x4] * coeff[4] +
275
276 py1[x0] * coeff[5] +
277 py1[x1] * coeff[6] +
278 py1[x2] * coeff[7] +
279 py1[x3] * coeff[8] +
280 py1[x4] * coeff[9] +
281
282 py2[x0] * coeff[10] +
283 py2[x1] * coeff[11] +
284 py2[x2] * coeff[12] +
285 py2[x3] * coeff[13] +
286 py2[x4] * coeff[14] +
287
288 py3[x0] * coeff[15] +
289 py3[x1] * coeff[16] +
290 py3[x2] * coeff[17] +
291 py3[x3] * coeff[18] +
292 py3[x4] * coeff[19] +
293
294 py4[x0] * coeff[20] +
295 py4[x1] * coeff[21] +
296 py4[x2] * coeff[22] +
297 py4[x3] * coeff[23] +
298 py4[x4] * coeff[24];
299 *out = px;
300 }
301
OneF1(const RsForEachStubParamStruct * p,uint32_t x,float * out,const float * py0,const float * py1,const float * py2,const float * py3,const float * py4,const float * coeff)302 static void OneF1(const RsForEachStubParamStruct *p, uint32_t x, float *out,
303 const float *py0, const float *py1, const float *py2, const float *py3, const float *py4,
304 const float* coeff) {
305
306 uint32_t x0 = rsMax((int32_t)x-2, 0);
307 uint32_t x1 = rsMax((int32_t)x-1, 0);
308 uint32_t x2 = x;
309 uint32_t x3 = rsMin((int32_t)x+1, (int32_t)(p->dimX-1));
310 uint32_t x4 = rsMin((int32_t)x+2, (int32_t)(p->dimX-1));
311
312 float px = py0[x0] * coeff[0] +
313 py0[x1] * coeff[1] +
314 py0[x2] * coeff[2] +
315 py0[x3] * coeff[3] +
316 py0[x4] * coeff[4] +
317
318 py1[x0] * coeff[5] +
319 py1[x1] * coeff[6] +
320 py1[x2] * coeff[7] +
321 py1[x3] * coeff[8] +
322 py1[x4] * coeff[9] +
323
324 py2[x0] * coeff[10] +
325 py2[x1] * coeff[11] +
326 py2[x2] * coeff[12] +
327 py2[x3] * coeff[13] +
328 py2[x4] * coeff[14] +
329
330 py3[x0] * coeff[15] +
331 py3[x1] * coeff[16] +
332 py3[x2] * coeff[17] +
333 py3[x3] * coeff[18] +
334 py3[x4] * coeff[19] +
335
336 py4[x0] * coeff[20] +
337 py4[x1] * coeff[21] +
338 py4[x2] * coeff[22] +
339 py4[x3] * coeff[23] +
340 py4[x4] * coeff[24];
341 *out = px;
342 }
343
344
// Externally defined routine with C linkage, called from the SIMD branch of
// kernelU4. It receives the destination, five source row pointers, the
// coefficients as shorts and a count. Its implementation is not part of this
// file, so the exact contract should be checked there.
extern "C" void rsdIntrinsicConvolve5x5_K(void *dst,
                                          const void *y0, const void *y1, const void *y2,
                                          const void *y3, const void *y4,
                                          const short *coef, uint32_t count);
348
kernelU4(const RsForEachStubParamStruct * p,uint32_t xstart,uint32_t xend,uint32_t instep,uint32_t outstep)349 void RsdCpuScriptIntrinsicConvolve5x5::kernelU4(const RsForEachStubParamStruct *p,
350 uint32_t xstart, uint32_t xend,
351 uint32_t instep, uint32_t outstep) {
352 RsdCpuScriptIntrinsicConvolve5x5 *cp = (RsdCpuScriptIntrinsicConvolve5x5 *)p->usr;
353 if (!cp->alloc.get()) {
354 ALOGE("Convolve5x5 executed without input, skipping");
355 return;
356 }
357 const uchar *pin = (const uchar *)cp->alloc->mHal.drvState.lod[0].mallocPtr;
358 const size_t stride = cp->alloc->mHal.drvState.lod[0].stride;
359
360 uint32_t y0 = rsMax((int32_t)p->y-2, 0);
361 uint32_t y1 = rsMax((int32_t)p->y-1, 0);
362 uint32_t y2 = p->y;
363 uint32_t y3 = rsMin((int32_t)p->y+1, (int32_t)(p->dimY-1));
364 uint32_t y4 = rsMin((int32_t)p->y+2, (int32_t)(p->dimY-1));
365
366 const uchar4 *py0 = (const uchar4 *)(pin + stride * y0);
367 const uchar4 *py1 = (const uchar4 *)(pin + stride * y1);
368 const uchar4 *py2 = (const uchar4 *)(pin + stride * y2);
369 const uchar4 *py3 = (const uchar4 *)(pin + stride * y3);
370 const uchar4 *py4 = (const uchar4 *)(pin + stride * y4);
371
372 uchar4 *out = (uchar4 *)p->out;
373 uint32_t x1 = xstart;
374 uint32_t x2 = xend;
375
376 while((x1 < x2) && (x1 < 2)) {
377 OneU4(p, x1, out, py0, py1, py2, py3, py4, cp->mFp);
378 out++;
379 x1++;
380 }
381
382 #if defined(ARCH_ARM_HAVE_VFP)
383 if(gArchUseSIMD && ((x1 + 3) < x2)) {
384 uint32_t len = (x2 - x1 - 3) >> 1;
385 rsdIntrinsicConvolve5x5_K(out, py0, py1, py2, py3, py4, cp->mIp, len);
386 out += len << 1;
387 x1 += len << 1;
388 }
389 #endif
390
391 while(x1 < x2) {
392 OneU4(p, x1, out, py0, py1, py2, py3, py4, cp->mFp);
393 out++;
394 x1++;
395 }
396 }
397
kernelU2(const RsForEachStubParamStruct * p,uint32_t xstart,uint32_t xend,uint32_t instep,uint32_t outstep)398 void RsdCpuScriptIntrinsicConvolve5x5::kernelU2(const RsForEachStubParamStruct *p,
399 uint32_t xstart, uint32_t xend,
400 uint32_t instep, uint32_t outstep) {
401 RsdCpuScriptIntrinsicConvolve5x5 *cp = (RsdCpuScriptIntrinsicConvolve5x5 *)p->usr;
402 if (!cp->alloc.get()) {
403 ALOGE("Convolve5x5 executed without input, skipping");
404 return;
405 }
406 const uchar *pin = (const uchar *)cp->alloc->mHal.drvState.lod[0].mallocPtr;
407 const size_t stride = cp->alloc->mHal.drvState.lod[0].stride;
408
409 uint32_t y0 = rsMax((int32_t)p->y-2, 0);
410 uint32_t y1 = rsMax((int32_t)p->y-1, 0);
411 uint32_t y2 = p->y;
412 uint32_t y3 = rsMin((int32_t)p->y+1, (int32_t)(p->dimY-1));
413 uint32_t y4 = rsMin((int32_t)p->y+2, (int32_t)(p->dimY-1));
414
415 const uchar2 *py0 = (const uchar2 *)(pin + stride * y0);
416 const uchar2 *py1 = (const uchar2 *)(pin + stride * y1);
417 const uchar2 *py2 = (const uchar2 *)(pin + stride * y2);
418 const uchar2 *py3 = (const uchar2 *)(pin + stride * y3);
419 const uchar2 *py4 = (const uchar2 *)(pin + stride * y4);
420
421 uchar2 *out = (uchar2 *)p->out;
422 uint32_t x1 = xstart;
423 uint32_t x2 = xend;
424
425 while((x1 < x2) && (x1 < 2)) {
426 OneU2(p, x1, out, py0, py1, py2, py3, py4, cp->mFp);
427 out++;
428 x1++;
429 }
430
431 #if 0//defined(ARCH_ARM_HAVE_NEON)
432 if((x1 + 3) < x2) {
433 uint32_t len = (x2 - x1 - 3) >> 1;
434 rsdIntrinsicConvolve5x5_K(out, py0, py1, py2, py3, py4, cp->ip, len);
435 out += len << 1;
436 x1 += len << 1;
437 }
438 #endif
439
440 while(x1 < x2) {
441 OneU2(p, x1, out, py0, py1, py2, py3, py4, cp->mFp);
442 out++;
443 x1++;
444 }
445 }
446
kernelU1(const RsForEachStubParamStruct * p,uint32_t xstart,uint32_t xend,uint32_t instep,uint32_t outstep)447 void RsdCpuScriptIntrinsicConvolve5x5::kernelU1(const RsForEachStubParamStruct *p,
448 uint32_t xstart, uint32_t xend,
449 uint32_t instep, uint32_t outstep) {
450 RsdCpuScriptIntrinsicConvolve5x5 *cp = (RsdCpuScriptIntrinsicConvolve5x5 *)p->usr;
451 if (!cp->alloc.get()) {
452 ALOGE("Convolve5x5 executed without input, skipping");
453 return;
454 }
455 const uchar *pin = (const uchar *)cp->alloc->mHal.drvState.lod[0].mallocPtr;
456 const size_t stride = cp->alloc->mHal.drvState.lod[0].stride;
457
458 uint32_t y0 = rsMax((int32_t)p->y-2, 0);
459 uint32_t y1 = rsMax((int32_t)p->y-1, 0);
460 uint32_t y2 = p->y;
461 uint32_t y3 = rsMin((int32_t)p->y+1, (int32_t)(p->dimY-1));
462 uint32_t y4 = rsMin((int32_t)p->y+2, (int32_t)(p->dimY-1));
463
464 const uchar *py0 = (const uchar *)(pin + stride * y0);
465 const uchar *py1 = (const uchar *)(pin + stride * y1);
466 const uchar *py2 = (const uchar *)(pin + stride * y2);
467 const uchar *py3 = (const uchar *)(pin + stride * y3);
468 const uchar *py4 = (const uchar *)(pin + stride * y4);
469
470 uchar *out = (uchar *)p->out;
471 uint32_t x1 = xstart;
472 uint32_t x2 = xend;
473
474 while((x1 < x2) && (x1 < 2)) {
475 OneU1(p, x1, out, py0, py1, py2, py3, py4, cp->mFp);
476 out++;
477 x1++;
478 }
479
480 #if 0//defined(ARCH_ARM_HAVE_NEON)
481 if((x1 + 3) < x2) {
482 uint32_t len = (x2 - x1 - 3) >> 1;
483 rsdIntrinsicConvolve5x5_K(out, py0, py1, py2, py3, py4, cp->ip, len);
484 out += len << 1;
485 x1 += len << 1;
486 }
487 #endif
488
489 while(x1 < x2) {
490 OneU1(p, x1, out, py0, py1, py2, py3, py4, cp->mFp);
491 out++;
492 x1++;
493 }
494 }
495
kernelF4(const RsForEachStubParamStruct * p,uint32_t xstart,uint32_t xend,uint32_t instep,uint32_t outstep)496 void RsdCpuScriptIntrinsicConvolve5x5::kernelF4(const RsForEachStubParamStruct *p,
497 uint32_t xstart, uint32_t xend,
498 uint32_t instep, uint32_t outstep) {
499 RsdCpuScriptIntrinsicConvolve5x5 *cp = (RsdCpuScriptIntrinsicConvolve5x5 *)p->usr;
500 if (!cp->alloc.get()) {
501 ALOGE("Convolve5x5 executed without input, skipping");
502 return;
503 }
504 const uchar *pin = (const uchar *)cp->alloc->mHal.drvState.lod[0].mallocPtr;
505 const size_t stride = cp->alloc->mHal.drvState.lod[0].stride;
506
507 uint32_t y0 = rsMax((int32_t)p->y-2, 0);
508 uint32_t y1 = rsMax((int32_t)p->y-1, 0);
509 uint32_t y2 = p->y;
510 uint32_t y3 = rsMin((int32_t)p->y+1, (int32_t)(p->dimY-1));
511 uint32_t y4 = rsMin((int32_t)p->y+2, (int32_t)(p->dimY-1));
512
513 const float4 *py0 = (const float4 *)(pin + stride * y0);
514 const float4 *py1 = (const float4 *)(pin + stride * y1);
515 const float4 *py2 = (const float4 *)(pin + stride * y2);
516 const float4 *py3 = (const float4 *)(pin + stride * y3);
517 const float4 *py4 = (const float4 *)(pin + stride * y4);
518
519 float4 *out = (float4 *)p->out;
520 uint32_t x1 = xstart;
521 uint32_t x2 = xend;
522
523 while((x1 < x2) && (x1 < 2)) {
524 OneF4(p, x1, out, py0, py1, py2, py3, py4, cp->mFp);
525 out++;
526 x1++;
527 }
528
529 #if 0//defined(ARCH_ARM_HAVE_NEON)
530 if((x1 + 3) < x2) {
531 uint32_t len = (x2 - x1 - 3) >> 1;
532 rsdIntrinsicConvolve5x5_K(out, py0, py1, py2, py3, py4, cp->ip, len);
533 out += len << 1;
534 x1 += len << 1;
535 }
536 #endif
537
538 while(x1 < x2) {
539 OneF4(p, x1, out, py0, py1, py2, py3, py4, cp->mFp);
540 out++;
541 x1++;
542 }
543 }
544
kernelF2(const RsForEachStubParamStruct * p,uint32_t xstart,uint32_t xend,uint32_t instep,uint32_t outstep)545 void RsdCpuScriptIntrinsicConvolve5x5::kernelF2(const RsForEachStubParamStruct *p,
546 uint32_t xstart, uint32_t xend,
547 uint32_t instep, uint32_t outstep) {
548 RsdCpuScriptIntrinsicConvolve5x5 *cp = (RsdCpuScriptIntrinsicConvolve5x5 *)p->usr;
549 if (!cp->alloc.get()) {
550 ALOGE("Convolve5x5 executed without input, skipping");
551 return;
552 }
553 const uchar *pin = (const uchar *)cp->alloc->mHal.drvState.lod[0].mallocPtr;
554 const size_t stride = cp->alloc->mHal.drvState.lod[0].stride;
555
556 uint32_t y0 = rsMax((int32_t)p->y-2, 0);
557 uint32_t y1 = rsMax((int32_t)p->y-1, 0);
558 uint32_t y2 = p->y;
559 uint32_t y3 = rsMin((int32_t)p->y+1, (int32_t)(p->dimY-1));
560 uint32_t y4 = rsMin((int32_t)p->y+2, (int32_t)(p->dimY-1));
561
562 const float2 *py0 = (const float2 *)(pin + stride * y0);
563 const float2 *py1 = (const float2 *)(pin + stride * y1);
564 const float2 *py2 = (const float2 *)(pin + stride * y2);
565 const float2 *py3 = (const float2 *)(pin + stride * y3);
566 const float2 *py4 = (const float2 *)(pin + stride * y4);
567
568 float2 *out = (float2 *)p->out;
569 uint32_t x1 = xstart;
570 uint32_t x2 = xend;
571
572 while((x1 < x2) && (x1 < 2)) {
573 OneF2(p, x1, out, py0, py1, py2, py3, py4, cp->mFp);
574 out++;
575 x1++;
576 }
577
578 #if 0//defined(ARCH_ARM_HAVE_NEON)
579 if((x1 + 3) < x2) {
580 uint32_t len = (x2 - x1 - 3) >> 1;
581 rsdIntrinsicConvolve5x5_K(out, py0, py1, py2, py3, py4, cp->ip, len);
582 out += len << 1;
583 x1 += len << 1;
584 }
585 #endif
586
587 while(x1 < x2) {
588 OneF2(p, x1, out, py0, py1, py2, py3, py4, cp->mFp);
589 out++;
590 x1++;
591 }
592 }
593
kernelF1(const RsForEachStubParamStruct * p,uint32_t xstart,uint32_t xend,uint32_t instep,uint32_t outstep)594 void RsdCpuScriptIntrinsicConvolve5x5::kernelF1(const RsForEachStubParamStruct *p,
595 uint32_t xstart, uint32_t xend,
596 uint32_t instep, uint32_t outstep) {
597 RsdCpuScriptIntrinsicConvolve5x5 *cp = (RsdCpuScriptIntrinsicConvolve5x5 *)p->usr;
598 if (!cp->alloc.get()) {
599 ALOGE("Convolve5x5 executed without input, skipping");
600 return;
601 }
602 const uchar *pin = (const uchar *)cp->alloc->mHal.drvState.lod[0].mallocPtr;
603 const size_t stride = cp->alloc->mHal.drvState.lod[0].stride;
604
605 uint32_t y0 = rsMax((int32_t)p->y-2, 0);
606 uint32_t y1 = rsMax((int32_t)p->y-1, 0);
607 uint32_t y2 = p->y;
608 uint32_t y3 = rsMin((int32_t)p->y+1, (int32_t)(p->dimY-1));
609 uint32_t y4 = rsMin((int32_t)p->y+2, (int32_t)(p->dimY-1));
610
611 const float *py0 = (const float *)(pin + stride * y0);
612 const float *py1 = (const float *)(pin + stride * y1);
613 const float *py2 = (const float *)(pin + stride * y2);
614 const float *py3 = (const float *)(pin + stride * y3);
615 const float *py4 = (const float *)(pin + stride * y4);
616
617 float *out = (float *)p->out;
618 uint32_t x1 = xstart;
619 uint32_t x2 = xend;
620
621 while((x1 < x2) && (x1 < 2)) {
622 OneF1(p, x1, out, py0, py1, py2, py3, py4, cp->mFp);
623 out++;
624 x1++;
625 }
626
627 #if 0//defined(ARCH_ARM_HAVE_NEON)
628 if((x1 + 3) < x2) {
629 uint32_t len = (x2 - x1 - 3) >> 1;
630 rsdIntrinsicConvolve5x5_K(out, py0, py1, py2, py3, py4, cp->ip, len);
631 out += len << 1;
632 x1 += len << 1;
633 }
634 #endif
635
636 while(x1 < x2) {
637 OneF1(p, x1, out, py0, py1, py2, py3, py4, cp->mFp);
638 out++;
639 x1++;
640 }
641 }
642
RsdCpuScriptIntrinsicConvolve5x5(RsdCpuReferenceImpl * ctx,const Script * s,const Element * e)643 RsdCpuScriptIntrinsicConvolve5x5::RsdCpuScriptIntrinsicConvolve5x5(
644 RsdCpuReferenceImpl *ctx, const Script *s, const Element *e)
645 : RsdCpuScriptIntrinsic(ctx, s, e, RS_SCRIPT_INTRINSIC_ID_CONVOLVE_5x5) {
646
647 if (e->getType() == RS_TYPE_FLOAT_32) {
648 switch(e->getVectorSize()) {
649 case 1:
650 mRootPtr = &kernelF1;
651 break;
652 case 2:
653 mRootPtr = &kernelF2;
654 break;
655 case 3:
656 case 4:
657 mRootPtr = &kernelF4;
658 break;
659 }
660 } else {
661 switch(e->getVectorSize()) {
662 case 1:
663 mRootPtr = &kernelU1;
664 break;
665 case 2:
666 mRootPtr = &kernelU2;
667 break;
668 case 3:
669 case 4:
670 mRootPtr = &kernelU4;
671 break;
672 }
673 }
674 for(int ct=0; ct < 25; ct++) {
675 mFp[ct] = 1.f / 25.f;
676 mIp[ct] = (short)(mFp[ct] * 256.f);
677 }
678 }
679
~RsdCpuScriptIntrinsicConvolve5x5()680 RsdCpuScriptIntrinsicConvolve5x5::~RsdCpuScriptIntrinsicConvolve5x5() {
681 }
682
populateScript(Script * s)683 void RsdCpuScriptIntrinsicConvolve5x5::populateScript(Script *s) {
684 s->mHal.info.exportedVariableCount = 2;
685 }
686
invokeFreeChildren()687 void RsdCpuScriptIntrinsicConvolve5x5::invokeFreeChildren() {
688 alloc.clear();
689 }
690
691
rsdIntrinsic_Convolve5x5(RsdCpuReferenceImpl * ctx,const Script * s,const Element * e)692 RsdCpuScriptImpl * rsdIntrinsic_Convolve5x5(RsdCpuReferenceImpl *ctx,
693 const Script *s, const Element *e) {
694
695 return new RsdCpuScriptIntrinsicConvolve5x5(ctx, s, e);
696 }
697
698
699
700