1 /*
2 * By downloading, copying, installing or using the software you agree to this license.
3 * If you do not agree to this license, do not download, install,
4 * copy or use the software.
5 *
6 *
7 * License Agreement
8 * For Open Source Computer Vision Library
9 * (3-clause BSD License)
10 *
11 * Copyright (C) 2012-2015, NVIDIA Corporation, all rights reserved.
12 * Third party copyrights are property of their respective owners.
13 *
14 * Redistribution and use in source and binary forms, with or without modification,
15 * are permitted provided that the following conditions are met:
16 *
17 * * Redistributions of source code must retain the above copyright notice,
18 * this list of conditions and the following disclaimer.
19 *
20 * * Redistributions in binary form must reproduce the above copyright notice,
21 * this list of conditions and the following disclaimer in the documentation
22 * and/or other materials provided with the distribution.
23 *
24 * * Neither the names of the copyright holders nor the names of the contributors
25 * may be used to endorse or promote products derived from this software
26 * without specific prior written permission.
27 *
28 * This software is provided by the copyright holders and contributors "as is" and
29 * any express or implied warranties, including, but not limited to, the implied
30 * warranties of merchantability and fitness for a particular purpose are disclaimed.
31 * In no event shall copyright holders or contributors be liable for any direct,
32 * indirect, incidental, special, exemplary, or consequential damages
33 * (including, but not limited to, procurement of substitute goods or services;
34 * loss of use, data, or profits; or business interruption) however caused
35 * and on any theory of liability, whether in contract, strict liability,
36 * or tort (including negligence or otherwise) arising in any way out of
37 * the use of this software, even if advised of the possibility of such damage.
38 */
39
40 #include <vector>
41
42 #include "common.hpp"
43 #include "saturate_cast.hpp"
44
45 namespace CAROTENE_NS {
46
isBlur3x3Supported(const Size2D & size,BORDER_MODE border)47 bool isBlur3x3Supported(const Size2D &size, BORDER_MODE border)
48 {
49 return isSupportedConfiguration() && size.width >= 8 &&
50 (border == BORDER_MODE_CONSTANT ||
51 border == BORDER_MODE_REPLICATE);
52 }
53
blur3x3(const Size2D & size,const u8 * srcBase,ptrdiff_t srcStride,u8 * dstBase,ptrdiff_t dstStride,BORDER_MODE border,u8 borderValue)54 void blur3x3(const Size2D &size,
55 const u8 * srcBase, ptrdiff_t srcStride,
56 u8 * dstBase, ptrdiff_t dstStride,
57 BORDER_MODE border, u8 borderValue)
58 {
59 internal::assertSupportedConfiguration(isBlur3x3Supported(size, border));
60 #ifdef CAROTENE_NEON
61 const int16x8_t v_scale = vmovq_n_s16(3640);
62 const uint16x8_t v_border_x3 = vdupq_n_u16(borderValue * 3);
63 const uint16x8_t v_zero = vdupq_n_u16(0);
64 const uint8x8_t v_border = vdup_n_u8(borderValue);
65
66 uint16x8_t tprev = v_zero, tcurr = v_zero, tnext = v_zero;
67 uint16x8_t t0 = v_zero, t1 = v_zero, t2 = v_zero;
68
69 ptrdiff_t width = (ptrdiff_t)size.width, height = (ptrdiff_t)size.height;
70
71 for (ptrdiff_t y = 0; y < height; ++y)
72 {
73 const u8 * srow0 = y == 0 && border == BORDER_MODE_CONSTANT ? NULL : internal::getRowPtr(srcBase, srcStride, std::max<ptrdiff_t>(y - 1, 0));
74 const u8 * srow1 = internal::getRowPtr(srcBase, srcStride, y);
75 const u8 * srow2 = y + 1 == height && border == BORDER_MODE_CONSTANT ? NULL : internal::getRowPtr(srcBase, srcStride, std::min(y + 1, height - 1));
76 u8 * drow = internal::getRowPtr(dstBase, dstStride, y);
77
78 s16 prevx = 0, currx = 0, nextx = 0;
79 ptrdiff_t x = 0;
80 const ptrdiff_t bwidth = y + 2 < height ? width : (width - 8);
81
82 // perform vertical convolution
83 for ( ; x <= bwidth; x += 8)
84 {
85 internal::prefetch(srow0 + x);
86 internal::prefetch(srow1 + x);
87 internal::prefetch(srow2 + x);
88
89 uint8x8_t x0 = !srow0 ? v_border : vld1_u8(srow0 + x);
90 uint8x8_t x1 = vld1_u8(srow1 + x);
91 uint8x8_t x2 = !srow2 ? v_border : vld1_u8(srow2 + x);
92
93 // calculate values for plain CPU part below if needed
94 if (x + 8 >= bwidth)
95 {
96 ptrdiff_t x3 = x == width ? width - 1 : x;
97 ptrdiff_t x4 = border == BORDER_MODE_CONSTANT ? x3 - 1 : std::max<ptrdiff_t>(x3 - 1, 0);
98
99 if (border == BORDER_MODE_CONSTANT && x4 < 0)
100 prevx = borderValue;
101 else
102 prevx = (srow2 ? srow2[x4] : borderValue) + srow1[x4] + (srow0 ? srow0[x4] : borderValue);
103
104 currx = (srow2 ? srow2[x3] : borderValue) + srow1[x3] + (srow0 ? srow0[x3] : borderValue);
105 }
106
107 // make shift
108 if (x)
109 {
110 tprev = tcurr;
111 tcurr = tnext;
112 }
113
114 // and calculate next value
115 tnext = vaddw_u8(vaddl_u8(x0, x1), x2);
116
117 // make extrapolation for the first elements
118 if (!x)
119 {
120 // make border
121 if (border == BORDER_MODE_CONSTANT)
122 tcurr = v_border_x3;
123 else if (border == BORDER_MODE_REPLICATE)
124 tcurr = vdupq_n_u16(vgetq_lane_u16(tnext, 0));
125
126 continue;
127 }
128
129 // combine 3 "shifted" vectors
130 t0 = vextq_u16(tprev, tcurr, 7);
131 t1 = tcurr;
132 t2 = vextq_u16(tcurr, tnext, 1);
133
134 // and add them
135 t0 = vqaddq_u16(t0, vqaddq_u16(t1, t2));
136
137 int16x8_t tt0 = vqrdmulhq_s16(vreinterpretq_s16_u16(t0), v_scale);
138 uint8x8_t it0 = vmovn_u16(vreinterpretq_u16_s16(tt0));
139 vst1_u8(drow + x - 8, it0);
140 }
141
142 x -= 8;
143 if (x == width)
144 --x;
145
146 for ( ; x < width; ++x)
147 {
148 // make extrapolation for the last elements
149 if (x + 1 >= width)
150 {
151 if (border == BORDER_MODE_CONSTANT)
152 nextx = borderValue * 3;
153 else if (border == BORDER_MODE_REPLICATE)
154 nextx = srow2[x] + srow1[x] + srow0[x];
155 }
156 else
157 nextx = (srow2 ? srow2[x + 1] : borderValue) +
158 srow1[x + 1] +
159 (srow0 ? srow0[x + 1] : borderValue);
160
161 f32 val = (prevx + currx + nextx) * (1 / 9.f) + 0.5f;
162 drow[x] = internal::saturate_cast<u8>((s32)val);
163
164 // make shift
165 prevx = currx;
166 currx = nextx;
167 }
168 }
169 #else
170 (void)size;
171 (void)srcBase;
172 (void)srcStride;
173 (void)dstBase;
174 (void)dstStride;
175 (void)border;
176 (void)borderValue;
177 #endif
178 }
179
isBlurU8Supported(const Size2D & size,s32 cn,BORDER_MODE border)180 bool isBlurU8Supported(const Size2D &size, s32 cn, BORDER_MODE border)
181 {
182 return isSupportedConfiguration() &&
183 cn > 0 && cn <= 4 &&
184 size.width*cn >= 8 && size.height >= 2 &&
185 (border == BORDER_MODE_CONSTANT ||
186 border == BORDER_MODE_REFLECT101 ||
187 border == BORDER_MODE_REFLECT ||
188 border == BORDER_MODE_REPLICATE);
189 }
190
blur3x3(const Size2D & size,s32 cn,const u8 * srcBase,ptrdiff_t srcStride,u8 * dstBase,ptrdiff_t dstStride,BORDER_MODE borderType,u8 borderValue)191 void blur3x3(const Size2D &size, s32 cn,
192 const u8 * srcBase, ptrdiff_t srcStride,
193 u8 * dstBase, ptrdiff_t dstStride,
194 BORDER_MODE borderType, u8 borderValue)
195 {
196 internal::assertSupportedConfiguration(isBlurU8Supported(size, cn, borderType));
197 #ifdef CAROTENE_NEON
198 //#define FLOAT_VARIANT_1_9
199 #ifdef FLOAT_VARIANT_1_9
200 float32x4_t v1_9 = vdupq_n_f32 (1.0/9.0);
201 float32x4_t v0_5 = vdupq_n_f32 (.5);
202 #else
203 const int16x8_t vScale = vmovq_n_s16(3640);
204 #endif
205
206 size_t colsn = size.width*cn;
207
208 std::vector<u8> _tmp;
209 u8 *tmp = 0;
210 if (borderType == BORDER_MODE_CONSTANT)
211 {
212 _tmp.assign(colsn + 2*cn, borderValue);
213 tmp = &_tmp[cn];
214 }
215
216 uint16x8_t tprev = vdupq_n_u16(0x0);
217 uint16x8_t tcurr = tprev;
218 uint16x8_t tnext = tprev;
219 uint16x8_t t0, t1, t2;
220 if(cn == 1)
221 {
222 for( size_t y = 0; y < size.height; y++ )
223 {
224 const u8* srow0;
225 const u8* srow1 = internal::getRowPtr(srcBase, srcStride, y);
226 const u8* srow2;
227 u8* drow = internal::getRowPtr(dstBase, dstStride, y);
228 if (borderType == BORDER_MODE_REFLECT101) {
229 srow0 = internal::getRowPtr(srcBase, srcStride, y > 0 ? y-1 : 1);
230 srow2 = internal::getRowPtr(srcBase, srcStride, y < size.height-1 ? y+1 : size.height-2);
231 } else if (borderType == BORDER_MODE_CONSTANT) {
232 srow0 = y > 0 ? internal::getRowPtr(srcBase, srcStride, y-1) : tmp;
233 srow2 = y < size.height-1 ? internal::getRowPtr(srcBase, srcStride, y+1) : tmp;
234 } else { // BORDER_MODE_REFLECT || BORDER_MODE_REPLICATE
235 srow0 = internal::getRowPtr(srcBase, srcStride, y > 0 ? y-1 : 0);
236 srow2 = internal::getRowPtr(srcBase, srcStride, y < size.height-1 ? y+1 : size.height-1);
237 }
238
239 // do vertical convolution
240 size_t x = 0;
241 const size_t bcols = y + 2 < size.height ? colsn : (colsn - 8);
242 for( ; x <= bcols; x += 8 )
243 {
244 internal::prefetch(srow0 + x);
245 internal::prefetch(srow1 + x);
246 internal::prefetch(srow2 + x);
247
248 uint8x8_t x0 = vld1_u8(srow0 + x);
249 uint8x8_t x1 = vld1_u8(srow1 + x);
250 uint8x8_t x2 = vld1_u8(srow2 + x);
251
252 tprev = tcurr;
253 tcurr = tnext;
254 tnext = vaddw_u8(vaddl_u8(x0, x1), x2);
255
256 if(!x) {
257 tcurr = tnext;
258
259 // make border
260 if (borderType == BORDER_MODE_CONSTANT)
261 {
262 tcurr = vsetq_lane_u16(borderValue, tcurr, 7);
263 }
264 else if (borderType == BORDER_MODE_REFLECT101)
265 {
266 tcurr = vsetq_lane_u16(vgetq_lane_u16(tcurr, 1),tcurr, 7);
267 }
268 else // borderType == BORDER_MODE_REFLECT || borderType == BORDER_MODE_REPLICATE
269 {
270 tcurr = vsetq_lane_u16(vgetq_lane_u16(tcurr, 0),tcurr, 7);
271 }
272 continue;
273 }
274
275 t0 = vextq_u16(tprev, tcurr, 7);
276 t1 = tcurr;
277 t2 = vextq_u16(tcurr, tnext, 1);
278
279 t0 = vqaddq_u16(t0, vqaddq_u16(t1, t2));
280
281 #ifdef FLOAT_VARIANT_1_9
282 uint32x4_t tres1 = vmovl_u16(vget_low_u16(t0));
283 uint32x4_t tres2 = vmovl_u16(vget_high_u16(t0));
284 float32x4_t vf1 = vmulq_f32(v1_9, vcvtq_f32_u32(tres1));
285 float32x4_t vf2 = vmulq_f32(v1_9, vcvtq_f32_u32(tres2));
286 tres1 = vcvtq_u32_f32(vaddq_f32(vf1, v0_5));
287 tres2 = vcvtq_u32_f32(vaddq_f32(vf2, v0_5));
288 t0 = vcombine_u16(vmovn_u32(tres1),vmovn_u32(tres2));
289 vst1_u8(drow + x - 8, vmovn_u16(t0));
290 #else
291 int16x8_t tt0 = vqrdmulhq_s16(vreinterpretq_s16_u16(t0), vScale);
292 uint8x8_t it0 = vmovn_u16(vreinterpretq_u16_s16(tt0));
293 vst1_u8(drow + x - 8, it0);
294 #endif
295 }
296
297 x -= 8;
298 if(x == colsn){
299 x--;
300 }
301 s16 prevx, rowx, nextx;
302 prevx = srow2[x-1] + srow1[x-1] + srow0[x-1];
303 rowx = srow2[x] + srow1[x] + srow0[x];
304 for( ; x < colsn; x++ )
305 {
306 if(x+1 >= colsn) {
307 // make border
308 if (borderType == BORDER_MODE_CONSTANT)
309 {
310 nextx = borderValue;
311 } else if (borderType == BORDER_MODE_REFLECT101)
312 {
313 nextx = srow2[x-1] + srow1[x-1] + srow0[x-1];
314 } else {
315 nextx = srow2[x] + srow1[x] + srow0[x];
316 }
317 } else {
318 nextx = srow2[x+1] + srow1[x+1] + srow0[x+1];
319 }
320 *(drow+x) = internal::saturate_cast<u8>((prevx + rowx + nextx)*(1/9.));
321 prevx = rowx;
322 rowx = nextx;
323 }
324 }
325 }
326 else
327 {
328 for( size_t y = 0; y < size.height; y++ )
329 {
330 const u8* srow0;
331 const u8* srow1 = internal::getRowPtr(srcBase, srcStride, y);
332 const u8* srow2;
333 u8* drow = internal::getRowPtr(dstBase, dstStride, y);
334 if (borderType == BORDER_MODE_REFLECT101) {
335 srow0 = internal::getRowPtr(srcBase, srcStride, y > 0 ? y-1 : 1);
336 srow2 = internal::getRowPtr(srcBase, srcStride, y < size.height-1 ? y+1 : size.height-2);
337 } else if (borderType == BORDER_MODE_CONSTANT) {
338 srow0 = y > 0 ? internal::getRowPtr(srcBase, srcStride, y-1) : tmp;
339 srow2 = y < size.height-1 ? internal::getRowPtr(srcBase, srcStride, y+1) : tmp;
340 } else { // BORDER_MODE_REFLECT || BORDER_MODE_REPLICATE
341 srow0 = internal::getRowPtr(srcBase, srcStride, y > 0 ? y-1 : 0);
342 srow2 = internal::getRowPtr(srcBase, srcStride, y < size.height-1 ? y+1 : size.height-1);
343 }
344
345 // do vertical convolution
346 size_t x = 0;
347 const size_t bcols = y + 2 < size.height ? colsn : (colsn - 8);
348 for( ; x <= bcols; x += 8 )
349 {
350 internal::prefetch(srow0 + x);
351 internal::prefetch(srow1 + x);
352 internal::prefetch(srow2 + x);
353
354 uint8x8_t x0 = vld1_u8(srow0 + x);
355 uint8x8_t x1 = vld1_u8(srow1 + x);
356 uint8x8_t x2 = vld1_u8(srow2 + x);
357
358 tprev = tcurr;
359 tcurr = tnext;
360 tnext = vaddw_u8(vaddl_u8(x0, x1), x2);
361
362 if(!x) {
363 tcurr = tnext;
364
365 // make border
366 switch(cn)
367 {
368 case 2:
369 if (borderType == BORDER_MODE_CONSTANT)
370 {
371 tcurr = vsetq_lane_u16(borderValue, tcurr, 6);
372 tcurr = vsetq_lane_u16(borderValue, tcurr, 7);
373 }
374 else if (borderType == BORDER_MODE_REFLECT101)
375 {
376 tcurr = vsetq_lane_u16(vgetq_lane_u16(tcurr, 2),tcurr, 6);
377 tcurr = vsetq_lane_u16(vgetq_lane_u16(tcurr, 3),tcurr, 6);
378 }
379 else
380 {
381 tcurr = vsetq_lane_u16(vgetq_lane_u16(tcurr, 0),tcurr, 6);
382 tcurr = vsetq_lane_u16(vgetq_lane_u16(tcurr, 1),tcurr, 7);
383 }
384 break;
385 case 3:
386 if (borderType == BORDER_MODE_CONSTANT)
387 {
388 tcurr = vsetq_lane_u16(borderValue, tcurr, 5);
389 tcurr = vsetq_lane_u16(borderValue, tcurr, 6);
390 tcurr = vsetq_lane_u16(borderValue, tcurr, 7);
391 }
392 else if (borderType == BORDER_MODE_REFLECT101)
393 {
394 tcurr = vsetq_lane_u16(vgetq_lane_u16(tcurr, 3),tcurr, 5);
395 tcurr = vsetq_lane_u16(vgetq_lane_u16(tcurr, 4),tcurr, 6);
396 tcurr = vsetq_lane_u16(vgetq_lane_u16(tcurr, 5),tcurr, 7);
397 }
398 else
399 {
400 tcurr = vsetq_lane_u16(vgetq_lane_u16(tcurr, 0),tcurr, 5);
401 tcurr = vsetq_lane_u16(vgetq_lane_u16(tcurr, 1),tcurr, 6);
402 tcurr = vsetq_lane_u16(vgetq_lane_u16(tcurr, 2),tcurr, 7);
403 }
404 break;
405 case 4:
406 if (borderType == BORDER_MODE_CONSTANT)
407 {
408 tcurr = vsetq_lane_u16(borderValue, tcurr, 4);
409 tcurr = vsetq_lane_u16(borderValue, tcurr, 5);
410 tcurr = vsetq_lane_u16(borderValue, tcurr, 6);
411 tcurr = vsetq_lane_u16(borderValue, tcurr, 7);
412 }
413 else if (borderType != BORDER_MODE_REFLECT101)
414 {
415 tcurr = vsetq_lane_u16(vgetq_lane_u16(tcurr, 0),tcurr, 4);
416 tcurr = vsetq_lane_u16(vgetq_lane_u16(tcurr, 1),tcurr, 5);
417 tcurr = vsetq_lane_u16(vgetq_lane_u16(tcurr, 2),tcurr, 6);
418 tcurr = vsetq_lane_u16(vgetq_lane_u16(tcurr, 3),tcurr, 7);
419 }
420 break;
421 }
422 continue;
423 }
424
425 if(cn==2)
426 t0 = vextq_u16(tprev, tcurr, 6);
427 else if(cn==3)
428 t0 = vextq_u16(tprev, tcurr, 5);
429 else if(cn==4)
430 t0 = vextq_u16(tprev, tcurr, 4);
431
432 t1 = tcurr;
433
434 if(cn==2)
435 t2 = vextq_u16(tcurr, tnext, 2);
436 else if(cn==3)
437 t2 = vextq_u16(tcurr, tnext, 3);
438 else if(cn==4)
439 t2 = vextq_u16(tcurr, tnext, 4);
440
441 t0 = vqaddq_u16(t0, vqaddq_u16(t1, t2));
442
443 #ifdef FLOAT_VARIANT_1_9
444 uint32x4_t tres1 = vmovl_u16(vget_low_u16(t0));
445 uint32x4_t tres2 = vmovl_u16(vget_high_u16(t0));
446 float32x4_t vf1 = vmulq_f32(v1_9, vcvtq_f32_u32(tres1));
447 float32x4_t vf2 = vmulq_f32(v1_9, vcvtq_f32_u32(tres2));
448 tres1 = vcvtq_u32_f32(vaddq_f32(vf1, v0_5));
449 tres2 = vcvtq_u32_f32(vaddq_f32(vf2, v0_5));
450 t0 = vcombine_u16(vmovn_u32(tres1),vmovn_u32(tres2));
451 vst1_u8(drow + x - 8, vmovn_u16(t0));
452 #else
453 int16x8_t tt0 = vqrdmulhq_s16(vreinterpretq_s16_u16(t0), vScale);
454 uint8x8_t it0 = vmovn_u16(vreinterpretq_u16_s16(tt0));
455 vst1_u8(drow + x - 8, it0);
456 #endif
457 }
458
459 x -= 8;
460 if(x == colsn){
461 x -= cn;
462 }
463 s16 prevx[4], rowx[4], nextx[4];
464 for( s32 k = 0; k < cn; k++ )
465 {
466 prevx[(k + x%cn)%cn] = srow2[x+k-cn] + srow1[x+k-cn] + srow0[x+k-cn];
467 rowx[(k + x%cn)%cn] = srow2[x+k] + srow1[x+k] + srow0[x+k];
468 }
469 for( ; x < colsn; x++ )
470 {
471 size_t xx = x%cn;
472 if(x+cn >= colsn) {
473 // make border
474 if (borderType == BORDER_MODE_CONSTANT)
475 {
476 nextx[xx] = borderValue;
477 } else if (borderType == BORDER_MODE_REFLECT101)
478 {
479 nextx[xx] = srow2[x-cn] + srow1[x-cn] + srow0[x-cn];
480 } else {
481 nextx[xx] = srow2[x] + srow1[x] + srow0[x];
482 }
483 } else {
484 nextx[xx] = srow2[x+cn] + srow1[x+cn] + srow0[x+cn];
485 }
486 *(drow+x) = internal::saturate_cast<u8>((prevx[xx] + rowx[xx] + nextx[xx])*(1/9.));
487 prevx[xx] = rowx[xx];
488 rowx[xx] = nextx[xx];
489 }
490 }
491 }
492 #else
493 (void)srcBase;
494 (void)srcStride;
495 (void)dstBase;
496 (void)dstStride;
497 (void)borderValue;
498 #endif
499 }
500
blur5x5(const Size2D & size,s32 cn,const u8 * srcBase,ptrdiff_t srcStride,u8 * dstBase,ptrdiff_t dstStride,BORDER_MODE borderType,u8 borderValue)501 void blur5x5(const Size2D &size, s32 cn,
502 const u8 * srcBase, ptrdiff_t srcStride,
503 u8 * dstBase, ptrdiff_t dstStride,
504 BORDER_MODE borderType, u8 borderValue)
505 {
506 internal::assertSupportedConfiguration(isBlurU8Supported(size, cn, borderType));
507 #ifdef CAROTENE_NEON
508 #define FLOAT_VARIANT_1_25
509 #ifdef FLOAT_VARIANT_1_25
510 float32x4_t v1_25 = vdupq_n_f32 (1.0f/25.0f);
511 float32x4_t v0_5 = vdupq_n_f32 (.5f);
512 #else
513 const int16x8_t vScale = vmovq_n_s16(1310);
514 #endif
515 size_t colsn = size.width*cn;
516
517 std::vector<u8> _tmp;
518 u8 *tmp = 0;
519 if (borderType == BORDER_MODE_CONSTANT)
520 {
521 _tmp.assign(colsn + 2*cn, borderValue);
522 tmp = &_tmp[cn];
523 }
524
525 uint16x8_t tprev = vdupq_n_u16(0x0);
526 uint16x8_t tcurr = tprev;
527 uint16x8_t tnext = tprev;
528 uint16x8_t t0, t1, t2, t3, t4;
529 for( size_t y = 0; y < size.height; y++ )
530 {
531 const u8 *srow0, *srow1;
532 const u8 *srow2 = internal::getRowPtr(srcBase, srcStride, y);
533 const u8 *srow3, *srow4;
534 u8 *drow = internal::getRowPtr(dstBase, dstStride, y);
535 if (borderType == BORDER_MODE_REFLECT101) {
536 srow0 = internal::getRowPtr(srcBase, srcStride, y > 1 ? y-2 : 2-y);
537 srow1 = internal::getRowPtr(srcBase, srcStride, y > 0 ? y-1 : 1);
538 srow3 = internal::getRowPtr(srcBase, srcStride, y < size.height-1 ? y+1 : size.height-2);
539 srow4 = internal::getRowPtr(srcBase, srcStride, y < size.height-2 ? y+2 : (size.height<<1)-4-y);
540 } else if (borderType == BORDER_MODE_CONSTANT) {
541 srow0 = y > 1 ? internal::getRowPtr(srcBase, srcStride, y-2) : tmp;
542 srow1 = y > 0 ? internal::getRowPtr(srcBase, srcStride, y-1) : tmp;
543 srow3 = y < size.height-1 ? internal::getRowPtr(srcBase, srcStride, y+1) : tmp;
544 srow4 = y < size.height-2 ? internal::getRowPtr(srcBase, srcStride, y+2) : tmp;
545 } else if (borderType == BORDER_MODE_REFLECT) {
546 srow0 = internal::getRowPtr(srcBase, srcStride, y > 1 ? y-2 : 1-y);
547 srow1 = internal::getRowPtr(srcBase, srcStride, y > 0 ? y-1 : 0);
548 srow3 = internal::getRowPtr(srcBase, srcStride, y < size.height-1 ? y+1 : size.height-1);
549 srow4 = internal::getRowPtr(srcBase, srcStride, y < size.height-2 ? y+2 : (size.height<<1)-3-y);
550 } else { // BORDER_MODE_REPLICATE
551 srow0 = internal::getRowPtr(srcBase, srcStride, y > 1 ? y-2 : 0);
552 srow1 = internal::getRowPtr(srcBase, srcStride, y > 0 ? y-1 : 0);
553 srow3 = internal::getRowPtr(srcBase, srcStride, y < size.height-1 ? y+1 : size.height-1);
554 srow4 = internal::getRowPtr(srcBase, srcStride, y < size.height-2 ? y+2 : size.height-1);
555 }
556
557 // do vertical convolution
558 size_t x = 0;
559 const size_t bcols = y + 3 < size.height ? colsn : (colsn - 8);
560 for( ; x <= bcols; x += 8 )
561 {
562 internal::prefetch(srow0 + x);
563 internal::prefetch(srow1 + x);
564 internal::prefetch(srow2 + x);
565 internal::prefetch(srow3 + x);
566 internal::prefetch(srow4 + x);
567
568 uint8x8_t x0 = vld1_u8(srow0 + x);
569 uint8x8_t x1 = vld1_u8(srow1 + x);
570 uint8x8_t x2 = vld1_u8(srow2 + x);
571 uint8x8_t x3 = vld1_u8(srow3 + x);
572 uint8x8_t x4 = vld1_u8(srow4 + x);
573
574 tprev = tcurr;
575 tcurr = tnext;
576 tnext = vaddw_u8(vaddq_u16(vaddl_u8(x0, x1), vaddl_u8(x2, x3)), x4);
577
578 if(!x) {
579 tcurr = tnext;
580
581 if(borderType == BORDER_MODE_REFLECT101 && size.width < 3)
582 {
583 x = 8;
584 break;
585 }
586
587 // make border
588 switch(cn)
589 {
590 case 1:
591 if (borderType == BORDER_MODE_CONSTANT)
592 {
593 tcurr = vsetq_lane_u16(borderValue, tcurr, 6);
594 tcurr = vsetq_lane_u16(borderValue, tcurr, 7);
595 }
596 else if (borderType == BORDER_MODE_REFLECT101)
597 {
598 tcurr = vsetq_lane_u16(vgetq_lane_u16(tcurr, 2),tcurr, 6);
599 tcurr = vsetq_lane_u16(vgetq_lane_u16(tcurr, 1),tcurr, 7);
600 }
601 else if (borderType == BORDER_MODE_REFLECT)
602 {
603 tcurr = vsetq_lane_u16(vgetq_lane_u16(tcurr, 1),tcurr, 6);
604 tcurr = vsetq_lane_u16(vgetq_lane_u16(tcurr, 0),tcurr, 7);
605 }
606 else
607 {
608 tcurr = vsetq_lane_u16(vgetq_lane_u16(tcurr, 0),tcurr, 6);
609 tcurr = vsetq_lane_u16(vgetq_lane_u16(tcurr, 0),tcurr, 7);
610 }
611 break;
612 case 2:
613 if (borderType == BORDER_MODE_CONSTANT)
614 {
615 tcurr = vsetq_lane_u16(borderValue, tcurr, 4);
616 tcurr = vsetq_lane_u16(borderValue, tcurr, 5);
617 tcurr = vsetq_lane_u16(borderValue, tcurr, 6);
618 tcurr = vsetq_lane_u16(borderValue, tcurr, 7);
619 }
620 else if (borderType == BORDER_MODE_REFLECT101)
621 {
622 tcurr = vsetq_lane_u16(vgetq_lane_u16(tcurr, 2),tcurr, 6);
623 tcurr = vsetq_lane_u16(vgetq_lane_u16(tcurr, 3),tcurr, 7);
624 }
625 else if (borderType == BORDER_MODE_REFLECT)
626 {
627 tcurr = vsetq_lane_u16(vgetq_lane_u16(tcurr, 2),tcurr, 4);
628 tcurr = vsetq_lane_u16(vgetq_lane_u16(tcurr, 3),tcurr, 5);
629 tcurr = vsetq_lane_u16(vgetq_lane_u16(tcurr, 0),tcurr, 6);
630 tcurr = vsetq_lane_u16(vgetq_lane_u16(tcurr, 1),tcurr, 7);
631 }
632 else
633 {
634 tcurr = vsetq_lane_u16(vgetq_lane_u16(tcurr, 0),tcurr, 4);
635 tcurr = vsetq_lane_u16(vgetq_lane_u16(tcurr, 1),tcurr, 5);
636 tcurr = vsetq_lane_u16(vgetq_lane_u16(tcurr, 0),tcurr, 6);
637 tcurr = vsetq_lane_u16(vgetq_lane_u16(tcurr, 1),tcurr, 7);
638 }
639 break;
640 case 3:
641 if (borderType == BORDER_MODE_CONSTANT)
642 {
643 tcurr = vsetq_lane_u16(borderValue, tcurr, 2);
644 tcurr = vsetq_lane_u16(borderValue, tcurr, 3);
645 tcurr = vsetq_lane_u16(borderValue, tcurr, 4);
646 tcurr = vsetq_lane_u16(borderValue, tcurr, 5);
647 tcurr = vsetq_lane_u16(borderValue, tcurr, 6);
648 tcurr = vsetq_lane_u16(borderValue, tcurr, 7);
649 }
650 else if (borderType == BORDER_MODE_REFLECT101)
651 {
652 tprev = vsetq_lane_u16(vgetq_lane_u16(tcurr, 6),tcurr, 2);
653 tprev = vsetq_lane_u16(vgetq_lane_u16(tcurr, 7),tprev, 3);
654 tprev = vsetq_lane_u16(vgetq_lane_u16(tcurr, 3),tprev, 5);
655 tprev = vsetq_lane_u16(vgetq_lane_u16(tcurr, 4),tprev, 6);
656 tprev = vsetq_lane_u16(vgetq_lane_u16(tcurr, 5),tprev, 7);
657 s16 lane8 = srow4[8] + srow3[8] + srow2[8] + srow1[8] + srow0[8];
658 tcurr = vsetq_lane_u16(lane8,tprev, 4);
659 }
660 else if (borderType == BORDER_MODE_REFLECT)
661 {
662 tprev = vsetq_lane_u16(vgetq_lane_u16(tcurr, 3),tcurr, 2);
663 tprev = vsetq_lane_u16(vgetq_lane_u16(tcurr, 4),tprev, 3);
664 tprev = vsetq_lane_u16(vgetq_lane_u16(tcurr, 5),tprev, 4);
665 tprev = vsetq_lane_u16(vgetq_lane_u16(tcurr, 0),tprev, 5);
666 tprev = vsetq_lane_u16(vgetq_lane_u16(tcurr, 1),tprev, 6);
667 tcurr = vsetq_lane_u16(vgetq_lane_u16(tcurr, 2),tprev, 7);
668 }
669 else
670 {
671 tprev = vsetq_lane_u16(vgetq_lane_u16(tcurr, 0),tcurr, 2);
672 tprev = vsetq_lane_u16(vgetq_lane_u16(tcurr, 1),tprev, 3);
673 tprev = vsetq_lane_u16(vgetq_lane_u16(tcurr, 2),tprev, 4);
674 tprev = vsetq_lane_u16(vgetq_lane_u16(tcurr, 0),tprev, 5);
675 tprev = vsetq_lane_u16(vgetq_lane_u16(tcurr, 1),tprev, 6);
676 tcurr = vsetq_lane_u16(vgetq_lane_u16(tcurr, 2),tprev, 7);
677 }
678 break;
679 case 4:
680 if (borderType == BORDER_MODE_CONSTANT)
681 {
682 tcurr = vsetq_lane_u16(borderValue, tcurr, 0);
683 tcurr = vsetq_lane_u16(borderValue, tcurr, 1);
684 tcurr = vsetq_lane_u16(borderValue, tcurr, 2);
685 tcurr = vsetq_lane_u16(borderValue, tcurr, 3);
686 tcurr = vsetq_lane_u16(borderValue, tcurr, 4);
687 tcurr = vsetq_lane_u16(borderValue, tcurr, 5);
688 tcurr = vsetq_lane_u16(borderValue, tcurr, 6);
689 tcurr = vsetq_lane_u16(borderValue, tcurr, 7);
690 }
691 else if (borderType == BORDER_MODE_REFLECT101)
692 {
693 s16 lane8 = srow4[ 8] + srow3[ 8] + srow2[ 8] + srow1[ 8] + srow0[ 8];
694 s16 lane9 = srow4[ 9] + srow3[ 9] + srow2[ 9] + srow1[ 9] + srow0[ 9];
695 s16 lane10 = srow4[10] + srow3[10] + srow2[10] + srow1[10] + srow0[10];
696 s16 lane11 = srow4[11] + srow3[11] + srow2[11] + srow1[11] + srow0[11];
697 tprev = vsetq_lane_u16( lane8,tcurr, 0);
698 tprev = vsetq_lane_u16( lane9,tprev, 1);
699 tprev = vsetq_lane_u16(lane10,tprev, 2);
700 tcurr = vsetq_lane_u16(lane11,tprev, 3);
701 }
702 else if (borderType == BORDER_MODE_REFLECT)
703 {
704 tcurr = vcombine_u16(vget_high_u16(tcurr),vget_low_u16(tcurr));//swap 64-bit parts
705 }
706 else
707 {
708 tcurr = vcombine_u16(vget_low_u16(tcurr),vget_low_u16(tcurr));//double 64-bit part
709 }
710 break;
711 }
712 continue;
713 }
714 switch(cn)
715 {
716 case 1:
717 t0 = vextq_u16(tprev, tcurr, 6);
718 t1 = vextq_u16(tprev, tcurr, 7);
719 t2 = tcurr;
720 t3 = vextq_u16(tcurr, tnext, 1);
721 t4 = vextq_u16(tcurr, tnext, 2);
722 break;
723 case 2:
724 t0 = vextq_u16(tprev, tcurr, 4);
725 t1 = vextq_u16(tprev, tcurr, 6);
726 t2 = tcurr;
727 t3 = vextq_u16(tcurr, tnext, 2);
728 t4 = vextq_u16(tcurr, tnext, 4);
729 break;
730 case 3:
731 t0 = vextq_u16(tprev, tcurr, 2);
732 t1 = vextq_u16(tprev, tcurr, 5);
733 t2 = tcurr;
734 t3 = vextq_u16(tcurr, tnext, 3);
735 t4 = vextq_u16(tcurr, tnext, 6);
736 break;
737 case 4:
738 t0 = tprev;
739 t1 = vextq_u16(tprev, tcurr, 4);
740 t2 = tcurr;
741 t3 = vextq_u16(tcurr, tnext, 4);
742 t4 = tnext;
743 break;
744 default:
745 internal::assertSupportedConfiguration(false);//Unsupported channels number
746 return;
747 }
748 t0 = vqaddq_u16(vqaddq_u16(vqaddq_u16(t0, t1), vqaddq_u16(t2, t3)), t4);
749
750 #ifdef FLOAT_VARIANT_1_25
751 uint32x4_t tres1 = vmovl_u16(vget_low_u16(t0));
752 uint32x4_t tres2 = vmovl_u16(vget_high_u16(t0));
753 float32x4_t vf1 = vmulq_f32(v1_25, vcvtq_f32_u32(tres1));
754 float32x4_t vf2 = vmulq_f32(v1_25, vcvtq_f32_u32(tres2));
755 tres1 = vcvtq_u32_f32(vaddq_f32(vf1, v0_5));
756 tres2 = vcvtq_u32_f32(vaddq_f32(vf2, v0_5));
757 t0 = vcombine_u16(vmovn_u32(tres1),vmovn_u32(tres2));
758 vst1_u8(drow + x - 8, vmovn_u16(t0));
759 #else
760 int16x8_t tt0 = vqrdmulhq_s16(vreinterpretq_s16_u16(t0), vScale);
761 uint8x8_t it0 = vmovn_u16(vreinterpretq_u16_s16(tt0));
762 vst1_u8(drow + x - 8, it0);
763 #endif
764 }
765
766 x -= 8;
767 if(x == colsn){
768 x -= cn;
769 }
770 s16 pprevx[4], prevx[4], rowx[4], nextx[4], nnextx[4];
771 ptrdiff_t px = x / cn;
772 for( s32 k = 0; k < cn; k++ )
773 {
774 ptrdiff_t ploc;
775 ploc = internal::borderInterpolate(px-2, size.width, borderType);
776 pprevx[k] = ploc < 0 ? 5*borderValue :
777 srow4[ploc*cn+k] + srow3[ploc*cn+k] + srow2[ploc*cn+k] + srow1[ploc*cn+k] + srow0[ploc*cn+k];
778
779 ploc = internal::borderInterpolate(px-1, size.width, borderType);
780 prevx[k] = ploc < 0 ? 5*borderValue :
781 srow4[ploc*cn+k] + srow3[ploc*cn+k] + srow2[ploc*cn+k] + srow1[ploc*cn+k] + srow0[ploc*cn+k];
782
783 rowx[k] = srow4[px*cn+k] + srow3[px*cn+k] + srow2[px*cn+k] + srow1[px*cn+k] + srow0[px*cn+k];
784
785 ploc = internal::borderInterpolate(px+1, size.width, borderType);
786 nextx[k] = ploc < 0 ? 5*borderValue :
787 srow4[ploc*cn+k] + srow3[ploc*cn+k] + srow2[ploc*cn+k] + srow1[ploc*cn+k] + srow0[ploc*cn+k];
788 }
789 x = px*cn;
790 for( ; x < colsn; x+=cn, px++ )
791 {
792 for( s32 k = 0; k < cn; k++ )
793 {
794 ptrdiff_t ploc = internal::borderInterpolate(px+2, size.width, borderType);
795 nnextx[k] = ploc < 0 ? 5*borderValue :
796 srow4[ploc*cn+k] + srow3[ploc*cn+k] + srow2[ploc*cn+k] + srow1[ploc*cn+k] + srow0[ploc*cn+k];
797 *(drow+x+k) = internal::saturate_cast<u8>((pprevx[k] + prevx[k] + rowx[k] + nextx[k] +nnextx[k])*(1/25.));
798 pprevx[k] = prevx[k];
799 prevx[k] = rowx[k];
800 rowx[k] = nextx[k];
801 nextx[k] = nnextx[k];
802 }
803 }
804 }
805 #else
806 (void)srcBase;
807 (void)srcStride;
808 (void)dstBase;
809 (void)dstStride;
810 (void)borderValue;
811 #endif
812 }
813
isBlurF32Supported(const Size2D & size,s32 cn,BORDER_MODE border)814 bool isBlurF32Supported(const Size2D &size, s32 cn, BORDER_MODE border)
815 {
816 return isSupportedConfiguration() &&
817 cn > 0 && cn <= 4 &&
818 size.width*cn >= 4 && size.height >= 2 &&
819 (border == BORDER_MODE_CONSTANT ||
820 border == BORDER_MODE_REFLECT101 ||
821 border == BORDER_MODE_REFLECT ||
822 border == BORDER_MODE_REPLICATE ||
823 border == BORDER_MODE_WRAP);
824 }
825
blur3x3(const Size2D & size,s32 cn,const f32 * srcBase,ptrdiff_t srcStride,f32 * dstBase,ptrdiff_t dstStride,BORDER_MODE borderType,f32 borderValue,Margin borderMargin)826 void blur3x3(const Size2D &size, s32 cn,
827 const f32 * srcBase, ptrdiff_t srcStride,
828 f32 * dstBase, ptrdiff_t dstStride,
829 BORDER_MODE borderType, f32 borderValue, Margin borderMargin)
830 {
831 internal::assertSupportedConfiguration(isBlurF32Supported(size, cn, borderType));
832 #ifdef CAROTENE_NEON
833 size_t colsn = size.width * cn;
834
835 std::vector<f32> _tmp;
836 f32 *tmp = 0;
837 if (borderType == BORDER_MODE_CONSTANT)
838 {
839 _tmp.assign(colsn + 2*cn, borderValue);
840 tmp = &_tmp[cn];
841 }
842
843 ptrdiff_t idx_l = internal::borderInterpolate(-1, size.width, borderType, borderMargin.left, borderMargin.right) * cn;
844 ptrdiff_t idx_r = internal::borderInterpolate(size.width, size.width, borderType, borderMargin.left, borderMargin.right) * cn;
845
846 //2-line buffer
847 std::vector<f32> _buf(4*(cn * (size.width + 2) + 32 / sizeof(f32)));
848 f32* lanea = internal::alignPtr(&_buf[cn], 32);
849 f32* laneA = internal::alignPtr(lanea + cn * (size.width + 2), 32);
850
851 f32* laneb = internal::alignPtr(laneA + cn * (size.width + 2), 32);
852 f32* laneB = internal::alignPtr(laneb + cn * (size.width + 2), 32);
853
854 if (borderType == BORDER_MODE_CONSTANT)
855 for (s32 k = 0; k < cn; ++k)
856 {
857 lanea[-cn+k] = borderValue;
858 lanea[colsn+k] = borderValue;
859 laneA[-cn+k] = borderValue;
860 laneA[colsn+k] = borderValue;
861 laneb[-cn+k] = borderValue;
862 laneb[colsn+k] = borderValue;
863 laneB[-cn+k] = borderValue;
864 laneB[colsn+k] = borderValue;
865 }
866
867 size_t i = 0;
868 f32* dsta = internal::getRowPtr(dstBase, dstStride, 0);
869 for (; i < size.height-1; i+=2)
870 {
871 //vertical convolution
872 ptrdiff_t idx_rm1 = internal::borderInterpolate(i - 1, size.height, borderType, borderMargin.top, borderMargin.bottom);
873 ptrdiff_t idx_rp2 = internal::borderInterpolate(i + 2, size.height, borderType, borderMargin.top, borderMargin.bottom);
874
875 const f32* ln0 = idx_rm1 >= -(ptrdiff_t)borderMargin.top ? internal::getRowPtr(srcBase, srcStride, idx_rm1) : tmp;
876 const f32* ln1 = internal::getRowPtr(srcBase, srcStride, i);
877 const f32* ln2 = internal::getRowPtr(srcBase, srcStride, i + 1);
878 const f32* ln3 = idx_rp2 >= -(ptrdiff_t)borderMargin.top ? internal::getRowPtr(srcBase, srcStride, idx_rp2) : tmp;
879
880 size_t x = 0;
881 for (; x <= colsn - 4; x += 4)
882 {
883 internal::prefetch(ln1 + x);
884 internal::prefetch(ln2 + x);
885 internal::prefetch(ln0 + x);
886 internal::prefetch(ln3 + x);
887 box3x3f32_vert:
888 float32x4_t v1 = vld1q_f32(ln1 + x);
889 float32x4_t v2 = vld1q_f32(ln2 + x);
890 float32x4_t v0 = vld1q_f32(ln0 + x);
891 float32x4_t v3 = vld1q_f32(ln3 + x);
892
893 float32x4_t v = vaddq_f32(v1, v2);
894 float32x4_t w0 = vaddq_f32(v, v0);
895 float32x4_t w1 = vaddq_f32(v, v3);
896
897 vst1q_f32(lanea + x, w0);
898 vst1q_f32(laneb + x, w1);
899 }
900 if(x < colsn)
901 {
902 x = colsn-4;
903 goto box3x3f32_vert;
904 }
905
906 //left&right borders
907 if (borderType != BORDER_MODE_CONSTANT)
908 for (s32 k = 0; k < cn; ++k)
909 {
910 lanea[-cn+k] = lanea[idx_l + k];
911 lanea[colsn+k] = lanea[idx_r + k];
912 laneb[-cn+k] = laneb[idx_l + k];
913 laneb[colsn+k] = laneb[idx_r + k];
914 }
915
916 //horizontal convolution (2 lines from previous iteration)
917 if (i > 0)
918 {
919 f32* dstb = internal::getRowPtr(dstBase, dstStride, i-1);
920 x = 0;
921 for (; x <= colsn - 4; x += 4)
922 {
923 internal::prefetch(laneA + x + cn);
924 internal::prefetch(laneB + x + cn);
925 box3x3f32_horiz:
926 float32x4_t lane0a = vld1q_f32(laneA + x - cn);
927 float32x4_t lane2a = vld1q_f32(laneA + x + cn);
928 float32x4_t lane1a = vld1q_f32(laneA + x);
929
930 float32x4_t lane0b = vld1q_f32(laneB + x - cn);
931 float32x4_t lane2b = vld1q_f32(laneB + x + cn);
932 float32x4_t lane1b = vld1q_f32(laneB + x);
933
934 float32x4_t va = vaddq_f32(lane0a, lane2a);
935 float32x4_t vb = vaddq_f32(lane0b, lane2b);
936 float32x4_t wa = vaddq_f32(va, lane1a);
937 float32x4_t wb = vaddq_f32(vb, lane1b);
938
939 vst1q_f32(dsta + x, wa);
940 vst1q_f32(dstb + x, wb);
941 }
942 if(x < colsn)
943 {
944 x = colsn-4;
945 goto box3x3f32_horiz;
946 }
947 dsta = internal::getRowPtr(dstBase, dstStride, i);
948 }
949
950 std::swap(lanea, laneA);
951 std::swap(laneb, laneB);
952 }
953
954 //last line
955 if(i < size.height)
956 {
957 //vertical convolution
958 ptrdiff_t idx_rm1 = internal::borderInterpolate(i - 1, size.height, borderType, borderMargin.top, borderMargin.bottom);
959 ptrdiff_t idx_rp1 = internal::borderInterpolate(i + 1, size.height, borderType, borderMargin.top, borderMargin.bottom);
960
961 const f32* ln0 = idx_rm1 >= -(ptrdiff_t)borderMargin.top ? internal::getRowPtr(srcBase, srcStride, idx_rm1) : tmp;
962 const f32* ln1 = internal::getRowPtr(srcBase, srcStride, i);
963 const f32* ln2 = idx_rp1 >= -(ptrdiff_t)borderMargin.top ? internal::getRowPtr(srcBase, srcStride, idx_rp1) : tmp;
964
965 size_t x = 0;
966 for (; x <= colsn - 4; x += 4)
967 {
968 internal::prefetch(ln0 + x);
969 internal::prefetch(ln1 + x);
970 internal::prefetch(ln2 + x);
971 box3x3f32_vert_ll:
972 float32x4_t v0 = vld1q_f32(ln0+x);
973 float32x4_t v1 = vld1q_f32(ln1+x);
974 float32x4_t v2 = vld1q_f32(ln2+x);
975
976 float32x4_t v = vaddq_f32(v0, v1);
977 float32x4_t w = vaddq_f32(v, v2);
978
979 vst1q_f32(lanea + x, w);
980 }
981 if(x < colsn)
982 {
983 x = colsn-4;
984 goto box3x3f32_vert_ll;
985 }
986
987 //left&right borders
988 if (borderType != BORDER_MODE_CONSTANT)
989 for (s32 k = 0; k < cn; ++k)
990 {
991 lanea[-cn+k] = lanea[idx_l + k];
992 lanea[colsn+k] = lanea[idx_r + k];
993 }
994
995 //horizontal convolution (last 3 lines)
996 x = 0;
997 f32* dstb = internal::getRowPtr(dstBase, dstStride, i-1);
998 f32* dstc = internal::getRowPtr(dstBase, dstStride, i);
999 for (; x <= colsn - 4; x += 4)
1000 {
1001 internal::prefetch(laneA + x + cn);
1002 internal::prefetch(laneB + x + cn);
1003 internal::prefetch(lanea + x + cn);
1004 box3x3f32_horiz_ll:
1005 float32x4_t lane0a = vld1q_f32(laneA + x - cn);
1006 float32x4_t lane2a = vld1q_f32(laneA + x + cn);
1007 float32x4_t lane1a = vld1q_f32(laneA + x);
1008
1009 float32x4_t lane0b = vld1q_f32(laneB + x - cn);
1010 float32x4_t lane2b = vld1q_f32(laneB + x + cn);
1011 float32x4_t lane1b = vld1q_f32(laneB + x);
1012
1013 float32x4_t lane0c = vld1q_f32(lanea + x - cn);
1014 float32x4_t lane2c = vld1q_f32(lanea + x + cn);
1015 float32x4_t lane1c = vld1q_f32(lanea + x);
1016
1017 float32x4_t va = vaddq_f32(lane0a, lane2a);
1018 float32x4_t vb = vaddq_f32(lane0b, lane2b);
1019 float32x4_t vc = vaddq_f32(lane0c, lane2c);
1020 float32x4_t wa = vaddq_f32(va, lane1a);
1021 float32x4_t wb = vaddq_f32(vb, lane1b);
1022 float32x4_t wc = vaddq_f32(vc, lane1c);
1023
1024 vst1q_f32(dsta + x, wa);
1025 vst1q_f32(dstb + x, wb);
1026 vst1q_f32(dstc + x, wc);
1027 }
1028 if(x < colsn)
1029 {
1030 x = colsn-4;
1031 goto box3x3f32_horiz_ll;
1032 }
1033 }
1034 else
1035 {
1036 //horizontal convolution (last 2 lines)
1037 f32* dstb = internal::getRowPtr(dstBase, dstStride, i-1);
1038 size_t x = 0;
1039 for (; x <= colsn - 4; x += 4)
1040 {
1041 internal::prefetch(laneA + x + cn);
1042 internal::prefetch(laneB + x + cn);
1043 box3x3f32_horiz_last2:
1044 float32x4_t lane0a = vld1q_f32(laneA + x - cn);
1045 float32x4_t lane2a = vld1q_f32(laneA + x + cn);
1046 float32x4_t lane1a = vld1q_f32(laneA + x);
1047
1048 float32x4_t lane0b = vld1q_f32(laneB + x - cn);
1049 float32x4_t lane2b = vld1q_f32(laneB + x + cn);
1050 float32x4_t lane1b = vld1q_f32(laneB + x);
1051
1052 float32x4_t va = vaddq_f32(lane0a, lane2a);
1053 float32x4_t vb = vaddq_f32(lane0b, lane2b);
1054 float32x4_t wa = vaddq_f32(va, lane1a);
1055 float32x4_t wb = vaddq_f32(vb, lane1b);
1056
1057 vst1q_f32(dsta + x, wa);
1058 vst1q_f32(dstb + x, wb);
1059 }
1060 if(x < colsn)
1061 {
1062 x = colsn-4;
1063 goto box3x3f32_horiz_last2;
1064 }
1065 }
1066 #else
1067 (void)srcBase;
1068 (void)srcStride;
1069 (void)dstBase;
1070 (void)dstStride;
1071 (void)borderValue;
1072 (void)borderMargin;
1073 #endif
1074 }
1075
isBlurS32Supported(const Size2D & size,s32 cn,BORDER_MODE border)1076 bool isBlurS32Supported(const Size2D &size, s32 cn, BORDER_MODE border)
1077 {
1078 return isSupportedConfiguration() &&
1079 cn > 0 && cn <= 4 &&
1080 size.width*cn >= 4 && size.height >= 2 &&
1081 (border == BORDER_MODE_CONSTANT ||
1082 border == BORDER_MODE_REFLECT101 ||
1083 border == BORDER_MODE_REFLECT ||
1084 border == BORDER_MODE_REPLICATE ||
1085 border == BORDER_MODE_WRAP);
1086 }
1087
blur3x3(const Size2D & size,s32 cn,const s32 * srcBase,ptrdiff_t srcStride,s32 * dstBase,ptrdiff_t dstStride,BORDER_MODE borderType,s32 borderValue,Margin borderMargin)1088 void blur3x3(const Size2D &size, s32 cn,
1089 const s32 * srcBase, ptrdiff_t srcStride,
1090 s32 * dstBase, ptrdiff_t dstStride,
1091 BORDER_MODE borderType, s32 borderValue, Margin borderMargin)
1092 {
1093 internal::assertSupportedConfiguration(isBlurS32Supported(size, cn, borderType));
1094 #ifdef CAROTENE_NEON
1095 size_t colsn = size.width * cn;
1096
1097 std::vector<s32> _tmp;
1098 s32 *tmp = 0;
1099 if (borderType == BORDER_MODE_CONSTANT)
1100 {
1101 _tmp.assign(colsn + 2*cn, borderValue);
1102 tmp = &_tmp[cn];
1103 }
1104
1105 ptrdiff_t idx_l = internal::borderInterpolate(-1, size.width, borderType, borderMargin.left, borderMargin.right) * cn;
1106 ptrdiff_t idx_r = internal::borderInterpolate(size.width, size.width, borderType, borderMargin.left, borderMargin.right) * cn;
1107
1108 //2-line buffer
1109 std::vector<s32> _buf(4*(cn * (size.width + 2) + 32 / sizeof(s32)));
1110 s32* lanea = internal::alignPtr(&_buf[cn], 32);
1111 s32* laneA = internal::alignPtr(lanea + cn * (size.width + 2), 32);
1112
1113 s32* laneb = internal::alignPtr(laneA + cn * (size.width + 2), 32);
1114 s32* laneB = internal::alignPtr(laneb + cn * (size.width + 2), 32);
1115
1116 if (borderType == BORDER_MODE_CONSTANT)
1117 for (s32 k = 0; k < cn; ++k)
1118 {
1119 lanea[-cn+k] = borderValue;
1120 lanea[colsn+k] = borderValue;
1121 laneA[-cn+k] = borderValue;
1122 laneA[colsn+k] = borderValue;
1123 laneb[-cn+k] = borderValue;
1124 laneb[colsn+k] = borderValue;
1125 laneB[-cn+k] = borderValue;
1126 laneB[colsn+k] = borderValue;
1127 }
1128
1129 size_t i = 0;
1130 s32* dsta = internal::getRowPtr(dstBase, dstStride, 0);
1131 for (; i < size.height-1; i+=2)
1132 {
1133 //vertical convolution
1134 ptrdiff_t idx_rm1 = internal::borderInterpolate(i - 1, size.height, borderType, borderMargin.top, borderMargin.bottom);
1135 ptrdiff_t idx_rp2 = internal::borderInterpolate(i + 2, size.height, borderType, borderMargin.top, borderMargin.bottom);
1136
1137 const s32* ln0 = idx_rm1 >= -(ptrdiff_t)borderMargin.top ? internal::getRowPtr(srcBase, srcStride, idx_rm1) : tmp;
1138 const s32* ln1 = internal::getRowPtr(srcBase, srcStride, i);
1139 const s32* ln2 = internal::getRowPtr(srcBase, srcStride, i + 1);
1140 const s32* ln3 = idx_rp2 >= -(ptrdiff_t)borderMargin.top ? internal::getRowPtr(srcBase, srcStride, idx_rp2) : tmp;
1141
1142 size_t x = 0;
1143 for (; x <= colsn - 4; x += 4)
1144 {
1145 internal::prefetch(ln1 + x);
1146 internal::prefetch(ln2 + x);
1147 internal::prefetch(ln0 + x);
1148 internal::prefetch(ln3 + x);
1149 box3x3s32_vert:
1150 int32x4_t v1 = vld1q_s32(ln1 + x);
1151 int32x4_t v2 = vld1q_s32(ln2 + x);
1152 int32x4_t v0 = vld1q_s32(ln0 + x);
1153 int32x4_t v3 = vld1q_s32(ln3 + x);
1154
1155 int32x4_t v = vaddq_s32(v1, v2);
1156 int32x4_t w0 = vaddq_s32(v, v0);
1157 int32x4_t w1 = vaddq_s32(v, v3);
1158
1159 vst1q_s32(lanea + x, w0);
1160 vst1q_s32(laneb + x, w1);
1161 }
1162 if(x < colsn)
1163 {
1164 x = colsn-4;
1165 goto box3x3s32_vert;
1166 }
1167
1168 //left&right borders
1169 if (borderType != BORDER_MODE_CONSTANT)
1170 for (s32 k = 0; k < cn; ++k)
1171 {
1172 lanea[-cn+k] = lanea[idx_l + k];
1173 lanea[colsn+k] = lanea[idx_r + k];
1174 laneb[-cn+k] = laneb[idx_l + k];
1175 laneb[colsn+k] = laneb[idx_r + k];
1176 }
1177
1178 //horizontal convolution (2 lines from previous iteration)
1179 if (i > 0)
1180 {
1181 s32* dstb = internal::getRowPtr(dstBase, dstStride, i-1);
1182 x = 0;
1183 for (; x <= colsn - 4; x += 4)
1184 {
1185 internal::prefetch(laneA + x + cn);
1186 internal::prefetch(laneB + x + cn);
1187 box3x3s32_horiz:
1188 int32x4_t lane0a = vld1q_s32(laneA + x - cn);
1189 int32x4_t lane2a = vld1q_s32(laneA + x + cn);
1190 int32x4_t lane1a = vld1q_s32(laneA + x);
1191
1192 int32x4_t lane0b = vld1q_s32(laneB + x - cn);
1193 int32x4_t lane2b = vld1q_s32(laneB + x + cn);
1194 int32x4_t lane1b = vld1q_s32(laneB + x);
1195
1196 int32x4_t va = vaddq_s32(lane0a, lane2a);
1197 int32x4_t vb = vaddq_s32(lane0b, lane2b);
1198 int32x4_t wa = vaddq_s32(va, lane1a);
1199 int32x4_t wb = vaddq_s32(vb, lane1b);
1200
1201 vst1q_s32(dsta + x, wa);
1202 vst1q_s32(dstb + x, wb);
1203 }
1204 if(x < colsn)
1205 {
1206 x = colsn-4;
1207 goto box3x3s32_horiz;
1208 }
1209 dsta = internal::getRowPtr(dstBase, dstStride, i);
1210 }
1211
1212 std::swap(lanea, laneA);
1213 std::swap(laneb, laneB);
1214 }
1215 //last line
1216 if(i < size.height)
1217 {
1218 //vertical convolution
1219 ptrdiff_t idx_rm1 = internal::borderInterpolate(i - 1, size.height, borderType, borderMargin.top, borderMargin.bottom);
1220 ptrdiff_t idx_rp1 = internal::borderInterpolate(i + 1, size.height, borderType, borderMargin.top, borderMargin.bottom);
1221
1222 const s32* ln0 = idx_rm1 >= -(ptrdiff_t)borderMargin.top ? internal::getRowPtr(srcBase, srcStride, idx_rm1) : tmp;
1223 const s32* ln1 = internal::getRowPtr(srcBase, srcStride, i);
1224 const s32* ln2 = idx_rp1 >= -(ptrdiff_t)borderMargin.top ? internal::getRowPtr(srcBase, srcStride, idx_rp1) : tmp;
1225
1226 size_t x = 0;
1227 for (; x <= colsn - 4; x += 4)
1228 {
1229 internal::prefetch(ln0 + x);
1230 internal::prefetch(ln1 + x);
1231 internal::prefetch(ln2 + x);
1232 box3x3s32_vert_ll:
1233 int32x4_t v0 = vld1q_s32(ln0+x);
1234 int32x4_t v1 = vld1q_s32(ln1+x);
1235 int32x4_t v2 = vld1q_s32(ln2+x);
1236
1237 int32x4_t v = vaddq_s32(v0, v1);
1238 int32x4_t w = vaddq_s32(v, v2);
1239
1240 vst1q_s32(lanea + x, w);
1241 }
1242 if(x < colsn)
1243 {
1244 x = colsn-4;
1245 goto box3x3s32_vert_ll;
1246 }
1247
1248 //left&right borders
1249 if (borderType != BORDER_MODE_CONSTANT)
1250 for (s32 k = 0; k < cn; ++k)
1251 {
1252 lanea[-cn+k] = lanea[idx_l + k];
1253 lanea[colsn+k] = lanea[idx_r + k];
1254 }
1255
1256 //horizontal convolution (last 3 lines)
1257 x = 0;
1258 s32* dstb = internal::getRowPtr(dstBase, dstStride, i-1);
1259 s32* dstc = internal::getRowPtr(dstBase, dstStride, i);
1260 for (; x <= colsn - 4; x += 4)
1261 {
1262 internal::prefetch(laneA + x + cn);
1263 internal::prefetch(laneB + x + cn);
1264 internal::prefetch(lanea + x + cn);
1265 box3x3s32_horiz_ll:
1266 int32x4_t lane0a = vld1q_s32(laneA + x - cn);
1267 int32x4_t lane2a = vld1q_s32(laneA + x + cn);
1268 int32x4_t lane1a = vld1q_s32(laneA + x);
1269
1270 int32x4_t lane0b = vld1q_s32(laneB + x - cn);
1271 int32x4_t lane2b = vld1q_s32(laneB + x + cn);
1272 int32x4_t lane1b = vld1q_s32(laneB + x);
1273
1274 int32x4_t lane0c = vld1q_s32(lanea + x - cn);
1275 int32x4_t lane2c = vld1q_s32(lanea + x + cn);
1276 int32x4_t lane1c = vld1q_s32(lanea + x);
1277
1278 int32x4_t va = vaddq_s32(lane0a, lane2a);
1279 int32x4_t vb = vaddq_s32(lane0b, lane2b);
1280 int32x4_t vc = vaddq_s32(lane0c, lane2c);
1281 int32x4_t wa = vaddq_s32(va, lane1a);
1282 int32x4_t wb = vaddq_s32(vb, lane1b);
1283 int32x4_t wc = vaddq_s32(vc, lane1c);
1284
1285 vst1q_s32(dsta + x, wa);
1286 vst1q_s32(dstb + x, wb);
1287 vst1q_s32(dstc + x, wc);
1288 }
1289 if(x < colsn)
1290 {
1291 x = colsn-4;
1292 goto box3x3s32_horiz_ll;
1293 }
1294 }
1295 else
1296 {
1297 //horizontal convolution (last 2 lines)
1298 s32* dstb = internal::getRowPtr(dstBase, dstStride, i-1);
1299 size_t x = 0;
1300 for (; x <= colsn - 4; x += 4)
1301 {
1302 internal::prefetch(laneA + x + cn);
1303 internal::prefetch(laneB + x + cn);
1304 box3x3s32_horiz_last2:
1305 int32x4_t lane0a = vld1q_s32(laneA + x - cn);
1306 int32x4_t lane2a = vld1q_s32(laneA + x + cn);
1307 int32x4_t lane1a = vld1q_s32(laneA + x);
1308
1309 int32x4_t lane0b = vld1q_s32(laneB + x - cn);
1310 int32x4_t lane2b = vld1q_s32(laneB + x + cn);
1311 int32x4_t lane1b = vld1q_s32(laneB + x);
1312
1313 int32x4_t va = vaddq_s32(lane0a, lane2a);
1314 int32x4_t vb = vaddq_s32(lane0b, lane2b);
1315 int32x4_t wa = vaddq_s32(va, lane1a);
1316 int32x4_t wb = vaddq_s32(vb, lane1b);
1317
1318 vst1q_s32(dsta + x, wa);
1319 vst1q_s32(dstb + x, wb);
1320 }
1321 if(x < colsn)
1322 {
1323 x = colsn-4;
1324 goto box3x3s32_horiz_last2;
1325 }
1326 }
1327 #else
1328 (void)srcBase;
1329 (void)srcStride;
1330 (void)dstBase;
1331 (void)dstStride;
1332 (void)borderValue;
1333 (void)borderMargin;
1334 #endif
1335 }
1336
1337 } //namespace CAROTENE_NS
1338