• Home
  • Line#
  • Scopes#
  • Navigate#
  • Raw
  • Download
1 /*
2  * By downloading, copying, installing or using the software you agree to this license.
3  * If you do not agree to this license, do not download, install,
4  * copy or use the software.
5  *
6  *
7  *                           License Agreement
8  *                For Open Source Computer Vision Library
9  *                        (3-clause BSD License)
10  *
11  * Copyright (C) 2012-2015, NVIDIA Corporation, all rights reserved.
12  * Third party copyrights are property of their respective owners.
13  *
14  * Redistribution and use in source and binary forms, with or without modification,
15  * are permitted provided that the following conditions are met:
16  *
17  *   * Redistributions of source code must retain the above copyright notice,
18  *     this list of conditions and the following disclaimer.
19  *
20  *   * Redistributions in binary form must reproduce the above copyright notice,
21  *     this list of conditions and the following disclaimer in the documentation
22  *     and/or other materials provided with the distribution.
23  *
24  *   * Neither the names of the copyright holders nor the names of the contributors
25  *     may be used to endorse or promote products derived from this software
26  *     without specific prior written permission.
27  *
28  * This software is provided by the copyright holders and contributors "as is" and
29  * any express or implied warranties, including, but not limited to, the implied
30  * warranties of merchantability and fitness for a particular purpose are disclaimed.
31  * In no event shall copyright holders or contributors be liable for any direct,
32  * indirect, incidental, special, exemplary, or consequential damages
33  * (including, but not limited to, procurement of substitute goods or services;
34  * loss of use, data, or profits; or business interruption) however caused
35  * and on any theory of liability, whether in contract, strict liability,
36  * or tort (including negligence or otherwise) arising in any way out of
37  * the use of this software, even if advised of the possibility of such damage.
38  */
39 
40 #include <vector>
41 
42 #include "common.hpp"
43 #include "saturate_cast.hpp"
44 
45 namespace CAROTENE_NS {
46 
isBlur3x3Supported(const Size2D & size,BORDER_MODE border)47 bool isBlur3x3Supported(const Size2D &size, BORDER_MODE border)
48 {
49     return isSupportedConfiguration() && size.width >= 8 &&
50         (border == BORDER_MODE_CONSTANT ||
51             border == BORDER_MODE_REPLICATE);
52 }
53 
blur3x3(const Size2D & size,const u8 * srcBase,ptrdiff_t srcStride,u8 * dstBase,ptrdiff_t dstStride,BORDER_MODE border,u8 borderValue)54 void blur3x3(const Size2D &size,
55              const u8 * srcBase, ptrdiff_t srcStride,
56              u8 * dstBase, ptrdiff_t dstStride,
57              BORDER_MODE border, u8 borderValue)
58 {
59     internal::assertSupportedConfiguration(isBlur3x3Supported(size, border));
60 #ifdef CAROTENE_NEON
61     const int16x8_t v_scale = vmovq_n_s16(3640);
62     const uint16x8_t v_border_x3 = vdupq_n_u16(borderValue * 3);
63     const uint16x8_t v_zero = vdupq_n_u16(0);
64     const uint8x8_t v_border = vdup_n_u8(borderValue);
65 
66     uint16x8_t tprev = v_zero, tcurr = v_zero, tnext = v_zero;
67     uint16x8_t t0 = v_zero, t1 = v_zero, t2 = v_zero;
68 
69     ptrdiff_t width = (ptrdiff_t)size.width, height = (ptrdiff_t)size.height;
70 
71     for (ptrdiff_t y = 0; y < height; ++y)
72     {
73         const u8 * srow0 = y == 0 && border == BORDER_MODE_CONSTANT ? NULL : internal::getRowPtr(srcBase, srcStride, std::max<ptrdiff_t>(y - 1, 0));
74         const u8 * srow1 = internal::getRowPtr(srcBase, srcStride, y);
75         const u8 * srow2 = y + 1 == height && border == BORDER_MODE_CONSTANT ? NULL : internal::getRowPtr(srcBase, srcStride, std::min(y + 1, height - 1));
76         u8 * drow = internal::getRowPtr(dstBase, dstStride, y);
77 
78         s16 prevx = 0, currx = 0, nextx = 0;
79         ptrdiff_t x = 0;
80         const ptrdiff_t bwidth = y + 2 < height ? width : (width - 8);
81 
82         // perform vertical convolution
83         for ( ; x <= bwidth; x += 8)
84         {
85             internal::prefetch(srow0 + x);
86             internal::prefetch(srow1 + x);
87             internal::prefetch(srow2 + x);
88 
89             uint8x8_t x0 = !srow0 ? v_border : vld1_u8(srow0 + x);
90             uint8x8_t x1 = vld1_u8(srow1 + x);
91             uint8x8_t x2 = !srow2 ? v_border : vld1_u8(srow2 + x);
92 
93             // calculate values for plain CPU part below if needed
94             if (x + 8 >= bwidth)
95             {
96                 ptrdiff_t x3 = x == width ? width - 1 : x;
97                 ptrdiff_t x4 = border == BORDER_MODE_CONSTANT ? x3 - 1 : std::max<ptrdiff_t>(x3 - 1, 0);
98 
99                 if (border == BORDER_MODE_CONSTANT && x4 < 0)
100                     prevx = borderValue;
101                 else
102                     prevx = (srow2 ? srow2[x4] : borderValue) + srow1[x4] + (srow0 ? srow0[x4] : borderValue);
103 
104                 currx = (srow2 ? srow2[x3] : borderValue) + srow1[x3] + (srow0 ? srow0[x3] : borderValue);
105             }
106 
107             // make shift
108             if (x)
109             {
110                 tprev = tcurr;
111                 tcurr = tnext;
112             }
113 
114             // and calculate next value
115             tnext = vaddw_u8(vaddl_u8(x0, x1), x2);
116 
117             // make extrapolation for the first elements
118             if (!x)
119             {
120                 // make border
121                 if (border == BORDER_MODE_CONSTANT)
122                     tcurr = v_border_x3;
123                 else if (border == BORDER_MODE_REPLICATE)
124                     tcurr = vdupq_n_u16(vgetq_lane_u16(tnext, 0));
125 
126                 continue;
127             }
128 
129             // combine 3 "shifted" vectors
130             t0 = vextq_u16(tprev, tcurr, 7);
131             t1 = tcurr;
132             t2 = vextq_u16(tcurr, tnext, 1);
133 
134             // and add them
135             t0 = vqaddq_u16(t0, vqaddq_u16(t1, t2));
136 
137             int16x8_t tt0 = vqrdmulhq_s16(vreinterpretq_s16_u16(t0), v_scale);
138             uint8x8_t it0 = vmovn_u16(vreinterpretq_u16_s16(tt0));
139             vst1_u8(drow + x - 8, it0);
140         }
141 
142         x -= 8;
143         if (x == width)
144             --x;
145 
146         for ( ; x < width; ++x)
147         {
148             // make extrapolation for the last elements
149             if (x + 1 >= width)
150             {
151                 if (border == BORDER_MODE_CONSTANT)
152                     nextx = borderValue * 3;
153                 else if (border == BORDER_MODE_REPLICATE)
154                     nextx = srow2[x] + srow1[x] + srow0[x];
155             }
156             else
157                 nextx = (srow2 ? srow2[x + 1] : borderValue) +
158                                  srow1[x + 1] +
159                         (srow0 ? srow0[x + 1] : borderValue);
160 
161             f32 val = (prevx + currx + nextx) * (1 / 9.f) + 0.5f;
162             drow[x] = internal::saturate_cast<u8>((s32)val);
163 
164             // make shift
165             prevx = currx;
166             currx = nextx;
167         }
168     }
169 #else
170     (void)size;
171     (void)srcBase;
172     (void)srcStride;
173     (void)dstBase;
174     (void)dstStride;
175     (void)border;
176     (void)borderValue;
177 #endif
178 }
179 
isBlurU8Supported(const Size2D & size,s32 cn,BORDER_MODE border)180 bool isBlurU8Supported(const Size2D &size, s32 cn, BORDER_MODE border)
181 {
182     return isSupportedConfiguration() &&
183            cn > 0 && cn <= 4 &&
184            size.width*cn >= 8 && size.height >= 2 &&
185            (border == BORDER_MODE_CONSTANT ||
186             border == BORDER_MODE_REFLECT101 ||
187             border == BORDER_MODE_REFLECT ||
188             border == BORDER_MODE_REPLICATE);
189 }
190 
blur3x3(const Size2D & size,s32 cn,const u8 * srcBase,ptrdiff_t srcStride,u8 * dstBase,ptrdiff_t dstStride,BORDER_MODE borderType,u8 borderValue)191 void blur3x3(const Size2D &size, s32 cn,
192              const u8 * srcBase, ptrdiff_t srcStride,
193              u8 * dstBase, ptrdiff_t dstStride,
194              BORDER_MODE borderType, u8 borderValue)
195 {
196     internal::assertSupportedConfiguration(isBlurU8Supported(size, cn, borderType));
197 #ifdef CAROTENE_NEON
198 //#define FLOAT_VARIANT_1_9
199 #ifdef FLOAT_VARIANT_1_9
200     float32x4_t v1_9 = vdupq_n_f32 (1.0/9.0);
201     float32x4_t v0_5 = vdupq_n_f32 (.5);
202 #else
203     const int16x8_t vScale = vmovq_n_s16(3640);
204 #endif
205 
206     size_t colsn = size.width*cn;
207 
208     std::vector<u8> _tmp;
209     u8 *tmp = 0;
210     if (borderType == BORDER_MODE_CONSTANT)
211     {
212         _tmp.assign(colsn + 2*cn, borderValue);
213         tmp = &_tmp[cn];
214     }
215 
216     uint16x8_t tprev = vdupq_n_u16(0x0);
217     uint16x8_t tcurr = tprev;
218     uint16x8_t tnext = tprev;
219     uint16x8_t t0, t1, t2;
220     if(cn == 1)
221     {
222         for( size_t y = 0; y < size.height; y++ )
223         {
224             const u8* srow0;
225             const u8* srow1 = internal::getRowPtr(srcBase, srcStride, y);
226             const u8* srow2;
227             u8* drow = internal::getRowPtr(dstBase, dstStride, y);
228             if (borderType == BORDER_MODE_REFLECT101) {
229                 srow0 = internal::getRowPtr(srcBase, srcStride, y > 0 ? y-1 : 1);
230                 srow2 = internal::getRowPtr(srcBase, srcStride, y < size.height-1 ? y+1 : size.height-2);
231             } else  if (borderType == BORDER_MODE_CONSTANT) {
232                 srow0 = y > 0 ? internal::getRowPtr(srcBase, srcStride, y-1) : tmp;
233                 srow2 =  y < size.height-1 ? internal::getRowPtr(srcBase, srcStride, y+1) : tmp;
234             } else { // BORDER_MODE_REFLECT || BORDER_MODE_REPLICATE
235                 srow0 = internal::getRowPtr(srcBase, srcStride, y > 0 ? y-1 : 0);
236                 srow2 = internal::getRowPtr(srcBase, srcStride, y < size.height-1 ? y+1 : size.height-1);
237             }
238 
239             // do vertical convolution
240             size_t x = 0;
241             const size_t bcols = y + 2 < size.height ? colsn : (colsn - 8);
242             for( ; x <= bcols; x += 8 )
243             {
244                 internal::prefetch(srow0 + x);
245                 internal::prefetch(srow1 + x);
246                 internal::prefetch(srow2 + x);
247 
248                 uint8x8_t x0 = vld1_u8(srow0 + x);
249                 uint8x8_t x1 = vld1_u8(srow1 + x);
250                 uint8x8_t x2 = vld1_u8(srow2 + x);
251 
252                 tprev = tcurr;
253                 tcurr = tnext;
254                 tnext = vaddw_u8(vaddl_u8(x0, x1), x2);
255 
256                 if(!x) {
257                     tcurr = tnext;
258 
259                     // make border
260                         if (borderType == BORDER_MODE_CONSTANT)
261                         {
262                             tcurr = vsetq_lane_u16(borderValue, tcurr, 7);
263                         }
264                         else if (borderType == BORDER_MODE_REFLECT101)
265                         {
266                             tcurr = vsetq_lane_u16(vgetq_lane_u16(tcurr, 1),tcurr, 7);
267                         }
268                         else // borderType == BORDER_MODE_REFLECT || borderType == BORDER_MODE_REPLICATE
269                         {
270                             tcurr = vsetq_lane_u16(vgetq_lane_u16(tcurr, 0),tcurr, 7);
271                         }
272                     continue;
273                 }
274 
275                 t0 = vextq_u16(tprev, tcurr, 7);
276                 t1 = tcurr;
277                 t2 = vextq_u16(tcurr, tnext, 1);
278 
279                 t0 = vqaddq_u16(t0, vqaddq_u16(t1, t2));
280 
281 #ifdef FLOAT_VARIANT_1_9
282                 uint32x4_t tres1 = vmovl_u16(vget_low_u16(t0));
283                 uint32x4_t tres2 = vmovl_u16(vget_high_u16(t0));
284                 float32x4_t vf1 = vmulq_f32(v1_9, vcvtq_f32_u32(tres1));
285                 float32x4_t vf2 = vmulq_f32(v1_9, vcvtq_f32_u32(tres2));
286                 tres1 = vcvtq_u32_f32(vaddq_f32(vf1, v0_5));
287                 tres2 = vcvtq_u32_f32(vaddq_f32(vf2, v0_5));
288                 t0 = vcombine_u16(vmovn_u32(tres1),vmovn_u32(tres2));
289                 vst1_u8(drow + x - 8, vmovn_u16(t0));
290 #else
291                 int16x8_t tt0 = vqrdmulhq_s16(vreinterpretq_s16_u16(t0), vScale);
292                 uint8x8_t it0 = vmovn_u16(vreinterpretq_u16_s16(tt0));
293                 vst1_u8(drow + x - 8, it0);
294 #endif
295             }
296 
297             x -= 8;
298             if(x == colsn){
299                 x--;
300             }
301             s16 prevx, rowx, nextx;
302             prevx = srow2[x-1] + srow1[x-1] + srow0[x-1];
303             rowx = srow2[x] + srow1[x] + srow0[x];
304             for( ; x < colsn; x++ )
305             {
306                 if(x+1 >= colsn) {
307                     // make border
308                     if (borderType == BORDER_MODE_CONSTANT)
309                     {
310                         nextx = borderValue;
311                     } else if (borderType == BORDER_MODE_REFLECT101)
312                     {
313                         nextx = srow2[x-1] + srow1[x-1] + srow0[x-1];
314                     } else {
315                         nextx = srow2[x] + srow1[x] + srow0[x];
316                     }
317                 } else {
318                     nextx = srow2[x+1] + srow1[x+1] + srow0[x+1];
319                 }
320                 *(drow+x) = internal::saturate_cast<u8>((prevx + rowx + nextx)*(1/9.));
321                 prevx = rowx;
322                 rowx = nextx;
323             }
324         }
325     }
326     else
327     {
328         for( size_t y = 0; y < size.height; y++ )
329         {
330             const u8* srow0;
331             const u8* srow1 = internal::getRowPtr(srcBase, srcStride, y);
332             const u8* srow2;
333             u8* drow = internal::getRowPtr(dstBase, dstStride, y);
334             if (borderType == BORDER_MODE_REFLECT101) {
335                 srow0 = internal::getRowPtr(srcBase, srcStride, y > 0 ? y-1 : 1);
336                 srow2 = internal::getRowPtr(srcBase, srcStride, y < size.height-1 ? y+1 : size.height-2);
337             } else  if (borderType == BORDER_MODE_CONSTANT) {
338                 srow0 = y > 0 ? internal::getRowPtr(srcBase, srcStride, y-1) : tmp;
339                 srow2 =  y < size.height-1 ? internal::getRowPtr(srcBase, srcStride, y+1) : tmp;
340             } else { // BORDER_MODE_REFLECT || BORDER_MODE_REPLICATE
341                 srow0 = internal::getRowPtr(srcBase, srcStride, y > 0 ? y-1 : 0);
342                 srow2 = internal::getRowPtr(srcBase, srcStride, y < size.height-1 ? y+1 : size.height-1);
343             }
344 
345             // do vertical convolution
346             size_t x = 0;
347             const size_t bcols = y + 2 < size.height ? colsn : (colsn - 8);
348             for( ; x <= bcols; x += 8 )
349             {
350                 internal::prefetch(srow0 + x);
351                 internal::prefetch(srow1 + x);
352                 internal::prefetch(srow2 + x);
353 
354                 uint8x8_t x0 = vld1_u8(srow0 + x);
355                 uint8x8_t x1 = vld1_u8(srow1 + x);
356                 uint8x8_t x2 = vld1_u8(srow2 + x);
357 
358                 tprev = tcurr;
359                 tcurr = tnext;
360                 tnext = vaddw_u8(vaddl_u8(x0, x1), x2);
361 
362                 if(!x) {
363                     tcurr = tnext;
364 
365                     // make border
366                     switch(cn)
367                     {
368                     case 2:
369                         if (borderType == BORDER_MODE_CONSTANT)
370                         {
371                             tcurr = vsetq_lane_u16(borderValue, tcurr, 6);
372                             tcurr = vsetq_lane_u16(borderValue, tcurr, 7);
373                         }
374                         else if (borderType == BORDER_MODE_REFLECT101)
375                         {
376                             tcurr = vsetq_lane_u16(vgetq_lane_u16(tcurr, 2),tcurr, 6);
377                             tcurr = vsetq_lane_u16(vgetq_lane_u16(tcurr, 3),tcurr, 6);
378                         }
379                         else
380                         {
381                             tcurr = vsetq_lane_u16(vgetq_lane_u16(tcurr, 0),tcurr, 6);
382                             tcurr = vsetq_lane_u16(vgetq_lane_u16(tcurr, 1),tcurr, 7);
383                         }
384                         break;
385                     case 3:
386                         if (borderType == BORDER_MODE_CONSTANT)
387                         {
388                             tcurr = vsetq_lane_u16(borderValue, tcurr, 5);
389                             tcurr = vsetq_lane_u16(borderValue, tcurr, 6);
390                             tcurr = vsetq_lane_u16(borderValue, tcurr, 7);
391                         }
392                         else if (borderType == BORDER_MODE_REFLECT101)
393                         {
394                             tcurr = vsetq_lane_u16(vgetq_lane_u16(tcurr, 3),tcurr, 5);
395                             tcurr = vsetq_lane_u16(vgetq_lane_u16(tcurr, 4),tcurr, 6);
396                             tcurr = vsetq_lane_u16(vgetq_lane_u16(tcurr, 5),tcurr, 7);
397                         }
398                         else
399                         {
400                             tcurr = vsetq_lane_u16(vgetq_lane_u16(tcurr, 0),tcurr, 5);
401                             tcurr = vsetq_lane_u16(vgetq_lane_u16(tcurr, 1),tcurr, 6);
402                             tcurr = vsetq_lane_u16(vgetq_lane_u16(tcurr, 2),tcurr, 7);
403                         }
404                         break;
405                     case 4:
406                         if (borderType == BORDER_MODE_CONSTANT)
407                         {
408                             tcurr = vsetq_lane_u16(borderValue, tcurr, 4);
409                             tcurr = vsetq_lane_u16(borderValue, tcurr, 5);
410                             tcurr = vsetq_lane_u16(borderValue, tcurr, 6);
411                             tcurr = vsetq_lane_u16(borderValue, tcurr, 7);
412                         }
413                         else if (borderType != BORDER_MODE_REFLECT101)
414                         {
415                             tcurr = vsetq_lane_u16(vgetq_lane_u16(tcurr, 0),tcurr, 4);
416                             tcurr = vsetq_lane_u16(vgetq_lane_u16(tcurr, 1),tcurr, 5);
417                             tcurr = vsetq_lane_u16(vgetq_lane_u16(tcurr, 2),tcurr, 6);
418                             tcurr = vsetq_lane_u16(vgetq_lane_u16(tcurr, 3),tcurr, 7);
419                         }
420                         break;
421                     }
422                     continue;
423                 }
424 
425                 if(cn==2)
426                     t0 = vextq_u16(tprev, tcurr, 6);
427                 else if(cn==3)
428                     t0 = vextq_u16(tprev, tcurr, 5);
429                 else if(cn==4)
430                     t0 = vextq_u16(tprev, tcurr, 4);
431 
432                 t1 = tcurr;
433 
434                 if(cn==2)
435                     t2 = vextq_u16(tcurr, tnext, 2);
436                 else if(cn==3)
437                     t2 = vextq_u16(tcurr, tnext, 3);
438                 else if(cn==4)
439                     t2 = vextq_u16(tcurr, tnext, 4);
440 
441                 t0 = vqaddq_u16(t0, vqaddq_u16(t1, t2));
442 
443 #ifdef FLOAT_VARIANT_1_9
444                 uint32x4_t tres1 = vmovl_u16(vget_low_u16(t0));
445                 uint32x4_t tres2 = vmovl_u16(vget_high_u16(t0));
446                 float32x4_t vf1 = vmulq_f32(v1_9, vcvtq_f32_u32(tres1));
447                 float32x4_t vf2 = vmulq_f32(v1_9, vcvtq_f32_u32(tres2));
448                 tres1 = vcvtq_u32_f32(vaddq_f32(vf1, v0_5));
449                 tres2 = vcvtq_u32_f32(vaddq_f32(vf2, v0_5));
450                 t0 = vcombine_u16(vmovn_u32(tres1),vmovn_u32(tres2));
451                 vst1_u8(drow + x - 8, vmovn_u16(t0));
452 #else
453                 int16x8_t tt0 = vqrdmulhq_s16(vreinterpretq_s16_u16(t0), vScale);
454                 uint8x8_t it0 = vmovn_u16(vreinterpretq_u16_s16(tt0));
455                 vst1_u8(drow + x - 8, it0);
456 #endif
457             }
458 
459             x -= 8;
460             if(x == colsn){
461                 x -= cn;
462             }
463             s16 prevx[4], rowx[4], nextx[4];
464             for( s32 k = 0; k < cn; k++ )
465             {
466                 prevx[(k + x%cn)%cn] = srow2[x+k-cn] + srow1[x+k-cn] + srow0[x+k-cn];
467                 rowx[(k + x%cn)%cn] = srow2[x+k] + srow1[x+k] + srow0[x+k];
468             }
469             for( ; x < colsn; x++ )
470             {
471                 size_t xx = x%cn;
472                 if(x+cn >= colsn) {
473                     // make border
474                     if (borderType == BORDER_MODE_CONSTANT)
475                     {
476                         nextx[xx] = borderValue;
477                     } else if (borderType == BORDER_MODE_REFLECT101)
478                     {
479                         nextx[xx] = srow2[x-cn] + srow1[x-cn] + srow0[x-cn];
480                     } else {
481                         nextx[xx] = srow2[x] + srow1[x] + srow0[x];
482                     }
483                 } else {
484                     nextx[xx] = srow2[x+cn] + srow1[x+cn] + srow0[x+cn];
485                 }
486                 *(drow+x) = internal::saturate_cast<u8>((prevx[xx] + rowx[xx] + nextx[xx])*(1/9.));
487                 prevx[xx] = rowx[xx];
488                 rowx[xx] = nextx[xx];
489             }
490         }
491     }
492 #else
493     (void)srcBase;
494     (void)srcStride;
495     (void)dstBase;
496     (void)dstStride;
497     (void)borderValue;
498 #endif
499 }
500 
blur5x5(const Size2D & size,s32 cn,const u8 * srcBase,ptrdiff_t srcStride,u8 * dstBase,ptrdiff_t dstStride,BORDER_MODE borderType,u8 borderValue)501 void blur5x5(const Size2D &size, s32 cn,
502              const u8 * srcBase, ptrdiff_t srcStride,
503              u8 * dstBase, ptrdiff_t dstStride,
504              BORDER_MODE borderType, u8 borderValue)
505 {
506     internal::assertSupportedConfiguration(isBlurU8Supported(size, cn, borderType));
507 #ifdef CAROTENE_NEON
508 #define FLOAT_VARIANT_1_25
509 #ifdef FLOAT_VARIANT_1_25
510     float32x4_t v1_25 = vdupq_n_f32 (1.0f/25.0f);
511     float32x4_t v0_5 = vdupq_n_f32 (.5f);
512 #else
513     const int16x8_t vScale = vmovq_n_s16(1310);
514 #endif
515     size_t colsn = size.width*cn;
516 
517     std::vector<u8> _tmp;
518     u8 *tmp = 0;
519     if (borderType == BORDER_MODE_CONSTANT)
520     {
521         _tmp.assign(colsn + 2*cn, borderValue);
522         tmp = &_tmp[cn];
523     }
524 
525     uint16x8_t tprev = vdupq_n_u16(0x0);
526     uint16x8_t tcurr = tprev;
527     uint16x8_t tnext = tprev;
528     uint16x8_t t0, t1, t2, t3, t4;
529     for( size_t y = 0; y < size.height; y++ )
530     {
531         const u8 *srow0, *srow1;
532         const u8 *srow2 = internal::getRowPtr(srcBase, srcStride, y);
533         const u8 *srow3, *srow4;
534         u8 *drow = internal::getRowPtr(dstBase, dstStride, y);
535         if (borderType == BORDER_MODE_REFLECT101) {
536             srow0 = internal::getRowPtr(srcBase, srcStride, y > 1 ? y-2 : 2-y);
537             srow1 = internal::getRowPtr(srcBase, srcStride, y > 0 ? y-1 : 1);
538             srow3 = internal::getRowPtr(srcBase, srcStride, y < size.height-1 ? y+1 : size.height-2);
539             srow4 = internal::getRowPtr(srcBase, srcStride, y < size.height-2 ? y+2 : (size.height<<1)-4-y);
540         } else  if (borderType == BORDER_MODE_CONSTANT) {
541             srow0 = y > 1 ? internal::getRowPtr(srcBase, srcStride, y-2) : tmp;
542             srow1 = y > 0 ? internal::getRowPtr(srcBase, srcStride, y-1) : tmp;
543             srow3 =  y < size.height-1 ? internal::getRowPtr(srcBase, srcStride, y+1) : tmp;
544             srow4 =  y < size.height-2 ? internal::getRowPtr(srcBase, srcStride, y+2) : tmp;
545         } else  if (borderType == BORDER_MODE_REFLECT) {
546             srow0 = internal::getRowPtr(srcBase, srcStride, y > 1 ? y-2 : 1-y);
547             srow1 = internal::getRowPtr(srcBase, srcStride, y > 0 ? y-1 : 0);
548             srow3 = internal::getRowPtr(srcBase, srcStride, y < size.height-1 ? y+1 : size.height-1);
549             srow4 = internal::getRowPtr(srcBase, srcStride, y < size.height-2 ? y+2 : (size.height<<1)-3-y);
550         } else { // BORDER_MODE_REPLICATE
551             srow0 = internal::getRowPtr(srcBase, srcStride, y > 1 ? y-2 : 0);
552             srow1 = internal::getRowPtr(srcBase, srcStride, y > 0 ? y-1 : 0);
553             srow3 = internal::getRowPtr(srcBase, srcStride, y < size.height-1 ? y+1 : size.height-1);
554             srow4 = internal::getRowPtr(srcBase, srcStride, y < size.height-2 ? y+2 : size.height-1);
555         }
556 
557         // do vertical convolution
558         size_t x = 0;
559         const size_t bcols = y + 3 < size.height ? colsn : (colsn - 8);
560         for( ; x <= bcols; x += 8 )
561         {
562             internal::prefetch(srow0 + x);
563             internal::prefetch(srow1 + x);
564             internal::prefetch(srow2 + x);
565             internal::prefetch(srow3 + x);
566             internal::prefetch(srow4 + x);
567 
568             uint8x8_t x0 = vld1_u8(srow0 + x);
569             uint8x8_t x1 = vld1_u8(srow1 + x);
570             uint8x8_t x2 = vld1_u8(srow2 + x);
571             uint8x8_t x3 = vld1_u8(srow3 + x);
572             uint8x8_t x4 = vld1_u8(srow4 + x);
573 
574             tprev = tcurr;
575             tcurr = tnext;
576             tnext = vaddw_u8(vaddq_u16(vaddl_u8(x0, x1), vaddl_u8(x2, x3)), x4);
577 
578             if(!x) {
579                 tcurr = tnext;
580 
581                 if(borderType == BORDER_MODE_REFLECT101 && size.width < 3)
582                 {
583                     x = 8;
584                     break;
585                 }
586 
587                 // make border
588                 switch(cn)
589                 {
590                 case 1:
591                     if (borderType == BORDER_MODE_CONSTANT)
592                     {
593                         tcurr = vsetq_lane_u16(borderValue, tcurr, 6);
594                         tcurr = vsetq_lane_u16(borderValue, tcurr, 7);
595                     }
596                     else if (borderType == BORDER_MODE_REFLECT101)
597                     {
598                         tcurr = vsetq_lane_u16(vgetq_lane_u16(tcurr, 2),tcurr, 6);
599                         tcurr = vsetq_lane_u16(vgetq_lane_u16(tcurr, 1),tcurr, 7);
600                     }
601                     else if (borderType == BORDER_MODE_REFLECT)
602                     {
603                         tcurr = vsetq_lane_u16(vgetq_lane_u16(tcurr, 1),tcurr, 6);
604                         tcurr = vsetq_lane_u16(vgetq_lane_u16(tcurr, 0),tcurr, 7);
605                     }
606                     else
607                     {
608                         tcurr = vsetq_lane_u16(vgetq_lane_u16(tcurr, 0),tcurr, 6);
609                         tcurr = vsetq_lane_u16(vgetq_lane_u16(tcurr, 0),tcurr, 7);
610                     }
611                     break;
612                 case 2:
613                     if (borderType == BORDER_MODE_CONSTANT)
614                     {
615                         tcurr = vsetq_lane_u16(borderValue, tcurr, 4);
616                         tcurr = vsetq_lane_u16(borderValue, tcurr, 5);
617                         tcurr = vsetq_lane_u16(borderValue, tcurr, 6);
618                         tcurr = vsetq_lane_u16(borderValue, tcurr, 7);
619                     }
620                     else if (borderType == BORDER_MODE_REFLECT101)
621                     {
622                         tcurr = vsetq_lane_u16(vgetq_lane_u16(tcurr, 2),tcurr, 6);
623                         tcurr = vsetq_lane_u16(vgetq_lane_u16(tcurr, 3),tcurr, 7);
624                     }
625                     else if (borderType == BORDER_MODE_REFLECT)
626                     {
627                         tcurr = vsetq_lane_u16(vgetq_lane_u16(tcurr, 2),tcurr, 4);
628                         tcurr = vsetq_lane_u16(vgetq_lane_u16(tcurr, 3),tcurr, 5);
629                         tcurr = vsetq_lane_u16(vgetq_lane_u16(tcurr, 0),tcurr, 6);
630                         tcurr = vsetq_lane_u16(vgetq_lane_u16(tcurr, 1),tcurr, 7);
631                     }
632                     else
633                     {
634                         tcurr = vsetq_lane_u16(vgetq_lane_u16(tcurr, 0),tcurr, 4);
635                         tcurr = vsetq_lane_u16(vgetq_lane_u16(tcurr, 1),tcurr, 5);
636                         tcurr = vsetq_lane_u16(vgetq_lane_u16(tcurr, 0),tcurr, 6);
637                         tcurr = vsetq_lane_u16(vgetq_lane_u16(tcurr, 1),tcurr, 7);
638                     }
639                     break;
640                 case 3:
641                     if (borderType == BORDER_MODE_CONSTANT)
642                     {
643                         tcurr = vsetq_lane_u16(borderValue, tcurr, 2);
644                         tcurr = vsetq_lane_u16(borderValue, tcurr, 3);
645                         tcurr = vsetq_lane_u16(borderValue, tcurr, 4);
646                         tcurr = vsetq_lane_u16(borderValue, tcurr, 5);
647                         tcurr = vsetq_lane_u16(borderValue, tcurr, 6);
648                         tcurr = vsetq_lane_u16(borderValue, tcurr, 7);
649                     }
650                     else if (borderType == BORDER_MODE_REFLECT101)
651                     {
652                         tprev = vsetq_lane_u16(vgetq_lane_u16(tcurr, 6),tcurr, 2);
653                         tprev = vsetq_lane_u16(vgetq_lane_u16(tcurr, 7),tprev, 3);
654                         tprev = vsetq_lane_u16(vgetq_lane_u16(tcurr, 3),tprev, 5);
655                         tprev = vsetq_lane_u16(vgetq_lane_u16(tcurr, 4),tprev, 6);
656                         tprev = vsetq_lane_u16(vgetq_lane_u16(tcurr, 5),tprev, 7);
657                         s16 lane8 = srow4[8] + srow3[8] + srow2[8] + srow1[8] + srow0[8];
658                         tcurr = vsetq_lane_u16(lane8,tprev, 4);
659                     }
660                     else if (borderType == BORDER_MODE_REFLECT)
661                     {
662                         tprev = vsetq_lane_u16(vgetq_lane_u16(tcurr, 3),tcurr, 2);
663                         tprev = vsetq_lane_u16(vgetq_lane_u16(tcurr, 4),tprev, 3);
664                         tprev = vsetq_lane_u16(vgetq_lane_u16(tcurr, 5),tprev, 4);
665                         tprev = vsetq_lane_u16(vgetq_lane_u16(tcurr, 0),tprev, 5);
666                         tprev = vsetq_lane_u16(vgetq_lane_u16(tcurr, 1),tprev, 6);
667                         tcurr = vsetq_lane_u16(vgetq_lane_u16(tcurr, 2),tprev, 7);
668                     }
669                     else
670                     {
671                         tprev = vsetq_lane_u16(vgetq_lane_u16(tcurr, 0),tcurr, 2);
672                         tprev = vsetq_lane_u16(vgetq_lane_u16(tcurr, 1),tprev, 3);
673                         tprev = vsetq_lane_u16(vgetq_lane_u16(tcurr, 2),tprev, 4);
674                         tprev = vsetq_lane_u16(vgetq_lane_u16(tcurr, 0),tprev, 5);
675                         tprev = vsetq_lane_u16(vgetq_lane_u16(tcurr, 1),tprev, 6);
676                         tcurr = vsetq_lane_u16(vgetq_lane_u16(tcurr, 2),tprev, 7);
677                     }
678                     break;
679                 case 4:
680                     if (borderType == BORDER_MODE_CONSTANT)
681                     {
682                         tcurr = vsetq_lane_u16(borderValue, tcurr, 0);
683                         tcurr = vsetq_lane_u16(borderValue, tcurr, 1);
684                         tcurr = vsetq_lane_u16(borderValue, tcurr, 2);
685                         tcurr = vsetq_lane_u16(borderValue, tcurr, 3);
686                         tcurr = vsetq_lane_u16(borderValue, tcurr, 4);
687                         tcurr = vsetq_lane_u16(borderValue, tcurr, 5);
688                         tcurr = vsetq_lane_u16(borderValue, tcurr, 6);
689                         tcurr = vsetq_lane_u16(borderValue, tcurr, 7);
690                     }
691                     else if (borderType == BORDER_MODE_REFLECT101)
692                     {
693                         s16 lane8  = srow4[ 8] + srow3[ 8] + srow2[ 8] + srow1[ 8] + srow0[ 8];
694                         s16 lane9  = srow4[ 9] + srow3[ 9] + srow2[ 9] + srow1[ 9] + srow0[ 9];
695                         s16 lane10 = srow4[10] + srow3[10] + srow2[10] + srow1[10] + srow0[10];
696                         s16 lane11 = srow4[11] + srow3[11] + srow2[11] + srow1[11] + srow0[11];
697                         tprev = vsetq_lane_u16( lane8,tcurr, 0);
698                         tprev = vsetq_lane_u16( lane9,tprev, 1);
699                         tprev = vsetq_lane_u16(lane10,tprev, 2);
700                         tcurr = vsetq_lane_u16(lane11,tprev, 3);
701                     }
702                     else if (borderType == BORDER_MODE_REFLECT)
703                     {
704                         tcurr = vcombine_u16(vget_high_u16(tcurr),vget_low_u16(tcurr));//swap 64-bit parts
705                     }
706                     else
707                     {
708                         tcurr = vcombine_u16(vget_low_u16(tcurr),vget_low_u16(tcurr));//double 64-bit part
709                     }
710                     break;
711                 }
712                 continue;
713             }
714             switch(cn)
715             {
716             case 1:
717                 t0 = vextq_u16(tprev, tcurr, 6);
718                 t1 = vextq_u16(tprev, tcurr, 7);
719                 t2 = tcurr;
720                 t3 = vextq_u16(tcurr, tnext, 1);
721                 t4 = vextq_u16(tcurr, tnext, 2);
722                 break;
723             case 2:
724                 t0 = vextq_u16(tprev, tcurr, 4);
725                 t1 = vextq_u16(tprev, tcurr, 6);
726                 t2 = tcurr;
727                 t3 = vextq_u16(tcurr, tnext, 2);
728                 t4 = vextq_u16(tcurr, tnext, 4);
729                 break;
730             case 3:
731                 t0 = vextq_u16(tprev, tcurr, 2);
732                 t1 = vextq_u16(tprev, tcurr, 5);
733                 t2 = tcurr;
734                 t3 = vextq_u16(tcurr, tnext, 3);
735                 t4 = vextq_u16(tcurr, tnext, 6);
736                 break;
737             case 4:
738                 t0 = tprev;
739                 t1 = vextq_u16(tprev, tcurr, 4);
740                 t2 = tcurr;
741                 t3 = vextq_u16(tcurr, tnext, 4);
742                 t4 = tnext;
743                 break;
744             default:
745                 internal::assertSupportedConfiguration(false);//Unsupported channels number
746                 return;
747             }
748             t0 = vqaddq_u16(vqaddq_u16(vqaddq_u16(t0, t1), vqaddq_u16(t2, t3)), t4);
749 
750 #ifdef FLOAT_VARIANT_1_25
751             uint32x4_t tres1 = vmovl_u16(vget_low_u16(t0));
752             uint32x4_t tres2 = vmovl_u16(vget_high_u16(t0));
753             float32x4_t vf1 = vmulq_f32(v1_25, vcvtq_f32_u32(tres1));
754             float32x4_t vf2 = vmulq_f32(v1_25, vcvtq_f32_u32(tres2));
755             tres1 = vcvtq_u32_f32(vaddq_f32(vf1, v0_5));
756             tres2 = vcvtq_u32_f32(vaddq_f32(vf2, v0_5));
757             t0 = vcombine_u16(vmovn_u32(tres1),vmovn_u32(tres2));
758             vst1_u8(drow + x - 8, vmovn_u16(t0));
759 #else
760             int16x8_t tt0 = vqrdmulhq_s16(vreinterpretq_s16_u16(t0), vScale);
761             uint8x8_t it0 = vmovn_u16(vreinterpretq_u16_s16(tt0));
762             vst1_u8(drow + x - 8, it0);
763 #endif
764         }
765 
766         x -= 8;
767         if(x == colsn){
768             x -= cn;
769         }
770         s16 pprevx[4], prevx[4], rowx[4], nextx[4], nnextx[4];
771         ptrdiff_t px = x / cn;
772         for( s32 k = 0; k < cn; k++ )
773         {
774             ptrdiff_t ploc;
775             ploc = internal::borderInterpolate(px-2, size.width, borderType);
776             pprevx[k] = ploc < 0 ? 5*borderValue :
777                                    srow4[ploc*cn+k] + srow3[ploc*cn+k] + srow2[ploc*cn+k] + srow1[ploc*cn+k] + srow0[ploc*cn+k];
778 
779             ploc = internal::borderInterpolate(px-1, size.width, borderType);
780             prevx[k]  = ploc < 0 ? 5*borderValue :
781                                    srow4[ploc*cn+k] + srow3[ploc*cn+k] + srow2[ploc*cn+k] + srow1[ploc*cn+k] + srow0[ploc*cn+k];
782 
783             rowx[k]   = srow4[px*cn+k] + srow3[px*cn+k] + srow2[px*cn+k] + srow1[px*cn+k] + srow0[px*cn+k];
784 
785             ploc = internal::borderInterpolate(px+1, size.width, borderType);
786             nextx[k]  = ploc < 0 ? 5*borderValue :
787                                    srow4[ploc*cn+k] + srow3[ploc*cn+k] + srow2[ploc*cn+k] + srow1[ploc*cn+k] + srow0[ploc*cn+k];
788         }
789         x = px*cn;
790         for( ; x < colsn; x+=cn, px++ )
791         {
792             for( s32 k = 0; k < cn; k++ )
793             {
794                 ptrdiff_t ploc = internal::borderInterpolate(px+2, size.width, borderType);
795                 nnextx[k] = ploc < 0 ? 5*borderValue :
796                                        srow4[ploc*cn+k] + srow3[ploc*cn+k] + srow2[ploc*cn+k] + srow1[ploc*cn+k] + srow0[ploc*cn+k];
797                 *(drow+x+k) = internal::saturate_cast<u8>((pprevx[k] + prevx[k] + rowx[k] + nextx[k] +nnextx[k])*(1/25.));
798                 pprevx[k] = prevx[k];
799                 prevx[k]  = rowx[k];
800                 rowx[k]   = nextx[k];
801                 nextx[k]  = nnextx[k];
802             }
803         }
804     }
805 #else
806     (void)srcBase;
807     (void)srcStride;
808     (void)dstBase;
809     (void)dstStride;
810     (void)borderValue;
811 #endif
812 }
813 
isBlurF32Supported(const Size2D & size,s32 cn,BORDER_MODE border)814 bool isBlurF32Supported(const Size2D &size, s32 cn, BORDER_MODE border)
815 {
816     return isSupportedConfiguration() &&
817            cn > 0 && cn <= 4 &&
818            size.width*cn >= 4 && size.height >= 2 &&
819            (border == BORDER_MODE_CONSTANT ||
820             border == BORDER_MODE_REFLECT101 ||
821             border == BORDER_MODE_REFLECT ||
822             border == BORDER_MODE_REPLICATE ||
823             border == BORDER_MODE_WRAP);
824 }
825 
blur3x3(const Size2D & size,s32 cn,const f32 * srcBase,ptrdiff_t srcStride,f32 * dstBase,ptrdiff_t dstStride,BORDER_MODE borderType,f32 borderValue,Margin borderMargin)826 void blur3x3(const Size2D &size, s32 cn,
827              const f32 * srcBase, ptrdiff_t srcStride,
828              f32 * dstBase, ptrdiff_t dstStride,
829              BORDER_MODE borderType, f32 borderValue, Margin borderMargin)
830 {
831     internal::assertSupportedConfiguration(isBlurF32Supported(size, cn, borderType));
832 #ifdef CAROTENE_NEON
833     size_t colsn = size.width * cn;
834 
835     std::vector<f32> _tmp;
836     f32 *tmp = 0;
837     if (borderType == BORDER_MODE_CONSTANT)
838     {
839         _tmp.assign(colsn + 2*cn, borderValue);
840         tmp = &_tmp[cn];
841     }
842 
843     ptrdiff_t idx_l = internal::borderInterpolate(-1, size.width, borderType, borderMargin.left, borderMargin.right) * cn;
844     ptrdiff_t idx_r = internal::borderInterpolate(size.width, size.width, borderType, borderMargin.left, borderMargin.right) * cn;
845 
846     //2-line buffer
847     std::vector<f32> _buf(4*(cn * (size.width + 2) + 32 / sizeof(f32)));
848     f32* lanea = internal::alignPtr(&_buf[cn], 32);
849     f32* laneA = internal::alignPtr(lanea + cn * (size.width + 2), 32);
850 
851     f32* laneb = internal::alignPtr(laneA + cn * (size.width + 2), 32);
852     f32* laneB = internal::alignPtr(laneb + cn * (size.width + 2), 32);
853 
854     if (borderType == BORDER_MODE_CONSTANT)
855         for (s32 k = 0; k < cn; ++k)
856         {
857             lanea[-cn+k] = borderValue;
858             lanea[colsn+k] = borderValue;
859             laneA[-cn+k] = borderValue;
860             laneA[colsn+k] = borderValue;
861             laneb[-cn+k] = borderValue;
862             laneb[colsn+k] = borderValue;
863             laneB[-cn+k] = borderValue;
864             laneB[colsn+k] = borderValue;
865         }
866 
867     size_t i = 0;
868     f32* dsta = internal::getRowPtr(dstBase, dstStride, 0);
869     for (; i < size.height-1; i+=2)
870     {
871         //vertical convolution
872         ptrdiff_t idx_rm1 = internal::borderInterpolate(i - 1, size.height, borderType, borderMargin.top, borderMargin.bottom);
873         ptrdiff_t idx_rp2 = internal::borderInterpolate(i + 2, size.height, borderType, borderMargin.top, borderMargin.bottom);
874 
875         const f32* ln0 = idx_rm1 >= -(ptrdiff_t)borderMargin.top ? internal::getRowPtr(srcBase, srcStride, idx_rm1) : tmp;
876         const f32* ln1 = internal::getRowPtr(srcBase, srcStride, i);
877         const f32* ln2 = internal::getRowPtr(srcBase, srcStride, i + 1);
878         const f32* ln3 = idx_rp2 >= -(ptrdiff_t)borderMargin.top ? internal::getRowPtr(srcBase, srcStride, idx_rp2) : tmp;
879 
880         size_t x = 0;
881         for (; x <= colsn - 4; x += 4)
882         {
883             internal::prefetch(ln1 + x);
884             internal::prefetch(ln2 + x);
885             internal::prefetch(ln0 + x);
886             internal::prefetch(ln3 + x);
887 box3x3f32_vert:
888             float32x4_t v1 = vld1q_f32(ln1 + x);
889             float32x4_t v2 = vld1q_f32(ln2 + x);
890             float32x4_t v0 = vld1q_f32(ln0 + x);
891             float32x4_t v3 = vld1q_f32(ln3 + x);
892 
893             float32x4_t v = vaddq_f32(v1, v2);
894             float32x4_t w0 = vaddq_f32(v, v0);
895             float32x4_t w1 = vaddq_f32(v, v3);
896 
897             vst1q_f32(lanea + x, w0);
898             vst1q_f32(laneb + x, w1);
899         }
900         if(x < colsn)
901         {
902             x = colsn-4;
903             goto box3x3f32_vert;
904         }
905 
906         //left&right borders
907         if (borderType != BORDER_MODE_CONSTANT)
908             for (s32 k = 0; k < cn; ++k)
909             {
910                 lanea[-cn+k] = lanea[idx_l + k];
911                 lanea[colsn+k] = lanea[idx_r + k];
912                 laneb[-cn+k] = laneb[idx_l + k];
913                 laneb[colsn+k] = laneb[idx_r + k];
914             }
915 
916         //horizontal convolution (2 lines from previous iteration)
917         if (i > 0)
918         {
919             f32* dstb = internal::getRowPtr(dstBase, dstStride, i-1);
920             x = 0;
921             for (; x <= colsn - 4; x += 4)
922             {
923                 internal::prefetch(laneA + x + cn);
924                 internal::prefetch(laneB + x + cn);
925 box3x3f32_horiz:
926                 float32x4_t lane0a = vld1q_f32(laneA + x - cn);
927                 float32x4_t lane2a = vld1q_f32(laneA + x + cn);
928                 float32x4_t lane1a = vld1q_f32(laneA + x);
929 
930                 float32x4_t lane0b = vld1q_f32(laneB + x - cn);
931                 float32x4_t lane2b = vld1q_f32(laneB + x + cn);
932                 float32x4_t lane1b = vld1q_f32(laneB + x);
933 
934                 float32x4_t va = vaddq_f32(lane0a, lane2a);
935                 float32x4_t vb = vaddq_f32(lane0b, lane2b);
936                 float32x4_t wa = vaddq_f32(va, lane1a);
937                 float32x4_t wb = vaddq_f32(vb, lane1b);
938 
939                 vst1q_f32(dsta + x, wa);
940                 vst1q_f32(dstb + x, wb);
941             }
942             if(x < colsn)
943             {
944                 x = colsn-4;
945                 goto box3x3f32_horiz;
946             }
947             dsta = internal::getRowPtr(dstBase, dstStride, i);
948         }
949 
950         std::swap(lanea, laneA);
951         std::swap(laneb, laneB);
952     }
953 
954     //last line
955     if(i < size.height)
956     {
957         //vertical convolution
958         ptrdiff_t idx_rm1 = internal::borderInterpolate(i - 1, size.height, borderType, borderMargin.top, borderMargin.bottom);
959         ptrdiff_t idx_rp1 = internal::borderInterpolate(i + 1, size.height, borderType, borderMargin.top, borderMargin.bottom);
960 
961         const f32* ln0 = idx_rm1 >= -(ptrdiff_t)borderMargin.top ? internal::getRowPtr(srcBase, srcStride, idx_rm1) : tmp;
962         const f32* ln1 = internal::getRowPtr(srcBase, srcStride, i);
963         const f32* ln2 = idx_rp1 >= -(ptrdiff_t)borderMargin.top ? internal::getRowPtr(srcBase, srcStride, idx_rp1) : tmp;
964 
965         size_t x = 0;
966         for (; x <= colsn - 4; x += 4)
967         {
968             internal::prefetch(ln0 + x);
969             internal::prefetch(ln1 + x);
970             internal::prefetch(ln2 + x);
971 box3x3f32_vert_ll:
972             float32x4_t v0 = vld1q_f32(ln0+x);
973             float32x4_t v1 = vld1q_f32(ln1+x);
974             float32x4_t v2 = vld1q_f32(ln2+x);
975 
976             float32x4_t v = vaddq_f32(v0, v1);
977             float32x4_t w = vaddq_f32(v, v2);
978 
979             vst1q_f32(lanea + x, w);
980         }
981         if(x < colsn)
982         {
983             x = colsn-4;
984             goto box3x3f32_vert_ll;
985         }
986 
987         //left&right borders
988         if (borderType != BORDER_MODE_CONSTANT)
989             for (s32 k = 0; k < cn; ++k)
990             {
991                 lanea[-cn+k] = lanea[idx_l + k];
992                 lanea[colsn+k] = lanea[idx_r + k];
993             }
994 
995         //horizontal convolution (last 3 lines)
996         x = 0;
997         f32* dstb = internal::getRowPtr(dstBase, dstStride, i-1);
998         f32* dstc = internal::getRowPtr(dstBase, dstStride, i);
999         for (; x <= colsn - 4; x += 4)
1000         {
1001             internal::prefetch(laneA + x + cn);
1002             internal::prefetch(laneB + x + cn);
1003             internal::prefetch(lanea + x + cn);
1004 box3x3f32_horiz_ll:
1005             float32x4_t lane0a = vld1q_f32(laneA + x - cn);
1006             float32x4_t lane2a = vld1q_f32(laneA + x + cn);
1007             float32x4_t lane1a = vld1q_f32(laneA + x);
1008 
1009             float32x4_t lane0b = vld1q_f32(laneB + x - cn);
1010             float32x4_t lane2b = vld1q_f32(laneB + x + cn);
1011             float32x4_t lane1b = vld1q_f32(laneB + x);
1012 
1013             float32x4_t lane0c = vld1q_f32(lanea + x - cn);
1014             float32x4_t lane2c = vld1q_f32(lanea + x + cn);
1015             float32x4_t lane1c = vld1q_f32(lanea + x);
1016 
1017             float32x4_t va = vaddq_f32(lane0a, lane2a);
1018             float32x4_t vb = vaddq_f32(lane0b, lane2b);
1019             float32x4_t vc = vaddq_f32(lane0c, lane2c);
1020             float32x4_t wa = vaddq_f32(va, lane1a);
1021             float32x4_t wb = vaddq_f32(vb, lane1b);
1022             float32x4_t wc = vaddq_f32(vc, lane1c);
1023 
1024             vst1q_f32(dsta + x, wa);
1025             vst1q_f32(dstb + x, wb);
1026             vst1q_f32(dstc + x, wc);
1027         }
1028         if(x < colsn)
1029         {
1030             x = colsn-4;
1031             goto box3x3f32_horiz_ll;
1032         }
1033     }
1034     else
1035     {
1036         //horizontal convolution (last 2 lines)
1037         f32* dstb = internal::getRowPtr(dstBase, dstStride, i-1);
1038         size_t x = 0;
1039         for (; x <= colsn - 4; x += 4)
1040         {
1041             internal::prefetch(laneA + x + cn);
1042             internal::prefetch(laneB + x + cn);
1043 box3x3f32_horiz_last2:
1044             float32x4_t lane0a = vld1q_f32(laneA + x - cn);
1045             float32x4_t lane2a = vld1q_f32(laneA + x + cn);
1046             float32x4_t lane1a = vld1q_f32(laneA + x);
1047 
1048             float32x4_t lane0b = vld1q_f32(laneB + x - cn);
1049             float32x4_t lane2b = vld1q_f32(laneB + x + cn);
1050             float32x4_t lane1b = vld1q_f32(laneB + x);
1051 
1052             float32x4_t va = vaddq_f32(lane0a, lane2a);
1053             float32x4_t vb = vaddq_f32(lane0b, lane2b);
1054             float32x4_t wa = vaddq_f32(va, lane1a);
1055             float32x4_t wb = vaddq_f32(vb, lane1b);
1056 
1057             vst1q_f32(dsta + x, wa);
1058             vst1q_f32(dstb + x, wb);
1059         }
1060         if(x < colsn)
1061         {
1062             x = colsn-4;
1063             goto box3x3f32_horiz_last2;
1064         }
1065     }
1066 #else
1067     (void)srcBase;
1068     (void)srcStride;
1069     (void)dstBase;
1070     (void)dstStride;
1071     (void)borderValue;
1072     (void)borderMargin;
1073 #endif
1074 }
1075 
isBlurS32Supported(const Size2D & size,s32 cn,BORDER_MODE border)1076 bool isBlurS32Supported(const Size2D &size, s32 cn, BORDER_MODE border)
1077 {
1078     return isSupportedConfiguration() &&
1079            cn > 0 && cn <= 4 &&
1080            size.width*cn >= 4 && size.height >= 2 &&
1081            (border == BORDER_MODE_CONSTANT ||
1082             border == BORDER_MODE_REFLECT101 ||
1083             border == BORDER_MODE_REFLECT ||
1084             border == BORDER_MODE_REPLICATE ||
1085             border == BORDER_MODE_WRAP);
1086 }
1087 
blur3x3(const Size2D & size,s32 cn,const s32 * srcBase,ptrdiff_t srcStride,s32 * dstBase,ptrdiff_t dstStride,BORDER_MODE borderType,s32 borderValue,Margin borderMargin)1088 void blur3x3(const Size2D &size, s32 cn,
1089              const s32 * srcBase, ptrdiff_t srcStride,
1090              s32 * dstBase, ptrdiff_t dstStride,
1091              BORDER_MODE borderType, s32 borderValue, Margin borderMargin)
1092 {
1093     internal::assertSupportedConfiguration(isBlurS32Supported(size, cn, borderType));
1094 #ifdef CAROTENE_NEON
1095     size_t colsn = size.width * cn;
1096 
1097     std::vector<s32> _tmp;
1098     s32 *tmp = 0;
1099     if (borderType == BORDER_MODE_CONSTANT)
1100     {
1101         _tmp.assign(colsn + 2*cn, borderValue);
1102         tmp = &_tmp[cn];
1103     }
1104 
1105     ptrdiff_t idx_l = internal::borderInterpolate(-1, size.width, borderType, borderMargin.left, borderMargin.right) * cn;
1106     ptrdiff_t idx_r = internal::borderInterpolate(size.width, size.width, borderType, borderMargin.left, borderMargin.right) * cn;
1107 
1108     //2-line buffer
1109     std::vector<s32> _buf(4*(cn * (size.width + 2) + 32 / sizeof(s32)));
1110     s32* lanea = internal::alignPtr(&_buf[cn], 32);
1111     s32* laneA = internal::alignPtr(lanea + cn * (size.width + 2), 32);
1112 
1113     s32* laneb = internal::alignPtr(laneA + cn * (size.width + 2), 32);
1114     s32* laneB = internal::alignPtr(laneb + cn * (size.width + 2), 32);
1115 
1116     if (borderType == BORDER_MODE_CONSTANT)
1117         for (s32 k = 0; k < cn; ++k)
1118         {
1119             lanea[-cn+k] = borderValue;
1120             lanea[colsn+k] = borderValue;
1121             laneA[-cn+k] = borderValue;
1122             laneA[colsn+k] = borderValue;
1123             laneb[-cn+k] = borderValue;
1124             laneb[colsn+k] = borderValue;
1125             laneB[-cn+k] = borderValue;
1126             laneB[colsn+k] = borderValue;
1127         }
1128 
1129     size_t i = 0;
1130     s32* dsta = internal::getRowPtr(dstBase, dstStride, 0);
1131     for (; i < size.height-1; i+=2)
1132     {
1133         //vertical convolution
1134         ptrdiff_t idx_rm1 = internal::borderInterpolate(i - 1, size.height, borderType, borderMargin.top, borderMargin.bottom);
1135         ptrdiff_t idx_rp2 = internal::borderInterpolate(i + 2, size.height, borderType, borderMargin.top, borderMargin.bottom);
1136 
1137         const s32* ln0 = idx_rm1 >= -(ptrdiff_t)borderMargin.top ? internal::getRowPtr(srcBase, srcStride, idx_rm1) : tmp;
1138         const s32* ln1 = internal::getRowPtr(srcBase, srcStride, i);
1139         const s32* ln2 = internal::getRowPtr(srcBase, srcStride, i + 1);
1140         const s32* ln3 = idx_rp2 >= -(ptrdiff_t)borderMargin.top ? internal::getRowPtr(srcBase, srcStride, idx_rp2) : tmp;
1141 
1142         size_t x = 0;
1143         for (; x <= colsn - 4; x += 4)
1144         {
1145             internal::prefetch(ln1 + x);
1146             internal::prefetch(ln2 + x);
1147             internal::prefetch(ln0 + x);
1148             internal::prefetch(ln3 + x);
1149 box3x3s32_vert:
1150             int32x4_t v1 = vld1q_s32(ln1 + x);
1151             int32x4_t v2 = vld1q_s32(ln2 + x);
1152             int32x4_t v0 = vld1q_s32(ln0 + x);
1153             int32x4_t v3 = vld1q_s32(ln3 + x);
1154 
1155             int32x4_t v = vaddq_s32(v1, v2);
1156             int32x4_t w0 = vaddq_s32(v, v0);
1157             int32x4_t w1 = vaddq_s32(v, v3);
1158 
1159             vst1q_s32(lanea + x, w0);
1160             vst1q_s32(laneb + x, w1);
1161         }
1162         if(x < colsn)
1163         {
1164             x = colsn-4;
1165             goto box3x3s32_vert;
1166         }
1167 
1168         //left&right borders
1169         if (borderType != BORDER_MODE_CONSTANT)
1170             for (s32 k = 0; k < cn; ++k)
1171             {
1172                 lanea[-cn+k] = lanea[idx_l + k];
1173                 lanea[colsn+k] = lanea[idx_r + k];
1174                 laneb[-cn+k] = laneb[idx_l + k];
1175                 laneb[colsn+k] = laneb[idx_r + k];
1176             }
1177 
1178         //horizontal convolution (2 lines from previous iteration)
1179         if (i > 0)
1180         {
1181             s32* dstb = internal::getRowPtr(dstBase, dstStride, i-1);
1182             x = 0;
1183             for (; x <= colsn - 4; x += 4)
1184             {
1185                 internal::prefetch(laneA + x + cn);
1186                 internal::prefetch(laneB + x + cn);
1187 box3x3s32_horiz:
1188                 int32x4_t lane0a = vld1q_s32(laneA + x - cn);
1189                 int32x4_t lane2a = vld1q_s32(laneA + x + cn);
1190                 int32x4_t lane1a = vld1q_s32(laneA + x);
1191 
1192                 int32x4_t lane0b = vld1q_s32(laneB + x - cn);
1193                 int32x4_t lane2b = vld1q_s32(laneB + x + cn);
1194                 int32x4_t lane1b = vld1q_s32(laneB + x);
1195 
1196                 int32x4_t va = vaddq_s32(lane0a, lane2a);
1197                 int32x4_t vb = vaddq_s32(lane0b, lane2b);
1198                 int32x4_t wa = vaddq_s32(va, lane1a);
1199                 int32x4_t wb = vaddq_s32(vb, lane1b);
1200 
1201                 vst1q_s32(dsta + x, wa);
1202                 vst1q_s32(dstb + x, wb);
1203             }
1204             if(x < colsn)
1205             {
1206                 x = colsn-4;
1207                 goto box3x3s32_horiz;
1208             }
1209             dsta = internal::getRowPtr(dstBase, dstStride, i);
1210         }
1211 
1212         std::swap(lanea, laneA);
1213         std::swap(laneb, laneB);
1214     }
1215     //last line
1216     if(i < size.height)
1217     {
1218         //vertical convolution
1219         ptrdiff_t idx_rm1 = internal::borderInterpolate(i - 1, size.height, borderType, borderMargin.top, borderMargin.bottom);
1220         ptrdiff_t idx_rp1 = internal::borderInterpolate(i + 1, size.height, borderType, borderMargin.top, borderMargin.bottom);
1221 
1222         const s32* ln0 = idx_rm1 >= -(ptrdiff_t)borderMargin.top ? internal::getRowPtr(srcBase, srcStride, idx_rm1) : tmp;
1223         const s32* ln1 = internal::getRowPtr(srcBase, srcStride, i);
1224         const s32* ln2 = idx_rp1 >= -(ptrdiff_t)borderMargin.top ? internal::getRowPtr(srcBase, srcStride, idx_rp1) : tmp;
1225 
1226         size_t x = 0;
1227         for (; x <= colsn - 4; x += 4)
1228         {
1229             internal::prefetch(ln0 + x);
1230             internal::prefetch(ln1 + x);
1231             internal::prefetch(ln2 + x);
1232 box3x3s32_vert_ll:
1233             int32x4_t v0 = vld1q_s32(ln0+x);
1234             int32x4_t v1 = vld1q_s32(ln1+x);
1235             int32x4_t v2 = vld1q_s32(ln2+x);
1236 
1237             int32x4_t v = vaddq_s32(v0, v1);
1238             int32x4_t w = vaddq_s32(v, v2);
1239 
1240             vst1q_s32(lanea + x, w);
1241         }
1242         if(x < colsn)
1243         {
1244             x = colsn-4;
1245             goto box3x3s32_vert_ll;
1246         }
1247 
1248         //left&right borders
1249         if (borderType != BORDER_MODE_CONSTANT)
1250             for (s32 k = 0; k < cn; ++k)
1251             {
1252                 lanea[-cn+k] = lanea[idx_l + k];
1253                 lanea[colsn+k] = lanea[idx_r + k];
1254             }
1255 
1256         //horizontal convolution (last 3 lines)
1257         x = 0;
1258         s32* dstb = internal::getRowPtr(dstBase, dstStride, i-1);
1259         s32* dstc = internal::getRowPtr(dstBase, dstStride, i);
1260         for (; x <= colsn - 4; x += 4)
1261         {
1262             internal::prefetch(laneA + x + cn);
1263             internal::prefetch(laneB + x + cn);
1264             internal::prefetch(lanea + x + cn);
1265 box3x3s32_horiz_ll:
1266             int32x4_t lane0a = vld1q_s32(laneA + x - cn);
1267             int32x4_t lane2a = vld1q_s32(laneA + x + cn);
1268             int32x4_t lane1a = vld1q_s32(laneA + x);
1269 
1270             int32x4_t lane0b = vld1q_s32(laneB + x - cn);
1271             int32x4_t lane2b = vld1q_s32(laneB + x + cn);
1272             int32x4_t lane1b = vld1q_s32(laneB + x);
1273 
1274             int32x4_t lane0c = vld1q_s32(lanea + x - cn);
1275             int32x4_t lane2c = vld1q_s32(lanea + x + cn);
1276             int32x4_t lane1c = vld1q_s32(lanea + x);
1277 
1278             int32x4_t va = vaddq_s32(lane0a, lane2a);
1279             int32x4_t vb = vaddq_s32(lane0b, lane2b);
1280             int32x4_t vc = vaddq_s32(lane0c, lane2c);
1281             int32x4_t wa = vaddq_s32(va, lane1a);
1282             int32x4_t wb = vaddq_s32(vb, lane1b);
1283             int32x4_t wc = vaddq_s32(vc, lane1c);
1284 
1285             vst1q_s32(dsta + x, wa);
1286             vst1q_s32(dstb + x, wb);
1287             vst1q_s32(dstc + x, wc);
1288         }
1289         if(x < colsn)
1290         {
1291             x = colsn-4;
1292             goto box3x3s32_horiz_ll;
1293         }
1294     }
1295     else
1296     {
1297         //horizontal convolution (last 2 lines)
1298         s32* dstb = internal::getRowPtr(dstBase, dstStride, i-1);
1299         size_t x = 0;
1300         for (; x <= colsn - 4; x += 4)
1301         {
1302             internal::prefetch(laneA + x + cn);
1303             internal::prefetch(laneB + x + cn);
1304 box3x3s32_horiz_last2:
1305             int32x4_t lane0a = vld1q_s32(laneA + x - cn);
1306             int32x4_t lane2a = vld1q_s32(laneA + x + cn);
1307             int32x4_t lane1a = vld1q_s32(laneA + x);
1308 
1309             int32x4_t lane0b = vld1q_s32(laneB + x - cn);
1310             int32x4_t lane2b = vld1q_s32(laneB + x + cn);
1311             int32x4_t lane1b = vld1q_s32(laneB + x);
1312 
1313             int32x4_t va = vaddq_s32(lane0a, lane2a);
1314             int32x4_t vb = vaddq_s32(lane0b, lane2b);
1315             int32x4_t wa = vaddq_s32(va, lane1a);
1316             int32x4_t wb = vaddq_s32(vb, lane1b);
1317 
1318             vst1q_s32(dsta + x, wa);
1319             vst1q_s32(dstb + x, wb);
1320         }
1321         if(x < colsn)
1322         {
1323             x = colsn-4;
1324             goto box3x3s32_horiz_last2;
1325         }
1326     }
1327 #else
1328     (void)srcBase;
1329     (void)srcStride;
1330     (void)dstBase;
1331     (void)dstStride;
1332     (void)borderValue;
1333     (void)borderMargin;
1334 #endif
1335 }
1336 
1337 } //namespace CAROTENE_NS
1338