• Home
  • Line#
  • Scopes#
  • Navigate#
  • Raw
  • Download
1 /*
2  * By downloading, copying, installing or using the software you agree to this license.
3  * If you do not agree to this license, do not download, install,
4  * copy or use the software.
5  *
6  *
7  *                           License Agreement
8  *                For Open Source Computer Vision Library
9  *                        (3-clause BSD License)
10  *
11  * Copyright (C) 2015, NVIDIA Corporation, all rights reserved.
12  * Third party copyrights are property of their respective owners.
13  *
14  * Redistribution and use in source and binary forms, with or without modification,
15  * are permitted provided that the following conditions are met:
16  *
17  *   * Redistributions of source code must retain the above copyright notice,
18  *     this list of conditions and the following disclaimer.
19  *
20  *   * Redistributions in binary form must reproduce the above copyright notice,
21  *     this list of conditions and the following disclaimer in the documentation
22  *     and/or other materials provided with the distribution.
23  *
24  *   * Neither the names of the copyright holders nor the names of the contributors
25  *     may be used to endorse or promote products derived from this software
26  *     without specific prior written permission.
27  *
28  * This software is provided by the copyright holders and contributors "as is" and
29  * any express or implied warranties, including, but not limited to, the implied
30  * warranties of merchantability and fitness for a particular purpose are disclaimed.
31  * In no event shall copyright holders or contributors be liable for any direct,
32  * indirect, incidental, special, exemplary, or consequential damages
33  * (including, but not limited to, procurement of substitute goods or services;
34  * loss of use, data, or profits; or business interruption) however caused
35  * and on any theory of liability, whether in contract, strict liability,
36  * or tort (including negligence or otherwise) arising in any way out of
37  * the use of this software, even if advised of the possibility of such damage.
38  */
39 
40 #include "common.hpp"
41 #include "saturate_cast.hpp"
42 
43 #include <vector>
44 
45 namespace CAROTENE_NS {
46 
isLaplacian3x3Supported(const Size2D & size,BORDER_MODE border)47 bool isLaplacian3x3Supported(const Size2D &size, BORDER_MODE border)
48 {
49     return isSupportedConfiguration() && size.width >= 8 &&
50         (border == BORDER_MODE_CONSTANT ||
51             border == BORDER_MODE_REPLICATE);
52 }
53 
Laplacian3x3(const Size2D & size,const u8 * srcBase,ptrdiff_t srcStride,u8 * dstBase,ptrdiff_t dstStride,BORDER_MODE border,u8 borderValue)54 void Laplacian3x3(const Size2D &size,
55                   const u8 * srcBase, ptrdiff_t srcStride,
56                   u8 * dstBase, ptrdiff_t dstStride,
57                   BORDER_MODE border, u8 borderValue)
58 {
59     internal::assertSupportedConfiguration(isLaplacian3x3Supported(size, border));
60 #ifdef CAROTENE_NEON
61     const uint16x8_t v_border_x3 = vdupq_n_u16(borderValue * 3);
62     const uint16x8_t v_zero = vdupq_n_u16(0);
63     const uint8x8_t v_border = vdup_n_u8(borderValue);
64 
65     uint8x8_t vsub;
66     uint16x8_t tprev = v_zero, tcurr = v_zero, tnext = v_zero;
67     uint16x8_t t0 = v_zero, t1 = v_zero, t2 = v_zero;
68 
69     ptrdiff_t width = (ptrdiff_t)size.width, height = (ptrdiff_t)size.height;
70 
71     for (ptrdiff_t y = 0; y < height; ++y)
72     {
73         const u8 * srow0 = y == 0 && border == BORDER_MODE_CONSTANT ? NULL : internal::getRowPtr(srcBase, srcStride, std::max<ptrdiff_t>(y - 1, 0));
74         const u8 * srow1 = internal::getRowPtr(srcBase, srcStride, y);
75         const u8 * srow2 = y + 1 == height && border == BORDER_MODE_CONSTANT ? NULL : internal::getRowPtr(srcBase, srcStride, std::min(y + 1, height - 1));
76         u8 * drow = internal::getRowPtr(dstBase, dstStride, y);
77 
78         s16 prevx = 0, currx = 0, nextx = 0;
79         ptrdiff_t x = 0;
80         const ptrdiff_t bwidth = y + 2 < height ? width : (width - 8);
81 
82         // perform vertical convolution
83         for ( ; x <= bwidth; x += 8)
84         {
85             internal::prefetch(srow0 + x);
86             internal::prefetch(srow1 + x);
87             internal::prefetch(srow2 + x);
88 
89             uint8x8_t x0 = !srow0 ? v_border : vld1_u8(srow0 + x);
90             uint8x8_t x1 = vld1_u8(srow1 + x);
91             uint8x8_t x2 = !srow2 ? v_border : vld1_u8(srow2 + x);
92 
93             // calculate values for plain CPU part below if needed
94             if (x + 8 >= bwidth)
95             {
96                 ptrdiff_t x3 = x == width ? width - 1 : x;
97                 ptrdiff_t x4 = border == BORDER_MODE_CONSTANT ? x3 - 1 : std::max<ptrdiff_t>(x3 - 1, 0);
98 
99                 if (border == BORDER_MODE_CONSTANT && x4 < 0)
100                     prevx = borderValue;
101                 else
102                     prevx = (srow2 ? srow2[x4] : borderValue) + srow1[x4] + (srow0 ? srow0[x4] : borderValue);
103 
104                 currx = (srow2 ? srow2[x3] : borderValue) + srow1[x3] + (srow0 ? srow0[x3] : borderValue);
105             }
106 
107             // make shift
108             if (x)
109             {
110                 tprev = tcurr;
111                 tcurr = tnext;
112             }
113 
114             // and calculate next value
115             tnext = vaddw_u8(vaddl_u8(x0, x1), x2);
116 
117             // make extrapolation for the first elements
118             if (!x)
119             {
120                 // make border
121                 if (border == BORDER_MODE_CONSTANT)
122                     tcurr = v_border_x3;
123                 else if (border == BORDER_MODE_REPLICATE)
124                     tcurr = vdupq_n_u16(vgetq_lane_u16(tnext, 0));
125 
126                 vsub = x1;
127 
128                 continue;
129             }
130 
131             // combine 3 "shifted" vectors
132             t0 = vextq_u16(tprev, tcurr, 7);
133             t1 = tcurr;
134             t2 = vextq_u16(tcurr, tnext, 1);
135 
136             // and add them
137             t0 = vqaddq_u16(t0, vqaddq_u16(t1, t2));
138 
139             int16x8_t tt0 = vsubq_s16(vreinterpretq_s16_u16(t0),
140                                       vreinterpretq_s16_u16(vaddw_u8(vshll_n_u8(vsub, 3), vsub)));
141             uint8x8_t it0 = vqmovun_s16(tt0);
142             vst1_u8(drow + x - 8, it0);
143 
144             vsub = x1;
145         }
146 
147         x -= 8;
148         if (x == width)
149             --x;
150 
151         for ( ; x < width; ++x)
152         {
153             // make extrapolation for the last elements
154             if (x + 1 >= width)
155             {
156                 if (border == BORDER_MODE_CONSTANT)
157                     nextx = borderValue * 3;
158                 else if (border == BORDER_MODE_REPLICATE)
159                     nextx = srow2[x] + srow1[x] + srow0[x];
160             }
161             else
162             {
163                 nextx = (srow2 ? srow2[x + 1] : borderValue) +
164                                  srow1[x + 1] +
165                         (srow0 ? srow0[x + 1] : borderValue);
166             }
167 
168             s32 val = (prevx + currx + nextx) - 9 * srow1[x];
169             drow[x] = internal::saturate_cast<u8>((s32)val);
170 
171             // make shift
172             prevx = currx;
173             currx = nextx;
174         }
175     }
176 #else
177     (void)size;
178     (void)srcBase;
179     (void)srcStride;
180     (void)dstBase;
181     (void)dstStride;
182     (void)border;
183     (void)borderValue;
184 #endif
185 }
186 
isLaplacianOpenCVSupported(const Size2D & size,BORDER_MODE border)187 bool isLaplacianOpenCVSupported(const Size2D &size, BORDER_MODE border)
188 {
189     return isSupportedConfiguration() &&
190         size.width >= 8 && size.height >= 1 &&
191         (border == BORDER_MODE_CONSTANT   ||
192          border == BORDER_MODE_REFLECT    ||
193          border == BORDER_MODE_REFLECT101 ||
194          border == BORDER_MODE_REPLICATE);
195 }
196 
Laplacian1OpenCV(const Size2D & size,const u8 * srcBase,ptrdiff_t srcStride,s16 * dstBase,ptrdiff_t dstStride,BORDER_MODE border,u8 borderValue)197 void Laplacian1OpenCV(const Size2D &size,
198                       const u8 * srcBase, ptrdiff_t srcStride,
199                       s16 * dstBase, ptrdiff_t dstStride,
200                       BORDER_MODE border, u8 borderValue)
201 {
202     internal::assertSupportedConfiguration(isLaplacianOpenCVSupported(size, border));
203 #ifdef CAROTENE_NEON
204     ptrdiff_t rows = size.height, cols = size.width;
205 
206     std::vector<u8> _tmp;
207     u8 *tmp = 0;
208     if (border == BORDER_MODE_CONSTANT)
209     {
210         _tmp.assign(cols + 4,borderValue);
211         tmp = &_tmp[2];
212     }
213 
214     for( ptrdiff_t y = 0; y < rows; y++ )
215     {
216         const u8* v0 = 0;
217         const u8* v1 = internal::getRowPtr(srcBase, srcStride, y);
218         const u8* v2 = 0;
219         // make border
220         if (border == BORDER_MODE_REFLECT101) {
221             v0 = internal::getRowPtr(srcBase, srcStride, y > 0 ? y-1 : y+1);
222             v2 = internal::getRowPtr(srcBase, srcStride, y < rows-1 ? y+1 : rows > 1 ? rows-2 : 0);
223         } else  if (border == BORDER_MODE_CONSTANT) {
224             v0 = y > 0 ? internal::getRowPtr(srcBase, srcStride, y-1) : tmp;
225             v2 =  y < rows-1 ? internal::getRowPtr(srcBase, srcStride, y+1) : tmp;
226         } else {
227             v0 = internal::getRowPtr(srcBase, srcStride, y > 0 ? y-1 : 0);
228             v2 = internal::getRowPtr(srcBase, srcStride, y < rows-1 ? y+1 : rows > 0 ? rows-1 : 0);
229         }
230         s16* drow = internal::getRowPtr(dstBase, dstStride, y);
231 
232         int16x8_t tcurr = vmovq_n_s16(0x0);
233         int16x8_t tnext = vmovq_n_s16(0x0);
234         int16x8_t t0, t2;
235         uint8x8_t xx0 = vmov_n_u8(0x0);
236         uint8x8_t xx1 = vmov_n_u8(0x0);
237         uint8x8_t xx2 = vmov_n_u8(0x0);
238         ptrdiff_t x = 0;
239         const ptrdiff_t bcols = y + 2 < rows ? cols : (cols - 8);
240         for( ; x <= bcols; x += 8 )
241         {
242             internal::prefetch(v0 + x);
243             internal::prefetch(v1 + x);
244             internal::prefetch(v2 + x);
245 
246             uint8x8_t x0 = vld1_u8(v0 + x);
247             uint8x8_t x1 = vld1_u8(v1 + x);
248             uint8x8_t x2 = vld1_u8(v2 + x);
249 
250             if(x) {
251                 xx0 = xx1;
252                 xx1 = xx2;
253             } else {
254                 xx1 = x1;
255                 // make border
256                     if (border == BORDER_MODE_REPLICATE || border == BORDER_MODE_REFLECT)
257                     {
258                         xx1 = vset_lane_u8(vget_lane_u8(x1, 0),x1, 7);
259                     }
260                     else if (border == BORDER_MODE_CONSTANT)
261                     {
262                         xx1 = vset_lane_u8(borderValue, x1, 7);
263                     }
264                     else if (border == BORDER_MODE_REFLECT101)
265                     {
266                         xx1 = vset_lane_u8(vget_lane_u8(x1, 1),x1, 7);
267                     }
268             }
269             xx2 = x1;
270 
271             if(x) {
272                 tcurr = tnext;
273             }
274             tnext = vsubq_s16(vreinterpretq_s16_u16(vaddl_u8(x0, x2)),
275                               vreinterpretq_s16_u16(vshll_n_u8(x1, 2)));
276 
277             if(!x) {
278                 tcurr = tnext;
279                 continue;
280             }
281             t0 = vreinterpretq_s16_u16(vmovl_u8(vext_u8(xx0, xx1, 7)));
282             t2 = vreinterpretq_s16_u16(vmovl_u8(vext_u8(xx1, xx2, 1)));
283             t0 = vaddq_s16(vqaddq_s16(t0, t2), tcurr);
284 
285             vst1q_s16(drow + x - 8, t0);
286         }
287 
288         x -= 8;
289         if(x == cols){
290             x--;
291         }
292 
293         for( ; x < cols; x++ )
294         {
295             s16 nextx;
296             s16 prevx;
297             // make border
298             if (border == BORDER_MODE_REPLICATE || border == BORDER_MODE_REFLECT)
299             {
300                 prevx = x == 0 ? v1[0] : v1[x-1];
301                 nextx = x == cols-1 ? v1[x] : v1[x+1];
302             }
303             else if (border == BORDER_MODE_REFLECT101)
304             {
305                 prevx = x == 0 ? v1[1] : v1[x-1];
306                 nextx = x == cols-1 ? v1[x-1] : v1[x+1];
307             }
308             else //if (border == BORDER_MODE_CONSTANT)
309             {
310                 prevx = x == 0 ? borderValue : v1[x-1];
311                 nextx = x == cols-1 ? borderValue : v1[x+1];
312             }
313             *(drow+x) = prevx + nextx - 4*v1[x] + v0[x] + v2[x];
314         }
315     }
316 #else
317     (void)size;
318     (void)srcBase;
319     (void)srcStride;
320     (void)dstBase;
321     (void)dstStride;
322     (void)border;
323     (void)borderValue;
324 #endif
325 }
326 
Laplacian3OpenCV(const Size2D & size,const u8 * srcBase,ptrdiff_t srcStride,s16 * dstBase,ptrdiff_t dstStride,BORDER_MODE border,u8 borderValue)327 void Laplacian3OpenCV(const Size2D &size,
328                       const u8 * srcBase, ptrdiff_t srcStride,
329                       s16 * dstBase, ptrdiff_t dstStride,
330                       BORDER_MODE border, u8 borderValue)
331 {
332     internal::assertSupportedConfiguration(isLaplacianOpenCVSupported(size, border));
333 #ifdef CAROTENE_NEON
334     ptrdiff_t rows = size.height, cols = size.width;
335 
336     std::vector<u8> _tmp;
337     u8 *tmp = 0;
338     if (border == BORDER_MODE_CONSTANT)
339     {
340         _tmp.assign(cols + 4,borderValue);
341         tmp = &_tmp[2];
342     }
343 
344     for( ptrdiff_t y = 0; y < rows; y++ )
345     {
346         const u8* v0 = 0;
347         const u8* v1 = internal::getRowPtr(srcBase, srcStride, y);
348         const u8* v2 = 0;
349         // make border
350         if (border == BORDER_MODE_REFLECT101) {
351             v0 = internal::getRowPtr(srcBase, srcStride, y > 0 ? y-1 : y+1);
352             v2 = internal::getRowPtr(srcBase, srcStride, y < rows-1 ? y+1 : rows > 1 ? rows-2 : 0);
353         } else  if (border == BORDER_MODE_CONSTANT) {
354             v0 = y > 0 ? internal::getRowPtr(srcBase, srcStride, y-1) : tmp;
355             v2 = y < rows-1 ? internal::getRowPtr(srcBase, srcStride, y+1) : tmp;
356         } else {
357             v0 = internal::getRowPtr(srcBase, srcStride, y > 0 ? y-1 : 0);
358             v2 = internal::getRowPtr(srcBase, srcStride, y < rows-1 ? y+1 : rows > 0 ? rows-1 : 0);
359         }
360         s16* drow = internal::getRowPtr(dstBase, dstStride, y);
361 
362         int16x8_t tprev = vmovq_n_s16(0x0);
363         int16x8_t tcurr = vmovq_n_s16(0x0);
364         int16x8_t tnext = vmovq_n_s16(0x0);
365         int16x8_t tc = vmovq_n_s16(0x0);
366         int16x8_t t0, t2, tcnext;
367         ptrdiff_t x = 0;
368         const ptrdiff_t bcols = y + 2 < rows ? cols : (cols - 8);
369         for( ; x <= bcols; x += 8 )
370         {
371             internal::prefetch(v0 + x);
372             internal::prefetch(v1 + x);
373             internal::prefetch(v2 + x);
374 
375             uint8x8_t x0 = vld1_u8(v0 + x);
376             uint8x8_t x1 = vld1_u8(v1 + x);
377             uint8x8_t x2 = vld1_u8(v2 + x);
378             tcnext = vreinterpretq_s16_u16(vshll_n_u8(x1, 2));
379 
380             if(x) {
381                 tprev = tcurr;
382                 tcurr = tnext;
383             }
384             tnext = vreinterpretq_s16_u16(vaddl_u8(x0, x2));
385 
386             if(!x) {
387                 tcurr = tnext;
388                 tc = tcnext;
389 
390                 // make border
391                     if (border == BORDER_MODE_REPLICATE || border == BORDER_MODE_REFLECT)
392                     {
393                         tcurr = vsetq_lane_s16(vgetq_lane_s16(tcurr, 0),tcurr, 7);
394                     }
395                     else if (border == BORDER_MODE_CONSTANT)
396                     {
397                         tcurr = vsetq_lane_s16(borderValue, tcurr, 7);
398                     }
399                     else if (border == BORDER_MODE_REFLECT101)
400                     {
401                         tcurr = vsetq_lane_s16(vgetq_lane_s16(tcurr, 1),tcurr, 7);
402                     }
403                 continue;
404             }
405 
406             t0 = vextq_s16(tprev, tcurr, 7);
407             t2 = vextq_s16(tcurr, tnext, 1);
408 
409             t0 = vsubq_s16(vqaddq_s16(t0, t2), tc);
410             tc = tcnext;
411 
412             t0 = vshlq_n_s16(t0, 1);
413             vst1q_s16(drow + x - 8, t0);
414         }
415         x -= 8;
416         if(x == cols){
417             x--;
418         }
419 
420         for( ; x < cols; x++ )
421         {
422             s16 nextx, nextx2;
423             s16 prevx, prevx2;
424             // make border
425             if (border == BORDER_MODE_REPLICATE || border == BORDER_MODE_REFLECT)
426             {
427                 prevx = x == 0 ? v0[0] : v0[x-1];
428                 prevx2 = x == 0 ? v2[0] : v2[x-1];
429                 nextx = x == cols-1 ? v0[x] : v0[x+1];
430                 nextx2 = x == cols-1 ? v2[x] : v2[x+1];
431             }
432             else if (border == BORDER_MODE_REFLECT101)
433             {
434                 prevx = x == 0 ? v0[1] : v0[x-1];
435                 prevx2 = x == 0 ? v2[1] : v2[x-1];
436                 nextx = x == cols-1 ? v0[x-1] : v0[x+1];
437                 nextx2 = x == cols-1 ? v2[x-1] : v2[x+1];
438             }
439             else //if (border == BORDER_MODE_CONSTANT)
440             {
441                 prevx = x == 0 ? borderValue : v0[x-1];
442                 prevx2 = x == 0 ? borderValue : v2[x-1];
443                 nextx = x == cols-1 ? borderValue : v0[x+1];
444                 nextx2 = x == cols-1 ? borderValue : v2[x+1];
445             }
446             s16 res = prevx + nextx - 4*v1[x] + prevx2 + nextx2;
447             *(drow+x) = 2*res;
448         }
449     }
450 #else
451     (void)size;
452     (void)srcBase;
453     (void)srcStride;
454     (void)dstBase;
455     (void)dstStride;
456     (void)border;
457     (void)borderValue;
458 #endif
459 }
460 
Laplacian5OpenCV(const Size2D & size,const u8 * srcBase,ptrdiff_t srcStride,s16 * dstBase,ptrdiff_t dstStride,BORDER_MODE border,u8 borderValue)461 void Laplacian5OpenCV(const Size2D &size,
462                       const u8 * srcBase, ptrdiff_t srcStride,
463                       s16 * dstBase, ptrdiff_t dstStride,
464                       BORDER_MODE border, u8 borderValue)
465 {
466     internal::assertSupportedConfiguration(isLaplacianOpenCVSupported(size, border));
467 #ifdef CAROTENE_NEON
468     ptrdiff_t rows = size.height, cols = size.width;
469 
470     std::vector<u8> _tmp;
471     u8 *tmp = 0;
472     if (border == BORDER_MODE_CONSTANT)
473     {
474         _tmp.assign(cols + 4,borderValue);
475         tmp = &_tmp[2];
476     }
477 
478     for( ptrdiff_t y = 0; y < rows; y++ )
479     {
480         const u8* v0 = 0;
481         const u8* v1 = 0;
482         const u8* v2 = internal::getRowPtr(srcBase, srcStride, y);
483         const u8* v3 = 0;
484         const u8* v4 = 0;
485         // make border
486         if (border == BORDER_MODE_REPLICATE) {
487             v0 = internal::getRowPtr(srcBase, srcStride, y > 1 ? y-2 : 0);
488             v1 = internal::getRowPtr(srcBase, srcStride, y > 0 ? y-1 : 0);
489             v3 = internal::getRowPtr(srcBase, srcStride, y < rows-1 ? y+1 : rows > 0 ? rows-1 : 0);
490             v4 = internal::getRowPtr(srcBase, srcStride, y < rows-2 ? y+2 : rows > 0 ? rows-1 : 0);
491         } else if (border == BORDER_MODE_REFLECT) {
492             v0 = internal::getRowPtr(srcBase, srcStride, y > 1 ? y-2 : rows > 1 ? 1-y : 0);
493             v1 = internal::getRowPtr(srcBase, srcStride, y > 0 ? y-1 : 0);
494             v3 = internal::getRowPtr(srcBase, srcStride, y < rows-1 ? y+1 : rows > 0 ? rows-1 : 0);
495             v4 = internal::getRowPtr(srcBase, srcStride, y < rows-2 ? y+2 : rows > 1 ? 2*rows-(y+3) : 0);
496         } else if (border == BORDER_MODE_REFLECT101) {
497             v0 = internal::getRowPtr(srcBase, srcStride, y > 1 ? y-2 : rows > 2-y ? 2-y : 0); ///check
498             v1 = internal::getRowPtr(srcBase, srcStride, y > 0 ? y-1 : rows > 1 ? 1 : 0);
499             v3 = internal::getRowPtr(srcBase, srcStride, y < rows-1 ? y+1 : rows > 1 ? rows-2 : 0);
500             v4 = internal::getRowPtr(srcBase, srcStride, y < rows-2 ? y+2 : rows > 2 ? 2*rows-(y+4) : 0);///bad if rows=2 y=1   rows - 4 + (2,1)
501         } else if (border == BORDER_MODE_CONSTANT) {
502             v0 = y > 1 ? internal::getRowPtr(srcBase, srcStride, y-2) : tmp;
503             v1 = y > 0 ? internal::getRowPtr(srcBase, srcStride, y-1) : tmp;
504             v3 = y < rows-1 ? internal::getRowPtr(srcBase, srcStride, y+1) : tmp;
505             v4 = y < rows-2 ? internal::getRowPtr(srcBase, srcStride, y+2) : tmp;
506         }
507         s16* drow = internal::getRowPtr(dstBase, dstStride, y);
508 
509         int16x8_t tnext, tc, t0;
510         int16x8_t tnext2, tnext3;
511         int16x8_t tnext1Old, tnext2Old, tnext3Old;
512         int16x8_t tnext4OldOldOld, tnext5OldOldOld;
513 
514         int16x8_t tcurr1 = vmovq_n_s16(0x0);
515         int16x8_t tnext1 = vmovq_n_s16(0x0);
516         int16x8_t tprev1 = vmovq_n_s16(0x0);
517         int16x8_t tpprev1 = vmovq_n_s16(0x0);
518         int16x8_t tppprev1 = vmovq_n_s16(0x0);
519 
520         int16x8_t tnext4Old = vmovq_n_s16(0x0);
521         int16x8_t tnext5Old = vmovq_n_s16(0x0);
522         int16x8_t tnext1OldOld = vmovq_n_s16(0x0);
523         int16x8_t tnext2OldOld = vmovq_n_s16(0x0);
524         int16x8_t tnext3OldOld = vmovq_n_s16(0x0);
525         int16x8_t tnext4OldOld = vmovq_n_s16(0x0);
526         int16x8_t tnext5OldOld = vmovq_n_s16(0x0);
527 
528         // do vertical convolution
529         ptrdiff_t x = 0;
530         const ptrdiff_t bcols = y + 3 < rows ? cols : (cols - 8);
531         for( ; x <= bcols; x += 8 )
532         {
533             internal::prefetch(v0 + x);
534             internal::prefetch(v1 + x);
535             internal::prefetch(v2 + x);
536             internal::prefetch(v3 + x);
537             internal::prefetch(v4 + x);
538 
539             uint8x8_t x0 = vld1_u8(v0 + x);
540             uint8x8_t x1 = vld1_u8(v1 + x);
541             uint8x8_t x2 = vld1_u8(v2 + x);
542             uint8x8_t x3 = vld1_u8(v3 + x);
543             uint8x8_t x4 = vld1_u8(v4 + x);
544             if(x) {
545                 tcurr1 = tnext1;
546             }
547 
548             tnext4OldOldOld = tnext4Old;
549             tnext5OldOldOld = tnext5Old;
550             tnext1Old = tnext1OldOld;
551             tnext2Old = tnext2OldOld;
552             tnext3Old = tnext3OldOld;
553             tnext4Old = tnext4OldOld;
554             tnext5Old = tnext5OldOld;
555 
556             tnext3 = vreinterpretq_s16_u16(vaddq_u16(vaddl_u8(x3, x2),vaddl_u8(x2, x1)));
557             tnext3 = vshlq_n_s16(tnext3, 1);
558 
559             tc = vreinterpretq_s16_u16(vsubl_u8(x4, x2));
560             tnext = vreinterpretq_s16_u16(vsubl_u8(x2, x0));
561             tnext2 = vsubq_s16(tc, tnext);
562 
563             tnext1 = vaddq_s16(tnext3, tnext2);
564             // tnext1 = x0 + 2*x1 + 2*x2 + 2*x3 + x4
565 
566             tnext2 = vshlq_n_s16(tnext2, 1);
567             // tnext2 = 2*x4 - 4*x2 + 2*x0
568 
569             tnext3 = vsubq_s16(tnext2, vshlq_n_s16(tnext3, 1));
570             // tnext3 = 2*x0 - 4*x1 - 12*x2 - 4*x3  + 2*x4
571 
572             tnext1OldOld = tnext1;
573             tnext2OldOld = tnext2;
574             tnext3OldOld = tnext3;
575             tnext4OldOld = tnext2;
576             tnext5OldOld = tnext1;
577 
578             if(x) {
579                 tnext1 = vextq_s16(tnext1Old, tnext1, 2);
580                 tcurr1 = vextq_s16(tnext2Old, tnext2, 1);
581                 tprev1 = tnext3Old;
582 
583                 if(x!=8) {
584                     tpprev1 = vextq_s16(tnext4OldOldOld, tnext4Old, 7);
585                     tppprev1 = vextq_s16(tnext5OldOldOld, tnext5Old, 6);
586                 }
587             }
588 
589             if(!x) {
590                 // make border
591                 if (border == BORDER_MODE_REPLICATE) {
592                     tpprev1 = vextq_s16(tnext2, tnext2, 7);
593                     tpprev1 = vsetq_lane_s16(vgetq_lane_s16(tpprev1, 1),tpprev1, 0);
594 
595                     tprev1 = vextq_s16(tnext1, tnext1, 6);
596                     tprev1 = vsetq_lane_s16(vgetq_lane_s16(tprev1, 2),tprev1, 0);
597                     tprev1 = vsetq_lane_s16(vgetq_lane_s16(tprev1, 2),tprev1, 1);
598                 } else if (border == BORDER_MODE_REFLECT) {
599                     tpprev1 = vextq_s16(tnext2, tnext2, 7);
600                     tpprev1 = vsetq_lane_s16(vgetq_lane_s16(tpprev1, 1),tpprev1, 0);
601 
602                     tprev1 = vextq_s16(tnext1, tnext1, 6);
603                     tprev1 = vsetq_lane_s16(vgetq_lane_s16(tprev1, 3),tprev1, 0);
604                     tprev1 = vsetq_lane_s16(vgetq_lane_s16(tprev1, 2),tprev1, 1);
605                 } else if (border == BORDER_MODE_REFLECT101) {
606                     tpprev1 = vextq_s16(tnext2, tnext2, 7);
607                     tpprev1 = vsetq_lane_s16(vgetq_lane_s16(tpprev1, 2),tpprev1, 0);
608 
609                     tprev1 = vextq_s16(tnext1, tnext1, 6);
610                     tprev1 = vsetq_lane_s16(vgetq_lane_s16(tprev1, 3),tprev1, 1);
611                     tprev1 = vsetq_lane_s16(vgetq_lane_s16(tprev1, 4),tprev1, 0);
612                 } else if (border == BORDER_MODE_CONSTANT) {
613                     tpprev1 = vextq_s16(tnext2, tnext2, 7);
614                     tpprev1 = vsetq_lane_s16(borderValue, tpprev1, 0);
615 
616                     tprev1 = vextq_s16(tnext1, tnext1, 6);
617                     tprev1 = vsetq_lane_s16(borderValue, tprev1, 0);
618                     tprev1 = vsetq_lane_s16(borderValue, tprev1, 1);
619                 }
620                 tppprev1 = tprev1;
621                 continue;
622             }
623 
624             t0 = vaddq_s16(vaddq_s16(vqaddq_s16(tcurr1, tprev1), vqaddq_s16(tpprev1, tppprev1)), tnext1);
625             t0 = vaddq_s16(t0, t0);
626             vst1q_s16(drow + x - 8, t0);
627         }
628         x -= 8;
629         if(x >= cols - 1)
630             x = cols-2;
631 
632         s16 pprevx = 0;
633         s16 prevx = 0;
634         s16 nextx = 0;
635         s16 nnextx = 0;
636 
637         for( ; x < cols; x++ )
638         {
639             if (x == 0) {
640                 // make border
641                 if (border == BORDER_MODE_REPLICATE) {
642                     pprevx = v0[0] + 2*v1[0] + 2*v2[0] + 2*v3[0] + v4[0];
643                     prevx = 2*v0[0] - 4*v2[0] + 2*v4[0];
644                 } else if (border == BORDER_MODE_REFLECT) {
645                     pprevx = v0[1] + 2*v1[1] + 2*v2[1] + 2*v3[1] + v4[1];
646                     prevx = 2*v0[0] - 4*v2[0] + 2*v4[0];
647                 } else if (border == BORDER_MODE_REFLECT101) {
648                     pprevx = v0[2] + 2*v1[2] + 2*v2[2] + 2*v3[2] + v4[2];
649                     prevx = 2*v0[1] - 4*v2[1] + 2*v4[1];
650                 } else if (border == BORDER_MODE_CONSTANT) {
651                     pprevx = 8 * borderValue;
652                     prevx = 0;
653                 }
654             } else if (x == 1) {
655                 // make border
656                 if (border == BORDER_MODE_REPLICATE || border == BORDER_MODE_REFLECT) {
657                     pprevx = v0[0] + 2*v1[0] + 2*v2[0] + 2*v3[0] + v4[0];
658                 } else if (border == BORDER_MODE_REFLECT101) {
659                     pprevx = v0[1] + 2*v1[1] + 2*v2[1] + 2*v3[1] + v4[1];
660                 } else if (border == BORDER_MODE_CONSTANT) {
661                     pprevx = 8 * borderValue;
662                 }
663                 prevx = 2*v0[0] - 4*v2[0] + 2*v4[0];
664             } else {
665                 pprevx = v0[x-2] + 2*v1[x-2] + 2*v2[x-2] + 2*v3[x-2] + v4[x-2];
666                 prevx = 2*v0[x-1] - 4*v2[x-1] + 2*v4[x-1];
667             }
668             s16 currx = 2*v0[x] - 4*v1[x] - 12*v2[x] - 4*v3[x] + 2*v4[x];
669             if (x == cols-1) {
670                 // make border
671                 if (border == BORDER_MODE_REPLICATE) {
672                     nextx = 2*v0[x] - 4*v2[x] + 2*v4[x];
673                     nnextx = v0[x] + 2*v1[x] + 2*v2[x] + 2*v3[x] + v4[x];
674                 } else if (border == BORDER_MODE_REFLECT) {
675                     nextx = 2*v0[x] - 4*v2[x] + 2*v4[x];
676                     nnextx = v0[x-1] + 2*v1[x-1] + 2*v2[x-1] + 2*v3[x-1] + v4[x-1];
677                 } else if (border == BORDER_MODE_REFLECT101) {
678                     nextx = 2*v0[x-1] - 4*v2[x-1] + 2*v4[x-1];
679                     nnextx = v0[x-2] + 2*v1[x-2] + 2*v2[x-2] + 2*v3[x-2] + v4[x-2];
680                 } else if (border == BORDER_MODE_CONSTANT) {
681                     nextx = 0;
682                     nnextx = 8 * borderValue;
683                 }
684             } else if (x == cols-2) {
685                 // make border
686                 if (border == BORDER_MODE_REPLICATE || border == BORDER_MODE_REFLECT) {
687                     nnextx = v0[x+1] + 2*v1[x+1] + 2*v2[x+1] + 2*v3[x+1] + v4[x+1];
688                 } else if (border == BORDER_MODE_REFLECT101) {
689                     nnextx = v0[x] + 2*v1[x] + 2*v2[x] + 2*v3[x] + v4[x];
690                 } else if (border == BORDER_MODE_CONSTANT) {
691                     nnextx = 8 * borderValue;
692                 }
693                 nextx = 2*v0[x+1] - 4*v2[x+1] + 2*v4[x+1];
694             } else {
695                 nextx = 2*v0[x+1] - 4*v2[x+1] + 2*v4[x+1];
696                 nnextx = v0[x+2] + 2*v1[x+2] + 2*v2[x+2] + 2*v3[x+2] + v4[x+2];
697             }
698             s16 res = pprevx + prevx + currx + nextx + nnextx;
699             *(drow+x) = 2*res;
700         }
701     }
702 #else
703     (void)size;
704     (void)srcBase;
705     (void)srcStride;
706     (void)dstBase;
707     (void)dstStride;
708     (void)border;
709     (void)borderValue;
710 #endif
711 }
712 
713 } // namespace CAROTENE_NS
714