1 /*
2 * By downloading, copying, installing or using the software you agree to this license.
3 * If you do not agree to this license, do not download, install,
4 * copy or use the software.
5 *
6 *
7 * License Agreement
8 * For Open Source Computer Vision Library
9 * (3-clause BSD License)
10 *
11 * Copyright (C) 2015, NVIDIA Corporation, all rights reserved.
12 * Third party copyrights are property of their respective owners.
13 *
14 * Redistribution and use in source and binary forms, with or without modification,
15 * are permitted provided that the following conditions are met:
16 *
17 * * Redistributions of source code must retain the above copyright notice,
18 * this list of conditions and the following disclaimer.
19 *
20 * * Redistributions in binary form must reproduce the above copyright notice,
21 * this list of conditions and the following disclaimer in the documentation
22 * and/or other materials provided with the distribution.
23 *
24 * * Neither the names of the copyright holders nor the names of the contributors
25 * may be used to endorse or promote products derived from this software
26 * without specific prior written permission.
27 *
28 * This software is provided by the copyright holders and contributors "as is" and
29 * any express or implied warranties, including, but not limited to, the implied
30 * warranties of merchantability and fitness for a particular purpose are disclaimed.
31 * In no event shall copyright holders or contributors be liable for any direct,
32 * indirect, incidental, special, exemplary, or consequential damages
33 * (including, but not limited to, procurement of substitute goods or services;
34 * loss of use, data, or profits; or business interruption) however caused
35 * and on any theory of liability, whether in contract, strict liability,
36 * or tort (including negligence or otherwise) arising in any way out of
37 * the use of this software, even if advised of the possibility of such damage.
38 */
39
40 #include "common.hpp"
41 #include "saturate_cast.hpp"
42
43 #include <vector>
44
45 namespace CAROTENE_NS {
46
isLaplacian3x3Supported(const Size2D & size,BORDER_MODE border)47 bool isLaplacian3x3Supported(const Size2D &size, BORDER_MODE border)
48 {
49 return isSupportedConfiguration() && size.width >= 8 &&
50 (border == BORDER_MODE_CONSTANT ||
51 border == BORDER_MODE_REPLICATE);
52 }
53
Laplacian3x3(const Size2D & size,const u8 * srcBase,ptrdiff_t srcStride,u8 * dstBase,ptrdiff_t dstStride,BORDER_MODE border,u8 borderValue)54 void Laplacian3x3(const Size2D &size,
55 const u8 * srcBase, ptrdiff_t srcStride,
56 u8 * dstBase, ptrdiff_t dstStride,
57 BORDER_MODE border, u8 borderValue)
58 {
59 internal::assertSupportedConfiguration(isLaplacian3x3Supported(size, border));
60 #ifdef CAROTENE_NEON
61 const uint16x8_t v_border_x3 = vdupq_n_u16(borderValue * 3);
62 const uint16x8_t v_zero = vdupq_n_u16(0);
63 const uint8x8_t v_border = vdup_n_u8(borderValue);
64
65 uint8x8_t vsub;
66 uint16x8_t tprev = v_zero, tcurr = v_zero, tnext = v_zero;
67 uint16x8_t t0 = v_zero, t1 = v_zero, t2 = v_zero;
68
69 ptrdiff_t width = (ptrdiff_t)size.width, height = (ptrdiff_t)size.height;
70
71 for (ptrdiff_t y = 0; y < height; ++y)
72 {
73 const u8 * srow0 = y == 0 && border == BORDER_MODE_CONSTANT ? NULL : internal::getRowPtr(srcBase, srcStride, std::max<ptrdiff_t>(y - 1, 0));
74 const u8 * srow1 = internal::getRowPtr(srcBase, srcStride, y);
75 const u8 * srow2 = y + 1 == height && border == BORDER_MODE_CONSTANT ? NULL : internal::getRowPtr(srcBase, srcStride, std::min(y + 1, height - 1));
76 u8 * drow = internal::getRowPtr(dstBase, dstStride, y);
77
78 s16 prevx = 0, currx = 0, nextx = 0;
79 ptrdiff_t x = 0;
80 const ptrdiff_t bwidth = y + 2 < height ? width : (width - 8);
81
82 // perform vertical convolution
83 for ( ; x <= bwidth; x += 8)
84 {
85 internal::prefetch(srow0 + x);
86 internal::prefetch(srow1 + x);
87 internal::prefetch(srow2 + x);
88
89 uint8x8_t x0 = !srow0 ? v_border : vld1_u8(srow0 + x);
90 uint8x8_t x1 = vld1_u8(srow1 + x);
91 uint8x8_t x2 = !srow2 ? v_border : vld1_u8(srow2 + x);
92
93 // calculate values for plain CPU part below if needed
94 if (x + 8 >= bwidth)
95 {
96 ptrdiff_t x3 = x == width ? width - 1 : x;
97 ptrdiff_t x4 = border == BORDER_MODE_CONSTANT ? x3 - 1 : std::max<ptrdiff_t>(x3 - 1, 0);
98
99 if (border == BORDER_MODE_CONSTANT && x4 < 0)
100 prevx = borderValue;
101 else
102 prevx = (srow2 ? srow2[x4] : borderValue) + srow1[x4] + (srow0 ? srow0[x4] : borderValue);
103
104 currx = (srow2 ? srow2[x3] : borderValue) + srow1[x3] + (srow0 ? srow0[x3] : borderValue);
105 }
106
107 // make shift
108 if (x)
109 {
110 tprev = tcurr;
111 tcurr = tnext;
112 }
113
114 // and calculate next value
115 tnext = vaddw_u8(vaddl_u8(x0, x1), x2);
116
117 // make extrapolation for the first elements
118 if (!x)
119 {
120 // make border
121 if (border == BORDER_MODE_CONSTANT)
122 tcurr = v_border_x3;
123 else if (border == BORDER_MODE_REPLICATE)
124 tcurr = vdupq_n_u16(vgetq_lane_u16(tnext, 0));
125
126 vsub = x1;
127
128 continue;
129 }
130
131 // combine 3 "shifted" vectors
132 t0 = vextq_u16(tprev, tcurr, 7);
133 t1 = tcurr;
134 t2 = vextq_u16(tcurr, tnext, 1);
135
136 // and add them
137 t0 = vqaddq_u16(t0, vqaddq_u16(t1, t2));
138
139 int16x8_t tt0 = vsubq_s16(vreinterpretq_s16_u16(t0),
140 vreinterpretq_s16_u16(vaddw_u8(vshll_n_u8(vsub, 3), vsub)));
141 uint8x8_t it0 = vqmovun_s16(tt0);
142 vst1_u8(drow + x - 8, it0);
143
144 vsub = x1;
145 }
146
147 x -= 8;
148 if (x == width)
149 --x;
150
151 for ( ; x < width; ++x)
152 {
153 // make extrapolation for the last elements
154 if (x + 1 >= width)
155 {
156 if (border == BORDER_MODE_CONSTANT)
157 nextx = borderValue * 3;
158 else if (border == BORDER_MODE_REPLICATE)
159 nextx = srow2[x] + srow1[x] + srow0[x];
160 }
161 else
162 {
163 nextx = (srow2 ? srow2[x + 1] : borderValue) +
164 srow1[x + 1] +
165 (srow0 ? srow0[x + 1] : borderValue);
166 }
167
168 s32 val = (prevx + currx + nextx) - 9 * srow1[x];
169 drow[x] = internal::saturate_cast<u8>((s32)val);
170
171 // make shift
172 prevx = currx;
173 currx = nextx;
174 }
175 }
176 #else
177 (void)size;
178 (void)srcBase;
179 (void)srcStride;
180 (void)dstBase;
181 (void)dstStride;
182 (void)border;
183 (void)borderValue;
184 #endif
185 }
186
isLaplacianOpenCVSupported(const Size2D & size,BORDER_MODE border)187 bool isLaplacianOpenCVSupported(const Size2D &size, BORDER_MODE border)
188 {
189 return isSupportedConfiguration() &&
190 size.width >= 8 && size.height >= 1 &&
191 (border == BORDER_MODE_CONSTANT ||
192 border == BORDER_MODE_REFLECT ||
193 border == BORDER_MODE_REFLECT101 ||
194 border == BORDER_MODE_REPLICATE);
195 }
196
Laplacian1OpenCV(const Size2D & size,const u8 * srcBase,ptrdiff_t srcStride,s16 * dstBase,ptrdiff_t dstStride,BORDER_MODE border,u8 borderValue)197 void Laplacian1OpenCV(const Size2D &size,
198 const u8 * srcBase, ptrdiff_t srcStride,
199 s16 * dstBase, ptrdiff_t dstStride,
200 BORDER_MODE border, u8 borderValue)
201 {
202 internal::assertSupportedConfiguration(isLaplacianOpenCVSupported(size, border));
203 #ifdef CAROTENE_NEON
204 ptrdiff_t rows = size.height, cols = size.width;
205
206 std::vector<u8> _tmp;
207 u8 *tmp = 0;
208 if (border == BORDER_MODE_CONSTANT)
209 {
210 _tmp.assign(cols + 4,borderValue);
211 tmp = &_tmp[2];
212 }
213
214 for( ptrdiff_t y = 0; y < rows; y++ )
215 {
216 const u8* v0 = 0;
217 const u8* v1 = internal::getRowPtr(srcBase, srcStride, y);
218 const u8* v2 = 0;
219 // make border
220 if (border == BORDER_MODE_REFLECT101) {
221 v0 = internal::getRowPtr(srcBase, srcStride, y > 0 ? y-1 : y+1);
222 v2 = internal::getRowPtr(srcBase, srcStride, y < rows-1 ? y+1 : rows > 1 ? rows-2 : 0);
223 } else if (border == BORDER_MODE_CONSTANT) {
224 v0 = y > 0 ? internal::getRowPtr(srcBase, srcStride, y-1) : tmp;
225 v2 = y < rows-1 ? internal::getRowPtr(srcBase, srcStride, y+1) : tmp;
226 } else {
227 v0 = internal::getRowPtr(srcBase, srcStride, y > 0 ? y-1 : 0);
228 v2 = internal::getRowPtr(srcBase, srcStride, y < rows-1 ? y+1 : rows > 0 ? rows-1 : 0);
229 }
230 s16* drow = internal::getRowPtr(dstBase, dstStride, y);
231
232 int16x8_t tcurr = vmovq_n_s16(0x0);
233 int16x8_t tnext = vmovq_n_s16(0x0);
234 int16x8_t t0, t2;
235 uint8x8_t xx0 = vmov_n_u8(0x0);
236 uint8x8_t xx1 = vmov_n_u8(0x0);
237 uint8x8_t xx2 = vmov_n_u8(0x0);
238 ptrdiff_t x = 0;
239 const ptrdiff_t bcols = y + 2 < rows ? cols : (cols - 8);
240 for( ; x <= bcols; x += 8 )
241 {
242 internal::prefetch(v0 + x);
243 internal::prefetch(v1 + x);
244 internal::prefetch(v2 + x);
245
246 uint8x8_t x0 = vld1_u8(v0 + x);
247 uint8x8_t x1 = vld1_u8(v1 + x);
248 uint8x8_t x2 = vld1_u8(v2 + x);
249
250 if(x) {
251 xx0 = xx1;
252 xx1 = xx2;
253 } else {
254 xx1 = x1;
255 // make border
256 if (border == BORDER_MODE_REPLICATE || border == BORDER_MODE_REFLECT)
257 {
258 xx1 = vset_lane_u8(vget_lane_u8(x1, 0),x1, 7);
259 }
260 else if (border == BORDER_MODE_CONSTANT)
261 {
262 xx1 = vset_lane_u8(borderValue, x1, 7);
263 }
264 else if (border == BORDER_MODE_REFLECT101)
265 {
266 xx1 = vset_lane_u8(vget_lane_u8(x1, 1),x1, 7);
267 }
268 }
269 xx2 = x1;
270
271 if(x) {
272 tcurr = tnext;
273 }
274 tnext = vsubq_s16(vreinterpretq_s16_u16(vaddl_u8(x0, x2)),
275 vreinterpretq_s16_u16(vshll_n_u8(x1, 2)));
276
277 if(!x) {
278 tcurr = tnext;
279 continue;
280 }
281 t0 = vreinterpretq_s16_u16(vmovl_u8(vext_u8(xx0, xx1, 7)));
282 t2 = vreinterpretq_s16_u16(vmovl_u8(vext_u8(xx1, xx2, 1)));
283 t0 = vaddq_s16(vqaddq_s16(t0, t2), tcurr);
284
285 vst1q_s16(drow + x - 8, t0);
286 }
287
288 x -= 8;
289 if(x == cols){
290 x--;
291 }
292
293 for( ; x < cols; x++ )
294 {
295 s16 nextx;
296 s16 prevx;
297 // make border
298 if (border == BORDER_MODE_REPLICATE || border == BORDER_MODE_REFLECT)
299 {
300 prevx = x == 0 ? v1[0] : v1[x-1];
301 nextx = x == cols-1 ? v1[x] : v1[x+1];
302 }
303 else if (border == BORDER_MODE_REFLECT101)
304 {
305 prevx = x == 0 ? v1[1] : v1[x-1];
306 nextx = x == cols-1 ? v1[x-1] : v1[x+1];
307 }
308 else //if (border == BORDER_MODE_CONSTANT)
309 {
310 prevx = x == 0 ? borderValue : v1[x-1];
311 nextx = x == cols-1 ? borderValue : v1[x+1];
312 }
313 *(drow+x) = prevx + nextx - 4*v1[x] + v0[x] + v2[x];
314 }
315 }
316 #else
317 (void)size;
318 (void)srcBase;
319 (void)srcStride;
320 (void)dstBase;
321 (void)dstStride;
322 (void)border;
323 (void)borderValue;
324 #endif
325 }
326
Laplacian3OpenCV(const Size2D & size,const u8 * srcBase,ptrdiff_t srcStride,s16 * dstBase,ptrdiff_t dstStride,BORDER_MODE border,u8 borderValue)327 void Laplacian3OpenCV(const Size2D &size,
328 const u8 * srcBase, ptrdiff_t srcStride,
329 s16 * dstBase, ptrdiff_t dstStride,
330 BORDER_MODE border, u8 borderValue)
331 {
332 internal::assertSupportedConfiguration(isLaplacianOpenCVSupported(size, border));
333 #ifdef CAROTENE_NEON
334 ptrdiff_t rows = size.height, cols = size.width;
335
336 std::vector<u8> _tmp;
337 u8 *tmp = 0;
338 if (border == BORDER_MODE_CONSTANT)
339 {
340 _tmp.assign(cols + 4,borderValue);
341 tmp = &_tmp[2];
342 }
343
344 for( ptrdiff_t y = 0; y < rows; y++ )
345 {
346 const u8* v0 = 0;
347 const u8* v1 = internal::getRowPtr(srcBase, srcStride, y);
348 const u8* v2 = 0;
349 // make border
350 if (border == BORDER_MODE_REFLECT101) {
351 v0 = internal::getRowPtr(srcBase, srcStride, y > 0 ? y-1 : y+1);
352 v2 = internal::getRowPtr(srcBase, srcStride, y < rows-1 ? y+1 : rows > 1 ? rows-2 : 0);
353 } else if (border == BORDER_MODE_CONSTANT) {
354 v0 = y > 0 ? internal::getRowPtr(srcBase, srcStride, y-1) : tmp;
355 v2 = y < rows-1 ? internal::getRowPtr(srcBase, srcStride, y+1) : tmp;
356 } else {
357 v0 = internal::getRowPtr(srcBase, srcStride, y > 0 ? y-1 : 0);
358 v2 = internal::getRowPtr(srcBase, srcStride, y < rows-1 ? y+1 : rows > 0 ? rows-1 : 0);
359 }
360 s16* drow = internal::getRowPtr(dstBase, dstStride, y);
361
362 int16x8_t tprev = vmovq_n_s16(0x0);
363 int16x8_t tcurr = vmovq_n_s16(0x0);
364 int16x8_t tnext = vmovq_n_s16(0x0);
365 int16x8_t tc = vmovq_n_s16(0x0);
366 int16x8_t t0, t2, tcnext;
367 ptrdiff_t x = 0;
368 const ptrdiff_t bcols = y + 2 < rows ? cols : (cols - 8);
369 for( ; x <= bcols; x += 8 )
370 {
371 internal::prefetch(v0 + x);
372 internal::prefetch(v1 + x);
373 internal::prefetch(v2 + x);
374
375 uint8x8_t x0 = vld1_u8(v0 + x);
376 uint8x8_t x1 = vld1_u8(v1 + x);
377 uint8x8_t x2 = vld1_u8(v2 + x);
378 tcnext = vreinterpretq_s16_u16(vshll_n_u8(x1, 2));
379
380 if(x) {
381 tprev = tcurr;
382 tcurr = tnext;
383 }
384 tnext = vreinterpretq_s16_u16(vaddl_u8(x0, x2));
385
386 if(!x) {
387 tcurr = tnext;
388 tc = tcnext;
389
390 // make border
391 if (border == BORDER_MODE_REPLICATE || border == BORDER_MODE_REFLECT)
392 {
393 tcurr = vsetq_lane_s16(vgetq_lane_s16(tcurr, 0),tcurr, 7);
394 }
395 else if (border == BORDER_MODE_CONSTANT)
396 {
397 tcurr = vsetq_lane_s16(borderValue, tcurr, 7);
398 }
399 else if (border == BORDER_MODE_REFLECT101)
400 {
401 tcurr = vsetq_lane_s16(vgetq_lane_s16(tcurr, 1),tcurr, 7);
402 }
403 continue;
404 }
405
406 t0 = vextq_s16(tprev, tcurr, 7);
407 t2 = vextq_s16(tcurr, tnext, 1);
408
409 t0 = vsubq_s16(vqaddq_s16(t0, t2), tc);
410 tc = tcnext;
411
412 t0 = vshlq_n_s16(t0, 1);
413 vst1q_s16(drow + x - 8, t0);
414 }
415 x -= 8;
416 if(x == cols){
417 x--;
418 }
419
420 for( ; x < cols; x++ )
421 {
422 s16 nextx, nextx2;
423 s16 prevx, prevx2;
424 // make border
425 if (border == BORDER_MODE_REPLICATE || border == BORDER_MODE_REFLECT)
426 {
427 prevx = x == 0 ? v0[0] : v0[x-1];
428 prevx2 = x == 0 ? v2[0] : v2[x-1];
429 nextx = x == cols-1 ? v0[x] : v0[x+1];
430 nextx2 = x == cols-1 ? v2[x] : v2[x+1];
431 }
432 else if (border == BORDER_MODE_REFLECT101)
433 {
434 prevx = x == 0 ? v0[1] : v0[x-1];
435 prevx2 = x == 0 ? v2[1] : v2[x-1];
436 nextx = x == cols-1 ? v0[x-1] : v0[x+1];
437 nextx2 = x == cols-1 ? v2[x-1] : v2[x+1];
438 }
439 else //if (border == BORDER_MODE_CONSTANT)
440 {
441 prevx = x == 0 ? borderValue : v0[x-1];
442 prevx2 = x == 0 ? borderValue : v2[x-1];
443 nextx = x == cols-1 ? borderValue : v0[x+1];
444 nextx2 = x == cols-1 ? borderValue : v2[x+1];
445 }
446 s16 res = prevx + nextx - 4*v1[x] + prevx2 + nextx2;
447 *(drow+x) = 2*res;
448 }
449 }
450 #else
451 (void)size;
452 (void)srcBase;
453 (void)srcStride;
454 (void)dstBase;
455 (void)dstStride;
456 (void)border;
457 (void)borderValue;
458 #endif
459 }
460
Laplacian5OpenCV(const Size2D & size,const u8 * srcBase,ptrdiff_t srcStride,s16 * dstBase,ptrdiff_t dstStride,BORDER_MODE border,u8 borderValue)461 void Laplacian5OpenCV(const Size2D &size,
462 const u8 * srcBase, ptrdiff_t srcStride,
463 s16 * dstBase, ptrdiff_t dstStride,
464 BORDER_MODE border, u8 borderValue)
465 {
466 internal::assertSupportedConfiguration(isLaplacianOpenCVSupported(size, border));
467 #ifdef CAROTENE_NEON
468 ptrdiff_t rows = size.height, cols = size.width;
469
470 std::vector<u8> _tmp;
471 u8 *tmp = 0;
472 if (border == BORDER_MODE_CONSTANT)
473 {
474 _tmp.assign(cols + 4,borderValue);
475 tmp = &_tmp[2];
476 }
477
478 for( ptrdiff_t y = 0; y < rows; y++ )
479 {
480 const u8* v0 = 0;
481 const u8* v1 = 0;
482 const u8* v2 = internal::getRowPtr(srcBase, srcStride, y);
483 const u8* v3 = 0;
484 const u8* v4 = 0;
485 // make border
486 if (border == BORDER_MODE_REPLICATE) {
487 v0 = internal::getRowPtr(srcBase, srcStride, y > 1 ? y-2 : 0);
488 v1 = internal::getRowPtr(srcBase, srcStride, y > 0 ? y-1 : 0);
489 v3 = internal::getRowPtr(srcBase, srcStride, y < rows-1 ? y+1 : rows > 0 ? rows-1 : 0);
490 v4 = internal::getRowPtr(srcBase, srcStride, y < rows-2 ? y+2 : rows > 0 ? rows-1 : 0);
491 } else if (border == BORDER_MODE_REFLECT) {
492 v0 = internal::getRowPtr(srcBase, srcStride, y > 1 ? y-2 : rows > 1 ? 1-y : 0);
493 v1 = internal::getRowPtr(srcBase, srcStride, y > 0 ? y-1 : 0);
494 v3 = internal::getRowPtr(srcBase, srcStride, y < rows-1 ? y+1 : rows > 0 ? rows-1 : 0);
495 v4 = internal::getRowPtr(srcBase, srcStride, y < rows-2 ? y+2 : rows > 1 ? 2*rows-(y+3) : 0);
496 } else if (border == BORDER_MODE_REFLECT101) {
497 v0 = internal::getRowPtr(srcBase, srcStride, y > 1 ? y-2 : rows > 2-y ? 2-y : 0); ///check
498 v1 = internal::getRowPtr(srcBase, srcStride, y > 0 ? y-1 : rows > 1 ? 1 : 0);
499 v3 = internal::getRowPtr(srcBase, srcStride, y < rows-1 ? y+1 : rows > 1 ? rows-2 : 0);
500 v4 = internal::getRowPtr(srcBase, srcStride, y < rows-2 ? y+2 : rows > 2 ? 2*rows-(y+4) : 0);///bad if rows=2 y=1 rows - 4 + (2,1)
501 } else if (border == BORDER_MODE_CONSTANT) {
502 v0 = y > 1 ? internal::getRowPtr(srcBase, srcStride, y-2) : tmp;
503 v1 = y > 0 ? internal::getRowPtr(srcBase, srcStride, y-1) : tmp;
504 v3 = y < rows-1 ? internal::getRowPtr(srcBase, srcStride, y+1) : tmp;
505 v4 = y < rows-2 ? internal::getRowPtr(srcBase, srcStride, y+2) : tmp;
506 }
507 s16* drow = internal::getRowPtr(dstBase, dstStride, y);
508
509 int16x8_t tnext, tc, t0;
510 int16x8_t tnext2, tnext3;
511 int16x8_t tnext1Old, tnext2Old, tnext3Old;
512 int16x8_t tnext4OldOldOld, tnext5OldOldOld;
513
514 int16x8_t tcurr1 = vmovq_n_s16(0x0);
515 int16x8_t tnext1 = vmovq_n_s16(0x0);
516 int16x8_t tprev1 = vmovq_n_s16(0x0);
517 int16x8_t tpprev1 = vmovq_n_s16(0x0);
518 int16x8_t tppprev1 = vmovq_n_s16(0x0);
519
520 int16x8_t tnext4Old = vmovq_n_s16(0x0);
521 int16x8_t tnext5Old = vmovq_n_s16(0x0);
522 int16x8_t tnext1OldOld = vmovq_n_s16(0x0);
523 int16x8_t tnext2OldOld = vmovq_n_s16(0x0);
524 int16x8_t tnext3OldOld = vmovq_n_s16(0x0);
525 int16x8_t tnext4OldOld = vmovq_n_s16(0x0);
526 int16x8_t tnext5OldOld = vmovq_n_s16(0x0);
527
528 // do vertical convolution
529 ptrdiff_t x = 0;
530 const ptrdiff_t bcols = y + 3 < rows ? cols : (cols - 8);
531 for( ; x <= bcols; x += 8 )
532 {
533 internal::prefetch(v0 + x);
534 internal::prefetch(v1 + x);
535 internal::prefetch(v2 + x);
536 internal::prefetch(v3 + x);
537 internal::prefetch(v4 + x);
538
539 uint8x8_t x0 = vld1_u8(v0 + x);
540 uint8x8_t x1 = vld1_u8(v1 + x);
541 uint8x8_t x2 = vld1_u8(v2 + x);
542 uint8x8_t x3 = vld1_u8(v3 + x);
543 uint8x8_t x4 = vld1_u8(v4 + x);
544 if(x) {
545 tcurr1 = tnext1;
546 }
547
548 tnext4OldOldOld = tnext4Old;
549 tnext5OldOldOld = tnext5Old;
550 tnext1Old = tnext1OldOld;
551 tnext2Old = tnext2OldOld;
552 tnext3Old = tnext3OldOld;
553 tnext4Old = tnext4OldOld;
554 tnext5Old = tnext5OldOld;
555
556 tnext3 = vreinterpretq_s16_u16(vaddq_u16(vaddl_u8(x3, x2),vaddl_u8(x2, x1)));
557 tnext3 = vshlq_n_s16(tnext3, 1);
558
559 tc = vreinterpretq_s16_u16(vsubl_u8(x4, x2));
560 tnext = vreinterpretq_s16_u16(vsubl_u8(x2, x0));
561 tnext2 = vsubq_s16(tc, tnext);
562
563 tnext1 = vaddq_s16(tnext3, tnext2);
564 // tnext1 = x0 + 2*x1 + 2*x2 + 2*x3 + x4
565
566 tnext2 = vshlq_n_s16(tnext2, 1);
567 // tnext2 = 2*x4 - 4*x2 + 2*x0
568
569 tnext3 = vsubq_s16(tnext2, vshlq_n_s16(tnext3, 1));
570 // tnext3 = 2*x0 - 4*x1 - 12*x2 - 4*x3 + 2*x4
571
572 tnext1OldOld = tnext1;
573 tnext2OldOld = tnext2;
574 tnext3OldOld = tnext3;
575 tnext4OldOld = tnext2;
576 tnext5OldOld = tnext1;
577
578 if(x) {
579 tnext1 = vextq_s16(tnext1Old, tnext1, 2);
580 tcurr1 = vextq_s16(tnext2Old, tnext2, 1);
581 tprev1 = tnext3Old;
582
583 if(x!=8) {
584 tpprev1 = vextq_s16(tnext4OldOldOld, tnext4Old, 7);
585 tppprev1 = vextq_s16(tnext5OldOldOld, tnext5Old, 6);
586 }
587 }
588
589 if(!x) {
590 // make border
591 if (border == BORDER_MODE_REPLICATE) {
592 tpprev1 = vextq_s16(tnext2, tnext2, 7);
593 tpprev1 = vsetq_lane_s16(vgetq_lane_s16(tpprev1, 1),tpprev1, 0);
594
595 tprev1 = vextq_s16(tnext1, tnext1, 6);
596 tprev1 = vsetq_lane_s16(vgetq_lane_s16(tprev1, 2),tprev1, 0);
597 tprev1 = vsetq_lane_s16(vgetq_lane_s16(tprev1, 2),tprev1, 1);
598 } else if (border == BORDER_MODE_REFLECT) {
599 tpprev1 = vextq_s16(tnext2, tnext2, 7);
600 tpprev1 = vsetq_lane_s16(vgetq_lane_s16(tpprev1, 1),tpprev1, 0);
601
602 tprev1 = vextq_s16(tnext1, tnext1, 6);
603 tprev1 = vsetq_lane_s16(vgetq_lane_s16(tprev1, 3),tprev1, 0);
604 tprev1 = vsetq_lane_s16(vgetq_lane_s16(tprev1, 2),tprev1, 1);
605 } else if (border == BORDER_MODE_REFLECT101) {
606 tpprev1 = vextq_s16(tnext2, tnext2, 7);
607 tpprev1 = vsetq_lane_s16(vgetq_lane_s16(tpprev1, 2),tpprev1, 0);
608
609 tprev1 = vextq_s16(tnext1, tnext1, 6);
610 tprev1 = vsetq_lane_s16(vgetq_lane_s16(tprev1, 3),tprev1, 1);
611 tprev1 = vsetq_lane_s16(vgetq_lane_s16(tprev1, 4),tprev1, 0);
612 } else if (border == BORDER_MODE_CONSTANT) {
613 tpprev1 = vextq_s16(tnext2, tnext2, 7);
614 tpprev1 = vsetq_lane_s16(borderValue, tpprev1, 0);
615
616 tprev1 = vextq_s16(tnext1, tnext1, 6);
617 tprev1 = vsetq_lane_s16(borderValue, tprev1, 0);
618 tprev1 = vsetq_lane_s16(borderValue, tprev1, 1);
619 }
620 tppprev1 = tprev1;
621 continue;
622 }
623
624 t0 = vaddq_s16(vaddq_s16(vqaddq_s16(tcurr1, tprev1), vqaddq_s16(tpprev1, tppprev1)), tnext1);
625 t0 = vaddq_s16(t0, t0);
626 vst1q_s16(drow + x - 8, t0);
627 }
628 x -= 8;
629 if(x >= cols - 1)
630 x = cols-2;
631
632 s16 pprevx = 0;
633 s16 prevx = 0;
634 s16 nextx = 0;
635 s16 nnextx = 0;
636
637 for( ; x < cols; x++ )
638 {
639 if (x == 0) {
640 // make border
641 if (border == BORDER_MODE_REPLICATE) {
642 pprevx = v0[0] + 2*v1[0] + 2*v2[0] + 2*v3[0] + v4[0];
643 prevx = 2*v0[0] - 4*v2[0] + 2*v4[0];
644 } else if (border == BORDER_MODE_REFLECT) {
645 pprevx = v0[1] + 2*v1[1] + 2*v2[1] + 2*v3[1] + v4[1];
646 prevx = 2*v0[0] - 4*v2[0] + 2*v4[0];
647 } else if (border == BORDER_MODE_REFLECT101) {
648 pprevx = v0[2] + 2*v1[2] + 2*v2[2] + 2*v3[2] + v4[2];
649 prevx = 2*v0[1] - 4*v2[1] + 2*v4[1];
650 } else if (border == BORDER_MODE_CONSTANT) {
651 pprevx = 8 * borderValue;
652 prevx = 0;
653 }
654 } else if (x == 1) {
655 // make border
656 if (border == BORDER_MODE_REPLICATE || border == BORDER_MODE_REFLECT) {
657 pprevx = v0[0] + 2*v1[0] + 2*v2[0] + 2*v3[0] + v4[0];
658 } else if (border == BORDER_MODE_REFLECT101) {
659 pprevx = v0[1] + 2*v1[1] + 2*v2[1] + 2*v3[1] + v4[1];
660 } else if (border == BORDER_MODE_CONSTANT) {
661 pprevx = 8 * borderValue;
662 }
663 prevx = 2*v0[0] - 4*v2[0] + 2*v4[0];
664 } else {
665 pprevx = v0[x-2] + 2*v1[x-2] + 2*v2[x-2] + 2*v3[x-2] + v4[x-2];
666 prevx = 2*v0[x-1] - 4*v2[x-1] + 2*v4[x-1];
667 }
668 s16 currx = 2*v0[x] - 4*v1[x] - 12*v2[x] - 4*v3[x] + 2*v4[x];
669 if (x == cols-1) {
670 // make border
671 if (border == BORDER_MODE_REPLICATE) {
672 nextx = 2*v0[x] - 4*v2[x] + 2*v4[x];
673 nnextx = v0[x] + 2*v1[x] + 2*v2[x] + 2*v3[x] + v4[x];
674 } else if (border == BORDER_MODE_REFLECT) {
675 nextx = 2*v0[x] - 4*v2[x] + 2*v4[x];
676 nnextx = v0[x-1] + 2*v1[x-1] + 2*v2[x-1] + 2*v3[x-1] + v4[x-1];
677 } else if (border == BORDER_MODE_REFLECT101) {
678 nextx = 2*v0[x-1] - 4*v2[x-1] + 2*v4[x-1];
679 nnextx = v0[x-2] + 2*v1[x-2] + 2*v2[x-2] + 2*v3[x-2] + v4[x-2];
680 } else if (border == BORDER_MODE_CONSTANT) {
681 nextx = 0;
682 nnextx = 8 * borderValue;
683 }
684 } else if (x == cols-2) {
685 // make border
686 if (border == BORDER_MODE_REPLICATE || border == BORDER_MODE_REFLECT) {
687 nnextx = v0[x+1] + 2*v1[x+1] + 2*v2[x+1] + 2*v3[x+1] + v4[x+1];
688 } else if (border == BORDER_MODE_REFLECT101) {
689 nnextx = v0[x] + 2*v1[x] + 2*v2[x] + 2*v3[x] + v4[x];
690 } else if (border == BORDER_MODE_CONSTANT) {
691 nnextx = 8 * borderValue;
692 }
693 nextx = 2*v0[x+1] - 4*v2[x+1] + 2*v4[x+1];
694 } else {
695 nextx = 2*v0[x+1] - 4*v2[x+1] + 2*v4[x+1];
696 nnextx = v0[x+2] + 2*v1[x+2] + 2*v2[x+2] + 2*v3[x+2] + v4[x+2];
697 }
698 s16 res = pprevx + prevx + currx + nextx + nnextx;
699 *(drow+x) = 2*res;
700 }
701 }
702 #else
703 (void)size;
704 (void)srcBase;
705 (void)srcStride;
706 (void)dstBase;
707 (void)dstStride;
708 (void)border;
709 (void)borderValue;
710 #endif
711 }
712
713 } // namespace CAROTENE_NS
714