• Home
  • Line#
  • Scopes#
  • Navigate#
  • Raw
  • Download
1 /*
2  * By downloading, copying, installing or using the software you agree to this license.
3  * If you do not agree to this license, do not download, install,
4  * copy or use the software.
5  *
6  *
7  *                           License Agreement
8  *                For Open Source Computer Vision Library
9  *                        (3-clause BSD License)
10  *
11  * Copyright (C) 2014, NVIDIA Corporation, all rights reserved.
12  * Third party copyrights are property of their respective owners.
13  *
14  * Redistribution and use in source and binary forms, with or without modification,
15  * are permitted provided that the following conditions are met:
16  *
17  *   * Redistributions of source code must retain the above copyright notice,
18  *     this list of conditions and the following disclaimer.
19  *
20  *   * Redistributions in binary form must reproduce the above copyright notice,
21  *     this list of conditions and the following disclaimer in the documentation
22  *     and/or other materials provided with the distribution.
23  *
24  *   * Neither the names of the copyright holders nor the names of the contributors
25  *     may be used to endorse or promote products derived from this software
26  *     without specific prior written permission.
27  *
28  * This software is provided by the copyright holders and contributors "as is" and
29  * any express or implied warranties, including, but not limited to, the implied
30  * warranties of merchantability and fitness for a particular purpose are disclaimed.
31  * In no event shall copyright holders or contributors be liable for any direct,
32  * indirect, incidental, special, exemplary, or consequential damages
33  * (including, but not limited to, procurement of substitute goods or services;
34  * loss of use, data, or profits; or business interruption) however caused
35  * and on any theory of liability, whether in contract, strict liability,
36  * or tort (including negligence or otherwise) arising in any way out of
37  * the use of this software, even if advised of the possibility of such damage.
38  */
39 
40 #include "common.hpp"
41 #include "saturate_cast.hpp"
42 
43 namespace CAROTENE_NS {
44 
isConvolutionSupported(const Size2D & size,const Size2D & ksize,BORDER_MODE border)45 bool isConvolutionSupported(const Size2D &size, const Size2D &ksize,
46                             BORDER_MODE border)
47 {
48     return isSupportedConfiguration() && size.width >= 8 &&
49         (border == BORDER_MODE_CONSTANT ||
50             border == BORDER_MODE_REPLICATE) &&
51         (ksize.width == 3) && (ksize.height == 3);
52 }
53 
#ifdef CAROTENE_NEON

namespace {

// Arithmetic shift right of four s32 lanes by a compile-time amount.
// vshrq_n_s32 demands an immediate operand, so the run-time `scale`
// argument of convolution() is handled by instantiating one function
// per shift value and dispatching through a table of these pointers.
template <int shift>
int32x4_t vshrq_s32(int32x4_t value)
{
    return vshrq_n_s32(value, shift);
}

// Shift by 0 is specialized to a plain pass-through: the vshrq_n_s32
// immediate must be non-zero, so the generic template cannot be
// instantiated with shift == 0.
template <>
int32x4_t vshrq_s32<0>(int32x4_t value)
{
    return value;
}

} // namespace

// Common signature of the instantiated shift helpers above, used to
// build the dispatch table in convolution().
typedef int32x4_t (* vshrq_s32_func)(int32x4_t value);

#endif
75 
/*
 * 3x3 convolution of an 8-bit single-channel image.
 *
 * Each output pixel is the sum of the 3x3 neighborhood weighted by
 * kernelBase, shifted right by `scale` bits and saturated to u8.
 * The kernel is applied rotated by 180 degrees (true convolution):
 * the row above the pixel is multiplied by kernelBase[8..6] and the
 * row below by kernelBase[2..0], as seen in the index mapping below.
 *
 * border / borderValue select the extrapolation for out-of-image
 * neighbors (constant or replicate — enforced via
 * isConvolutionSupported, which is asserted on entry).
 *
 * NOTE(review): the dispatch table has 33 entries, so `scale` is
 * assumed to be in [0, 32] — no bounds check is performed here.
 */
void convolution(const Size2D &size,
                 const u8 * srcBase, ptrdiff_t srcStride,
                 u8 * dstBase, ptrdiff_t dstStride,
                 BORDER_MODE border, u8 borderValue,
                 const Size2D & ksize, s16 * kernelBase, u32 scale)
{
    internal::assertSupportedConfiguration(isConvolutionSupported(size, ksize, border));
#ifdef CAROTENE_NEON
    const uint8x8_t v_zero_u8 = vdup_n_u8(0);
    const uint8x8_t v_border = vdup_n_u8(borderValue);    // splatted constant-border pixel
    const int32x4_t v_zero_s32 = vdupq_n_s32(0);

    // Sliding window of three 8-pixel vectors (previous/current/next
    // horizontal position) for each of the three source rows.
    uint8x8_t tprev[3] = { v_zero_u8, v_zero_u8, v_zero_u8 },
              tcurr[3] = { v_zero_u8, v_zero_u8, v_zero_u8 },
              tnext[3] = { v_zero_u8, v_zero_u8, v_zero_u8 };
    uint8x8_t t0 = v_zero_u8, t1 = v_zero_u8, t2 = v_zero_u8;

    ptrdiff_t width = (ptrdiff_t)size.width, height = (ptrdiff_t)size.height;
    // Maps the run-time `scale` to an instantiation with the matching
    // immediate shift (see vshrq_s32 above).
    static const vshrq_s32_func vshrq_s32_a[33] =
    {
        vshrq_s32<0>,
        vshrq_s32<1>,
        vshrq_s32<2>,
        vshrq_s32<3>,
        vshrq_s32<4>,
        vshrq_s32<5>,
        vshrq_s32<6>,
        vshrq_s32<7>,
        vshrq_s32<8>,
        vshrq_s32<9>,
        vshrq_s32<10>,
        vshrq_s32<11>,
        vshrq_s32<12>,
        vshrq_s32<13>,
        vshrq_s32<14>,
        vshrq_s32<15>,
        vshrq_s32<16>,
        vshrq_s32<17>,
        vshrq_s32<18>,
        vshrq_s32<19>,
        vshrq_s32<20>,
        vshrq_s32<21>,
        vshrq_s32<22>,
        vshrq_s32<23>,
        vshrq_s32<24>,
        vshrq_s32<25>,
        vshrq_s32<26>,
        vshrq_s32<27>,
        vshrq_s32<28>,
        vshrq_s32<29>,
        vshrq_s32<30>,
        vshrq_s32<31>,
        vshrq_s32<32>
    };
    vshrq_s32_func vshrq_s32_p = vshrq_s32_a[scale];

    for (ptrdiff_t y = 0; y < height; ++y)
    {
        // Row pointers for y-1 / y / y+1. A NULL srow0/srow2 marks a
        // constant-border row outside the image; replicate mode clamps
        // to the first/last row instead.
        const u8 * srow0 = y == 0 && border == BORDER_MODE_CONSTANT ? NULL : internal::getRowPtr(srcBase, srcStride, std::max<ptrdiff_t>(y - 1, 0));
        const u8 * srow1 = internal::getRowPtr(srcBase, srcStride, y);
        const u8 * srow2 = y + 1 == height && border == BORDER_MODE_CONSTANT ? NULL : internal::getRowPtr(srcBase, srcStride, std::min(y + 1, height - 1));
        u8 * drow = internal::getRowPtr(dstBase, dstStride, y);

        // Scalar sliding window (one column of three rows) for the
        // plain-CPU tail loop below.
        u8 prevx[3] = { 0, 0, 0 },
           currx[3] = { 0, 0, 0 },
           nextx[3] = { 0, 0, 0 };
        ptrdiff_t x = 0;
        // On the last two rows stop the vector loop one iteration
        // early so the full-width loads below stay inside the buffer;
        // the scalar tail finishes the row.
        const ptrdiff_t bwidth = y + 2 < height ? width : (width - 8);

        // perform vertical convolution
        for ( ; x <= bwidth; x += 8)
        {
            internal::prefetch(srow0 + x);
            internal::prefetch(srow1 + x);
            internal::prefetch(srow2 + x);

            uint8x8_t x0 = !srow0 ? v_border : vld1_u8(srow0 + x);
            uint8x8_t x1 = vld1_u8(srow1 + x);
            uint8x8_t x2 = !srow2 ? v_border : vld1_u8(srow2 + x);

            // calculate values for plain CPU part below if needed
            if (x + 8 >= bwidth)
            {
                ptrdiff_t x3 = x == width ? width - 1 : x;
                ptrdiff_t x4 = border == BORDER_MODE_CONSTANT ? x3 - 1 : std::max<ptrdiff_t>(x3 - 1, 0);

                if (border == BORDER_MODE_CONSTANT && x4 < 0)
                    prevx[0] = prevx[1] = prevx[2] = borderValue;
                else
                {
                    prevx[0] = srow0 ? srow0[x4] : borderValue;
                    prevx[1] =         srow1[x4]              ;
                    prevx[2] = srow2 ? srow2[x4] : borderValue;
                }

                currx[0] = srow0 ? srow0[x3] : borderValue;
                currx[1] =         srow1[x3]              ;
                currx[2] = srow2 ? srow2[x3] : borderValue;
            }

            // make shift: advance the 3-vector window one step right
            if (x)
            {
                tprev[0] = tcurr[0];
                tcurr[0] = tnext[0];

                tprev[1] = tcurr[1];
                tcurr[1] = tnext[1];

                tprev[2] = tcurr[2];
                tcurr[2] = tnext[2];
            }

            tnext[0] = x0;
            tnext[1] = x1;
            tnext[2] = x2;

            // make extrapolation for the first elements: the first
            // iteration only primes the window (no output is stored),
            // seeding tcurr with border data for the left edge.
            if (!x)
            {
                // make border
                if (border == BORDER_MODE_CONSTANT)
                    tcurr[0] = tcurr[1] = tcurr[2] = v_border;
                else if (border == BORDER_MODE_REPLICATE)
                {
                    tcurr[0] = vdup_n_u8(vget_lane_u8(tnext[0], 0));
                    tcurr[1] = vdup_n_u8(vget_lane_u8(tnext[1], 0));
                    tcurr[2] = vdup_n_u8(vget_lane_u8(tnext[2], 0));
                }

                continue;
            }

            // Accumulators for the low/high 4 lanes of the 8 outputs.
            int32x4_t v_dst0 = v_zero_s32, v_dst1 = v_zero_s32;

            // Top source row (y-1), weighted by kernel row kernelBase[8..6].
            {
                // combine 3 "shifted" vectors: left/center/right
                // neighbors of the 8 pixels being computed
                t0 = vext_u8(tprev[0], tcurr[0], 7);
                t1 = tcurr[0];
                t2 = vext_u8(tcurr[0], tnext[0], 1);

                // widen u8 -> s16 so vmlal_n_s16 can accumulate into s32
                int16x8_t t0_16s = vreinterpretq_s16_u16(vmovl_u8(t0));
                int16x8_t t1_16s = vreinterpretq_s16_u16(vmovl_u8(t1));
                int16x8_t t2_16s = vreinterpretq_s16_u16(vmovl_u8(t2));

                v_dst0 = vmlal_n_s16(v_dst0, vget_low_s16(t0_16s), kernelBase[8]);
                v_dst0 = vmlal_n_s16(v_dst0, vget_low_s16(t1_16s), kernelBase[7]);
                v_dst0 = vmlal_n_s16(v_dst0, vget_low_s16(t2_16s), kernelBase[6]);

                v_dst1 = vmlal_n_s16(v_dst1, vget_high_s16(t0_16s), kernelBase[8]);
                v_dst1 = vmlal_n_s16(v_dst1, vget_high_s16(t1_16s), kernelBase[7]);
                v_dst1 = vmlal_n_s16(v_dst1, vget_high_s16(t2_16s), kernelBase[6]);
            }

            // Middle source row (y), weighted by kernel row kernelBase[5..3].
            {
                // combine 3 "shifted" vectors
                t0 = vext_u8(tprev[1], tcurr[1], 7);
                t1 = tcurr[1];
                t2 = vext_u8(tcurr[1], tnext[1], 1);

                int16x8_t t0_16s = vreinterpretq_s16_u16(vmovl_u8(t0));
                int16x8_t t1_16s = vreinterpretq_s16_u16(vmovl_u8(t1));
                int16x8_t t2_16s = vreinterpretq_s16_u16(vmovl_u8(t2));

                v_dst0 = vmlal_n_s16(v_dst0, vget_low_s16(t0_16s), kernelBase[5]);
                v_dst0 = vmlal_n_s16(v_dst0, vget_low_s16(t1_16s), kernelBase[4]);
                v_dst0 = vmlal_n_s16(v_dst0, vget_low_s16(t2_16s), kernelBase[3]);

                v_dst1 = vmlal_n_s16(v_dst1, vget_high_s16(t0_16s), kernelBase[5]);
                v_dst1 = vmlal_n_s16(v_dst1, vget_high_s16(t1_16s), kernelBase[4]);
                v_dst1 = vmlal_n_s16(v_dst1, vget_high_s16(t2_16s), kernelBase[3]);
            }

            // Bottom source row (y+1), weighted by kernel row kernelBase[2..0].
            {
                // combine 3 "shifted" vectors
                t0 = vext_u8(tprev[2], tcurr[2], 7);
                t1 = tcurr[2];
                t2 = vext_u8(tcurr[2], tnext[2], 1);

                int16x8_t t0_16s = vreinterpretq_s16_u16(vmovl_u8(t0));
                int16x8_t t1_16s = vreinterpretq_s16_u16(vmovl_u8(t1));
                int16x8_t t2_16s = vreinterpretq_s16_u16(vmovl_u8(t2));

                v_dst0 = vmlal_n_s16(v_dst0, vget_low_s16(t0_16s), kernelBase[2]);
                v_dst0 = vmlal_n_s16(v_dst0, vget_low_s16(t1_16s), kernelBase[1]);
                v_dst0 = vmlal_n_s16(v_dst0, vget_low_s16(t2_16s), kernelBase[0]);

                v_dst1 = vmlal_n_s16(v_dst1, vget_high_s16(t0_16s), kernelBase[2]);
                v_dst1 = vmlal_n_s16(v_dst1, vget_high_s16(t1_16s), kernelBase[1]);
                v_dst1 = vmlal_n_s16(v_dst1, vget_high_s16(t2_16s), kernelBase[0]);
            }


            // make scale (arithmetic shift right by `scale` bits)
            v_dst0 = vshrq_s32_p(v_dst0);
            v_dst1 = vshrq_s32_p(v_dst1);

            // and add them: saturating narrow s32 -> u16 -> u8; the
            // window is one iteration behind, hence the store at x - 8
            vst1_u8(drow + x - 8, vqmovn_u16(vcombine_u16(vqmovun_s32(v_dst0),
                                                          vqmovun_s32(v_dst1))));
        }

        // Scalar tail: re-process from the last unstored column.
        x -= 8;
        if (x == width)
            --x;

        for ( ; x < width; ++x)
        {
            // make extrapolation for the last elements
            if (x + 1 >= width)
            {
                if (border == BORDER_MODE_CONSTANT)
                {
                    nextx[0] = borderValue;
                    nextx[1] = borderValue;
                    nextx[2] = borderValue;
                }
                else if (border == BORDER_MODE_REPLICATE)
                {
                    nextx[0] = srow0[x];
                    nextx[1] = srow1[x];
                    nextx[2] = srow2[x];
                }
            }
            else
            {
                nextx[0] = srow0 ? srow0[x + 1] : borderValue;
                nextx[1] =         srow1[x + 1]              ;
                nextx[2] = srow2 ? srow2[x + 1] : borderValue;
            }

            // Same 180-degree-rotated kernel indexing as the vector path.
            s32 val = 0;
            for (s32 _y = 0; _y < 3; ++_y)
                val += prevx[_y] * kernelBase[(2 - _y) * 3 + 2] +
                       currx[_y] * kernelBase[(2 - _y) * 3 + 1] +
                       nextx[_y] * kernelBase[(2 - _y) * 3 + 0];

            drow[x] = internal::saturate_cast<u8>(val >> scale);

            // make shift
            prevx[0] = currx[0];
            currx[0] = nextx[0];

            prevx[1] = currx[1];
            currx[1] = nextx[1];

            prevx[2] = currx[2];
            currx[2] = nextx[2];
        }
    }
#else
    (void)size;
    (void)srcBase;
    (void)srcStride;
    (void)dstBase;
    (void)dstStride;
    (void)border;
    (void)borderValue;
    (void)ksize;
    (void)kernelBase;
    (void)scale;
#endif
}
339 
340 } // namespace CAROTENE_NS
341