• Home
  • Line#
  • Scopes#
  • Navigate#
  • Raw
  • Download
1 /*M///////////////////////////////////////////////////////////////////////////////////////
2 //
3 //  IMPORTANT: READ BEFORE DOWNLOADING, COPYING, INSTALLING OR USING.
4 //
5 //  By downloading, copying, installing or using the software you agree to this license.
6 //  If you do not agree to this license, do not download, install,
7 //  copy or use the software.
8 //
9 //
10 //                           License Agreement
11 //                For Open Source Computer Vision Library
12 //
13 // Copyright (C) 2000-2008, Intel Corporation, all rights reserved.
14 // Copyright (C) 2009, Willow Garage Inc., all rights reserved.
15 // Copyright (C) 2014, Itseez Inc., all rights reserved.
16 // Third party copyrights are property of their respective owners.
17 //
18 // Redistribution and use in source and binary forms, with or without modification,
19 // are permitted provided that the following conditions are met:
20 //
21 //   * Redistribution's of source code must retain the above copyright notice,
22 //     this list of conditions and the following disclaimer.
23 //
24 //   * Redistribution's in binary form must reproduce the above copyright notice,
25 //     this list of conditions and the following disclaimer in the documentation
26 //     and/or other materials provided with the distribution.
27 //
28 //   * The name of the copyright holders may not be used to endorse or promote products
29 //     derived from this software without specific prior written permission.
30 //
31 // This software is provided by the copyright holders and contributors "as is" and
32 // any express or implied warranties, including, but not limited to, the implied
33 // warranties of merchantability and fitness for a particular purpose are disclaimed.
34 // In no event shall the Intel Corporation or contributors be liable for any direct,
35 // indirect, incidental, special, exemplary, or consequential damages
36 // (including, but not limited to, procurement of substitute goods or services;
37 // loss of use, data, or profits; or business interruption) however caused
38 // and on any theory of liability, whether in contract, strict liability,
39 // or tort (including negligence or otherwise) arising in any way out of
40 // the use of this software, even if advised of the possibility of such damage.
41 //
42 //M*/
43 
44 #include "precomp.hpp"
45 #include "opencl_kernels_imgproc.hpp"
46 
47 namespace cv
48 {
49 
50 template <typename T, typename AT>
51 struct Acc_SIMD
52 {
operator ()cv::Acc_SIMD53     int operator() (const T *, AT *, const uchar *, int, int) const
54     {
55         return 0;
56     }
57 };
58 
59 template <typename T, typename AT>
60 struct AccSqr_SIMD
61 {
operator ()cv::AccSqr_SIMD62     int operator() (const T *, AT *, const uchar *, int, int) const
63     {
64         return 0;
65     }
66 };
67 
68 template <typename T, typename AT>
69 struct AccProd_SIMD
70 {
operator ()cv::AccProd_SIMD71     int operator() (const T *, const T *, AT *, const uchar *, int, int) const
72     {
73         return 0;
74     }
75 };
76 
77 template <typename T, typename AT>
78 struct AccW_SIMD
79 {
operator ()cv::AccW_SIMD80     int operator() (const T *, AT *, const uchar *, int, int, AT) const
81     {
82         return 0;
83     }
84 };
85 
86 #if CV_NEON
87 
// NEON accumulate (dst += src) for uchar source into float accumulator.
// Unmasked path treats the row as a flat buffer of len*cn bytes, 16 at a time;
// the masked path is implemented for single-channel only. Returns the number
// of elements processed so the scalar caller can finish the tail.
template <>
struct Acc_SIMD<uchar, float>
{
    int operator() (const uchar * src, float * dst, const uchar * mask, int len, int cn) const
    {
        int x = 0;

        if (!mask)
        {
            len *= cn;
            for ( ; x <= len - 16; x += 16)
            {
                // widen u8 -> u16, then each u16 half -> u32 -> f32 and add
                uint8x16_t v_src = vld1q_u8(src + x);
                uint16x8_t v_src0 = vmovl_u8(vget_low_u8(v_src)), v_src1 = vmovl_u8(vget_high_u8(v_src));

                vst1q_f32(dst + x, vaddq_f32(vld1q_f32(dst + x), vcvtq_f32_u32(vmovl_u16(vget_low_u16(v_src0)))));
                vst1q_f32(dst + x + 4, vaddq_f32(vld1q_f32(dst + x + 4), vcvtq_f32_u32(vmovl_u16(vget_high_u16(v_src0)))));
                vst1q_f32(dst + x + 8, vaddq_f32(vld1q_f32(dst + x + 8), vcvtq_f32_u32(vmovl_u16(vget_low_u16(v_src1)))));
                vst1q_f32(dst + x + 12, vaddq_f32(vld1q_f32(dst + x + 12), vcvtq_f32_u32(vmovl_u16(vget_high_u16(v_src1)))));
            }
        }
        else if (cn == 1)
        {
            uint8x16_t v_255 = vdupq_n_u8(255), v_0 = vdupq_n_u8(0);

            for ( ; x <= len - 16; x += 16)
            {
                // 255 ^ (mask == 0) is 0xFF where mask != 0; AND-ing zeroes out
                // masked-off source bytes so the plain add below is a no-op there
                uint8x16_t v_src = vandq_u8(vld1q_u8(src + x), veorq_u8(v_255, vceqq_u8(vld1q_u8(mask + x), v_0)));
                uint16x8_t v_src0 = vmovl_u8(vget_low_u8(v_src)), v_src1 = vmovl_u8(vget_high_u8(v_src));

                vst1q_f32(dst + x, vaddq_f32(vld1q_f32(dst + x), vcvtq_f32_u32(vmovl_u16(vget_low_u16(v_src0)))));
                vst1q_f32(dst + x + 4, vaddq_f32(vld1q_f32(dst + x + 4), vcvtq_f32_u32(vmovl_u16(vget_high_u16(v_src0)))));
                vst1q_f32(dst + x + 8, vaddq_f32(vld1q_f32(dst + x + 8), vcvtq_f32_u32(vmovl_u16(vget_low_u16(v_src1)))));
                vst1q_f32(dst + x + 12, vaddq_f32(vld1q_f32(dst + x + 12), vcvtq_f32_u32(vmovl_u16(vget_high_u16(v_src1)))));
            }
        }

        return x;
    }
};
128 
// NEON accumulate (dst += src) for ushort source into float accumulator.
// Only the unmasked case is vectorized (8 elements per iteration); masked
// input falls through with x == 0 and is handled by the scalar loops.
template <>
struct Acc_SIMD<ushort, float>
{
    int operator() (const ushort * src, float * dst, const uchar * mask, int len, int cn) const
    {
        int x = 0;

        if (!mask)
        {
            len *= cn;
            for ( ; x <= len - 8; x += 8)
            {
                // widen u16 -> u32, convert to f32, add to accumulator
                uint16x8_t v_src = vld1q_u16(src + x);
                uint32x4_t v_src0 = vmovl_u16(vget_low_u16(v_src)), v_src1 = vmovl_u16(vget_high_u16(v_src));

                vst1q_f32(dst + x, vaddq_f32(vld1q_f32(dst + x), vcvtq_f32_u32(v_src0)));
                vst1q_f32(dst + x + 4, vaddq_f32(vld1q_f32(dst + x + 4), vcvtq_f32_u32(v_src1)));
            }
        }

        return x;
    }
};
152 
// NEON accumulate (dst += src) for float source and float accumulator.
// Unmasked case only; two 4-lane adds per iteration (8 floats).
template <>
struct Acc_SIMD<float, float>
{
    int operator() (const float * src, float * dst, const uchar * mask, int len, int cn) const
    {
        int x = 0;

        if (!mask)
        {
            len *= cn;
            for ( ; x <= len - 8; x += 8)
            {
                vst1q_f32(dst + x, vaddq_f32(vld1q_f32(dst + x), vld1q_f32(src + x)));
                vst1q_f32(dst + x + 4, vaddq_f32(vld1q_f32(dst + x + 4), vld1q_f32(src + x + 4)));
            }
        }

        return x;
    }
};
173 
// NEON accumulate-square (dst += src*src) for uchar -> float.
// The u8*u8 products are formed with vmull (widening to u16, cannot overflow),
// then widened to u32 and converted to f32. Masked path is 1-channel only.
template <>
struct AccSqr_SIMD<uchar, float>
{
    int operator() (const uchar * src, float * dst, const uchar * mask, int len, int cn) const
    {
        int x = 0;

        if (!mask)
        {
            len *= cn;
            for ( ; x <= len - 16; x += 16)
            {
                uint8x16_t v_src = vld1q_u8(src + x);
                uint8x8_t v_src_0 = vget_low_u8(v_src), v_src_1 = vget_high_u8(v_src);
                // widening square: u8 x u8 -> u16
                uint16x8_t v_src0 = vmull_u8(v_src_0, v_src_0), v_src1 = vmull_u8(v_src_1, v_src_1);

                vst1q_f32(dst + x, vaddq_f32(vld1q_f32(dst + x), vcvtq_f32_u32(vmovl_u16(vget_low_u16(v_src0)))));
                vst1q_f32(dst + x + 4, vaddq_f32(vld1q_f32(dst + x + 4), vcvtq_f32_u32(vmovl_u16(vget_high_u16(v_src0)))));
                vst1q_f32(dst + x + 8, vaddq_f32(vld1q_f32(dst + x + 8), vcvtq_f32_u32(vmovl_u16(vget_low_u16(v_src1)))));
                vst1q_f32(dst + x + 12, vaddq_f32(vld1q_f32(dst + x + 12), vcvtq_f32_u32(vmovl_u16(vget_high_u16(v_src1)))));
            }
        }
        else if (cn == 1)
        {
            uint8x16_t v_255 = vdupq_n_u8(255), v_0 = vdupq_n_u8(0);

            for ( ; x <= len - 16; x += 16)
            {
                // zero out masked-off source bytes first, then square and add
                uint8x16_t v_src = vandq_u8(vld1q_u8(src + x), veorq_u8(v_255, vceqq_u8(vld1q_u8(mask + x), v_0)));
                uint8x8_t v_src_0 = vget_low_u8(v_src), v_src_1 = vget_high_u8(v_src);
                uint16x8_t v_src0 = vmull_u8(v_src_0, v_src_0), v_src1 = vmull_u8(v_src_1, v_src_1);

                vst1q_f32(dst + x, vaddq_f32(vld1q_f32(dst + x), vcvtq_f32_u32(vmovl_u16(vget_low_u16(v_src0)))));
                vst1q_f32(dst + x + 4, vaddq_f32(vld1q_f32(dst + x + 4), vcvtq_f32_u32(vmovl_u16(vget_high_u16(v_src0)))));
                vst1q_f32(dst + x + 8, vaddq_f32(vld1q_f32(dst + x + 8), vcvtq_f32_u32(vmovl_u16(vget_low_u16(v_src1)))));
                vst1q_f32(dst + x + 12, vaddq_f32(vld1q_f32(dst + x + 12), vcvtq_f32_u32(vmovl_u16(vget_high_u16(v_src1)))));
            }
        }

        return x;
    }
};
216 
// NEON accumulate-square (dst += src*src) for ushort -> float.
// Uses widening vmull (u16 x u16 -> u32) so the square cannot overflow.
// Masked 1-channel path expands each 8-bit mask byte to a 16-bit lane mask
// by zipping the mask vector with itself.
template <>
struct AccSqr_SIMD<ushort, float>
{
    int operator() (const ushort * src, float * dst, const uchar * mask, int len, int cn) const
    {
        int x = 0;

        if (!mask)
        {
            len *= cn;
            for ( ; x <= len - 8; x += 8)
            {
                uint16x8_t v_src = vld1q_u16(src + x);
                uint16x4_t v_src_0 = vget_low_u16(v_src), v_src_1 = vget_high_u16(v_src);
                uint32x4_t v_src0 = vmull_u16(v_src_0, v_src_0), v_src1 = vmull_u16(v_src_1, v_src_1);

                vst1q_f32(dst + x, vaddq_f32(vld1q_f32(dst + x), vcvtq_f32_u32(v_src0)));
                vst1q_f32(dst + x + 4, vaddq_f32(vld1q_f32(dst + x + 4), vcvtq_f32_u32(v_src1)));
            }
        }
        else if (cn == 1)
        {
            uint8x8_t v_255 = vdup_n_u8(255), v_0 = vdup_n_u8(0);

            for ( ; x <= len - 8; x += 8)
            {
                // build a 0x00/0xFF byte mask, duplicate each byte into both
                // halves of a 16-bit lane, then AND it with the source
                uint8x8_t v_mask_src = veor_u8(v_255, vceq_u8(vld1_u8(mask + x), v_0));
                uint8x8x2_t v_mask_zp = vzip_u8(v_mask_src, v_mask_src);
                uint16x8_t v_mask = vreinterpretq_u16_u8(vcombine_u8(v_mask_zp.val[0], v_mask_zp.val[1])),
                           v_src = vandq_u16(vld1q_u16(src + x), v_mask);

                uint16x4_t v_src_0 = vget_low_u16(v_src), v_src_1 = vget_high_u16(v_src);
                uint32x4_t v_src0 = vmull_u16(v_src_0, v_src_0), v_src1 = vmull_u16(v_src_1, v_src_1);

                vst1q_f32(dst + x, vaddq_f32(vld1q_f32(dst + x), vcvtq_f32_u32(v_src0)));
                vst1q_f32(dst + x + 4, vaddq_f32(vld1q_f32(dst + x + 4), vcvtq_f32_u32(v_src1)));
            }
        }

        return x;
    }
};
259 
// NEON accumulate-square (dst += src*src) for float -> float.
// Unmasked case only; vmlaq fuses the square with the accumulate.
template <>
struct AccSqr_SIMD<float, float>
{
    int operator() (const float * src, float * dst, const uchar * mask, int len, int cn) const
    {
        int x = 0;

        if (!mask)
        {
            len *= cn;
            for ( ; x <= len - 8; x += 8)
            {
                float32x4_t v_src = vld1q_f32(src + x);
                vst1q_f32(dst + x, vmlaq_f32(vld1q_f32(dst + x), v_src, v_src));

                v_src = vld1q_f32(src + x + 4);
                vst1q_f32(dst + x + 4, vmlaq_f32(vld1q_f32(dst + x + 4), v_src, v_src));
            }
        }

        return x;
    }
};
283 
// NEON accumulate-product (dst += src1*src2) for uchar -> float.
// Widening vmull keeps u8 x u8 products exact in u16. Masked 1-channel path
// zeroes both sources where the mask is 0 before multiplying.
template <>
struct AccProd_SIMD<uchar, float>
{
    int operator() (const uchar * src1, const uchar * src2, float * dst, const uchar * mask, int len, int cn) const
    {
        int x = 0;

        if (!mask)
        {
            len *= cn;
            for ( ; x <= len - 16; x += 16)
            {
                uint8x16_t v_1src = vld1q_u8(src1 + x), v_2src = vld1q_u8(src2 + x);
                uint16x8_t v_src0 = vmull_u8(vget_low_u8(v_1src), vget_low_u8(v_2src)),
                           v_src1 = vmull_u8(vget_high_u8(v_1src), vget_high_u8(v_2src));

                vst1q_f32(dst + x, vaddq_f32(vld1q_f32(dst + x), vcvtq_f32_u32(vmovl_u16(vget_low_u16(v_src0)))));
                vst1q_f32(dst + x + 4, vaddq_f32(vld1q_f32(dst + x + 4), vcvtq_f32_u32(vmovl_u16(vget_high_u16(v_src0)))));
                vst1q_f32(dst + x + 8, vaddq_f32(vld1q_f32(dst + x + 8), vcvtq_f32_u32(vmovl_u16(vget_low_u16(v_src1)))));
                vst1q_f32(dst + x + 12, vaddq_f32(vld1q_f32(dst + x + 12), vcvtq_f32_u32(vmovl_u16(vget_high_u16(v_src1)))));
            }
        }
        else if (cn == 1)
        {
            uint8x16_t v_255 = vdupq_n_u8(255), v_0 = vdupq_n_u8(0);

            for ( ; x <= len - 16; x += 16)
            {
                // 0xFF where mask != 0; masking one operand would suffice for
                // the product, both are masked here
                uint8x16_t v_mask = veorq_u8(v_255, vceqq_u8(vld1q_u8(mask + x), v_0));
                uint8x16_t v_1src = vandq_u8(vld1q_u8(src1 + x), v_mask), v_2src = vandq_u8(vld1q_u8(src2 + x), v_mask);
                uint16x8_t v_src0 = vmull_u8(vget_low_u8(v_1src), vget_low_u8(v_2src)),
                           v_src1 = vmull_u8(vget_high_u8(v_1src), vget_high_u8(v_2src));

                vst1q_f32(dst + x, vaddq_f32(vld1q_f32(dst + x), vcvtq_f32_u32(vmovl_u16(vget_low_u16(v_src0)))));
                vst1q_f32(dst + x + 4, vaddq_f32(vld1q_f32(dst + x + 4), vcvtq_f32_u32(vmovl_u16(vget_high_u16(v_src0)))));
                vst1q_f32(dst + x + 8, vaddq_f32(vld1q_f32(dst + x + 8), vcvtq_f32_u32(vmovl_u16(vget_low_u16(v_src1)))));
                vst1q_f32(dst + x + 12, vaddq_f32(vld1q_f32(dst + x + 12), vcvtq_f32_u32(vmovl_u16(vget_high_u16(v_src1)))));
            }
        }

        return x;
    }
};
327 
// NEON accumulate-product (dst += src1*src2) for ushort -> float.
// u16 x u16 -> u32 via widening vmull; masked 1-channel path expands the
// 8-bit mask to 16-bit lanes by self-zip, as in AccSqr_SIMD<ushort, float>.
template <>
struct AccProd_SIMD<ushort, float>
{
    int operator() (const ushort * src1, const ushort * src2, float * dst, const uchar * mask, int len, int cn) const
    {
        int x = 0;

        if (!mask)
        {
            len *= cn;
            for ( ; x <= len - 8; x += 8)
            {
                uint16x8_t v_1src = vld1q_u16(src1 + x), v_2src = vld1q_u16(src2 + x);
                uint32x4_t v_src0 = vmull_u16(vget_low_u16(v_1src), vget_low_u16(v_2src)),
                           v_src1 = vmull_u16(vget_high_u16(v_1src), vget_high_u16(v_2src));

                vst1q_f32(dst + x, vaddq_f32(vld1q_f32(dst + x), vcvtq_f32_u32(v_src0)));
                vst1q_f32(dst + x + 4, vaddq_f32(vld1q_f32(dst + x + 4), vcvtq_f32_u32(v_src1)));
            }
        }
        else if (cn == 1)
        {
            uint8x8_t v_255 = vdup_n_u8(255), v_0 = vdup_n_u8(0);

            for ( ; x <= len - 8; x += 8)
            {
                // widen the byte mask to a per-lane u16 mask, zero both sources
                uint8x8_t v_mask_src = veor_u8(v_255, vceq_u8(vld1_u8(mask + x), v_0));
                uint8x8x2_t v_mask_zp = vzip_u8(v_mask_src, v_mask_src);
                uint16x8_t v_mask = vreinterpretq_u16_u8(vcombine_u8(v_mask_zp.val[0], v_mask_zp.val[1])),
                           v_1src = vandq_u16(vld1q_u16(src1 + x), v_mask),
                           v_2src = vandq_u16(vld1q_u16(src2 + x), v_mask);

                uint32x4_t v_src0 = vmull_u16(vget_low_u16(v_1src), vget_low_u16(v_2src)),
                           v_src1 = vmull_u16(vget_high_u16(v_1src), vget_high_u16(v_2src));

                vst1q_f32(dst + x, vaddq_f32(vld1q_f32(dst + x), vcvtq_f32_u32(v_src0)));
                vst1q_f32(dst + x + 4, vaddq_f32(vld1q_f32(dst + x + 4), vcvtq_f32_u32(v_src1)));
            }
        }

        return x;
    }
};
371 
// NEON accumulate-product (dst += src1*src2) for float -> float.
// Unmasked case only; vmlaq performs the multiply-accumulate directly.
template <>
struct AccProd_SIMD<float, float>
{
    int operator() (const float * src1, const float * src2, float * dst, const uchar * mask, int len, int cn) const
    {
        int x = 0;

        if (!mask)
        {
            len *= cn;
            for ( ; x <= len - 8; x += 8)
            {
                vst1q_f32(dst + x, vmlaq_f32(vld1q_f32(dst + x), vld1q_f32(src1 + x), vld1q_f32(src2 + x)));
                vst1q_f32(dst + x + 4, vmlaq_f32(vld1q_f32(dst + x + 4), vld1q_f32(src1 + x + 4), vld1q_f32(src2 + x + 4)));
            }
        }

        return x;
    }
};
392 
// NEON weighted accumulate (dst = dst*(1-alpha) + src*alpha) for uchar -> float.
// Unmasked case only: dst*beta is computed with vmulq, then src*alpha is fused
// in with vmlaq.
template <>
struct AccW_SIMD<uchar, float>
{
    int operator() (const uchar * src, float * dst, const uchar * mask, int len, int cn, float alpha) const
    {
        int x = 0;
        float32x4_t v_alpha = vdupq_n_f32(alpha), v_beta = vdupq_n_f32(1.0f - alpha);

        if (!mask)
        {
            len *= cn;
            for ( ; x <= len - 16; x += 16)
            {
                // widen u8 -> u16 -> u32 -> f32 per 4-lane group
                uint8x16_t v_src = vld1q_u8(src + x);
                uint16x8_t v_src0 = vmovl_u8(vget_low_u8(v_src)), v_src1 = vmovl_u8(vget_high_u8(v_src));

                vst1q_f32(dst + x, vmlaq_f32(vmulq_f32(vld1q_f32(dst + x), v_beta),
                                             vcvtq_f32_u32(vmovl_u16(vget_low_u16(v_src0))), v_alpha));
                vst1q_f32(dst + x + 4, vmlaq_f32(vmulq_f32(vld1q_f32(dst + x + 4), v_beta),
                                             vcvtq_f32_u32(vmovl_u16(vget_high_u16(v_src0))), v_alpha));
                vst1q_f32(dst + x + 8, vmlaq_f32(vmulq_f32(vld1q_f32(dst + x + 8), v_beta),
                                                 vcvtq_f32_u32(vmovl_u16(vget_low_u16(v_src1))), v_alpha));
                vst1q_f32(dst + x + 12, vmlaq_f32(vmulq_f32(vld1q_f32(dst + x + 12), v_beta),
                                                  vcvtq_f32_u32(vmovl_u16(vget_high_u16(v_src1))), v_alpha));
            }
        }

        return x;
    }
};
423 
// NEON weighted accumulate (dst = dst*(1-alpha) + src*alpha) for ushort -> float.
// Unmasked case only; 8 elements per iteration.
template <>
struct AccW_SIMD<ushort, float>
{
    int operator() (const ushort * src, float * dst, const uchar * mask, int len, int cn, float alpha) const
    {
        int x = 0;
        float32x4_t v_alpha = vdupq_n_f32(alpha), v_beta = vdupq_n_f32(1.0f - alpha);

        if (!mask)
        {
            len *= cn;
            for ( ; x <= len - 8; x += 8)
            {
                uint16x8_t v_src = vld1q_u16(src + x);
                uint32x4_t v_src0 = vmovl_u16(vget_low_u16(v_src)), v_src1 = vmovl_u16(vget_high_u16(v_src));

                vst1q_f32(dst + x, vmlaq_f32(vmulq_f32(vld1q_f32(dst + x), v_beta), vcvtq_f32_u32(v_src0), v_alpha));
                vst1q_f32(dst + x + 4, vmlaq_f32(vmulq_f32(vld1q_f32(dst + x + 4), v_beta), vcvtq_f32_u32(v_src1), v_alpha));
            }
        }

        return x;
    }
};
448 
// NEON weighted accumulate (dst = dst*(1-alpha) + src*alpha) for float -> float.
// Unmasked case only.
template <>
struct AccW_SIMD<float, float>
{
    int operator() (const float * src, float * dst, const uchar * mask, int len, int cn, float alpha) const
    {
        int x = 0;
        float32x4_t v_alpha = vdupq_n_f32(alpha), v_beta = vdupq_n_f32(1.0f - alpha);

        if (!mask)
        {
            len *= cn;
            for ( ; x <= len - 8; x += 8)
            {
                vst1q_f32(dst + x, vmlaq_f32(vmulq_f32(vld1q_f32(dst + x), v_beta), vld1q_f32(src + x), v_alpha));
                vst1q_f32(dst + x + 4, vmlaq_f32(vmulq_f32(vld1q_f32(dst + x + 4), v_beta), vld1q_f32(src + x + 4), v_alpha));
            }
        }

        return x;
    }
};
470 
471 #endif
472 
473 template<typename T, typename AT> void
acc_(const T * src,AT * dst,const uchar * mask,int len,int cn)474 acc_( const T* src, AT* dst, const uchar* mask, int len, int cn )
475 {
476     int i = Acc_SIMD<T, AT>()(src, dst, mask, len, cn);
477 
478     if( !mask )
479     {
480         len *= cn;
481         #if CV_ENABLE_UNROLLED
482         for( ; i <= len - 4; i += 4 )
483         {
484             AT t0, t1;
485             t0 = src[i] + dst[i];
486             t1 = src[i+1] + dst[i+1];
487             dst[i] = t0; dst[i+1] = t1;
488 
489             t0 = src[i+2] + dst[i+2];
490             t1 = src[i+3] + dst[i+3];
491             dst[i+2] = t0; dst[i+3] = t1;
492         }
493         #endif
494         for( ; i < len; i++ )
495             dst[i] += src[i];
496     }
497     else if( cn == 1 )
498     {
499         for( ; i < len; i++ )
500         {
501             if( mask[i] )
502                 dst[i] += src[i];
503         }
504     }
505     else if( cn == 3 )
506     {
507         for( ; i < len; i++, src += 3, dst += 3 )
508         {
509             if( mask[i] )
510             {
511                 AT t0 = src[0] + dst[0];
512                 AT t1 = src[1] + dst[1];
513                 AT t2 = src[2] + dst[2];
514 
515                 dst[0] = t0; dst[1] = t1; dst[2] = t2;
516             }
517         }
518     }
519     else
520     {
521         for( ; i < len; i++, src += cn, dst += cn )
522             if( mask[i] )
523             {
524                 for( int k = 0; k < cn; k++ )
525                     dst[k] += src[k];
526             }
527     }
528 }
529 
530 
531 template<typename T, typename AT> void
accSqr_(const T * src,AT * dst,const uchar * mask,int len,int cn)532 accSqr_( const T* src, AT* dst, const uchar* mask, int len, int cn )
533 {
534     int i = AccSqr_SIMD<T, AT>()(src, dst, mask, len, cn);
535 
536     if( !mask )
537     {
538         len *= cn;
539          #if CV_ENABLE_UNROLLED
540         for( ; i <= len - 4; i += 4 )
541         {
542             AT t0, t1;
543             t0 = (AT)src[i]*src[i] + dst[i];
544             t1 = (AT)src[i+1]*src[i+1] + dst[i+1];
545             dst[i] = t0; dst[i+1] = t1;
546 
547             t0 = (AT)src[i+2]*src[i+2] + dst[i+2];
548             t1 = (AT)src[i+3]*src[i+3] + dst[i+3];
549             dst[i+2] = t0; dst[i+3] = t1;
550         }
551         #endif
552         for( ; i < len; i++ )
553             dst[i] += (AT)src[i]*src[i];
554     }
555     else if( cn == 1 )
556     {
557         for( ; i < len; i++ )
558         {
559             if( mask[i] )
560                 dst[i] += (AT)src[i]*src[i];
561         }
562     }
563     else if( cn == 3 )
564     {
565         for( ; i < len; i++, src += 3, dst += 3 )
566         {
567             if( mask[i] )
568             {
569                 AT t0 = (AT)src[0]*src[0] + dst[0];
570                 AT t1 = (AT)src[1]*src[1] + dst[1];
571                 AT t2 = (AT)src[2]*src[2] + dst[2];
572 
573                 dst[0] = t0; dst[1] = t1; dst[2] = t2;
574             }
575         }
576     }
577     else
578     {
579         for( ; i < len; i++, src += cn, dst += cn )
580             if( mask[i] )
581             {
582                 for( int k = 0; k < cn; k++ )
583                     dst[k] += (AT)src[k]*src[k];
584             }
585     }
586 }
587 
588 
589 template<typename T, typename AT> void
accProd_(const T * src1,const T * src2,AT * dst,const uchar * mask,int len,int cn)590 accProd_( const T* src1, const T* src2, AT* dst, const uchar* mask, int len, int cn )
591 {
592     int i = AccProd_SIMD<T, AT>()(src1, src2, dst, mask, len, cn);
593 
594     if( !mask )
595     {
596         len *= cn;
597         #if CV_ENABLE_UNROLLED
598         for( ; i <= len - 4; i += 4 )
599         {
600             AT t0, t1;
601             t0 = (AT)src1[i]*src2[i] + dst[i];
602             t1 = (AT)src1[i+1]*src2[i+1] + dst[i+1];
603             dst[i] = t0; dst[i+1] = t1;
604 
605             t0 = (AT)src1[i+2]*src2[i+2] + dst[i+2];
606             t1 = (AT)src1[i+3]*src2[i+3] + dst[i+3];
607             dst[i+2] = t0; dst[i+3] = t1;
608         }
609         #endif
610         for( ; i < len; i++ )
611             dst[i] += (AT)src1[i]*src2[i];
612     }
613     else if( cn == 1 )
614     {
615         for( ; i < len; i++ )
616         {
617             if( mask[i] )
618                 dst[i] += (AT)src1[i]*src2[i];
619         }
620     }
621     else if( cn == 3 )
622     {
623         for( ; i < len; i++, src1 += 3, src2 += 3, dst += 3 )
624         {
625             if( mask[i] )
626             {
627                 AT t0 = (AT)src1[0]*src2[0] + dst[0];
628                 AT t1 = (AT)src1[1]*src2[1] + dst[1];
629                 AT t2 = (AT)src1[2]*src2[2] + dst[2];
630 
631                 dst[0] = t0; dst[1] = t1; dst[2] = t2;
632             }
633         }
634     }
635     else
636     {
637         for( ; i < len; i++, src1 += cn, src2 += cn, dst += cn )
638             if( mask[i] )
639             {
640                 for( int k = 0; k < cn; k++ )
641                     dst[k] += (AT)src1[k]*src2[k];
642             }
643     }
644 }
645 
646 
647 template<typename T, typename AT> void
accW_(const T * src,AT * dst,const uchar * mask,int len,int cn,double alpha)648 accW_( const T* src, AT* dst, const uchar* mask, int len, int cn, double alpha )
649 {
650     AT a = (AT)alpha, b = 1 - a;
651     int i = AccW_SIMD<T, AT>()(src, dst, mask, len, cn, a);
652 
653     if( !mask )
654     {
655         len *= cn;
656         #if CV_ENABLE_UNROLLED
657         for( ; i <= len - 4; i += 4 )
658         {
659             AT t0, t1;
660             t0 = src[i]*a + dst[i]*b;
661             t1 = src[i+1]*a + dst[i+1]*b;
662             dst[i] = t0; dst[i+1] = t1;
663 
664             t0 = src[i+2]*a + dst[i+2]*b;
665             t1 = src[i+3]*a + dst[i+3]*b;
666             dst[i+2] = t0; dst[i+3] = t1;
667         }
668         #endif
669         for( ; i < len; i++ )
670             dst[i] = src[i]*a + dst[i]*b;
671     }
672     else if( cn == 1 )
673     {
674         for( ; i < len; i++ )
675         {
676             if( mask[i] )
677                 dst[i] = src[i]*a + dst[i]*b;
678         }
679     }
680     else if( cn == 3 )
681     {
682         for( ; i < len; i++, src += 3, dst += 3 )
683         {
684             if( mask[i] )
685             {
686                 AT t0 = src[0]*a + dst[0]*b;
687                 AT t1 = src[1]*a + dst[1]*b;
688                 AT t2 = src[2]*a + dst[2]*b;
689 
690                 dst[0] = t0; dst[1] = t1; dst[2] = t2;
691             }
692         }
693     }
694     else
695     {
696         for( ; i < len; i++, src += cn, dst += cn )
697             if( mask[i] )
698             {
699                 for( int k = 0; k < cn; k++ )
700                     dst[k] = src[k]*a + dst[k]*b;
701             }
702     }
703 }
704 
705 
// Stamp out non-template wrappers for one (source type, accumulator type)
// pair so their addresses fit the uniform function-pointer tables below.
// (No comments inside the macro body: a // comment on a backslash-continued
// line would swallow the continuation.)
#define DEF_ACC_FUNCS(suffix, type, acctype) \
static void acc_##suffix(const type* src, acctype* dst, \
                         const uchar* mask, int len, int cn) \
{ acc_(src, dst, mask, len, cn); } \
\
static void accSqr_##suffix(const type* src, acctype* dst, \
                            const uchar* mask, int len, int cn) \
{ accSqr_(src, dst, mask, len, cn); } \
\
static void accProd_##suffix(const type* src1, const type* src2, \
                             acctype* dst, const uchar* mask, int len, int cn) \
{ accProd_(src1, src2, dst, mask, len, cn); } \
\
static void accW_##suffix(const type* src, acctype* dst, \
                          const uchar* mask, int len, int cn, double alpha) \
{ accW_(src, dst, mask, len, cn, alpha); }


// suffix encodes <source depth><accumulator depth>, e.g. 8u32f = uchar -> float.
// The order here matches the table layout selected by getAccTabIdx().
DEF_ACC_FUNCS(8u32f, uchar, float)
DEF_ACC_FUNCS(8u64f, uchar, double)
DEF_ACC_FUNCS(16u32f, ushort, float)
DEF_ACC_FUNCS(16u64f, ushort, double)
DEF_ACC_FUNCS(32f, float, float)
DEF_ACC_FUNCS(32f64f, float, double)
DEF_ACC_FUNCS(64f, double, double)
731 
732 
// Type-erased signatures: all depth variants are cast to a common pointer type
// taking uchar* so one table can dispatch every combination. The caller is
// responsible for passing buffers of the depths the slot was built for.
typedef void (*AccFunc)(const uchar*, uchar*, const uchar*, int, int);
typedef void (*AccProdFunc)(const uchar*, const uchar*, uchar*, const uchar*, int, int);
typedef void (*AccWFunc)(const uchar*, uchar*, const uchar*, int, int, double);

// Dispatch tables indexed by getAccTabIdx(sdepth, ddepth); the slot order must
// stay in sync with that function and with the DEF_ACC_FUNCS instantiations.
static AccFunc accTab[] =
{
    (AccFunc)acc_8u32f, (AccFunc)acc_8u64f,
    (AccFunc)acc_16u32f, (AccFunc)acc_16u64f,
    (AccFunc)acc_32f, (AccFunc)acc_32f64f,
    (AccFunc)acc_64f
};

static AccFunc accSqrTab[] =
{
    (AccFunc)accSqr_8u32f, (AccFunc)accSqr_8u64f,
    (AccFunc)accSqr_16u32f, (AccFunc)accSqr_16u64f,
    (AccFunc)accSqr_32f, (AccFunc)accSqr_32f64f,
    (AccFunc)accSqr_64f
};

static AccProdFunc accProdTab[] =
{
    (AccProdFunc)accProd_8u32f, (AccProdFunc)accProd_8u64f,
    (AccProdFunc)accProd_16u32f, (AccProdFunc)accProd_16u64f,
    (AccProdFunc)accProd_32f, (AccProdFunc)accProd_32f64f,
    (AccProdFunc)accProd_64f
};

static AccWFunc accWTab[] =
{
    (AccWFunc)accW_8u32f, (AccWFunc)accW_8u64f,
    (AccWFunc)accW_16u32f, (AccWFunc)accW_16u64f,
    (AccWFunc)accW_32f, (AccWFunc)accW_32f64f,
    (AccWFunc)accW_64f
};
768 
getAccTabIdx(int sdepth,int ddepth)769 inline int getAccTabIdx(int sdepth, int ddepth)
770 {
771     return sdepth == CV_8U && ddepth == CV_32F ? 0 :
772            sdepth == CV_8U && ddepth == CV_64F ? 1 :
773            sdepth == CV_16U && ddepth == CV_32F ? 2 :
774            sdepth == CV_16U && ddepth == CV_64F ? 3 :
775            sdepth == CV_32F && ddepth == CV_32F ? 4 :
776            sdepth == CV_32F && ddepth == CV_64F ? 5 :
777            sdepth == CV_64F && ddepth == CV_64F ? 6 : -1;
778 }
779 
780 #ifdef HAVE_OPENCL
781 
// Operation selector for ocl_accumulate(); the values index the opMap array
// of kernel #define names, so they must stay consecutive from 0.
enum
{
    ACCUMULATE = 0,
    ACCUMULATE_SQUARE = 1,
    ACCUMULATE_PRODUCT = 2,
    ACCUMULATE_WEIGHTED = 3
};
789 
ocl_accumulate(InputArray _src,InputArray _src2,InputOutputArray _dst,double alpha,InputArray _mask,int op_type)790 static bool ocl_accumulate( InputArray _src, InputArray _src2, InputOutputArray _dst, double alpha,
791                             InputArray _mask, int op_type )
792 {
793     CV_Assert(op_type == ACCUMULATE || op_type == ACCUMULATE_SQUARE ||
794               op_type == ACCUMULATE_PRODUCT || op_type == ACCUMULATE_WEIGHTED);
795 
796     const ocl::Device & dev = ocl::Device::getDefault();
797     bool haveMask = !_mask.empty(), doubleSupport = dev.doubleFPConfig() > 0;
798     int stype = _src.type(), sdepth = CV_MAT_DEPTH(stype), cn = CV_MAT_CN(stype), ddepth = _dst.depth();
799     int kercn = haveMask ? cn : ocl::predictOptimalVectorWidthMax(_src, _src2, _dst), rowsPerWI = dev.isIntel() ? 4 : 1;
800 
801     if (!doubleSupport && (sdepth == CV_64F || ddepth == CV_64F))
802         return false;
803 
804     const char * const opMap[4] = { "ACCUMULATE", "ACCUMULATE_SQUARE", "ACCUMULATE_PRODUCT",
805                                    "ACCUMULATE_WEIGHTED" };
806 
807     char cvt[40];
808     ocl::Kernel k("accumulate", ocl::imgproc::accumulate_oclsrc,
809                   format("-D %s%s -D srcT1=%s -D cn=%d -D dstT1=%s%s -D rowsPerWI=%d -D convertToDT=%s",
810                          opMap[op_type], haveMask ? " -D HAVE_MASK" : "",
811                          ocl::typeToStr(sdepth), kercn, ocl::typeToStr(ddepth),
812                          doubleSupport ? " -D DOUBLE_SUPPORT" : "", rowsPerWI,
813                          ocl::convertTypeStr(sdepth, ddepth, 1, cvt)));
814     if (k.empty())
815         return false;
816 
817     UMat src = _src.getUMat(), src2 = _src2.getUMat(), dst = _dst.getUMat(), mask = _mask.getUMat();
818 
819     ocl::KernelArg srcarg = ocl::KernelArg::ReadOnlyNoSize(src),
820             src2arg = ocl::KernelArg::ReadOnlyNoSize(src2),
821             dstarg = ocl::KernelArg::ReadWrite(dst, cn, kercn),
822             maskarg = ocl::KernelArg::ReadOnlyNoSize(mask);
823 
824     int argidx = k.set(0, srcarg);
825     if (op_type == ACCUMULATE_PRODUCT)
826         argidx = k.set(argidx, src2arg);
827     argidx = k.set(argidx, dstarg);
828     if (op_type == ACCUMULATE_WEIGHTED)
829     {
830         if (ddepth == CV_32F)
831             argidx = k.set(argidx, (float)alpha);
832         else
833             argidx = k.set(argidx, alpha);
834     }
835     if (haveMask)
836         k.set(argidx, maskarg);
837 
838     size_t globalsize[2] = { src.cols * cn / kercn, (src.rows + rowsPerWI - 1) / rowsPerWI };
839     return k.run(2, globalsize, NULL, false);
840 }
841 
842 #endif
843 
844 }
845 
accumulate(InputArray _src,InputOutputArray _dst,InputArray _mask)846 void cv::accumulate( InputArray _src, InputOutputArray _dst, InputArray _mask )
847 {
848     int stype = _src.type(), sdepth = CV_MAT_DEPTH(stype), scn = CV_MAT_CN(stype);
849     int dtype = _dst.type(), ddepth = CV_MAT_DEPTH(dtype), dcn = CV_MAT_CN(dtype);
850 
851     CV_Assert( _src.sameSize(_dst) && dcn == scn );
852     CV_Assert( _mask.empty() || (_src.sameSize(_mask) && _mask.type() == CV_8U) );
853 
854     CV_OCL_RUN(_src.dims() <= 2 && _dst.isUMat(),
855                ocl_accumulate(_src, noArray(), _dst, 0.0, _mask, ACCUMULATE))
856 
857     Mat src = _src.getMat(), dst = _dst.getMat(), mask = _mask.getMat();
858 
859 #if defined HAVE_IPP
860     CV_IPP_CHECK()
861     {
862         if (src.dims <= 2 || (src.isContinuous() && dst.isContinuous() && (mask.empty() || mask.isContinuous())))
863         {
864             typedef IppStatus (CV_STDCALL * ippiAdd)(const void * pSrc, int srcStep, Ipp32f * pSrcDst, int srcdstStep, IppiSize roiSize);
865             typedef IppStatus (CV_STDCALL * ippiAddMask)(const void * pSrc, int srcStep, const Ipp8u * pMask, int maskStep, Ipp32f * pSrcDst,
866                                                         int srcDstStep, IppiSize roiSize);
867             ippiAdd ippFunc = 0;
868             ippiAddMask ippFuncMask = 0;
869 
870             if (mask.empty())
871             {
872                 CV_SUPPRESS_DEPRECATED_START
873                 ippFunc = sdepth == CV_8U && ddepth == CV_32F ? (ippiAdd)ippiAdd_8u32f_C1IR :
874                     sdepth == CV_16U && ddepth == CV_32F ? (ippiAdd)ippiAdd_16u32f_C1IR :
875                     sdepth == CV_32F && ddepth == CV_32F ? (ippiAdd)ippiAdd_32f_C1IR : 0;
876                 CV_SUPPRESS_DEPRECATED_END
877             }
878             else if (scn == 1)
879             {
880                 ippFuncMask = sdepth == CV_8U && ddepth == CV_32F ? (ippiAddMask)ippiAdd_8u32f_C1IMR :
881                     sdepth == CV_16U && ddepth == CV_32F ? (ippiAddMask)ippiAdd_16u32f_C1IMR :
882                     sdepth == CV_32F && ddepth == CV_32F ? (ippiAddMask)ippiAdd_32f_C1IMR : 0;
883             }
884 
885             if (ippFunc || ippFuncMask)
886             {
887                 IppStatus status = ippStsNoErr;
888 
889                 Size size = src.size();
890                 int srcstep = (int)src.step, dststep = (int)dst.step, maskstep = (int)mask.step;
891                 if (src.isContinuous() && dst.isContinuous() && mask.isContinuous())
892                 {
893                     srcstep = static_cast<int>(src.total() * src.elemSize());
894                     dststep = static_cast<int>(dst.total() * dst.elemSize());
895                     maskstep = static_cast<int>(mask.total() * mask.elemSize());
896                     size.width = static_cast<int>(src.total());
897                     size.height = 1;
898                 }
899                 size.width *= scn;
900 
901                 if (mask.empty())
902                     status = ippFunc(src.ptr(), srcstep, dst.ptr<Ipp32f>(), dststep, ippiSize(size.width, size.height));
903                 else
904                     status = ippFuncMask(src.ptr(), srcstep, mask.ptr<Ipp8u>(), maskstep,
905                                          dst.ptr<Ipp32f>(), dststep, ippiSize(size.width, size.height));
906 
907                 if (status >= 0)
908                 {
909                     CV_IMPL_ADD(CV_IMPL_IPP);
910                     return;
911                 }
912                 setIppErrorStatus();
913             }
914         }
915     }
916 #endif
917 
918     int fidx = getAccTabIdx(sdepth, ddepth);
919     AccFunc func = fidx >= 0 ? accTab[fidx] : 0;
920     CV_Assert( func != 0 );
921 
922     const Mat* arrays[] = {&src, &dst, &mask, 0};
923     uchar* ptrs[3];
924     NAryMatIterator it(arrays, ptrs);
925     int len = (int)it.size;
926 
927     for( size_t i = 0; i < it.nplanes; i++, ++it )
928         func(ptrs[0], ptrs[1], ptrs[2], len, scn);
929 }
930 
accumulateSquare(InputArray _src,InputOutputArray _dst,InputArray _mask)931 void cv::accumulateSquare( InputArray _src, InputOutputArray _dst, InputArray _mask )
932 {
933     int stype = _src.type(), sdepth = CV_MAT_DEPTH(stype), scn = CV_MAT_CN(stype);
934     int dtype = _dst.type(), ddepth = CV_MAT_DEPTH(dtype), dcn = CV_MAT_CN(dtype);
935 
936     CV_Assert( _src.sameSize(_dst) && dcn == scn );
937     CV_Assert( _mask.empty() || (_src.sameSize(_mask) && _mask.type() == CV_8U) );
938 
939     CV_OCL_RUN(_src.dims() <= 2 && _dst.isUMat(),
940                ocl_accumulate(_src, noArray(), _dst, 0.0, _mask, ACCUMULATE_SQUARE))
941 
942     Mat src = _src.getMat(), dst = _dst.getMat(), mask = _mask.getMat();
943 
944 #if defined(HAVE_IPP)
945     CV_IPP_CHECK()
946     {
947         if (src.dims <= 2 || (src.isContinuous() && dst.isContinuous() && (mask.empty() || mask.isContinuous())))
948         {
949             typedef IppStatus (CV_STDCALL * ippiAddSquare)(const void * pSrc, int srcStep, Ipp32f * pSrcDst, int srcdstStep, IppiSize roiSize);
950             typedef IppStatus (CV_STDCALL * ippiAddSquareMask)(const void * pSrc, int srcStep, const Ipp8u * pMask, int maskStep, Ipp32f * pSrcDst,
951                                                                int srcDstStep, IppiSize roiSize);
952             ippiAddSquare ippFunc = 0;
953             ippiAddSquareMask ippFuncMask = 0;
954 
955             if (mask.empty())
956             {
957                 ippFunc = sdepth == CV_8U && ddepth == CV_32F ? (ippiAddSquare)ippiAddSquare_8u32f_C1IR :
958                     sdepth == CV_16U && ddepth == CV_32F ? (ippiAddSquare)ippiAddSquare_16u32f_C1IR :
959                     sdepth == CV_32F && ddepth == CV_32F ? (ippiAddSquare)ippiAddSquare_32f_C1IR : 0;
960             }
961             else if (scn == 1)
962             {
963                 ippFuncMask = sdepth == CV_8U && ddepth == CV_32F ? (ippiAddSquareMask)ippiAddSquare_8u32f_C1IMR :
964                     sdepth == CV_16U && ddepth == CV_32F ? (ippiAddSquareMask)ippiAddSquare_16u32f_C1IMR :
965                     sdepth == CV_32F && ddepth == CV_32F ? (ippiAddSquareMask)ippiAddSquare_32f_C1IMR : 0;
966             }
967 
968             if (ippFunc || ippFuncMask)
969             {
970                 IppStatus status = ippStsNoErr;
971 
972                 Size size = src.size();
973                 int srcstep = (int)src.step, dststep = (int)dst.step, maskstep = (int)mask.step;
974                 if (src.isContinuous() && dst.isContinuous() && mask.isContinuous())
975                 {
976                     srcstep = static_cast<int>(src.total() * src.elemSize());
977                     dststep = static_cast<int>(dst.total() * dst.elemSize());
978                     maskstep = static_cast<int>(mask.total() * mask.elemSize());
979                     size.width = static_cast<int>(src.total());
980                     size.height = 1;
981                 }
982                 size.width *= scn;
983 
984                 if (mask.empty())
985                     status = ippFunc(src.ptr(), srcstep, dst.ptr<Ipp32f>(), dststep, ippiSize(size.width, size.height));
986                 else
987                     status = ippFuncMask(src.ptr(), srcstep, mask.ptr<Ipp8u>(), maskstep,
988                                          dst.ptr<Ipp32f>(), dststep, ippiSize(size.width, size.height));
989 
990                 if (status >= 0)
991                 {
992                     CV_IMPL_ADD(CV_IMPL_IPP);
993                     return;
994                 }
995                 setIppErrorStatus();
996             }
997         }
998     }
999 #endif
1000 
1001     int fidx = getAccTabIdx(sdepth, ddepth);
1002     AccFunc func = fidx >= 0 ? accSqrTab[fidx] : 0;
1003     CV_Assert( func != 0 );
1004 
1005     const Mat* arrays[] = {&src, &dst, &mask, 0};
1006     uchar* ptrs[3];
1007     NAryMatIterator it(arrays, ptrs);
1008     int len = (int)it.size;
1009 
1010     for( size_t i = 0; i < it.nplanes; i++, ++it )
1011         func(ptrs[0], ptrs[1], ptrs[2], len, scn);
1012 }
1013 
accumulateProduct(InputArray _src1,InputArray _src2,InputOutputArray _dst,InputArray _mask)1014 void cv::accumulateProduct( InputArray _src1, InputArray _src2,
1015                             InputOutputArray _dst, InputArray _mask )
1016 {
1017     int stype = _src1.type(), sdepth = CV_MAT_DEPTH(stype), scn = CV_MAT_CN(stype);
1018     int dtype = _dst.type(), ddepth = CV_MAT_DEPTH(dtype), dcn = CV_MAT_CN(dtype);
1019 
1020     CV_Assert( _src1.sameSize(_src2) && stype == _src2.type() );
1021     CV_Assert( _src1.sameSize(_dst) && dcn == scn );
1022     CV_Assert( _mask.empty() || (_src1.sameSize(_mask) && _mask.type() == CV_8U) );
1023 
1024     CV_OCL_RUN(_src1.dims() <= 2 && _dst.isUMat(),
1025                ocl_accumulate(_src1, _src2, _dst, 0.0, _mask, ACCUMULATE_PRODUCT))
1026 
1027     Mat src1 = _src1.getMat(), src2 = _src2.getMat(), dst = _dst.getMat(), mask = _mask.getMat();
1028 
1029 #if defined(HAVE_IPP)
1030     CV_IPP_CHECK()
1031     {
1032         if (src1.dims <= 2 || (src1.isContinuous() && src2.isContinuous() && dst.isContinuous()))
1033         {
1034             typedef IppStatus (CV_STDCALL * ippiAddProduct)(const void * pSrc1, int src1Step, const void * pSrc2,
1035                                                             int src2Step, Ipp32f * pSrcDst, int srcDstStep, IppiSize roiSize);
1036             typedef IppStatus (CV_STDCALL * ippiAddProductMask)(const void * pSrc1, int src1Step, const void * pSrc2, int src2Step,
1037                                                                 const Ipp8u * pMask, int maskStep, Ipp32f * pSrcDst, int srcDstStep, IppiSize roiSize);
1038             ippiAddProduct ippFunc = 0;
1039             ippiAddProductMask ippFuncMask = 0;
1040 
1041             if (mask.empty())
1042             {
1043                 ippFunc = sdepth == CV_8U && ddepth == CV_32F ? (ippiAddProduct)ippiAddProduct_8u32f_C1IR :
1044                     sdepth == CV_16U && ddepth == CV_32F ? (ippiAddProduct)ippiAddProduct_16u32f_C1IR :
1045                     sdepth == CV_32F && ddepth == CV_32F ? (ippiAddProduct)ippiAddProduct_32f_C1IR : 0;
1046             }
1047             else if (scn == 1)
1048             {
1049                 ippFuncMask = sdepth == CV_8U && ddepth == CV_32F ? (ippiAddProductMask)ippiAddProduct_8u32f_C1IMR :
1050                     sdepth == CV_16U && ddepth == CV_32F ? (ippiAddProductMask)ippiAddProduct_16u32f_C1IMR :
1051                     sdepth == CV_32F && ddepth == CV_32F ? (ippiAddProductMask)ippiAddProduct_32f_C1IMR : 0;
1052             }
1053 
1054             if (ippFunc || ippFuncMask)
1055             {
1056                 IppStatus status = ippStsNoErr;
1057 
1058                 Size size = src1.size();
1059                 int src1step = (int)src1.step, src2step = (int)src2.step, dststep = (int)dst.step, maskstep = (int)mask.step;
1060                 if (src1.isContinuous() && src2.isContinuous() && dst.isContinuous() && mask.isContinuous())
1061                 {
1062                     src1step = static_cast<int>(src1.total() * src1.elemSize());
1063                     src2step = static_cast<int>(src2.total() * src2.elemSize());
1064                     dststep = static_cast<int>(dst.total() * dst.elemSize());
1065                     maskstep = static_cast<int>(mask.total() * mask.elemSize());
1066                     size.width = static_cast<int>(src1.total());
1067                     size.height = 1;
1068                 }
1069                 size.width *= scn;
1070 
1071                 if (mask.empty())
1072                     status = ippFunc(src1.ptr(), src1step, src2.ptr(), src2step, dst.ptr<Ipp32f>(),
1073                                      dststep, ippiSize(size.width, size.height));
1074                 else
1075                     status = ippFuncMask(src1.ptr(), src1step, src2.ptr(), src2step, mask.ptr<Ipp8u>(), maskstep,
1076                                          dst.ptr<Ipp32f>(), dststep, ippiSize(size.width, size.height));
1077 
1078                 if (status >= 0)
1079                 {
1080                     CV_IMPL_ADD(CV_IMPL_IPP);
1081                     return;
1082                 }
1083                 setIppErrorStatus();
1084             }
1085         }
1086     }
1087 #endif
1088 
1089     int fidx = getAccTabIdx(sdepth, ddepth);
1090     AccProdFunc func = fidx >= 0 ? accProdTab[fidx] : 0;
1091     CV_Assert( func != 0 );
1092 
1093     const Mat* arrays[] = {&src1, &src2, &dst, &mask, 0};
1094     uchar* ptrs[4];
1095     NAryMatIterator it(arrays, ptrs);
1096     int len = (int)it.size;
1097 
1098     for( size_t i = 0; i < it.nplanes; i++, ++it )
1099         func(ptrs[0], ptrs[1], ptrs[2], ptrs[3], len, scn);
1100 }
1101 
accumulateWeighted(InputArray _src,InputOutputArray _dst,double alpha,InputArray _mask)1102 void cv::accumulateWeighted( InputArray _src, InputOutputArray _dst,
1103                              double alpha, InputArray _mask )
1104 {
1105     int stype = _src.type(), sdepth = CV_MAT_DEPTH(stype), scn = CV_MAT_CN(stype);
1106     int dtype = _dst.type(), ddepth = CV_MAT_DEPTH(dtype), dcn = CV_MAT_CN(dtype);
1107 
1108     CV_Assert( _src.sameSize(_dst) && dcn == scn );
1109     CV_Assert( _mask.empty() || (_src.sameSize(_mask) && _mask.type() == CV_8U) );
1110 
1111     CV_OCL_RUN(_src.dims() <= 2 && _dst.isUMat(),
1112                ocl_accumulate(_src, noArray(), _dst, alpha, _mask, ACCUMULATE_WEIGHTED))
1113 
1114     Mat src = _src.getMat(), dst = _dst.getMat(), mask = _mask.getMat();
1115 
1116 #if defined(HAVE_IPP)
1117     CV_IPP_CHECK()
1118     {
1119         if (src.dims <= 2 || (src.isContinuous() && dst.isContinuous() && mask.isContinuous()))
1120         {
1121             typedef IppStatus (CV_STDCALL * ippiAddWeighted)(const void * pSrc, int srcStep, Ipp32f * pSrcDst, int srcdstStep,
1122                                                              IppiSize roiSize, Ipp32f alpha);
1123             typedef IppStatus (CV_STDCALL * ippiAddWeightedMask)(const void * pSrc, int srcStep, const Ipp8u * pMask,
1124                                                                  int maskStep, Ipp32f * pSrcDst,
1125                                                                  int srcDstStep, IppiSize roiSize, Ipp32f alpha);
1126             ippiAddWeighted ippFunc = 0;
1127             ippiAddWeightedMask ippFuncMask = 0;
1128 
1129             if (mask.empty())
1130             {
1131                 ippFunc = sdepth == CV_8U && ddepth == CV_32F ? (ippiAddWeighted)ippiAddWeighted_8u32f_C1IR :
1132                     sdepth == CV_16U && ddepth == CV_32F ? (ippiAddWeighted)ippiAddWeighted_16u32f_C1IR :
1133                     sdepth == CV_32F && ddepth == CV_32F ? (ippiAddWeighted)ippiAddWeighted_32f_C1IR : 0;
1134             }
1135             else if (scn == 1)
1136             {
1137                 ippFuncMask = sdepth == CV_8U && ddepth == CV_32F ? (ippiAddWeightedMask)ippiAddWeighted_8u32f_C1IMR :
1138                     sdepth == CV_16U && ddepth == CV_32F ? (ippiAddWeightedMask)ippiAddWeighted_16u32f_C1IMR :
1139                     sdepth == CV_32F && ddepth == CV_32F ? (ippiAddWeightedMask)ippiAddWeighted_32f_C1IMR : 0;
1140             }
1141 
1142             if (ippFunc || ippFuncMask)
1143             {
1144                 IppStatus status = ippStsNoErr;
1145 
1146                 Size size = src.size();
1147                 int srcstep = (int)src.step, dststep = (int)dst.step, maskstep = (int)mask.step;
1148                 if (src.isContinuous() && dst.isContinuous() && mask.isContinuous())
1149                 {
1150                     srcstep = static_cast<int>(src.total() * src.elemSize());
1151                     dststep = static_cast<int>(dst.total() * dst.elemSize());
1152                     maskstep = static_cast<int>(mask.total() * mask.elemSize());
1153                     size.width = static_cast<int>((int)src.total());
1154                     size.height = 1;
1155                 }
1156                 size.width *= scn;
1157 
1158                 if (mask.empty())
1159                     status = ippFunc(src.ptr(), srcstep, dst.ptr<Ipp32f>(), dststep, ippiSize(size.width, size.height), (Ipp32f)alpha);
1160                 else
1161                     status = ippFuncMask(src.ptr(), srcstep, mask.ptr<Ipp8u>(), maskstep,
1162                                          dst.ptr<Ipp32f>(), dststep, ippiSize(size.width, size.height), (Ipp32f)alpha);
1163 
1164                 if (status >= 0)
1165                 {
1166                     CV_IMPL_ADD(CV_IMPL_IPP);
1167                     return;
1168                 }
1169                 setIppErrorStatus();
1170             }
1171         }
1172     }
1173 #endif
1174 
1175     int fidx = getAccTabIdx(sdepth, ddepth);
1176     AccWFunc func = fidx >= 0 ? accWTab[fidx] : 0;
1177     CV_Assert( func != 0 );
1178 
1179     const Mat* arrays[] = {&src, &dst, &mask, 0};
1180     uchar* ptrs[3];
1181     NAryMatIterator it(arrays, ptrs);
1182     int len = (int)it.size;
1183 
1184     for( size_t i = 0; i < it.nplanes; i++, ++it )
1185         func(ptrs[0], ptrs[1], ptrs[2], len, scn, alpha);
1186 }
1187 
1188 
1189 CV_IMPL void
cvAcc(const void * arr,void * sumarr,const void * maskarr)1190 cvAcc( const void* arr, void* sumarr, const void* maskarr )
1191 {
1192     cv::Mat src = cv::cvarrToMat(arr), dst = cv::cvarrToMat(sumarr), mask;
1193     if( maskarr )
1194         mask = cv::cvarrToMat(maskarr);
1195     cv::accumulate( src, dst, mask );
1196 }
1197 
1198 CV_IMPL void
cvSquareAcc(const void * arr,void * sumarr,const void * maskarr)1199 cvSquareAcc( const void* arr, void* sumarr, const void* maskarr )
1200 {
1201     cv::Mat src = cv::cvarrToMat(arr), dst = cv::cvarrToMat(sumarr), mask;
1202     if( maskarr )
1203         mask = cv::cvarrToMat(maskarr);
1204     cv::accumulateSquare( src, dst, mask );
1205 }
1206 
1207 CV_IMPL void
cvMultiplyAcc(const void * arr1,const void * arr2,void * sumarr,const void * maskarr)1208 cvMultiplyAcc( const void* arr1, const void* arr2,
1209                void* sumarr, const void* maskarr )
1210 {
1211     cv::Mat src1 = cv::cvarrToMat(arr1), src2 = cv::cvarrToMat(arr2);
1212     cv::Mat dst = cv::cvarrToMat(sumarr), mask;
1213     if( maskarr )
1214         mask = cv::cvarrToMat(maskarr);
1215     cv::accumulateProduct( src1, src2, dst, mask );
1216 }
1217 
1218 CV_IMPL void
cvRunningAvg(const void * arr,void * sumarr,double alpha,const void * maskarr)1219 cvRunningAvg( const void* arr, void* sumarr, double alpha, const void* maskarr )
1220 {
1221     cv::Mat src = cv::cvarrToMat(arr), dst = cv::cvarrToMat(sumarr), mask;
1222     if( maskarr )
1223         mask = cv::cvarrToMat(maskarr);
1224     cv::accumulateWeighted( src, dst, alpha, mask );
1225 }
1226 
1227 /* End of file. */
1228