1 /*M///////////////////////////////////////////////////////////////////////////////////////
2 //
3 //  IMPORTANT: READ BEFORE DOWNLOADING, COPYING, INSTALLING OR USING.
4 //
5 //  By downloading, copying, installing or using the software you agree to this license.
6 //  If you do not agree to this license, do not download, install,
7 //  copy or use the software.
8 //
9 //
10 //                           License Agreement
11 //                For Open Source Computer Vision Library
12 //
13 // Copyright (C) 2000-2008, Intel Corporation, all rights reserved.
14 // Copyright (C) 2009-2011, Willow Garage Inc., all rights reserved.
15 // Copyright (C) 2014-2015, Itseez Inc., all rights reserved.
16 // Third party copyrights are property of their respective owners.
17 //
18 // Redistribution and use in source and binary forms, with or without modification,
19 // are permitted provided that the following conditions are met:
20 //
21 //   * Redistribution's of source code must retain the above copyright notice,
22 //     this list of conditions and the following disclaimer.
23 //
24 //   * Redistribution's in binary form must reproduce the above copyright notice,
25 //     this list of conditions and the following disclaimer in the documentation
26 //     and/or other materials provided with the distribution.
27 //
28 //   * The name of the copyright holders may not be used to endorse or promote products
29 //     derived from this software without specific prior written permission.
30 //
31 // This software is provided by the copyright holders and contributors "as is" and
32 // any express or implied warranties, including, but not limited to, the implied
33 // warranties of merchantability and fitness for a particular purpose are disclaimed.
34 // In no event shall the Intel Corporation or contributors be liable for any direct,
35 // indirect, incidental, special, exemplary, or consequential damages
36 // (including, but not limited to, procurement of substitute goods or services;
37 // loss of use, data, or profits; or business interruption) however caused
38 // and on any theory of liability, whether in contract, strict liability,
39 // or tort (including negligence or otherwise) arising in any way out of
40 // the use of this software, even if advised of the possibility of such damage.
41 //
42 //M*/
43 
44 /* ////////////////////////////////////////////////////////////////////
45 //
46 //  Arithmetic and logical operations: +, -, *, /, &, |, ^, ~, abs ...
47 //
48 // */
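// These type-specific kernels back OpenCV's public element-wise API. A minimal
// usage sketch (illustrative only, not part of this file):
//
//     cv::Mat a(480, 640, CV_8UC1), b(480, 640, CV_8UC1), dst;
//     cv::add(a, b, dst);        // eventually dispatches to add8u() below
//     cv::absdiff(a, b, dst);    // eventually dispatches to absdiff8u() below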
49 
50 #include "precomp.hpp"
51 #include "opencl_kernels_core.hpp"
52 
53 namespace cv
54 {
55 
56 struct NOP {};
57 
58 #if CV_SSE2 || CV_NEON
59 
60 #define FUNCTOR_TEMPLATE(name)          \
61     template<typename T> struct name {}
62 
63 FUNCTOR_TEMPLATE(VLoadStore128);
64 #if CV_SSE2
65 FUNCTOR_TEMPLATE(VLoadStore64);
66 FUNCTOR_TEMPLATE(VLoadStore128Aligned);
67 #if CV_AVX2
68 FUNCTOR_TEMPLATE(VLoadStore256);
69 FUNCTOR_TEMPLATE(VLoadStore256Aligned);
70 #endif
71 #endif
72 
73 #endif
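// VLoadStore64/128/256(Aligned) are only declared here; each SIMD backend below
// (AVX2, SSE2, NEON) supplies its own explicit specializations, together with the
// VAdd/VSub/VMin/VMax/VAbsDiff/VAnd/VOr/VXor/VNot functors consumed by the
// vBinOp* loops that follow.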
74 
75 template<typename T, class Op, class VOp>
76 void vBinOp(const T* src1, size_t step1, const T* src2, size_t step2, T* dst, size_t step, Size sz)
77 {
78 #if CV_SSE2 || CV_NEON
79     VOp vop;
80 #endif
81     Op op;
82 
83     for( ; sz.height--; src1 = (const T *)((const uchar *)src1 + step1),
84                         src2 = (const T *)((const uchar *)src2 + step2),
85                         dst = (T *)((uchar *)dst + step) )
86     {
87         int x = 0;
88 
89 #if CV_NEON || CV_SSE2
90 #if CV_AVX2
91         if( USE_AVX2 )
92         {
93             for( ; x <= sz.width - 32/(int)sizeof(T); x += 32/sizeof(T) )
94             {
95                 typename VLoadStore256<T>::reg_type r0 = VLoadStore256<T>::load(src1 + x);
96                 r0 = vop(r0, VLoadStore256<T>::load(src2 + x));
97                 VLoadStore256<T>::store(dst + x, r0);
98             }
99         }
100 #else
101 #if CV_SSE2
102         if( USE_SSE2 )
103         {
104 #endif // CV_SSE2
105             for( ; x <= sz.width - 32/(int)sizeof(T); x += 32/sizeof(T) )
106             {
107                 typename VLoadStore128<T>::reg_type r0 = VLoadStore128<T>::load(src1 + x               );
108                 typename VLoadStore128<T>::reg_type r1 = VLoadStore128<T>::load(src1 + x + 16/sizeof(T));
109                 r0 = vop(r0, VLoadStore128<T>::load(src2 + x               ));
110                 r1 = vop(r1, VLoadStore128<T>::load(src2 + x + 16/sizeof(T)));
111                 VLoadStore128<T>::store(dst + x               , r0);
112                 VLoadStore128<T>::store(dst + x + 16/sizeof(T), r1);
113             }
114 #if CV_SSE2
115         }
116 #endif // CV_SSE2
117 #endif // CV_AVX2
118 #endif // CV_NEON || CV_SSE2
119 
120 #if CV_AVX2
121         // nothing
122 #elif CV_SSE2
123         if( USE_SSE2 )
124         {
125             for( ; x <= sz.width - 8/(int)sizeof(T); x += 8/sizeof(T) )
126             {
127                 typename VLoadStore64<T>::reg_type r = VLoadStore64<T>::load(src1 + x);
128                 r = vop(r, VLoadStore64<T>::load(src2 + x));
129                 VLoadStore64<T>::store(dst + x, r);
130             }
131         }
132 #endif
133 
134 #if CV_ENABLE_UNROLLED
135         for( ; x <= sz.width - 4; x += 4 )
136         {
137             T v0 = op(src1[x], src2[x]);
138             T v1 = op(src1[x+1], src2[x+1]);
139             dst[x] = v0; dst[x+1] = v1;
140             v0 = op(src1[x+2], src2[x+2]);
141             v1 = op(src1[x+3], src2[x+3]);
142             dst[x+2] = v0; dst[x+3] = v1;
143         }
144 #endif
145 
146         for( ; x < sz.width; x++ )
147             dst[x] = op(src1[x], src2[x]);
148     }
149 }
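// vBinOp walks the matrix row by row (the for-header advances src1/src2/dst by
// their byte steps). Within each row it runs, in order: a 32-byte AVX2 or
// 2x16-byte SSE2/NEON vector loop, an optional 8-byte SSE2 cleanup loop, a 4x
// unrolled scalar loop, and a plain scalar tail, all applying the same
// element-wise functor. It is instantiated by the wrappers near the end of this
// file, e.g. vBinOp<uchar, OpAdd<uchar>, IF_SIMD(VAdd<uchar>)>(...) in add8u().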
150 
151 template<typename T, class Op, class Op32>
152 void vBinOp32(const T* src1, size_t step1, const T* src2, size_t step2,
153               T* dst, size_t step, Size sz)
154 {
155 #if CV_SSE2 || CV_NEON
156     Op32 op32;
157 #endif
158     Op op;
159 
160     for( ; sz.height--; src1 = (const T *)((const uchar *)src1 + step1),
161                         src2 = (const T *)((const uchar *)src2 + step2),
162                         dst = (T *)((uchar *)dst + step) )
163     {
164         int x = 0;
165 
166 #if CV_AVX2
167         if( USE_AVX2 )
168         {
169             if( (((size_t)src1|(size_t)src2|(size_t)dst)&31) == 0 )
170             {
171                 for( ; x <= sz.width - 8; x += 8 )
172                 {
173                     typename VLoadStore256Aligned<T>::reg_type r0 = VLoadStore256Aligned<T>::load(src1 + x);
174                     r0 = op32(r0, VLoadStore256Aligned<T>::load(src2 + x));
175                     VLoadStore256Aligned<T>::store(dst + x, r0);
176                 }
177             }
178         }
179 #elif CV_SSE2
180         if( USE_SSE2 )
181         {
182             if( (((size_t)src1|(size_t)src2|(size_t)dst)&15) == 0 )
183             {
184                 for( ; x <= sz.width - 8; x += 8 )
185                 {
186                     typename VLoadStore128Aligned<T>::reg_type r0 = VLoadStore128Aligned<T>::load(src1 + x    );
187                     typename VLoadStore128Aligned<T>::reg_type r1 = VLoadStore128Aligned<T>::load(src1 + x + 4);
188                     r0 = op32(r0, VLoadStore128Aligned<T>::load(src2 + x    ));
189                     r1 = op32(r1, VLoadStore128Aligned<T>::load(src2 + x + 4));
190                     VLoadStore128Aligned<T>::store(dst + x    , r0);
191                     VLoadStore128Aligned<T>::store(dst + x + 4, r1);
192                 }
193             }
194         }
195 #endif // CV_AVX2
196 
197 #if CV_NEON || CV_SSE2
198 #if CV_AVX2
199         if( USE_AVX2 )
200         {
201             for( ; x <= sz.width - 8; x += 8 )
202             {
203                 typename VLoadStore256<T>::reg_type r0 = VLoadStore256<T>::load(src1 + x);
204                 r0 = op32(r0, VLoadStore256<T>::load(src2 + x));
205                 VLoadStore256<T>::store(dst + x, r0);
206             }
207         }
208 #else
209 #if CV_SSE2
210         if( USE_SSE2 )
211         {
212 #endif // CV_SSE2
213             for( ; x <= sz.width - 8; x += 8 )
214             {
215                 typename VLoadStore128<T>::reg_type r0 = VLoadStore128<T>::load(src1 + x    );
216                 typename VLoadStore128<T>::reg_type r1 = VLoadStore128<T>::load(src1 + x + 4);
217                 r0 = op32(r0, VLoadStore128<T>::load(src2 + x    ));
218                 r1 = op32(r1, VLoadStore128<T>::load(src2 + x + 4));
219                 VLoadStore128<T>::store(dst + x    , r0);
220                 VLoadStore128<T>::store(dst + x + 4, r1);
221             }
222 #if CV_SSE2
223         }
224 #endif // CV_SSE2
225 #endif // CV_AVX2
226 #endif // CV_NEON || CV_SSE2
227 
228 #if CV_ENABLE_UNROLLED
229         for( ; x <= sz.width - 4; x += 4 )
230         {
231             T v0 = op(src1[x], src2[x]);
232             T v1 = op(src1[x+1], src2[x+1]);
233             dst[x] = v0; dst[x+1] = v1;
234             v0 = op(src1[x+2], src2[x+2]);
235             v1 = op(src1[x+3], src2[x+3]);
236             dst[x+2] = v0; dst[x+3] = v1;
237         }
238 #endif
239 
240         for( ; x < sz.width; x++ )
241             dst[x] = op(src1[x], src2[x]);
242     }
243 }
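// vBinOp32 is the 32-bit element (int/float) variant: when src1, src2 and dst
// are all 32-byte (AVX2) or 16-byte (SSE2) aligned it uses the aligned
// load/store specializations, then falls through to the unaligned vector loop
// and the scalar loops for whatever remains.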
244 
245 
246 template<typename T, class Op, class Op64>
247 void vBinOp64(const T* src1, size_t step1, const T* src2, size_t step2,
248                T* dst, size_t step, Size sz)
249 {
250 #if CV_SSE2
251     Op64 op64;
252 #endif
253     Op op;
254 
255     for( ; sz.height--; src1 = (const T *)((const uchar *)src1 + step1),
256                         src2 = (const T *)((const uchar *)src2 + step2),
257                         dst = (T *)((uchar *)dst + step) )
258     {
259         int x = 0;
260 
261 #if CV_AVX2
262         if( USE_AVX2 )
263         {
264             if( (((size_t)src1|(size_t)src2|(size_t)dst)&31) == 0 )
265             {
266                 for( ; x <= sz.width - 4; x += 4 )
267                 {
268                     typename VLoadStore256Aligned<T>::reg_type r0 = VLoadStore256Aligned<T>::load(src1 + x);
269                     r0 = op64(r0, VLoadStore256Aligned<T>::load(src2 + x));
270                     VLoadStore256Aligned<T>::store(dst + x, r0);
271                 }
272             }
273         }
274 #elif CV_SSE2
275         if( USE_SSE2 )
276         {
277             if( (((size_t)src1|(size_t)src2|(size_t)dst)&15) == 0 )
278             {
279                 for( ; x <= sz.width - 4; x += 4 )
280                 {
281                     typename VLoadStore128Aligned<T>::reg_type r0 = VLoadStore128Aligned<T>::load(src1 + x    );
282                     typename VLoadStore128Aligned<T>::reg_type r1 = VLoadStore128Aligned<T>::load(src1 + x + 2);
283                     r0 = op64(r0, VLoadStore128Aligned<T>::load(src2 + x    ));
284                     r1 = op64(r1, VLoadStore128Aligned<T>::load(src2 + x + 2));
285                     VLoadStore128Aligned<T>::store(dst + x    , r0);
286                     VLoadStore128Aligned<T>::store(dst + x + 2, r1);
287                 }
288             }
289         }
290 #endif
291 
292         for( ; x <= sz.width - 4; x += 4 )
293         {
294             T v0 = op(src1[x], src2[x]);
295             T v1 = op(src1[x+1], src2[x+1]);
296             dst[x] = v0; dst[x+1] = v1;
297             v0 = op(src1[x+2], src2[x+2]);
298             v1 = op(src1[x+3], src2[x+3]);
299             dst[x+2] = v0; dst[x+3] = v1;
300         }
301 
302         for( ; x < sz.width; x++ )
303             dst[x] = op(src1[x], src2[x]);
304     }
305 }
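// vBinOp64 handles 64-bit (double) elements: only the aligned SSE2/AVX2 path is
// vectorized (4 elements per iteration); on other builds, including NEON, the
// work is done entirely by the unrolled scalar loop and the scalar tail.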
306 
307 #if CV_AVX2
308 
309 #define FUNCTOR_LOADSTORE_CAST(name, template_arg, register_type, load_body, store_body)         \
310     template <>                                                                                  \
311     struct name<template_arg>{                                                                   \
312         typedef register_type reg_type;                                                          \
313         static reg_type load(const template_arg * p) { return load_body ((const reg_type *)p); } \
314         static void store(template_arg * p, reg_type v) { store_body ((reg_type *)p, v); }       \
315     }
316 
317 #define FUNCTOR_LOADSTORE(name, template_arg, register_type, load_body, store_body) \
318     template <>                                                                     \
319     struct name<template_arg>{                                                      \
320         typedef register_type reg_type;                                             \
321         static reg_type load(const template_arg * p) { return load_body (p); }      \
322         static void store(template_arg * p, reg_type v) { store_body (p, v); }      \
323     }
324 
325 #define FUNCTOR_CLOSURE_2arg(name, template_arg, body)                         \
326     template<>                                                                 \
327     struct name<template_arg>                                                  \
328     {                                                                          \
329         VLoadStore256<template_arg>::reg_type operator()(                      \
330                         const VLoadStore256<template_arg>::reg_type & a,       \
331                         const VLoadStore256<template_arg>::reg_type & b) const \
332         {                                                                      \
333             body;                                                              \
334         }                                                                      \
335     }
336 
337 #define FUNCTOR_CLOSURE_1arg(name, template_arg, body)                         \
338     template<>                                                                 \
339     struct name<template_arg>                                                  \
340     {                                                                          \
341         VLoadStore256<template_arg>::reg_type operator()(                      \
342                         const VLoadStore256<template_arg>::reg_type & a,       \
343                         const VLoadStore256<template_arg>::reg_type &  ) const \
344         {                                                                      \
345             body;                                                              \
346         }                                                                      \
347     }
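// For reference, an invocation such as
//     FUNCTOR_CLOSURE_2arg(VAdd, int, return _mm256_add_epi32 (a, b));
// expands (roughly) to the following specialization, which is what the vector
// loops above call through:
//
//     template<> struct VAdd<int>
//     {
//         VLoadStore256<int>::reg_type operator()(
//                         const VLoadStore256<int>::reg_type & a,
//                         const VLoadStore256<int>::reg_type & b) const
//         { return _mm256_add_epi32 (a, b); }
//     };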
348 
349 FUNCTOR_LOADSTORE_CAST(VLoadStore256,  uchar, __m256i, _mm256_loadu_si256, _mm256_storeu_si256);
350 FUNCTOR_LOADSTORE_CAST(VLoadStore256,  schar, __m256i, _mm256_loadu_si256, _mm256_storeu_si256);
351 FUNCTOR_LOADSTORE_CAST(VLoadStore256, ushort, __m256i, _mm256_loadu_si256, _mm256_storeu_si256);
352 FUNCTOR_LOADSTORE_CAST(VLoadStore256,  short, __m256i, _mm256_loadu_si256, _mm256_storeu_si256);
353 FUNCTOR_LOADSTORE_CAST(VLoadStore256,    int, __m256i, _mm256_loadu_si256, _mm256_storeu_si256);
354 FUNCTOR_LOADSTORE(     VLoadStore256,  float, __m256 , _mm256_loadu_ps   , _mm256_storeu_ps   );
355 FUNCTOR_LOADSTORE(     VLoadStore256, double, __m256d, _mm256_loadu_pd   , _mm256_storeu_pd   );
356 
357 FUNCTOR_LOADSTORE_CAST(VLoadStore256Aligned,    int, __m256i, _mm256_load_si256, _mm256_store_si256);
358 FUNCTOR_LOADSTORE(     VLoadStore256Aligned,  float, __m256 , _mm256_load_ps   , _mm256_store_ps   );
359 FUNCTOR_LOADSTORE(     VLoadStore256Aligned, double, __m256d, _mm256_load_pd   , _mm256_store_pd   );
360 
361 FUNCTOR_TEMPLATE(VAdd);
362 FUNCTOR_CLOSURE_2arg(VAdd,  uchar, return _mm256_adds_epu8 (a, b));
363 FUNCTOR_CLOSURE_2arg(VAdd,  schar, return _mm256_adds_epi8 (a, b));
364 FUNCTOR_CLOSURE_2arg(VAdd, ushort, return _mm256_adds_epu16(a, b));
365 FUNCTOR_CLOSURE_2arg(VAdd,  short, return _mm256_adds_epi16(a, b));
366 FUNCTOR_CLOSURE_2arg(VAdd,    int, return _mm256_add_epi32 (a, b));
367 FUNCTOR_CLOSURE_2arg(VAdd,  float, return _mm256_add_ps    (a, b));
368 FUNCTOR_CLOSURE_2arg(VAdd, double, return _mm256_add_pd    (a, b));
369 
370 FUNCTOR_TEMPLATE(VSub);
371 FUNCTOR_CLOSURE_2arg(VSub,  uchar, return _mm256_subs_epu8 (a, b));
372 FUNCTOR_CLOSURE_2arg(VSub,  schar, return _mm256_subs_epi8 (a, b));
373 FUNCTOR_CLOSURE_2arg(VSub, ushort, return _mm256_subs_epu16(a, b));
374 FUNCTOR_CLOSURE_2arg(VSub,  short, return _mm256_subs_epi16(a, b));
375 FUNCTOR_CLOSURE_2arg(VSub,    int, return _mm256_sub_epi32 (a, b));
376 FUNCTOR_CLOSURE_2arg(VSub,  float, return _mm256_sub_ps    (a, b));
377 FUNCTOR_CLOSURE_2arg(VSub, double, return _mm256_sub_pd    (a, b));
378 
379 FUNCTOR_TEMPLATE(VMin);
380 FUNCTOR_CLOSURE_2arg(VMin,  uchar, return _mm256_min_epu8 (a, b));
381 FUNCTOR_CLOSURE_2arg(VMin,  schar, return _mm256_min_epi8 (a, b));
382 FUNCTOR_CLOSURE_2arg(VMin, ushort, return _mm256_min_epi16(a, b));
383 FUNCTOR_CLOSURE_2arg(VMin,  short, return _mm256_min_epi16(a, b));
384 FUNCTOR_CLOSURE_2arg(VMin,    int, return _mm256_min_epi32(a, b));
385 FUNCTOR_CLOSURE_2arg(VMin,  float, return _mm256_min_ps   (a, b));
386 FUNCTOR_CLOSURE_2arg(VMin, double, return _mm256_min_pd   (a, b));
387 
388 FUNCTOR_TEMPLATE(VMax);
389 FUNCTOR_CLOSURE_2arg(VMax,  uchar, return _mm256_max_epu8 (a, b));
390 FUNCTOR_CLOSURE_2arg(VMax,  schar, return _mm256_max_epi8 (a, b));
391 FUNCTOR_CLOSURE_2arg(VMax, ushort, return _mm256_max_epu16(a, b));
392 FUNCTOR_CLOSURE_2arg(VMax,  short, return _mm256_max_epi16(a, b));
393 FUNCTOR_CLOSURE_2arg(VMax,    int, return _mm256_max_epi32(a, b));
394 FUNCTOR_CLOSURE_2arg(VMax,  float, return _mm256_max_ps   (a, b));
395 FUNCTOR_CLOSURE_2arg(VMax, double, return _mm256_max_pd   (a, b));
396 
397 
398 static unsigned int CV_DECL_ALIGNED(32) v32f_absmask[] = { 0x7fffffff, 0x7fffffff, 0x7fffffff, 0x7fffffff,
399                                                            0x7fffffff, 0x7fffffff, 0x7fffffff, 0x7fffffff };
400 static unsigned int CV_DECL_ALIGNED(32) v64f_absmask[] = { 0xffffffff, 0x7fffffff, 0xffffffff, 0x7fffffff,
401                                                            0xffffffff, 0x7fffffff, 0xffffffff, 0x7fffffff };
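// v32f_absmask clears the sign bit of each 32-bit float lane; for doubles the
// sign bit sits in the upper 32-bit half of each 64-bit lane, hence the
// alternating 0xffffffff / 0x7fffffff pattern (little-endian layout). Both masks
// are used by the VAbsDiff<float>/<double> closures below to compute |a - b|
// with a single bitwise AND.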
402 
403 FUNCTOR_TEMPLATE(VAbsDiff);
404 FUNCTOR_CLOSURE_2arg(VAbsDiff,  uchar,
405         return _mm256_add_epi8(_mm256_subs_epu8(a, b), _mm256_subs_epu8(b, a));
406     );
407 FUNCTOR_CLOSURE_2arg(VAbsDiff,  schar,
408         __m256i d = _mm256_subs_epi8(a, b);
409         __m256i m = _mm256_cmpgt_epi8(b, a);
410         return _mm256_subs_epi8(_mm256_xor_si256(d, m), m);
411     );
412 FUNCTOR_CLOSURE_2arg(VAbsDiff, ushort,
413         return _mm256_add_epi16(_mm256_subs_epu16(a, b), _mm256_subs_epu16(b, a));
414     );
415 FUNCTOR_CLOSURE_2arg(VAbsDiff,  short,
416         __m256i M = _mm256_max_epi16(a, b);
417         __m256i m = _mm256_min_epi16(a, b);
418         return _mm256_subs_epi16(M, m);
419     );
420 FUNCTOR_CLOSURE_2arg(VAbsDiff,    int,
421         __m256i d = _mm256_sub_epi32(a, b);
422         __m256i m = _mm256_cmpgt_epi32(b, a);
423         return _mm256_sub_epi32(_mm256_xor_si256(d, m), m);
424     );
425 FUNCTOR_CLOSURE_2arg(VAbsDiff,  float,
426         return _mm256_and_ps(_mm256_sub_ps(a, b), *(const __m256*)v32f_absmask);
427     );
428 FUNCTOR_CLOSURE_2arg(VAbsDiff, double,
429         return _mm256_and_pd(_mm256_sub_pd(a, b), *(const __m256d*)v64f_absmask);
430     );
431 
432 FUNCTOR_TEMPLATE(VAnd);
433 FUNCTOR_CLOSURE_2arg(VAnd, uchar, return _mm256_and_si256(a, b));
434 FUNCTOR_TEMPLATE(VOr);
435 FUNCTOR_CLOSURE_2arg(VOr , uchar, return _mm256_or_si256 (a, b));
436 FUNCTOR_TEMPLATE(VXor);
437 FUNCTOR_CLOSURE_2arg(VXor, uchar, return _mm256_xor_si256(a, b));
438 FUNCTOR_TEMPLATE(VNot);
439 FUNCTOR_CLOSURE_1arg(VNot, uchar, return _mm256_xor_si256(_mm256_set1_epi32(-1), a));
440 
441 #elif CV_SSE2
442 
443 #define FUNCTOR_LOADSTORE_CAST(name, template_arg, register_type, load_body, store_body)\
444     template <>                                                                                  \
445     struct name<template_arg>{                                                                   \
446         typedef register_type reg_type;                                                          \
447         static reg_type load(const template_arg * p) { return load_body ((const reg_type *)p); } \
448         static void store(template_arg * p, reg_type v) { store_body ((reg_type *)p, v); }       \
449     }
450 
451 #define FUNCTOR_LOADSTORE(name, template_arg, register_type, load_body, store_body)\
452     template <>                                                                \
453     struct name<template_arg>{                                                 \
454         typedef register_type reg_type;                                        \
455         static reg_type load(const template_arg * p) { return load_body (p); } \
456         static void store(template_arg * p, reg_type v) { store_body (p, v); } \
457     }
458 
459 #define FUNCTOR_CLOSURE_2arg(name, template_arg, body)\
460     template<>                                                                 \
461     struct name<template_arg>                                                  \
462     {                                                                          \
463         VLoadStore128<template_arg>::reg_type operator()(                      \
464                         const VLoadStore128<template_arg>::reg_type & a,       \
465                         const VLoadStore128<template_arg>::reg_type & b) const \
466         {                                                                      \
467             body;                                                              \
468         }                                                                      \
469     }
470 
471 #define FUNCTOR_CLOSURE_1arg(name, template_arg, body)\
472     template<>                                                                 \
473     struct name<template_arg>                                                  \
474     {                                                                          \
475         VLoadStore128<template_arg>::reg_type operator()(                      \
476                         const VLoadStore128<template_arg>::reg_type & a,       \
477                         const VLoadStore128<template_arg>::reg_type &  ) const \
478         {                                                                      \
479             body;                                                              \
480         }                                                                      \
481     }
482 
483 FUNCTOR_LOADSTORE_CAST(VLoadStore128,  uchar, __m128i, _mm_loadu_si128, _mm_storeu_si128);
484 FUNCTOR_LOADSTORE_CAST(VLoadStore128,  schar, __m128i, _mm_loadu_si128, _mm_storeu_si128);
485 FUNCTOR_LOADSTORE_CAST(VLoadStore128, ushort, __m128i, _mm_loadu_si128, _mm_storeu_si128);
486 FUNCTOR_LOADSTORE_CAST(VLoadStore128,  short, __m128i, _mm_loadu_si128, _mm_storeu_si128);
487 FUNCTOR_LOADSTORE_CAST(VLoadStore128,    int, __m128i, _mm_loadu_si128, _mm_storeu_si128);
488 FUNCTOR_LOADSTORE(     VLoadStore128,  float, __m128 , _mm_loadu_ps   , _mm_storeu_ps   );
489 FUNCTOR_LOADSTORE(     VLoadStore128, double, __m128d, _mm_loadu_pd   , _mm_storeu_pd   );
490 
491 FUNCTOR_LOADSTORE_CAST(VLoadStore64,  uchar, __m128i, _mm_loadl_epi64, _mm_storel_epi64);
492 FUNCTOR_LOADSTORE_CAST(VLoadStore64,  schar, __m128i, _mm_loadl_epi64, _mm_storel_epi64);
493 FUNCTOR_LOADSTORE_CAST(VLoadStore64, ushort, __m128i, _mm_loadl_epi64, _mm_storel_epi64);
494 FUNCTOR_LOADSTORE_CAST(VLoadStore64,  short, __m128i, _mm_loadl_epi64, _mm_storel_epi64);
495 
496 FUNCTOR_LOADSTORE_CAST(VLoadStore128Aligned,    int, __m128i, _mm_load_si128, _mm_store_si128);
497 FUNCTOR_LOADSTORE(     VLoadStore128Aligned,  float, __m128 , _mm_load_ps   , _mm_store_ps   );
498 FUNCTOR_LOADSTORE(     VLoadStore128Aligned, double, __m128d, _mm_load_pd   , _mm_store_pd   );
499 
500 FUNCTOR_TEMPLATE(VAdd);
501 FUNCTOR_CLOSURE_2arg(VAdd,  uchar, return _mm_adds_epu8 (a, b));
502 FUNCTOR_CLOSURE_2arg(VAdd,  schar, return _mm_adds_epi8 (a, b));
503 FUNCTOR_CLOSURE_2arg(VAdd, ushort, return _mm_adds_epu16(a, b));
504 FUNCTOR_CLOSURE_2arg(VAdd,  short, return _mm_adds_epi16(a, b));
505 FUNCTOR_CLOSURE_2arg(VAdd,    int, return _mm_add_epi32 (a, b));
506 FUNCTOR_CLOSURE_2arg(VAdd,  float, return _mm_add_ps    (a, b));
507 FUNCTOR_CLOSURE_2arg(VAdd, double, return _mm_add_pd    (a, b));
508 
509 FUNCTOR_TEMPLATE(VSub);
510 FUNCTOR_CLOSURE_2arg(VSub,  uchar, return _mm_subs_epu8 (a, b));
511 FUNCTOR_CLOSURE_2arg(VSub,  schar, return _mm_subs_epi8 (a, b));
512 FUNCTOR_CLOSURE_2arg(VSub, ushort, return _mm_subs_epu16(a, b));
513 FUNCTOR_CLOSURE_2arg(VSub,  short, return _mm_subs_epi16(a, b));
514 FUNCTOR_CLOSURE_2arg(VSub,    int, return _mm_sub_epi32 (a, b));
515 FUNCTOR_CLOSURE_2arg(VSub,  float, return _mm_sub_ps    (a, b));
516 FUNCTOR_CLOSURE_2arg(VSub, double, return _mm_sub_pd    (a, b));
517 
518 FUNCTOR_TEMPLATE(VMin);
519 FUNCTOR_CLOSURE_2arg(VMin, uchar, return _mm_min_epu8(a, b));
520 FUNCTOR_CLOSURE_2arg(VMin, schar,
521         __m128i m = _mm_cmpgt_epi8(a, b);
522         return _mm_xor_si128(a, _mm_and_si128(_mm_xor_si128(a, b), m));
523     );
524 FUNCTOR_CLOSURE_2arg(VMin, ushort, return _mm_subs_epu16(a, _mm_subs_epu16(a, b)));
525 FUNCTOR_CLOSURE_2arg(VMin,  short, return _mm_min_epi16(a, b));
526 FUNCTOR_CLOSURE_2arg(VMin,    int,
527         __m128i m = _mm_cmpgt_epi32(a, b);
528         return _mm_xor_si128(a, _mm_and_si128(_mm_xor_si128(a, b), m));
529     );
530 FUNCTOR_CLOSURE_2arg(VMin,  float, return _mm_min_ps(a, b));
531 FUNCTOR_CLOSURE_2arg(VMin, double, return _mm_min_pd(a, b));
532 
533 FUNCTOR_TEMPLATE(VMax);
534 FUNCTOR_CLOSURE_2arg(VMax, uchar, return _mm_max_epu8(a, b));
535 FUNCTOR_CLOSURE_2arg(VMax, schar,
536         __m128i m = _mm_cmpgt_epi8(b, a);
537         return _mm_xor_si128(a, _mm_and_si128(_mm_xor_si128(a, b), m));
538     );
539 FUNCTOR_CLOSURE_2arg(VMax, ushort, return _mm_adds_epu16(_mm_subs_epu16(a, b), b));
540 FUNCTOR_CLOSURE_2arg(VMax,  short, return _mm_max_epi16(a, b));
541 FUNCTOR_CLOSURE_2arg(VMax,    int,
542         __m128i m = _mm_cmpgt_epi32(b, a);
543         return _mm_xor_si128(a, _mm_and_si128(_mm_xor_si128(a, b), m));
544     );
545 FUNCTOR_CLOSURE_2arg(VMax,  float, return _mm_max_ps(a, b));
546 FUNCTOR_CLOSURE_2arg(VMax, double, return _mm_max_pd(a, b));
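// SSE2 only provides min/max intrinsics for epu8, epi16, ps and pd. The schar
// and int variants above are emulated with a cmpgt mask blended via xor/and,
// and the ushort variants use the saturating identities
// min(a, b) = a - max(a - b, 0) and max(a, b) = max(a - b, 0) + b.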
547 
548 
549 static unsigned int CV_DECL_ALIGNED(16) v32f_absmask[] = { 0x7fffffff, 0x7fffffff, 0x7fffffff, 0x7fffffff };
550 static unsigned int CV_DECL_ALIGNED(16) v64f_absmask[] = { 0xffffffff, 0x7fffffff, 0xffffffff, 0x7fffffff };
551 
552 FUNCTOR_TEMPLATE(VAbsDiff);
553 FUNCTOR_CLOSURE_2arg(VAbsDiff,  uchar,
554         return _mm_add_epi8(_mm_subs_epu8(a, b), _mm_subs_epu8(b, a));
555     );
556 FUNCTOR_CLOSURE_2arg(VAbsDiff,  schar,
557         __m128i d = _mm_subs_epi8(a, b);
558         __m128i m = _mm_cmpgt_epi8(b, a);
559         return _mm_subs_epi8(_mm_xor_si128(d, m), m);
560     );
561 FUNCTOR_CLOSURE_2arg(VAbsDiff, ushort,
562         return _mm_add_epi16(_mm_subs_epu16(a, b), _mm_subs_epu16(b, a));
563     );
564 FUNCTOR_CLOSURE_2arg(VAbsDiff,  short,
565         __m128i M = _mm_max_epi16(a, b);
566         __m128i m = _mm_min_epi16(a, b);
567         return _mm_subs_epi16(M, m);
568     );
569 FUNCTOR_CLOSURE_2arg(VAbsDiff,    int,
570         __m128i d = _mm_sub_epi32(a, b);
571         __m128i m = _mm_cmpgt_epi32(b, a);
572         return _mm_sub_epi32(_mm_xor_si128(d, m), m);
573     );
574 FUNCTOR_CLOSURE_2arg(VAbsDiff,  float,
575         return _mm_and_ps(_mm_sub_ps(a,b), *(const __m128*)v32f_absmask);
576     );
577 FUNCTOR_CLOSURE_2arg(VAbsDiff, double,
578         return _mm_and_pd(_mm_sub_pd(a,b), *(const __m128d*)v64f_absmask);
579     );
580 
581 FUNCTOR_TEMPLATE(VAnd);
582 FUNCTOR_CLOSURE_2arg(VAnd, uchar, return _mm_and_si128(a, b));
583 FUNCTOR_TEMPLATE(VOr);
584 FUNCTOR_CLOSURE_2arg(VOr , uchar, return _mm_or_si128 (a, b));
585 FUNCTOR_TEMPLATE(VXor);
586 FUNCTOR_CLOSURE_2arg(VXor, uchar, return _mm_xor_si128(a, b));
587 FUNCTOR_TEMPLATE(VNot);
588 FUNCTOR_CLOSURE_1arg(VNot, uchar, return _mm_xor_si128(_mm_set1_epi32(-1), a));
589 #endif
590 
591 #if CV_NEON
592 
593 #define FUNCTOR_LOADSTORE(name, template_arg, register_type, load_body, store_body)\
594     template <>                                                                \
595     struct name<template_arg>{                                                 \
596         typedef register_type reg_type;                                        \
597         static reg_type load(const template_arg * p) { return load_body (p);}; \
598         static void store(template_arg * p, reg_type v) { store_body (p, v);}; \
599     }
600 
601 #define FUNCTOR_CLOSURE_2arg(name, template_arg, body)\
602     template<>                                                         \
603     struct name<template_arg>                                          \
604     {                                                                  \
605         VLoadStore128<template_arg>::reg_type operator()(              \
606                         VLoadStore128<template_arg>::reg_type a,       \
607                         VLoadStore128<template_arg>::reg_type b) const \
608         {                                                              \
609             return body;                                               \
610         };                                                             \
611     }
612 
613 #define FUNCTOR_CLOSURE_1arg(name, template_arg, body)\
614     template<>                                                         \
615     struct name<template_arg>                                          \
616     {                                                                  \
617         VLoadStore128<template_arg>::reg_type operator()(              \
618                         VLoadStore128<template_arg>::reg_type a,       \
619                         VLoadStore128<template_arg>::reg_type  ) const \
620         {                                                              \
621             return body;                                               \
622         };                                                             \
623     }
624 
625 FUNCTOR_LOADSTORE(VLoadStore128,  uchar,  uint8x16_t, vld1q_u8 , vst1q_u8 );
626 FUNCTOR_LOADSTORE(VLoadStore128,  schar,   int8x16_t, vld1q_s8 , vst1q_s8 );
627 FUNCTOR_LOADSTORE(VLoadStore128, ushort,  uint16x8_t, vld1q_u16, vst1q_u16);
628 FUNCTOR_LOADSTORE(VLoadStore128,  short,   int16x8_t, vld1q_s16, vst1q_s16);
629 FUNCTOR_LOADSTORE(VLoadStore128,    int,   int32x4_t, vld1q_s32, vst1q_s32);
630 FUNCTOR_LOADSTORE(VLoadStore128,  float, float32x4_t, vld1q_f32, vst1q_f32);
631 
632 FUNCTOR_TEMPLATE(VAdd);
633 FUNCTOR_CLOSURE_2arg(VAdd,  uchar, vqaddq_u8 (a, b));
634 FUNCTOR_CLOSURE_2arg(VAdd,  schar, vqaddq_s8 (a, b));
635 FUNCTOR_CLOSURE_2arg(VAdd, ushort, vqaddq_u16(a, b));
636 FUNCTOR_CLOSURE_2arg(VAdd,  short, vqaddq_s16(a, b));
637 FUNCTOR_CLOSURE_2arg(VAdd,    int, vaddq_s32 (a, b));
638 FUNCTOR_CLOSURE_2arg(VAdd,  float, vaddq_f32 (a, b));
639 
640 FUNCTOR_TEMPLATE(VSub);
641 FUNCTOR_CLOSURE_2arg(VSub,  uchar, vqsubq_u8 (a, b));
642 FUNCTOR_CLOSURE_2arg(VSub,  schar, vqsubq_s8 (a, b));
643 FUNCTOR_CLOSURE_2arg(VSub, ushort, vqsubq_u16(a, b));
644 FUNCTOR_CLOSURE_2arg(VSub,  short, vqsubq_s16(a, b));
645 FUNCTOR_CLOSURE_2arg(VSub,    int, vsubq_s32 (a, b));
646 FUNCTOR_CLOSURE_2arg(VSub,  float, vsubq_f32 (a, b));
647 
648 FUNCTOR_TEMPLATE(VMin);
649 FUNCTOR_CLOSURE_2arg(VMin,  uchar, vminq_u8 (a, b));
650 FUNCTOR_CLOSURE_2arg(VMin,  schar, vminq_s8 (a, b));
651 FUNCTOR_CLOSURE_2arg(VMin, ushort, vminq_u16(a, b));
652 FUNCTOR_CLOSURE_2arg(VMin,  short, vminq_s16(a, b));
653 FUNCTOR_CLOSURE_2arg(VMin,    int, vminq_s32(a, b));
654 FUNCTOR_CLOSURE_2arg(VMin,  float, vminq_f32(a, b));
655 
656 FUNCTOR_TEMPLATE(VMax);
657 FUNCTOR_CLOSURE_2arg(VMax,  uchar, vmaxq_u8 (a, b));
658 FUNCTOR_CLOSURE_2arg(VMax,  schar, vmaxq_s8 (a, b));
659 FUNCTOR_CLOSURE_2arg(VMax, ushort, vmaxq_u16(a, b));
660 FUNCTOR_CLOSURE_2arg(VMax,  short, vmaxq_s16(a, b));
661 FUNCTOR_CLOSURE_2arg(VMax,    int, vmaxq_s32(a, b));
662 FUNCTOR_CLOSURE_2arg(VMax,  float, vmaxq_f32(a, b));
663 
664 FUNCTOR_TEMPLATE(VAbsDiff);
665 FUNCTOR_CLOSURE_2arg(VAbsDiff,  uchar, vabdq_u8  (a, b));
666 FUNCTOR_CLOSURE_2arg(VAbsDiff,  schar, vqabsq_s8 (vqsubq_s8(a, b)));
667 FUNCTOR_CLOSURE_2arg(VAbsDiff, ushort, vabdq_u16 (a, b));
668 FUNCTOR_CLOSURE_2arg(VAbsDiff,  short, vqabsq_s16(vqsubq_s16(a, b)));
669 FUNCTOR_CLOSURE_2arg(VAbsDiff,    int, vabdq_s32 (a, b));
670 FUNCTOR_CLOSURE_2arg(VAbsDiff,  float, vabdq_f32 (a, b));
671 
672 FUNCTOR_TEMPLATE(VAnd);
673 FUNCTOR_CLOSURE_2arg(VAnd, uchar, vandq_u8(a, b));
674 FUNCTOR_TEMPLATE(VOr);
675 FUNCTOR_CLOSURE_2arg(VOr , uchar, vorrq_u8(a, b));
676 FUNCTOR_TEMPLATE(VXor);
677 FUNCTOR_CLOSURE_2arg(VXor, uchar, veorq_u8(a, b));
678 FUNCTOR_TEMPLATE(VNot);
679 FUNCTOR_CLOSURE_1arg(VNot, uchar, vmvnq_u8(a   ));
680 #endif
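// The NEON block mirrors the SSE functors using 128-bit vld1q/vst1q loads and
// stores and the vqadd/vqsub/vmin/vmax/vabd intrinsics. No double or 64-bit
// half-register specializations are provided here, so those cases fall back to
// the scalar loops on ARM.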
681 
682 #if CV_SSE2 || CV_NEON
683 #define IF_SIMD(op) op
684 #else
685 #define IF_SIMD(op) NOP
686 #endif
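// IF_SIMD(op) passes the vector functor through when SSE2 or NEON is available
// and substitutes the empty NOP placeholder otherwise; the vBinOp* templates
// only instantiate their vector parameter inside the matching #if blocks, so
// NOP never needs to be callable.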
687 
688 template<> inline uchar OpAdd<uchar>::operator ()(uchar a, uchar b) const
689 { return CV_FAST_CAST_8U(a + b); }
690 template<> inline uchar OpSub<uchar>::operator ()(uchar a, uchar b) const
691 { return CV_FAST_CAST_8U(a - b); }
692 
693 template<typename T> struct OpAbsDiff
694 {
695     typedef T type1;
696     typedef T type2;
697     typedef T rtype;
698     T operator()(T a, T b) const { return (T)std::abs(a - b); }
699 };
700 
701 template<> inline short OpAbsDiff<short>::operator ()(short a, short b) const
702 { return saturate_cast<short>(std::abs(a - b)); }
703 
704 template<> inline schar OpAbsDiff<schar>::operator ()(schar a, schar b) const
705 { return saturate_cast<schar>(std::abs(a - b)); }
706 
707 template<typename T, typename WT=T> struct OpAbsDiffS
708 {
709     typedef T type1;
710     typedef WT type2;
711     typedef T rtype;
712     T operator()(T a, WT b) const { return saturate_cast<T>(std::abs(a - b)); }
713 };
714 
715 template<typename T> struct OpAnd
716 {
717     typedef T type1;
718     typedef T type2;
719     typedef T rtype;
720     T operator()( T a, T b ) const { return a & b; }
721 };
722 
723 template<typename T> struct OpOr
724 {
725     typedef T type1;
726     typedef T type2;
727     typedef T rtype;
728     T operator()( T a, T b ) const { return a | b; }
729 };
730 
731 template<typename T> struct OpXor
732 {
733     typedef T type1;
734     typedef T type2;
735     typedef T rtype;
736     T operator()( T a, T b ) const { return a ^ b; }
737 };
738 
739 template<typename T> struct OpNot
740 {
741     typedef T type1;
742     typedef T type2;
743     typedef T rtype;
744     T operator()( T a, T ) const { return ~a; }
745 };
746 
747 #if (ARITHM_USE_IPP == 1)
748 static inline void fixSteps(Size sz, size_t elemSize, size_t& step1, size_t& step2, size_t& step)
749 {
750     if( sz.height == 1 )
751         step1 = step2 = step = sz.width*elemSize;
752 }
753 #endif
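// fixSteps(): when the ROI has been flattened to a single row (sz.height == 1),
// the incoming step values no longer describe a valid stride for that widened
// row, which the IPP primitives would reject; all three steps are therefore
// rewritten to width * elemSize. Compiled only when ARITHM_USE_IPP is enabled.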
754 
755 static void add8u( const uchar* src1, size_t step1,
756                    const uchar* src2, size_t step2,
757                    uchar* dst, size_t step, Size sz, void* )
758 {
759 #if (ARITHM_USE_IPP == 1)
760     CV_IPP_CHECK()
761     {
762         fixSteps(sz, sizeof(dst[0]), step1, step2, step);
763         if (0 <= ippiAdd_8u_C1RSfs(src1, (int)step1, src2, (int)step2, dst, (int)step, ippiSize(sz), 0))
764         {
765             CV_IMPL_ADD(CV_IMPL_IPP);
766             return;
767         }
768         setIppErrorStatus();
769     }
770 #endif
771     (vBinOp<uchar, OpAdd<uchar>, IF_SIMD(VAdd<uchar>)>(src1, step1, src2, step2, dst, step, sz));
772 }
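// add8u() shows the pattern shared by the type-specific wrappers below: when
// IPP is enabled, try the corresponding ippi/ipps primitive first and return on
// success (recording CV_IMPL_IPP); on failure, clear the IPP error status and
// fall back to the generic vBinOp/vBinOp32/vBinOp64 template with the matching
// scalar Op and vector V-functor.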
773 
774 static void add8s( const schar* src1, size_t step1,
775                    const schar* src2, size_t step2,
776                    schar* dst, size_t step, Size sz, void* )
777 {
778     vBinOp<schar, OpAdd<schar>, IF_SIMD(VAdd<schar>)>(src1, step1, src2, step2, dst, step, sz);
779 }
780 
781 static void add16u( const ushort* src1, size_t step1,
782                     const ushort* src2, size_t step2,
783                     ushort* dst, size_t step, Size sz, void* )
784 {
785 #if (ARITHM_USE_IPP == 1)
786     CV_IPP_CHECK()
787     {
788         fixSteps(sz, sizeof(dst[0]), step1, step2, step);
789         if (0 <= ippiAdd_16u_C1RSfs(src1, (int)step1, src2, (int)step2, dst, (int)step, ippiSize(sz), 0))
790         {
791             CV_IMPL_ADD(CV_IMPL_IPP);
792             return;
793         }
794         setIppErrorStatus();
795     }
796 #endif
797     (vBinOp<ushort, OpAdd<ushort>, IF_SIMD(VAdd<ushort>)>(src1, step1, src2, step2, dst, step, sz));
798 }
799 
800 static void add16s( const short* src1, size_t step1,
801                     const short* src2, size_t step2,
802                     short* dst, size_t step, Size sz, void* )
803 {
804 #if (ARITHM_USE_IPP == 1)
805     CV_IPP_CHECK()
806     {
807         fixSteps(sz, sizeof(dst[0]), step1, step2, step);
808         if (0 <= ippiAdd_16s_C1RSfs(src1, (int)step1, src2, (int)step2, dst, (int)step, ippiSize(sz), 0))
809         {
810             CV_IMPL_ADD(CV_IMPL_IPP);
811             return;
812         }
813         setIppErrorStatus();
814     }
815 #endif
816     (vBinOp<short, OpAdd<short>, IF_SIMD(VAdd<short>)>(src1, step1, src2, step2, dst, step, sz));
817 }
818 
819 static void add32s( const int* src1, size_t step1,
820                     const int* src2, size_t step2,
821                     int* dst, size_t step, Size sz, void* )
822 {
823     vBinOp32<int, OpAdd<int>, IF_SIMD(VAdd<int>)>(src1, step1, src2, step2, dst, step, sz);
824 }
825 
826 static void add32f( const float* src1, size_t step1,
827                     const float* src2, size_t step2,
828                     float* dst, size_t step, Size sz, void* )
829 {
830 #if (ARITHM_USE_IPP == 1)
831     CV_IPP_CHECK()
832     {
833         fixSteps(sz, sizeof(dst[0]), step1, step2, step);
834         if (0 <= ippiAdd_32f_C1R(src1, (int)step1, src2, (int)step2, dst, (int)step, ippiSize(sz)))
835         {
836             CV_IMPL_ADD(CV_IMPL_IPP);
837             return;
838         }
839         setIppErrorStatus();
840     }
841 #endif
842     (vBinOp32<float, OpAdd<float>, IF_SIMD(VAdd<float>)>(src1, step1, src2, step2, dst, step, sz));
843 }
844 
845 static void add64f( const double* src1, size_t step1,
846                     const double* src2, size_t step2,
847                     double* dst, size_t step, Size sz, void* )
848 {
849     vBinOp64<double, OpAdd<double>, IF_SIMD(VAdd<double>)>(src1, step1, src2, step2, dst, step, sz);
850 }
851 
852 static void sub8u( const uchar* src1, size_t step1,
853                    const uchar* src2, size_t step2,
854                    uchar* dst, size_t step, Size sz, void* )
855 {
856 #if (ARITHM_USE_IPP == 1)
857     CV_IPP_CHECK()
858     {
859         fixSteps(sz, sizeof(dst[0]), step1, step2, step);
860         if (0 <= ippiSub_8u_C1RSfs(src2, (int)step2, src1, (int)step1, dst, (int)step, ippiSize(sz), 0))
861         {
862             CV_IMPL_ADD(CV_IMPL_IPP);
863             return;
864         }
865         setIppErrorStatus();
866     }
867 #endif
868     (vBinOp<uchar, OpSub<uchar>, IF_SIMD(VSub<uchar>)>(src1, step1, src2, step2, dst, step, sz));
869 }
870 
871 static void sub8s( const schar* src1, size_t step1,
872                    const schar* src2, size_t step2,
873                    schar* dst, size_t step, Size sz, void* )
874 {
875     vBinOp<schar, OpSub<schar>, IF_SIMD(VSub<schar>)>(src1, step1, src2, step2, dst, step, sz);
876 }
877 
878 static void sub16u( const ushort* src1, size_t step1,
879                     const ushort* src2, size_t step2,
880                     ushort* dst, size_t step, Size sz, void* )
881 {
882 #if (ARITHM_USE_IPP == 1)
883     CV_IPP_CHECK()
884     {
885         fixSteps(sz, sizeof(dst[0]), step1, step2, step);
886         if (0 <= ippiSub_16u_C1RSfs(src2, (int)step2, src1, (int)step1, dst, (int)step, ippiSize(sz), 0))
887         {
888             CV_IMPL_ADD(CV_IMPL_IPP);
889             return;
890         }
891         setIppErrorStatus();
892     }
893 #endif
894     (vBinOp<ushort, OpSub<ushort>, IF_SIMD(VSub<ushort>)>(src1, step1, src2, step2, dst, step, sz));
895 }
896 
897 static void sub16s( const short* src1, size_t step1,
898                     const short* src2, size_t step2,
899                     short* dst, size_t step, Size sz, void* )
900 {
901 #if (ARITHM_USE_IPP == 1)
902     CV_IPP_CHECK()
903     {
904         fixSteps(sz, sizeof(dst[0]), step1, step2, step);
905         if (0 <= ippiSub_16s_C1RSfs(src2, (int)step2, src1, (int)step1, dst, (int)step, ippiSize(sz), 0))
906         {
907             CV_IMPL_ADD(CV_IMPL_IPP);
908             return;
909         }
910         setIppErrorStatus();
911     }
912 #endif
913     (vBinOp<short, OpSub<short>, IF_SIMD(VSub<short>)>(src1, step1, src2, step2, dst, step, sz));
914 }
915 
916 static void sub32s( const int* src1, size_t step1,
917                     const int* src2, size_t step2,
918                     int* dst, size_t step, Size sz, void* )
919 {
920     vBinOp32<int, OpSub<int>, IF_SIMD(VSub<int>)>(src1, step1, src2, step2, dst, step, sz);
921 }
922 
923 static void sub32f( const float* src1, size_t step1,
924                    const float* src2, size_t step2,
925                    float* dst, size_t step, Size sz, void* )
926 {
927 #if (ARITHM_USE_IPP == 1)
928     CV_IPP_CHECK()
929     {
930         fixSteps(sz, sizeof(dst[0]), step1, step2, step);
931         if (0 <= ippiSub_32f_C1R(src2, (int)step2, src1, (int)step1, dst, (int)step, ippiSize(sz)))
932         {
933             CV_IMPL_ADD(CV_IMPL_IPP);
934             return;
935         }
936         setIppErrorStatus();
937     }
938 #endif
939     (vBinOp32<float, OpSub<float>, IF_SIMD(VSub<float>)>(src1, step1, src2, step2, dst, step, sz));
940 }
941 
942 static void sub64f( const double* src1, size_t step1,
943                     const double* src2, size_t step2,
944                     double* dst, size_t step, Size sz, void* )
945 {
946     vBinOp64<double, OpSub<double>, IF_SIMD(VSub<double>)>(src1, step1, src2, step2, dst, step, sz);
947 }
948 
949 template<> inline uchar OpMin<uchar>::operator ()(uchar a, uchar b) const { return CV_MIN_8U(a, b); }
950 template<> inline uchar OpMax<uchar>::operator ()(uchar a, uchar b) const { return CV_MAX_8U(a, b); }
951 
952 static void max8u( const uchar* src1, size_t step1,
953                    const uchar* src2, size_t step2,
954                    uchar* dst, size_t step, Size sz, void* )
955 {
956 #if (ARITHM_USE_IPP == 1)
957     CV_IPP_CHECK()
958     {
959         uchar* s1 = (uchar*)src1;
960         uchar* s2 = (uchar*)src2;
961         uchar* d  = dst;
962         fixSteps(sz, sizeof(dst[0]), step1, step2, step);
963         int i = 0;
964         for(; i < sz.height; i++)
965         {
966             if (0 > ippsMaxEvery_8u(s1, s2, d, sz.width))
967                 break;
968             s1 += step1;
969             s2 += step2;
970             d  += step;
971         }
972         if (i == sz.height)
973         {
974             CV_IMPL_ADD(CV_IMPL_IPP);
975             return;
976         }
977         setIppErrorStatus();
978     }
979 #endif
980     vBinOp<uchar, OpMax<uchar>, IF_SIMD(VMax<uchar>)>(src1, step1, src2, step2, dst, step, sz);
981 }
982 
983 static void max8s( const schar* src1, size_t step1,
984                    const schar* src2, size_t step2,
985                    schar* dst, size_t step, Size sz, void* )
986 {
987     vBinOp<schar, OpMax<schar>, IF_SIMD(VMax<schar>)>(src1, step1, src2, step2, dst, step, sz);
988 }
989 
990 static void max16u( const ushort* src1, size_t step1,
991                     const ushort* src2, size_t step2,
992                     ushort* dst, size_t step, Size sz, void* )
993 {
994 #if (ARITHM_USE_IPP == 1)
995     CV_IPP_CHECK()
996     {
997         ushort* s1 = (ushort*)src1;
998         ushort* s2 = (ushort*)src2;
999         ushort* d  = dst;
1000         fixSteps(sz, sizeof(dst[0]), step1, step2, step);
1001         int i = 0;
1002         for(; i < sz.height; i++)
1003         {
1004             if (0 > ippsMaxEvery_16u(s1, s2, d, sz.width))
1005                 break;
1006             s1 = (ushort*)((uchar*)s1 + step1);
1007             s2 = (ushort*)((uchar*)s2 + step2);
1008             d  = (ushort*)((uchar*)d + step);
1009         }
1010         if (i == sz.height)
1011         {
1012             CV_IMPL_ADD(CV_IMPL_IPP);
1013             return;
1014         }
1015         setIppErrorStatus();
1016     }
1017 #endif
1018     vBinOp<ushort, OpMax<ushort>, IF_SIMD(VMax<ushort>)>(src1, step1, src2, step2, dst, step, sz);
1019 }
1020 
1021 static void max16s( const short* src1, size_t step1,
1022                     const short* src2, size_t step2,
1023                     short* dst, size_t step, Size sz, void* )
1024 {
1025     vBinOp<short, OpMax<short>, IF_SIMD(VMax<short>)>(src1, step1, src2, step2, dst, step, sz);
1026 }
1027 
1028 static void max32s( const int* src1, size_t step1,
1029                     const int* src2, size_t step2,
1030                     int* dst, size_t step, Size sz, void* )
1031 {
1032     vBinOp32<int, OpMax<int>, IF_SIMD(VMax<int>)>(src1, step1, src2, step2, dst, step, sz);
1033 }
1034 
1035 static void max32f( const float* src1, size_t step1,
1036                     const float* src2, size_t step2,
1037                     float* dst, size_t step, Size sz, void* )
1038 {
1039 #if (ARITHM_USE_IPP == 1)
1040     CV_IPP_CHECK()
1041     {
1042         float* s1 = (float*)src1;
1043         float* s2 = (float*)src2;
1044         float* d  = dst;
1045         fixSteps(sz, sizeof(dst[0]), step1, step2, step);
1046         int i = 0;
1047         for(; i < sz.height; i++)
1048         {
1049             if (0 > ippsMaxEvery_32f(s1, s2, d, sz.width))
1050                 break;
1051             s1 = (float*)((uchar*)s1 + step1);
1052             s2 = (float*)((uchar*)s2 + step2);
1053             d  = (float*)((uchar*)d + step);
1054         }
1055         if (i == sz.height)
1056         {
1057             CV_IMPL_ADD(CV_IMPL_IPP);
1058             return;
1059         }
1060         setIppErrorStatus();
1061     }
1062 #endif
1063     vBinOp32<float, OpMax<float>, IF_SIMD(VMax<float>)>(src1, step1, src2, step2, dst, step, sz);
1064 }
1065 
1066 static void max64f( const double* src1, size_t step1,
1067                     const double* src2, size_t step2,
1068                     double* dst, size_t step, Size sz, void* )
1069 {
1070 #if ARITHM_USE_IPP == 1
1071     CV_IPP_CHECK()
1072     {
1073         double* s1 = (double*)src1;
1074         double* s2 = (double*)src2;
1075         double* d  = dst;
1076         fixSteps(sz, sizeof(dst[0]), step1, step2, step);
1077         int i = 0;
1078         for(; i < sz.height; i++)
1079         {
1080             if (0 > ippsMaxEvery_64f(s1, s2, d, sz.width))
1081                 break;
1082             s1 = (double*)((uchar*)s1 + step1);
1083             s2 = (double*)((uchar*)s2 + step2);
1084             d  = (double*)((uchar*)d + step);
1085         }
1086         if (i == sz.height)
1087         {
1088             CV_IMPL_ADD(CV_IMPL_IPP);
1089             return;
1090         }
1091         setIppErrorStatus();
1092     }
1093 #endif
1094     vBinOp64<double, OpMax<double>, IF_SIMD(VMax<double>)>(src1, step1, src2, step2, dst, step, sz);
1095 }
1096 
1097 static void min8u( const uchar* src1, size_t step1,
1098                    const uchar* src2, size_t step2,
1099                    uchar* dst, size_t step, Size sz, void* )
1100 {
1101 #if (ARITHM_USE_IPP == 1)
1102     CV_IPP_CHECK()
1103     {
1104         uchar* s1 = (uchar*)src1;
1105         uchar* s2 = (uchar*)src2;
1106         uchar* d  = dst;
1107         fixSteps(sz, sizeof(dst[0]), step1, step2, step);
1108         int i = 0;
1109         for(; i < sz.height; i++)
1110         {
1111             if (0 > ippsMinEvery_8u(s1, s2, d, sz.width))
1112                 break;
1113             s1 += step1;
1114             s2 += step2;
1115             d  += step;
1116         }
1117         if (i == sz.height)
1118         {
1119             CV_IMPL_ADD(CV_IMPL_IPP);
1120             return;
1121         }
1122         setIppErrorStatus();
1123     }
1124 #endif
1125     vBinOp<uchar, OpMin<uchar>, IF_SIMD(VMin<uchar>)>(src1, step1, src2, step2, dst, step, sz);
1126 }
1127 
1128 static void min8s( const schar* src1, size_t step1,
1129                    const schar* src2, size_t step2,
1130                    schar* dst, size_t step, Size sz, void* )
1131 {
1132     vBinOp<schar, OpMin<schar>, IF_SIMD(VMin<schar>)>(src1, step1, src2, step2, dst, step, sz);
1133 }
1134 
1135 static void min16u( const ushort* src1, size_t step1,
1136                     const ushort* src2, size_t step2,
1137                     ushort* dst, size_t step, Size sz, void* )
1138 {
1139 #if (ARITHM_USE_IPP == 1)
1140     CV_IPP_CHECK()
1141     {
1142         ushort* s1 = (ushort*)src1;
1143         ushort* s2 = (ushort*)src2;
1144         ushort* d  = dst;
1145         fixSteps(sz, sizeof(dst[0]), step1, step2, step);
1146         int i = 0;
1147         for(; i < sz.height; i++)
1148         {
1149             if (0 > ippsMinEvery_16u(s1, s2, d, sz.width))
1150                 break;
1151             s1 = (ushort*)((uchar*)s1 + step1);
1152             s2 = (ushort*)((uchar*)s2 + step2);
1153             d  = (ushort*)((uchar*)d + step);
1154         }
1155         if (i == sz.height)
1156         {
1157             CV_IMPL_ADD(CV_IMPL_IPP);
1158             return;
1159         }
1160         setIppErrorStatus();
1161     }
1162 #endif
1163     vBinOp<ushort, OpMin<ushort>, IF_SIMD(VMin<ushort>)>(src1, step1, src2, step2, dst, step, sz);
1164 }
1165 
1166 static void min16s( const short* src1, size_t step1,
1167                     const short* src2, size_t step2,
1168                     short* dst, size_t step, Size sz, void* )
1169 {
1170     vBinOp<short, OpMin<short>, IF_SIMD(VMin<short>)>(src1, step1, src2, step2, dst, step, sz);
1171 }
1172 
1173 static void min32s( const int* src1, size_t step1,
1174                     const int* src2, size_t step2,
1175                     int* dst, size_t step, Size sz, void* )
1176 {
1177     vBinOp32<int, OpMin<int>, IF_SIMD(VMin<int>)>(src1, step1, src2, step2, dst, step, sz);
1178 }
1179 
1180 static void min32f( const float* src1, size_t step1,
1181                     const float* src2, size_t step2,
1182                     float* dst, size_t step, Size sz, void* )
1183 {
1184 #if (ARITHM_USE_IPP == 1)
1185     CV_IPP_CHECK()
1186     {
1187         float* s1 = (float*)src1;
1188         float* s2 = (float*)src2;
1189         float* d  = dst;
1190         fixSteps(sz, sizeof(dst[0]), step1, step2, step);
1191         int i = 0;
1192         for(; i < sz.height; i++)
1193         {
1194             if (0 > ippsMinEvery_32f(s1, s2, d, sz.width))
1195                 break;
1196             s1 = (float*)((uchar*)s1 + step1);
1197             s2 = (float*)((uchar*)s2 + step2);
1198             d  = (float*)((uchar*)d + step);
1199         }
1200         if (i == sz.height)
1201         {
1202             CV_IMPL_ADD(CV_IMPL_IPP);
1203             return;
1204         }
1205         setIppErrorStatus();
1206     }
1207 #endif
1208     vBinOp32<float, OpMin<float>, IF_SIMD(VMin<float>)>(src1, step1, src2, step2, dst, step, sz);
1209 }
1210 
1211 static void min64f( const double* src1, size_t step1,
1212                     const double* src2, size_t step2,
1213                     double* dst, size_t step, Size sz, void* )
1214 {
1215 #if ARITHM_USE_IPP == 1
1216     CV_IPP_CHECK()
1217     {
1218         double* s1 = (double*)src1;
1219         double* s2 = (double*)src2;
1220         double* d  = dst;
1221         fixSteps(sz, sizeof(dst[0]), step1, step2, step);
1222         int i = 0;
1223         for(; i < sz.height; i++)
1224         {
1225             if (0 > ippsMinEvery_64f(s1, s2, d, sz.width))
1226                 break;
1227             s1 = (double*)((uchar*)s1 + step1);
1228             s2 = (double*)((uchar*)s2 + step2);
1229             d  = (double*)((uchar*)d + step);
1230         }
1231         if (i == sz.height)
1232         {
1233             CV_IMPL_ADD(CV_IMPL_IPP);
1234             return;
1235         }
1236         setIppErrorStatus();
1237     }
1238 #endif
1239     vBinOp64<double, OpMin<double>, IF_SIMD(VMin<double>)>(src1, step1, src2, step2, dst, step, sz);
1240 }
1241 
1242 static void absdiff8u( const uchar* src1, size_t step1,
1243                        const uchar* src2, size_t step2,
1244                        uchar* dst, size_t step, Size sz, void* )
1245 {
1246 #if (ARITHM_USE_IPP == 1)
1247     CV_IPP_CHECK()
1248     {
1249         fixSteps(sz, sizeof(dst[0]), step1, step2, step);
1250         if (0 <= ippiAbsDiff_8u_C1R(src1, (int)step1, src2, (int)step2, dst, (int)step, ippiSize(sz)))
1251         {
1252             CV_IMPL_ADD(CV_IMPL_IPP);
1253             return;
1254         }
1255         setIppErrorStatus();
1256     }
1257 #endif
1258     (vBinOp<uchar, OpAbsDiff<uchar>, IF_SIMD(VAbsDiff<uchar>)>(src1, step1, src2, step2, dst, step, sz));
1259 }
1260 
1261 static void absdiff8s( const schar* src1, size_t step1,
1262                        const schar* src2, size_t step2,
1263                        schar* dst, size_t step, Size sz, void* )
1264 {
1265     vBinOp<schar, OpAbsDiff<schar>, IF_SIMD(VAbsDiff<schar>)>(src1, step1, src2, step2, dst, step, sz);
1266 }
1267 
1268 static void absdiff16u( const ushort* src1, size_t step1,
1269                         const ushort* src2, size_t step2,
1270                         ushort* dst, size_t step, Size sz, void* )
1271 {
1272 #if (ARITHM_USE_IPP == 1)
1273     CV_IPP_CHECK()
1274     {
1275         fixSteps(sz, sizeof(dst[0]), step1, step2, step);
1276         if (0 <= ippiAbsDiff_16u_C1R(src1, (int)step1, src2, (int)step2, dst, (int)step, ippiSize(sz)))
1277         {
1278             CV_IMPL_ADD(CV_IMPL_IPP);
1279             return;
1280         }
1281         setIppErrorStatus();
1282     }
1283 #endif
1284     (vBinOp<ushort, OpAbsDiff<ushort>, IF_SIMD(VAbsDiff<ushort>)>(src1, step1, src2, step2, dst, step, sz));
1285 }
1286 
1287 static void absdiff16s( const short* src1, size_t step1,
1288                         const short* src2, size_t step2,
1289                         short* dst, size_t step, Size sz, void* )
1290 {
1291     vBinOp<short, OpAbsDiff<short>, IF_SIMD(VAbsDiff<short>)>(src1, step1, src2, step2, dst, step, sz);
1292 }
1293 
1294 static void absdiff32s( const int* src1, size_t step1,
1295                         const int* src2, size_t step2,
1296                         int* dst, size_t step, Size sz, void* )
1297 {
1298     vBinOp32<int, OpAbsDiff<int>, IF_SIMD(VAbsDiff<int>)>(src1, step1, src2, step2, dst, step, sz);
1299 }
1300 
1301 static void absdiff32f( const float* src1, size_t step1,
1302                         const float* src2, size_t step2,
1303                         float* dst, size_t step, Size sz, void* )
1304 {
1305 #if (ARITHM_USE_IPP == 1)
1306     CV_IPP_CHECK()
1307     {
1308         fixSteps(sz, sizeof(dst[0]), step1, step2, step);
1309         if (0 <= ippiAbsDiff_32f_C1R(src1, (int)step1, src2, (int)step2, dst, (int)step, ippiSize(sz)))
1310         {
1311             CV_IMPL_ADD(CV_IMPL_IPP);
1312             return;
1313         }
1314         setIppErrorStatus();
1315     }
1316 #endif
1317     (vBinOp32<float, OpAbsDiff<float>, IF_SIMD(VAbsDiff<float>)>(src1, step1, src2, step2, dst, step, sz));
1318 }
1319 
1320 static void absdiff64f( const double* src1, size_t step1,
1321                         const double* src2, size_t step2,
1322                         double* dst, size_t step, Size sz, void* )
1323 {
1324     vBinOp64<double, OpAbsDiff<double>, IF_SIMD(VAbsDiff<double>)>(src1, step1, src2, step2, dst, step, sz);
1325 }
1326 
1327 
1328 static void and8u( const uchar* src1, size_t step1,
1329                    const uchar* src2, size_t step2,
1330                    uchar* dst, size_t step, Size sz, void* )
1331 {
1332 #if (ARITHM_USE_IPP == 1)
1333     CV_IPP_CHECK()
1334     {
1335         fixSteps(sz, sizeof(dst[0]), step1, step2, step);
1336         if (0 <= ippiAnd_8u_C1R(src1, (int)step1, src2, (int)step2, dst, (int)step, ippiSize(sz)))
1337         {
1338             CV_IMPL_ADD(CV_IMPL_IPP);
1339             return;
1340         }
1341         setIppErrorStatus();
1342     }
1343 #endif
1344     (vBinOp<uchar, OpAnd<uchar>, IF_SIMD(VAnd<uchar>)>(src1, step1, src2, step2, dst, step, sz));
1345 }
1346 
1347 static void or8u( const uchar* src1, size_t step1,
1348                   const uchar* src2, size_t step2,
1349                   uchar* dst, size_t step, Size sz, void* )
1350 {
1351 #if (ARITHM_USE_IPP == 1)
1352     CV_IPP_CHECK()
1353     {
1354         fixSteps(sz, sizeof(dst[0]), step1, step2, step);
1355         if (0 <= ippiOr_8u_C1R(src1, (int)step1, src2, (int)step2, dst, (int)step, ippiSize(sz)))
1356         {
1357             CV_IMPL_ADD(CV_IMPL_IPP);
1358             return;
1359         }
1360         setIppErrorStatus();
1361     }
1362 #endif
1363     (vBinOp<uchar, OpOr<uchar>, IF_SIMD(VOr<uchar>)>(src1, step1, src2, step2, dst, step, sz));
1364 }
1365 
1366 static void xor8u( const uchar* src1, size_t step1,
1367                    const uchar* src2, size_t step2,
1368                    uchar* dst, size_t step, Size sz, void* )
1369 {
1370 #if (ARITHM_USE_IPP == 1)
1371     CV_IPP_CHECK()
1372     {
1373         fixSteps(sz, sizeof(dst[0]), step1, step2, step);
1374         if (0 <= ippiXor_8u_C1R(src1, (int)step1, src2, (int)step2, dst, (int)step, ippiSize(sz)))
1375         {
1376             CV_IMPL_ADD(CV_IMPL_IPP);
1377             return;
1378         }
1379         setIppErrorStatus();
1380     }
1381 #endif
1382     (vBinOp<uchar, OpXor<uchar>, IF_SIMD(VXor<uchar>)>(src1, step1, src2, step2, dst, step, sz));
1383 }
1384 
1385 static void not8u( const uchar* src1, size_t step1,
1386                    const uchar* src2, size_t step2,
1387                    uchar* dst, size_t step, Size sz, void* )
1388 {
1389 #if (ARITHM_USE_IPP == 1)
1390     CV_IPP_CHECK()
1391     {
1392         fixSteps(sz, sizeof(dst[0]), step1, step2, step); (void)src2;
1393         if (0 <= ippiNot_8u_C1R(src1, (int)step1, dst, (int)step, ippiSize(sz)))
1394         {
1395             CV_IMPL_ADD(CV_IMPL_IPP);
1396             return;
1397         }
1398         setIppErrorStatus();
1399     }
1400 #endif
1401     (vBinOp<uchar, OpNot<uchar>, IF_SIMD(VNot<uchar>)>(src1, step1, src2, step2, dst, step, sz));
1402 }
1403 
1404 /****************************************************************************************\
1405 *                                   logical operations                                   *
1406 \****************************************************************************************/
1407 
1408 void convertAndUnrollScalar( const Mat& sc, int buftype, uchar* scbuf, size_t blocksize )
1409 {
1410     int scn = (int)sc.total(), cn = CV_MAT_CN(buftype);
1411     size_t esz = CV_ELEM_SIZE(buftype);
1412     getConvertFunc(sc.depth(), buftype)(sc.ptr(), 1, 0, 1, scbuf, 1, Size(std::min(cn, scn), 1), 0);
1413     // unroll the scalar
1414     if( scn < cn )
1415     {
1416         CV_Assert( scn == 1 );
1417         size_t esz1 = CV_ELEM_SIZE1(buftype);
1418         for( size_t i = esz1; i < esz; i++ )
1419             scbuf[i] = scbuf[i - esz1];
1420     }
1421     for( size_t i = esz; i < blocksize*esz; i++ )
1422         scbuf[i] = scbuf[i - esz];
1423 }
1424 
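/* Usage sketch (illustrative, not part of the original source): in the "array op scalar"
   branches below, a cv::Scalar arrives as a 4x1 CV_64F Mat and is expanded into a block of
   the working type. For a CV_8UC3 working type and blocksize == 4, a call such as

       uchar scbuf[3 * 4];                                        // cn * blocksize elements
       convertAndUnrollScalar(Mat(Scalar(10, 20, 30)), CV_8UC3, scbuf, 4);

   leaves scbuf holding the pattern 10,20,30,10,20,30,... so it can be passed to a BinaryFunc
   in place of a real "src2" row. */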
1425 
1426 enum { OCL_OP_ADD=0, OCL_OP_SUB=1, OCL_OP_RSUB=2, OCL_OP_ABSDIFF=3, OCL_OP_MUL=4,
1427        OCL_OP_MUL_SCALE=5, OCL_OP_DIV_SCALE=6, OCL_OP_RECIP_SCALE=7, OCL_OP_ADDW=8,
1428        OCL_OP_AND=9, OCL_OP_OR=10, OCL_OP_XOR=11, OCL_OP_NOT=12, OCL_OP_MIN=13, OCL_OP_MAX=14,
1429        OCL_OP_RDIV_SCALE=15 };
1430 
1431 #ifdef HAVE_OPENCL
1432 
1433 static const char* oclop2str[] = { "OP_ADD", "OP_SUB", "OP_RSUB", "OP_ABSDIFF",
1434     "OP_MUL", "OP_MUL_SCALE", "OP_DIV_SCALE", "OP_RECIP_SCALE",
1435     "OP_ADDW", "OP_AND", "OP_OR", "OP_XOR", "OP_NOT", "OP_MIN", "OP_MAX", "OP_RDIV_SCALE", 0 };
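// note: the entries of oclop2str[] must stay in the same order as the OCL_OP_* enum above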
1436 
1437 static bool ocl_binary_op(InputArray _src1, InputArray _src2, OutputArray _dst,
1438                           InputArray _mask, bool bitwise, int oclop, bool haveScalar )
1439 {
1440     bool haveMask = !_mask.empty();
1441     int srctype = _src1.type();
1442     int srcdepth = CV_MAT_DEPTH(srctype);
1443     int cn = CV_MAT_CN(srctype);
1444 
1445     const ocl::Device d = ocl::Device::getDefault();
1446     bool doubleSupport = d.doubleFPConfig() > 0;
1447     if( oclop < 0 || ((haveMask || haveScalar) && cn > 4) ||
1448             (!doubleSupport && srcdepth == CV_64F && !bitwise))
1449         return false;
1450 
1451     char opts[1024];
1452     int kercn = haveMask || haveScalar ? cn : ocl::predictOptimalVectorWidth(_src1, _src2, _dst);
1453     int scalarcn = kercn == 3 ? 4 : kercn;
1454     int rowsPerWI = d.isIntel() ? 4 : 1;
1455 
1456     sprintf(opts, "-D %s%s -D %s -D dstT=%s%s -D dstT_C1=%s -D workST=%s -D cn=%d -D rowsPerWI=%d",
1457             haveMask ? "MASK_" : "", haveScalar ? "UNARY_OP" : "BINARY_OP", oclop2str[oclop],
1458             bitwise ? ocl::memopTypeToStr(CV_MAKETYPE(srcdepth, kercn)) :
1459                 ocl::typeToStr(CV_MAKETYPE(srcdepth, kercn)), doubleSupport ? " -D DOUBLE_SUPPORT" : "",
1460             bitwise ? ocl::memopTypeToStr(CV_MAKETYPE(srcdepth, 1)) :
1461                 ocl::typeToStr(CV_MAKETYPE(srcdepth, 1)),
1462             bitwise ? ocl::memopTypeToStr(CV_MAKETYPE(srcdepth, scalarcn)) :
1463                 ocl::typeToStr(CV_MAKETYPE(srcdepth, scalarcn)),
1464             kercn, rowsPerWI);
1465 
1466     ocl::Kernel k("KF", ocl::core::arithm_oclsrc, opts);
1467     if (k.empty())
1468         return false;
1469 
1470     UMat src1 = _src1.getUMat(), src2;
1471     UMat dst = _dst.getUMat(), mask = _mask.getUMat();
1472 
1473     ocl::KernelArg src1arg = ocl::KernelArg::ReadOnlyNoSize(src1, cn, kercn);
1474     ocl::KernelArg dstarg = haveMask ? ocl::KernelArg::ReadWrite(dst, cn, kercn) :
1475                                        ocl::KernelArg::WriteOnly(dst, cn, kercn);
1476     ocl::KernelArg maskarg = ocl::KernelArg::ReadOnlyNoSize(mask, 1);
1477 
1478     if( haveScalar )
1479     {
1480         size_t esz = CV_ELEM_SIZE1(srctype)*scalarcn;
1481         double buf[4] = {0,0,0,0};
1482 
1483         if( oclop != OCL_OP_NOT )
1484         {
1485             Mat src2sc = _src2.getMat();
1486             convertAndUnrollScalar(src2sc, srctype, (uchar*)buf, 1);
1487         }
1488 
1489         ocl::KernelArg scalararg = ocl::KernelArg(0, 0, 0, 0, buf, esz);
1490 
1491         if( !haveMask )
1492             k.args(src1arg, dstarg, scalararg);
1493         else
1494             k.args(src1arg, maskarg, dstarg, scalararg);
1495     }
1496     else
1497     {
1498         src2 = _src2.getUMat();
1499         ocl::KernelArg src2arg = ocl::KernelArg::ReadOnlyNoSize(src2, cn, kercn);
1500 
1501         if( !haveMask )
1502             k.args(src1arg, src2arg, dstarg);
1503         else
1504             k.args(src1arg, src2arg, maskarg, dstarg);
1505     }
1506 
1507     size_t globalsize[] = { src1.cols * cn / kercn, (src1.rows + rowsPerWI - 1) / rowsPerWI };
1508     return k.run(2, globalsize, 0, false);
1509 }
1510 
1511 #endif
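
/* Example (illustrative only) of the build options ocl_binary_op assembles for the "KF" kernel:
   a masked bitwise AND on CV_8UC1 data on an Intel device would produce roughly

       -D MASK_BINARY_OP -D OP_AND -D dstT=uchar -D dstT_C1=uchar -D workST=uchar -D cn=1 -D rowsPerWI=4

   (with " -D DOUBLE_SUPPORT" added when the device reports FP64 support). The same "KF" entry
   point in the arithm OpenCL program is specialized this way for every operation/type combination. */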
1512 
1513 static void binary_op( InputArray _src1, InputArray _src2, OutputArray _dst,
1514                        InputArray _mask, const BinaryFunc* tab,
1515                        bool bitwise, int oclop )
1516 {
1517     const _InputArray *psrc1 = &_src1, *psrc2 = &_src2;
1518     int kind1 = psrc1->kind(), kind2 = psrc2->kind();
1519     int type1 = psrc1->type(), depth1 = CV_MAT_DEPTH(type1), cn = CV_MAT_CN(type1);
1520     int type2 = psrc2->type(), depth2 = CV_MAT_DEPTH(type2), cn2 = CV_MAT_CN(type2);
1521     int dims1 = psrc1->dims(), dims2 = psrc2->dims();
1522     Size sz1 = dims1 <= 2 ? psrc1->size() : Size();
1523     Size sz2 = dims2 <= 2 ? psrc2->size() : Size();
1524 #ifdef HAVE_OPENCL
1525     bool use_opencl = (kind1 == _InputArray::UMAT || kind2 == _InputArray::UMAT) &&
1526             dims1 <= 2 && dims2 <= 2;
1527 #endif
1528     bool haveMask = !_mask.empty(), haveScalar = false;
1529     BinaryFunc func;
1530 
1531     if( dims1 <= 2 && dims2 <= 2 && kind1 == kind2 && sz1 == sz2 && type1 == type2 && !haveMask )
1532     {
1533         _dst.create(sz1, type1);
1534         CV_OCL_RUN(use_opencl,
1535                    ocl_binary_op(*psrc1, *psrc2, _dst, _mask, bitwise, oclop, false))
1536 
1537         if( bitwise )
1538         {
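            // bitwise ops are type-agnostic: use the single 8u function and treat each
            // element as CV_ELEM_SIZE(type1) raw bytes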
1539             func = *tab;
1540             cn = (int)CV_ELEM_SIZE(type1);
1541         }
1542         else
1543             func = tab[depth1];
1544 
1545         Mat src1 = psrc1->getMat(), src2 = psrc2->getMat(), dst = _dst.getMat();
1546         Size sz = getContinuousSize(src1, src2, dst);
1547         size_t len = sz.width*(size_t)cn;
1548         if( len == (size_t)(int)len )
1549         {
1550             sz.width = (int)len;
1551             func(src1.ptr(), src1.step, src2.ptr(), src2.step, dst.ptr(), dst.step, sz, 0);
1552             return;
1553         }
1554     }
1555 
1556     if( oclop == OCL_OP_NOT )
1557         haveScalar = true;
1558     else if( (kind1 == _InputArray::MATX) + (kind2 == _InputArray::MATX) == 1 ||
1559         !psrc1->sameSize(*psrc2) || type1 != type2 )
1560     {
1561         if( checkScalar(*psrc1, type2, kind1, kind2) )
1562         {
1563             // src1 is a scalar; swap it with src2
1564             swap(psrc1, psrc2);
1565             swap(type1, type2);
1566             swap(depth1, depth2);
1567             swap(cn, cn2);
1568             swap(sz1, sz2);
1569         }
1570         else if( !checkScalar(*psrc2, type1, kind2, kind1) )
1571             CV_Error( CV_StsUnmatchedSizes,
1572                       "The operation is neither 'array op array' (where arrays have the same size and type), "
1573                       "nor 'array op scalar', nor 'scalar op array'" );
1574         haveScalar = true;
1575     }
1576     else
1577     {
1578         CV_Assert( psrc1->sameSize(*psrc2) && type1 == type2 );
1579     }
1580 
1581     size_t esz = CV_ELEM_SIZE(type1);
1582     size_t blocksize0 = (BLOCK_SIZE + esz-1)/esz;
1583     BinaryFunc copymask = 0;
1584     bool reallocate = false;
1585 
1586     if( haveMask )
1587     {
1588         int mtype = _mask.type();
1589         CV_Assert( (mtype == CV_8U || mtype == CV_8S) && _mask.sameSize(*psrc1));
1590         copymask = getCopyMaskFunc(esz);
1591         reallocate = !_dst.sameSize(*psrc1) || _dst.type() != type1;
1592     }
1593 
1594     AutoBuffer<uchar> _buf;
1595     uchar *scbuf = 0, *maskbuf = 0;
1596 
1597     _dst.createSameSize(*psrc1, type1);
1598     // if this is a mask operation and dst has been reallocated,
1599     // we have to clear the destination first
1600     if( haveMask && reallocate )
1601         _dst.setTo(0.);
1602 
1603     CV_OCL_RUN(use_opencl,
1604                ocl_binary_op(*psrc1, *psrc2, _dst, _mask, bitwise, oclop, haveScalar))
1605 
1606 
1607     Mat src1 = psrc1->getMat(), src2 = psrc2->getMat();
1608     Mat dst = _dst.getMat(), mask = _mask.getMat();
1609 
1610     if( bitwise )
1611     {
1612         func = *tab;
1613         cn = (int)esz;
1614     }
1615     else
1616         func = tab[depth1];
1617 
1618     if( !haveScalar )
1619     {
1620         const Mat* arrays[] = { &src1, &src2, &dst, &mask, 0 };
1621         uchar* ptrs[4];
1622 
1623         NAryMatIterator it(arrays, ptrs);
1624         size_t total = it.size, blocksize = total;
1625 
1626         if( blocksize*cn > INT_MAX )
1627             blocksize = INT_MAX/cn;
1628 
1629         if( haveMask )
1630         {
1631             blocksize = std::min(blocksize, blocksize0);
1632             _buf.allocate(blocksize*esz);
1633             maskbuf = _buf;
1634         }
1635 
1636         for( size_t i = 0; i < it.nplanes; i++, ++it )
1637         {
1638             for( size_t j = 0; j < total; j += blocksize )
1639             {
1640                 int bsz = (int)MIN(total - j, blocksize);
1641 
1642                 func( ptrs[0], 0, ptrs[1], 0, haveMask ? maskbuf : ptrs[2], 0, Size(bsz*cn, 1), 0 );
1643                 if( haveMask )
1644                 {
1645                     copymask( maskbuf, 0, ptrs[3], 0, ptrs[2], 0, Size(bsz, 1), &esz );
1646                     ptrs[3] += bsz;
1647                 }
1648 
1649                 bsz *= (int)esz;
1650                 ptrs[0] += bsz; ptrs[1] += bsz; ptrs[2] += bsz;
1651             }
1652         }
1653     }
1654     else
1655     {
1656         const Mat* arrays[] = { &src1, &dst, &mask, 0 };
1657         uchar* ptrs[3];
1658 
1659         NAryMatIterator it(arrays, ptrs);
1660         size_t total = it.size, blocksize = std::min(total, blocksize0);
1661 
1662         _buf.allocate(blocksize*(haveMask ? 2 : 1)*esz + 32);
1663         scbuf = _buf;
1664         maskbuf = alignPtr(scbuf + blocksize*esz, 16);
1665 
1666         convertAndUnrollScalar( src2, src1.type(), scbuf, blocksize);
1667 
1668         for( size_t i = 0; i < it.nplanes; i++, ++it )
1669         {
1670             for( size_t j = 0; j < total; j += blocksize )
1671             {
1672                 int bsz = (int)MIN(total - j, blocksize);
1673 
1674                 func( ptrs[0], 0, scbuf, 0, haveMask ? maskbuf : ptrs[1], 0, Size(bsz*cn, 1), 0 );
1675                 if( haveMask )
1676                 {
1677                     copymask( maskbuf, 0, ptrs[2], 0, ptrs[1], 0, Size(bsz, 1), &esz );
1678                     ptrs[2] += bsz;
1679                 }
1680 
1681                 bsz *= (int)esz;
1682                 ptrs[0] += bsz; ptrs[1] += bsz;
1683             }
1684         }
1685     }
1686 }
1687 
1688 static BinaryFunc* getMaxTab()
1689 {
1690     static BinaryFunc maxTab[] =
1691     {
1692         (BinaryFunc)GET_OPTIMIZED(max8u), (BinaryFunc)GET_OPTIMIZED(max8s),
1693         (BinaryFunc)GET_OPTIMIZED(max16u), (BinaryFunc)GET_OPTIMIZED(max16s),
1694         (BinaryFunc)GET_OPTIMIZED(max32s),
1695         (BinaryFunc)GET_OPTIMIZED(max32f), (BinaryFunc)max64f,
1696         0
1697     };
1698 
1699     return maxTab;
1700 }
1701 
1702 static BinaryFunc* getMinTab()
1703 {
1704     static BinaryFunc minTab[] =
1705     {
1706         (BinaryFunc)GET_OPTIMIZED(min8u), (BinaryFunc)GET_OPTIMIZED(min8s),
1707         (BinaryFunc)GET_OPTIMIZED(min16u), (BinaryFunc)GET_OPTIMIZED(min16s),
1708         (BinaryFunc)GET_OPTIMIZED(min32s),
1709         (BinaryFunc)GET_OPTIMIZED(min32f), (BinaryFunc)min64f,
1710         0
1711     };
1712 
1713     return minTab;
1714 }
1715 
1716 }
1717 
1718 void cv::bitwise_and(InputArray a, InputArray b, OutputArray c, InputArray mask)
1719 {
1720     BinaryFunc f = (BinaryFunc)GET_OPTIMIZED(and8u);
1721     binary_op(a, b, c, mask, &f, true, OCL_OP_AND);
1722 }
1723 
1724 void cv::bitwise_or(InputArray a, InputArray b, OutputArray c, InputArray mask)
1725 {
1726     BinaryFunc f = (BinaryFunc)GET_OPTIMIZED(or8u);
1727     binary_op(a, b, c, mask, &f, true, OCL_OP_OR);
1728 }
1729 
1730 void cv::bitwise_xor(InputArray a, InputArray b, OutputArray c, InputArray mask)
1731 {
1732     BinaryFunc f = (BinaryFunc)GET_OPTIMIZED(xor8u);
1733     binary_op(a, b, c, mask, &f, true, OCL_OP_XOR);
1734 }
1735 
1736 void cv::bitwise_not(InputArray a, OutputArray c, InputArray mask)
1737 {
1738     BinaryFunc f = (BinaryFunc)GET_OPTIMIZED(not8u);
1739     binary_op(a, a, c, mask, &f, true, OCL_OP_NOT);
1740 }
1741 
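/* Usage sketch (illustrative): the four wrappers above all funnel into binary_op() with
   bitwise=true, so any depth works as long as both operands have the same type and size:

       Mat a(480, 640, CV_8UC3), b(480, 640, CV_8UC3), m(480, 640, CV_8UC1), dst;
       cv::bitwise_and(a, b, dst);    // dst = a & b, element-wise on raw bytes
       cv::bitwise_not(a, dst, m);    // dst = ~a only where m != 0
*/
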
1742 void cv::max( InputArray src1, InputArray src2, OutputArray dst )
1743 {
1744     binary_op(src1, src2, dst, noArray(), getMaxTab(), false, OCL_OP_MAX );
1745 }
1746 
1747 void cv::min( InputArray src1, InputArray src2, OutputArray dst )
1748 {
1749     binary_op(src1, src2, dst, noArray(), getMinTab(), false, OCL_OP_MIN );
1750 }
1751 
1752 void cv::max(const Mat& src1, const Mat& src2, Mat& dst)
1753 {
1754     OutputArray _dst(dst);
1755     binary_op(src1, src2, _dst, noArray(), getMaxTab(), false, OCL_OP_MAX );
1756 }
1757 
1758 void cv::min(const Mat& src1, const Mat& src2, Mat& dst)
1759 {
1760     OutputArray _dst(dst);
1761     binary_op(src1, src2, _dst, noArray(), getMinTab(), false, OCL_OP_MIN );
1762 }
1763 
1764 void cv::max(const UMat& src1, const UMat& src2, UMat& dst)
1765 {
1766     OutputArray _dst(dst);
1767     binary_op(src1, src2, _dst, noArray(), getMaxTab(), false, OCL_OP_MAX );
1768 }
1769 
1770 void cv::min(const UMat& src1, const UMat& src2, UMat& dst)
1771 {
1772     OutputArray _dst(dst);
1773     binary_op(src1, src2, _dst, noArray(), getMinTab(), false, OCL_OP_MIN );
1774 }
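
/* Illustrative note: cv::min/cv::max go through the same binary_op() machinery with the
   per-depth tables above, so the "array vs. scalar" form also works, e.g.

       cv::min(a, b, dst);        // element-wise minimum of two same-type arrays
       cv::max(img, 0.0, dst);    // clamp negative values (hypothetical variable names)
*/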
1775 
1776 
1777 /****************************************************************************************\
1778 *                                      add/subtract                                      *
1779 \****************************************************************************************/
1780 
1781 namespace cv
1782 {
1783 
1784 static int actualScalarDepth(const double* data, int len)
1785 {
1786     int i = 0, minval = INT_MAX, maxval = INT_MIN;
1787     for(; i < len; ++i)
1788     {
1789         int ival = cvRound(data[i]);
1790         if( ival != data[i] )
1791             break;
1792         minval = MIN(minval, ival);
1793         maxval = MAX(maxval, ival);
1794     }
1795     return i < len ? CV_64F :
1796         minval >= 0 && maxval <= (int)UCHAR_MAX ? CV_8U :
1797         minval >= (int)SCHAR_MIN && maxval <= (int)SCHAR_MAX ? CV_8S :
1798         minval >= 0 && maxval <= (int)USHRT_MAX ? CV_16U :
1799         minval >= (int)SHRT_MIN && maxval <= (int)SHRT_MAX ? CV_16S :
1800         CV_32S;
1801 }
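
/* Illustrative examples (not in the original source):
       actualScalarDepth(Scalar(1, 2, 3).val, 3)    -> CV_8U
       actualScalarDepth(Scalar(-1, 0, 0).val, 3)   -> CV_8S
       actualScalarDepth(Scalar(1000, 0, 0).val, 3) -> CV_16U
       actualScalarDepth(Scalar(0.5, 0, 0).val, 1)  -> CV_64F
   i.e. the narrowest integer depth that represents every component exactly, or CV_64F
   as soon as one component is fractional; this keeps 8-bit "array op scalar" paths in 8 bits. */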
1802 
1803 #ifdef HAVE_OPENCL
1804 
1805 static bool ocl_arithm_op(InputArray _src1, InputArray _src2, OutputArray _dst,
1806                           InputArray _mask, int wtype,
1807                           void* usrdata, int oclop,
1808                           bool haveScalar )
1809 {
1810     const ocl::Device d = ocl::Device::getDefault();
1811     bool doubleSupport = d.doubleFPConfig() > 0;
1812     int type1 = _src1.type(), depth1 = CV_MAT_DEPTH(type1), cn = CV_MAT_CN(type1);
1813     bool haveMask = !_mask.empty();
1814 
1815     if ( (haveMask || haveScalar) && cn > 4 )
1816         return false;
1817 
1818     int dtype = _dst.type(), ddepth = CV_MAT_DEPTH(dtype), wdepth = std::max(CV_32S, CV_MAT_DEPTH(wtype));
1819     if (!doubleSupport)
1820         wdepth = std::min(wdepth, CV_32F);
1821 
1822     wtype = CV_MAKETYPE(wdepth, cn);
1823     int type2 = haveScalar ? wtype : _src2.type(), depth2 = CV_MAT_DEPTH(type2);
1824     if (!doubleSupport && (depth2 == CV_64F || depth1 == CV_64F))
1825         return false;
1826 
1827     int kercn = haveMask || haveScalar ? cn : ocl::predictOptimalVectorWidth(_src1, _src2, _dst);
1828     int scalarcn = kercn == 3 ? 4 : kercn, rowsPerWI = d.isIntel() ? 4 : 1;
1829 
1830     char cvtstr[4][32], opts[1024];
1831     sprintf(opts, "-D %s%s -D %s -D srcT1=%s -D srcT1_C1=%s -D srcT2=%s -D srcT2_C1=%s "
1832             "-D dstT=%s -D dstT_C1=%s -D workT=%s -D workST=%s -D scaleT=%s -D wdepth=%d -D convertToWT1=%s "
1833             "-D convertToWT2=%s -D convertToDT=%s%s -D cn=%d -D rowsPerWI=%d -D convertFromU=%s",
1834             (haveMask ? "MASK_" : ""), (haveScalar ? "UNARY_OP" : "BINARY_OP"),
1835             oclop2str[oclop], ocl::typeToStr(CV_MAKETYPE(depth1, kercn)),
1836             ocl::typeToStr(depth1), ocl::typeToStr(CV_MAKETYPE(depth2, kercn)),
1837             ocl::typeToStr(depth2), ocl::typeToStr(CV_MAKETYPE(ddepth, kercn)),
1838             ocl::typeToStr(ddepth), ocl::typeToStr(CV_MAKETYPE(wdepth, kercn)),
1839             ocl::typeToStr(CV_MAKETYPE(wdepth, scalarcn)),
1840             ocl::typeToStr(wdepth), wdepth,
1841             ocl::convertTypeStr(depth1, wdepth, kercn, cvtstr[0]),
1842             ocl::convertTypeStr(depth2, wdepth, kercn, cvtstr[1]),
1843             ocl::convertTypeStr(wdepth, ddepth, kercn, cvtstr[2]),
1844             doubleSupport ? " -D DOUBLE_SUPPORT" : "", kercn, rowsPerWI,
1845             oclop == OCL_OP_ABSDIFF && wdepth == CV_32S && ddepth == wdepth ?
1846             ocl::convertTypeStr(CV_8U, ddepth, kercn, cvtstr[3]) : "noconvert");
1847 
1848     size_t usrdata_esz = CV_ELEM_SIZE(wdepth);
1849     const uchar* usrdata_p = (const uchar*)usrdata;
1850     const double* usrdata_d = (const double*)usrdata;
1851     float usrdata_f[3];
1852     int i, n = oclop == OCL_OP_MUL_SCALE || oclop == OCL_OP_DIV_SCALE ||
1853         oclop == OCL_OP_RDIV_SCALE || oclop == OCL_OP_RECIP_SCALE ? 1 : oclop == OCL_OP_ADDW ? 3 : 0;
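    // n == 1: the scale factor of multiply/divide; n == 3: the alpha/beta/gamma weights of addWeighted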
1854     if( n > 0 && wdepth == CV_32F )
1855     {
1856         for( i = 0; i < n; i++ )
1857             usrdata_f[i] = (float)usrdata_d[i];
1858         usrdata_p = (const uchar*)usrdata_f;
1859     }
1860 
1861     ocl::Kernel k("KF", ocl::core::arithm_oclsrc, opts);
1862     if (k.empty())
1863         return false;
1864 
1865     UMat src1 = _src1.getUMat(), src2;
1866     UMat dst = _dst.getUMat(), mask = _mask.getUMat();
1867 
1868     ocl::KernelArg src1arg = ocl::KernelArg::ReadOnlyNoSize(src1, cn, kercn);
1869     ocl::KernelArg dstarg = haveMask ? ocl::KernelArg::ReadWrite(dst, cn, kercn) :
1870                                        ocl::KernelArg::WriteOnly(dst, cn, kercn);
1871     ocl::KernelArg maskarg = ocl::KernelArg::ReadOnlyNoSize(mask, 1);
1872 
1873     if( haveScalar )
1874     {
1875         size_t esz = CV_ELEM_SIZE1(wtype)*scalarcn;
1876         double buf[4]={0,0,0,0};
1877         Mat src2sc = _src2.getMat();
1878 
1879         if( !src2sc.empty() )
1880             convertAndUnrollScalar(src2sc, wtype, (uchar*)buf, 1);
1881         ocl::KernelArg scalararg = ocl::KernelArg(0, 0, 0, 0, buf, esz);
1882 
1883         if( !haveMask )
1884         {
1885             if(n == 0)
1886                 k.args(src1arg, dstarg, scalararg);
1887             else if(n == 1)
1888                 k.args(src1arg, dstarg, scalararg,
1889                        ocl::KernelArg(0, 0, 0, 0, usrdata_p, usrdata_esz));
1890             else
1891                 CV_Error(Error::StsNotImplemented, "unsupported number of extra parameters");
1892         }
1893         else
1894             k.args(src1arg, maskarg, dstarg, scalararg);
1895     }
1896     else
1897     {
1898         src2 = _src2.getUMat();
1899         ocl::KernelArg src2arg = ocl::KernelArg::ReadOnlyNoSize(src2, cn, kercn);
1900 
1901         if( !haveMask )
1902         {
1903             if (n == 0)
1904                 k.args(src1arg, src2arg, dstarg);
1905             else if (n == 1)
1906                 k.args(src1arg, src2arg, dstarg,
1907                        ocl::KernelArg(0, 0, 0, 0, usrdata_p, usrdata_esz));
1908             else if (n == 3)
1909                 k.args(src1arg, src2arg, dstarg,
1910                        ocl::KernelArg(0, 0, 0, 0, usrdata_p, usrdata_esz),
1911                        ocl::KernelArg(0, 0, 0, 0, usrdata_p + usrdata_esz, usrdata_esz),
1912                        ocl::KernelArg(0, 0, 0, 0, usrdata_p + usrdata_esz*2, usrdata_esz));
1913             else
1914                 CV_Error(Error::StsNotImplemented, "unsupported number of extra parameters");
1915         }
1916         else
1917             k.args(src1arg, src2arg, maskarg, dstarg);
1918     }
1919 
1920     size_t globalsize[] = { src1.cols * cn / kercn, (src1.rows + rowsPerWI - 1) / rowsPerWI };
1921     return k.run(2, globalsize, NULL, false);
1922 }
1923 
1924 #endif
1925 
1926 static void arithm_op(InputArray _src1, InputArray _src2, OutputArray _dst,
1927                       InputArray _mask, int dtype, BinaryFunc* tab, bool muldiv=false,
1928                       void* usrdata=0, int oclop=-1 )
1929 {
1930     const _InputArray *psrc1 = &_src1, *psrc2 = &_src2;
1931     int kind1 = psrc1->kind(), kind2 = psrc2->kind();
1932     bool haveMask = !_mask.empty();
1933     bool reallocate = false;
1934     int type1 = psrc1->type(), depth1 = CV_MAT_DEPTH(type1), cn = CV_MAT_CN(type1);
1935     int type2 = psrc2->type(), depth2 = CV_MAT_DEPTH(type2), cn2 = CV_MAT_CN(type2);
1936     int wtype, dims1 = psrc1->dims(), dims2 = psrc2->dims();
1937     Size sz1 = dims1 <= 2 ? psrc1->size() : Size();
1938     Size sz2 = dims2 <= 2 ? psrc2->size() : Size();
1939 #ifdef HAVE_OPENCL
1940     bool use_opencl = OCL_PERFORMANCE_CHECK(_dst.isUMat()) && dims1 <= 2 && dims2 <= 2;
1941 #endif
1942     bool src1Scalar = checkScalar(*psrc1, type2, kind1, kind2);
1943     bool src2Scalar = checkScalar(*psrc2, type1, kind2, kind1);
1944 
1945     if( (kind1 == kind2 || cn == 1) && sz1 == sz2 && dims1 <= 2 && dims2 <= 2 && type1 == type2 &&
1946         !haveMask && ((!_dst.fixedType() && (dtype < 0 || CV_MAT_DEPTH(dtype) == depth1)) ||
1947                        (_dst.fixedType() && _dst.type() == type1)) &&
1948         ((src1Scalar && src2Scalar) || (!src1Scalar && !src2Scalar)) )
1949     {
1950         _dst.createSameSize(*psrc1, type1);
1951         CV_OCL_RUN(use_opencl,
1952             ocl_arithm_op(*psrc1, *psrc2, _dst, _mask,
1953                           (!usrdata ? type1 : std::max(depth1, CV_32F)),
1954                           usrdata, oclop, false))
1955 
1956         Mat src1 = psrc1->getMat(), src2 = psrc2->getMat(), dst = _dst.getMat();
1957         Size sz = getContinuousSize(src1, src2, dst, src1.channels());
1958         tab[depth1](src1.ptr(), src1.step, src2.ptr(), src2.step, dst.ptr(), dst.step, sz, usrdata);
1959         return;
1960     }
1961 
1962     bool haveScalar = false, swapped12 = false;
1963 
1964     if( dims1 != dims2 || sz1 != sz2 || cn != cn2 ||
1965         (kind1 == _InputArray::MATX && (sz1 == Size(1,4) || sz1 == Size(1,1))) ||
1966         (kind2 == _InputArray::MATX && (sz2 == Size(1,4) || sz2 == Size(1,1))) )
1967     {
1968         if( checkScalar(*psrc1, type2, kind1, kind2) )
1969         {
1970             // src1 is a scalar; swap it with src2
1971             swap(psrc1, psrc2);
1972             swap(sz1, sz2);
1973             swap(type1, type2);
1974             swap(depth1, depth2);
1975             swap(cn, cn2);
1976             swap(dims1, dims2);
1977             swapped12 = true;
1978             if( oclop == OCL_OP_SUB )
1979                 oclop = OCL_OP_RSUB;
1980             if ( oclop == OCL_OP_DIV_SCALE )
1981                 oclop = OCL_OP_RDIV_SCALE;
1982         }
1983         else if( !checkScalar(*psrc2, type1, kind2, kind1) )
1984             CV_Error( CV_StsUnmatchedSizes,
1985                      "The operation is neither 'array op array' "
1986                      "(where arrays have the same size and the same number of channels), "
1987                      "nor 'array op scalar', nor 'scalar op array'" );
1988         haveScalar = true;
1989         CV_Assert(type2 == CV_64F && (sz2.height == 1 || sz2.height == 4));
1990 
1991         if (!muldiv)
1992         {
1993             Mat sc = psrc2->getMat();
1994             depth2 = actualScalarDepth(sc.ptr<double>(), cn);
1995             if( depth2 == CV_64F && (depth1 < CV_32S || depth1 == CV_32F) )
1996                 depth2 = CV_32F;
1997         }
1998         else
1999             depth2 = CV_64F;
2000     }
2001 
2002     if( dtype < 0 )
2003     {
2004         if( _dst.fixedType() )
2005             dtype = _dst.type();
2006         else
2007         {
2008             if( !haveScalar && type1 != type2 )
2009                 CV_Error(CV_StsBadArg,
2010                      "When the input arrays in add/subtract/multiply/divide functions have different types, "
2011                      "the output array type must be explicitly specified");
2012             dtype = type1;
2013         }
2014     }
2015     dtype = CV_MAT_DEPTH(dtype);
2016 
2017     if( depth1 == depth2 && dtype == depth1 )
2018         wtype = dtype;
2019     else if( !muldiv )
2020     {
2021         wtype = depth1 <= CV_8S && depth2 <= CV_8S ? CV_16S :
2022                 depth1 <= CV_32S && depth2 <= CV_32S ? CV_32S : std::max(depth1, depth2);
2023         wtype = std::max(wtype, dtype);
2024 
2025         // when the result of the operation has to be stored in an integer type and
2026         // at least one of the inputs is an integer array, it is cheaper to do the work in an
2027         // integer working type than to promote everything to floating point and convert the result back.
2028         if( dtype < CV_32F && (depth1 < CV_32F || depth2 < CV_32F) )
2029             wtype = CV_32S;
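        // e.g. for cv::add with CV_8U and CV_32F inputs and a CV_16S destination, both inputs
        // are converted to the CV_32S working type and the 32-bit sum is saturated to CV_16S.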
2030     }
2031     else
2032     {
2033         wtype = std::max(depth1, std::max(depth2, CV_32F));
2034         wtype = std::max(wtype, dtype);
2035     }
2036 
2037     dtype = CV_MAKETYPE(dtype, cn);
2038     wtype = CV_MAKETYPE(wtype, cn);
2039 
2040     if( haveMask )
2041     {
2042         int mtype = _mask.type();
2043         CV_Assert( (mtype == CV_8UC1 || mtype == CV_8SC1) && _mask.sameSize(*psrc1) );
2044         reallocate = !_dst.sameSize(*psrc1) || _dst.type() != dtype;
2045     }
2046 
2047     _dst.createSameSize(*psrc1, dtype);
2048     if( reallocate )
2049         _dst.setTo(0.);
2050 
2051     CV_OCL_RUN(use_opencl,
2052                ocl_arithm_op(*psrc1, *psrc2, _dst, _mask, wtype,
2053                usrdata, oclop, haveScalar))
2054 
2055     BinaryFunc cvtsrc1 = type1 == wtype ? 0 : getConvertFunc(type1, wtype);
2056     BinaryFunc cvtsrc2 = type2 == type1 ? cvtsrc1 : type2 == wtype ? 0 : getConvertFunc(type2, wtype);
2057     BinaryFunc cvtdst = dtype == wtype ? 0 : getConvertFunc(wtype, dtype);
2058 
2059     size_t esz1 = CV_ELEM_SIZE(type1), esz2 = CV_ELEM_SIZE(type2);
2060     size_t dsz = CV_ELEM_SIZE(dtype), wsz = CV_ELEM_SIZE(wtype);
2061     size_t blocksize0 = (size_t)(BLOCK_SIZE + wsz-1)/wsz;
2062     BinaryFunc copymask = getCopyMaskFunc(dsz);
2063     Mat src1 = psrc1->getMat(), src2 = psrc2->getMat(), dst = _dst.getMat(), mask = _mask.getMat();
2064 
2065     AutoBuffer<uchar> _buf;
2066     uchar *buf, *maskbuf = 0, *buf1 = 0, *buf2 = 0, *wbuf = 0;
2067     size_t bufesz = (cvtsrc1 ? wsz : 0) +
2068                     (cvtsrc2 || haveScalar ? wsz : 0) +
2069                     (cvtdst ? wsz : 0) +
2070                     (haveMask ? dsz : 0);
2071     BinaryFunc func = tab[CV_MAT_DEPTH(wtype)];
2072 
2073     if( !haveScalar )
2074     {
2075         const Mat* arrays[] = { &src1, &src2, &dst, &mask, 0 };
2076         uchar* ptrs[4];
2077 
2078         NAryMatIterator it(arrays, ptrs);
2079         size_t total = it.size, blocksize = total;
2080 
2081         if( haveMask || cvtsrc1 || cvtsrc2 || cvtdst )
2082             blocksize = std::min(blocksize, blocksize0);
2083 
2084         _buf.allocate(bufesz*blocksize + 64);
2085         buf = _buf;
2086         if( cvtsrc1 )
2087             buf1 = buf, buf = alignPtr(buf + blocksize*wsz, 16);
2088         if( cvtsrc2 )
2089             buf2 = buf, buf = alignPtr(buf + blocksize*wsz, 16);
2090         wbuf = maskbuf = buf;
2091         if( cvtdst )
2092             buf = alignPtr(buf + blocksize*wsz, 16);
2093         if( haveMask )
2094             maskbuf = buf;
2095 
2096         for( size_t i = 0; i < it.nplanes; i++, ++it )
2097         {
2098             for( size_t j = 0; j < total; j += blocksize )
2099             {
2100                 int bsz = (int)MIN(total - j, blocksize);
2101                 Size bszn(bsz*cn, 1);
2102                 const uchar *sptr1 = ptrs[0], *sptr2 = ptrs[1];
2103                 uchar* dptr = ptrs[2];
2104                 if( cvtsrc1 )
2105                 {
2106                     cvtsrc1( sptr1, 1, 0, 1, buf1, 1, bszn, 0 );
2107                     sptr1 = buf1;
2108                 }
2109                 if( ptrs[0] == ptrs[1] )
2110                     sptr2 = sptr1;
2111                 else if( cvtsrc2 )
2112                 {
2113                     cvtsrc2( sptr2, 1, 0, 1, buf2, 1, bszn, 0 );
2114                     sptr2 = buf2;
2115                 }
2116 
2117                 if( !haveMask && !cvtdst )
2118                     func( sptr1, 1, sptr2, 1, dptr, 1, bszn, usrdata );
2119                 else
2120                 {
2121                     func( sptr1, 1, sptr2, 1, wbuf, 0, bszn, usrdata );
2122                     if( !haveMask )
2123                         cvtdst( wbuf, 1, 0, 1, dptr, 1, bszn, 0 );
2124                     else if( !cvtdst )
2125                     {
2126                         copymask( wbuf, 1, ptrs[3], 1, dptr, 1, Size(bsz, 1), &dsz );
2127                         ptrs[3] += bsz;
2128                     }
2129                     else
2130                     {
2131                         cvtdst( wbuf, 1, 0, 1, maskbuf, 1, bszn, 0 );
2132                         copymask( maskbuf, 1, ptrs[3], 1, dptr, 1, Size(bsz, 1), &dsz );
2133                         ptrs[3] += bsz;
2134                     }
2135                 }
2136                 ptrs[0] += bsz*esz1; ptrs[1] += bsz*esz2; ptrs[2] += bsz*dsz;
2137             }
2138         }
2139     }
2140     else
2141     {
2142         const Mat* arrays[] = { &src1, &dst, &mask, 0 };
2143         uchar* ptrs[3];
2144 
2145         NAryMatIterator it(arrays, ptrs);
2146         size_t total = it.size, blocksize = std::min(total, blocksize0);
2147 
2148         _buf.allocate(bufesz*blocksize + 64);
2149         buf = _buf;
2150         if( cvtsrc1 )
2151             buf1 = buf, buf = alignPtr(buf + blocksize*wsz, 16);
2152         buf2 = buf; buf = alignPtr(buf + blocksize*wsz, 16);
2153         wbuf = maskbuf = buf;
2154         if( cvtdst )
2155             buf = alignPtr(buf + blocksize*wsz, 16);
2156         if( haveMask )
2157             maskbuf = buf;
2158 
2159         convertAndUnrollScalar( src2, wtype, buf2, blocksize);
2160 
2161         for( size_t i = 0; i < it.nplanes; i++, ++it )
2162         {
2163             for( size_t j = 0; j < total; j += blocksize )
2164             {
2165                 int bsz = (int)MIN(total - j, blocksize);
2166                 Size bszn(bsz*cn, 1);
2167                 const uchar *sptr1 = ptrs[0];
2168                 const uchar* sptr2 = buf2;
2169                 uchar* dptr = ptrs[1];
2170 
2171                 if( cvtsrc1 )
2172                 {
2173                     cvtsrc1( sptr1, 1, 0, 1, buf1, 1, bszn, 0 );
2174                     sptr1 = buf1;
2175                 }
2176 
2177                 if( swapped12 )
2178                     std::swap(sptr1, sptr2);
2179 
2180                 if( !haveMask && !cvtdst )
2181                     func( sptr1, 1, sptr2, 1, dptr, 1, bszn, usrdata );
2182                 else
2183                 {
2184                     func( sptr1, 1, sptr2, 1, wbuf, 1, bszn, usrdata );
2185                     if( !haveMask )
2186                         cvtdst( wbuf, 1, 0, 1, dptr, 1, bszn, 0 );
2187                     else if( !cvtdst )
2188                     {
2189                         copymask( wbuf, 1, ptrs[2], 1, dptr, 1, Size(bsz, 1), &dsz );
2190                         ptrs[2] += bsz;
2191                     }
2192                     else
2193                     {
2194                         cvtdst( wbuf, 1, 0, 1, maskbuf, 1, bszn, 0 );
2195                         copymask( maskbuf, 1, ptrs[2], 1, dptr, 1, Size(bsz, 1), &dsz );
2196                         ptrs[2] += bsz;
2197                     }
2198                 }
2199                 ptrs[0] += bsz*esz1; ptrs[1] += bsz*dsz;
2200             }
2201         }
2202     }
2203 }
2204 
2205 static BinaryFunc* getAddTab()
2206 {
2207     static BinaryFunc addTab[] =
2208     {
2209         (BinaryFunc)GET_OPTIMIZED(add8u), (BinaryFunc)GET_OPTIMIZED(add8s),
2210         (BinaryFunc)GET_OPTIMIZED(add16u), (BinaryFunc)GET_OPTIMIZED(add16s),
2211         (BinaryFunc)GET_OPTIMIZED(add32s),
2212         (BinaryFunc)GET_OPTIMIZED(add32f), (BinaryFunc)add64f,
2213         0
2214     };
2215 
2216     return addTab;
2217 }
2218 
2219 static BinaryFunc* getSubTab()
2220 {
2221     static BinaryFunc subTab[] =
2222     {
2223         (BinaryFunc)GET_OPTIMIZED(sub8u), (BinaryFunc)GET_OPTIMIZED(sub8s),
2224         (BinaryFunc)GET_OPTIMIZED(sub16u), (BinaryFunc)GET_OPTIMIZED(sub16s),
2225         (BinaryFunc)GET_OPTIMIZED(sub32s),
2226         (BinaryFunc)GET_OPTIMIZED(sub32f), (BinaryFunc)sub64f,
2227         0
2228     };
2229 
2230     return subTab;
2231 }
2232 
2233 static BinaryFunc* getAbsDiffTab()
2234 {
2235     static BinaryFunc absDiffTab[] =
2236     {
2237         (BinaryFunc)GET_OPTIMIZED(absdiff8u), (BinaryFunc)GET_OPTIMIZED(absdiff8s),
2238         (BinaryFunc)GET_OPTIMIZED(absdiff16u), (BinaryFunc)GET_OPTIMIZED(absdiff16s),
2239         (BinaryFunc)GET_OPTIMIZED(absdiff32s),
2240         (BinaryFunc)GET_OPTIMIZED(absdiff32f), (BinaryFunc)absdiff64f,
2241         0
2242     };
2243 
2244     return absDiffTab;
2245 }
2246 
2247 }
2248 
2249 void cv::add( InputArray src1, InputArray src2, OutputArray dst,
2250           InputArray mask, int dtype )
2251 {
2252     arithm_op(src1, src2, dst, mask, dtype, getAddTab(), false, 0, OCL_OP_ADD );
2253 }
2254 
2255 void cv::subtract( InputArray _src1, InputArray _src2, OutputArray _dst,
2256                InputArray mask, int dtype )
2257 {
2258 #ifdef HAVE_TEGRA_OPTIMIZATION
2259     if (tegra::useTegra())
2260     {
2261         int kind1 = _src1.kind(), kind2 = _src2.kind();
2262         Mat src1 = _src1.getMat(), src2 = _src2.getMat();
2263         bool src1Scalar = checkScalar(src1, _src2.type(), kind1, kind2);
2264         bool src2Scalar = checkScalar(src2, _src1.type(), kind2, kind1);
2265 
2266         if (!src1Scalar && !src2Scalar &&
2267             src1.depth() == CV_8U && src2.type() == src1.type() &&
2268             src1.dims == 2 && src2.size() == src1.size() &&
2269             mask.empty())
2270         {
2271             if (dtype < 0)
2272             {
2273                 if (_dst.fixedType())
2274                 {
2275                     dtype = _dst.depth();
2276                 }
2277                 else
2278                 {
2279                     dtype = src1.depth();
2280                 }
2281             }
2282 
2283             dtype = CV_MAT_DEPTH(dtype);
2284 
2285             if (!_dst.fixedType() || dtype == _dst.depth())
2286             {
2287                 _dst.create(src1.size(), CV_MAKE_TYPE(dtype, src1.channels()));
2288 
2289                 if (dtype == CV_16S)
2290                 {
2291                     Mat dst = _dst.getMat();
2292                     if(tegra::subtract_8u8u16s(src1, src2, dst))
2293                         return;
2294                 }
2295                 else if (dtype == CV_32F)
2296                 {
2297                     Mat dst = _dst.getMat();
2298                     if(tegra::subtract_8u8u32f(src1, src2, dst))
2299                         return;
2300                 }
2301                 else if (dtype == CV_8S)
2302                 {
2303                     Mat dst = _dst.getMat();
2304                     if(tegra::subtract_8u8u8s(src1, src2, dst))
2305                         return;
2306                 }
2307             }
2308         }
2309     }
2310 #endif
2311     arithm_op(_src1, _src2, _dst, mask, dtype, getSubTab(), false, 0, OCL_OP_SUB );
2312 }
2313 
2314 void cv::absdiff( InputArray src1, InputArray src2, OutputArray dst )
2315 {
2316     arithm_op(src1, src2, dst, noArray(), -1, getAbsDiffTab(), false, 0, OCL_OP_ABSDIFF);
2317 }
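
/* Usage sketch (illustrative): the three public wrappers above share arithm_op(), which
   handles mixed types, scalars, masks and saturation. Assuming 8-bit inputs:

       Mat a(4, 4, CV_8UC1, Scalar(200)), b(4, 4, CV_8UC1, Scalar(100)), dst;
       cv::add(a, b, dst);                          // dst is CV_8U, 300 saturates to 255
       cv::add(a, b, dst, cv::noArray(), CV_16U);   // widen the result instead of saturating
       cv::subtract(b, a, dst);                     // 100 - 200 saturates to 0 in CV_8U
       cv::absdiff(a, b, dst);                      // |a - b| = 100
*/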
2318 
2319 /****************************************************************************************\
2320 *                                    multiply/divide                                     *
2321 \****************************************************************************************/
2322 
2323 namespace cv
2324 {
2325 
2326 template <typename T, typename WT>
2327 struct Mul_SIMD
2328 {
2329     int operator() (const T *, const T *, T *, int, WT) const
2330     {
2331         return 0;
2332     }
2333 };
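
/* Note (illustrative): the specializations below return the number of elements they handled
   with vector code; the caller is expected to finish the row with a scalar loop, e.g.

       Mul_SIMD<uchar, float> vop;
       int x = vop(src1, src2, dst, width, scale);              // vectorized part (may be 0)
       for( ; x < width; x++ )                                  // scalar tail
           dst[x] = saturate_cast<uchar>(scale * src1[x] * src2[x]);

   The generic template above deliberately returns 0 so unsupported type pairs fall back
   to the plain scalar loop. */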
2334 
2335 #if CV_NEON
2336 
2337 template <>
2338 struct Mul_SIMD<uchar, float>
2339 {
2340     int operator() (const uchar * src1, const uchar * src2, uchar * dst, int width, float scale) const
2341     {
2342         int x = 0;
2343 
2344         if( scale == 1.0f )
2345             for ( ; x <= width - 8; x += 8)
2346             {
2347                 uint16x8_t v_src1 = vmovl_u8(vld1_u8(src1 + x));
2348                 uint16x8_t v_src2 = vmovl_u8(vld1_u8(src2 + x));
2349 
2350                 float32x4_t v_dst1 = vmulq_f32(vcvtq_f32_u32(vmovl_u16(vget_low_u16(v_src1))),
2351                                                vcvtq_f32_u32(vmovl_u16(vget_low_u16(v_src2))));
2352                 float32x4_t v_dst2 = vmulq_f32(vcvtq_f32_u32(vmovl_u16(vget_high_u16(v_src1))),
2353                                                vcvtq_f32_u32(vmovl_u16(vget_high_u16(v_src2))));
2354 
2355                 uint16x8_t v_dst = vcombine_u16(vqmovn_u32(cv_vrndq_u32_f32(v_dst1)),
2356                                                 vqmovn_u32(cv_vrndq_u32_f32(v_dst2)));
2357                 vst1_u8(dst + x, vqmovn_u16(v_dst));
2358             }
2359         else
2360         {
2361             float32x4_t v_scale = vdupq_n_f32(scale);
2362             for ( ; x <= width - 8; x += 8)
2363             {
2364                 uint16x8_t v_src1 = vmovl_u8(vld1_u8(src1 + x));
2365                 uint16x8_t v_src2 = vmovl_u8(vld1_u8(src2 + x));
2366 
2367                 float32x4_t v_dst1 = vmulq_f32(vcvtq_f32_u32(vmovl_u16(vget_low_u16(v_src1))),
2368                                                vcvtq_f32_u32(vmovl_u16(vget_low_u16(v_src2))));
2369                 v_dst1 = vmulq_f32(v_dst1, v_scale);
2370                 float32x4_t v_dst2 = vmulq_f32(vcvtq_f32_u32(vmovl_u16(vget_high_u16(v_src1))),
2371                                                vcvtq_f32_u32(vmovl_u16(vget_high_u16(v_src2))));
2372                 v_dst2 = vmulq_f32(v_dst2, v_scale);
2373 
2374                 uint16x8_t v_dst = vcombine_u16(vqmovn_u32(cv_vrndq_u32_f32(v_dst1)),
2375                                                 vqmovn_u32(cv_vrndq_u32_f32(v_dst2)));
2376                 vst1_u8(dst + x, vqmovn_u16(v_dst));
2377             }
2378         }
2379 
2380         return x;
2381     }
2382 };
2383 
2384 template <>
2385 struct Mul_SIMD<schar, float>
2386 {
2387     int operator() (const schar * src1, const schar * src2, schar * dst, int width, float scale) const
2388     {
2389         int x = 0;
2390 
2391         if( scale == 1.0f )
2392             for ( ; x <= width - 8; x += 8)
2393             {
2394                 int16x8_t v_src1 = vmovl_s8(vld1_s8(src1 + x));
2395                 int16x8_t v_src2 = vmovl_s8(vld1_s8(src2 + x));
2396 
2397                 float32x4_t v_dst1 = vmulq_f32(vcvtq_f32_s32(vmovl_s16(vget_low_s16(v_src1))),
2398                                                vcvtq_f32_s32(vmovl_s16(vget_low_s16(v_src2))));
2399                 float32x4_t v_dst2 = vmulq_f32(vcvtq_f32_s32(vmovl_s16(vget_high_s16(v_src1))),
2400                                                vcvtq_f32_s32(vmovl_s16(vget_high_s16(v_src2))));
2401 
2402                 int16x8_t v_dst = vcombine_s16(vqmovn_s32(cv_vrndq_s32_f32(v_dst1)),
2403                                                vqmovn_s32(cv_vrndq_s32_f32(v_dst2)));
2404                 vst1_s8(dst + x, vqmovn_s16(v_dst));
2405             }
2406         else
2407         {
2408             float32x4_t v_scale = vdupq_n_f32(scale);
2409             for ( ; x <= width - 8; x += 8)
2410             {
2411                 int16x8_t v_src1 = vmovl_s8(vld1_s8(src1 + x));
2412                 int16x8_t v_src2 = vmovl_s8(vld1_s8(src2 + x));
2413 
2414                 float32x4_t v_dst1 = vmulq_f32(vcvtq_f32_s32(vmovl_s16(vget_low_s16(v_src1))),
2415                                                vcvtq_f32_s32(vmovl_s16(vget_low_s16(v_src2))));
2416                 v_dst1 = vmulq_f32(v_dst1, v_scale);
2417                 float32x4_t v_dst2 = vmulq_f32(vcvtq_f32_s32(vmovl_s16(vget_high_s16(v_src1))),
2418                                                vcvtq_f32_s32(vmovl_s16(vget_high_s16(v_src2))));
2419                 v_dst2 = vmulq_f32(v_dst2, v_scale);
2420 
2421                 int16x8_t v_dst = vcombine_s16(vqmovn_s32(cv_vrndq_s32_f32(v_dst1)),
2422                                                vqmovn_s32(cv_vrndq_s32_f32(v_dst2)));
2423                 vst1_s8(dst + x, vqmovn_s16(v_dst));
2424             }
2425         }
2426 
2427         return x;
2428     }
2429 };
2430 
2431 template <>
2432 struct Mul_SIMD<ushort, float>
2433 {
2434     int operator() (const ushort * src1, const ushort * src2, ushort * dst, int width, float scale) const
2435     {
2436         int x = 0;
2437 
2438         if( scale == 1.0f )
2439             for ( ; x <= width - 8; x += 8)
2440             {
2441                 uint16x8_t v_src1 = vld1q_u16(src1 + x), v_src2 = vld1q_u16(src2 + x);
2442 
2443                 float32x4_t v_dst1 = vmulq_f32(vcvtq_f32_u32(vmovl_u16(vget_low_u16(v_src1))),
2444                                                vcvtq_f32_u32(vmovl_u16(vget_low_u16(v_src2))));
2445                 float32x4_t v_dst2 = vmulq_f32(vcvtq_f32_u32(vmovl_u16(vget_high_u16(v_src1))),
2446                                                vcvtq_f32_u32(vmovl_u16(vget_high_u16(v_src2))));
2447 
2448                 uint16x8_t v_dst = vcombine_u16(vqmovn_u32(cv_vrndq_u32_f32(v_dst1)),
2449                                                 vqmovn_u32(cv_vrndq_u32_f32(v_dst2)));
2450                 vst1q_u16(dst + x, v_dst);
2451             }
2452         else
2453         {
2454             float32x4_t v_scale = vdupq_n_f32(scale);
2455             for ( ; x <= width - 8; x += 8)
2456             {
2457                 uint16x8_t v_src1 = vld1q_u16(src1 + x), v_src2 = vld1q_u16(src2 + x);
2458 
2459                 float32x4_t v_dst1 = vmulq_f32(vcvtq_f32_u32(vmovl_u16(vget_low_u16(v_src1))),
2460                                                vcvtq_f32_u32(vmovl_u16(vget_low_u16(v_src2))));
2461                 v_dst1 = vmulq_f32(v_dst1, v_scale);
2462                 float32x4_t v_dst2 = vmulq_f32(vcvtq_f32_u32(vmovl_u16(vget_high_u16(v_src1))),
2463                                                vcvtq_f32_u32(vmovl_u16(vget_high_u16(v_src2))));
2464                 v_dst2 = vmulq_f32(v_dst2, v_scale);
2465 
2466                 uint16x8_t v_dst = vcombine_u16(vqmovn_u32(cv_vrndq_u32_f32(v_dst1)),
2467                                                 vqmovn_u32(cv_vrndq_u32_f32(v_dst2)));
2468                 vst1q_u16(dst + x, v_dst);
2469             }
2470         }
2471 
2472         return x;
2473     }
2474 };
2475 
2476 template <>
2477 struct Mul_SIMD<short, float>
2478 {
2479     int operator() (const short * src1, const short * src2, short * dst, int width, float scale) const
2480     {
2481         int x = 0;
2482 
2483         if( scale == 1.0f )
2484             for ( ; x <= width - 8; x += 8)
2485             {
2486                 int16x8_t v_src1 = vld1q_s16(src1 + x), v_src2 = vld1q_s16(src2 + x);
2487 
2488                 float32x4_t v_dst1 = vmulq_f32(vcvtq_f32_s32(vmovl_s16(vget_low_s16(v_src1))),
2489                                                vcvtq_f32_s32(vmovl_s16(vget_low_s16(v_src2))));
2490                 float32x4_t v_dst2 = vmulq_f32(vcvtq_f32_s32(vmovl_s16(vget_high_s16(v_src1))),
2491                                                vcvtq_f32_s32(vmovl_s16(vget_high_s16(v_src2))));
2492 
2493                 int16x8_t v_dst = vcombine_s16(vqmovn_s32(cv_vrndq_s32_f32(v_dst1)),
2494                                                vqmovn_s32(cv_vrndq_s32_f32(v_dst2)));
2495                 vst1q_s16(dst + x, v_dst);
2496             }
2497         else
2498         {
2499             float32x4_t v_scale = vdupq_n_f32(scale);
2500             for ( ; x <= width - 8; x += 8)
2501             {
2502                 int16x8_t v_src1 = vld1q_s16(src1 + x), v_src2 = vld1q_s16(src2 + x);
2503 
2504                 float32x4_t v_dst1 = vmulq_f32(vcvtq_f32_s32(vmovl_s16(vget_low_s16(v_src1))),
2505                                                vcvtq_f32_s32(vmovl_s16(vget_low_s16(v_src2))));
2506                 v_dst1 = vmulq_f32(v_dst1, v_scale);
2507                 float32x4_t v_dst2 = vmulq_f32(vcvtq_f32_s32(vmovl_s16(vget_high_s16(v_src1))),
2508                                                vcvtq_f32_s32(vmovl_s16(vget_high_s16(v_src2))));
2509                 v_dst2 = vmulq_f32(v_dst2, v_scale);
2510 
2511                 int16x8_t v_dst = vcombine_s16(vqmovn_s32(cv_vrndq_s32_f32(v_dst1)),
2512                                                vqmovn_s32(cv_vrndq_s32_f32(v_dst2)));
2513                 vst1q_s16(dst + x, v_dst);
2514             }
2515         }
2516 
2517         return x;
2518     }
2519 };
2520 
2521 template <>
2522 struct Mul_SIMD<float, float>
2523 {
2524     int operator() (const float * src1, const float * src2, float * dst, int width, float scale) const
2525     {
2526         int x = 0;
2527 
2528         if( scale == 1.0f )
2529             for ( ; x <= width - 8; x += 8)
2530             {
2531                 float32x4_t v_dst1 = vmulq_f32(vld1q_f32(src1 + x), vld1q_f32(src2 + x));
2532                 float32x4_t v_dst2 = vmulq_f32(vld1q_f32(src1 + x + 4), vld1q_f32(src2 + x + 4));
2533                 vst1q_f32(dst + x, v_dst1);
2534                 vst1q_f32(dst + x + 4, v_dst2);
2535             }
2536         else
2537         {
2538             float32x4_t v_scale = vdupq_n_f32(scale);
2539             for ( ; x <= width - 8; x += 8)
2540             {
2541                 float32x4_t v_dst1 = vmulq_f32(vld1q_f32(src1 + x), vld1q_f32(src2 + x));
2542                 v_dst1 = vmulq_f32(v_dst1, v_scale);
2543 
2544                 float32x4_t v_dst2 = vmulq_f32(vld1q_f32(src1 + x + 4), vld1q_f32(src2 + x + 4));
2545                 v_dst2 = vmulq_f32(v_dst2, v_scale);
2546 
2547                 vst1q_f32(dst + x, v_dst1);
2548                 vst1q_f32(dst + x + 4, v_dst2);
2549             }
2550         }
2551 
2552         return x;
2553     }
2554 };
2555 
2556 #elif CV_SSE2
2557 
2558 #if CV_SSE4_1
2559 
2560 template <>
2561 struct Mul_SIMD<ushort, float>
2562 {
2563     Mul_SIMD()
2564     {
2565         haveSSE = checkHardwareSupport(CV_CPU_SSE4_1);
2566     }
2567 
2568     int operator() (const ushort * src1, const ushort * src2, ushort * dst, int width, float scale) const
2569     {
2570         int x = 0;
2571 
2572         if (!haveSSE)
2573             return x;
2574 
2575         __m128i v_zero = _mm_setzero_si128();
2576 
2577         if( scale != 1.0f )
2578         {
2579             __m128 v_scale = _mm_set1_ps(scale);
2580             for ( ; x <= width - 8; x += 8)
2581             {
2582                 __m128i v_src1 = _mm_loadu_si128((__m128i const *)(src1 + x));
2583                 __m128i v_src2 = _mm_loadu_si128((__m128i const *)(src2 + x));
2584 
2585                 __m128 v_dst1 = _mm_mul_ps(_mm_cvtepi32_ps(_mm_unpacklo_epi16(v_src1, v_zero)),
2586                                            _mm_cvtepi32_ps(_mm_unpacklo_epi16(v_src2, v_zero)));
2587                 v_dst1 = _mm_mul_ps(v_dst1, v_scale);
2588 
2589                 __m128 v_dst2 = _mm_mul_ps(_mm_cvtepi32_ps(_mm_unpackhi_epi16(v_src1, v_zero)),
2590                                            _mm_cvtepi32_ps(_mm_unpackhi_epi16(v_src2, v_zero)));
2591                 v_dst2 = _mm_mul_ps(v_dst2, v_scale);
2592 
2593                 __m128i v_dsti = _mm_packus_epi32(_mm_cvtps_epi32(v_dst1), _mm_cvtps_epi32(v_dst2));
2594                 _mm_storeu_si128((__m128i *)(dst + x), v_dsti);
2595             }
2596         }
2597 
2598         return x;
2599     }
2600 
2601     bool haveSSE;
2602 };
2603 
2604 #endif
2605 
2606 template <>
2607 struct Mul_SIMD<schar, float>
2608 {
2609     Mul_SIMD()
2610     {
2611         haveSSE = checkHardwareSupport(CV_CPU_SSE2);
2612     }
2613 
2614     int operator() (const schar * src1, const schar * src2, schar * dst, int width, float scale) const
2615     {
2616         int x = 0;
2617 
2618         if (!haveSSE)
2619             return x;
2620 
2621         __m128i v_zero = _mm_setzero_si128();
2622 
2623         if( scale == 1.0f )
2624             for ( ; x <= width - 8; x += 8)
2625             {
2626                 __m128i v_src1 = _mm_loadl_epi64((__m128i const *)(src1 + x));
2627                 __m128i v_src2 = _mm_loadl_epi64((__m128i const *)(src2 + x));
2628 
2629                 v_src1 = _mm_srai_epi16(_mm_unpacklo_epi8(v_zero, v_src1), 8);
2630                 v_src2 = _mm_srai_epi16(_mm_unpacklo_epi8(v_zero, v_src2), 8);
2631 
2632                 __m128 v_dst1 = _mm_mul_ps(_mm_cvtepi32_ps(_mm_srai_epi32(_mm_unpacklo_epi16(v_zero, v_src1), 16)),
2633                                            _mm_cvtepi32_ps(_mm_srai_epi32(_mm_unpacklo_epi16(v_zero, v_src2), 16)));
2634 
2635                 __m128 v_dst2 = _mm_mul_ps(_mm_cvtepi32_ps(_mm_srai_epi32(_mm_unpackhi_epi16(v_zero, v_src1), 16)),
2636                                            _mm_cvtepi32_ps(_mm_srai_epi32(_mm_unpackhi_epi16(v_zero, v_src2), 16)));
2637 
2638                 __m128i v_dsti = _mm_packs_epi32(_mm_cvtps_epi32(v_dst1), _mm_cvtps_epi32(v_dst2));
2639                 _mm_storel_epi64((__m128i *)(dst + x), _mm_packs_epi16(v_dsti, v_zero));
2640             }
2641         else
2642         {
2643             __m128 v_scale = _mm_set1_ps(scale);
2644             for ( ; x <= width - 8; x += 8)
2645             {
2646                 __m128i v_src1 = _mm_loadl_epi64((__m128i const *)(src1 + x));
2647                 __m128i v_src2 = _mm_loadl_epi64((__m128i const *)(src2 + x));
2648 
2649                 v_src1 = _mm_srai_epi16(_mm_unpacklo_epi8(v_zero, v_src1), 8);
2650                 v_src2 = _mm_srai_epi16(_mm_unpacklo_epi8(v_zero, v_src2), 8);
2651 
2652                 __m128 v_dst1 = _mm_mul_ps(_mm_cvtepi32_ps(_mm_srai_epi32(_mm_unpacklo_epi16(v_zero, v_src1), 16)),
2653                                            _mm_cvtepi32_ps(_mm_srai_epi32(_mm_unpacklo_epi16(v_zero, v_src2), 16)));
2654                 v_dst1 = _mm_mul_ps(v_dst1, v_scale);
2655 
2656                 __m128 v_dst2 = _mm_mul_ps(_mm_cvtepi32_ps(_mm_srai_epi32(_mm_unpackhi_epi16(v_zero, v_src1), 16)),
2657                                            _mm_cvtepi32_ps(_mm_srai_epi32(_mm_unpackhi_epi16(v_zero, v_src2), 16)));
2658                 v_dst2 = _mm_mul_ps(v_dst2, v_scale);
2659 
2660                 __m128i v_dsti = _mm_packs_epi32(_mm_cvtps_epi32(v_dst1), _mm_cvtps_epi32(v_dst2));
2661                 _mm_storel_epi64((__m128i *)(dst + x), _mm_packs_epi16(v_dsti, v_zero));
2662             }
2663         }
2664 
2665         return x;
2666     }
2667 
2668     bool haveSSE;
2669 };
2670 
2671 template <>
2672 struct Mul_SIMD<short, float>
2673 {
2674     Mul_SIMD()
2675     {
2676         haveSSE = checkHardwareSupport(CV_CPU_SSE2);
2677     }
2678 
2679     int operator() (const short * src1, const short * src2, short * dst, int width, float scale) const
2680     {
2681         int x = 0;
2682 
2683         if (!haveSSE)
2684             return x;
2685 
2686         __m128i v_zero = _mm_setzero_si128();
2687 
2688         if( scale != 1.0f )
2689         {
2690             __m128 v_scale = _mm_set1_ps(scale);
2691             for ( ; x <= width - 8; x += 8)
2692             {
2693                 __m128i v_src1 = _mm_loadu_si128((__m128i const *)(src1 + x));
2694                 __m128i v_src2 = _mm_loadu_si128((__m128i const *)(src2 + x));
2695 
2696                 __m128 v_dst1 = _mm_mul_ps(_mm_cvtepi32_ps(_mm_srai_epi32(_mm_unpacklo_epi16(v_zero, v_src1), 16)),
2697                                            _mm_cvtepi32_ps(_mm_srai_epi32(_mm_unpacklo_epi16(v_zero, v_src2), 16)));
2698                 v_dst1 = _mm_mul_ps(v_dst1, v_scale);
2699 
2700                 __m128 v_dst2 = _mm_mul_ps(_mm_cvtepi32_ps(_mm_srai_epi32(_mm_unpackhi_epi16(v_zero, v_src1), 16)),
2701                                            _mm_cvtepi32_ps(_mm_srai_epi32(_mm_unpackhi_epi16(v_zero, v_src2), 16)));
2702                 v_dst2 = _mm_mul_ps(v_dst2, v_scale);
2703 
2704                 __m128i v_dsti = _mm_packs_epi32(_mm_cvtps_epi32(v_dst1), _mm_cvtps_epi32(v_dst2));
2705                 _mm_storeu_si128((__m128i *)(dst + x), v_dsti);
2706             }
2707         }
2708 
2709         return x;
2710     }
2711 
2712     bool haveSSE;
2713 };
2714 
2715 #endif
2716 
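// mul_: generic scaled element-wise multiplication. The Mul_SIMD<T, WT> functor
// (specialized above for NEON/SSE) processes a vectorized prefix of each row and
// returns how many elements it handled; the unrolled and scalar loops below finish
// the remainder, saturating the (optionally scaled) product into the destination type.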
2717 template<typename T, typename WT> static void
2718 mul_( const T* src1, size_t step1, const T* src2, size_t step2,
2719       T* dst, size_t step, Size size, WT scale )
2720 {
2721     step1 /= sizeof(src1[0]);
2722     step2 /= sizeof(src2[0]);
2723     step /= sizeof(dst[0]);
2724 
2725     Mul_SIMD<T, WT> vop;
2726 
2727     if( scale == (WT)1. )
2728     {
2729         for( ; size.height--; src1 += step1, src2 += step2, dst += step )
2730         {
2731             int i = vop(src1, src2, dst, size.width, scale);
2732             #if CV_ENABLE_UNROLLED
2733             for(; i <= size.width - 4; i += 4 )
2734             {
2735                 T t0;
2736                 T t1;
2737                 t0 = saturate_cast<T>(src1[i  ] * src2[i  ]);
2738                 t1 = saturate_cast<T>(src1[i+1] * src2[i+1]);
2739                 dst[i  ] = t0;
2740                 dst[i+1] = t1;
2741 
2742                 t0 = saturate_cast<T>(src1[i+2] * src2[i+2]);
2743                 t1 = saturate_cast<T>(src1[i+3] * src2[i+3]);
2744                 dst[i+2] = t0;
2745                 dst[i+3] = t1;
2746             }
2747             #endif
2748             for( ; i < size.width; i++ )
2749                 dst[i] = saturate_cast<T>(src1[i] * src2[i]);
2750         }
2751     }
2752     else
2753     {
2754         for( ; size.height--; src1 += step1, src2 += step2, dst += step )
2755         {
2756             int i = vop(src1, src2, dst, size.width, scale);
2757             #if CV_ENABLE_UNROLLED
2758             for(; i <= size.width - 4; i += 4 )
2759             {
2760                 T t0 = saturate_cast<T>(scale*(WT)src1[i]*src2[i]);
2761                 T t1 = saturate_cast<T>(scale*(WT)src1[i+1]*src2[i+1]);
2762                 dst[i] = t0; dst[i+1] = t1;
2763 
2764                 t0 = saturate_cast<T>(scale*(WT)src1[i+2]*src2[i+2]);
2765                 t1 = saturate_cast<T>(scale*(WT)src1[i+3]*src2[i+3]);
2766                 dst[i+2] = t0; dst[i+3] = t1;
2767             }
2768             #endif
2769             for( ; i < size.width; i++ )
2770                 dst[i] = saturate_cast<T>(scale*(WT)src1[i]*src2[i]);
2771         }
2772     }
2773 }
2774 
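// Div_SIMD / Recip_SIMD: vectorized division helpers. The generic templates below do
// nothing (they report 0 processed elements), so on targets without SIMD support the
// scalar loops in div_i/div_f/recip_i/recip_f cover the whole row.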
2775 template <typename T>
2776 struct Div_SIMD
2777 {
2778     int operator() (const T *, const T *, T *, int, double) const
2779     {
2780         return 0;
2781     }
2782 };
2783 
2784 template <typename T>
2785 struct Recip_SIMD
2786 {
2787     int operator() (const T *, T *, int, double) const
2788     {
2789         return 0;
2790     }
2791 };
2792 
2793 
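// The CV_SIMD128 specializations below use OpenCV's universal intrinsics: operands are
// widened to 32-bit lanes, converted to float32x4, divided with the scale folded in,
// then rounded and saturate-packed. v_select() forces the result to 0 wherever the
// divisor is 0, matching the x/0 == 0 convention of the scalar code.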
2794 #if CV_SIMD128
2795 
2796 template <>
2797 struct Div_SIMD<uchar>
2798 {
2799     bool haveSIMD;
2800     Div_SIMD() { haveSIMD = checkHardwareSupport(CV_CPU_SSE2) || checkHardwareSupport(CV_CPU_NEON); }
2801 
2802     int operator() (const uchar * src1, const uchar * src2, uchar * dst, int width, double scale) const
2803     {
2804         int x = 0;
2805 
2806         if (!haveSIMD)
2807             return x;
2808 
2809         v_float32x4 v_scale = v_setall_f32((float)scale);
2810         v_uint16x8 v_zero = v_setzero_u16();
2811 
2812         for ( ; x <= width - 8; x += 8)
2813         {
2814             v_uint16x8 v_src1 = v_load_expand(src1 + x);
2815             v_uint16x8 v_src2 = v_load_expand(src2 + x);
2816 
2817             v_uint32x4 t0, t1, t2, t3;
2818             v_expand(v_src1, t0, t1);
2819             v_expand(v_src2, t2, t3);
2820 
2821             v_float32x4 f0 = v_cvt_f32(v_reinterpret_as_s32(t0));
2822             v_float32x4 f1 = v_cvt_f32(v_reinterpret_as_s32(t1));
2823 
2824             v_float32x4 f2 = v_cvt_f32(v_reinterpret_as_s32(t2));
2825             v_float32x4 f3 = v_cvt_f32(v_reinterpret_as_s32(t3));
2826 
2827             f0 = f0 * v_scale / f2;
2828             f1 = f1 * v_scale / f3;
2829 
2830             v_int32x4 i0 = v_round(f0), i1 = v_round(f1);
2831             v_uint16x8 res = v_pack_u(i0, i1);
2832 
2833             res = v_select(v_src2 == v_zero, v_zero, res);
2834             v_pack_store(dst + x, res);
2835         }
2836 
2837         return x;
2838     }
2839 };
2840 
2841 
2842 template <>
2843 struct Div_SIMD<schar>
2844 {
2845     bool haveSIMD;
2846     Div_SIMD() { haveSIMD = checkHardwareSupport(CV_CPU_SSE2) || checkHardwareSupport(CV_CPU_NEON); }
2847 
2848     int operator() (const schar * src1, const schar * src2, schar * dst, int width, double scale) const
2849     {
2850         int x = 0;
2851 
2852         if (!haveSIMD)
2853             return x;
2854 
2855         v_float32x4 v_scale = v_setall_f32((float)scale);
2856         v_int16x8 v_zero = v_setzero_s16();
2857 
2858         for ( ; x <= width - 8; x += 8)
2859         {
2860             v_int16x8 v_src1 = v_load_expand(src1 + x);
2861             v_int16x8 v_src2 = v_load_expand(src2 + x);
2862 
2863             v_int32x4 t0, t1, t2, t3;
2864             v_expand(v_src1, t0, t1);
2865             v_expand(v_src2, t2, t3);
2866 
2867             v_float32x4 f0 = v_cvt_f32(t0);
2868             v_float32x4 f1 = v_cvt_f32(t1);
2869 
2870             v_float32x4 f2 = v_cvt_f32(t2);
2871             v_float32x4 f3 = v_cvt_f32(t3);
2872 
2873             f0 = f0 * v_scale / f2;
2874             f1 = f1 * v_scale / f3;
2875 
2876             v_int32x4 i0 = v_round(f0), i1 = v_round(f1);
2877             v_int16x8 res = v_pack(i0, i1);
2878 
2879             res = v_select(v_src2 == v_zero, v_zero, res);
2880             v_pack_store(dst + x, res);
2881         }
2882 
2883         return x;
2884     }
2885 };
2886 
2887 
2888 template <>
2889 struct Div_SIMD<ushort>
2890 {
2891     bool haveSIMD;
2892     Div_SIMD() { haveSIMD = checkHardwareSupport(CV_CPU_SSE2) || checkHardwareSupport(CV_CPU_NEON); }
2893 
2894     int operator() (const ushort * src1, const ushort * src2, ushort * dst, int width, double scale) const
2895     {
2896         int x = 0;
2897 
2898         if (!haveSIMD)
2899             return x;
2900 
2901         v_float32x4 v_scale = v_setall_f32((float)scale);
2902         v_uint16x8 v_zero = v_setzero_u16();
2903 
2904         for ( ; x <= width - 8; x += 8)
2905         {
2906             v_uint16x8 v_src1 = v_load(src1 + x);
2907             v_uint16x8 v_src2 = v_load(src2 + x);
2908 
2909             v_uint32x4 t0, t1, t2, t3;
2910             v_expand(v_src1, t0, t1);
2911             v_expand(v_src2, t2, t3);
2912 
2913             v_float32x4 f0 = v_cvt_f32(v_reinterpret_as_s32(t0));
2914             v_float32x4 f1 = v_cvt_f32(v_reinterpret_as_s32(t1));
2915 
2916             v_float32x4 f2 = v_cvt_f32(v_reinterpret_as_s32(t2));
2917             v_float32x4 f3 = v_cvt_f32(v_reinterpret_as_s32(t3));
2918 
2919             f0 = f0 * v_scale / f2;
2920             f1 = f1 * v_scale / f3;
2921 
2922             v_int32x4 i0 = v_round(f0), i1 = v_round(f1);
2923             v_uint16x8 res = v_pack_u(i0, i1);
2924 
2925             res = v_select(v_src2 == v_zero, v_zero, res);
2926             v_store(dst + x, res);
2927         }
2928 
2929         return x;
2930     }
2931 };
2932 
2933 template <>
2934 struct Div_SIMD<short>
2935 {
2936     bool haveSIMD;
2937     Div_SIMD() { haveSIMD = checkHardwareSupport(CV_CPU_SSE2) || checkHardwareSupport(CV_CPU_NEON); }
2938 
2939     int operator() (const short * src1, const short * src2, short * dst, int width, double scale) const
2940     {
2941         int x = 0;
2942 
2943         if (!haveSIMD)
2944             return x;
2945 
2946         v_float32x4 v_scale = v_setall_f32((float)scale);
2947         v_int16x8 v_zero = v_setzero_s16();
2948 
2949         for ( ; x <= width - 8; x += 8)
2950         {
2951             v_int16x8 v_src1 = v_load(src1 + x);
2952             v_int16x8 v_src2 = v_load(src2 + x);
2953 
2954             v_int32x4 t0, t1, t2, t3;
2955             v_expand(v_src1, t0, t1);
2956             v_expand(v_src2, t2, t3);
2957 
2958             v_float32x4 f0 = v_cvt_f32(t0);
2959             v_float32x4 f1 = v_cvt_f32(t1);
2960 
2961             v_float32x4 f2 = v_cvt_f32(t2);
2962             v_float32x4 f3 = v_cvt_f32(t3);
2963 
2964             f0 = f0 * v_scale / f2;
2965             f1 = f1 * v_scale / f3;
2966 
2967             v_int32x4 i0 = v_round(f0), i1 = v_round(f1);
2968             v_int16x8 res = v_pack(i0, i1);
2969 
2970             res = v_select(v_src2 == v_zero, v_zero, res);
2971             v_store(dst + x, res);
2972         }
2973 
2974         return x;
2975     }
2976 };
2977 
2978 template <>
2979 struct Div_SIMD<int>
2980 {
2981     bool haveSIMD;
2982     Div_SIMD() { haveSIMD = checkHardwareSupport(CV_CPU_SSE2) || checkHardwareSupport(CV_CPU_NEON); }
2983 
2984     int operator() (const int * src1, const int * src2, int * dst, int width, double scale) const
2985     {
2986         int x = 0;
2987 
2988         if (!haveSIMD)
2989             return x;
2990 
2991         v_float32x4 v_scale = v_setall_f32((float)scale);
2992         v_int32x4 v_zero = v_setzero_s32();
2993 
2994         for ( ; x <= width - 8; x += 8)
2995         {
2996             v_int32x4 t0 = v_load(src1 + x);
2997             v_int32x4 t1 = v_load(src1 + x + 4);
2998             v_int32x4 t2 = v_load(src2 + x);
2999             v_int32x4 t3 = v_load(src2 + x + 4);
3000 
3001             v_float32x4 f0 = v_cvt_f32(t0);
3002             v_float32x4 f1 = v_cvt_f32(t1);
3003             v_float32x4 f2 = v_cvt_f32(t2);
3004             v_float32x4 f3 = v_cvt_f32(t3);
3005 
3006             f0 = f0 * v_scale / f2;
3007             f1 = f1 * v_scale / f3;
3008 
3009             v_int32x4 res0 = v_round(f0), res1 = v_round(f1);
3010 
3011             res0 = v_select(t2 == v_zero, v_zero, res0);
3012             res1 = v_select(t3 == v_zero, v_zero, res1);
3013             v_store(dst + x, res0);
3014             v_store(dst + x + 4, res1);
3015         }
3016 
3017         return x;
3018     }
3019 };
3020 
3021 
3022 template <>
3023 struct Div_SIMD<float>
3024 {
3025     bool haveSIMD;
3026     Div_SIMD() { haveSIMD = checkHardwareSupport(CV_CPU_SSE2) || checkHardwareSupport(CV_CPU_NEON); }
3027 
3028     int operator() (const float * src1, const float * src2, float * dst, int width, double scale) const
3029     {
3030         int x = 0;
3031 
3032         if (!haveSIMD)
3033             return x;
3034 
3035         v_float32x4 v_scale = v_setall_f32((float)scale);
3036         v_float32x4 v_zero = v_setzero_f32();
3037 
3038         for ( ; x <= width - 8; x += 8)
3039         {
3040             v_float32x4 f0 = v_load(src1 + x);
3041             v_float32x4 f1 = v_load(src1 + x + 4);
3042             v_float32x4 f2 = v_load(src2 + x);
3043             v_float32x4 f3 = v_load(src2 + x + 4);
3044 
3045             v_float32x4 res0 = f0 * v_scale / f2;
3046             v_float32x4 res1 = f1 * v_scale / f3;
3047 
3048             res0 = v_select(f2 == v_zero, v_zero, res0);
3049             res1 = v_select(f3 == v_zero, v_zero, res1);
3050 
3051             v_store(dst + x, res0);
3052             v_store(dst + x + 4, res1);
3053         }
3054 
3055         return x;
3056     }
3057 };
3058 
3059 
3060 ///////////////////////// RECIPROCAL //////////////////////
3061 
3062 template <>
3063 struct Recip_SIMD<uchar>
3064 {
3065     bool haveSIMD;
3066     Recip_SIMD() { haveSIMD = checkHardwareSupport(CV_CPU_SSE2) || checkHardwareSupport(CV_CPU_NEON); }
3067 
3068     int operator() (const uchar * src2, uchar * dst, int width, double scale) const
3069     {
3070         int x = 0;
3071 
3072         if (!haveSIMD)
3073             return x;
3074 
3075         v_float32x4 v_scale = v_setall_f32((float)scale);
3076         v_uint16x8 v_zero = v_setzero_u16();
3077 
3078         for ( ; x <= width - 8; x += 8)
3079         {
3080             v_uint16x8 v_src2 = v_load_expand(src2 + x);
3081 
3082             v_uint32x4 t0, t1;
3083             v_expand(v_src2, t0, t1);
3084 
3085             v_float32x4 f0 = v_cvt_f32(v_reinterpret_as_s32(t0));
3086             v_float32x4 f1 = v_cvt_f32(v_reinterpret_as_s32(t1));
3087 
3088             f0 = v_scale / f0;
3089             f1 = v_scale / f1;
3090 
3091             v_int32x4 i0 = v_round(f0), i1 = v_round(f1);
3092             v_uint16x8 res = v_pack_u(i0, i1);
3093 
3094             res = v_select(v_src2 == v_zero, v_zero, res);
3095             v_pack_store(dst + x, res);
3096         }
3097 
3098         return x;
3099     }
3100 };
3101 
3102 
3103 template <>
3104 struct Recip_SIMD<schar>
3105 {
3106     bool haveSIMD;
3107     Recip_SIMD() { haveSIMD = checkHardwareSupport(CV_CPU_SSE2) || checkHardwareSupport(CV_CPU_NEON); }
3108 
3109     int operator() (const schar * src2, schar * dst, int width, double scale) const
3110     {
3111         int x = 0;
3112 
3113         if (!haveSIMD)
3114             return x;
3115 
3116         v_float32x4 v_scale = v_setall_f32((float)scale);
3117         v_int16x8 v_zero = v_setzero_s16();
3118 
3119         for ( ; x <= width - 8; x += 8)
3120         {
3121             v_int16x8 v_src2 = v_load_expand(src2 + x);
3122 
3123             v_int32x4 t0, t1;
3124             v_expand(v_src2, t0, t1);
3125 
3126             v_float32x4 f0 = v_cvt_f32(t0);
3127             v_float32x4 f1 = v_cvt_f32(t1);
3128 
3129             f0 = v_scale / f0;
3130             f1 = v_scale / f1;
3131 
3132             v_int32x4 i0 = v_round(f0), i1 = v_round(f1);
3133             v_int16x8 res = v_pack(i0, i1);
3134 
3135             res = v_select(v_src2 == v_zero, v_zero, res);
3136             v_pack_store(dst + x, res);
3137         }
3138 
3139         return x;
3140     }
3141 };
3142 
3143 
3144 template <>
3145 struct Recip_SIMD<ushort>
3146 {
3147     bool haveSIMD;
3148     Recip_SIMD() { haveSIMD = checkHardwareSupport(CV_CPU_SSE2) || checkHardwareSupport(CV_CPU_NEON); }
3149 
3150     int operator() (const ushort * src2, ushort * dst, int width, double scale) const
3151     {
3152         int x = 0;
3153 
3154         if (!haveSIMD)
3155             return x;
3156 
3157         v_float32x4 v_scale = v_setall_f32((float)scale);
3158         v_uint16x8 v_zero = v_setzero_u16();
3159 
3160         for ( ; x <= width - 8; x += 8)
3161         {
3162             v_uint16x8 v_src2 = v_load(src2 + x);
3163 
3164             v_uint32x4 t0, t1;
3165             v_expand(v_src2, t0, t1);
3166 
3167             v_float32x4 f0 = v_cvt_f32(v_reinterpret_as_s32(t0));
3168             v_float32x4 f1 = v_cvt_f32(v_reinterpret_as_s32(t1));
3169 
3170             f0 = v_scale / f0;
3171             f1 = v_scale / f1;
3172 
3173             v_int32x4 i0 = v_round(f0), i1 = v_round(f1);
3174             v_uint16x8 res = v_pack_u(i0, i1);
3175 
3176             res = v_select(v_src2 == v_zero, v_zero, res);
3177             v_store(dst + x, res);
3178         }
3179 
3180         return x;
3181     }
3182 };
3183 
3184 template <>
3185 struct Recip_SIMD<short>
3186 {
3187     bool haveSIMD;
3188     Recip_SIMD() { haveSIMD = checkHardwareSupport(CV_CPU_SSE2) || checkHardwareSupport(CV_CPU_NEON); }
3189 
3190     int operator() (const short * src2, short * dst, int width, double scale) const
3191     {
3192         int x = 0;
3193 
3194         if (!haveSIMD)
3195             return x;
3196 
3197         v_float32x4 v_scale = v_setall_f32((float)scale);
3198         v_int16x8 v_zero = v_setzero_s16();
3199 
3200         for ( ; x <= width - 8; x += 8)
3201         {
3202             v_int16x8 v_src2 = v_load(src2 + x);
3203 
3204             v_int32x4 t0, t1;
3205             v_expand(v_src2, t0, t1);
3206 
3207             v_float32x4 f0 = v_cvt_f32(t0);
3208             v_float32x4 f1 = v_cvt_f32(t1);
3209 
3210             f0 = v_scale / f0;
3211             f1 = v_scale / f1;
3212 
3213             v_int32x4 i0 = v_round(f0), i1 = v_round(f1);
3214             v_int16x8 res = v_pack(i0, i1);
3215 
3216             res = v_select(v_src2 == v_zero, v_zero, res);
3217             v_store(dst + x, res);
3218         }
3219 
3220         return x;
3221     }
3222 };
3223 
3224 template <>
3225 struct Recip_SIMD<int>
3226 {
3227     bool haveSIMD;
3228     Recip_SIMD() { haveSIMD = checkHardwareSupport(CV_CPU_SSE2) || checkHardwareSupport(CV_CPU_NEON); }
3229 
3230     int operator() (const int * src2, int * dst, int width, double scale) const
3231     {
3232         int x = 0;
3233 
3234         if (!haveSIMD)
3235             return x;
3236 
3237         v_float32x4 v_scale = v_setall_f32((float)scale);
3238         v_int32x4 v_zero = v_setzero_s32();
3239 
3240         for ( ; x <= width - 8; x += 8)
3241         {
3242             v_int32x4 t0 = v_load(src2 + x);
3243             v_int32x4 t1 = v_load(src2 + x + 4);
3244 
3245             v_float32x4 f0 = v_cvt_f32(t0);
3246             v_float32x4 f1 = v_cvt_f32(t1);
3247 
3248             f0 = v_scale / f0;
3249             f1 = v_scale / f1;
3250 
3251             v_int32x4 res0 = v_round(f0), res1 = v_round(f1);
3252 
3253             res0 = v_select(t0 == v_zero, v_zero, res0);
3254             res1 = v_select(t1 == v_zero, v_zero, res1);
3255             v_store(dst + x, res0);
3256             v_store(dst + x + 4, res1);
3257         }
3258 
3259         return x;
3260     }
3261 };
3262 
3263 
3264 template <>
3265 struct Recip_SIMD<float>
3266 {
3267     bool haveSIMD;
3268     Recip_SIMD() { haveSIMD = checkHardwareSupport(CV_CPU_SSE2) || checkHardwareSupport(CV_CPU_NEON); }
3269 
3270     int operator() (const float * src2, float * dst, int width, double scale) const
3271     {
3272         int x = 0;
3273 
3274         if (!haveSIMD)
3275             return x;
3276 
3277         v_float32x4 v_scale = v_setall_f32((float)scale);
3278         v_float32x4 v_zero = v_setzero_f32();
3279 
3280         for ( ; x <= width - 8; x += 8)
3281         {
3282             v_float32x4 f0 = v_load(src2 + x);
3283             v_float32x4 f1 = v_load(src2 + x + 4);
3284 
3285             v_float32x4 res0 = v_scale / f0;
3286             v_float32x4 res1 = v_scale / f1;
3287 
3288             res0 = v_select(f0 == v_zero, v_zero, res0);
3289             res1 = v_select(f1 == v_zero, v_zero, res1);
3290 
3291             v_store(dst + x, res0);
3292             v_store(dst + x + 4, res1);
3293         }
3294 
3295         return x;
3296     }
3297 };
3298 
3299 #if CV_SIMD128_64F
3300 
3301 template <>
3302 struct Div_SIMD<double>
3303 {
3304     bool haveSIMD;
3305     Div_SIMD() { haveSIMD = checkHardwareSupport(CV_CPU_SSE2) || checkHardwareSupport(CV_CPU_NEON); }
3306 
3307     int operator() (const double * src1, const double * src2, double * dst, int width, double scale) const
3308     {
3309         int x = 0;
3310 
3311         if (!haveSIMD)
3312             return x;
3313 
3314         v_float64x2 v_scale = v_setall_f64(scale);
3315         v_float64x2 v_zero = v_setzero_f64();
3316 
3317         for ( ; x <= width - 4; x += 4)
3318         {
3319             v_float64x2 f0 = v_load(src1 + x);
3320             v_float64x2 f1 = v_load(src1 + x + 2);
3321             v_float64x2 f2 = v_load(src2 + x);
3322             v_float64x2 f3 = v_load(src2 + x + 2);
3323 
3324             v_float64x2 res0 = f0 * v_scale / f2;
3325             v_float64x2 res1 = f1 * v_scale / f3;
3326 
3327             res0 = v_select(f2 == v_zero, v_zero, res0);
3328             res1 = v_select(f3 == v_zero, v_zero, res1);
3329 
3330             v_store(dst + x, res0);
3331             v_store(dst + x + 2, res1);
3332         }
3333 
3334         return x;
3335     }
3336 };
3337 
3338 template <>
3339 struct Recip_SIMD<double>
3340 {
3341     bool haveSIMD;
3342     Recip_SIMD() { haveSIMD = checkHardwareSupport(CV_CPU_SSE2) || checkHardwareSupport(CV_CPU_NEON); }
3343 
3344     int operator() (const double * src2, double * dst, int width, double scale) const
3345     {
3346         int x = 0;
3347 
3348         if (!haveSIMD)
3349             return x;
3350 
3351         v_float64x2 v_scale = v_setall_f64(scale);
3352         v_float64x2 v_zero = v_setzero_f64();
3353 
3354         for ( ; x <= width - 4; x += 4)
3355         {
3356             v_float64x2 f0 = v_load(src2 + x);
3357             v_float64x2 f1 = v_load(src2 + x + 2);
3358 
3359             v_float64x2 res0 = v_scale / f0;
3360             v_float64x2 res1 = v_scale / f1;
3361 
3362             res0 = v_select(f0 == v_zero, v_zero, res0);
3363             res1 = v_select(f1 == v_zero, v_zero, res1);
3364 
3365             v_store(dst + x, res0);
3366             v_store(dst + x + 2, res1);
3367         }
3368 
3369         return x;
3370     }
3371 };
3372 
3373 #endif
3374 
3375 #endif
3376 
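// Scalar (tail) implementations. By convention a zero divisor produces 0 in the
// destination; div_i and recip_i convert the scale to float for the integer types,
// while div_f and recip_f keep it in the destination's floating-point type.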
3377 template<typename T> static void
3378 div_i( const T* src1, size_t step1, const T* src2, size_t step2,
3379       T* dst, size_t step, Size size, double scale )
3380 {
3381     step1 /= sizeof(src1[0]);
3382     step2 /= sizeof(src2[0]);
3383     step /= sizeof(dst[0]);
3384 
3385     Div_SIMD<T> vop;
3386     float scale_f = (float)scale;
3387 
3388     for( ; size.height--; src1 += step1, src2 += step2, dst += step )
3389     {
3390         int i = vop(src1, src2, dst, size.width, scale);
3391         for( ; i < size.width; i++ )
3392         {
3393             T num = src1[i], denom = src2[i];
3394             dst[i] = denom != 0 ? saturate_cast<T>(num*scale_f/denom) : (T)0;
3395         }
3396     }
3397 }
3398 
3399 template<typename T> static void
3400 div_f( const T* src1, size_t step1, const T* src2, size_t step2,
3401       T* dst, size_t step, Size size, double scale )
3402 {
3403     T scale_f = (T)scale;
3404     step1 /= sizeof(src1[0]);
3405     step2 /= sizeof(src2[0]);
3406     step /= sizeof(dst[0]);
3407 
3408     Div_SIMD<T> vop;
3409 
3410     for( ; size.height--; src1 += step1, src2 += step2, dst += step )
3411     {
3412         int i = vop(src1, src2, dst, size.width, scale);
3413         for( ; i < size.width; i++ )
3414         {
3415             T num = src1[i], denom = src2[i];
3416             dst[i] = denom != 0 ? saturate_cast<T>(num*scale_f/denom) : (T)0;
3417         }
3418     }
3419 }
3420 
3421 template<typename T> static void
3422 recip_i( const T*, size_t, const T* src2, size_t step2,
3423          T* dst, size_t step, Size size, double scale )
3424 {
3425     step2 /= sizeof(src2[0]);
3426     step /= sizeof(dst[0]);
3427 
3428     Recip_SIMD<T> vop;
3429     float scale_f = (float)scale;
3430 
3431     for( ; size.height--; src2 += step2, dst += step )
3432     {
3433         int i = vop(src2, dst, size.width, scale);
3434         for( ; i < size.width; i++ )
3435         {
3436             T denom = src2[i];
3437             dst[i] = denom != 0 ? saturate_cast<T>(scale_f/denom) : (T)0;
3438         }
3439     }
3440 }
3441 
3442 template<typename T> static void
3443 recip_f( const T*, size_t, const T* src2, size_t step2,
3444          T* dst, size_t step, Size size, double scale )
3445 {
3446     T scale_f = (T)scale;
3447     step2 /= sizeof(src2[0]);
3448     step /= sizeof(dst[0]);
3449 
3450     Recip_SIMD<T> vop;
3451 
3452     for( ; size.height--; src2 += step2, dst += step )
3453     {
3454         int i = vop(src2, dst, size.width, scale);
3455         for( ; i < size.width; i++ )
3456         {
3457             T denom = src2[i];
3458             dst[i] = denom != 0 ? saturate_cast<T>(scale_f/denom) : (T)0;
3459         }
3460     }
3461 }
3462 
3463 
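// Type-specific wrappers used by the dispatch tables below. 'scale' arrives as a
// pointer to double; the 8u/16u/16s/32f multiply variants try the IPP kernel first
// when the scale is effectively 1 and fall back to the templated mul_ otherwise.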
3464 static void mul8u( const uchar* src1, size_t step1, const uchar* src2, size_t step2,
3465                    uchar* dst, size_t step, Size sz, void* scale)
3466 {
3467     float fscale = (float)*(const double*)scale;
3468 #if defined HAVE_IPP
3469     CV_IPP_CHECK()
3470     {
3471         if (std::fabs(fscale - 1) <= FLT_EPSILON)
3472         {
3473             if (ippiMul_8u_C1RSfs(src1, (int)step1, src2, (int)step2, dst, (int)step, ippiSize(sz), 0) >= 0)
3474             {
3475                 CV_IMPL_ADD(CV_IMPL_IPP);
3476                 return;
3477             }
3478             setIppErrorStatus();
3479         }
3480     }
3481 #endif
3482     mul_(src1, step1, src2, step2, dst, step, sz, fscale);
3483 }
3484 
3485 static void mul8s( const schar* src1, size_t step1, const schar* src2, size_t step2,
3486                    schar* dst, size_t step, Size sz, void* scale)
3487 {
3488     mul_(src1, step1, src2, step2, dst, step, sz, (float)*(const double*)scale);
3489 }
3490 
3491 static void mul16u( const ushort* src1, size_t step1, const ushort* src2, size_t step2,
3492                     ushort* dst, size_t step, Size sz, void* scale)
3493 {
3494     float fscale = (float)*(const double*)scale;
3495 #if defined HAVE_IPP
3496     CV_IPP_CHECK()
3497     {
3498         if (std::fabs(fscale - 1) <= FLT_EPSILON)
3499         {
3500             if (ippiMul_16u_C1RSfs(src1, (int)step1, src2, (int)step2, dst, (int)step, ippiSize(sz), 0) >= 0)
3501             {
3502                 CV_IMPL_ADD(CV_IMPL_IPP);
3503                 return;
3504             }
3505             setIppErrorStatus();
3506         }
3507     }
3508 #endif
3509     mul_(src1, step1, src2, step2, dst, step, sz, fscale);
3510 }
3511 
3512 static void mul16s( const short* src1, size_t step1, const short* src2, size_t step2,
3513                     short* dst, size_t step, Size sz, void* scale)
3514 {
3515     float fscale = (float)*(const double*)scale;
3516 #if defined HAVE_IPP
3517     CV_IPP_CHECK()
3518     {
3519         if (std::fabs(fscale - 1) <= FLT_EPSILON)
3520         {
3521             if (ippiMul_16s_C1RSfs(src1, (int)step1, src2, (int)step2, dst, (int)step, ippiSize(sz), 0) >= 0)
3522             {
3523                 CV_IMPL_ADD(CV_IMPL_IPP);
3524                 return;
3525             }
3526             setIppErrorStatus();
3527         }
3528     }
3529 #endif
3530     mul_(src1, step1, src2, step2, dst, step, sz, fscale);
3531 }
3532 
3533 static void mul32s( const int* src1, size_t step1, const int* src2, size_t step2,
3534                     int* dst, size_t step, Size sz, void* scale)
3535 {
3536     mul_(src1, step1, src2, step2, dst, step, sz, *(const double*)scale);
3537 }
3538 
3539 static void mul32f( const float* src1, size_t step1, const float* src2, size_t step2,
3540                     float* dst, size_t step, Size sz, void* scale)
3541 {
3542     float fscale = (float)*(const double*)scale;
3543 #if defined HAVE_IPP
3544     CV_IPP_CHECK()
3545     {
3546         if (std::fabs(fscale - 1) <= FLT_EPSILON)
3547         {
3548             if (ippiMul_32f_C1R(src1, (int)step1, src2, (int)step2, dst, (int)step, ippiSize(sz)) >= 0)
3549             {
3550                 CV_IMPL_ADD(CV_IMPL_IPP);
3551                 return;
3552             }
3553             setIppErrorStatus();
3554         }
3555     }
3556 #endif
3557     mul_(src1, step1, src2, step2, dst, step, sz, fscale);
3558 }
3559 
3560 static void mul64f( const double* src1, size_t step1, const double* src2, size_t step2,
3561                     double* dst, size_t step, Size sz, void* scale)
3562 {
3563     mul_(src1, step1, src2, step2, dst, step, sz, *(const double*)scale);
3564 }
3565 
3566 static void div8u( const uchar* src1, size_t step1, const uchar* src2, size_t step2,
3567                    uchar* dst, size_t step, Size sz, void* scale)
3568 {
3569     if( src1 )
3570         div_i(src1, step1, src2, step2, dst, step, sz, *(const double*)scale);
3571     else
3572         recip_i(src1, step1, src2, step2, dst, step, sz, *(const double*)scale);
3573 }
3574 
3575 static void div8s( const schar* src1, size_t step1, const schar* src2, size_t step2,
3576                   schar* dst, size_t step, Size sz, void* scale)
3577 {
3578     div_i(src1, step1, src2, step2, dst, step, sz, *(const double*)scale);
3579 }
3580 
3581 static void div16u( const ushort* src1, size_t step1, const ushort* src2, size_t step2,
3582                     ushort* dst, size_t step, Size sz, void* scale)
3583 {
3584     div_i(src1, step1, src2, step2, dst, step, sz, *(const double*)scale);
3585 }
3586 
3587 static void div16s( const short* src1, size_t step1, const short* src2, size_t step2,
3588                     short* dst, size_t step, Size sz, void* scale)
3589 {
3590     div_i(src1, step1, src2, step2, dst, step, sz, *(const double*)scale);
3591 }
3592 
3593 static void div32s( const int* src1, size_t step1, const int* src2, size_t step2,
3594                     int* dst, size_t step, Size sz, void* scale)
3595 {
3596     div_i(src1, step1, src2, step2, dst, step, sz, *(const double*)scale);
3597 }
3598 
3599 static void div32f( const float* src1, size_t step1, const float* src2, size_t step2,
3600                     float* dst, size_t step, Size sz, void* scale)
3601 {
3602     div_f(src1, step1, src2, step2, dst, step, sz, *(const double*)scale);
3603 }
3604 
3605 static void div64f( const double* src1, size_t step1, const double* src2, size_t step2,
3606                     double* dst, size_t step, Size sz, void* scale)
3607 {
3608     div_f(src1, step1, src2, step2, dst, step, sz, *(const double*)scale);
3609 }
3610 
3611 static void recip8u( const uchar* src1, size_t step1, const uchar* src2, size_t step2,
3612                   uchar* dst, size_t step, Size sz, void* scale)
3613 {
3614     recip_i(src1, step1, src2, step2, dst, step, sz, *(const double*)scale);
3615 }
3616 
3617 static void recip8s( const schar* src1, size_t step1, const schar* src2, size_t step2,
3618                   schar* dst, size_t step, Size sz, void* scale)
3619 {
3620     recip_i(src1, step1, src2, step2, dst, step, sz, *(const double*)scale);
3621 }
3622 
3623 static void recip16u( const ushort* src1, size_t step1, const ushort* src2, size_t step2,
3624                    ushort* dst, size_t step, Size sz, void* scale)
3625 {
3626     recip_i(src1, step1, src2, step2, dst, step, sz, *(const double*)scale);
3627 }
3628 
3629 static void recip16s( const short* src1, size_t step1, const short* src2, size_t step2,
3630                    short* dst, size_t step, Size sz, void* scale)
3631 {
3632     recip_i(src1, step1, src2, step2, dst, step, sz, *(const double*)scale);
3633 }
3634 
3635 static void recip32s( const int* src1, size_t step1, const int* src2, size_t step2,
3636                    int* dst, size_t step, Size sz, void* scale)
3637 {
3638     recip_i(src1, step1, src2, step2, dst, step, sz, *(const double*)scale);
3639 }
3640 
3641 static void recip32f( const float* src1, size_t step1, const float* src2, size_t step2,
3642                    float* dst, size_t step, Size sz, void* scale)
3643 {
3644     recip_f(src1, step1, src2, step2, dst, step, sz, *(const double*)scale);
3645 }
3646 
3647 static void recip64f( const double* src1, size_t step1, const double* src2, size_t step2,
3648                    double* dst, size_t step, Size sz, void* scale)
3649 {
3650     recip_f(src1, step1, src2, step2, dst, step, sz, *(const double*)scale);
3651 }
3652 
3653 
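// Dispatch tables indexed by matrix depth (CV_8U, CV_8S, CV_16U, CV_16S, CV_32S,
// CV_32F, CV_64F); the trailing 0 corresponds to the remaining, unsupported depth.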
3654 static BinaryFunc* getMulTab()
3655 {
3656     static BinaryFunc mulTab[] =
3657     {
3658         (BinaryFunc)mul8u, (BinaryFunc)mul8s, (BinaryFunc)mul16u,
3659         (BinaryFunc)mul16s, (BinaryFunc)mul32s, (BinaryFunc)mul32f,
3660         (BinaryFunc)mul64f, 0
3661     };
3662 
3663     return mulTab;
3664 }
3665 
3666 static BinaryFunc* getDivTab()
3667 {
3668     static BinaryFunc divTab[] =
3669     {
3670         (BinaryFunc)div8u, (BinaryFunc)div8s, (BinaryFunc)div16u,
3671         (BinaryFunc)div16s, (BinaryFunc)div32s, (BinaryFunc)div32f,
3672         (BinaryFunc)div64f, 0
3673     };
3674 
3675     return divTab;
3676 }
3677 
3678 static BinaryFunc* getRecipTab()
3679 {
3680     static BinaryFunc recipTab[] =
3681     {
3682         (BinaryFunc)recip8u, (BinaryFunc)recip8s, (BinaryFunc)recip16u,
3683         (BinaryFunc)recip16s, (BinaryFunc)recip32s, (BinaryFunc)recip32f,
3684         (BinaryFunc)recip64f, 0
3685     };
3686 
3687     return recipTab;
3688 }
3689 
3690 }
3691 
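// Public entry points. arithm_op() handles type promotion, scalar operands and the
// OpenCL path; the tables above supply the per-depth CPU kernels. cv::multiply selects
// the unscaled OpenCL kernel when scale is exactly 1, and divide(scale, src2, dst)
// routes to the reciprocal table (recip_* ignores the first source argument).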
3692 void cv::multiply(InputArray src1, InputArray src2,
3693                   OutputArray dst, double scale, int dtype)
3694 {
3695     arithm_op(src1, src2, dst, noArray(), dtype, getMulTab(),
3696               true, &scale, std::abs(scale - 1.0) < DBL_EPSILON ? OCL_OP_MUL : OCL_OP_MUL_SCALE);
3697 }
3698 
3699 void cv::divide(InputArray src1, InputArray src2,
3700                 OutputArray dst, double scale, int dtype)
3701 {
3702     arithm_op(src1, src2, dst, noArray(), dtype, getDivTab(), true, &scale, OCL_OP_DIV_SCALE);
3703 }
3704 
3705 void cv::divide(double scale, InputArray src2,
3706                 OutputArray dst, int dtype)
3707 {
3708     arithm_op(src2, src2, dst, noArray(), dtype, getRecipTab(), true, &scale, OCL_OP_RECIP_SCALE);
3709 }
3710 
3711 /****************************************************************************************\
3712 *                                      addWeighted                                       *
3713 \****************************************************************************************/
3714 
3715 namespace cv
3716 {
3717 
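// AddWeighted_SIMD computes dst = src1*alpha + src2*beta + gamma for a vectorized
// prefix of each row; the generic template processes nothing, so the scalar loop in
// addWeighted_ covers the full width on targets without SSE2/SSE4.1/NEON.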
3718 template <typename T, typename WT>
3719 struct AddWeighted_SIMD
3720 {
3721     int operator() (const T *, const T *, T *, int, WT, WT, WT) const
3722     {
3723         return 0;
3724     }
3725 };
3726 
3727 #if CV_SSE2
3728 
3729 template <>
3730 struct AddWeighted_SIMD<schar, float>
3731 {
3732     AddWeighted_SIMD()
3733     {
3734         haveSSE2 = checkHardwareSupport(CV_CPU_SSE2);
3735     }
3736 
3737     int operator() (const schar * src1, const schar * src2, schar * dst, int width, float alpha, float beta, float gamma) const
3738     {
3739         int x = 0;
3740 
3741         if (!haveSSE2)
3742             return x;
3743 
3744         __m128i v_zero = _mm_setzero_si128();
3745         __m128 v_alpha = _mm_set1_ps(alpha), v_beta = _mm_set1_ps(beta),
3746                v_gamma = _mm_set1_ps(gamma);
3747 
3748         for( ; x <= width - 8; x += 8 )
3749         {
3750             __m128i v_src1 = _mm_loadl_epi64((const __m128i *)(src1 + x));
3751             __m128i v_src2 = _mm_loadl_epi64((const __m128i *)(src2 + x));
3752 
3753             __m128i v_src1_p = _mm_srai_epi16(_mm_unpacklo_epi8(v_zero, v_src1), 8);
3754             __m128i v_src2_p = _mm_srai_epi16(_mm_unpacklo_epi8(v_zero, v_src2), 8);
3755 
3756             __m128 v_dstf0 = _mm_mul_ps(_mm_cvtepi32_ps(_mm_srai_epi32(_mm_unpacklo_epi16(v_zero, v_src1_p), 16)), v_alpha);
3757             v_dstf0 = _mm_add_ps(_mm_add_ps(v_dstf0, v_gamma),
3758                                  _mm_mul_ps(_mm_cvtepi32_ps(_mm_srai_epi32(_mm_unpacklo_epi16(v_zero, v_src2_p), 16)), v_beta));
3759 
3760             __m128 v_dstf1 = _mm_mul_ps(_mm_cvtepi32_ps(_mm_srai_epi32(_mm_unpackhi_epi16(v_zero, v_src1_p), 16)), v_alpha);
3761             v_dstf1 = _mm_add_ps(_mm_add_ps(v_dstf1, v_gamma),
3762                                  _mm_mul_ps(_mm_cvtepi32_ps(_mm_srai_epi32(_mm_unpackhi_epi16(v_zero, v_src2_p), 16)), v_beta));
3763 
3764             __m128i v_dst16 = _mm_packs_epi32(_mm_cvtps_epi32(v_dstf0),
3765                                               _mm_cvtps_epi32(v_dstf1));
3766 
3767             _mm_storel_epi64((__m128i *)(dst + x), _mm_packs_epi16(v_dst16, v_zero));
3768         }
3769 
3770         return x;
3771     }
3772 
3773     bool haveSSE2;
3774 };
3775 
3776 template <>
3777 struct AddWeighted_SIMD<short, float>
3778 {
3779     AddWeighted_SIMD()
3780     {
3781         haveSSE2 = checkHardwareSupport(CV_CPU_SSE2);
3782     }
3783 
3784     int operator() (const short * src1, const short * src2, short * dst, int width, float alpha, float beta, float gamma) const
3785     {
3786         int x = 0;
3787 
3788         if (!haveSSE2)
3789             return x;
3790 
3791         __m128i v_zero = _mm_setzero_si128();
3792         __m128 v_alpha = _mm_set1_ps(alpha), v_beta = _mm_set1_ps(beta),
3793                v_gamma = _mm_set1_ps(gamma);
3794 
3795         for( ; x <= width - 8; x += 8 )
3796         {
3797             __m128i v_src1 = _mm_loadu_si128((const __m128i *)(src1 + x));
3798             __m128i v_src2 = _mm_loadu_si128((const __m128i *)(src2 + x));
3799 
3800             __m128 v_dstf0 = _mm_mul_ps(_mm_cvtepi32_ps(_mm_srai_epi32(_mm_unpacklo_epi16(v_zero, v_src1), 16)), v_alpha);
3801             v_dstf0 = _mm_add_ps(_mm_add_ps(v_dstf0, v_gamma),
3802                                  _mm_mul_ps(_mm_cvtepi32_ps(_mm_srai_epi32(_mm_unpacklo_epi16(v_zero, v_src2), 16)), v_beta));
3803 
3804             __m128 v_dstf1 = _mm_mul_ps(_mm_cvtepi32_ps(_mm_srai_epi32(_mm_unpackhi_epi16(v_zero, v_src1), 16)), v_alpha);
3805             v_dstf1 = _mm_add_ps(_mm_add_ps(v_dstf1, v_gamma),
3806                                  _mm_mul_ps(_mm_cvtepi32_ps(_mm_srai_epi32(_mm_unpackhi_epi16(v_zero, v_src2), 16)), v_beta));
3807 
3808             _mm_storeu_si128((__m128i *)(dst + x), _mm_packs_epi32(_mm_cvtps_epi32(v_dstf0),
3809                                                                    _mm_cvtps_epi32(v_dstf1)));
3810         }
3811 
3812         return x;
3813     }
3814 
3815     bool haveSSE2;
3816 };
3817 
3818 #if CV_SSE4_1
3819 
3820 template <>
3821 struct AddWeighted_SIMD<ushort, float>
3822 {
3823     AddWeighted_SIMD()
3824     {
3825         haveSSE4_1 = checkHardwareSupport(CV_CPU_SSE4_1);
3826     }
3827 
3828     int operator() (const ushort * src1, const ushort * src2, ushort * dst, int width, float alpha, float beta, float gamma) const
3829     {
3830         int x = 0;
3831 
3832         if (!haveSSE4_1)
3833             return x;
3834 
3835         __m128i v_zero = _mm_setzero_si128();
3836         __m128 v_alpha = _mm_set1_ps(alpha), v_beta = _mm_set1_ps(beta),
3837                v_gamma = _mm_set1_ps(gamma);
3838 
3839         for( ; x <= width - 8; x += 8 )
3840         {
3841             __m128i v_src1 = _mm_loadu_si128((const __m128i *)(src1 + x));
3842             __m128i v_src2 = _mm_loadu_si128((const __m128i *)(src2 + x));
3843 
3844             __m128 v_dstf0 = _mm_mul_ps(_mm_cvtepi32_ps(_mm_unpacklo_epi16(v_src1, v_zero)), v_alpha);
3845             v_dstf0 = _mm_add_ps(_mm_add_ps(v_dstf0, v_gamma),
3846                                  _mm_mul_ps(_mm_cvtepi32_ps(_mm_unpacklo_epi16(v_src2, v_zero)), v_beta));
3847 
3848             __m128 v_dstf1 = _mm_mul_ps(_mm_cvtepi32_ps(_mm_unpackhi_epi16(v_src1, v_zero)), v_alpha);
3849             v_dstf1 = _mm_add_ps(_mm_add_ps(v_dstf1, v_gamma),
3850                                  _mm_mul_ps(_mm_cvtepi32_ps(_mm_unpackhi_epi16(v_src2, v_zero)), v_beta));
3851 
3852             _mm_storeu_si128((__m128i *)(dst + x), _mm_packus_epi32(_mm_cvtps_epi32(v_dstf0),
3853                                                                     _mm_cvtps_epi32(v_dstf1)));
3854         }
3855 
3856         return x;
3857     }
3858 
3859     bool haveSSE4_1;
3860 };
3861 
3862 #endif
3863 
3864 #elif CV_NEON
3865 
3866 template <>
3867 struct AddWeighted_SIMD<schar, float>
3868 {
3869     int operator() (const schar * src1, const schar * src2, schar * dst, int width, float alpha, float beta, float gamma) const
3870     {
3871         int x = 0;
3872 
3873         float32x4_t g = vdupq_n_f32 (gamma);
3874 
3875         for( ; x <= width - 8; x += 8 )
3876         {
3877             int8x8_t in1 = vld1_s8(src1 + x);
3878             int16x8_t in1_16 = vmovl_s8(in1);
3879             float32x4_t in1_f_l = vcvtq_f32_s32(vmovl_s16(vget_low_s16(in1_16)));
3880             float32x4_t in1_f_h = vcvtq_f32_s32(vmovl_s16(vget_high_s16(in1_16)));
3881 
3882             int8x8_t in2 = vld1_s8(src2+x);
3883             int16x8_t in2_16 = vmovl_s8(in2);
3884             float32x4_t in2_f_l = vcvtq_f32_s32(vmovl_s16(vget_low_s16(in2_16)));
3885             float32x4_t in2_f_h = vcvtq_f32_s32(vmovl_s16(vget_high_s16(in2_16)));
3886 
3887             float32x4_t out_f_l = vaddq_f32(vmulq_n_f32(in1_f_l, alpha), vmulq_n_f32(in2_f_l, beta));
3888             float32x4_t out_f_h = vaddq_f32(vmulq_n_f32(in1_f_h, alpha), vmulq_n_f32(in2_f_h, beta));
3889             out_f_l = vaddq_f32(out_f_l, g);
3890             out_f_h = vaddq_f32(out_f_h, g);
3891 
3892             int16x4_t out_16_l = vqmovn_s32(cv_vrndq_s32_f32(out_f_l));
3893             int16x4_t out_16_h = vqmovn_s32(cv_vrndq_s32_f32(out_f_h));
3894 
3895             int16x8_t out_16 = vcombine_s16(out_16_l, out_16_h);
3896             int8x8_t out = vqmovn_s16(out_16);
3897 
3898             vst1_s8(dst + x, out);
3899         }
3900 
3901         return x;
3902     }
3903 };
3904 
3905 template <>
3906 struct AddWeighted_SIMD<ushort, float>
3907 {
3908     int operator() (const ushort * src1, const ushort * src2, ushort * dst, int width, float alpha, float beta, float gamma) const
3909     {
3910         int x = 0;
3911 
3912         float32x4_t g = vdupq_n_f32(gamma);
3913 
3914         for( ; x <= width - 8; x += 8 )
3915         {
3916             uint16x8_t v_src1 = vld1q_u16(src1 + x), v_src2 = vld1q_u16(src2 + x);
3917 
3918             float32x4_t v_s1 = vmulq_n_f32(vcvtq_f32_u32(vmovl_u16(vget_low_u16(v_src1))), alpha);
3919             float32x4_t v_s2 = vmulq_n_f32(vcvtq_f32_u32(vmovl_u16(vget_low_u16(v_src2))), beta);
3920             uint16x4_t v_dst1 = vqmovn_u32(cv_vrndq_u32_f32(vaddq_f32(vaddq_f32(v_s1, v_s2), g)));
3921 
3922             v_s1 = vmulq_n_f32(vcvtq_f32_u32(vmovl_u16(vget_high_u16(v_src1))), alpha);
3923             v_s2 = vmulq_n_f32(vcvtq_f32_u32(vmovl_u16(vget_high_u16(v_src2))), beta);
3924             uint16x4_t v_dst2 = vqmovn_u32(cv_vrndq_u32_f32(vaddq_f32(vaddq_f32(v_s1, v_s2), g)));
3925 
3926             vst1q_u16(dst + x, vcombine_u16(v_dst1, v_dst2));
3927         }
3928 
3929         return x;
3930     }
3931 };
3932 
3933 template <>
3934 struct AddWeighted_SIMD<short, float>
3935 {
3936     int operator() (const short * src1, const short * src2, short * dst, int width, float alpha, float beta, float gamma) const
3937     {
3938         int x = 0;
3939 
3940         float32x4_t g = vdupq_n_f32(gamma);
3941 
3942         for( ; x <= width - 8; x += 8 )
3943         {
3944             int16x8_t v_src1 = vld1q_s16(src1 + x), v_src2 = vld1q_s16(src2 + x);
3945 
3946             float32x4_t v_s1 = vmulq_n_f32(vcvtq_f32_s32(vmovl_s16(vget_low_s16(v_src1))), alpha);
3947             float32x4_t v_s2 = vmulq_n_f32(vcvtq_f32_s32(vmovl_s16(vget_low_s16(v_src2))), beta);
3948             int16x4_t v_dst1 = vqmovn_s32(cv_vrndq_s32_f32(vaddq_f32(vaddq_f32(v_s1, v_s2), g)));
3949 
3950             v_s1 = vmulq_n_f32(vcvtq_f32_s32(vmovl_s16(vget_high_s16(v_src1))), alpha);
3951             v_s2 = vmulq_n_f32(vcvtq_f32_s32(vmovl_s16(vget_high_s16(v_src2))), beta);
3952             int16x4_t v_dst2 = vqmovn_s32(cv_vrndq_s32_f32(vaddq_f32(vaddq_f32(v_s1, v_s2), g)));
3953 
3954             vst1q_s16(dst + x, vcombine_s16(v_dst1, v_dst2));
3955         }
3956 
3957         return x;
3958     }
3959 };
3960 
3961 #endif
3962 
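// addWeighted_: generic blend kernel. The weights are widened to WT (float for all of
// the 8/16-bit specializations), the SIMD functor handles the head of each row, and
// the unrolled and scalar loops below saturate the remainder into the destination type.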
3963 template<typename T, typename WT> static void
3964 addWeighted_( const T* src1, size_t step1, const T* src2, size_t step2,
3965               T* dst, size_t step, Size size, void* _scalars )
3966 {
3967     const double* scalars = (const double*)_scalars;
3968     WT alpha = (WT)scalars[0], beta = (WT)scalars[1], gamma = (WT)scalars[2];
3969     step1 /= sizeof(src1[0]);
3970     step2 /= sizeof(src2[0]);
3971     step /= sizeof(dst[0]);
3972 
3973     AddWeighted_SIMD<T, WT> vop;
3974 
3975     for( ; size.height--; src1 += step1, src2 += step2, dst += step )
3976     {
3977         int x = vop(src1, src2, dst, size.width, alpha, beta, gamma);
3978         #if CV_ENABLE_UNROLLED
3979         for( ; x <= size.width - 4; x += 4 )
3980         {
3981             T t0 = saturate_cast<T>(src1[x]*alpha + src2[x]*beta + gamma);
3982             T t1 = saturate_cast<T>(src1[x+1]*alpha + src2[x+1]*beta + gamma);
3983             dst[x] = t0; dst[x+1] = t1;
3984 
3985             t0 = saturate_cast<T>(src1[x+2]*alpha + src2[x+2]*beta + gamma);
3986             t1 = saturate_cast<T>(src1[x+3]*alpha + src2[x+3]*beta + gamma);
3987             dst[x+2] = t0; dst[x+3] = t1;
3988         }
3989         #endif
3990         for( ; x < size.width; x++ )
3991             dst[x] = saturate_cast<T>(src1[x]*alpha + src2[x]*beta + gamma);
3992     }
3993 }
3994 
3995 
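// addWeighted8u: hand-written 8-bit blend with inline SSE2/NEON bodies instead of an
// AddWeighted_SIMD specialization; CV_8TO32F converts a uchar to float in the scalar tail.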
3996 static void
3997 addWeighted8u( const uchar* src1, size_t step1,
3998                const uchar* src2, size_t step2,
3999                uchar* dst, size_t step, Size size,
4000                void* _scalars )
4001 {
4002     const double* scalars = (const double*)_scalars;
4003     float alpha = (float)scalars[0], beta = (float)scalars[1], gamma = (float)scalars[2];
4004 
4005     for( ; size.height--; src1 += step1, src2 += step2, dst += step )
4006     {
4007         int x = 0;
4008 
4009 #if CV_SSE2
4010         if( USE_SSE2 )
4011         {
4012             __m128 a4 = _mm_set1_ps(alpha), b4 = _mm_set1_ps(beta), g4 = _mm_set1_ps(gamma);
4013             __m128i z = _mm_setzero_si128();
4014 
4015             for( ; x <= size.width - 8; x += 8 )
4016             {
4017                 __m128i u = _mm_unpacklo_epi8(_mm_loadl_epi64((const __m128i*)(src1 + x)), z);
4018                 __m128i v = _mm_unpacklo_epi8(_mm_loadl_epi64((const __m128i*)(src2 + x)), z);
4019 
4020                 __m128 u0 = _mm_cvtepi32_ps(_mm_unpacklo_epi16(u, z));
4021                 __m128 u1 = _mm_cvtepi32_ps(_mm_unpackhi_epi16(u, z));
4022                 __m128 v0 = _mm_cvtepi32_ps(_mm_unpacklo_epi16(v, z));
4023                 __m128 v1 = _mm_cvtepi32_ps(_mm_unpackhi_epi16(v, z));
4024 
4025                 u0 = _mm_add_ps(_mm_mul_ps(u0, a4), _mm_mul_ps(v0, b4));
4026                 u1 = _mm_add_ps(_mm_mul_ps(u1, a4), _mm_mul_ps(v1, b4));
4027                 u0 = _mm_add_ps(u0, g4); u1 = _mm_add_ps(u1, g4);
4028 
4029                 u = _mm_packs_epi32(_mm_cvtps_epi32(u0), _mm_cvtps_epi32(u1));
4030                 u = _mm_packus_epi16(u, u);
4031 
4032                 _mm_storel_epi64((__m128i*)(dst + x), u);
4033             }
4034         }
4035 #elif CV_NEON
4036         float32x4_t g = vdupq_n_f32 (gamma);
4037 
4038         for( ; x <= size.width - 8; x += 8 )
4039         {
4040             uint8x8_t in1 = vld1_u8(src1+x);
4041             uint16x8_t in1_16 = vmovl_u8(in1);
4042             float32x4_t in1_f_l = vcvtq_f32_u32(vmovl_u16(vget_low_u16(in1_16)));
4043             float32x4_t in1_f_h = vcvtq_f32_u32(vmovl_u16(vget_high_u16(in1_16)));
4044 
4045             uint8x8_t in2 = vld1_u8(src2+x);
4046             uint16x8_t in2_16 = vmovl_u8(in2);
4047             float32x4_t in2_f_l = vcvtq_f32_u32(vmovl_u16(vget_low_u16(in2_16)));
4048             float32x4_t in2_f_h = vcvtq_f32_u32(vmovl_u16(vget_high_u16(in2_16)));
4049 
4050             float32x4_t out_f_l = vaddq_f32(vmulq_n_f32(in1_f_l, alpha), vmulq_n_f32(in2_f_l, beta));
4051             float32x4_t out_f_h = vaddq_f32(vmulq_n_f32(in1_f_h, alpha), vmulq_n_f32(in2_f_h, beta));
4052             out_f_l = vaddq_f32(out_f_l, g);
4053             out_f_h = vaddq_f32(out_f_h, g);
4054 
4055             uint16x4_t out_16_l = vqmovun_s32(cv_vrndq_s32_f32(out_f_l));
4056             uint16x4_t out_16_h = vqmovun_s32(cv_vrndq_s32_f32(out_f_h));
4057 
4058             uint16x8_t out_16 = vcombine_u16(out_16_l, out_16_h);
4059             uint8x8_t out = vqmovn_u16(out_16);
4060 
4061             vst1_u8(dst+x, out);
4062         }
4063 #endif
4064         #if CV_ENABLE_UNROLLED
4065         for( ; x <= size.width - 4; x += 4 )
4066         {
4067             float t0, t1;
4068             t0 = CV_8TO32F(src1[x])*alpha + CV_8TO32F(src2[x])*beta + gamma;
4069             t1 = CV_8TO32F(src1[x+1])*alpha + CV_8TO32F(src2[x+1])*beta + gamma;
4070 
4071             dst[x] = saturate_cast<uchar>(t0);
4072             dst[x+1] = saturate_cast<uchar>(t1);
4073 
4074             t0 = CV_8TO32F(src1[x+2])*alpha + CV_8TO32F(src2[x+2])*beta + gamma;
4075             t1 = CV_8TO32F(src1[x+3])*alpha + CV_8TO32F(src2[x+3])*beta + gamma;
4076 
4077             dst[x+2] = saturate_cast<uchar>(t0);
4078             dst[x+3] = saturate_cast<uchar>(t1);
4079         }
4080         #endif
4081 
4082         for( ; x < size.width; x++ )
4083         {
4084             float t0 = CV_8TO32F(src1[x])*alpha + CV_8TO32F(src2[x])*beta + gamma;
4085             dst[x] = saturate_cast<uchar>(t0);
4086         }
4087     }
4088 }
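// Note (added commentary, not in the original source): both vector paths above process
// 8 pixels per iteration by widening uchar -> float, and the scalar loops after them
// finish any remaining width. Results saturate to [0, 255]; for example, with
// alpha = beta = 0.8 and src1[x] = src2[x] = 200, the exact sum 320 is stored as 255.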
4089 
4090 static void addWeighted8s( const schar* src1, size_t step1, const schar* src2, size_t step2,
4091                            schar* dst, size_t step, Size sz, void* scalars )
4092 {
4093     addWeighted_<schar, float>(src1, step1, src2, step2, dst, step, sz, scalars);
4094 }
4095 
4096 static void addWeighted16u( const ushort* src1, size_t step1, const ushort* src2, size_t step2,
4097                             ushort* dst, size_t step, Size sz, void* scalars )
4098 {
4099     addWeighted_<ushort, float>(src1, step1, src2, step2, dst, step, sz, scalars);
4100 }
4101 
4102 static void addWeighted16s( const short* src1, size_t step1, const short* src2, size_t step2,
4103                             short* dst, size_t step, Size sz, void* scalars )
4104 {
4105     addWeighted_<short, float>(src1, step1, src2, step2, dst, step, sz, scalars);
4106 }
4107 
4108 static void addWeighted32s( const int* src1, size_t step1, const int* src2, size_t step2,
4109                             int* dst, size_t step, Size sz, void* scalars )
4110 {
4111     addWeighted_<int, double>(src1, step1, src2, step2, dst, step, sz, scalars);
4112 }
4113 
4114 static void addWeighted32f( const float* src1, size_t step1, const float* src2, size_t step2,
4115                             float* dst, size_t step, Size sz, void* scalars )
4116 {
4117     addWeighted_<float, double>(src1, step1, src2, step2, dst, step, sz, scalars);
4118 }
4119 
4120 static void addWeighted64f( const double* src1, size_t step1, const double* src2, size_t step2,
4121                             double* dst, size_t step, Size sz, void* scalars )
4122 {
4123     addWeighted_<double, double>(src1, step1, src2, step2, dst, step, sz, scalars);
4124 }
4125 
4126 static BinaryFunc* getAddWeightedTab()
4127 {
4128     static BinaryFunc addWeightedTab[] =
4129     {
4130         (BinaryFunc)GET_OPTIMIZED(addWeighted8u), (BinaryFunc)GET_OPTIMIZED(addWeighted8s), (BinaryFunc)GET_OPTIMIZED(addWeighted16u),
4131         (BinaryFunc)GET_OPTIMIZED(addWeighted16s), (BinaryFunc)GET_OPTIMIZED(addWeighted32s), (BinaryFunc)addWeighted32f,
4132         (BinaryFunc)addWeighted64f, 0
4133     };
4134 
4135     return addWeightedTab;
4136 }
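// Illustrative sketch (not part of the original source): the table above is indexed by
// matrix depth, so the dispatch performed by arithm_op() is effectively:
//
//     BinaryFunc fn = getAddWeightedTab()[CV_8U];   // selects addWeighted8u
//                                                   // (or its GET_OPTIMIZED variant)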
4137 
4138 }
4139 
4140 void cv::addWeighted( InputArray src1, double alpha, InputArray src2,
4141                       double beta, double gamma, OutputArray dst, int dtype )
4142 {
4143     double scalars[] = {alpha, beta, gamma};
4144     arithm_op(src1, src2, dst, noArray(), dtype, getAddWeightedTab(), true, scalars, OCL_OP_ADDW);
4145 }
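// Usage sketch (added for illustration, not part of the original source): blending two
// same-size, same-type images with the wrapper above.
//
//     cv::Mat a = ..., b = ...;            // two images of identical size and type
//     cv::Mat blended;
//     cv::addWeighted(a, 0.7, b, 0.3, 0.0, blended);   // blended = 0.7*a + 0.3*b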
4146 
4147 
4148 /****************************************************************************************\
4149 *                                          compare                                       *
4150 \****************************************************************************************/
4151 
4152 namespace cv
4153 {
4154 
4155 template <typename T>
4156 struct Cmp_SIMD
4157 {
4158     explicit Cmp_SIMD(int)
4159     {
4160     }
4161 
4162     int operator () (const T *, const T *, uchar *, int) const
4163     {
4164         return 0;
4165     }
4166 };
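// Note (added commentary, not in the original source): each Cmp_SIMD specialization
// returns the number of elements it handled, so the generic cmp_() loop below resumes
// scalar processing at that index. The fallback above handles no elements (returns 0).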
4167 
4168 #if CV_NEON
4169 
4170 template <>
4171 struct Cmp_SIMD<schar>
4172 {
4173     explicit Cmp_SIMD(int code_) :
4174         code(code_)
4175     {
4176         CV_Assert(code == CMP_GT || code == CMP_LE ||
4177                   code == CMP_EQ || code == CMP_NE);
4178 
4179         v_mask = vdupq_n_u8(255);
4180     }
4181 
4182     int operator () (const schar * src1, const schar * src2, uchar * dst, int width) const
4183     {
4184         int x = 0;
4185 
4186         if (code == CMP_GT)
4187             for ( ; x <= width - 16; x += 16)
4188                 vst1q_u8(dst + x, vcgtq_s8(vld1q_s8(src1 + x), vld1q_s8(src2 + x)));
4189         else if (code == CMP_LE)
4190             for ( ; x <= width - 16; x += 16)
4191                 vst1q_u8(dst + x, vcleq_s8(vld1q_s8(src1 + x), vld1q_s8(src2 + x)));
4192         else if (code == CMP_EQ)
4193             for ( ; x <= width - 16; x += 16)
4194                 vst1q_u8(dst + x, vceqq_s8(vld1q_s8(src1 + x), vld1q_s8(src2 + x)));
4195         else if (code == CMP_NE)
4196             for ( ; x <= width - 16; x += 16)
4197                 vst1q_u8(dst + x, veorq_u8(vceqq_s8(vld1q_s8(src1 + x), vld1q_s8(src2 + x)), v_mask));
4198 
4199         return x;
4200     }
4201 
4202     int code;
4203     uint8x16_t v_mask;
4204 };
4205 
4206 template <>
4207 struct Cmp_SIMD<ushort>
4208 {
4209     explicit Cmp_SIMD(int code_) :
4210         code(code_)
4211     {
4212         CV_Assert(code == CMP_GT || code == CMP_LE ||
4213                   code == CMP_EQ || code == CMP_NE);
4214 
4215         v_mask = vdup_n_u8(255);
4216     }
4217 
4218     int operator () (const ushort * src1, const ushort * src2, uchar * dst, int width) const
4219     {
4220         int x = 0;
4221 
4222         if (code == CMP_GT)
4223             for ( ; x <= width - 8; x += 8)
4224             {
4225                 uint16x8_t v_dst = vcgtq_u16(vld1q_u16(src1 + x), vld1q_u16(src2 + x));
4226                 vst1_u8(dst + x, vmovn_u16(v_dst));
4227             }
4228         else if (code == CMP_LE)
4229             for ( ; x <= width - 8; x += 8)
4230             {
4231                 uint16x8_t v_dst = vcleq_u16(vld1q_u16(src1 + x), vld1q_u16(src2 + x));
4232                 vst1_u8(dst + x, vmovn_u16(v_dst));
4233             }
4234         else if (code == CMP_EQ)
4235             for ( ; x <= width - 8; x += 8)
4236             {
4237                 uint16x8_t v_dst = vceqq_u16(vld1q_u16(src1 + x), vld1q_u16(src2 + x));
4238                 vst1_u8(dst + x, vmovn_u16(v_dst));
4239             }
4240         else if (code == CMP_NE)
4241             for ( ; x <= width - 8; x += 8)
4242             {
4243                 uint16x8_t v_dst = vceqq_u16(vld1q_u16(src1 + x), vld1q_u16(src2 + x));
4244                 vst1_u8(dst + x, veor_u8(vmovn_u16(v_dst), v_mask));
4245             }
4246 
4247         return x;
4248     }
4249 
4250     int code;
4251     uint8x8_t v_mask;
4252 };
4253 
4254 template <>
4255 struct Cmp_SIMD<int>
4256 {
4257     explicit Cmp_SIMD(int code_) :
4258         code(code_)
4259     {
4260         CV_Assert(code == CMP_GT || code == CMP_LE ||
4261                   code == CMP_EQ || code == CMP_NE);
4262 
4263         v_mask = vdup_n_u8(255);
4264     }
4265 
4266     int operator () (const int * src1, const int * src2, uchar * dst, int width) const
4267     {
4268         int x = 0;
4269 
4270         if (code == CMP_GT)
4271             for ( ; x <= width - 8; x += 8)
4272             {
4273                 uint32x4_t v_dst1 = vcgtq_s32(vld1q_s32(src1 + x), vld1q_s32(src2 + x));
4274                 uint32x4_t v_dst2 = vcgtq_s32(vld1q_s32(src1 + x + 4), vld1q_s32(src2 + x + 4));
4275                 vst1_u8(dst + x, vmovn_u16(vcombine_u16(vmovn_u32(v_dst1), vmovn_u32(v_dst2))));
4276             }
4277         else if (code == CMP_LE)
4278             for ( ; x <= width - 8; x += 8)
4279             {
4280                 uint32x4_t v_dst1 = vcleq_s32(vld1q_s32(src1 + x), vld1q_s32(src2 + x));
4281                 uint32x4_t v_dst2 = vcleq_s32(vld1q_s32(src1 + x + 4), vld1q_s32(src2 + x + 4));
4282                 vst1_u8(dst + x, vmovn_u16(vcombine_u16(vmovn_u32(v_dst1), vmovn_u32(v_dst2))));
4283             }
4284         else if (code == CMP_EQ)
4285             for ( ; x <= width - 8; x += 8)
4286             {
4287                 uint32x4_t v_dst1 = vceqq_s32(vld1q_s32(src1 + x), vld1q_s32(src2 + x));
4288                 uint32x4_t v_dst2 = vceqq_s32(vld1q_s32(src1 + x + 4), vld1q_s32(src2 + x + 4));
4289                 vst1_u8(dst + x, vmovn_u16(vcombine_u16(vmovn_u32(v_dst1), vmovn_u32(v_dst2))));
4290             }
4291         else if (code == CMP_NE)
4292             for ( ; x <= width - 8; x += 8)
4293             {
4294                 uint32x4_t v_dst1 = vceqq_s32(vld1q_s32(src1 + x), vld1q_s32(src2 + x));
4295                 uint32x4_t v_dst2 = vceqq_s32(vld1q_s32(src1 + x + 4), vld1q_s32(src2 + x + 4));
4296                 uint8x8_t v_dst = vmovn_u16(vcombine_u16(vmovn_u32(v_dst1), vmovn_u32(v_dst2)));
4297                 vst1_u8(dst + x, veor_u8(v_dst, v_mask));
4298             }
4299 
4300         return x;
4301     }
4302 
4303     int code;
4304     uint8x8_t v_mask;
4305 };
4306 
4307 template <>
4308 struct Cmp_SIMD<float>
4309 {
4310     explicit Cmp_SIMD(int code_) :
4311         code(code_)
4312     {
4313         CV_Assert(code == CMP_GT || code == CMP_LE ||
4314                   code == CMP_EQ || code == CMP_NE);
4315 
4316         v_mask = vdup_n_u8(255);
4317     }
4318 
4319     int operator () (const float * src1, const float * src2, uchar * dst, int width) const
4320     {
4321         int x = 0;
4322 
4323         if (code == CMP_GT)
4324             for ( ; x <= width - 8; x += 8)
4325             {
4326                 uint32x4_t v_dst1 = vcgtq_f32(vld1q_f32(src1 + x), vld1q_f32(src2 + x));
4327                 uint32x4_t v_dst2 = vcgtq_f32(vld1q_f32(src1 + x + 4), vld1q_f32(src2 + x + 4));
4328                 vst1_u8(dst + x, vmovn_u16(vcombine_u16(vmovn_u32(v_dst1), vmovn_u32(v_dst2))));
4329             }
4330         else if (code == CMP_LE)
4331             for ( ; x <= width - 8; x += 8)
4332             {
4333                 uint32x4_t v_dst1 = vcleq_f32(vld1q_f32(src1 + x), vld1q_f32(src2 + x));
4334                 uint32x4_t v_dst2 = vcleq_f32(vld1q_f32(src1 + x + 4), vld1q_f32(src2 + x + 4));
4335                 vst1_u8(dst + x, vmovn_u16(vcombine_u16(vmovn_u32(v_dst1), vmovn_u32(v_dst2))));
4336             }
4337         else if (code == CMP_EQ)
4338             for ( ; x <= width - 8; x += 8)
4339             {
4340                 uint32x4_t v_dst1 = vceqq_f32(vld1q_f32(src1 + x), vld1q_f32(src2 + x));
4341                 uint32x4_t v_dst2 = vceqq_f32(vld1q_f32(src1 + x + 4), vld1q_f32(src2 + x + 4));
4342                 vst1_u8(dst + x, vmovn_u16(vcombine_u16(vmovn_u32(v_dst1), vmovn_u32(v_dst2))));
4343             }
4344         else if (code == CMP_NE)
4345             for ( ; x <= width - 8; x += 8)
4346             {
4347                 uint32x4_t v_dst1 = vceqq_f32(vld1q_f32(src1 + x), vld1q_f32(src2 + x));
4348                 uint32x4_t v_dst2 = vceqq_f32(vld1q_f32(src1 + x + 4), vld1q_f32(src2 + x + 4));
4349                 uint8x8_t v_dst = vmovn_u16(vcombine_u16(vmovn_u32(v_dst1), vmovn_u32(v_dst2)));
4350                 vst1_u8(dst + x, veor_u8(v_dst, v_mask));
4351             }
4352 
4353         return x;
4354     }
4355 
4356     int code;
4357     uint8x8_t v_mask;
4358 };
4359 
4360 #elif CV_SSE2
4361 
4362 template <>
4363 struct Cmp_SIMD<schar>
4364 {
4365     explicit Cmp_SIMD(int code_) :
4366         code(code_)
4367     {
4368         CV_Assert(code == CMP_GT || code == CMP_LE ||
4369                   code == CMP_EQ || code == CMP_NE);
4370 
4371         haveSSE = checkHardwareSupport(CV_CPU_SSE2);
4372 
4373         v_mask = _mm_set1_epi8(-1);
4374     }
4375 
4376     int operator () (const schar * src1, const schar * src2, uchar * dst, int width) const
4377     {
4378         int x = 0;
4379 
4380         if (!haveSSE)
4381             return x;
4382 
4383         if (code == CMP_GT)
4384             for ( ; x <= width - 16; x += 16)
4385                 _mm_storeu_si128((__m128i *)(dst + x), _mm_cmpgt_epi8(_mm_loadu_si128((const __m128i *)(src1 + x)),
4386                                                                       _mm_loadu_si128((const __m128i *)(src2 + x))));
4387         else if (code == CMP_LE)
4388             for ( ; x <= width - 16; x += 16)
4389             {
4390                 __m128i v_gt = _mm_cmpgt_epi8(_mm_loadu_si128((const __m128i *)(src1 + x)),
4391                                               _mm_loadu_si128((const __m128i *)(src2 + x)));
4392                 _mm_storeu_si128((__m128i *)(dst + x), _mm_xor_si128(v_mask, v_gt));
4393             }
4394         else if (code == CMP_EQ)
4395             for ( ; x <= width - 16; x += 16)
4396                 _mm_storeu_si128((__m128i *)(dst + x), _mm_cmpeq_epi8(_mm_loadu_si128((const __m128i *)(src1 + x)),
4397                                                                       _mm_loadu_si128((const __m128i *)(src2 + x))));
4398         else if (code == CMP_NE)
4399             for ( ; x <= width - 16; x += 16)
4400             {
4401                 __m128i v_eq = _mm_cmpeq_epi8(_mm_loadu_si128((const __m128i *)(src1 + x)),
4402                                               _mm_loadu_si128((const __m128i *)(src2 + x)));
4403                 _mm_storeu_si128((__m128i *)(dst + x), _mm_xor_si128(v_mask, v_eq));
4404             }
4405 
4406         return x;
4407     }
4408 
4409     int code;
4410     __m128i v_mask;
4411     bool haveSSE;
4412 };
4413 
4414 template <>
4415 struct Cmp_SIMD<int>
4416 {
4417     explicit Cmp_SIMD(int code_) :
4418         code(code_)
4419     {
4420         CV_Assert(code == CMP_GT || code == CMP_LE ||
4421                   code == CMP_EQ || code == CMP_NE);
4422 
4423         haveSSE = checkHardwareSupport(CV_CPU_SSE2);
4424 
4425         v_mask = _mm_set1_epi32(0xffffffff);
4426     }
4427 
4428     int operator () (const int * src1, const int * src2, uchar * dst, int width) const
4429     {
4430         int x = 0;
4431 
4432         if (!haveSSE)
4433             return x;
4434 
4435         if (code == CMP_GT)
4436             for ( ; x <= width - 8; x += 8)
4437             {
4438                 __m128i v_dst0 = _mm_cmpgt_epi32(_mm_loadu_si128((const __m128i *)(src1 + x)),
4439                                                  _mm_loadu_si128((const __m128i *)(src2 + x)));
4440                 __m128i v_dst1 = _mm_cmpgt_epi32(_mm_loadu_si128((const __m128i *)(src1 + x + 4)),
4441                                                  _mm_loadu_si128((const __m128i *)(src2 + x + 4)));
4442 
4443                 _mm_storel_epi64((__m128i *)(dst + x), _mm_packs_epi16(_mm_packs_epi32(v_dst0, v_dst1), v_mask));
4444             }
4445         else if (code == CMP_LE)
4446             for ( ; x <= width - 8; x += 8)
4447             {
4448                 __m128i v_dst0 = _mm_cmpgt_epi32(_mm_loadu_si128((const __m128i *)(src1 + x)),
4449                                                  _mm_loadu_si128((const __m128i *)(src2 + x)));
4450                 __m128i v_dst1 = _mm_cmpgt_epi32(_mm_loadu_si128((const __m128i *)(src1 + x + 4)),
4451                                                  _mm_loadu_si128((const __m128i *)(src2 + x + 4)));
4452 
4453                 _mm_storel_epi64((__m128i *)(dst + x), _mm_xor_si128(_mm_packs_epi16(_mm_packs_epi32(v_dst0, v_dst1), v_mask), v_mask));
4454             }
4455         else if (code == CMP_EQ)
4456             for ( ; x <= width - 8; x += 8)
4457             {
4458                 __m128i v_dst0 = _mm_cmpeq_epi32(_mm_loadu_si128((const __m128i *)(src1 + x)),
4459                                                  _mm_loadu_si128((const __m128i *)(src2 + x)));
4460                 __m128i v_dst1 = _mm_cmpeq_epi32(_mm_loadu_si128((const __m128i *)(src1 + x + 4)),
4461                                                  _mm_loadu_si128((const __m128i *)(src2 + x + 4)));
4462 
4463                 _mm_storel_epi64((__m128i *)(dst + x), _mm_packs_epi16(_mm_packs_epi32(v_dst0, v_dst1), v_mask));
4464             }
4465         else if (code == CMP_NE)
4466             for ( ; x <= width - 8; x += 8)
4467             {
4468                 __m128i v_dst0 = _mm_cmpeq_epi32(_mm_loadu_si128((const __m128i *)(src1 + x)),
4469                                                  _mm_loadu_si128((const __m128i *)(src2 + x)));
4470                 __m128i v_dst1 = _mm_cmpeq_epi32(_mm_loadu_si128((const __m128i *)(src1 + x + 4)),
4471                                                  _mm_loadu_si128((const __m128i *)(src2 + x + 4)));
4472 
4473                 _mm_storel_epi64((__m128i *)(dst + x), _mm_xor_si128(v_mask, _mm_packs_epi16(_mm_packs_epi32(v_dst0, v_dst1), v_mask)));
4474             }
4475 
4476         return x;
4477     }
4478 
4479     int code;
4480     __m128i v_mask;
4481     bool haveSSE;
4482 };
4483 
4484 #endif
4485 
4486 template<typename T> static void
4487 cmp_(const T* src1, size_t step1, const T* src2, size_t step2,
4488      uchar* dst, size_t step, Size size, int code)
4489 {
4490     step1 /= sizeof(src1[0]);
4491     step2 /= sizeof(src2[0]);
4492     if( code == CMP_GE || code == CMP_LT )
4493     {
4494         std::swap(src1, src2);
4495         std::swap(step1, step2);
4496         code = code == CMP_GE ? CMP_LE : CMP_GT;
4497     }
4498 
4499     Cmp_SIMD<T> vop(code);
4500 
4501     if( code == CMP_GT || code == CMP_LE )
4502     {
4503         int m = code == CMP_GT ? 0 : 255;
4504         for( ; size.height--; src1 += step1, src2 += step2, dst += step )
4505         {
4506             int x = vop(src1, src2, dst, size.width);
4507             #if CV_ENABLE_UNROLLED
4508             for( ; x <= size.width - 4; x += 4 )
4509             {
4510                 int t0, t1;
4511                 t0 = -(src1[x] > src2[x]) ^ m;
4512                 t1 = -(src1[x+1] > src2[x+1]) ^ m;
4513                 dst[x] = (uchar)t0; dst[x+1] = (uchar)t1;
4514                 t0 = -(src1[x+2] > src2[x+2]) ^ m;
4515                 t1 = -(src1[x+3] > src2[x+3]) ^ m;
4516                 dst[x+2] = (uchar)t0; dst[x+3] = (uchar)t1;
4517             }
4518             #endif
4519             for( ; x < size.width; x++ )
4520                 dst[x] = (uchar)(-(src1[x] > src2[x]) ^ m);
4521         }
4522     }
4523     else if( code == CMP_EQ || code == CMP_NE )
4524     {
4525         int m = code == CMP_EQ ? 0 : 255;
4526         for( ; size.height--; src1 += step1, src2 += step2, dst += step )
4527         {
4528             int x = 0;
4529             #if CV_ENABLE_UNROLLED
4530             for( ; x <= size.width - 4; x += 4 )
4531             {
4532                 int t0, t1;
4533                 t0 = -(src1[x] == src2[x]) ^ m;
4534                 t1 = -(src1[x+1] == src2[x+1]) ^ m;
4535                 dst[x] = (uchar)t0; dst[x+1] = (uchar)t1;
4536                 t0 = -(src1[x+2] == src2[x+2]) ^ m;
4537                 t1 = -(src1[x+3] == src2[x+3]) ^ m;
4538                 dst[x+2] = (uchar)t0; dst[x+3] = (uchar)t1;
4539             }
4540             #endif
4541             for( ; x < size.width; x++ )
4542                 dst[x] = (uchar)(-(src1[x] == src2[x]) ^ m);
4543         }
4544     }
4545 }
4546 
4547 #if ARITHM_USE_IPP
4548 inline static IppCmpOp convert_cmp(int _cmpop)
4549 {
4550     return _cmpop == CMP_EQ ? ippCmpEq :
4551         _cmpop == CMP_GT ? ippCmpGreater :
4552         _cmpop == CMP_GE ? ippCmpGreaterEq :
4553         _cmpop == CMP_LT ? ippCmpLess :
4554         _cmpop == CMP_LE ? ippCmpLessEq :
4555         (IppCmpOp)-1;
4556 }
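// Note (added commentary, not in the original source): CMP_NE has no IPP counterpart
// here, so convert_cmp() returns (IppCmpOp)-1 and the callers fall back to the
// generic/vectorized C++ path for the "not equal" comparison.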
4557 #endif
4558 
4559 static void cmp8u(const uchar* src1, size_t step1, const uchar* src2, size_t step2,
4560                   uchar* dst, size_t step, Size size, void* _cmpop)
4561 {
4562 #if ARITHM_USE_IPP
4563     CV_IPP_CHECK()
4564     {
4565         IppCmpOp op = convert_cmp(*(int *)_cmpop);
4566         if( op  >= 0 )
4567         {
4568             fixSteps(size, sizeof(dst[0]), step1, step2, step);
4569             if (0 <= ippiCompare_8u_C1R(src1, (int)step1, src2, (int)step2, dst, (int)step, ippiSize(size), op))
4570             {
4571                 CV_IMPL_ADD(CV_IMPL_IPP);
4572                 return;
4573             }
4574             setIppErrorStatus();
4575         }
4576     }
4577 #endif
4578   // cmp_(src1, step1, src2, step2, dst, step, size, *(int*)_cmpop);  // replaced by the hand-vectorized code below (the "vz optimized" version)
4579     int code = *(int*)_cmpop;
4580     step1 /= sizeof(src1[0]);
4581     step2 /= sizeof(src2[0]);
4582     if( code == CMP_GE || code == CMP_LT )
4583     {
4584         std::swap(src1, src2);
4585         std::swap(step1, step2);
4586         code = code == CMP_GE ? CMP_LE : CMP_GT;
4587     }
4588 
4589     if( code == CMP_GT || code == CMP_LE )
4590     {
4591         int m = code == CMP_GT ? 0 : 255;
4592         for( ; size.height--; src1 += step1, src2 += step2, dst += step )
4593         {
4594             int x = 0;
4595             #if CV_SSE2
4596             if( USE_SSE2 )
4597             {
4598                 __m128i m128 = code == CMP_GT ? _mm_setzero_si128() : _mm_set1_epi8 (-1);
4599                 __m128i c128 = _mm_set1_epi8 (-128);
4600                 for( ; x <= size.width - 16; x += 16 )
4601                 {
4602                     __m128i r00 = _mm_loadu_si128((const __m128i*)(src1 + x));
4603                     __m128i r10 = _mm_loadu_si128((const __m128i*)(src2 + x));
4604                     // SSE2 has no unsigned 8-bit compare, so bias both inputs by 128 and use the signed compare (the "trick")
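                    // Worked example of the bias trick (added commentary, not in the
                    // original source): for unsigned bytes a = 200, b = 100, comparing
                    // (a - 128) > (b - 128) as signed bytes gives 72 > -28, which matches
                    // the unsigned result 200 > 100.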
4605                     r00 = _mm_sub_epi8(r00,c128);
4606                     r10 = _mm_sub_epi8(r10,c128);
4607 
4608                     r00 =_mm_xor_si128(_mm_cmpgt_epi8(r00, r10), m128);
4609                     _mm_storeu_si128((__m128i*)(dst + x),r00);
4610 
4611                 }
4612             }
4613             #elif CV_NEON
4614             uint8x16_t mask = code == CMP_GT ? vdupq_n_u8(0) : vdupq_n_u8(255);
4615 
4616             for( ; x <= size.width - 16; x += 16 )
4617             {
4618                 vst1q_u8(dst+x, veorq_u8(vcgtq_u8(vld1q_u8(src1+x), vld1q_u8(src2+x)), mask));
4619             }
4620 
4621             #endif
4622 
4623             for( ; x < size.width; x++ ){
4624                 dst[x] = (uchar)(-(src1[x] > src2[x]) ^ m);
4625             }
4626         }
4627     }
4628     else if( code == CMP_EQ || code == CMP_NE )
4629     {
4630         int m = code == CMP_EQ ? 0 : 255;
4631         for( ; size.height--; src1 += step1, src2 += step2, dst += step )
4632         {
4633             int x = 0;
4634             #if CV_SSE2
4635             if( USE_SSE2 )
4636             {
4637                 __m128i m128 =  code == CMP_EQ ? _mm_setzero_si128() : _mm_set1_epi8 (-1);
4638                 for( ; x <= size.width - 16; x += 16 )
4639                 {
4640                     __m128i r00 = _mm_loadu_si128((const __m128i*)(src1 + x));
4641                     __m128i r10 = _mm_loadu_si128((const __m128i*)(src2 + x));
4642                     r00 = _mm_xor_si128 ( _mm_cmpeq_epi8 (r00, r10), m128);
4643                     _mm_storeu_si128((__m128i*)(dst + x), r00);
4644                 }
4645             }
4646             #elif CV_NEON
4647             uint8x16_t mask = code == CMP_EQ ? vdupq_n_u8(0) : vdupq_n_u8(255);
4648 
4649             for( ; x <= size.width - 16; x += 16 )
4650             {
4651                 vst1q_u8(dst+x, veorq_u8(vceqq_u8(vld1q_u8(src1+x), vld1q_u8(src2+x)), mask));
4652             }
4653             #endif
4654            for( ; x < size.width; x++ )
4655                 dst[x] = (uchar)(-(src1[x] == src2[x]) ^ m);
4656         }
4657     }
4658 }
4659 
4660 static void cmp8s(const schar* src1, size_t step1, const schar* src2, size_t step2,
4661                   uchar* dst, size_t step, Size size, void* _cmpop)
4662 {
4663     cmp_(src1, step1, src2, step2, dst, step, size, *(int*)_cmpop);
4664 }
4665 
4666 static void cmp16u(const ushort* src1, size_t step1, const ushort* src2, size_t step2,
4667                   uchar* dst, size_t step, Size size, void* _cmpop)
4668 {
4669 #if ARITHM_USE_IPP
4670     CV_IPP_CHECK()
4671     {
4672         IppCmpOp op = convert_cmp(*(int *)_cmpop);
4673         if( op  >= 0 )
4674         {
4675             fixSteps(size, sizeof(dst[0]), step1, step2, step);
4676             if (0 <= ippiCompare_16u_C1R(src1, (int)step1, src2, (int)step2, dst, (int)step, ippiSize(size), op))
4677             {
4678                 CV_IMPL_ADD(CV_IMPL_IPP);
4679                 return;
4680             }
4681             setIppErrorStatus();
4682         }
4683     }
4684 #endif
4685     cmp_(src1, step1, src2, step2, dst, step, size, *(int*)_cmpop);
4686 }
4687 
4688 static void cmp16s(const short* src1, size_t step1, const short* src2, size_t step2,
4689                   uchar* dst, size_t step, Size size, void* _cmpop)
4690 {
4691 #if ARITHM_USE_IPP
4692     CV_IPP_CHECK()
4693     {
4694         IppCmpOp op = convert_cmp(*(int *)_cmpop);
4695         if( op  > 0 )
4696         {
4697             fixSteps(size, sizeof(dst[0]), step1, step2, step);
4698             if (0 <= ippiCompare_16s_C1R(src1, (int)step1, src2, (int)step2, dst, (int)step, ippiSize(size), op))
4699             {
4700                 CV_IMPL_ADD(CV_IMPL_IPP);
4701                 return;
4702             }
4703             setIppErrorStatus();
4704         }
4705     }
4706 #endif
4707    // cmp_(src1, step1, src2, step2, dst, step, size, *(int*)_cmpop);  // replaced by the hand-vectorized code below (the "vz optimized" version)
4708 
4709     int code = *(int*)_cmpop;
4710     step1 /= sizeof(src1[0]);
4711     step2 /= sizeof(src2[0]);
4712     if( code == CMP_GE || code == CMP_LT )
4713     {
4714         std::swap(src1, src2);
4715         std::swap(step1, step2);
4716         code = code == CMP_GE ? CMP_LE : CMP_GT;
4717     }
4718 
4719     if( code == CMP_GT || code == CMP_LE )
4720     {
4721         int m = code == CMP_GT ? 0 : 255;
4722         for( ; size.height--; src1 += step1, src2 += step2, dst += step )
4723         {
4724             int x = 0;
4725             #if CV_SSE2
4726             if( USE_SSE2)
4727             {
4728                 __m128i m128 =  code == CMP_GT ? _mm_setzero_si128() : _mm_set1_epi16 (-1);
4729                 for( ; x <= size.width - 16; x += 16 )
4730                 {
4731                     __m128i r00 = _mm_loadu_si128((const __m128i*)(src1 + x));
4732                     __m128i r10 = _mm_loadu_si128((const __m128i*)(src2 + x));
4733                     r00 = _mm_xor_si128 ( _mm_cmpgt_epi16 (r00, r10), m128);
4734                     __m128i r01 = _mm_loadu_si128((const __m128i*)(src1 + x + 8));
4735                     __m128i r11 = _mm_loadu_si128((const __m128i*)(src2 + x + 8));
4736                     r01 = _mm_xor_si128 ( _mm_cmpgt_epi16 (r01, r11), m128);
4737                     r11 = _mm_packs_epi16(r00, r01);
4738                     _mm_storeu_si128((__m128i*)(dst + x), r11);
4739                 }
4740                 if( x <= size.width-8)
4741                 {
4742                     __m128i r00 = _mm_loadu_si128((const __m128i*)(src1 + x));
4743                     __m128i r10 = _mm_loadu_si128((const __m128i*)(src2 + x));
4744                     r00 = _mm_xor_si128 ( _mm_cmpgt_epi16 (r00, r10), m128);
4745                     r10 = _mm_packs_epi16(r00, r00);
4746                     _mm_storel_epi64((__m128i*)(dst + x), r10);
4747 
4748                     x += 8;
4749                 }
4750             }
4751             #elif CV_NEON
4752             uint8x16_t mask = code == CMP_GT ? vdupq_n_u8(0) : vdupq_n_u8(255);
4753 
4754             for( ; x <= size.width - 16; x += 16 )
4755             {
4756                 int16x8_t in1 = vld1q_s16(src1 + x);
4757                 int16x8_t in2 = vld1q_s16(src2 + x);
4758                 uint8x8_t t1 = vmovn_u16(vcgtq_s16(in1, in2));
4759 
4760                 in1 = vld1q_s16(src1 + x + 8);
4761                 in2 = vld1q_s16(src2 + x + 8);
4762                 uint8x8_t t2 = vmovn_u16(vcgtq_s16(in1, in2));
4763 
4764                 vst1q_u8(dst+x, veorq_u8(vcombine_u8(t1, t2), mask));
4765             }
4766             #endif
4767 
4768             for( ; x < size.width; x++ ){
4769                  dst[x] = (uchar)(-(src1[x] > src2[x]) ^ m);
4770             }
4771         }
4772     }
4773     else if( code == CMP_EQ || code == CMP_NE )
4774     {
4775         int m = code == CMP_EQ ? 0 : 255;
4776         for( ; size.height--; src1 += step1, src2 += step2, dst += step )
4777         {
4778             int x = 0;
4779             #if CV_SSE2
4780             if( USE_SSE2 )
4781             {
4782                 __m128i m128 =  code == CMP_EQ ? _mm_setzero_si128() : _mm_set1_epi16 (-1);
4783                 for( ; x <= size.width - 16; x += 16 )
4784                 {
4785                     __m128i r00 = _mm_loadu_si128((const __m128i*)(src1 + x));
4786                     __m128i r10 = _mm_loadu_si128((const __m128i*)(src2 + x));
4787                     r00 = _mm_xor_si128 ( _mm_cmpeq_epi16 (r00, r10), m128);
4788                     __m128i r01 = _mm_loadu_si128((const __m128i*)(src1 + x + 8));
4789                     __m128i r11 = _mm_loadu_si128((const __m128i*)(src2 + x + 8));
4790                     r01 = _mm_xor_si128 ( _mm_cmpeq_epi16 (r01, r11), m128);
4791                     r11 = _mm_packs_epi16(r00, r01);
4792                     _mm_storeu_si128((__m128i*)(dst + x), r11);
4793                 }
4794                 if( x <= size.width - 8)
4795                 {
4796                     __m128i r00 = _mm_loadu_si128((const __m128i*)(src1 + x));
4797                     __m128i r10 = _mm_loadu_si128((const __m128i*)(src2 + x));
4798                     r00 = _mm_xor_si128 ( _mm_cmpeq_epi16 (r00, r10), m128);
4799                     r10 = _mm_packs_epi16(r00, r00);
4800                     _mm_storel_epi64((__m128i*)(dst + x), r10);
4801 
4802                     x += 8;
4803                 }
4804             }
4805             #elif CV_NEON
4806             uint8x16_t mask = code == CMP_EQ ? vdupq_n_u8(0) : vdupq_n_u8(255);
4807 
4808             for( ; x <= size.width - 16; x += 16 )
4809             {
4810                 int16x8_t in1 = vld1q_s16(src1 + x);
4811                 int16x8_t in2 = vld1q_s16(src2 + x);
4812                 uint8x8_t t1 = vmovn_u16(vceqq_s16(in1, in2));
4813 
4814                 in1 = vld1q_s16(src1 + x + 8);
4815                 in2 = vld1q_s16(src2 + x + 8);
4816                 uint8x8_t t2 = vmovn_u16(vceqq_s16(in1, in2));
4817 
4818                 vst1q_u8(dst+x, veorq_u8(vcombine_u8(t1, t2), mask));
4819             }
4820             #endif
4821             for( ; x < size.width; x++ )
4822                 dst[x] = (uchar)(-(src1[x] == src2[x]) ^ m);
4823         }
4824     }
4825 }
4826 
4827 static void cmp32s(const int* src1, size_t step1, const int* src2, size_t step2,
4828                    uchar* dst, size_t step, Size size, void* _cmpop)
4829 {
4830     cmp_(src1, step1, src2, step2, dst, step, size, *(int*)_cmpop);
4831 }
4832 
4833 static void cmp32f(const float* src1, size_t step1, const float* src2, size_t step2,
4834                   uchar* dst, size_t step, Size size, void* _cmpop)
4835 {
4836 #if ARITHM_USE_IPP
4837     CV_IPP_CHECK()
4838     {
4839         IppCmpOp op = convert_cmp(*(int *)_cmpop);
4840         if( op  >= 0 )
4841         {
4842             fixSteps(size, sizeof(dst[0]), step1, step2, step);
4843             if (0 <= ippiCompare_32f_C1R(src1, (int)step1, src2, (int)step2, dst, (int)step, ippiSize(size), op))
4844             {
4845                 CV_IMPL_ADD(CV_IMPL_IPP);
4846                 return;
4847             }
4848             setIppErrorStatus();
4849         }
4850     }
4851 #endif
4852     cmp_(src1, step1, src2, step2, dst, step, size, *(int*)_cmpop);
4853 }
4854 
4855 static void cmp64f(const double* src1, size_t step1, const double* src2, size_t step2,
4856                   uchar* dst, size_t step, Size size, void* _cmpop)
4857 {
4858     cmp_(src1, step1, src2, step2, dst, step, size, *(int*)_cmpop);
4859 }
4860 
4861 static BinaryFunc getCmpFunc(int depth)
4862 {
4863     static BinaryFunc cmpTab[] =
4864     {
4865         (BinaryFunc)GET_OPTIMIZED(cmp8u), (BinaryFunc)GET_OPTIMIZED(cmp8s),
4866         (BinaryFunc)GET_OPTIMIZED(cmp16u), (BinaryFunc)GET_OPTIMIZED(cmp16s),
4867         (BinaryFunc)GET_OPTIMIZED(cmp32s),
4868         (BinaryFunc)GET_OPTIMIZED(cmp32f), (BinaryFunc)cmp64f,
4869         0
4870     };
4871 
4872     return cmpTab[depth];
4873 }
4874 
4875 static double getMinVal(int depth)
4876 {
4877     static const double tab[] = {0, -128, 0, -32768, INT_MIN, -FLT_MAX, -DBL_MAX, 0};
4878     return tab[depth];
4879 }
4880 
4881 static double getMaxVal(int depth)
4882 {
4883     static const double tab[] = {255, 127, 65535, 32767, INT_MAX, FLT_MAX, DBL_MAX, 0};
4884     return tab[depth];
4885 }
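// Note (added commentary, not in the original source): getMinVal()/getMaxVal() are
// indexed by depth (CV_8U .. CV_64F) and let the scalar-comparison paths below
// short-circuit: when the scalar lies outside the representable range of src1's depth,
// the whole result mask is set to all-0 or all-255 without running the pixel loop.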
4886 
4887 #ifdef HAVE_OPENCL
4888 
4889 static bool ocl_compare(InputArray _src1, InputArray _src2, OutputArray _dst, int op, bool haveScalar)
4890 {
4891     const ocl::Device& dev = ocl::Device::getDefault();
4892     bool doubleSupport = dev.doubleFPConfig() > 0;
4893     int type1 = _src1.type(), depth1 = CV_MAT_DEPTH(type1), cn = CV_MAT_CN(type1),
4894             type2 = _src2.type(), depth2 = CV_MAT_DEPTH(type2);
4895 
4896     if (!doubleSupport && depth1 == CV_64F)
4897         return false;
4898 
4899     if (!haveScalar && (!_src1.sameSize(_src2) || type1 != type2))
4900             return false;
4901 
4902     int kercn = haveScalar ? cn : ocl::predictOptimalVectorWidth(_src1, _src2, _dst), rowsPerWI = dev.isIntel() ? 4 : 1;
4903     // Workaround for bug with "?:" operator in AMD OpenCL compiler
4904     if (depth1 >= CV_16U)
4905         kercn = 1;
4906 
4907     int scalarcn = kercn == 3 ? 4 : kercn;
4908     const char * const operationMap[] = { "==", ">", ">=", "<", "<=", "!=" };
4909     char cvt[40];
4910 
4911     String opts = format("-D %s -D srcT1=%s -D dstT=%s -D workT=srcT1 -D cn=%d"
4912                          " -D convertToDT=%s -D OP_CMP -D CMP_OPERATOR=%s -D srcT1_C1=%s"
4913                          " -D srcT2_C1=%s -D dstT_C1=%s -D workST=%s -D rowsPerWI=%d%s",
4914                          haveScalar ? "UNARY_OP" : "BINARY_OP",
4915                          ocl::typeToStr(CV_MAKE_TYPE(depth1, kercn)),
4916                          ocl::typeToStr(CV_8UC(kercn)), kercn,
4917                          ocl::convertTypeStr(depth1, CV_8U, kercn, cvt),
4918                          operationMap[op], ocl::typeToStr(depth1),
4919                          ocl::typeToStr(depth1), ocl::typeToStr(CV_8U),
4920                          ocl::typeToStr(CV_MAKE_TYPE(depth1, scalarcn)), rowsPerWI,
4921                          doubleSupport ? " -D DOUBLE_SUPPORT" : "");
4922 
4923     ocl::Kernel k("KF", ocl::core::arithm_oclsrc, opts);
4924     if (k.empty())
4925         return false;
4926 
4927     UMat src1 = _src1.getUMat();
4928     Size size = src1.size();
4929     _dst.create(size, CV_8UC(cn));
4930     UMat dst = _dst.getUMat();
4931 
4932     if (haveScalar)
4933     {
4934         size_t esz = CV_ELEM_SIZE1(type1) * scalarcn;
4935         double buf[4] = { 0, 0, 0, 0 };
4936         Mat src2 = _src2.getMat();
4937 
4938         if( depth1 > CV_32S )
4939             convertAndUnrollScalar( src2, depth1, (uchar *)buf, kercn );
4940         else
4941         {
4942             double fval = 0;
4943             getConvertFunc(depth2, CV_64F)(src2.ptr(), 1, 0, 1, (uchar *)&fval, 1, Size(1, 1), 0);
4944             if( fval < getMinVal(depth1) )
4945                 return dst.setTo(Scalar::all(op == CMP_GT || op == CMP_GE || op == CMP_NE ? 255 : 0)), true;
4946 
4947             if( fval > getMaxVal(depth1) )
4948                 return dst.setTo(Scalar::all(op == CMP_LT || op == CMP_LE || op == CMP_NE ? 255 : 0)), true;
4949 
4950             int ival = cvRound(fval);
4951             if( fval != ival )
4952             {
4953                 if( op == CMP_LT || op == CMP_GE )
4954                     ival = cvCeil(fval);
4955                 else if( op == CMP_LE || op == CMP_GT )
4956                     ival = cvFloor(fval);
4957                 else
4958                     return dst.setTo(Scalar::all(op == CMP_NE ? 255 : 0)), true;
4959             }
4960             convertAndUnrollScalar(Mat(1, 1, CV_32S, &ival), depth1, (uchar *)buf, kercn);
4961         }
4962 
4963         ocl::KernelArg scalararg = ocl::KernelArg(0, 0, 0, 0, buf, esz);
4964 
4965         k.args(ocl::KernelArg::ReadOnlyNoSize(src1, cn, kercn),
4966                ocl::KernelArg::WriteOnly(dst, cn, kercn), scalararg);
4967     }
4968     else
4969     {
4970         UMat src2 = _src2.getUMat();
4971 
4972         k.args(ocl::KernelArg::ReadOnlyNoSize(src1),
4973                ocl::KernelArg::ReadOnlyNoSize(src2),
4974                ocl::KernelArg::WriteOnly(dst, cn, kercn));
4975     }
4976 
4977     size_t globalsize[2] = { dst.cols * cn / kercn, (dst.rows + rowsPerWI - 1) / rowsPerWI };
4978     return k.run(2, globalsize, NULL, false);
4979 }
4980 
4981 #endif
4982 
4983 }
4984 
4985 void cv::compare(InputArray _src1, InputArray _src2, OutputArray _dst, int op)
4986 {
4987     CV_Assert( op == CMP_LT || op == CMP_LE || op == CMP_EQ ||
4988                op == CMP_NE || op == CMP_GE || op == CMP_GT );
4989 
4990     bool haveScalar = false;
4991 
4992     if ((_src1.isMatx() + _src2.isMatx()) == 1
4993             || !_src1.sameSize(_src2)
4994             || _src1.type() != _src2.type())
4995     {
4996         if (checkScalar(_src1, _src2.type(), _src1.kind(), _src2.kind()))
4997         {
4998             op = op == CMP_LT ? CMP_GT : op == CMP_LE ? CMP_GE :
4999                 op == CMP_GE ? CMP_LE : op == CMP_GT ? CMP_LT : op;
5000             // src1 is a scalar; swap it with src2
5001             compare(_src2, _src1, _dst, op);
5002             return;
5003         }
5004         else if( !checkScalar(_src2, _src1.type(), _src2.kind(), _src1.kind()) )
5005             CV_Error( CV_StsUnmatchedSizes,
5006                      "The operation is neither 'array op array' (where arrays have the same size and the same type), "
5007                      "nor 'array op scalar', nor 'scalar op array'" );
5008         haveScalar = true;
5009     }
5010 
5011     CV_OCL_RUN(_src1.dims() <= 2 && _src2.dims() <= 2 && OCL_PERFORMANCE_CHECK(_dst.isUMat()),
5012                ocl_compare(_src1, _src2, _dst, op, haveScalar))
5013 
5014     int kind1 = _src1.kind(), kind2 = _src2.kind();
5015     Mat src1 = _src1.getMat(), src2 = _src2.getMat();
5016 
5017     if( kind1 == kind2 && src1.dims <= 2 && src2.dims <= 2 && src1.size() == src2.size() && src1.type() == src2.type() )
5018     {
5019         int cn = src1.channels();
5020         _dst.create(src1.size(), CV_8UC(cn));
5021         Mat dst = _dst.getMat();
5022         Size sz = getContinuousSize(src1, src2, dst, src1.channels());
5023         getCmpFunc(src1.depth())(src1.ptr(), src1.step, src2.ptr(), src2.step, dst.ptr(), dst.step, sz, &op);
5024         return;
5025     }
5026 
5027     int cn = src1.channels(), depth1 = src1.depth(), depth2 = src2.depth();
5028 
5029     _dst.create(src1.dims, src1.size, CV_8UC(cn));
5030     src1 = src1.reshape(1); src2 = src2.reshape(1);
5031     Mat dst = _dst.getMat().reshape(1);
5032 
5033     size_t esz = src1.elemSize();
5034     size_t blocksize0 = (size_t)(BLOCK_SIZE + esz-1)/esz;
5035     BinaryFunc func = getCmpFunc(depth1);
5036 
5037     if( !haveScalar )
5038     {
5039         const Mat* arrays[] = { &src1, &src2, &dst, 0 };
5040         uchar* ptrs[3];
5041 
5042         NAryMatIterator it(arrays, ptrs);
5043         size_t total = it.size;
5044 
5045         for( size_t i = 0; i < it.nplanes; i++, ++it )
5046             func( ptrs[0], 0, ptrs[1], 0, ptrs[2], 0, Size((int)total, 1), &op );
5047     }
5048     else
5049     {
5050         const Mat* arrays[] = { &src1, &dst, 0 };
5051         uchar* ptrs[2];
5052 
5053         NAryMatIterator it(arrays, ptrs);
5054         size_t total = it.size, blocksize = std::min(total, blocksize0);
5055 
5056         AutoBuffer<uchar> _buf(blocksize*esz);
5057         uchar *buf = _buf;
5058 
5059         if( depth1 > CV_32S )
5060             convertAndUnrollScalar( src2, depth1, buf, blocksize );
5061         else
5062         {
5063             double fval=0;
5064             getConvertFunc(depth2, CV_64F)(src2.ptr(), 1, 0, 1, (uchar*)&fval, 1, Size(1,1), 0);
5065             if( fval < getMinVal(depth1) )
5066             {
5067                 dst = Scalar::all(op == CMP_GT || op == CMP_GE || op == CMP_NE ? 255 : 0);
5068                 return;
5069             }
5070 
5071             if( fval > getMaxVal(depth1) )
5072             {
5073                 dst = Scalar::all(op == CMP_LT || op == CMP_LE || op == CMP_NE ? 255 : 0);
5074                 return;
5075             }
5076 
5077             int ival = cvRound(fval);
5078             if( fval != ival )
5079             {
5080                 if( op == CMP_LT || op == CMP_GE )
5081                     ival = cvCeil(fval);
5082                 else if( op == CMP_LE || op == CMP_GT )
5083                     ival = cvFloor(fval);
5084                 else
5085                 {
5086                     dst = Scalar::all(op == CMP_NE ? 255 : 0);
5087                     return;
5088                 }
5089             }
5090             convertAndUnrollScalar(Mat(1, 1, CV_32S, &ival), depth1, buf, blocksize);
5091         }
5092 
5093         for( size_t i = 0; i < it.nplanes; i++, ++it )
5094         {
5095             for( size_t j = 0; j < total; j += blocksize )
5096             {
5097                 int bsz = (int)MIN(total - j, blocksize);
5098                 func( ptrs[0], 0, buf, 0, ptrs[1], 0, Size(bsz, 1), &op);
5099                 ptrs[0] += bsz*esz;
5100                 ptrs[1] += bsz;
5101             }
5102         }
5103     }
5104 }
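// Usage sketch (added for illustration, not part of the original source): the result is
// an 8-bit mask with 255 where the predicate holds and 0 elsewhere.
//
//     cv::Mat img = ..., mask;                     // e.g. a CV_8UC1 image
//     cv::compare(img, 128, mask, cv::CMP_GT);     // mask(x,y) = 255 iff img(x,y) > 128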
5105 
5106 /****************************************************************************************\
5107 *                                        inRange                                         *
5108 \****************************************************************************************/
5109 
5110 namespace cv
5111 {
5112 
5113 template <typename T>
5114 struct InRange_SIMD
5115 {
5116     int operator () (const T *, const T *, const T *, uchar *, int) const
5117     {
5118         return 0;
5119     }
5120 };
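// Note (added commentary, not in the original source): as with Cmp_SIMD, each
// InRange_SIMD specialization returns how many elements it processed, so the scalar
// tail in inRange_() can finish the row; the generic fallback above returns 0.
// Through the public API this backs cv::inRange(src, lower, upper, mask), which sets
// mask(x,y) to 255 when every channel of src(x,y) lies within [lower, upper], else 0.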
5121 
5122 #if CV_SSE2
5123 
5124 template <>
5125 struct InRange_SIMD<uchar>
5126 {
5127     int operator () (const uchar * src1, const uchar * src2, const uchar * src3,
5128                      uchar * dst, int len) const
5129     {
5130         int x = 0;
5131 
5132         if (USE_SSE2)
5133         {
5134             __m128i v_full = _mm_set1_epi8(-1), v_128 = _mm_set1_epi8(-128);
5135 
5136             for ( ; x <= len - 16; x += 16 )
5137             {
5138                 __m128i v_src = _mm_add_epi8(_mm_loadu_si128((const __m128i *)(src1 + x)), v_128);
5139                 __m128i v_mask1 = _mm_cmpgt_epi8(_mm_add_epi8(_mm_loadu_si128((const __m128i *)(src2 + x)), v_128), v_src);
5140                 __m128i v_mask2 = _mm_cmpgt_epi8(v_src, _mm_add_epi8(_mm_loadu_si128((const __m128i *)(src3 + x)), v_128));
5141                 _mm_storeu_si128((__m128i *)(dst + x), _mm_andnot_si128(_mm_or_si128(v_mask1, v_mask2), v_full));
5142             }
5143         }
5144 
5145         return x;
5146     }
5147 };
5148 
5149 template <>
5150 struct InRange_SIMD<schar>
5151 {
5152     int operator () (const schar * src1, const schar * src2, const schar * src3,
5153                      uchar * dst, int len) const
5154     {
5155         int x = 0;
5156 
5157         if (USE_SSE2)
5158         {
5159             __m128i v_full = _mm_set1_epi8(-1);
5160 
5161             for ( ; x <= len - 16; x += 16 )
5162             {
5163                 __m128i v_src = _mm_loadu_si128((const __m128i *)(src1 + x));
5164                 __m128i v_mask1 = _mm_cmpgt_epi8(_mm_loadu_si128((const __m128i *)(src2 + x)), v_src);
5165                 __m128i v_mask2 = _mm_cmpgt_epi8(v_src, _mm_loadu_si128((const __m128i *)(src3 + x)));
5166                 _mm_storeu_si128((__m128i *)(dst + x), _mm_andnot_si128(_mm_or_si128(v_mask1, v_mask2), v_full));
5167             }
5168         }
5169 
5170         return x;
5171     }
5172 };
5173 
5174 template <>
5175 struct InRange_SIMD<ushort>
5176 {
5177     int operator () (const ushort * src1, const ushort * src2, const ushort * src3,
5178                      uchar * dst, int len) const
5179     {
5180         int x = 0;
5181 
5182         if (USE_SSE2)
5183         {
5184             __m128i v_zero = _mm_setzero_si128(), v_full = _mm_set1_epi16(-1), v_32768 = _mm_set1_epi16(-32768);
5185 
5186             for ( ; x <= len - 8; x += 8 )
5187             {
5188                 __m128i v_src = _mm_add_epi16(_mm_loadu_si128((const __m128i *)(src1 + x)), v_32768);
5189                 __m128i v_mask1 = _mm_cmpgt_epi16(_mm_add_epi16(_mm_loadu_si128((const __m128i *)(src2 + x)), v_32768), v_src);
5190                 __m128i v_mask2 = _mm_cmpgt_epi16(v_src, _mm_add_epi16(_mm_loadu_si128((const __m128i *)(src3 + x)), v_32768));
5191                 __m128i v_res = _mm_andnot_si128(_mm_or_si128(v_mask1, v_mask2), v_full);
5192                 _mm_storel_epi64((__m128i *)(dst + x), _mm_packus_epi16(_mm_srli_epi16(v_res, 8), v_zero));
5193             }
5194         }
5195 
5196         return x;
5197     }
5198 };
5199 
5200 template <>
5201 struct InRange_SIMD<short>
5202 {
5203     int operator () (const short * src1, const short * src2, const short * src3,
5204                      uchar * dst, int len) const
5205     {
5206         int x = 0;
5207 
5208         if (USE_SSE2)
5209         {
5210             __m128i v_zero = _mm_setzero_si128(), v_full = _mm_set1_epi16(-1);
5211 
5212             for ( ; x <= len - 8; x += 8 )
5213             {
5214                 __m128i v_src = _mm_loadu_si128((const __m128i *)(src1 + x));
5215                 __m128i v_mask1 = _mm_cmpgt_epi16(_mm_loadu_si128((const __m128i *)(src2 + x)), v_src);
5216                 __m128i v_mask2 = _mm_cmpgt_epi16(v_src, _mm_loadu_si128((const __m128i *)(src3 + x)));
5217                 __m128i v_res = _mm_andnot_si128(_mm_or_si128(v_mask1, v_mask2), v_full);
5218                 _mm_storel_epi64((__m128i *)(dst + x), _mm_packus_epi16(_mm_srli_epi16(v_res, 8), v_zero));
5219             }
5220         }
5221 
5222         return x;
5223     }
5224 };
5225 
5226 template <>
5227 struct InRange_SIMD<int>
5228 {
5229     int operator () (const int * src1, const int * src2, const int * src3,
5230                      uchar * dst, int len) const
5231     {
5232         int x = 0;
5233 
5234         if (USE_SSE2)
5235         {
5236             __m128i v_zero = _mm_setzero_si128(), v_full = _mm_set1_epi32(-1);
5237 
5238             for ( ; x <= len - 8; x += 8 )
5239             {
5240                 __m128i v_src = _mm_loadu_si128((const __m128i *)(src1 + x));
5241                 __m128i v_res1 = _mm_or_si128(_mm_cmpgt_epi32(_mm_loadu_si128((const __m128i *)(src2 + x)), v_src),
5242                     _mm_cmpgt_epi32(v_src, _mm_loadu_si128((const __m128i *)(src3 + x))));
5243 
5244                 v_src = _mm_loadu_si128((const __m128i *)(src1 + x + 4));
5245                 __m128i v_res2 = _mm_or_si128(_mm_cmpgt_epi32(_mm_loadu_si128((const __m128i *)(src2 + x + 4)), v_src),
5246                     _mm_cmpgt_epi32(v_src, _mm_loadu_si128((const __m128i *)(src3 + x + 4))));
5247 
5248                 __m128i v_res = _mm_packs_epi32(_mm_srli_epi32(_mm_andnot_si128(v_res1, v_full), 16),
5249                                                 _mm_srli_epi32(_mm_andnot_si128(v_res2, v_full), 16));
5250                 _mm_storel_epi64((__m128i *)(dst + x), _mm_packus_epi16(v_res, v_zero));
5251             }
5252         }
5253 
5254         return x;
5255     }
5256 };
5257 
5258 template <>
5259 struct InRange_SIMD<float>
5260 {
5261     int operator () (const float * src1, const float * src2, const float * src3,
5262                      uchar * dst, int len) const
5263     {
5264         int x = 0;
5265 
5266         if (USE_SSE2)
5267         {
5268             __m128i v_zero = _mm_setzero_si128();
5269 
5270             for ( ; x <= len - 8; x += 8 )
5271             {
5272                 __m128 v_src = _mm_loadu_ps(src1 + x);
5273                 __m128 v_res1 = _mm_and_ps(_mm_cmple_ps(_mm_loadu_ps(src2 + x), v_src),
5274                     _mm_cmple_ps(v_src, _mm_loadu_ps(src3 + x)));
5275 
5276                 v_src = _mm_loadu_ps(src1 + x + 4);
5277                 __m128 v_res2 = _mm_and_ps(_mm_cmple_ps(_mm_loadu_ps(src2 + x + 4), v_src),
5278                     _mm_cmple_ps(v_src, _mm_loadu_ps(src3 + x + 4)));
5279 
5280                 __m128i v_res1i = _mm_cvtps_epi32(v_res1), v_res2i = _mm_cvtps_epi32(v_res2);
5281                 __m128i v_res = _mm_packs_epi32(_mm_srli_epi32(v_res1i, 16), _mm_srli_epi32(v_res2i, 16));
5282                 _mm_storel_epi64((__m128i *)(dst + x), _mm_packus_epi16(v_res, v_zero));
5283             }
5284         }
5285 
5286         return x;
5287     }
5288 };
5289 
5290 #elif CV_NEON
5291 
5292 template <>
5293 struct InRange_SIMD<uchar>
5294 {
5295     int operator () (const uchar * src1, const uchar * src2, const uchar * src3,
5296                      uchar * dst, int len) const
5297     {
5298         int x = 0;
5299 
5300         for ( ; x <= len - 16; x += 16 )
5301         {
5302             uint8x16_t values = vld1q_u8(src1 + x);
5303             uint8x16_t low = vld1q_u8(src2 + x);
5304             uint8x16_t high = vld1q_u8(src3 + x);
5305 
5306             vst1q_u8(dst + x, vandq_u8(vcgeq_u8(values, low), vcgeq_u8(high, values)));
5307         }
5308         return x;
5309     }
5310 };
5311 
5312 template <>
5313 struct InRange_SIMD<schar>
5314 {
5315     int operator () (const schar * src1, const schar * src2, const schar * src3,
5316                      uchar * dst, int len) const
5317     {
5318         int x = 0;
5319 
5320         for ( ; x <= len - 16; x += 16 )
5321         {
5322             int8x16_t values = vld1q_s8(src1 + x);
5323             int8x16_t low = vld1q_s8(src2 + x);
5324             int8x16_t high = vld1q_s8(src3 + x);
5325 
5326             vst1q_u8(dst + x, vandq_u8(vcgeq_s8(values, low), vcgeq_s8(high, values)));
5327         }
5328         return x;
5329     }
5330 };
5331 
5332 template <>
5333 struct InRange_SIMD<ushort>
5334 {
5335     int operator () (const ushort * src1, const ushort * src2, const ushort * src3,
5336                      uchar * dst, int len) const
5337     {
5338         int x = 0;
5339 
5340         for ( ; x <= len - 16; x += 16 )
5341         {
5342             uint16x8_t values = vld1q_u16((const uint16_t*)(src1 + x));
5343             uint16x8_t low = vld1q_u16((const uint16_t*)(src2 + x));
5344             uint16x8_t high = vld1q_u16((const uint16_t*)(src3 + x));
5345             uint8x8_t  r1 = vmovn_u16(vandq_u16(vcgeq_u16(values, low), vcgeq_u16(high, values)));
5346 
5347             values = vld1q_u16((const uint16_t*)(src1 + x + 8));
5348             low = vld1q_u16((const uint16_t*)(src2 + x + 8));
5349             high = vld1q_u16((const uint16_t*)(src3 + x + 8));
5350             uint8x8_t  r2 = vmovn_u16(vandq_u16(vcgeq_u16(values, low), vcgeq_u16(high, values)));
5351 
5352             vst1q_u8(dst + x, vcombine_u8(r1, r2));
5353         }
5354         return x;
5355     }
5356 };
5357 
5358 template <>
5359 struct InRange_SIMD<short>
5360 {
5361     int operator () (const short * src1, const short * src2, const short * src3,
5362                      uchar * dst, int len) const
5363     {
5364         int x = 0;
5365 
5366         for ( ; x <= len - 16; x += 16 )
5367         {
5368             int16x8_t values = vld1q_s16((const int16_t*)(src1 + x));
5369             int16x8_t low = vld1q_s16((const int16_t*)(src2 + x));
5370             int16x8_t high = vld1q_s16((const int16_t*)(src3 + x));
5371             uint8x8_t r1 = vmovn_u16(vandq_u16(vcgeq_s16(values, low), vcgeq_s16(high, values)));
5372 
5373             values = vld1q_s16((const int16_t*)(src1 + x + 8));
5374             low = vld1q_s16((const int16_t*)(src2 + x + 8));
5375             high = vld1q_s16((const int16_t*)(src3 + x + 8));
5376             uint8x8_t r2 = vmovn_u16(vandq_u16(vcgeq_s16(values, low), vcgeq_s16(high, values)));
5377 
5378             vst1q_u8(dst + x, vcombine_u8(r1, r2));
5379         }
5380         return x;
5381     }
5382 };
5383 
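// The 32-bit integer and float specializations narrow their masks twice
// (u32 -> u16 -> u8), so they cover 8 elements per iteration.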
template <>
struct InRange_SIMD<int>
{
    int operator () (const int * src1, const int * src2, const int * src3,
                     uchar * dst, int len) const
    {
        int x = 0;

        for ( ; x <= len - 8; x += 8 )
        {
            int32x4_t values = vld1q_s32((const int32_t*)(src1 + x));
            int32x4_t low = vld1q_s32((const int32_t*)(src2 + x));
            int32x4_t high = vld1q_s32((const int32_t*)(src3 + x));

            uint16x4_t r1 = vmovn_u32(vandq_u32(vcgeq_s32(values, low), vcgeq_s32(high, values)));

            values = vld1q_s32((const int32_t*)(src1 + x + 4));
            low = vld1q_s32((const int32_t*)(src2 + x + 4));
            high = vld1q_s32((const int32_t*)(src3 + x + 4));

            uint16x4_t r2 = vmovn_u32(vandq_u32(vcgeq_s32(values, low), vcgeq_s32(high, values)));

            uint16x8_t res_16 = vcombine_u16(r1, r2);

            vst1_u8(dst + x, vmovn_u16(res_16));
        }
        return x;
    }
};

template <>
struct InRange_SIMD<float>
{
    int operator () (const float * src1, const float * src2, const float * src3,
                     uchar * dst, int len) const
    {
        int x = 0;

        for ( ; x <= len - 8; x += 8 )
        {
            float32x4_t values = vld1q_f32((const float32_t*)(src1 + x));
            float32x4_t low = vld1q_f32((const float32_t*)(src2 + x));
            float32x4_t high = vld1q_f32((const float32_t*)(src3 + x));

            uint16x4_t r1 = vmovn_u32(vandq_u32(vcgeq_f32(values, low), vcgeq_f32(high, values)));

            values = vld1q_f32((const float32_t*)(src1 + x + 4));
            low = vld1q_f32((const float32_t*)(src2 + x + 4));
            high = vld1q_f32((const float32_t*)(src3 + x + 4));

            uint16x4_t r2 = vmovn_u32(vandq_u32(vcgeq_f32(values, low), vcgeq_f32(high, values)));

            uint16x8_t res_16 = vcombine_u16(r1, r2);

            vst1_u8(dst + x, vmovn_u16(res_16));
        }
        return x;
    }
};

#endif

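// Generic row-wise inRange kernel: the SIMD specialization (if any) handles
// the head of each row, an optionally unrolled scalar loop the middle, and a
// plain scalar loop the remainder. dst is set to 255 where
// src2[x] <= src1[x] <= src3[x] and to 0 elsewhere.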
template <typename T>
static void inRange_(const T* src1, size_t step1, const T* src2, size_t step2,
         const T* src3, size_t step3, uchar* dst, size_t step,
         Size size)
{
    step1 /= sizeof(src1[0]);
    step2 /= sizeof(src2[0]);
    step3 /= sizeof(src3[0]);

    InRange_SIMD<T> vop;

    for( ; size.height--; src1 += step1, src2 += step2, src3 += step3, dst += step )
    {
        int x = vop(src1, src2, src3, dst, size.width);
        #if CV_ENABLE_UNROLLED
        for( ; x <= size.width - 4; x += 4 )
        {
            int t0, t1;
            t0 = src2[x] <= src1[x] && src1[x] <= src3[x];
            t1 = src2[x+1] <= src1[x+1] && src1[x+1] <= src3[x+1];
            dst[x] = (uchar)-t0; dst[x+1] = (uchar)-t1;
            t0 = src2[x+2] <= src1[x+2] && src1[x+2] <= src3[x+2];
            t1 = src2[x+3] <= src1[x+3] && src1[x+3] <= src3[x+3];
            dst[x+2] = (uchar)-t0; dst[x+3] = (uchar)-t1;
        }
        #endif
        for( ; x < size.width; x++ )
            dst[x] = (uchar)-(src2[x] <= src1[x] && src1[x] <= src3[x]);
    }
}

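// Depth-specific wrappers over inRange_ with a uniform signature, so they can
// be stored in the InRangeFunc dispatch table below.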
static void inRange8u(const uchar* src1, size_t step1, const uchar* src2, size_t step2,
                      const uchar* src3, size_t step3, uchar* dst, size_t step, Size size)
{
    inRange_(src1, step1, src2, step2, src3, step3, dst, step, size);
}

static void inRange8s(const schar* src1, size_t step1, const schar* src2, size_t step2,
                      const schar* src3, size_t step3, uchar* dst, size_t step, Size size)
{
    inRange_(src1, step1, src2, step2, src3, step3, dst, step, size);
}

static void inRange16u(const ushort* src1, size_t step1, const ushort* src2, size_t step2,
                       const ushort* src3, size_t step3, uchar* dst, size_t step, Size size)
{
    inRange_(src1, step1, src2, step2, src3, step3, dst, step, size);
}

static void inRange16s(const short* src1, size_t step1, const short* src2, size_t step2,
                       const short* src3, size_t step3, uchar* dst, size_t step, Size size)
{
    inRange_(src1, step1, src2, step2, src3, step3, dst, step, size);
}

static void inRange32s(const int* src1, size_t step1, const int* src2, size_t step2,
                       const int* src3, size_t step3, uchar* dst, size_t step, Size size)
{
    inRange_(src1, step1, src2, step2, src3, step3, dst, step, size);
}

static void inRange32f(const float* src1, size_t step1, const float* src2, size_t step2,
                       const float* src3, size_t step3, uchar* dst, size_t step, Size size)
{
    inRange_(src1, step1, src2, step2, src3, step3, dst, step, size);
}

static void inRange64f(const double* src1, size_t step1, const double* src2, size_t step2,
                       const double* src3, size_t step3, uchar* dst, size_t step, Size size)
{
    inRange_(src1, step1, src2, step2, src3, step3, dst, step, size);
}

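// Collapses a per-channel mask (cn bytes per pixel, as produced by the kernels
// above) into a single-channel mask by ANDing the channel bytes of each pixel.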
static void inRangeReduce(const uchar* src, uchar* dst, size_t len, int cn)
{
    int k = cn % 4 ? cn % 4 : 4;
    size_t i, j;
    if( k == 1 )
        for( i = j = 0; i < len; i++, j += cn )
            dst[i] = src[j];
    else if( k == 2 )
        for( i = j = 0; i < len; i++, j += cn )
            dst[i] = src[j] & src[j+1];
    else if( k == 3 )
        for( i = j = 0; i < len; i++, j += cn )
            dst[i] = src[j] & src[j+1] & src[j+2];
    else
        for( i = j = 0; i < len; i++, j += cn )
            dst[i] = src[j] & src[j+1] & src[j+2] & src[j+3];

    for( ; k < cn; k += 4 )
    {
        for( i = 0, j = k; i < len; i++, j += cn )
            dst[i] &= src[j] & src[j+1] & src[j+2] & src[j+3];
    }
}

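// Dispatch table for the low-level kernels, indexed by matrix depth;
// getInRangeFunc simply picks the entry for the source's CV_MAT_DEPTH.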
typedef void (*InRangeFunc)( const uchar* src1, size_t step1, const uchar* src2, size_t step2,
                             const uchar* src3, size_t step3, uchar* dst, size_t step, Size sz );

static InRangeFunc getInRangeFunc(int depth)
{
    static InRangeFunc inRangeTab[] =
    {
        (InRangeFunc)GET_OPTIMIZED(inRange8u), (InRangeFunc)GET_OPTIMIZED(inRange8s), (InRangeFunc)GET_OPTIMIZED(inRange16u),
        (InRangeFunc)GET_OPTIMIZED(inRange16s), (InRangeFunc)GET_OPTIMIZED(inRange32s), (InRangeFunc)GET_OPTIMIZED(inRange32f),
        (InRangeFunc)inRange64f, 0
    };

    return inRangeTab[depth];
}

#ifdef HAVE_OPENCL

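// OpenCL path of cv::inRange. Returns false whenever the request cannot be
// handled on the device (mixed scalar/array bounds, CV_64F without double
// support, mismatched bound depths, kernel build failure), so the caller can
// fall back to the CPU implementation.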
static bool ocl_inRange( InputArray _src, InputArray _lowerb,
                         InputArray _upperb, OutputArray _dst )
{
    const ocl::Device & d = ocl::Device::getDefault();
    int skind = _src.kind(), lkind = _lowerb.kind(), ukind = _upperb.kind();
    Size ssize = _src.size(), lsize = _lowerb.size(), usize = _upperb.size();
    int stype = _src.type(), ltype = _lowerb.type(), utype = _upperb.type();
    int sdepth = CV_MAT_DEPTH(stype), ldepth = CV_MAT_DEPTH(ltype), udepth = CV_MAT_DEPTH(utype);
    int cn = CV_MAT_CN(stype), rowsPerWI = d.isIntel() ? 4 : 1;
    bool lbScalar = false, ubScalar = false;

    if( (lkind == _InputArray::MATX && skind != _InputArray::MATX) ||
        ssize != lsize || stype != ltype )
    {
        if( !checkScalar(_lowerb, stype, lkind, skind) )
            CV_Error( CV_StsUnmatchedSizes,
                     "The lower boundary is neither an array of the same size and same type as src, nor a scalar");
        lbScalar = true;
    }

    if( (ukind == _InputArray::MATX && skind != _InputArray::MATX) ||
        ssize != usize || stype != utype )
    {
        if( !checkScalar(_upperb, stype, ukind, skind) )
            CV_Error( CV_StsUnmatchedSizes,
                     "The upper boundary is neither an array of the same size and same type as src, nor a scalar");
        ubScalar = true;
    }

    if (lbScalar != ubScalar)
        return false;

    bool doubleSupport = d.doubleFPConfig() > 0,
            haveScalar = lbScalar && ubScalar;

    if ( (!doubleSupport && sdepth == CV_64F) ||
         (!haveScalar && (sdepth != ldepth || sdepth != udepth)) )
        return false;

    int kercn = haveScalar ? cn : std::max(std::min(ocl::predictOptimalVectorWidth(_src, _lowerb, _upperb, _dst), 4), cn);
    if (kercn % cn != 0)
        kercn = cn;
    int colsPerWI = kercn / cn;
    String opts = format("%s-D cn=%d -D srcT=%s -D srcT1=%s -D dstT=%s -D kercn=%d -D depth=%d%s -D colsPerWI=%d",
                           haveScalar ? "-D HAVE_SCALAR " : "", cn, ocl::typeToStr(CV_MAKE_TYPE(sdepth, kercn)),
                           ocl::typeToStr(sdepth), ocl::typeToStr(CV_8UC(colsPerWI)), kercn, sdepth,
                           doubleSupport ? " -D DOUBLE_SUPPORT" : "", colsPerWI);

    ocl::Kernel ker("inrange", ocl::core::inrange_oclsrc, opts);
    if (ker.empty())
        return false;

    _dst.create(ssize, CV_8UC1);
    UMat src = _src.getUMat(), dst = _dst.getUMat(), lscalaru, uscalaru;
    Mat lscalar, uscalar;

    if (lbScalar && ubScalar)
    {
        lscalar = _lowerb.getMat();
        uscalar = _upperb.getMat();

        size_t esz = src.elemSize();
        size_t blocksize = 36;

        AutoBuffer<uchar> _buf(blocksize*(((int)lbScalar + (int)ubScalar)*esz + cn) + 2*cn*sizeof(int) + 128);
        uchar *buf = alignPtr(_buf + blocksize*cn, 16);

        if( ldepth != sdepth && sdepth < CV_32S )
        {
            int* ilbuf = (int*)alignPtr(buf + blocksize*esz, 16);
            int* iubuf = ilbuf + cn;

            BinaryFunc sccvtfunc = getConvertFunc(ldepth, CV_32S);
            sccvtfunc(lscalar.ptr(), 1, 0, 1, (uchar*)ilbuf, 1, Size(cn, 1), 0);
            sccvtfunc(uscalar.ptr(), 1, 0, 1, (uchar*)iubuf, 1, Size(cn, 1), 0);
            int minval = cvRound(getMinVal(sdepth)), maxval = cvRound(getMaxVal(sdepth));

            for( int k = 0; k < cn; k++ )
            {
                if( ilbuf[k] > iubuf[k] || ilbuf[k] > maxval || iubuf[k] < minval )
                    ilbuf[k] = minval+1, iubuf[k] = minval;
            }
            lscalar = Mat(cn, 1, CV_32S, ilbuf);
            uscalar = Mat(cn, 1, CV_32S, iubuf);
        }

        lscalar.convertTo(lscalar, stype);
        uscalar.convertTo(uscalar, stype);
    }
    else
    {
        lscalaru = _lowerb.getUMat();
        uscalaru = _upperb.getUMat();
    }

    ocl::KernelArg srcarg = ocl::KernelArg::ReadOnlyNoSize(src),
            dstarg = ocl::KernelArg::WriteOnly(dst, 1, colsPerWI);

    if (haveScalar)
    {
        lscalar.copyTo(lscalaru);
        uscalar.copyTo(uscalaru);

        ker.args(srcarg, dstarg, ocl::KernelArg::PtrReadOnly(lscalaru),
               ocl::KernelArg::PtrReadOnly(uscalaru), rowsPerWI);
    }
    else
        ker.args(srcarg, dstarg, ocl::KernelArg::ReadOnlyNoSize(lscalaru),
               ocl::KernelArg::ReadOnlyNoSize(uscalaru), rowsPerWI);

    size_t globalsize[2] = { ssize.width / colsPerWI, (ssize.height + rowsPerWI - 1) / rowsPerWI };
    return ker.run(2, globalsize, NULL, false);
}

#endif

}

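// cv::inRange: sets dst(I) to 255 if every channel of src(I) lies in the
// inclusive range [lowerb(I), upperb(I)] and to 0 otherwise; the bounds may be
// arrays of the same size and type as src, or scalars. A minimal usage sketch
// (the threshold values below are arbitrary):
//
//     cv::Mat hsv = ..., mask;
//     cv::inRange(hsv, cv::Scalar(0, 100, 100), cv::Scalar(10, 255, 255), mask);
//     // mask is CV_8UC1: 255 where all three channels fall inside the bounds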
void cv::inRange(InputArray _src, InputArray _lowerb,
                 InputArray _upperb, OutputArray _dst)
{
    CV_OCL_RUN(_src.dims() <= 2 && _lowerb.dims() <= 2 &&
               _upperb.dims() <= 2 && OCL_PERFORMANCE_CHECK(_dst.isUMat()),
               ocl_inRange(_src, _lowerb, _upperb, _dst))

    int skind = _src.kind(), lkind = _lowerb.kind(), ukind = _upperb.kind();
    Mat src = _src.getMat(), lb = _lowerb.getMat(), ub = _upperb.getMat();

    bool lbScalar = false, ubScalar = false;

    if( (lkind == _InputArray::MATX && skind != _InputArray::MATX) ||
        src.size != lb.size || src.type() != lb.type() )
    {
        if( !checkScalar(lb, src.type(), lkind, skind) )
            CV_Error( CV_StsUnmatchedSizes,
                     "The lower boundary is neither an array of the same size and same type as src, nor a scalar");
        lbScalar = true;
    }

    if( (ukind == _InputArray::MATX && skind != _InputArray::MATX) ||
        src.size != ub.size || src.type() != ub.type() )
    {
        if( !checkScalar(ub, src.type(), ukind, skind) )
            CV_Error( CV_StsUnmatchedSizes,
                     "The upper boundary is neither an array of the same size and same type as src, nor a scalar");
        ubScalar = true;
    }

    CV_Assert(lbScalar == ubScalar);

    int cn = src.channels(), depth = src.depth();

    size_t esz = src.elemSize();
    size_t blocksize0 = (size_t)(BLOCK_SIZE + esz-1)/esz;

    _dst.create(src.dims, src.size, CV_8UC1);
    Mat dst = _dst.getMat();
    InRangeFunc func = getInRangeFunc(depth);

    const Mat* arrays_sc[] = { &src, &dst, 0 };
    const Mat* arrays_nosc[] = { &src, &dst, &lb, &ub, 0 };
    uchar* ptrs[4];

    NAryMatIterator it(lbScalar && ubScalar ? arrays_sc : arrays_nosc, ptrs);
    size_t total = it.size, blocksize = std::min(total, blocksize0);

    AutoBuffer<uchar> _buf(blocksize*(((int)lbScalar + (int)ubScalar)*esz + cn) + 2*cn*sizeof(int) + 128);
    uchar *buf = _buf, *mbuf = buf, *lbuf = 0, *ubuf = 0;
    buf = alignPtr(buf + blocksize*cn, 16);

    if( lbScalar && ubScalar )
    {
        lbuf = buf;
        ubuf = buf = alignPtr(buf + blocksize*esz, 16);

        CV_Assert( lb.type() == ub.type() );
        int scdepth = lb.depth();

        if( scdepth != depth && depth < CV_32S )
        {
            int* ilbuf = (int*)alignPtr(buf + blocksize*esz, 16);
            int* iubuf = ilbuf + cn;

            BinaryFunc sccvtfunc = getConvertFunc(scdepth, CV_32S);
            sccvtfunc(lb.ptr(), 1, 0, 1, (uchar*)ilbuf, 1, Size(cn, 1), 0);
            sccvtfunc(ub.ptr(), 1, 0, 1, (uchar*)iubuf, 1, Size(cn, 1), 0);
            int minval = cvRound(getMinVal(depth)), maxval = cvRound(getMaxVal(depth));

            for( int k = 0; k < cn; k++ )
            {
                if( ilbuf[k] > iubuf[k] || ilbuf[k] > maxval || iubuf[k] < minval )
                    ilbuf[k] = minval+1, iubuf[k] = minval;
            }
            lb = Mat(cn, 1, CV_32S, ilbuf);
            ub = Mat(cn, 1, CV_32S, iubuf);
        }

        convertAndUnrollScalar( lb, src.type(), lbuf, blocksize );
        convertAndUnrollScalar( ub, src.type(), ubuf, blocksize );
    }

    for( size_t i = 0; i < it.nplanes; i++, ++it )
    {
        for( size_t j = 0; j < total; j += blocksize )
        {
            int bsz = (int)MIN(total - j, blocksize);
            size_t delta = bsz*esz;
            uchar *lptr = lbuf, *uptr = ubuf;
            if( !lbScalar )
            {
                lptr = ptrs[2];
                ptrs[2] += delta;
            }
            if( !ubScalar )
            {
                int idx = !lbScalar ? 3 : 2;
                uptr = ptrs[idx];
                ptrs[idx] += delta;
            }
            func( ptrs[0], 0, lptr, 0, uptr, 0, cn == 1 ? ptrs[1] : mbuf, 0, Size(bsz*cn, 1));
            if( cn > 1 )
                inRangeReduce(mbuf, ptrs[1], bsz, cn);
            ptrs[0] += delta;
            ptrs[1] += bsz;
        }
    }
}

/****************************************************************************************\
*                                Earlier API: cvAdd etc.                                 *
\****************************************************************************************/

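// Thin legacy C API wrappers: each converts its CvArr* arguments with
// cvarrToMat, checks that the destination is compatible with the source, and
// forwards to the corresponding C++ function.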
CV_IMPL void
cvNot( const CvArr* srcarr, CvArr* dstarr )
{
    cv::Mat src = cv::cvarrToMat(srcarr), dst = cv::cvarrToMat(dstarr);
    CV_Assert( src.size == dst.size && src.type() == dst.type() );
    cv::bitwise_not( src, dst );
}


CV_IMPL void
cvAnd( const CvArr* srcarr1, const CvArr* srcarr2, CvArr* dstarr, const CvArr* maskarr )
{
    cv::Mat src1 = cv::cvarrToMat(srcarr1), src2 = cv::cvarrToMat(srcarr2),
        dst = cv::cvarrToMat(dstarr), mask;
    CV_Assert( src1.size == dst.size && src1.type() == dst.type() );
    if( maskarr )
        mask = cv::cvarrToMat(maskarr);
    cv::bitwise_and( src1, src2, dst, mask );
}


CV_IMPL void
cvOr( const CvArr* srcarr1, const CvArr* srcarr2, CvArr* dstarr, const CvArr* maskarr )
{
    cv::Mat src1 = cv::cvarrToMat(srcarr1), src2 = cv::cvarrToMat(srcarr2),
        dst = cv::cvarrToMat(dstarr), mask;
    CV_Assert( src1.size == dst.size && src1.type() == dst.type() );
    if( maskarr )
        mask = cv::cvarrToMat(maskarr);
    cv::bitwise_or( src1, src2, dst, mask );
}


CV_IMPL void
cvXor( const CvArr* srcarr1, const CvArr* srcarr2, CvArr* dstarr, const CvArr* maskarr )
{
    cv::Mat src1 = cv::cvarrToMat(srcarr1), src2 = cv::cvarrToMat(srcarr2),
        dst = cv::cvarrToMat(dstarr), mask;
    CV_Assert( src1.size == dst.size && src1.type() == dst.type() );
    if( maskarr )
        mask = cv::cvarrToMat(maskarr);
    cv::bitwise_xor( src1, src2, dst, mask );
}


CV_IMPL void
cvAndS( const CvArr* srcarr, CvScalar s, CvArr* dstarr, const CvArr* maskarr )
{
    cv::Mat src = cv::cvarrToMat(srcarr), dst = cv::cvarrToMat(dstarr), mask;
    CV_Assert( src.size == dst.size && src.type() == dst.type() );
    if( maskarr )
        mask = cv::cvarrToMat(maskarr);
    cv::bitwise_and( src, (const cv::Scalar&)s, dst, mask );
}


CV_IMPL void
cvOrS( const CvArr* srcarr, CvScalar s, CvArr* dstarr, const CvArr* maskarr )
{
    cv::Mat src = cv::cvarrToMat(srcarr), dst = cv::cvarrToMat(dstarr), mask;
    CV_Assert( src.size == dst.size && src.type() == dst.type() );
    if( maskarr )
        mask = cv::cvarrToMat(maskarr);
    cv::bitwise_or( src, (const cv::Scalar&)s, dst, mask );
}


CV_IMPL void
cvXorS( const CvArr* srcarr, CvScalar s, CvArr* dstarr, const CvArr* maskarr )
{
    cv::Mat src = cv::cvarrToMat(srcarr), dst = cv::cvarrToMat(dstarr), mask;
    CV_Assert( src.size == dst.size && src.type() == dst.type() );
    if( maskarr )
        mask = cv::cvarrToMat(maskarr);
    cv::bitwise_xor( src, (const cv::Scalar&)s, dst, mask );
}


CV_IMPL void cvAdd( const CvArr* srcarr1, const CvArr* srcarr2, CvArr* dstarr, const CvArr* maskarr )
{
    cv::Mat src1 = cv::cvarrToMat(srcarr1), src2 = cv::cvarrToMat(srcarr2),
        dst = cv::cvarrToMat(dstarr), mask;
    CV_Assert( src1.size == dst.size && src1.channels() == dst.channels() );
    if( maskarr )
        mask = cv::cvarrToMat(maskarr);
    cv::add( src1, src2, dst, mask, dst.type() );
}


CV_IMPL void cvSub( const CvArr* srcarr1, const CvArr* srcarr2, CvArr* dstarr, const CvArr* maskarr )
{
    cv::Mat src1 = cv::cvarrToMat(srcarr1), src2 = cv::cvarrToMat(srcarr2),
        dst = cv::cvarrToMat(dstarr), mask;
    CV_Assert( src1.size == dst.size && src1.channels() == dst.channels() );
    if( maskarr )
        mask = cv::cvarrToMat(maskarr);
    cv::subtract( src1, src2, dst, mask, dst.type() );
}


CV_IMPL void cvAddS( const CvArr* srcarr1, CvScalar value, CvArr* dstarr, const CvArr* maskarr )
{
    cv::Mat src1 = cv::cvarrToMat(srcarr1),
        dst = cv::cvarrToMat(dstarr), mask;
    CV_Assert( src1.size == dst.size && src1.channels() == dst.channels() );
    if( maskarr )
        mask = cv::cvarrToMat(maskarr);
    cv::add( src1, (const cv::Scalar&)value, dst, mask, dst.type() );
}


CV_IMPL void cvSubRS( const CvArr* srcarr1, CvScalar value, CvArr* dstarr, const CvArr* maskarr )
{
    cv::Mat src1 = cv::cvarrToMat(srcarr1),
        dst = cv::cvarrToMat(dstarr), mask;
    CV_Assert( src1.size == dst.size && src1.channels() == dst.channels() );
    if( maskarr )
        mask = cv::cvarrToMat(maskarr);
    cv::subtract( (const cv::Scalar&)value, src1, dst, mask, dst.type() );
}


CV_IMPL void cvMul( const CvArr* srcarr1, const CvArr* srcarr2,
                    CvArr* dstarr, double scale )
{
    cv::Mat src1 = cv::cvarrToMat(srcarr1), src2 = cv::cvarrToMat(srcarr2),
        dst = cv::cvarrToMat(dstarr);
    CV_Assert( src1.size == dst.size && src1.channels() == dst.channels() );
    cv::multiply( src1, src2, dst, scale, dst.type() );
}


CV_IMPL void cvDiv( const CvArr* srcarr1, const CvArr* srcarr2,
                    CvArr* dstarr, double scale )
{
    cv::Mat src2 = cv::cvarrToMat(srcarr2),
        dst = cv::cvarrToMat(dstarr), mask;
    CV_Assert( src2.size == dst.size && src2.channels() == dst.channels() );

    if( srcarr1 )
        cv::divide( cv::cvarrToMat(srcarr1), src2, dst, scale, dst.type() );
    else
        cv::divide( scale, src2, dst, dst.type() );
}


CV_IMPL void
cvAddWeighted( const CvArr* srcarr1, double alpha,
               const CvArr* srcarr2, double beta,
               double gamma, CvArr* dstarr )
{
    cv::Mat src1 = cv::cvarrToMat(srcarr1), src2 = cv::cvarrToMat(srcarr2),
        dst = cv::cvarrToMat(dstarr);
    CV_Assert( src1.size == dst.size && src1.channels() == dst.channels() );
    cv::addWeighted( src1, alpha, src2, beta, gamma, dst, dst.type() );
}


CV_IMPL void
cvAbsDiff( const CvArr* srcarr1, const CvArr* srcarr2, CvArr* dstarr )
{
    cv::Mat src1 = cv::cvarrToMat(srcarr1), dst = cv::cvarrToMat(dstarr);
    CV_Assert( src1.size == dst.size && src1.type() == dst.type() );

    cv::absdiff( src1, cv::cvarrToMat(srcarr2), dst );
}


CV_IMPL void
cvAbsDiffS( const CvArr* srcarr1, CvArr* dstarr, CvScalar scalar )
{
    cv::Mat src1 = cv::cvarrToMat(srcarr1), dst = cv::cvarrToMat(dstarr);
    CV_Assert( src1.size == dst.size && src1.type() == dst.type() );

    cv::absdiff( src1, (const cv::Scalar&)scalar, dst );
}


CV_IMPL void
cvInRange( const void* srcarr1, const void* srcarr2,
           const void* srcarr3, void* dstarr )
{
    cv::Mat src1 = cv::cvarrToMat(srcarr1), dst = cv::cvarrToMat(dstarr);
    CV_Assert( src1.size == dst.size && dst.type() == CV_8U );

    cv::inRange( src1, cv::cvarrToMat(srcarr2), cv::cvarrToMat(srcarr3), dst );
}


CV_IMPL void
cvInRangeS( const void* srcarr1, CvScalar lowerb, CvScalar upperb, void* dstarr )
{
    cv::Mat src1 = cv::cvarrToMat(srcarr1), dst = cv::cvarrToMat(dstarr);
    CV_Assert( src1.size == dst.size && dst.type() == CV_8U );

    cv::inRange( src1, (const cv::Scalar&)lowerb, (const cv::Scalar&)upperb, dst );
}


CV_IMPL void
cvCmp( const void* srcarr1, const void* srcarr2, void* dstarr, int cmp_op )
{
    cv::Mat src1 = cv::cvarrToMat(srcarr1), dst = cv::cvarrToMat(dstarr);
    CV_Assert( src1.size == dst.size && dst.type() == CV_8U );

    cv::compare( src1, cv::cvarrToMat(srcarr2), dst, cmp_op );
}


CV_IMPL void
cvCmpS( const void* srcarr1, double value, void* dstarr, int cmp_op )
{
    cv::Mat src1 = cv::cvarrToMat(srcarr1), dst = cv::cvarrToMat(dstarr);
    CV_Assert( src1.size == dst.size && dst.type() == CV_8U );

    cv::compare( src1, value, dst, cmp_op );
}


CV_IMPL void
cvMin( const void* srcarr1, const void* srcarr2, void* dstarr )
{
    cv::Mat src1 = cv::cvarrToMat(srcarr1), dst = cv::cvarrToMat(dstarr);
    CV_Assert( src1.size == dst.size && src1.type() == dst.type() );

    cv::min( src1, cv::cvarrToMat(srcarr2), dst );
}


CV_IMPL void
cvMax( const void* srcarr1, const void* srcarr2, void* dstarr )
{
    cv::Mat src1 = cv::cvarrToMat(srcarr1), dst = cv::cvarrToMat(dstarr);
    CV_Assert( src1.size == dst.size && src1.type() == dst.type() );

    cv::max( src1, cv::cvarrToMat(srcarr2), dst );
}


CV_IMPL void
cvMinS( const void* srcarr1, double value, void* dstarr )
{
    cv::Mat src1 = cv::cvarrToMat(srcarr1), dst = cv::cvarrToMat(dstarr);
    CV_Assert( src1.size == dst.size && src1.type() == dst.type() );

    cv::min( src1, value, dst );
}


CV_IMPL void
cvMaxS( const void* srcarr1, double value, void* dstarr )
{
    cv::Mat src1 = cv::cvarrToMat(srcarr1), dst = cv::cvarrToMat(dstarr);
    CV_Assert( src1.size == dst.size && src1.type() == dst.type() );

    cv::max( src1, value, dst );
}

/* End of file. */