1 /*M///////////////////////////////////////////////////////////////////////////////////////
2 //
3 // IMPORTANT: READ BEFORE DOWNLOADING, COPYING, INSTALLING OR USING.
4 //
5 // By downloading, copying, installing or using the software you agree to this license.
6 // If you do not agree to this license, do not download, install,
7 // copy or use the software.
8 //
9 //
10 // License Agreement
11 // For Open Source Computer Vision Library
12 //
13 // Copyright (C) 2000-2008, Intel Corporation, all rights reserved.
14 // Copyright (C) 2009-2011, Willow Garage Inc., all rights reserved.
15 // Copyright (C) 2014-2015, Itseez Inc., all rights reserved.
16 // Third party copyrights are property of their respective owners.
17 //
18 // Redistribution and use in source and binary forms, with or without modification,
19 // are permitted provided that the following conditions are met:
20 //
21 // * Redistribution's of source code must retain the above copyright notice,
22 // this list of conditions and the following disclaimer.
23 //
24 // * Redistribution's in binary form must reproduce the above copyright notice,
25 // this list of conditions and the following disclaimer in the documentation
26 // and/or other materials provided with the distribution.
27 //
28 // * The name of the copyright holders may not be used to endorse or promote products
29 // derived from this software without specific prior written permission.
30 //
31 // This software is provided by the copyright holders and contributors "as is" and
32 // any express or implied warranties, including, but not limited to, the implied
33 // warranties of merchantability and fitness for a particular purpose are disclaimed.
34 // In no event shall the Intel Corporation or contributors be liable for any direct,
35 // indirect, incidental, special, exemplary, or consequential damages
36 // (including, but not limited to, procurement of substitute goods or services;
37 // loss of use, data, or profits; or business interruption) however caused
38 // and on any theory of liability, whether in contract, strict liability,
39 // or tort (including negligence or otherwise) arising in any way out of
40 // the use of this software, even if advised of the possibility of such damage.
41 //
42 //M*/
43
44 /* ////////////////////////////////////////////////////////////////////
45 //
46 // Arithmetic and logical operations: +, -, *, /, &, |, ^, ~, abs ...
47 //
48 // */
49
50 #include "precomp.hpp"
51 #include "opencl_kernels_core.hpp"
52
53 namespace cv
54 {
55
56 struct NOP {};
57
58 #if CV_SSE2 || CV_NEON
59
60 #define FUNCTOR_TEMPLATE(name) \
61 template<typename T> struct name {}
62
63 FUNCTOR_TEMPLATE(VLoadStore128);
64 #if CV_SSE2
65 FUNCTOR_TEMPLATE(VLoadStore64);
66 FUNCTOR_TEMPLATE(VLoadStore128Aligned);
67 #if CV_AVX2
68 FUNCTOR_TEMPLATE(VLoadStore256);
69 FUNCTOR_TEMPLATE(VLoadStore256Aligned);
70 #endif
71 #endif
72
73 #endif
74
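// vBinOp / vBinOp32 / vBinOp64 below implement an element-wise binary operation over a
// 2D array: the row pointers are advanced by the byte steps step1/step2/step, the bulk
// of each row is processed with SIMD (AVX2/SSE2/NEON) through the VOp functor, an
// optionally unrolled scalar loop handles groups of four elements, and a plain scalar
// tail finishes the remainder of the row.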
75 template<typename T, class Op, class VOp>
76 void vBinOp(const T* src1, size_t step1, const T* src2, size_t step2, T* dst, size_t step, Size sz)
77 {
78 #if CV_SSE2 || CV_NEON
79 VOp vop;
80 #endif
81 Op op;
82
83 for( ; sz.height--; src1 = (const T *)((const uchar *)src1 + step1),
84 src2 = (const T *)((const uchar *)src2 + step2),
85 dst = (T *)((uchar *)dst + step) )
86 {
87 int x = 0;
88
89 #if CV_NEON || CV_SSE2
90 #if CV_AVX2
91 if( USE_AVX2 )
92 {
93 for( ; x <= sz.width - 32/(int)sizeof(T); x += 32/sizeof(T) )
94 {
95 typename VLoadStore256<T>::reg_type r0 = VLoadStore256<T>::load(src1 + x);
96 r0 = vop(r0, VLoadStore256<T>::load(src2 + x));
97 VLoadStore256<T>::store(dst + x, r0);
98 }
99 }
100 #else
101 #if CV_SSE2
102 if( USE_SSE2 )
103 {
104 #endif // CV_SSE2
105 for( ; x <= sz.width - 32/(int)sizeof(T); x += 32/sizeof(T) )
106 {
107 typename VLoadStore128<T>::reg_type r0 = VLoadStore128<T>::load(src1 + x );
108 typename VLoadStore128<T>::reg_type r1 = VLoadStore128<T>::load(src1 + x + 16/sizeof(T));
109 r0 = vop(r0, VLoadStore128<T>::load(src2 + x ));
110 r1 = vop(r1, VLoadStore128<T>::load(src2 + x + 16/sizeof(T)));
111 VLoadStore128<T>::store(dst + x , r0);
112 VLoadStore128<T>::store(dst + x + 16/sizeof(T), r1);
113 }
114 #if CV_SSE2
115 }
116 #endif // CV_SSE2
117 #endif // CV_AVX2
118 #endif // CV_NEON || CV_SSE2
119
120 #if CV_AVX2
121 // nothing
122 #elif CV_SSE2
123 if( USE_SSE2 )
124 {
125 for( ; x <= sz.width - 8/(int)sizeof(T); x += 8/sizeof(T) )
126 {
127 typename VLoadStore64<T>::reg_type r = VLoadStore64<T>::load(src1 + x);
128 r = vop(r, VLoadStore64<T>::load(src2 + x));
129 VLoadStore64<T>::store(dst + x, r);
130 }
131 }
132 #endif
133
134 #if CV_ENABLE_UNROLLED
135 for( ; x <= sz.width - 4; x += 4 )
136 {
137 T v0 = op(src1[x], src2[x]);
138 T v1 = op(src1[x+1], src2[x+1]);
139 dst[x] = v0; dst[x+1] = v1;
140 v0 = op(src1[x+2], src2[x+2]);
141 v1 = op(src1[x+3], src2[x+3]);
142 dst[x+2] = v0; dst[x+3] = v1;
143 }
144 #endif
145
146 for( ; x < sz.width; x++ )
147 dst[x] = op(src1[x], src2[x]);
148 }
149 }
150
151 template<typename T, class Op, class Op32>
152 void vBinOp32(const T* src1, size_t step1, const T* src2, size_t step2,
153 T* dst, size_t step, Size sz)
154 {
155 #if CV_SSE2 || CV_NEON
156 Op32 op32;
157 #endif
158 Op op;
159
160 for( ; sz.height--; src1 = (const T *)((const uchar *)src1 + step1),
161 src2 = (const T *)((const uchar *)src2 + step2),
162 dst = (T *)((uchar *)dst + step) )
163 {
164 int x = 0;
165
166 #if CV_AVX2
167 if( USE_AVX2 )
168 {
169 if( (((size_t)src1|(size_t)src2|(size_t)dst)&31) == 0 )
170 {
171 for( ; x <= sz.width - 8; x += 8 )
172 {
173 typename VLoadStore256Aligned<T>::reg_type r0 = VLoadStore256Aligned<T>::load(src1 + x);
174 r0 = op32(r0, VLoadStore256Aligned<T>::load(src2 + x));
175 VLoadStore256Aligned<T>::store(dst + x, r0);
176 }
177 }
178 }
179 #elif CV_SSE2
180 if( USE_SSE2 )
181 {
182 if( (((size_t)src1|(size_t)src2|(size_t)dst)&15) == 0 )
183 {
184 for( ; x <= sz.width - 8; x += 8 )
185 {
186 typename VLoadStore128Aligned<T>::reg_type r0 = VLoadStore128Aligned<T>::load(src1 + x );
187 typename VLoadStore128Aligned<T>::reg_type r1 = VLoadStore128Aligned<T>::load(src1 + x + 4);
188 r0 = op32(r0, VLoadStore128Aligned<T>::load(src2 + x ));
189 r1 = op32(r1, VLoadStore128Aligned<T>::load(src2 + x + 4));
190 VLoadStore128Aligned<T>::store(dst + x , r0);
191 VLoadStore128Aligned<T>::store(dst + x + 4, r1);
192 }
193 }
194 }
195 #endif // CV_AVX2
196
197 #if CV_NEON || CV_SSE2
198 #if CV_AVX2
199 if( USE_AVX2 )
200 {
201 for( ; x <= sz.width - 8; x += 8 )
202 {
203 typename VLoadStore256<T>::reg_type r0 = VLoadStore256<T>::load(src1 + x);
204 r0 = op32(r0, VLoadStore256<T>::load(src2 + x));
205 VLoadStore256<T>::store(dst + x, r0);
206 }
207 }
208 #else
209 #if CV_SSE2
210 if( USE_SSE2 )
211 {
212 #endif // CV_SSE2
213 for( ; x <= sz.width - 8; x += 8 )
214 {
215 typename VLoadStore128<T>::reg_type r0 = VLoadStore128<T>::load(src1 + x );
216 typename VLoadStore128<T>::reg_type r1 = VLoadStore128<T>::load(src1 + x + 4);
217 r0 = op32(r0, VLoadStore128<T>::load(src2 + x ));
218 r1 = op32(r1, VLoadStore128<T>::load(src2 + x + 4));
219 VLoadStore128<T>::store(dst + x , r0);
220 VLoadStore128<T>::store(dst + x + 4, r1);
221 }
222 #if CV_SSE2
223 }
224 #endif // CV_SSE2
225 #endif // CV_AVX2
226 #endif // CV_NEON || CV_SSE2
227
228 #if CV_ENABLE_UNROLLED
229 for( ; x <= sz.width - 4; x += 4 )
230 {
231 T v0 = op(src1[x], src2[x]);
232 T v1 = op(src1[x+1], src2[x+1]);
233 dst[x] = v0; dst[x+1] = v1;
234 v0 = op(src1[x+2], src2[x+2]);
235 v1 = op(src1[x+3], src2[x+3]);
236 dst[x+2] = v0; dst[x+3] = v1;
237 }
238 #endif
239
240 for( ; x < sz.width; x++ )
241 dst[x] = op(src1[x], src2[x]);
242 }
243 }
244
245
246 template<typename T, class Op, class Op64>
247 void vBinOp64(const T* src1, size_t step1, const T* src2, size_t step2,
248 T* dst, size_t step, Size sz)
249 {
250 #if CV_SSE2
251 Op64 op64;
252 #endif
253 Op op;
254
255 for( ; sz.height--; src1 = (const T *)((const uchar *)src1 + step1),
256 src2 = (const T *)((const uchar *)src2 + step2),
257 dst = (T *)((uchar *)dst + step) )
258 {
259 int x = 0;
260
261 #if CV_AVX2
262 if( USE_AVX2 )
263 {
264 if( (((size_t)src1|(size_t)src2|(size_t)dst)&31) == 0 )
265 {
266 for( ; x <= sz.width - 4; x += 4 )
267 {
268 typename VLoadStore256Aligned<T>::reg_type r0 = VLoadStore256Aligned<T>::load(src1 + x);
269 r0 = op64(r0, VLoadStore256Aligned<T>::load(src2 + x));
270 VLoadStore256Aligned<T>::store(dst + x, r0);
271 }
272 }
273 }
274 #elif CV_SSE2
275 if( USE_SSE2 )
276 {
277 if( (((size_t)src1|(size_t)src2|(size_t)dst)&15) == 0 )
278 {
279 for( ; x <= sz.width - 4; x += 4 )
280 {
281 typename VLoadStore128Aligned<T>::reg_type r0 = VLoadStore128Aligned<T>::load(src1 + x );
282 typename VLoadStore128Aligned<T>::reg_type r1 = VLoadStore128Aligned<T>::load(src1 + x + 2);
283 r0 = op64(r0, VLoadStore128Aligned<T>::load(src2 + x ));
284 r1 = op64(r1, VLoadStore128Aligned<T>::load(src2 + x + 2));
285 VLoadStore128Aligned<T>::store(dst + x , r0);
286 VLoadStore128Aligned<T>::store(dst + x + 2, r1);
287 }
288 }
289 }
290 #endif
291
292 for( ; x <= sz.width - 4; x += 4 )
293 {
294 T v0 = op(src1[x], src2[x]);
295 T v1 = op(src1[x+1], src2[x+1]);
296 dst[x] = v0; dst[x+1] = v1;
297 v0 = op(src1[x+2], src2[x+2]);
298 v1 = op(src1[x+3], src2[x+3]);
299 dst[x+2] = v0; dst[x+3] = v1;
300 }
301
302 for( ; x < sz.width; x++ )
303 dst[x] = op(src1[x], src2[x]);
304 }
305 }
306
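// The FUNCTOR_LOADSTORE*/FUNCTOR_CLOSURE_* macros below generate per-type
// specializations of VLoadStore64/VLoadStore128/VLoadStore256 (unaligned and aligned
// loads/stores) and of the vector functors VAdd, VSub, VMin, VMax, VAbsDiff, VAnd,
// VOr, VXor and VNot, mapping each element type onto the matching AVX2, SSE2 or
// NEON intrinsic.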
307 #if CV_AVX2
308
309 #define FUNCTOR_LOADSTORE_CAST(name, template_arg, register_type, load_body, store_body) \
310 template <> \
311 struct name<template_arg>{ \
312 typedef register_type reg_type; \
313 static reg_type load(const template_arg * p) { return load_body ((const reg_type *)p); } \
314 static void store(template_arg * p, reg_type v) { store_body ((reg_type *)p, v); } \
315 }
316
317 #define FUNCTOR_LOADSTORE(name, template_arg, register_type, load_body, store_body) \
318 template <> \
319 struct name<template_arg>{ \
320 typedef register_type reg_type; \
321 static reg_type load(const template_arg * p) { return load_body (p); } \
322 static void store(template_arg * p, reg_type v) { store_body (p, v); } \
323 }
324
325 #define FUNCTOR_CLOSURE_2arg(name, template_arg, body) \
326 template<> \
327 struct name<template_arg> \
328 { \
329 VLoadStore256<template_arg>::reg_type operator()( \
330 const VLoadStore256<template_arg>::reg_type & a, \
331 const VLoadStore256<template_arg>::reg_type & b) const \
332 { \
333 body; \
334 } \
335 }
336
337 #define FUNCTOR_CLOSURE_1arg(name, template_arg, body) \
338 template<> \
339 struct name<template_arg> \
340 { \
341 VLoadStore256<template_arg>::reg_type operator()( \
342 const VLoadStore256<template_arg>::reg_type & a, \
343 const VLoadStore256<template_arg>::reg_type & ) const \
344 { \
345 body; \
346 } \
347 }
348
349 FUNCTOR_LOADSTORE_CAST(VLoadStore256, uchar, __m256i, _mm256_loadu_si256, _mm256_storeu_si256);
350 FUNCTOR_LOADSTORE_CAST(VLoadStore256, schar, __m256i, _mm256_loadu_si256, _mm256_storeu_si256);
351 FUNCTOR_LOADSTORE_CAST(VLoadStore256, ushort, __m256i, _mm256_loadu_si256, _mm256_storeu_si256);
352 FUNCTOR_LOADSTORE_CAST(VLoadStore256, short, __m256i, _mm256_loadu_si256, _mm256_storeu_si256);
353 FUNCTOR_LOADSTORE_CAST(VLoadStore256, int, __m256i, _mm256_loadu_si256, _mm256_storeu_si256);
354 FUNCTOR_LOADSTORE( VLoadStore256, float, __m256 , _mm256_loadu_ps , _mm256_storeu_ps );
355 FUNCTOR_LOADSTORE( VLoadStore256, double, __m256d, _mm256_loadu_pd , _mm256_storeu_pd );
356
357 FUNCTOR_LOADSTORE_CAST(VLoadStore256Aligned, int, __m256i, _mm256_load_si256, _mm256_store_si256);
358 FUNCTOR_LOADSTORE( VLoadStore256Aligned, float, __m256 , _mm256_load_ps , _mm256_store_ps );
359 FUNCTOR_LOADSTORE( VLoadStore256Aligned, double, __m256d, _mm256_load_pd , _mm256_store_pd );
360
361 FUNCTOR_TEMPLATE(VAdd);
362 FUNCTOR_CLOSURE_2arg(VAdd, uchar, return _mm256_adds_epu8 (a, b));
363 FUNCTOR_CLOSURE_2arg(VAdd, schar, return _mm256_adds_epi8 (a, b));
364 FUNCTOR_CLOSURE_2arg(VAdd, ushort, return _mm256_adds_epu16(a, b));
365 FUNCTOR_CLOSURE_2arg(VAdd, short, return _mm256_adds_epi16(a, b));
366 FUNCTOR_CLOSURE_2arg(VAdd, int, return _mm256_add_epi32 (a, b));
367 FUNCTOR_CLOSURE_2arg(VAdd, float, return _mm256_add_ps (a, b));
368 FUNCTOR_CLOSURE_2arg(VAdd, double, return _mm256_add_pd (a, b));
369
370 FUNCTOR_TEMPLATE(VSub);
371 FUNCTOR_CLOSURE_2arg(VSub, uchar, return _mm256_subs_epu8 (a, b));
372 FUNCTOR_CLOSURE_2arg(VSub, schar, return _mm256_subs_epi8 (a, b));
373 FUNCTOR_CLOSURE_2arg(VSub, ushort, return _mm256_subs_epu16(a, b));
374 FUNCTOR_CLOSURE_2arg(VSub, short, return _mm256_subs_epi16(a, b));
375 FUNCTOR_CLOSURE_2arg(VSub, int, return _mm256_sub_epi32 (a, b));
376 FUNCTOR_CLOSURE_2arg(VSub, float, return _mm256_sub_ps (a, b));
377 FUNCTOR_CLOSURE_2arg(VSub, double, return _mm256_sub_pd (a, b));
378
379 FUNCTOR_TEMPLATE(VMin);
380 FUNCTOR_CLOSURE_2arg(VMin, uchar, return _mm256_min_epu8 (a, b));
381 FUNCTOR_CLOSURE_2arg(VMin, schar, return _mm256_min_epi8 (a, b));
382 FUNCTOR_CLOSURE_2arg(VMin, ushort, return _mm256_min_epi16(a, b));
383 FUNCTOR_CLOSURE_2arg(VMin, short, return _mm256_min_epi16(a, b));
384 FUNCTOR_CLOSURE_2arg(VMin, int, return _mm256_min_epi32(a, b));
385 FUNCTOR_CLOSURE_2arg(VMin, float, return _mm256_min_ps (a, b));
386 FUNCTOR_CLOSURE_2arg(VMin, double, return _mm256_min_pd (a, b));
387
388 FUNCTOR_TEMPLATE(VMax);
389 FUNCTOR_CLOSURE_2arg(VMax, uchar, return _mm256_max_epu8 (a, b));
390 FUNCTOR_CLOSURE_2arg(VMax, schar, return _mm256_max_epi8 (a, b));
391 FUNCTOR_CLOSURE_2arg(VMax, ushort, return _mm256_max_epu16(a, b));
392 FUNCTOR_CLOSURE_2arg(VMax, short, return _mm256_max_epi16(a, b));
393 FUNCTOR_CLOSURE_2arg(VMax, int, return _mm256_max_epi32(a, b));
394 FUNCTOR_CLOSURE_2arg(VMax, float, return _mm256_max_ps (a, b));
395 FUNCTOR_CLOSURE_2arg(VMax, double, return _mm256_max_pd (a, b));
396
397
398 static unsigned int CV_DECL_ALIGNED(32) v32f_absmask[] = { 0x7fffffff, 0x7fffffff, 0x7fffffff, 0x7fffffff,
399 0x7fffffff, 0x7fffffff, 0x7fffffff, 0x7fffffff };
400 static unsigned int CV_DECL_ALIGNED(32) v64f_absmask[] = { 0xffffffff, 0x7fffffff, 0xffffffff, 0x7fffffff,
401 0xffffffff, 0x7fffffff, 0xffffffff, 0x7fffffff };
402
403 FUNCTOR_TEMPLATE(VAbsDiff);
404 FUNCTOR_CLOSURE_2arg(VAbsDiff, uchar,
405 return _mm256_add_epi8(_mm256_subs_epu8(a, b), _mm256_subs_epu8(b, a));
406 );
407 FUNCTOR_CLOSURE_2arg(VAbsDiff, schar,
408 __m256i d = _mm256_subs_epi8(a, b);
409 __m256i m = _mm256_cmpgt_epi8(b, a);
410 return _mm256_subs_epi8(_mm256_xor_si256(d, m), m);
411 );
412 FUNCTOR_CLOSURE_2arg(VAbsDiff, ushort,
413 return _mm256_add_epi16(_mm256_subs_epu16(a, b), _mm256_subs_epu16(b, a));
414 );
415 FUNCTOR_CLOSURE_2arg(VAbsDiff, short,
416 __m256i M = _mm256_max_epi16(a, b);
417 __m256i m = _mm256_min_epi16(a, b);
418 return _mm256_subs_epi16(M, m);
419 );
420 FUNCTOR_CLOSURE_2arg(VAbsDiff, int,
421 __m256i d = _mm256_sub_epi32(a, b);
422 __m256i m = _mm256_cmpgt_epi32(b, a);
423 return _mm256_sub_epi32(_mm256_xor_si256(d, m), m);
424 );
425 FUNCTOR_CLOSURE_2arg(VAbsDiff, float,
426 return _mm256_and_ps(_mm256_sub_ps(a, b), *(const __m256*)v32f_absmask);
427 );
428 FUNCTOR_CLOSURE_2arg(VAbsDiff, double,
429 return _mm256_and_pd(_mm256_sub_pd(a, b), *(const __m256d*)v64f_absmask);
430 );
431
432 FUNCTOR_TEMPLATE(VAnd);
433 FUNCTOR_CLOSURE_2arg(VAnd, uchar, return _mm256_and_si256(a, b));
434 FUNCTOR_TEMPLATE(VOr);
435 FUNCTOR_CLOSURE_2arg(VOr , uchar, return _mm256_or_si256 (a, b));
436 FUNCTOR_TEMPLATE(VXor);
437 FUNCTOR_CLOSURE_2arg(VXor, uchar, return _mm256_xor_si256(a, b));
438 FUNCTOR_TEMPLATE(VNot);
439 FUNCTOR_CLOSURE_1arg(VNot, uchar, return _mm256_xor_si256(_mm256_set1_epi32(-1), a));
440
441 #elif CV_SSE2
442
443 #define FUNCTOR_LOADSTORE_CAST(name, template_arg, register_type, load_body, store_body)\
444 template <> \
445 struct name<template_arg>{ \
446 typedef register_type reg_type; \
447 static reg_type load(const template_arg * p) { return load_body ((const reg_type *)p); } \
448 static void store(template_arg * p, reg_type v) { store_body ((reg_type *)p, v); } \
449 }
450
451 #define FUNCTOR_LOADSTORE(name, template_arg, register_type, load_body, store_body)\
452 template <> \
453 struct name<template_arg>{ \
454 typedef register_type reg_type; \
455 static reg_type load(const template_arg * p) { return load_body (p); } \
456 static void store(template_arg * p, reg_type v) { store_body (p, v); } \
457 }
458
459 #define FUNCTOR_CLOSURE_2arg(name, template_arg, body)\
460 template<> \
461 struct name<template_arg> \
462 { \
463 VLoadStore128<template_arg>::reg_type operator()( \
464 const VLoadStore128<template_arg>::reg_type & a, \
465 const VLoadStore128<template_arg>::reg_type & b) const \
466 { \
467 body; \
468 } \
469 }
470
471 #define FUNCTOR_CLOSURE_1arg(name, template_arg, body)\
472 template<> \
473 struct name<template_arg> \
474 { \
475 VLoadStore128<template_arg>::reg_type operator()( \
476 const VLoadStore128<template_arg>::reg_type & a, \
477 const VLoadStore128<template_arg>::reg_type & ) const \
478 { \
479 body; \
480 } \
481 }
482
483 FUNCTOR_LOADSTORE_CAST(VLoadStore128, uchar, __m128i, _mm_loadu_si128, _mm_storeu_si128);
484 FUNCTOR_LOADSTORE_CAST(VLoadStore128, schar, __m128i, _mm_loadu_si128, _mm_storeu_si128);
485 FUNCTOR_LOADSTORE_CAST(VLoadStore128, ushort, __m128i, _mm_loadu_si128, _mm_storeu_si128);
486 FUNCTOR_LOADSTORE_CAST(VLoadStore128, short, __m128i, _mm_loadu_si128, _mm_storeu_si128);
487 FUNCTOR_LOADSTORE_CAST(VLoadStore128, int, __m128i, _mm_loadu_si128, _mm_storeu_si128);
488 FUNCTOR_LOADSTORE( VLoadStore128, float, __m128 , _mm_loadu_ps , _mm_storeu_ps );
489 FUNCTOR_LOADSTORE( VLoadStore128, double, __m128d, _mm_loadu_pd , _mm_storeu_pd );
490
491 FUNCTOR_LOADSTORE_CAST(VLoadStore64, uchar, __m128i, _mm_loadl_epi64, _mm_storel_epi64);
492 FUNCTOR_LOADSTORE_CAST(VLoadStore64, schar, __m128i, _mm_loadl_epi64, _mm_storel_epi64);
493 FUNCTOR_LOADSTORE_CAST(VLoadStore64, ushort, __m128i, _mm_loadl_epi64, _mm_storel_epi64);
494 FUNCTOR_LOADSTORE_CAST(VLoadStore64, short, __m128i, _mm_loadl_epi64, _mm_storel_epi64);
495
496 FUNCTOR_LOADSTORE_CAST(VLoadStore128Aligned, int, __m128i, _mm_load_si128, _mm_store_si128);
497 FUNCTOR_LOADSTORE( VLoadStore128Aligned, float, __m128 , _mm_load_ps , _mm_store_ps );
498 FUNCTOR_LOADSTORE( VLoadStore128Aligned, double, __m128d, _mm_load_pd , _mm_store_pd );
499
500 FUNCTOR_TEMPLATE(VAdd);
501 FUNCTOR_CLOSURE_2arg(VAdd, uchar, return _mm_adds_epu8 (a, b));
502 FUNCTOR_CLOSURE_2arg(VAdd, schar, return _mm_adds_epi8 (a, b));
503 FUNCTOR_CLOSURE_2arg(VAdd, ushort, return _mm_adds_epu16(a, b));
504 FUNCTOR_CLOSURE_2arg(VAdd, short, return _mm_adds_epi16(a, b));
505 FUNCTOR_CLOSURE_2arg(VAdd, int, return _mm_add_epi32 (a, b));
506 FUNCTOR_CLOSURE_2arg(VAdd, float, return _mm_add_ps (a, b));
507 FUNCTOR_CLOSURE_2arg(VAdd, double, return _mm_add_pd (a, b));
508
509 FUNCTOR_TEMPLATE(VSub);
510 FUNCTOR_CLOSURE_2arg(VSub, uchar, return _mm_subs_epu8 (a, b));
511 FUNCTOR_CLOSURE_2arg(VSub, schar, return _mm_subs_epi8 (a, b));
512 FUNCTOR_CLOSURE_2arg(VSub, ushort, return _mm_subs_epu16(a, b));
513 FUNCTOR_CLOSURE_2arg(VSub, short, return _mm_subs_epi16(a, b));
514 FUNCTOR_CLOSURE_2arg(VSub, int, return _mm_sub_epi32 (a, b));
515 FUNCTOR_CLOSURE_2arg(VSub, float, return _mm_sub_ps (a, b));
516 FUNCTOR_CLOSURE_2arg(VSub, double, return _mm_sub_pd (a, b));
517
518 FUNCTOR_TEMPLATE(VMin);
519 FUNCTOR_CLOSURE_2arg(VMin, uchar, return _mm_min_epu8(a, b));
520 FUNCTOR_CLOSURE_2arg(VMin, schar,
521 __m128i m = _mm_cmpgt_epi8(a, b);
522 return _mm_xor_si128(a, _mm_and_si128(_mm_xor_si128(a, b), m));
523 );
524 FUNCTOR_CLOSURE_2arg(VMin, ushort, return _mm_subs_epu16(a, _mm_subs_epu16(a, b)));
525 FUNCTOR_CLOSURE_2arg(VMin, short, return _mm_min_epi16(a, b));
526 FUNCTOR_CLOSURE_2arg(VMin, int,
527 __m128i m = _mm_cmpgt_epi32(a, b);
528 return _mm_xor_si128(a, _mm_and_si128(_mm_xor_si128(a, b), m));
529 );
530 FUNCTOR_CLOSURE_2arg(VMin, float, return _mm_min_ps(a, b));
531 FUNCTOR_CLOSURE_2arg(VMin, double, return _mm_min_pd(a, b));
532
533 FUNCTOR_TEMPLATE(VMax);
534 FUNCTOR_CLOSURE_2arg(VMax, uchar, return _mm_max_epu8(a, b));
535 FUNCTOR_CLOSURE_2arg(VMax, schar,
536 __m128i m = _mm_cmpgt_epi8(b, a);
537 return _mm_xor_si128(a, _mm_and_si128(_mm_xor_si128(a, b), m));
538 );
539 FUNCTOR_CLOSURE_2arg(VMax, ushort, return _mm_adds_epu16(_mm_subs_epu16(a, b), b));
540 FUNCTOR_CLOSURE_2arg(VMax, short, return _mm_max_epi16(a, b));
541 FUNCTOR_CLOSURE_2arg(VMax, int,
542 __m128i m = _mm_cmpgt_epi32(b, a);
543 return _mm_xor_si128(a, _mm_and_si128(_mm_xor_si128(a, b), m));
544 );
545 FUNCTOR_CLOSURE_2arg(VMax, float, return _mm_max_ps(a, b));
546 FUNCTOR_CLOSURE_2arg(VMax, double, return _mm_max_pd(a, b));
547
548
549 static unsigned int CV_DECL_ALIGNED(16) v32f_absmask[] = { 0x7fffffff, 0x7fffffff, 0x7fffffff, 0x7fffffff };
550 static unsigned int CV_DECL_ALIGNED(16) v64f_absmask[] = { 0xffffffff, 0x7fffffff, 0xffffffff, 0x7fffffff };
551
552 FUNCTOR_TEMPLATE(VAbsDiff);
553 FUNCTOR_CLOSURE_2arg(VAbsDiff, uchar,
554 return _mm_add_epi8(_mm_subs_epu8(a, b), _mm_subs_epu8(b, a));
555 );
556 FUNCTOR_CLOSURE_2arg(VAbsDiff, schar,
557 __m128i d = _mm_subs_epi8(a, b);
558 __m128i m = _mm_cmpgt_epi8(b, a);
559 return _mm_subs_epi8(_mm_xor_si128(d, m), m);
560 );
561 FUNCTOR_CLOSURE_2arg(VAbsDiff, ushort,
562 return _mm_add_epi16(_mm_subs_epu16(a, b), _mm_subs_epu16(b, a));
563 );
564 FUNCTOR_CLOSURE_2arg(VAbsDiff, short,
565 __m128i M = _mm_max_epi16(a, b);
566 __m128i m = _mm_min_epi16(a, b);
567 return _mm_subs_epi16(M, m);
568 );
569 FUNCTOR_CLOSURE_2arg(VAbsDiff, int,
570 __m128i d = _mm_sub_epi32(a, b);
571 __m128i m = _mm_cmpgt_epi32(b, a);
572 return _mm_sub_epi32(_mm_xor_si128(d, m), m);
573 );
574 FUNCTOR_CLOSURE_2arg(VAbsDiff, float,
575 return _mm_and_ps(_mm_sub_ps(a,b), *(const __m128*)v32f_absmask);
576 );
577 FUNCTOR_CLOSURE_2arg(VAbsDiff, double,
578 return _mm_and_pd(_mm_sub_pd(a,b), *(const __m128d*)v64f_absmask);
579 );
580
581 FUNCTOR_TEMPLATE(VAnd);
582 FUNCTOR_CLOSURE_2arg(VAnd, uchar, return _mm_and_si128(a, b));
583 FUNCTOR_TEMPLATE(VOr);
584 FUNCTOR_CLOSURE_2arg(VOr , uchar, return _mm_or_si128 (a, b));
585 FUNCTOR_TEMPLATE(VXor);
586 FUNCTOR_CLOSURE_2arg(VXor, uchar, return _mm_xor_si128(a, b));
587 FUNCTOR_TEMPLATE(VNot);
588 FUNCTOR_CLOSURE_1arg(VNot, uchar, return _mm_xor_si128(_mm_set1_epi32(-1), a));
589 #endif
590
591 #if CV_NEON
592
593 #define FUNCTOR_LOADSTORE(name, template_arg, register_type, load_body, store_body)\
594 template <> \
595 struct name<template_arg>{ \
596 typedef register_type reg_type; \
597 static reg_type load(const template_arg * p) { return load_body (p);}; \
598 static void store(template_arg * p, reg_type v) { store_body (p, v);}; \
599 }
600
601 #define FUNCTOR_CLOSURE_2arg(name, template_arg, body)\
602 template<> \
603 struct name<template_arg> \
604 { \
605 VLoadStore128<template_arg>::reg_type operator()( \
606 VLoadStore128<template_arg>::reg_type a, \
607 VLoadStore128<template_arg>::reg_type b) const \
608 { \
609 return body; \
610 }; \
611 }
612
613 #define FUNCTOR_CLOSURE_1arg(name, template_arg, body)\
614 template<> \
615 struct name<template_arg> \
616 { \
617 VLoadStore128<template_arg>::reg_type operator()( \
618 VLoadStore128<template_arg>::reg_type a, \
619 VLoadStore128<template_arg>::reg_type ) const \
620 { \
621 return body; \
622 }; \
623 }
624
625 FUNCTOR_LOADSTORE(VLoadStore128, uchar, uint8x16_t, vld1q_u8 , vst1q_u8 );
626 FUNCTOR_LOADSTORE(VLoadStore128, schar, int8x16_t, vld1q_s8 , vst1q_s8 );
627 FUNCTOR_LOADSTORE(VLoadStore128, ushort, uint16x8_t, vld1q_u16, vst1q_u16);
628 FUNCTOR_LOADSTORE(VLoadStore128, short, int16x8_t, vld1q_s16, vst1q_s16);
629 FUNCTOR_LOADSTORE(VLoadStore128, int, int32x4_t, vld1q_s32, vst1q_s32);
630 FUNCTOR_LOADSTORE(VLoadStore128, float, float32x4_t, vld1q_f32, vst1q_f32);
631
632 FUNCTOR_TEMPLATE(VAdd);
633 FUNCTOR_CLOSURE_2arg(VAdd, uchar, vqaddq_u8 (a, b));
634 FUNCTOR_CLOSURE_2arg(VAdd, schar, vqaddq_s8 (a, b));
635 FUNCTOR_CLOSURE_2arg(VAdd, ushort, vqaddq_u16(a, b));
636 FUNCTOR_CLOSURE_2arg(VAdd, short, vqaddq_s16(a, b));
637 FUNCTOR_CLOSURE_2arg(VAdd, int, vaddq_s32 (a, b));
638 FUNCTOR_CLOSURE_2arg(VAdd, float, vaddq_f32 (a, b));
639
640 FUNCTOR_TEMPLATE(VSub);
641 FUNCTOR_CLOSURE_2arg(VSub, uchar, vqsubq_u8 (a, b));
642 FUNCTOR_CLOSURE_2arg(VSub, schar, vqsubq_s8 (a, b));
643 FUNCTOR_CLOSURE_2arg(VSub, ushort, vqsubq_u16(a, b));
644 FUNCTOR_CLOSURE_2arg(VSub, short, vqsubq_s16(a, b));
645 FUNCTOR_CLOSURE_2arg(VSub, int, vsubq_s32 (a, b));
646 FUNCTOR_CLOSURE_2arg(VSub, float, vsubq_f32 (a, b));
647
648 FUNCTOR_TEMPLATE(VMin);
649 FUNCTOR_CLOSURE_2arg(VMin, uchar, vminq_u8 (a, b));
650 FUNCTOR_CLOSURE_2arg(VMin, schar, vminq_s8 (a, b));
651 FUNCTOR_CLOSURE_2arg(VMin, ushort, vminq_u16(a, b));
652 FUNCTOR_CLOSURE_2arg(VMin, short, vminq_s16(a, b));
653 FUNCTOR_CLOSURE_2arg(VMin, int, vminq_s32(a, b));
654 FUNCTOR_CLOSURE_2arg(VMin, float, vminq_f32(a, b));
655
656 FUNCTOR_TEMPLATE(VMax);
657 FUNCTOR_CLOSURE_2arg(VMax, uchar, vmaxq_u8 (a, b));
658 FUNCTOR_CLOSURE_2arg(VMax, schar, vmaxq_s8 (a, b));
659 FUNCTOR_CLOSURE_2arg(VMax, ushort, vmaxq_u16(a, b));
660 FUNCTOR_CLOSURE_2arg(VMax, short, vmaxq_s16(a, b));
661 FUNCTOR_CLOSURE_2arg(VMax, int, vmaxq_s32(a, b));
662 FUNCTOR_CLOSURE_2arg(VMax, float, vmaxq_f32(a, b));
663
664 FUNCTOR_TEMPLATE(VAbsDiff);
665 FUNCTOR_CLOSURE_2arg(VAbsDiff, uchar, vabdq_u8 (a, b));
666 FUNCTOR_CLOSURE_2arg(VAbsDiff, schar, vqabsq_s8 (vqsubq_s8(a, b)));
667 FUNCTOR_CLOSURE_2arg(VAbsDiff, ushort, vabdq_u16 (a, b));
668 FUNCTOR_CLOSURE_2arg(VAbsDiff, short, vqabsq_s16(vqsubq_s16(a, b)));
669 FUNCTOR_CLOSURE_2arg(VAbsDiff, int, vabdq_s32 (a, b));
670 FUNCTOR_CLOSURE_2arg(VAbsDiff, float, vabdq_f32 (a, b));
671
672 FUNCTOR_TEMPLATE(VAnd);
673 FUNCTOR_CLOSURE_2arg(VAnd, uchar, vandq_u8(a, b));
674 FUNCTOR_TEMPLATE(VOr);
675 FUNCTOR_CLOSURE_2arg(VOr , uchar, vorrq_u8(a, b));
676 FUNCTOR_TEMPLATE(VXor);
677 FUNCTOR_CLOSURE_2arg(VXor, uchar, veorq_u8(a, b));
678 FUNCTOR_TEMPLATE(VNot);
679 FUNCTOR_CLOSURE_1arg(VNot, uchar, vmvnq_u8(a ));
680 #endif
681
682 #if CV_SSE2 || CV_NEON
683 #define IF_SIMD(op) op
684 #else
685 #define IF_SIMD(op) NOP
686 #endif
687
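// Scalar fallback specializations: for 8-bit operands the generic OpAdd/OpSub are
// overridden to use CV_FAST_CAST_8U, which clamps the intermediate int result to
// [0, 255] without a full saturate_cast.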
688 template<> inline uchar OpAdd<uchar>::operator ()(uchar a, uchar b) const
689 { return CV_FAST_CAST_8U(a + b); }
690 template<> inline uchar OpSub<uchar>::operator ()(uchar a, uchar b) const
691 { return CV_FAST_CAST_8U(a - b); }
692
693 template<typename T> struct OpAbsDiff
694 {
695 typedef T type1;
696 typedef T type2;
697 typedef T rtype;
698     T operator()(T a, T b) const { return (T)std::abs(a - b); }
699 };
700
701 template<> inline short OpAbsDiff<short>::operator ()(short a, short b) const
702 { return saturate_cast<short>(std::abs(a - b)); }
703
704 template<> inline schar OpAbsDiff<schar>::operator ()(schar a, schar b) const
705 { return saturate_cast<schar>(std::abs(a - b)); }
706
707 template<typename T, typename WT=T> struct OpAbsDiffS
708 {
709 typedef T type1;
710 typedef WT type2;
711 typedef T rtype;
712     T operator()(T a, WT b) const { return saturate_cast<T>(std::abs(a - b)); }
713 };
714
715 template<typename T> struct OpAnd
716 {
717 typedef T type1;
718 typedef T type2;
719 typedef T rtype;
720     T operator()( T a, T b ) const { return a & b; }
721 };
722
723 template<typename T> struct OpOr
724 {
725 typedef T type1;
726 typedef T type2;
727 typedef T rtype;
728     T operator()( T a, T b ) const { return a | b; }
729 };
730
731 template<typename T> struct OpXor
732 {
733 typedef T type1;
734 typedef T type2;
735 typedef T rtype;
736     T operator()( T a, T b ) const { return a ^ b; }
737 };
738
739 template<typename T> struct OpNot
740 {
741 typedef T type1;
742 typedef T type2;
743 typedef T rtype;
744     T operator()( T a, T ) const { return ~a; }
745 };
746
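// The per-depth kernels below (add*, sub*, min*, max*, absdiff*, and8u, or8u, xor8u,
// not8u) all follow the same pattern: when IPP is enabled, try the corresponding
// ippi/ipps primitive first (fixSteps() collapses a single-row image into one
// contiguous row so the IPP call sees a valid step), and on failure fall back to the
// generic vBinOp/vBinOp32/vBinOp64 template. For example, the non-IPP path of add8u()
// is just the direct instantiation
//   vBinOp<uchar, OpAdd<uchar>, IF_SIMD(VAdd<uchar>)>(src1, step1, src2, step2, dst, step, sz);
// where IF_SIMD selects VAdd<uchar> when SSE2/NEON is available and the empty NOP
// functor otherwise.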
747 #if (ARITHM_USE_IPP == 1)
748 static inline void fixSteps(Size sz, size_t elemSize, size_t& step1, size_t& step2, size_t& step)
749 {
750 if( sz.height == 1 )
751 step1 = step2 = step = sz.width*elemSize;
752 }
753 #endif
754
755 static void add8u( const uchar* src1, size_t step1,
756 const uchar* src2, size_t step2,
757 uchar* dst, size_t step, Size sz, void* )
758 {
759 #if (ARITHM_USE_IPP == 1)
760 CV_IPP_CHECK()
761 {
762 fixSteps(sz, sizeof(dst[0]), step1, step2, step);
763 if (0 <= ippiAdd_8u_C1RSfs(src1, (int)step1, src2, (int)step2, dst, (int)step, ippiSize(sz), 0))
764 {
765 CV_IMPL_ADD(CV_IMPL_IPP);
766 return;
767 }
768 setIppErrorStatus();
769 }
770 #endif
771 (vBinOp<uchar, OpAdd<uchar>, IF_SIMD(VAdd<uchar>)>(src1, step1, src2, step2, dst, step, sz));
772 }
773
774 static void add8s( const schar* src1, size_t step1,
775 const schar* src2, size_t step2,
776 schar* dst, size_t step, Size sz, void* )
777 {
778 vBinOp<schar, OpAdd<schar>, IF_SIMD(VAdd<schar>)>(src1, step1, src2, step2, dst, step, sz);
779 }
780
781 static void add16u( const ushort* src1, size_t step1,
782 const ushort* src2, size_t step2,
783 ushort* dst, size_t step, Size sz, void* )
784 {
785 #if (ARITHM_USE_IPP == 1)
786 CV_IPP_CHECK()
787 {
788 fixSteps(sz, sizeof(dst[0]), step1, step2, step);
789 if (0 <= ippiAdd_16u_C1RSfs(src1, (int)step1, src2, (int)step2, dst, (int)step, ippiSize(sz), 0))
790 {
791 CV_IMPL_ADD(CV_IMPL_IPP);
792 return;
793 }
794 setIppErrorStatus();
795 }
796 #endif
797 (vBinOp<ushort, OpAdd<ushort>, IF_SIMD(VAdd<ushort>)>(src1, step1, src2, step2, dst, step, sz));
798 }
799
800 static void add16s( const short* src1, size_t step1,
801 const short* src2, size_t step2,
802 short* dst, size_t step, Size sz, void* )
803 {
804 #if (ARITHM_USE_IPP == 1)
805 CV_IPP_CHECK()
806 {
807 fixSteps(sz, sizeof(dst[0]), step1, step2, step);
808 if (0 <= ippiAdd_16s_C1RSfs(src1, (int)step1, src2, (int)step2, dst, (int)step, ippiSize(sz), 0))
809 {
810 CV_IMPL_ADD(CV_IMPL_IPP);
811 return;
812 }
813 setIppErrorStatus();
814 }
815 #endif
816 (vBinOp<short, OpAdd<short>, IF_SIMD(VAdd<short>)>(src1, step1, src2, step2, dst, step, sz));
817 }
818
819 static void add32s( const int* src1, size_t step1,
820 const int* src2, size_t step2,
821 int* dst, size_t step, Size sz, void* )
822 {
823 vBinOp32<int, OpAdd<int>, IF_SIMD(VAdd<int>)>(src1, step1, src2, step2, dst, step, sz);
824 }
825
826 static void add32f( const float* src1, size_t step1,
827 const float* src2, size_t step2,
828 float* dst, size_t step, Size sz, void* )
829 {
830 #if (ARITHM_USE_IPP == 1)
831 CV_IPP_CHECK()
832 {
833 fixSteps(sz, sizeof(dst[0]), step1, step2, step);
834 if (0 <= ippiAdd_32f_C1R(src1, (int)step1, src2, (int)step2, dst, (int)step, ippiSize(sz)))
835 {
836 CV_IMPL_ADD(CV_IMPL_IPP);
837 return;
838 }
839 setIppErrorStatus();
840 }
841 #endif
842 (vBinOp32<float, OpAdd<float>, IF_SIMD(VAdd<float>)>(src1, step1, src2, step2, dst, step, sz));
843 }
844
845 static void add64f( const double* src1, size_t step1,
846 const double* src2, size_t step2,
847 double* dst, size_t step, Size sz, void* )
848 {
849 vBinOp64<double, OpAdd<double>, IF_SIMD(VAdd<double>)>(src1, step1, src2, step2, dst, step, sz);
850 }
851
852 static void sub8u( const uchar* src1, size_t step1,
853 const uchar* src2, size_t step2,
854 uchar* dst, size_t step, Size sz, void* )
855 {
856 #if (ARITHM_USE_IPP == 1)
857 CV_IPP_CHECK()
858 {
859 fixSteps(sz, sizeof(dst[0]), step1, step2, step);
860 if (0 <= ippiSub_8u_C1RSfs(src2, (int)step2, src1, (int)step1, dst, (int)step, ippiSize(sz), 0))
861 {
862 CV_IMPL_ADD(CV_IMPL_IPP);
863 return;
864 }
865 setIppErrorStatus();
866 }
867 #endif
868 (vBinOp<uchar, OpSub<uchar>, IF_SIMD(VSub<uchar>)>(src1, step1, src2, step2, dst, step, sz));
869 }
870
871 static void sub8s( const schar* src1, size_t step1,
872 const schar* src2, size_t step2,
873 schar* dst, size_t step, Size sz, void* )
874 {
875 vBinOp<schar, OpSub<schar>, IF_SIMD(VSub<schar>)>(src1, step1, src2, step2, dst, step, sz);
876 }
877
878 static void sub16u( const ushort* src1, size_t step1,
879 const ushort* src2, size_t step2,
880 ushort* dst, size_t step, Size sz, void* )
881 {
882 #if (ARITHM_USE_IPP == 1)
883 CV_IPP_CHECK()
884 {
885 fixSteps(sz, sizeof(dst[0]), step1, step2, step);
886 if (0 <= ippiSub_16u_C1RSfs(src2, (int)step2, src1, (int)step1, dst, (int)step, ippiSize(sz), 0))
887 {
888 CV_IMPL_ADD(CV_IMPL_IPP);
889 return;
890 }
891 setIppErrorStatus();
892 }
893 #endif
894 (vBinOp<ushort, OpSub<ushort>, IF_SIMD(VSub<ushort>)>(src1, step1, src2, step2, dst, step, sz));
895 }
896
897 static void sub16s( const short* src1, size_t step1,
898 const short* src2, size_t step2,
899 short* dst, size_t step, Size sz, void* )
900 {
901 #if (ARITHM_USE_IPP == 1)
902 CV_IPP_CHECK()
903 {
904 fixSteps(sz, sizeof(dst[0]), step1, step2, step);
905 if (0 <= ippiSub_16s_C1RSfs(src2, (int)step2, src1, (int)step1, dst, (int)step, ippiSize(sz), 0))
906 {
907 CV_IMPL_ADD(CV_IMPL_IPP);
908 return;
909 }
910 setIppErrorStatus();
911 }
912 #endif
913 (vBinOp<short, OpSub<short>, IF_SIMD(VSub<short>)>(src1, step1, src2, step2, dst, step, sz));
914 }
915
916 static void sub32s( const int* src1, size_t step1,
917 const int* src2, size_t step2,
918 int* dst, size_t step, Size sz, void* )
919 {
920 vBinOp32<int, OpSub<int>, IF_SIMD(VSub<int>)>(src1, step1, src2, step2, dst, step, sz);
921 }
922
923 static void sub32f( const float* src1, size_t step1,
924 const float* src2, size_t step2,
925 float* dst, size_t step, Size sz, void* )
926 {
927 #if (ARITHM_USE_IPP == 1)
928 CV_IPP_CHECK()
929 {
930 fixSteps(sz, sizeof(dst[0]), step1, step2, step);
931 if (0 <= ippiSub_32f_C1R(src2, (int)step2, src1, (int)step1, dst, (int)step, ippiSize(sz)))
932 {
933 CV_IMPL_ADD(CV_IMPL_IPP);
934 return;
935 }
936 setIppErrorStatus();
937 }
938 #endif
939 (vBinOp32<float, OpSub<float>, IF_SIMD(VSub<float>)>(src1, step1, src2, step2, dst, step, sz));
940 }
941
942 static void sub64f( const double* src1, size_t step1,
943 const double* src2, size_t step2,
944 double* dst, size_t step, Size sz, void* )
945 {
946 vBinOp64<double, OpSub<double>, IF_SIMD(VSub<double>)>(src1, step1, src2, step2, dst, step, sz);
947 }
948
949 template<> inline uchar OpMin<uchar>::operator ()(uchar a, uchar b) const { return CV_MIN_8U(a, b); }
950 template<> inline uchar OpMax<uchar>::operator ()(uchar a, uchar b) const { return CV_MAX_8U(a, b); }
951
952 static void max8u( const uchar* src1, size_t step1,
953 const uchar* src2, size_t step2,
954 uchar* dst, size_t step, Size sz, void* )
955 {
956 #if (ARITHM_USE_IPP == 1)
957 CV_IPP_CHECK()
958 {
959 uchar* s1 = (uchar*)src1;
960 uchar* s2 = (uchar*)src2;
961 uchar* d = dst;
962 fixSteps(sz, sizeof(dst[0]), step1, step2, step);
963 int i = 0;
964 for(; i < sz.height; i++)
965 {
966 if (0 > ippsMaxEvery_8u(s1, s2, d, sz.width))
967 break;
968 s1 += step1;
969 s2 += step2;
970 d += step;
971 }
972 if (i == sz.height)
973 {
974 CV_IMPL_ADD(CV_IMPL_IPP);
975 return;
976 }
977 setIppErrorStatus();
978 }
979 #endif
980 vBinOp<uchar, OpMax<uchar>, IF_SIMD(VMax<uchar>)>(src1, step1, src2, step2, dst, step, sz);
981 }
982
983 static void max8s( const schar* src1, size_t step1,
984 const schar* src2, size_t step2,
985 schar* dst, size_t step, Size sz, void* )
986 {
987 vBinOp<schar, OpMax<schar>, IF_SIMD(VMax<schar>)>(src1, step1, src2, step2, dst, step, sz);
988 }
989
990 static void max16u( const ushort* src1, size_t step1,
991 const ushort* src2, size_t step2,
992 ushort* dst, size_t step, Size sz, void* )
993 {
994 #if (ARITHM_USE_IPP == 1)
995 CV_IPP_CHECK()
996 {
997 ushort* s1 = (ushort*)src1;
998 ushort* s2 = (ushort*)src2;
999 ushort* d = dst;
1000 fixSteps(sz, sizeof(dst[0]), step1, step2, step);
1001 int i = 0;
1002 for(; i < sz.height; i++)
1003 {
1004 if (0 > ippsMaxEvery_16u(s1, s2, d, sz.width))
1005 break;
1006 s1 = (ushort*)((uchar*)s1 + step1);
1007 s2 = (ushort*)((uchar*)s2 + step2);
1008 d = (ushort*)((uchar*)d + step);
1009 }
1010 if (i == sz.height)
1011 {
1012 CV_IMPL_ADD(CV_IMPL_IPP);
1013 return;
1014 }
1015 setIppErrorStatus();
1016 }
1017 #endif
1018 vBinOp<ushort, OpMax<ushort>, IF_SIMD(VMax<ushort>)>(src1, step1, src2, step2, dst, step, sz);
1019 }
1020
1021 static void max16s( const short* src1, size_t step1,
1022 const short* src2, size_t step2,
1023 short* dst, size_t step, Size sz, void* )
1024 {
1025 vBinOp<short, OpMax<short>, IF_SIMD(VMax<short>)>(src1, step1, src2, step2, dst, step, sz);
1026 }
1027
1028 static void max32s( const int* src1, size_t step1,
1029 const int* src2, size_t step2,
1030 int* dst, size_t step, Size sz, void* )
1031 {
1032 vBinOp32<int, OpMax<int>, IF_SIMD(VMax<int>)>(src1, step1, src2, step2, dst, step, sz);
1033 }
1034
1035 static void max32f( const float* src1, size_t step1,
1036 const float* src2, size_t step2,
1037 float* dst, size_t step, Size sz, void* )
1038 {
1039 #if (ARITHM_USE_IPP == 1)
1040 CV_IPP_CHECK()
1041 {
1042 float* s1 = (float*)src1;
1043 float* s2 = (float*)src2;
1044 float* d = dst;
1045 fixSteps(sz, sizeof(dst[0]), step1, step2, step);
1046 int i = 0;
1047 for(; i < sz.height; i++)
1048 {
1049 if (0 > ippsMaxEvery_32f(s1, s2, d, sz.width))
1050 break;
1051 s1 = (float*)((uchar*)s1 + step1);
1052 s2 = (float*)((uchar*)s2 + step2);
1053 d = (float*)((uchar*)d + step);
1054 }
1055 if (i == sz.height)
1056 {
1057 CV_IMPL_ADD(CV_IMPL_IPP);
1058 return;
1059 }
1060 setIppErrorStatus();
1061 }
1062 #endif
1063 vBinOp32<float, OpMax<float>, IF_SIMD(VMax<float>)>(src1, step1, src2, step2, dst, step, sz);
1064 }
1065
1066 static void max64f( const double* src1, size_t step1,
1067 const double* src2, size_t step2,
1068 double* dst, size_t step, Size sz, void* )
1069 {
1070 #if ARITHM_USE_IPP == 1
1071 CV_IPP_CHECK()
1072 {
1073 double* s1 = (double*)src1;
1074 double* s2 = (double*)src2;
1075 double* d = dst;
1076 fixSteps(sz, sizeof(dst[0]), step1, step2, step);
1077 int i = 0;
1078 for(; i < sz.height; i++)
1079 {
1080 if (0 > ippsMaxEvery_64f(s1, s2, d, sz.width))
1081 break;
1082 s1 = (double*)((uchar*)s1 + step1);
1083 s2 = (double*)((uchar*)s2 + step2);
1084 d = (double*)((uchar*)d + step);
1085 }
1086 if (i == sz.height)
1087 {
1088 CV_IMPL_ADD(CV_IMPL_IPP);
1089 return;
1090 }
1091 setIppErrorStatus();
1092 }
1093 #endif
1094 vBinOp64<double, OpMax<double>, IF_SIMD(VMax<double>)>(src1, step1, src2, step2, dst, step, sz);
1095 }
1096
1097 static void min8u( const uchar* src1, size_t step1,
1098 const uchar* src2, size_t step2,
1099 uchar* dst, size_t step, Size sz, void* )
1100 {
1101 #if (ARITHM_USE_IPP == 1)
1102 CV_IPP_CHECK()
1103 {
1104 uchar* s1 = (uchar*)src1;
1105 uchar* s2 = (uchar*)src2;
1106 uchar* d = dst;
1107 fixSteps(sz, sizeof(dst[0]), step1, step2, step);
1108 int i = 0;
1109 for(; i < sz.height; i++)
1110 {
1111 if (0 > ippsMinEvery_8u(s1, s2, d, sz.width))
1112 break;
1113 s1 += step1;
1114 s2 += step2;
1115 d += step;
1116 }
1117 if (i == sz.height)
1118 {
1119 CV_IMPL_ADD(CV_IMPL_IPP);
1120 return;
1121 }
1122 setIppErrorStatus();
1123 }
1124 #endif
1125 vBinOp<uchar, OpMin<uchar>, IF_SIMD(VMin<uchar>)>(src1, step1, src2, step2, dst, step, sz);
1126 }
1127
1128 static void min8s( const schar* src1, size_t step1,
1129 const schar* src2, size_t step2,
1130 schar* dst, size_t step, Size sz, void* )
1131 {
1132 vBinOp<schar, OpMin<schar>, IF_SIMD(VMin<schar>)>(src1, step1, src2, step2, dst, step, sz);
1133 }
1134
1135 static void min16u( const ushort* src1, size_t step1,
1136 const ushort* src2, size_t step2,
1137 ushort* dst, size_t step, Size sz, void* )
1138 {
1139 #if (ARITHM_USE_IPP == 1)
1140 CV_IPP_CHECK()
1141 {
1142 ushort* s1 = (ushort*)src1;
1143 ushort* s2 = (ushort*)src2;
1144 ushort* d = dst;
1145 fixSteps(sz, sizeof(dst[0]), step1, step2, step);
1146 int i = 0;
1147 for(; i < sz.height; i++)
1148 {
1149 if (0 > ippsMinEvery_16u(s1, s2, d, sz.width))
1150 break;
1151 s1 = (ushort*)((uchar*)s1 + step1);
1152 s2 = (ushort*)((uchar*)s2 + step2);
1153 d = (ushort*)((uchar*)d + step);
1154 }
1155 if (i == sz.height)
1156 {
1157 CV_IMPL_ADD(CV_IMPL_IPP);
1158 return;
1159 }
1160 setIppErrorStatus();
1161 }
1162 #endif
1163 vBinOp<ushort, OpMin<ushort>, IF_SIMD(VMin<ushort>)>(src1, step1, src2, step2, dst, step, sz);
1164 }
1165
1166 static void min16s( const short* src1, size_t step1,
1167 const short* src2, size_t step2,
1168 short* dst, size_t step, Size sz, void* )
1169 {
1170 vBinOp<short, OpMin<short>, IF_SIMD(VMin<short>)>(src1, step1, src2, step2, dst, step, sz);
1171 }
1172
1173 static void min32s( const int* src1, size_t step1,
1174 const int* src2, size_t step2,
1175 int* dst, size_t step, Size sz, void* )
1176 {
1177 vBinOp32<int, OpMin<int>, IF_SIMD(VMin<int>)>(src1, step1, src2, step2, dst, step, sz);
1178 }
1179
1180 static void min32f( const float* src1, size_t step1,
1181 const float* src2, size_t step2,
1182 float* dst, size_t step, Size sz, void* )
1183 {
1184 #if (ARITHM_USE_IPP == 1)
1185 CV_IPP_CHECK()
1186 {
1187 float* s1 = (float*)src1;
1188 float* s2 = (float*)src2;
1189 float* d = dst;
1190 fixSteps(sz, sizeof(dst[0]), step1, step2, step);
1191 int i = 0;
1192 for(; i < sz.height; i++)
1193 {
1194 if (0 > ippsMinEvery_32f(s1, s2, d, sz.width))
1195 break;
1196 s1 = (float*)((uchar*)s1 + step1);
1197 s2 = (float*)((uchar*)s2 + step2);
1198 d = (float*)((uchar*)d + step);
1199 }
1200 if (i == sz.height)
1201 {
1202 CV_IMPL_ADD(CV_IMPL_IPP);
1203 return;
1204 }
1205 setIppErrorStatus();
1206 }
1207 #endif
1208 vBinOp32<float, OpMin<float>, IF_SIMD(VMin<float>)>(src1, step1, src2, step2, dst, step, sz);
1209 }
1210
1211 static void min64f( const double* src1, size_t step1,
1212 const double* src2, size_t step2,
1213 double* dst, size_t step, Size sz, void* )
1214 {
1215 #if ARITHM_USE_IPP == 1
1216 CV_IPP_CHECK()
1217 {
1218 double* s1 = (double*)src1;
1219 double* s2 = (double*)src2;
1220 double* d = dst;
1221 fixSteps(sz, sizeof(dst[0]), step1, step2, step);
1222 int i = 0;
1223 for(; i < sz.height; i++)
1224 {
1225 if (0 > ippsMinEvery_64f(s1, s2, d, sz.width))
1226 break;
1227 s1 = (double*)((uchar*)s1 + step1);
1228 s2 = (double*)((uchar*)s2 + step2);
1229 d = (double*)((uchar*)d + step);
1230 }
1231 if (i == sz.height)
1232 {
1233 CV_IMPL_ADD(CV_IMPL_IPP);
1234 return;
1235 }
1236 setIppErrorStatus();
1237 }
1238 #endif
1239 vBinOp64<double, OpMin<double>, IF_SIMD(VMin<double>)>(src1, step1, src2, step2, dst, step, sz);
1240 }
1241
1242 static void absdiff8u( const uchar* src1, size_t step1,
1243 const uchar* src2, size_t step2,
1244 uchar* dst, size_t step, Size sz, void* )
1245 {
1246 #if (ARITHM_USE_IPP == 1)
1247 CV_IPP_CHECK()
1248 {
1249 fixSteps(sz, sizeof(dst[0]), step1, step2, step);
1250 if (0 <= ippiAbsDiff_8u_C1R(src1, (int)step1, src2, (int)step2, dst, (int)step, ippiSize(sz)))
1251 {
1252 CV_IMPL_ADD(CV_IMPL_IPP);
1253 return;
1254 }
1255 setIppErrorStatus();
1256 }
1257 #endif
1258 (vBinOp<uchar, OpAbsDiff<uchar>, IF_SIMD(VAbsDiff<uchar>)>(src1, step1, src2, step2, dst, step, sz));
1259 }
1260
1261 static void absdiff8s( const schar* src1, size_t step1,
1262 const schar* src2, size_t step2,
1263 schar* dst, size_t step, Size sz, void* )
1264 {
1265 vBinOp<schar, OpAbsDiff<schar>, IF_SIMD(VAbsDiff<schar>)>(src1, step1, src2, step2, dst, step, sz);
1266 }
1267
1268 static void absdiff16u( const ushort* src1, size_t step1,
1269 const ushort* src2, size_t step2,
1270 ushort* dst, size_t step, Size sz, void* )
1271 {
1272 #if (ARITHM_USE_IPP == 1)
1273 CV_IPP_CHECK()
1274 {
1275 fixSteps(sz, sizeof(dst[0]), step1, step2, step);
1276 if (0 <= ippiAbsDiff_16u_C1R(src1, (int)step1, src2, (int)step2, dst, (int)step, ippiSize(sz)))
1277 {
1278 CV_IMPL_ADD(CV_IMPL_IPP);
1279 return;
1280 }
1281 setIppErrorStatus();
1282 }
1283 #endif
1284 (vBinOp<ushort, OpAbsDiff<ushort>, IF_SIMD(VAbsDiff<ushort>)>(src1, step1, src2, step2, dst, step, sz));
1285 }
1286
1287 static void absdiff16s( const short* src1, size_t step1,
1288 const short* src2, size_t step2,
1289 short* dst, size_t step, Size sz, void* )
1290 {
1291 vBinOp<short, OpAbsDiff<short>, IF_SIMD(VAbsDiff<short>)>(src1, step1, src2, step2, dst, step, sz);
1292 }
1293
1294 static void absdiff32s( const int* src1, size_t step1,
1295 const int* src2, size_t step2,
1296 int* dst, size_t step, Size sz, void* )
1297 {
1298 vBinOp32<int, OpAbsDiff<int>, IF_SIMD(VAbsDiff<int>)>(src1, step1, src2, step2, dst, step, sz);
1299 }
1300
1301 static void absdiff32f( const float* src1, size_t step1,
1302 const float* src2, size_t step2,
1303 float* dst, size_t step, Size sz, void* )
1304 {
1305 #if (ARITHM_USE_IPP == 1)
1306 CV_IPP_CHECK()
1307 {
1308 fixSteps(sz, sizeof(dst[0]), step1, step2, step);
1309 if (0 <= ippiAbsDiff_32f_C1R(src1, (int)step1, src2, (int)step2, dst, (int)step, ippiSize(sz)))
1310 {
1311 CV_IMPL_ADD(CV_IMPL_IPP);
1312 return;
1313 }
1314 setIppErrorStatus();
1315 }
1316 #endif
1317 (vBinOp32<float, OpAbsDiff<float>, IF_SIMD(VAbsDiff<float>)>(src1, step1, src2, step2, dst, step, sz));
1318 }
1319
1320 static void absdiff64f( const double* src1, size_t step1,
1321 const double* src2, size_t step2,
1322 double* dst, size_t step, Size sz, void* )
1323 {
1324 vBinOp64<double, OpAbsDiff<double>, IF_SIMD(VAbsDiff<double>)>(src1, step1, src2, step2, dst, step, sz);
1325 }
1326
1327
1328 static void and8u( const uchar* src1, size_t step1,
1329 const uchar* src2, size_t step2,
1330 uchar* dst, size_t step, Size sz, void* )
1331 {
1332 #if (ARITHM_USE_IPP == 1)
1333 CV_IPP_CHECK()
1334 {
1335 fixSteps(sz, sizeof(dst[0]), step1, step2, step);
1336 if (0 <= ippiAnd_8u_C1R(src1, (int)step1, src2, (int)step2, dst, (int)step, ippiSize(sz)))
1337 {
1338 CV_IMPL_ADD(CV_IMPL_IPP);
1339 return;
1340 }
1341 setIppErrorStatus();
1342 }
1343 #endif
1344 (vBinOp<uchar, OpAnd<uchar>, IF_SIMD(VAnd<uchar>)>(src1, step1, src2, step2, dst, step, sz));
1345 }
1346
1347 static void or8u( const uchar* src1, size_t step1,
1348 const uchar* src2, size_t step2,
1349 uchar* dst, size_t step, Size sz, void* )
1350 {
1351 #if (ARITHM_USE_IPP == 1)
1352 CV_IPP_CHECK()
1353 {
1354 fixSteps(sz, sizeof(dst[0]), step1, step2, step);
1355 if (0 <= ippiOr_8u_C1R(src1, (int)step1, src2, (int)step2, dst, (int)step, ippiSize(sz)))
1356 {
1357 CV_IMPL_ADD(CV_IMPL_IPP);
1358 return;
1359 }
1360 setIppErrorStatus();
1361 }
1362 #endif
1363 (vBinOp<uchar, OpOr<uchar>, IF_SIMD(VOr<uchar>)>(src1, step1, src2, step2, dst, step, sz));
1364 }
1365
1366 static void xor8u( const uchar* src1, size_t step1,
1367 const uchar* src2, size_t step2,
1368 uchar* dst, size_t step, Size sz, void* )
1369 {
1370 #if (ARITHM_USE_IPP == 1)
1371 CV_IPP_CHECK()
1372 {
1373 fixSteps(sz, sizeof(dst[0]), step1, step2, step);
1374 if (0 <= ippiXor_8u_C1R(src1, (int)step1, src2, (int)step2, dst, (int)step, ippiSize(sz)))
1375 {
1376 CV_IMPL_ADD(CV_IMPL_IPP);
1377 return;
1378 }
1379 setIppErrorStatus();
1380 }
1381 #endif
1382 (vBinOp<uchar, OpXor<uchar>, IF_SIMD(VXor<uchar>)>(src1, step1, src2, step2, dst, step, sz));
1383 }
1384
1385 static void not8u( const uchar* src1, size_t step1,
1386 const uchar* src2, size_t step2,
1387 uchar* dst, size_t step, Size sz, void* )
1388 {
1389 #if (ARITHM_USE_IPP == 1)
1390 CV_IPP_CHECK()
1391 {
1392 fixSteps(sz, sizeof(dst[0]), step1, step2, step); (void)src2;
1393 if (0 <= ippiNot_8u_C1R(src1, (int)step1, dst, (int)step, ippiSize(sz)))
1394 {
1395 CV_IMPL_ADD(CV_IMPL_IPP);
1396 return;
1397 }
1398 setIppErrorStatus();
1399 }
1400 #endif
1401 (vBinOp<uchar, OpNot<uchar>, IF_SIMD(VNot<uchar>)>(src1, step1, src2, step2, dst, step, sz));
1402 }
1403
1404 /****************************************************************************************\
1405 * logical operations *
1406 \****************************************************************************************/
1407
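// Converts a scalar (stored as a small Mat) to the element type of 'buftype' and
// replicates it so that scbuf holds 'blocksize' unrolled elements; if the scalar has
// fewer channels than the destination, the single channel is broadcast across all
// channels first.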
1408 void convertAndUnrollScalar( const Mat& sc, int buftype, uchar* scbuf, size_t blocksize )
1409 {
1410 int scn = (int)sc.total(), cn = CV_MAT_CN(buftype);
1411 size_t esz = CV_ELEM_SIZE(buftype);
1412 getConvertFunc(sc.depth(), buftype)(sc.ptr(), 1, 0, 1, scbuf, 1, Size(std::min(cn, scn), 1), 0);
1413 // unroll the scalar
1414 if( scn < cn )
1415 {
1416 CV_Assert( scn == 1 );
1417 size_t esz1 = CV_ELEM_SIZE1(buftype);
1418 for( size_t i = esz1; i < esz; i++ )
1419 scbuf[i] = scbuf[i - esz1];
1420 }
1421 for( size_t i = esz; i < blocksize*esz; i++ )
1422 scbuf[i] = scbuf[i - esz];
1423 }
1424
1425
1426 enum { OCL_OP_ADD=0, OCL_OP_SUB=1, OCL_OP_RSUB=2, OCL_OP_ABSDIFF=3, OCL_OP_MUL=4,
1427 OCL_OP_MUL_SCALE=5, OCL_OP_DIV_SCALE=6, OCL_OP_RECIP_SCALE=7, OCL_OP_ADDW=8,
1428 OCL_OP_AND=9, OCL_OP_OR=10, OCL_OP_XOR=11, OCL_OP_NOT=12, OCL_OP_MIN=13, OCL_OP_MAX=14,
1429 OCL_OP_RDIV_SCALE=15 };
1430
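// Note: the order of the strings in oclop2str below must match the OCL_OP_* enum
// above; the selected string is passed to the OpenCL kernel as a -D define.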
1431 #ifdef HAVE_OPENCL
1432
1433 static const char* oclop2str[] = { "OP_ADD", "OP_SUB", "OP_RSUB", "OP_ABSDIFF",
1434 "OP_MUL", "OP_MUL_SCALE", "OP_DIV_SCALE", "OP_RECIP_SCALE",
1435 "OP_ADDW", "OP_AND", "OP_OR", "OP_XOR", "OP_NOT", "OP_MIN", "OP_MAX", "OP_RDIV_SCALE", 0 };
1436
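// OpenCL path: builds the generic "KF" kernel from the arithm OpenCL source with the
// operation, destination type, channel count and rows-per-work-item selected through
// the build options assembled below, then binds either (src1, src2[, mask], dst) or
// (src1[, mask], dst, scalar) as kernel arguments.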
1437 static bool ocl_binary_op(InputArray _src1, InputArray _src2, OutputArray _dst,
1438 InputArray _mask, bool bitwise, int oclop, bool haveScalar )
1439 {
1440 bool haveMask = !_mask.empty();
1441 int srctype = _src1.type();
1442 int srcdepth = CV_MAT_DEPTH(srctype);
1443 int cn = CV_MAT_CN(srctype);
1444
1445 const ocl::Device d = ocl::Device::getDefault();
1446 bool doubleSupport = d.doubleFPConfig() > 0;
1447 if( oclop < 0 || ((haveMask || haveScalar) && cn > 4) ||
1448 (!doubleSupport && srcdepth == CV_64F && !bitwise))
1449 return false;
1450
1451 char opts[1024];
1452 int kercn = haveMask || haveScalar ? cn : ocl::predictOptimalVectorWidth(_src1, _src2, _dst);
1453 int scalarcn = kercn == 3 ? 4 : kercn;
1454 int rowsPerWI = d.isIntel() ? 4 : 1;
1455
1456 sprintf(opts, "-D %s%s -D %s -D dstT=%s%s -D dstT_C1=%s -D workST=%s -D cn=%d -D rowsPerWI=%d",
1457 haveMask ? "MASK_" : "", haveScalar ? "UNARY_OP" : "BINARY_OP", oclop2str[oclop],
1458 bitwise ? ocl::memopTypeToStr(CV_MAKETYPE(srcdepth, kercn)) :
1459 ocl::typeToStr(CV_MAKETYPE(srcdepth, kercn)), doubleSupport ? " -D DOUBLE_SUPPORT" : "",
1460 bitwise ? ocl::memopTypeToStr(CV_MAKETYPE(srcdepth, 1)) :
1461 ocl::typeToStr(CV_MAKETYPE(srcdepth, 1)),
1462 bitwise ? ocl::memopTypeToStr(CV_MAKETYPE(srcdepth, scalarcn)) :
1463 ocl::typeToStr(CV_MAKETYPE(srcdepth, scalarcn)),
1464 kercn, rowsPerWI);
1465
1466 ocl::Kernel k("KF", ocl::core::arithm_oclsrc, opts);
1467 if (k.empty())
1468 return false;
1469
1470 UMat src1 = _src1.getUMat(), src2;
1471 UMat dst = _dst.getUMat(), mask = _mask.getUMat();
1472
1473 ocl::KernelArg src1arg = ocl::KernelArg::ReadOnlyNoSize(src1, cn, kercn);
1474 ocl::KernelArg dstarg = haveMask ? ocl::KernelArg::ReadWrite(dst, cn, kercn) :
1475 ocl::KernelArg::WriteOnly(dst, cn, kercn);
1476 ocl::KernelArg maskarg = ocl::KernelArg::ReadOnlyNoSize(mask, 1);
1477
1478 if( haveScalar )
1479 {
1480 size_t esz = CV_ELEM_SIZE1(srctype)*scalarcn;
1481 double buf[4] = {0,0,0,0};
1482
1483 if( oclop != OCL_OP_NOT )
1484 {
1485 Mat src2sc = _src2.getMat();
1486 convertAndUnrollScalar(src2sc, srctype, (uchar*)buf, 1);
1487 }
1488
1489 ocl::KernelArg scalararg = ocl::KernelArg(0, 0, 0, 0, buf, esz);
1490
1491 if( !haveMask )
1492 k.args(src1arg, dstarg, scalararg);
1493 else
1494 k.args(src1arg, maskarg, dstarg, scalararg);
1495 }
1496 else
1497 {
1498 src2 = _src2.getUMat();
1499 ocl::KernelArg src2arg = ocl::KernelArg::ReadOnlyNoSize(src2, cn, kercn);
1500
1501 if( !haveMask )
1502 k.args(src1arg, src2arg, dstarg);
1503 else
1504 k.args(src1arg, src2arg, maskarg, dstarg);
1505 }
1506
1507 size_t globalsize[] = { src1.cols * cn / kercn, (src1.rows + rowsPerWI - 1) / rowsPerWI };
1508 return k.run(2, globalsize, 0, false);
1509 }
1510
1511 #endif
1512
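// Generic dispatcher behind the bitwise and min/max operations: it first tries a fast
// path for two arrays of identical size/type without a mask (a single call over the
// continuous data), otherwise it decides whether one operand is a scalar, validates
// the mask, allocates the destination and processes the data block by block via
// NAryMatIterator. The callers pick a per-depth entry from 'tab' (or, for bitwise
// ops, a single function that works on raw bytes with the element size folded into
// the width).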
1513 static void binary_op( InputArray _src1, InputArray _src2, OutputArray _dst,
1514 InputArray _mask, const BinaryFunc* tab,
1515 bool bitwise, int oclop )
1516 {
1517 const _InputArray *psrc1 = &_src1, *psrc2 = &_src2;
1518 int kind1 = psrc1->kind(), kind2 = psrc2->kind();
1519 int type1 = psrc1->type(), depth1 = CV_MAT_DEPTH(type1), cn = CV_MAT_CN(type1);
1520 int type2 = psrc2->type(), depth2 = CV_MAT_DEPTH(type2), cn2 = CV_MAT_CN(type2);
1521 int dims1 = psrc1->dims(), dims2 = psrc2->dims();
1522 Size sz1 = dims1 <= 2 ? psrc1->size() : Size();
1523 Size sz2 = dims2 <= 2 ? psrc2->size() : Size();
1524 #ifdef HAVE_OPENCL
1525 bool use_opencl = (kind1 == _InputArray::UMAT || kind2 == _InputArray::UMAT) &&
1526 dims1 <= 2 && dims2 <= 2;
1527 #endif
1528 bool haveMask = !_mask.empty(), haveScalar = false;
1529 BinaryFunc func;
1530
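// fast path: equally sized arrays of the same type with no mask can be processed
// as one continuous block (falls through to the general path if the total length overflows int)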
1531 if( dims1 <= 2 && dims2 <= 2 && kind1 == kind2 && sz1 == sz2 && type1 == type2 && !haveMask )
1532 {
1533 _dst.create(sz1, type1);
1534 CV_OCL_RUN(use_opencl,
1535 ocl_binary_op(*psrc1, *psrc2, _dst, _mask, bitwise, oclop, false))
1536
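// bitwise ops ignore the element type and work on raw bytes, so a single 8-bit
// function is used and 'cn' is re-purposed to hold the full element size in bytes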
1537 if( bitwise )
1538 {
1539 func = *tab;
1540 cn = (int)CV_ELEM_SIZE(type1);
1541 }
1542 else
1543 func = tab[depth1];
1544
1545 Mat src1 = psrc1->getMat(), src2 = psrc2->getMat(), dst = _dst.getMat();
1546 Size sz = getContinuousSize(src1, src2, dst);
1547 size_t len = sz.width*(size_t)cn;
1548 if( len == (size_t)(int)len )
1549 {
1550 sz.width = (int)len;
1551 func(src1.ptr(), src1.step, src2.ptr(), src2.step, dst.ptr(), dst.step, sz, 0);
1552 return;
1553 }
1554 }
1555
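// bitwise NOT is unary: route it through the "scalar" branch below
// (the low-level NOT functions ignore their second operand)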
1556 if( oclop == OCL_OP_NOT )
1557 haveScalar = true;
1558 else if( (kind1 == _InputArray::MATX) + (kind2 == _InputArray::MATX) == 1 ||
1559 !psrc1->sameSize(*psrc2) || type1 != type2 )
1560 {
1561 if( checkScalar(*psrc1, type2, kind1, kind2) )
1562 {
1563 // src1 is a scalar; swap it with src2
1564 swap(psrc1, psrc2);
1565 swap(type1, type2);
1566 swap(depth1, depth2);
1567 swap(cn, cn2);
1568 swap(sz1, sz2);
1569 }
1570 else if( !checkScalar(*psrc2, type1, kind2, kind1) )
1571 CV_Error( CV_StsUnmatchedSizes,
1572 "The operation is neither 'array op array' (where arrays have the same size and type), "
1573 "nor 'array op scalar', nor 'scalar op array'" );
1574 haveScalar = true;
1575 }
1576 else
1577 {
1578 CV_Assert( psrc1->sameSize(*psrc2) && type1 == type2 );
1579 }
1580
1581 size_t esz = CV_ELEM_SIZE(type1);
1582 size_t blocksize0 = (BLOCK_SIZE + esz-1)/esz;
1583 BinaryFunc copymask = 0;
1584 bool reallocate = false;
1585
1586 if( haveMask )
1587 {
1588 int mtype = _mask.type();
1589 CV_Assert( (mtype == CV_8U || mtype == CV_8S) && _mask.sameSize(*psrc1));
1590 copymask = getCopyMaskFunc(esz);
1591 reallocate = !_dst.sameSize(*psrc1) || _dst.type() != type1;
1592 }
1593
1594 AutoBuffer<uchar> _buf;
1595 uchar *scbuf = 0, *maskbuf = 0;
1596
1597 _dst.createSameSize(*psrc1, type1);
1598 // if this is a mask operation and dst has been reallocated,
1599 // we have to clear the destination
1600 if( haveMask && reallocate )
1601 _dst.setTo(0.);
1602
1603 CV_OCL_RUN(use_opencl,
1604 ocl_binary_op(*psrc1, *psrc2, _dst, _mask, bitwise, oclop, haveScalar))
1605
1606
1607 Mat src1 = psrc1->getMat(), src2 = psrc2->getMat();
1608 Mat dst = _dst.getMat(), mask = _mask.getMat();
1609
1610 if( bitwise )
1611 {
1612 func = *tab;
1613 cn = (int)esz;
1614 }
1615 else
1616 func = tab[depth1];
1617
1618 if( !haveScalar )
1619 {
1620 const Mat* arrays[] = { &src1, &src2, &dst, &mask, 0 };
1621 uchar* ptrs[4];
1622
1623 NAryMatIterator it(arrays, ptrs);
1624 size_t total = it.size, blocksize = total;
1625
1626 if( blocksize*cn > INT_MAX )
1627 blocksize = INT_MAX/cn;
1628
1629 if( haveMask )
1630 {
1631 blocksize = std::min(blocksize, blocksize0);
1632 _buf.allocate(blocksize*esz);
1633 maskbuf = _buf;
1634 }
1635
1636 for( size_t i = 0; i < it.nplanes; i++, ++it )
1637 {
1638 for( size_t j = 0; j < total; j += blocksize )
1639 {
1640 int bsz = (int)MIN(total - j, blocksize);
1641
1642 func( ptrs[0], 0, ptrs[1], 0, haveMask ? maskbuf : ptrs[2], 0, Size(bsz*cn, 1), 0 );
1643 if( haveMask )
1644 {
1645 copymask( maskbuf, 0, ptrs[3], 0, ptrs[2], 0, Size(bsz, 1), &esz );
1646 ptrs[3] += bsz;
1647 }
1648
1649 bsz *= (int)esz;
1650 ptrs[0] += bsz; ptrs[1] += bsz; ptrs[2] += bsz;
1651 }
1652 }
1653 }
1654 else
1655 {
1656 const Mat* arrays[] = { &src1, &dst, &mask, 0 };
1657 uchar* ptrs[3];
1658
1659 NAryMatIterator it(arrays, ptrs);
1660 size_t total = it.size, blocksize = std::min(total, blocksize0);
1661
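// temporary buffer layout: unrolled scalar values first, then (for masked ops)
// a 16-byte aligned staging buffer for the unmasked result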
1662 _buf.allocate(blocksize*(haveMask ? 2 : 1)*esz + 32);
1663 scbuf = _buf;
1664 maskbuf = alignPtr(scbuf + blocksize*esz, 16);
1665
1666 convertAndUnrollScalar( src2, src1.type(), scbuf, blocksize);
1667
1668 for( size_t i = 0; i < it.nplanes; i++, ++it )
1669 {
1670 for( size_t j = 0; j < total; j += blocksize )
1671 {
1672 int bsz = (int)MIN(total - j, blocksize);
1673
1674 func( ptrs[0], 0, scbuf, 0, haveMask ? maskbuf : ptrs[1], 0, Size(bsz*cn, 1), 0 );
1675 if( haveMask )
1676 {
1677 copymask( maskbuf, 0, ptrs[2], 0, ptrs[1], 0, Size(bsz, 1), &esz );
1678 ptrs[2] += bsz;
1679 }
1680
1681 bsz *= (int)esz;
1682 ptrs[0] += bsz; ptrs[1] += bsz;
1683 }
1684 }
1685 }
1686 }
1687
1688 static BinaryFunc* getMaxTab()
1689 {
1690 static BinaryFunc maxTab[] =
1691 {
1692 (BinaryFunc)GET_OPTIMIZED(max8u), (BinaryFunc)GET_OPTIMIZED(max8s),
1693 (BinaryFunc)GET_OPTIMIZED(max16u), (BinaryFunc)GET_OPTIMIZED(max16s),
1694 (BinaryFunc)GET_OPTIMIZED(max32s),
1695 (BinaryFunc)GET_OPTIMIZED(max32f), (BinaryFunc)max64f,
1696 0
1697 };
1698
1699 return maxTab;
1700 }
1701
1702 static BinaryFunc* getMinTab()
1703 {
1704 static BinaryFunc minTab[] =
1705 {
1706 (BinaryFunc)GET_OPTIMIZED(min8u), (BinaryFunc)GET_OPTIMIZED(min8s),
1707 (BinaryFunc)GET_OPTIMIZED(min16u), (BinaryFunc)GET_OPTIMIZED(min16s),
1708 (BinaryFunc)GET_OPTIMIZED(min32s),
1709 (BinaryFunc)GET_OPTIMIZED(min32f), (BinaryFunc)min64f,
1710 0
1711 };
1712
1713 return minTab;
1714 }
1715
1716 }
1717
1718 void cv::bitwise_and(InputArray a, InputArray b, OutputArray c, InputArray mask)
1719 {
1720 BinaryFunc f = (BinaryFunc)GET_OPTIMIZED(and8u);
1721 binary_op(a, b, c, mask, &f, true, OCL_OP_AND);
1722 }
1723
1724 void cv::bitwise_or(InputArray a, InputArray b, OutputArray c, InputArray mask)
1725 {
1726 BinaryFunc f = (BinaryFunc)GET_OPTIMIZED(or8u);
1727 binary_op(a, b, c, mask, &f, true, OCL_OP_OR);
1728 }
1729
1730 void cv::bitwise_xor(InputArray a, InputArray b, OutputArray c, InputArray mask)
1731 {
1732 BinaryFunc f = (BinaryFunc)GET_OPTIMIZED(xor8u);
1733 binary_op(a, b, c, mask, &f, true, OCL_OP_XOR);
1734 }
1735
1736 void cv::bitwise_not(InputArray a, OutputArray c, InputArray mask)
1737 {
1738 BinaryFunc f = (BinaryFunc)GET_OPTIMIZED(not8u);
1739 binary_op(a, a, c, mask, &f, true, OCL_OP_NOT);
1740 }
1741
1742 void cv::max( InputArray src1, InputArray src2, OutputArray dst )
1743 {
1744 binary_op(src1, src2, dst, noArray(), getMaxTab(), false, OCL_OP_MAX );
1745 }
1746
1747 void cv::min( InputArray src1, InputArray src2, OutputArray dst )
1748 {
1749 binary_op(src1, src2, dst, noArray(), getMinTab(), false, OCL_OP_MIN );
1750 }
1751
1752 void cv::max(const Mat& src1, const Mat& src2, Mat& dst)
1753 {
1754 OutputArray _dst(dst);
1755 binary_op(src1, src2, _dst, noArray(), getMaxTab(), false, OCL_OP_MAX );
1756 }
1757
1758 void cv::min(const Mat& src1, const Mat& src2, Mat& dst)
1759 {
1760 OutputArray _dst(dst);
1761 binary_op(src1, src2, _dst, noArray(), getMinTab(), false, OCL_OP_MIN );
1762 }
1763
1764 void cv::max(const UMat& src1, const UMat& src2, UMat& dst)
1765 {
1766 OutputArray _dst(dst);
1767 binary_op(src1, src2, _dst, noArray(), getMaxTab(), false, OCL_OP_MAX );
1768 }
1769
1770 void cv::min(const UMat& src1, const UMat& src2, UMat& dst)
1771 {
1772 OutputArray _dst(dst);
1773 binary_op(src1, src2, _dst, noArray(), getMinTab(), false, OCL_OP_MIN );
1774 }
1775
1776
1777 /****************************************************************************************\
1778 * add/subtract *
1779 \****************************************************************************************/
1780
1781 namespace cv
1782 {
1783
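// smallest depth that can represent all scalar values exactly;
// falls back to CV_64F as soon as a value is not an integer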
1784 static int actualScalarDepth(const double* data, int len)
1785 {
1786 int i = 0, minval = INT_MAX, maxval = INT_MIN;
1787 for(; i < len; ++i)
1788 {
1789 int ival = cvRound(data[i]);
1790 if( ival != data[i] )
1791 break;
1792 minval = MIN(minval, ival);
1793 maxval = MAX(maxval, ival);
1794 }
1795 return i < len ? CV_64F :
1796 minval >= 0 && maxval <= (int)UCHAR_MAX ? CV_8U :
1797 minval >= (int)SCHAR_MIN && maxval <= (int)SCHAR_MAX ? CV_8S :
1798 minval >= 0 && maxval <= (int)USHRT_MAX ? CV_16U :
1799 minval >= (int)SHRT_MIN && maxval <= (int)SHRT_MAX ? CV_16S :
1800 CV_32S;
1801 }
1802
1803 #ifdef HAVE_OPENCL
1804
1805 static bool ocl_arithm_op(InputArray _src1, InputArray _src2, OutputArray _dst,
1806 InputArray _mask, int wtype,
1807 void* usrdata, int oclop,
1808 bool haveScalar )
1809 {
1810 const ocl::Device d = ocl::Device::getDefault();
1811 bool doubleSupport = d.doubleFPConfig() > 0;
1812 int type1 = _src1.type(), depth1 = CV_MAT_DEPTH(type1), cn = CV_MAT_CN(type1);
1813 bool haveMask = !_mask.empty();
1814
1815 if ( (haveMask || haveScalar) && cn > 4 )
1816 return false;
1817
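// pick the working (intermediate) depth: at least CV_32S, capped at CV_32F
// when the device has no double-precision support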
1818 int dtype = _dst.type(), ddepth = CV_MAT_DEPTH(dtype), wdepth = std::max(CV_32S, CV_MAT_DEPTH(wtype));
1819 if (!doubleSupport)
1820 wdepth = std::min(wdepth, CV_32F);
1821
1822 wtype = CV_MAKETYPE(wdepth, cn);
1823 int type2 = haveScalar ? wtype : _src2.type(), depth2 = CV_MAT_DEPTH(type2);
1824 if (!doubleSupport && (depth2 == CV_64F || depth1 == CV_64F))
1825 return false;
1826
1827 int kercn = haveMask || haveScalar ? cn : ocl::predictOptimalVectorWidth(_src1, _src2, _dst);
1828 int scalarcn = kercn == 3 ? 4 : kercn, rowsPerWI = d.isIntel() ? 4 : 1;
1829
1830 char cvtstr[4][32], opts[1024];
1831 sprintf(opts, "-D %s%s -D %s -D srcT1=%s -D srcT1_C1=%s -D srcT2=%s -D srcT2_C1=%s "
1832 "-D dstT=%s -D dstT_C1=%s -D workT=%s -D workST=%s -D scaleT=%s -D wdepth=%d -D convertToWT1=%s "
1833 "-D convertToWT2=%s -D convertToDT=%s%s -D cn=%d -D rowsPerWI=%d -D convertFromU=%s",
1834 (haveMask ? "MASK_" : ""), (haveScalar ? "UNARY_OP" : "BINARY_OP"),
1835 oclop2str[oclop], ocl::typeToStr(CV_MAKETYPE(depth1, kercn)),
1836 ocl::typeToStr(depth1), ocl::typeToStr(CV_MAKETYPE(depth2, kercn)),
1837 ocl::typeToStr(depth2), ocl::typeToStr(CV_MAKETYPE(ddepth, kercn)),
1838 ocl::typeToStr(ddepth), ocl::typeToStr(CV_MAKETYPE(wdepth, kercn)),
1839 ocl::typeToStr(CV_MAKETYPE(wdepth, scalarcn)),
1840 ocl::typeToStr(wdepth), wdepth,
1841 ocl::convertTypeStr(depth1, wdepth, kercn, cvtstr[0]),
1842 ocl::convertTypeStr(depth2, wdepth, kercn, cvtstr[1]),
1843 ocl::convertTypeStr(wdepth, ddepth, kercn, cvtstr[2]),
1844 doubleSupport ? " -D DOUBLE_SUPPORT" : "", kercn, rowsPerWI,
1845 oclop == OCL_OP_ABSDIFF && wdepth == CV_32S && ddepth == wdepth ?
1846 ocl::convertTypeStr(CV_8U, ddepth, kercn, cvtstr[3]) : "noconvert");
1847
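// number of extra scale parameters passed to the kernel: one for the *_SCALE
// operations, three weights for addWeighted (OCL_OP_ADDW); they are narrowed to
// float when the working depth is CV_32F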
1848 size_t usrdata_esz = CV_ELEM_SIZE(wdepth);
1849 const uchar* usrdata_p = (const uchar*)usrdata;
1850 const double* usrdata_d = (const double*)usrdata;
1851 float usrdata_f[3];
1852 int i, n = oclop == OCL_OP_MUL_SCALE || oclop == OCL_OP_DIV_SCALE ||
1853 oclop == OCL_OP_RDIV_SCALE || oclop == OCL_OP_RECIP_SCALE ? 1 : oclop == OCL_OP_ADDW ? 3 : 0;
1854 if( n > 0 && wdepth == CV_32F )
1855 {
1856 for( i = 0; i < n; i++ )
1857 usrdata_f[i] = (float)usrdata_d[i];
1858 usrdata_p = (const uchar*)usrdata_f;
1859 }
1860
1861 ocl::Kernel k("KF", ocl::core::arithm_oclsrc, opts);
1862 if (k.empty())
1863 return false;
1864
1865 UMat src1 = _src1.getUMat(), src2;
1866 UMat dst = _dst.getUMat(), mask = _mask.getUMat();
1867
1868 ocl::KernelArg src1arg = ocl::KernelArg::ReadOnlyNoSize(src1, cn, kercn);
1869 ocl::KernelArg dstarg = haveMask ? ocl::KernelArg::ReadWrite(dst, cn, kercn) :
1870 ocl::KernelArg::WriteOnly(dst, cn, kercn);
1871 ocl::KernelArg maskarg = ocl::KernelArg::ReadOnlyNoSize(mask, 1);
1872
1873 if( haveScalar )
1874 {
1875 size_t esz = CV_ELEM_SIZE1(wtype)*scalarcn;
1876 double buf[4]={0,0,0,0};
1877 Mat src2sc = _src2.getMat();
1878
1879 if( !src2sc.empty() )
1880 convertAndUnrollScalar(src2sc, wtype, (uchar*)buf, 1);
1881 ocl::KernelArg scalararg = ocl::KernelArg(0, 0, 0, 0, buf, esz);
1882
1883 if( !haveMask )
1884 {
1885 if(n == 0)
1886 k.args(src1arg, dstarg, scalararg);
1887 else if(n == 1)
1888 k.args(src1arg, dstarg, scalararg,
1889 ocl::KernelArg(0, 0, 0, 0, usrdata_p, usrdata_esz));
1890 else
1891 CV_Error(Error::StsNotImplemented, "unsupported number of extra parameters");
1892 }
1893 else
1894 k.args(src1arg, maskarg, dstarg, scalararg);
1895 }
1896 else
1897 {
1898 src2 = _src2.getUMat();
1899 ocl::KernelArg src2arg = ocl::KernelArg::ReadOnlyNoSize(src2, cn, kercn);
1900
1901 if( !haveMask )
1902 {
1903 if (n == 0)
1904 k.args(src1arg, src2arg, dstarg);
1905 else if (n == 1)
1906 k.args(src1arg, src2arg, dstarg,
1907 ocl::KernelArg(0, 0, 0, 0, usrdata_p, usrdata_esz));
1908 else if (n == 3)
1909 k.args(src1arg, src2arg, dstarg,
1910 ocl::KernelArg(0, 0, 0, 0, usrdata_p, usrdata_esz),
1911 ocl::KernelArg(0, 0, 0, 0, usrdata_p + usrdata_esz, usrdata_esz),
1912 ocl::KernelArg(0, 0, 0, 0, usrdata_p + usrdata_esz*2, usrdata_esz));
1913 else
1914 CV_Error(Error::StsNotImplemented, "unsupported number of extra parameters");
1915 }
1916 else
1917 k.args(src1arg, src2arg, maskarg, dstarg);
1918 }
1919
1920 size_t globalsize[] = { src1.cols * cn / kercn, (src1.rows + rowsPerWI - 1) / rowsPerWI };
1921 return k.run(2, globalsize, NULL, false);
1922 }
1923
1924 #endif
1925
1926 static void arithm_op(InputArray _src1, InputArray _src2, OutputArray _dst,
1927 InputArray _mask, int dtype, BinaryFunc* tab, bool muldiv=false,
1928 void* usrdata=0, int oclop=-1 )
1929 {
1930 const _InputArray *psrc1 = &_src1, *psrc2 = &_src2;
1931 int kind1 = psrc1->kind(), kind2 = psrc2->kind();
1932 bool haveMask = !_mask.empty();
1933 bool reallocate = false;
1934 int type1 = psrc1->type(), depth1 = CV_MAT_DEPTH(type1), cn = CV_MAT_CN(type1);
1935 int type2 = psrc2->type(), depth2 = CV_MAT_DEPTH(type2), cn2 = CV_MAT_CN(type2);
1936 int wtype, dims1 = psrc1->dims(), dims2 = psrc2->dims();
1937 Size sz1 = dims1 <= 2 ? psrc1->size() : Size();
1938 Size sz2 = dims2 <= 2 ? psrc2->size() : Size();
1939 #ifdef HAVE_OPENCL
1940 bool use_opencl = OCL_PERFORMANCE_CHECK(_dst.isUMat()) && dims1 <= 2 && dims2 <= 2;
1941 #endif
1942 bool src1Scalar = checkScalar(*psrc1, type2, kind1, kind2);
1943 bool src2Scalar = checkScalar(*psrc2, type1, kind2, kind1);
1944
1945 if( (kind1 == kind2 || cn == 1) && sz1 == sz2 && dims1 <= 2 && dims2 <= 2 && type1 == type2 &&
1946 !haveMask && ((!_dst.fixedType() && (dtype < 0 || CV_MAT_DEPTH(dtype) == depth1)) ||
1947 (_dst.fixedType() && _dst.type() == type1)) &&
1948 ((src1Scalar && src2Scalar) || (!src1Scalar && !src2Scalar)) )
1949 {
1950 _dst.createSameSize(*psrc1, type1);
1951 CV_OCL_RUN(use_opencl,
1952 ocl_arithm_op(*psrc1, *psrc2, _dst, _mask,
1953 (!usrdata ? type1 : std::max(depth1, CV_32F)),
1954 usrdata, oclop, false))
1955
1956 Mat src1 = psrc1->getMat(), src2 = psrc2->getMat(), dst = _dst.getMat();
1957 Size sz = getContinuousSize(src1, src2, dst, src1.channels());
1958 tab[depth1](src1.ptr(), src1.step, src2.ptr(), src2.step, dst.ptr(), dst.step, sz, usrdata);
1959 return;
1960 }
1961
1962 bool haveScalar = false, swapped12 = false;
1963
1964 if( dims1 != dims2 || sz1 != sz2 || cn != cn2 ||
1965 (kind1 == _InputArray::MATX && (sz1 == Size(1,4) || sz1 == Size(1,1))) ||
1966 (kind2 == _InputArray::MATX && (sz2 == Size(1,4) || sz2 == Size(1,1))) )
1967 {
1968 if( checkScalar(*psrc1, type2, kind1, kind2) )
1969 {
1970 // src1 is a scalar; swap it with src2
1971 swap(psrc1, psrc2);
1972 swap(sz1, sz2);
1973 swap(type1, type2);
1974 swap(depth1, depth2);
1975 swap(cn, cn2);
1976 swap(dims1, dims2);
1977 swapped12 = true;
1978 if( oclop == OCL_OP_SUB )
1979 oclop = OCL_OP_RSUB;
1980 if ( oclop == OCL_OP_DIV_SCALE )
1981 oclop = OCL_OP_RDIV_SCALE;
1982 }
1983 else if( !checkScalar(*psrc2, type1, kind2, kind1) )
1984 CV_Error( CV_StsUnmatchedSizes,
1985 "The operation is neither 'array op array' "
1986 "(where arrays have the same size and the same number of channels), "
1987 "nor 'array op scalar', nor 'scalar op array'" );
1988 haveScalar = true;
1989 CV_Assert(type2 == CV_64F && (sz2.height == 1 || sz2.height == 4));
1990
1991 if (!muldiv)
1992 {
1993 Mat sc = psrc2->getMat();
1994 depth2 = actualScalarDepth(sc.ptr<double>(), cn);
1995 if( depth2 == CV_64F && (depth1 < CV_32S || depth1 == CV_32F) )
1996 depth2 = CV_32F;
1997 }
1998 else
1999 depth2 = CV_64F;
2000 }
2001
2002 if( dtype < 0 )
2003 {
2004 if( _dst.fixedType() )
2005 dtype = _dst.type();
2006 else
2007 {
2008 if( !haveScalar && type1 != type2 )
2009 CV_Error(CV_StsBadArg,
2010 "When the input arrays in add/subtract/multiply/divide functions have different types, "
2011 "the output array type must be explicitly specified");
2012 dtype = type1;
2013 }
2014 }
2015 dtype = CV_MAT_DEPTH(dtype);
2016
2017 if( depth1 == depth2 && dtype == depth1 )
2018 wtype = dtype;
2019 else if( !muldiv )
2020 {
2021 wtype = depth1 <= CV_8S && depth2 <= CV_8S ? CV_16S :
2022 depth1 <= CV_32S && depth2 <= CV_32S ? CV_32S : std::max(depth1, depth2);
2023 wtype = std::max(wtype, dtype);
2024
2025 // when the result of the addition is to be stored in an integer type and only one
2026 // of the inputs is floating-point, it is cheaper to convert that input to integers
2027 // before the operation than to promote the other input to floating-point and convert the result back to integers.
2028 if( dtype < CV_32F && (depth1 < CV_32F || depth2 < CV_32F) )
2029 wtype = CV_32S;
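// e.g. CV_8U + CV_32F with a CV_8U destination is computed in CV_32S: the float
// input is rounded to integers up front rather than doing the sum in float and
// converting back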
2030 }
2031 else
2032 {
2033 wtype = std::max(depth1, std::max(depth2, CV_32F));
2034 wtype = std::max(wtype, dtype);
2035 }
2036
2037 dtype = CV_MAKETYPE(dtype, cn);
2038 wtype = CV_MAKETYPE(wtype, cn);
2039
2040 if( haveMask )
2041 {
2042 int mtype = _mask.type();
2043 CV_Assert( (mtype == CV_8UC1 || mtype == CV_8SC1) && _mask.sameSize(*psrc1) );
2044 reallocate = !_dst.sameSize(*psrc1) || _dst.type() != dtype;
2045 }
2046
2047 _dst.createSameSize(*psrc1, dtype);
2048 if( reallocate )
2049 _dst.setTo(0.);
2050
2051 CV_OCL_RUN(use_opencl,
2052 ocl_arithm_op(*psrc1, *psrc2, _dst, _mask, wtype,
2053 usrdata, oclop, haveScalar))
2054
2055 BinaryFunc cvtsrc1 = type1 == wtype ? 0 : getConvertFunc(type1, wtype);
2056 BinaryFunc cvtsrc2 = type2 == type1 ? cvtsrc1 : type2 == wtype ? 0 : getConvertFunc(type2, wtype);
2057 BinaryFunc cvtdst = dtype == wtype ? 0 : getConvertFunc(wtype, dtype);
2058
2059 size_t esz1 = CV_ELEM_SIZE(type1), esz2 = CV_ELEM_SIZE(type2);
2060 size_t dsz = CV_ELEM_SIZE(dtype), wsz = CV_ELEM_SIZE(wtype);
2061 size_t blocksize0 = (size_t)(BLOCK_SIZE + wsz-1)/wsz;
2062 BinaryFunc copymask = getCopyMaskFunc(dsz);
2063 Mat src1 = psrc1->getMat(), src2 = psrc2->getMat(), dst = _dst.getMat(), mask = _mask.getMat();
2064
2065 AutoBuffer<uchar> _buf;
2066 uchar *buf, *maskbuf = 0, *buf1 = 0, *buf2 = 0, *wbuf = 0;
2067 size_t bufesz = (cvtsrc1 ? wsz : 0) +
2068 (cvtsrc2 || haveScalar ? wsz : 0) +
2069 (cvtdst ? wsz : 0) +
2070 (haveMask ? dsz : 0);
2071 BinaryFunc func = tab[CV_MAT_DEPTH(wtype)];
2072
2073 if( !haveScalar )
2074 {
2075 const Mat* arrays[] = { &src1, &src2, &dst, &mask, 0 };
2076 uchar* ptrs[4];
2077
2078 NAryMatIterator it(arrays, ptrs);
2079 size_t total = it.size, blocksize = total;
2080
2081 if( haveMask || cvtsrc1 || cvtsrc2 || cvtdst )
2082 blocksize = std::min(blocksize, blocksize0);
2083
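// carve the temporary buffer into optional conversion buffers for src1/src2,
// a working buffer for the wide (wtype) result, and a staging buffer for masked writes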
2084 _buf.allocate(bufesz*blocksize + 64);
2085 buf = _buf;
2086 if( cvtsrc1 )
2087 buf1 = buf, buf = alignPtr(buf + blocksize*wsz, 16);
2088 if( cvtsrc2 )
2089 buf2 = buf, buf = alignPtr(buf + blocksize*wsz, 16);
2090 wbuf = maskbuf = buf;
2091 if( cvtdst )
2092 buf = alignPtr(buf + blocksize*wsz, 16);
2093 if( haveMask )
2094 maskbuf = buf;
2095
2096 for( size_t i = 0; i < it.nplanes; i++, ++it )
2097 {
2098 for( size_t j = 0; j < total; j += blocksize )
2099 {
2100 int bsz = (int)MIN(total - j, blocksize);
2101 Size bszn(bsz*cn, 1);
2102 const uchar *sptr1 = ptrs[0], *sptr2 = ptrs[1];
2103 uchar* dptr = ptrs[2];
2104 if( cvtsrc1 )
2105 {
2106 cvtsrc1( sptr1, 1, 0, 1, buf1, 1, bszn, 0 );
2107 sptr1 = buf1;
2108 }
2109 if( ptrs[0] == ptrs[1] )
2110 sptr2 = sptr1;
2111 else if( cvtsrc2 )
2112 {
2113 cvtsrc2( sptr2, 1, 0, 1, buf2, 1, bszn, 0 );
2114 sptr2 = buf2;
2115 }
2116
2117 if( !haveMask && !cvtdst )
2118 func( sptr1, 1, sptr2, 1, dptr, 1, bszn, usrdata );
2119 else
2120 {
2121 func( sptr1, 1, sptr2, 1, wbuf, 0, bszn, usrdata );
2122 if( !haveMask )
2123 cvtdst( wbuf, 1, 0, 1, dptr, 1, bszn, 0 );
2124 else if( !cvtdst )
2125 {
2126 copymask( wbuf, 1, ptrs[3], 1, dptr, 1, Size(bsz, 1), &dsz );
2127 ptrs[3] += bsz;
2128 }
2129 else
2130 {
2131 cvtdst( wbuf, 1, 0, 1, maskbuf, 1, bszn, 0 );
2132 copymask( maskbuf, 1, ptrs[3], 1, dptr, 1, Size(bsz, 1), &dsz );
2133 ptrs[3] += bsz;
2134 }
2135 }
2136 ptrs[0] += bsz*esz1; ptrs[1] += bsz*esz2; ptrs[2] += bsz*dsz;
2137 }
2138 }
2139 }
2140 else
2141 {
2142 const Mat* arrays[] = { &src1, &dst, &mask, 0 };
2143 uchar* ptrs[3];
2144
2145 NAryMatIterator it(arrays, ptrs);
2146 size_t total = it.size, blocksize = std::min(total, blocksize0);
2147
2148 _buf.allocate(bufesz*blocksize + 64);
2149 buf = _buf;
2150 if( cvtsrc1 )
2151 buf1 = buf, buf = alignPtr(buf + blocksize*wsz, 16);
2152 buf2 = buf; buf = alignPtr(buf + blocksize*wsz, 16);
2153 wbuf = maskbuf = buf;
2154 if( cvtdst )
2155 buf = alignPtr(buf + blocksize*wsz, 16);
2156 if( haveMask )
2157 maskbuf = buf;
2158
2159 convertAndUnrollScalar( src2, wtype, buf2, blocksize);
2160
2161 for( size_t i = 0; i < it.nplanes; i++, ++it )
2162 {
2163 for( size_t j = 0; j < total; j += blocksize )
2164 {
2165 int bsz = (int)MIN(total - j, blocksize);
2166 Size bszn(bsz*cn, 1);
2167 const uchar *sptr1 = ptrs[0];
2168 const uchar* sptr2 = buf2;
2169 uchar* dptr = ptrs[1];
2170
2171 if( cvtsrc1 )
2172 {
2173 cvtsrc1( sptr1, 1, 0, 1, buf1, 1, bszn, 0 );
2174 sptr1 = buf1;
2175 }
2176
2177 if( swapped12 )
2178 std::swap(sptr1, sptr2);
2179
2180 if( !haveMask && !cvtdst )
2181 func( sptr1, 1, sptr2, 1, dptr, 1, bszn, usrdata );
2182 else
2183 {
2184 func( sptr1, 1, sptr2, 1, wbuf, 1, bszn, usrdata );
2185 if( !haveMask )
2186 cvtdst( wbuf, 1, 0, 1, dptr, 1, bszn, 0 );
2187 else if( !cvtdst )
2188 {
2189 copymask( wbuf, 1, ptrs[2], 1, dptr, 1, Size(bsz, 1), &dsz );
2190 ptrs[2] += bsz;
2191 }
2192 else
2193 {
2194 cvtdst( wbuf, 1, 0, 1, maskbuf, 1, bszn, 0 );
2195 copymask( maskbuf, 1, ptrs[2], 1, dptr, 1, Size(bsz, 1), &dsz );
2196 ptrs[2] += bsz;
2197 }
2198 }
2199 ptrs[0] += bsz*esz1; ptrs[1] += bsz*dsz;
2200 }
2201 }
2202 }
2203 }
2204
2205 static BinaryFunc* getAddTab()
2206 {
2207 static BinaryFunc addTab[] =
2208 {
2209 (BinaryFunc)GET_OPTIMIZED(add8u), (BinaryFunc)GET_OPTIMIZED(add8s),
2210 (BinaryFunc)GET_OPTIMIZED(add16u), (BinaryFunc)GET_OPTIMIZED(add16s),
2211 (BinaryFunc)GET_OPTIMIZED(add32s),
2212 (BinaryFunc)GET_OPTIMIZED(add32f), (BinaryFunc)add64f,
2213 0
2214 };
2215
2216 return addTab;
2217 }
2218
2219 static BinaryFunc* getSubTab()
2220 {
2221 static BinaryFunc subTab[] =
2222 {
2223 (BinaryFunc)GET_OPTIMIZED(sub8u), (BinaryFunc)GET_OPTIMIZED(sub8s),
2224 (BinaryFunc)GET_OPTIMIZED(sub16u), (BinaryFunc)GET_OPTIMIZED(sub16s),
2225 (BinaryFunc)GET_OPTIMIZED(sub32s),
2226 (BinaryFunc)GET_OPTIMIZED(sub32f), (BinaryFunc)sub64f,
2227 0
2228 };
2229
2230 return subTab;
2231 }
2232
2233 static BinaryFunc* getAbsDiffTab()
2234 {
2235 static BinaryFunc absDiffTab[] =
2236 {
2237 (BinaryFunc)GET_OPTIMIZED(absdiff8u), (BinaryFunc)GET_OPTIMIZED(absdiff8s),
2238 (BinaryFunc)GET_OPTIMIZED(absdiff16u), (BinaryFunc)GET_OPTIMIZED(absdiff16s),
2239 (BinaryFunc)GET_OPTIMIZED(absdiff32s),
2240 (BinaryFunc)GET_OPTIMIZED(absdiff32f), (BinaryFunc)absdiff64f,
2241 0
2242 };
2243
2244 return absDiffTab;
2245 }
2246
2247 }
2248
2249 void cv::add( InputArray src1, InputArray src2, OutputArray dst,
2250 InputArray mask, int dtype )
2251 {
2252 arithm_op(src1, src2, dst, mask, dtype, getAddTab(), false, 0, OCL_OP_ADD );
2253 }
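// e.g. cv::add(a, b, dst, cv::noArray(), CV_32S) accumulates two 8-bit arrays into a
// 32-bit result, avoiding the saturation an 8-bit destination would introduce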
2254
2255 void cv::subtract( InputArray _src1, InputArray _src2, OutputArray _dst,
2256 InputArray mask, int dtype )
2257 {
2258 #ifdef HAVE_TEGRA_OPTIMIZATION
2259 if (tegra::useTegra())
2260 {
2261 int kind1 = _src1.kind(), kind2 = _src2.kind();
2262 Mat src1 = _src1.getMat(), src2 = _src2.getMat();
2263 bool src1Scalar = checkScalar(src1, _src2.type(), kind1, kind2);
2264 bool src2Scalar = checkScalar(src2, _src1.type(), kind2, kind1);
2265
2266 if (!src1Scalar && !src2Scalar &&
2267 src1.depth() == CV_8U && src2.type() == src1.type() &&
2268 src1.dims == 2 && src2.size() == src1.size() &&
2269 mask.empty())
2270 {
2271 if (dtype < 0)
2272 {
2273 if (_dst.fixedType())
2274 {
2275 dtype = _dst.depth();
2276 }
2277 else
2278 {
2279 dtype = src1.depth();
2280 }
2281 }
2282
2283 dtype = CV_MAT_DEPTH(dtype);
2284
2285 if (!_dst.fixedType() || dtype == _dst.depth())
2286 {
2287 _dst.create(src1.size(), CV_MAKE_TYPE(dtype, src1.channels()));
2288
2289 if (dtype == CV_16S)
2290 {
2291 Mat dst = _dst.getMat();
2292 if(tegra::subtract_8u8u16s(src1, src2, dst))
2293 return;
2294 }
2295 else if (dtype == CV_32F)
2296 {
2297 Mat dst = _dst.getMat();
2298 if(tegra::subtract_8u8u32f(src1, src2, dst))
2299 return;
2300 }
2301 else if (dtype == CV_8S)
2302 {
2303 Mat dst = _dst.getMat();
2304 if(tegra::subtract_8u8u8s(src1, src2, dst))
2305 return;
2306 }
2307 }
2308 }
2309 }
2310 #endif
2311 arithm_op(_src1, _src2, _dst, mask, dtype, getSubTab(), false, 0, OCL_OP_SUB );
2312 }
2313
2314 void cv::absdiff( InputArray src1, InputArray src2, OutputArray dst )
2315 {
2316 arithm_op(src1, src2, dst, noArray(), -1, getAbsDiffTab(), false, 0, OCL_OP_ABSDIFF);
2317 }
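// e.g. cv::absdiff(img1, img2, diff) computes the per-element |img1 - img2| with saturation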
2318
2319 /****************************************************************************************\
2320 * multiply/divide *
2321 \****************************************************************************************/
2322
2323 namespace cv
2324 {
2325
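// SIMD helper for scaled multiplication: returns the number of elements already
// processed so the scalar loop in mul_() only handles the tail; the generic version
// processes nothing, and the specializations below add NEON/SSE fast paths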
2326 template <typename T, typename WT>
2327 struct Mul_SIMD
2328 {
2329 int operator() (const T *, const T *, T *, int, WT) const
2330 {
2331 return 0;
2332 }
2333 };
2334
2335 #if CV_NEON
2336
2337 template <>
2338 struct Mul_SIMD<uchar, float>
2339 {
2340 int operator() (const uchar * src1, const uchar * src2, uchar * dst, int width, float scale) const
2341 {
2342 int x = 0;
2343
2344 if( scale == 1.0f )
2345 for ( ; x <= width - 8; x += 8)
2346 {
2347 uint16x8_t v_src1 = vmovl_u8(vld1_u8(src1 + x));
2348 uint16x8_t v_src2 = vmovl_u8(vld1_u8(src2 + x));
2349
2350 float32x4_t v_dst1 = vmulq_f32(vcvtq_f32_u32(vmovl_u16(vget_low_u16(v_src1))),
2351 vcvtq_f32_u32(vmovl_u16(vget_low_u16(v_src2))));
2352 float32x4_t v_dst2 = vmulq_f32(vcvtq_f32_u32(vmovl_u16(vget_high_u16(v_src1))),
2353 vcvtq_f32_u32(vmovl_u16(vget_high_u16(v_src2))));
2354
2355 uint16x8_t v_dst = vcombine_u16(vqmovn_u32(cv_vrndq_u32_f32(v_dst1)),
2356 vqmovn_u32(cv_vrndq_u32_f32(v_dst2)));
2357 vst1_u8(dst + x, vqmovn_u16(v_dst));
2358 }
2359 else
2360 {
2361 float32x4_t v_scale = vdupq_n_f32(scale);
2362 for ( ; x <= width - 8; x += 8)
2363 {
2364 uint16x8_t v_src1 = vmovl_u8(vld1_u8(src1 + x));
2365 uint16x8_t v_src2 = vmovl_u8(vld1_u8(src2 + x));
2366
2367 float32x4_t v_dst1 = vmulq_f32(vcvtq_f32_u32(vmovl_u16(vget_low_u16(v_src1))),
2368 vcvtq_f32_u32(vmovl_u16(vget_low_u16(v_src2))));
2369 v_dst1 = vmulq_f32(v_dst1, v_scale);
2370 float32x4_t v_dst2 = vmulq_f32(vcvtq_f32_u32(vmovl_u16(vget_high_u16(v_src1))),
2371 vcvtq_f32_u32(vmovl_u16(vget_high_u16(v_src2))));
2372 v_dst2 = vmulq_f32(v_dst2, v_scale);
2373
2374 uint16x8_t v_dst = vcombine_u16(vqmovn_u32(cv_vrndq_u32_f32(v_dst1)),
2375 vqmovn_u32(cv_vrndq_u32_f32(v_dst2)));
2376 vst1_u8(dst + x, vqmovn_u16(v_dst));
2377 }
2378 }
2379
2380 return x;
2381 }
2382 };
2383
2384 template <>
2385 struct Mul_SIMD<schar, float>
2386 {
2387 int operator() (const schar * src1, const schar * src2, schar * dst, int width, float scale) const
2388 {
2389 int x = 0;
2390
2391 if( scale == 1.0f )
2392 for ( ; x <= width - 8; x += 8)
2393 {
2394 int16x8_t v_src1 = vmovl_s8(vld1_s8(src1 + x));
2395 int16x8_t v_src2 = vmovl_s8(vld1_s8(src2 + x));
2396
2397 float32x4_t v_dst1 = vmulq_f32(vcvtq_f32_s32(vmovl_s16(vget_low_s16(v_src1))),
2398 vcvtq_f32_s32(vmovl_s16(vget_low_s16(v_src2))));
2399 float32x4_t v_dst2 = vmulq_f32(vcvtq_f32_s32(vmovl_s16(vget_high_s16(v_src1))),
2400 vcvtq_f32_s32(vmovl_s16(vget_high_s16(v_src2))));
2401
2402 int16x8_t v_dst = vcombine_s16(vqmovn_s32(cv_vrndq_s32_f32(v_dst1)),
2403 vqmovn_s32(cv_vrndq_s32_f32(v_dst2)));
2404 vst1_s8(dst + x, vqmovn_s16(v_dst));
2405 }
2406 else
2407 {
2408 float32x4_t v_scale = vdupq_n_f32(scale);
2409 for ( ; x <= width - 8; x += 8)
2410 {
2411 int16x8_t v_src1 = vmovl_s8(vld1_s8(src1 + x));
2412 int16x8_t v_src2 = vmovl_s8(vld1_s8(src2 + x));
2413
2414 float32x4_t v_dst1 = vmulq_f32(vcvtq_f32_s32(vmovl_s16(vget_low_s16(v_src1))),
2415 vcvtq_f32_s32(vmovl_s16(vget_low_s16(v_src2))));
2416 v_dst1 = vmulq_f32(v_dst1, v_scale);
2417 float32x4_t v_dst2 = vmulq_f32(vcvtq_f32_s32(vmovl_s16(vget_high_s16(v_src1))),
2418 vcvtq_f32_s32(vmovl_s16(vget_high_s16(v_src2))));
2419 v_dst2 = vmulq_f32(v_dst2, v_scale);
2420
2421 int16x8_t v_dst = vcombine_s16(vqmovn_s32(cv_vrndq_s32_f32(v_dst1)),
2422 vqmovn_s32(cv_vrndq_s32_f32(v_dst2)));
2423 vst1_s8(dst + x, vqmovn_s16(v_dst));
2424 }
2425 }
2426
2427 return x;
2428 }
2429 };
2430
2431 template <>
2432 struct Mul_SIMD<ushort, float>
2433 {
2434 int operator() (const ushort * src1, const ushort * src2, ushort * dst, int width, float scale) const
2435 {
2436 int x = 0;
2437
2438 if( scale == 1.0f )
2439 for ( ; x <= width - 8; x += 8)
2440 {
2441 uint16x8_t v_src1 = vld1q_u16(src1 + x), v_src2 = vld1q_u16(src2 + x);
2442
2443 float32x4_t v_dst1 = vmulq_f32(vcvtq_f32_u32(vmovl_u16(vget_low_u16(v_src1))),
2444 vcvtq_f32_u32(vmovl_u16(vget_low_u16(v_src2))));
2445 float32x4_t v_dst2 = vmulq_f32(vcvtq_f32_u32(vmovl_u16(vget_high_u16(v_src1))),
2446 vcvtq_f32_u32(vmovl_u16(vget_high_u16(v_src2))));
2447
2448 uint16x8_t v_dst = vcombine_u16(vqmovn_u32(cv_vrndq_u32_f32(v_dst1)),
2449 vqmovn_u32(cv_vrndq_u32_f32(v_dst2)));
2450 vst1q_u16(dst + x, v_dst);
2451 }
2452 else
2453 {
2454 float32x4_t v_scale = vdupq_n_f32(scale);
2455 for ( ; x <= width - 8; x += 8)
2456 {
2457 uint16x8_t v_src1 = vld1q_u16(src1 + x), v_src2 = vld1q_u16(src2 + x);
2458
2459 float32x4_t v_dst1 = vmulq_f32(vcvtq_f32_u32(vmovl_u16(vget_low_u16(v_src1))),
2460 vcvtq_f32_u32(vmovl_u16(vget_low_u16(v_src2))));
2461 v_dst1 = vmulq_f32(v_dst1, v_scale);
2462 float32x4_t v_dst2 = vmulq_f32(vcvtq_f32_u32(vmovl_u16(vget_high_u16(v_src1))),
2463 vcvtq_f32_u32(vmovl_u16(vget_high_u16(v_src2))));
2464 v_dst2 = vmulq_f32(v_dst2, v_scale);
2465
2466 uint16x8_t v_dst = vcombine_u16(vqmovn_u32(cv_vrndq_u32_f32(v_dst1)),
2467 vqmovn_u32(cv_vrndq_u32_f32(v_dst2)));
2468 vst1q_u16(dst + x, v_dst);
2469 }
2470 }
2471
2472 return x;
2473 }
2474 };
2475
2476 template <>
2477 struct Mul_SIMD<short, float>
2478 {
2479 int operator() (const short * src1, const short * src2, short * dst, int width, float scale) const
2480 {
2481 int x = 0;
2482
2483 if( scale == 1.0f )
2484 for ( ; x <= width - 8; x += 8)
2485 {
2486 int16x8_t v_src1 = vld1q_s16(src1 + x), v_src2 = vld1q_s16(src2 + x);
2487
2488 float32x4_t v_dst1 = vmulq_f32(vcvtq_f32_s32(vmovl_s16(vget_low_s16(v_src1))),
2489 vcvtq_f32_s32(vmovl_s16(vget_low_s16(v_src2))));
2490 float32x4_t v_dst2 = vmulq_f32(vcvtq_f32_s32(vmovl_s16(vget_high_s16(v_src1))),
2491 vcvtq_f32_s32(vmovl_s16(vget_high_s16(v_src2))));
2492
2493 int16x8_t v_dst = vcombine_s16(vqmovn_s32(cv_vrndq_s32_f32(v_dst1)),
2494 vqmovn_s32(cv_vrndq_s32_f32(v_dst2)));
2495 vst1q_s16(dst + x, v_dst);
2496 }
2497 else
2498 {
2499 float32x4_t v_scale = vdupq_n_f32(scale);
2500 for ( ; x <= width - 8; x += 8)
2501 {
2502 int16x8_t v_src1 = vld1q_s16(src1 + x), v_src2 = vld1q_s16(src2 + x);
2503
2504 float32x4_t v_dst1 = vmulq_f32(vcvtq_f32_s32(vmovl_s16(vget_low_s16(v_src1))),
2505 vcvtq_f32_s32(vmovl_s16(vget_low_s16(v_src2))));
2506 v_dst1 = vmulq_f32(v_dst1, v_scale);
2507 float32x4_t v_dst2 = vmulq_f32(vcvtq_f32_s32(vmovl_s16(vget_high_s16(v_src1))),
2508 vcvtq_f32_s32(vmovl_s16(vget_high_s16(v_src2))));
2509 v_dst2 = vmulq_f32(v_dst2, v_scale);
2510
2511 int16x8_t v_dst = vcombine_s16(vqmovn_s32(cv_vrndq_s32_f32(v_dst1)),
2512 vqmovn_s32(cv_vrndq_s32_f32(v_dst2)));
2513 vst1q_s16(dst + x, v_dst);
2514 }
2515 }
2516
2517 return x;
2518 }
2519 };
2520
2521 template <>
2522 struct Mul_SIMD<float, float>
2523 {
2524 int operator() (const float * src1, const float * src2, float * dst, int width, float scale) const
2525 {
2526 int x = 0;
2527
2528 if( scale == 1.0f )
2529 for ( ; x <= width - 8; x += 8)
2530 {
2531 float32x4_t v_dst1 = vmulq_f32(vld1q_f32(src1 + x), vld1q_f32(src2 + x));
2532 float32x4_t v_dst2 = vmulq_f32(vld1q_f32(src1 + x + 4), vld1q_f32(src2 + x + 4));
2533 vst1q_f32(dst + x, v_dst1);
2534 vst1q_f32(dst + x + 4, v_dst2);
2535 }
2536 else
2537 {
2538 float32x4_t v_scale = vdupq_n_f32(scale);
2539 for ( ; x <= width - 8; x += 8)
2540 {
2541 float32x4_t v_dst1 = vmulq_f32(vld1q_f32(src1 + x), vld1q_f32(src2 + x));
2542 v_dst1 = vmulq_f32(v_dst1, v_scale);
2543
2544 float32x4_t v_dst2 = vmulq_f32(vld1q_f32(src1 + x + 4), vld1q_f32(src2 + x + 4));
2545 v_dst2 = vmulq_f32(v_dst2, v_scale);
2546
2547 vst1q_f32(dst + x, v_dst1);
2548 vst1q_f32(dst + x + 4, v_dst2);
2549 }
2550 }
2551
2552 return x;
2553 }
2554 };
2555
2556 #elif CV_SSE2
2557
2558 #if CV_SSE4_1
2559
2560 template <>
2561 struct Mul_SIMD<ushort, float>
2562 {
2563 Mul_SIMD()
2564 {
2565 haveSSE = checkHardwareSupport(CV_CPU_SSE4_1);
2566 }
2567
2568 int operator() (const ushort * src1, const ushort * src2, ushort * dst, int width, float scale) const
2569 {
2570 int x = 0;
2571
2572 if (!haveSSE)
2573 return x;
2574
2575 __m128i v_zero = _mm_setzero_si128();
2576
2577 if( scale != 1.0f )
2578 {
2579 __m128 v_scale = _mm_set1_ps(scale);
2580 for ( ; x <= width - 8; x += 8)
2581 {
2582 __m128i v_src1 = _mm_loadu_si128((__m128i const *)(src1 + x));
2583 __m128i v_src2 = _mm_loadu_si128((__m128i const *)(src2 + x));
2584
2585 __m128 v_dst1 = _mm_mul_ps(_mm_cvtepi32_ps(_mm_unpacklo_epi16(v_src1, v_zero)),
2586 _mm_cvtepi32_ps(_mm_unpacklo_epi16(v_src2, v_zero)));
2587 v_dst1 = _mm_mul_ps(v_dst1, v_scale);
2588
2589 __m128 v_dst2 = _mm_mul_ps(_mm_cvtepi32_ps(_mm_unpackhi_epi16(v_src1, v_zero)),
2590 _mm_cvtepi32_ps(_mm_unpackhi_epi16(v_src2, v_zero)));
2591 v_dst2 = _mm_mul_ps(v_dst2, v_scale);
2592
2593 __m128i v_dsti = _mm_packus_epi32(_mm_cvtps_epi32(v_dst1), _mm_cvtps_epi32(v_dst2));
2594 _mm_storeu_si128((__m128i *)(dst + x), v_dsti);
2595 }
2596 }
2597
2598 return x;
2599 }
2600
2601 bool haveSSE;
2602 };
2603
2604 #endif
2605
2606 template <>
2607 struct Mul_SIMD<schar, float>
2608 {
2609 Mul_SIMD()
2610 {
2611 haveSSE = checkHardwareSupport(CV_CPU_SSE2);
2612 }
2613
2614 int operator() (const schar * src1, const schar * src2, schar * dst, int width, float scale) const
2615 {
2616 int x = 0;
2617
2618 if (!haveSSE)
2619 return x;
2620
2621 __m128i v_zero = _mm_setzero_si128();
2622
2623 if( scale == 1.0f )
2624 for ( ; x <= width - 8; x += 8)
2625 {
2626 __m128i v_src1 = _mm_loadl_epi64((__m128i const *)(src1 + x));
2627 __m128i v_src2 = _mm_loadl_epi64((__m128i const *)(src2 + x));
2628
2629 v_src1 = _mm_srai_epi16(_mm_unpacklo_epi8(v_zero, v_src1), 8);
2630 v_src2 = _mm_srai_epi16(_mm_unpacklo_epi8(v_zero, v_src2), 8);
2631
2632 __m128 v_dst1 = _mm_mul_ps(_mm_cvtepi32_ps(_mm_srai_epi32(_mm_unpacklo_epi16(v_zero, v_src1), 16)),
2633 _mm_cvtepi32_ps(_mm_srai_epi32(_mm_unpacklo_epi16(v_zero, v_src2), 16)));
2634
2635 __m128 v_dst2 = _mm_mul_ps(_mm_cvtepi32_ps(_mm_srai_epi32(_mm_unpackhi_epi16(v_zero, v_src1), 16)),
2636 _mm_cvtepi32_ps(_mm_srai_epi32(_mm_unpackhi_epi16(v_zero, v_src2), 16)));
2637
2638 __m128i v_dsti = _mm_packs_epi32(_mm_cvtps_epi32(v_dst1), _mm_cvtps_epi32(v_dst2));
2639 _mm_storel_epi64((__m128i *)(dst + x), _mm_packs_epi16(v_dsti, v_zero));
2640 }
2641 else
2642 {
2643 __m128 v_scale = _mm_set1_ps(scale);
2644 for ( ; x <= width - 8; x += 8)
2645 {
2646 __m128i v_src1 = _mm_loadl_epi64((__m128i const *)(src1 + x));
2647 __m128i v_src2 = _mm_loadl_epi64((__m128i const *)(src2 + x));
2648
2649 v_src1 = _mm_srai_epi16(_mm_unpacklo_epi8(v_zero, v_src1), 8);
2650 v_src2 = _mm_srai_epi16(_mm_unpacklo_epi8(v_zero, v_src2), 8);
2651
2652 __m128 v_dst1 = _mm_mul_ps(_mm_cvtepi32_ps(_mm_srai_epi32(_mm_unpacklo_epi16(v_zero, v_src1), 16)),
2653 _mm_cvtepi32_ps(_mm_srai_epi32(_mm_unpacklo_epi16(v_zero, v_src2), 16)));
2654 v_dst1 = _mm_mul_ps(v_dst1, v_scale);
2655
2656 __m128 v_dst2 = _mm_mul_ps(_mm_cvtepi32_ps(_mm_srai_epi32(_mm_unpackhi_epi16(v_zero, v_src1), 16)),
2657 _mm_cvtepi32_ps(_mm_srai_epi32(_mm_unpackhi_epi16(v_zero, v_src2), 16)));
2658 v_dst2 = _mm_mul_ps(v_dst2, v_scale);
2659
2660 __m128i v_dsti = _mm_packs_epi32(_mm_cvtps_epi32(v_dst1), _mm_cvtps_epi32(v_dst2));
2661 _mm_storel_epi64((__m128i *)(dst + x), _mm_packs_epi16(v_dsti, v_zero));
2662 }
2663 }
2664
2665 return x;
2666 }
2667
2668 bool haveSSE;
2669 };
2670
2671 template <>
2672 struct Mul_SIMD<short, float>
2673 {
2674 Mul_SIMD()
2675 {
2676 haveSSE = checkHardwareSupport(CV_CPU_SSE2);
2677 }
2678
2679 int operator() (const short * src1, const short * src2, short * dst, int width, float scale) const
2680 {
2681 int x = 0;
2682
2683 if (!haveSSE)
2684 return x;
2685
2686 __m128i v_zero = _mm_setzero_si128();
2687
2688 if( scale != 1.0f )
2689 {
2690 __m128 v_scale = _mm_set1_ps(scale);
2691 for ( ; x <= width - 8; x += 8)
2692 {
2693 __m128i v_src1 = _mm_loadu_si128((__m128i const *)(src1 + x));
2694 __m128i v_src2 = _mm_loadu_si128((__m128i const *)(src2 + x));
2695
2696 __m128 v_dst1 = _mm_mul_ps(_mm_cvtepi32_ps(_mm_srai_epi32(_mm_unpacklo_epi16(v_zero, v_src1), 16)),
2697 _mm_cvtepi32_ps(_mm_srai_epi32(_mm_unpacklo_epi16(v_zero, v_src2), 16)));
2698 v_dst1 = _mm_mul_ps(v_dst1, v_scale);
2699
2700 __m128 v_dst2 = _mm_mul_ps(_mm_cvtepi32_ps(_mm_srai_epi32(_mm_unpackhi_epi16(v_zero, v_src1), 16)),
2701 _mm_cvtepi32_ps(_mm_srai_epi32(_mm_unpackhi_epi16(v_zero, v_src2), 16)));
2702 v_dst2 = _mm_mul_ps(v_dst2, v_scale);
2703
2704 __m128i v_dsti = _mm_packs_epi32(_mm_cvtps_epi32(v_dst1), _mm_cvtps_epi32(v_dst2));
2705 _mm_storeu_si128((__m128i *)(dst + x), v_dsti);
2706 }
2707 }
2708
2709 return x;
2710 }
2711
2712 bool haveSSE;
2713 };
2714
2715 #endif
2716
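// dst = saturate_cast<T>(scale * src1 * src2), row by row; the SIMD functor covers the
// leading full vectors and the (optionally unrolled) scalar loop finishes each row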
2717 template<typename T, typename WT> static void
2718 mul_( const T* src1, size_t step1, const T* src2, size_t step2,
2719 T* dst, size_t step, Size size, WT scale )
2720 {
2721 step1 /= sizeof(src1[0]);
2722 step2 /= sizeof(src2[0]);
2723 step /= sizeof(dst[0]);
2724
2725 Mul_SIMD<T, WT> vop;
2726
2727 if( scale == (WT)1. )
2728 {
2729 for( ; size.height--; src1 += step1, src2 += step2, dst += step )
2730 {
2731 int i = vop(src1, src2, dst, size.width, scale);
2732 #if CV_ENABLE_UNROLLED
2733 for(; i <= size.width - 4; i += 4 )
2734 {
2735 T t0;
2736 T t1;
2737 t0 = saturate_cast<T>(src1[i ] * src2[i ]);
2738 t1 = saturate_cast<T>(src1[i+1] * src2[i+1]);
2739 dst[i ] = t0;
2740 dst[i+1] = t1;
2741
2742 t0 = saturate_cast<T>(src1[i+2] * src2[i+2]);
2743 t1 = saturate_cast<T>(src1[i+3] * src2[i+3]);
2744 dst[i+2] = t0;
2745 dst[i+3] = t1;
2746 }
2747 #endif
2748 for( ; i < size.width; i++ )
2749 dst[i] = saturate_cast<T>(src1[i] * src2[i]);
2750 }
2751 }
2752 else
2753 {
2754 for( ; size.height--; src1 += step1, src2 += step2, dst += step )
2755 {
2756 int i = vop(src1, src2, dst, size.width, scale);
2757 #if CV_ENABLE_UNROLLED
2758 for(; i <= size.width - 4; i += 4 )
2759 {
2760 T t0 = saturate_cast<T>(scale*(WT)src1[i]*src2[i]);
2761 T t1 = saturate_cast<T>(scale*(WT)src1[i+1]*src2[i+1]);
2762 dst[i] = t0; dst[i+1] = t1;
2763
2764 t0 = saturate_cast<T>(scale*(WT)src1[i+2]*src2[i+2]);
2765 t1 = saturate_cast<T>(scale*(WT)src1[i+3]*src2[i+3]);
2766 dst[i+2] = t0; dst[i+3] = t1;
2767 }
2768 #endif
2769 for( ; i < size.width; i++ )
2770 dst[i] = saturate_cast<T>(scale*(WT)src1[i]*src2[i]);
2771 }
2772 }
2773 }
2774
2775 template <typename T>
2776 struct Div_SIMD
2777 {
2778 int operator() (const T *, const T *, T *, int, double) const
2779 {
2780 return 0;
2781 }
2782 };
2783
2784 template <typename T>
2785 struct Recip_SIMD
2786 {
2787 int operator() (const T *, T *, int, double) const
2788 {
2789 return 0;
2790 }
2791 };
2792
2793
2794 #if CV_SIMD128
2795
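// universal-intrinsic specializations: widen to 32-bit lanes, convert to float32,
// apply the scale and divide, round back and pack, then force lanes with a zero
// divisor to 0 to match the scalar fallback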
2796 template <>
2797 struct Div_SIMD<uchar>
2798 {
2799 bool haveSIMD;
2800 Div_SIMD() { haveSIMD = checkHardwareSupport(CV_CPU_SSE2) || checkHardwareSupport(CV_CPU_NEON); }
2801
2802 int operator() (const uchar * src1, const uchar * src2, uchar * dst, int width, double scale) const
2803 {
2804 int x = 0;
2805
2806 if (!haveSIMD)
2807 return x;
2808
2809 v_float32x4 v_scale = v_setall_f32((float)scale);
2810 v_uint16x8 v_zero = v_setzero_u16();
2811
2812 for ( ; x <= width - 8; x += 8)
2813 {
2814 v_uint16x8 v_src1 = v_load_expand(src1 + x);
2815 v_uint16x8 v_src2 = v_load_expand(src2 + x);
2816
2817 v_uint32x4 t0, t1, t2, t3;
2818 v_expand(v_src1, t0, t1);
2819 v_expand(v_src2, t2, t3);
2820
2821 v_float32x4 f0 = v_cvt_f32(v_reinterpret_as_s32(t0));
2822 v_float32x4 f1 = v_cvt_f32(v_reinterpret_as_s32(t1));
2823
2824 v_float32x4 f2 = v_cvt_f32(v_reinterpret_as_s32(t2));
2825 v_float32x4 f3 = v_cvt_f32(v_reinterpret_as_s32(t3));
2826
2827 f0 = f0 * v_scale / f2;
2828 f1 = f1 * v_scale / f3;
2829
2830 v_int32x4 i0 = v_round(f0), i1 = v_round(f1);
2831 v_uint16x8 res = v_pack_u(i0, i1);
2832
2833 res = v_select(v_src2 == v_zero, v_zero, res);
2834 v_pack_store(dst + x, res);
2835 }
2836
2837 return x;
2838 }
2839 };
2840
2841
2842 template <>
2843 struct Div_SIMD<schar>
2844 {
2845 bool haveSIMD;
2846 Div_SIMD() { haveSIMD = checkHardwareSupport(CV_CPU_SSE2) || checkHardwareSupport(CV_CPU_NEON); }
2847
2848 int operator() (const schar * src1, const schar * src2, schar * dst, int width, double scale) const
2849 {
2850 int x = 0;
2851
2852 if (!haveSIMD)
2853 return x;
2854
2855 v_float32x4 v_scale = v_setall_f32((float)scale);
2856 v_int16x8 v_zero = v_setzero_s16();
2857
2858 for ( ; x <= width - 8; x += 8)
2859 {
2860 v_int16x8 v_src1 = v_load_expand(src1 + x);
2861 v_int16x8 v_src2 = v_load_expand(src2 + x);
2862
2863 v_int32x4 t0, t1, t2, t3;
2864 v_expand(v_src1, t0, t1);
2865 v_expand(v_src2, t2, t3);
2866
2867 v_float32x4 f0 = v_cvt_f32(t0);
2868 v_float32x4 f1 = v_cvt_f32(t1);
2869
2870 v_float32x4 f2 = v_cvt_f32(t2);
2871 v_float32x4 f3 = v_cvt_f32(t3);
2872
2873 f0 = f0 * v_scale / f2;
2874 f1 = f1 * v_scale / f3;
2875
2876 v_int32x4 i0 = v_round(f0), i1 = v_round(f1);
2877 v_int16x8 res = v_pack(i0, i1);
2878
2879 res = v_select(v_src2 == v_zero, v_zero, res);
2880 v_pack_store(dst + x, res);
2881 }
2882
2883 return x;
2884 }
2885 };
2886
2887
2888 template <>
2889 struct Div_SIMD<ushort>
2890 {
2891 bool haveSIMD;
2892 Div_SIMD() { haveSIMD = checkHardwareSupport(CV_CPU_SSE2) || checkHardwareSupport(CV_CPU_NEON); }
2893
2894 int operator() (const ushort * src1, const ushort * src2, ushort * dst, int width, double scale) const
2895 {
2896 int x = 0;
2897
2898 if (!haveSIMD)
2899 return x;
2900
2901 v_float32x4 v_scale = v_setall_f32((float)scale);
2902 v_uint16x8 v_zero = v_setzero_u16();
2903
2904 for ( ; x <= width - 8; x += 8)
2905 {
2906 v_uint16x8 v_src1 = v_load(src1 + x);
2907 v_uint16x8 v_src2 = v_load(src2 + x);
2908
2909 v_uint32x4 t0, t1, t2, t3;
2910 v_expand(v_src1, t0, t1);
2911 v_expand(v_src2, t2, t3);
2912
2913 v_float32x4 f0 = v_cvt_f32(v_reinterpret_as_s32(t0));
2914 v_float32x4 f1 = v_cvt_f32(v_reinterpret_as_s32(t1));
2915
2916 v_float32x4 f2 = v_cvt_f32(v_reinterpret_as_s32(t2));
2917 v_float32x4 f3 = v_cvt_f32(v_reinterpret_as_s32(t3));
2918
2919 f0 = f0 * v_scale / f2;
2920 f1 = f1 * v_scale / f3;
2921
2922 v_int32x4 i0 = v_round(f0), i1 = v_round(f1);
2923 v_uint16x8 res = v_pack_u(i0, i1);
2924
2925 res = v_select(v_src2 == v_zero, v_zero, res);
2926 v_store(dst + x, res);
2927 }
2928
2929 return x;
2930 }
2931 };
2932
2933 template <>
2934 struct Div_SIMD<short>
2935 {
2936 bool haveSIMD;
2937 Div_SIMD() { haveSIMD = checkHardwareSupport(CV_CPU_SSE2) || checkHardwareSupport(CV_CPU_NEON); }
2938
2939 int operator() (const short * src1, const short * src2, short * dst, int width, double scale) const
2940 {
2941 int x = 0;
2942
2943 if (!haveSIMD)
2944 return x;
2945
2946 v_float32x4 v_scale = v_setall_f32((float)scale);
2947 v_int16x8 v_zero = v_setzero_s16();
2948
2949 for ( ; x <= width - 8; x += 8)
2950 {
2951 v_int16x8 v_src1 = v_load(src1 + x);
2952 v_int16x8 v_src2 = v_load(src2 + x);
2953
2954 v_int32x4 t0, t1, t2, t3;
2955 v_expand(v_src1, t0, t1);
2956 v_expand(v_src2, t2, t3);
2957
2958 v_float32x4 f0 = v_cvt_f32(t0);
2959 v_float32x4 f1 = v_cvt_f32(t1);
2960
2961 v_float32x4 f2 = v_cvt_f32(t2);
2962 v_float32x4 f3 = v_cvt_f32(t3);
2963
2964 f0 = f0 * v_scale / f2;
2965 f1 = f1 * v_scale / f3;
2966
2967 v_int32x4 i0 = v_round(f0), i1 = v_round(f1);
2968 v_int16x8 res = v_pack(i0, i1);
2969
2970 res = v_select(v_src2 == v_zero, v_zero, res);
2971 v_store(dst + x, res);
2972 }
2973
2974 return x;
2975 }
2976 };
2977
2978 template <>
2979 struct Div_SIMD<int>
2980 {
2981 bool haveSIMD;
2982 Div_SIMD() { haveSIMD = checkHardwareSupport(CV_CPU_SSE2) || checkHardwareSupport(CV_CPU_NEON); }
2983
2984 int operator() (const int * src1, const int * src2, int * dst, int width, double scale) const
2985 {
2986 int x = 0;
2987
2988 if (!haveSIMD)
2989 return x;
2990
2991 v_float32x4 v_scale = v_setall_f32((float)scale);
2992 v_int32x4 v_zero = v_setzero_s32();
2993
2994 for ( ; x <= width - 8; x += 8)
2995 {
2996 v_int32x4 t0 = v_load(src1 + x);
2997 v_int32x4 t1 = v_load(src1 + x + 4);
2998 v_int32x4 t2 = v_load(src2 + x);
2999 v_int32x4 t3 = v_load(src2 + x + 4);
3000
3001 v_float32x4 f0 = v_cvt_f32(t0);
3002 v_float32x4 f1 = v_cvt_f32(t1);
3003 v_float32x4 f2 = v_cvt_f32(t2);
3004 v_float32x4 f3 = v_cvt_f32(t3);
3005
3006 f0 = f0 * v_scale / f2;
3007 f1 = f1 * v_scale / f3;
3008
3009 v_int32x4 res0 = v_round(f0), res1 = v_round(f1);
3010
3011 res0 = v_select(t2 == v_zero, v_zero, res0);
3012 res1 = v_select(t3 == v_zero, v_zero, res1);
3013 v_store(dst + x, res0);
3014 v_store(dst + x + 4, res1);
3015 }
3016
3017 return x;
3018 }
3019 };
3020
3021
3022 template <>
3023 struct Div_SIMD<float>
3024 {
3025 bool haveSIMD;
3026 Div_SIMD() { haveSIMD = checkHardwareSupport(CV_CPU_SSE2) || checkHardwareSupport(CV_CPU_NEON); }
3027
3028 int operator() (const float * src1, const float * src2, float * dst, int width, double scale) const
3029 {
3030 int x = 0;
3031
3032 if (!haveSIMD)
3033 return x;
3034
3035 v_float32x4 v_scale = v_setall_f32((float)scale);
3036 v_float32x4 v_zero = v_setzero_f32();
3037
3038 for ( ; x <= width - 8; x += 8)
3039 {
3040 v_float32x4 f0 = v_load(src1 + x);
3041 v_float32x4 f1 = v_load(src1 + x + 4);
3042 v_float32x4 f2 = v_load(src2 + x);
3043 v_float32x4 f3 = v_load(src2 + x + 4);
3044
3045 v_float32x4 res0 = f0 * v_scale / f2;
3046 v_float32x4 res1 = f1 * v_scale / f3;
3047
3048 res0 = v_select(f2 == v_zero, v_zero, res0);
3049 res1 = v_select(f3 == v_zero, v_zero, res1);
3050
3051 v_store(dst + x, res0);
3052 v_store(dst + x + 4, res1);
3053 }
3054
3055 return x;
3056 }
3057 };
3058
3059
3060 ///////////////////////// RECIPROCAL //////////////////////
3061
3062 template <>
3063 struct Recip_SIMD<uchar>
3064 {
3065 bool haveSIMD;
3066 Recip_SIMD() { haveSIMD = checkHardwareSupport(CV_CPU_SSE2) || checkHardwareSupport(CV_CPU_NEON); }
3067
3068 int operator() (const uchar * src2, uchar * dst, int width, double scale) const
3069 {
3070 int x = 0;
3071
3072 if (!haveSIMD)
3073 return x;
3074
3075 v_float32x4 v_scale = v_setall_f32((float)scale);
3076 v_uint16x8 v_zero = v_setzero_u16();
3077
3078 for ( ; x <= width - 8; x += 8)
3079 {
3080 v_uint16x8 v_src2 = v_load_expand(src2 + x);
3081
3082 v_uint32x4 t0, t1;
3083 v_expand(v_src2, t0, t1);
3084
3085 v_float32x4 f0 = v_cvt_f32(v_reinterpret_as_s32(t0));
3086 v_float32x4 f1 = v_cvt_f32(v_reinterpret_as_s32(t1));
3087
3088 f0 = v_scale / f0;
3089 f1 = v_scale / f1;
3090
3091 v_int32x4 i0 = v_round(f0), i1 = v_round(f1);
3092 v_uint16x8 res = v_pack_u(i0, i1);
3093
3094 res = v_select(v_src2 == v_zero, v_zero, res);
3095 v_pack_store(dst + x, res);
3096 }
3097
3098 return x;
3099 }
3100 };
3101
3102
3103 template <>
3104 struct Recip_SIMD<schar>
3105 {
3106 bool haveSIMD;
3107 Recip_SIMD() { haveSIMD = checkHardwareSupport(CV_CPU_SSE2) || checkHardwareSupport(CV_CPU_NEON); }
3108
3109 int operator() (const schar * src2, schar * dst, int width, double scale) const
3110 {
3111 int x = 0;
3112
3113 if (!haveSIMD)
3114 return x;
3115
3116 v_float32x4 v_scale = v_setall_f32((float)scale);
3117 v_int16x8 v_zero = v_setzero_s16();
3118
3119 for ( ; x <= width - 8; x += 8)
3120 {
3121 v_int16x8 v_src2 = v_load_expand(src2 + x);
3122
3123 v_int32x4 t0, t1;
3124 v_expand(v_src2, t0, t1);
3125
3126 v_float32x4 f0 = v_cvt_f32(t0);
3127 v_float32x4 f1 = v_cvt_f32(t1);
3128
3129 f0 = v_scale / f0;
3130 f1 = v_scale / f1;
3131
3132 v_int32x4 i0 = v_round(f0), i1 = v_round(f1);
3133 v_int16x8 res = v_pack(i0, i1);
3134
3135 res = v_select(v_src2 == v_zero, v_zero, res);
3136 v_pack_store(dst + x, res);
3137 }
3138
3139 return x;
3140 }
3141 };
3142
3143
3144 template <>
3145 struct Recip_SIMD<ushort>
3146 {
3147 bool haveSIMD;
3148 Recip_SIMD() { haveSIMD = checkHardwareSupport(CV_CPU_SSE2) || checkHardwareSupport(CV_CPU_NEON); }
3149
3150 int operator() (const ushort * src2, ushort * dst, int width, double scale) const
3151 {
3152 int x = 0;
3153
3154 if (!haveSIMD)
3155 return x;
3156
3157 v_float32x4 v_scale = v_setall_f32((float)scale);
3158 v_uint16x8 v_zero = v_setzero_u16();
3159
3160 for ( ; x <= width - 8; x += 8)
3161 {
3162 v_uint16x8 v_src2 = v_load(src2 + x);
3163
3164 v_uint32x4 t0, t1;
3165 v_expand(v_src2, t0, t1);
3166
3167 v_float32x4 f0 = v_cvt_f32(v_reinterpret_as_s32(t0));
3168 v_float32x4 f1 = v_cvt_f32(v_reinterpret_as_s32(t1));
3169
3170 f0 = v_scale / f0;
3171 f1 = v_scale / f1;
3172
3173 v_int32x4 i0 = v_round(f0), i1 = v_round(f1);
3174 v_uint16x8 res = v_pack_u(i0, i1);
3175
3176 res = v_select(v_src2 == v_zero, v_zero, res);
3177 v_store(dst + x, res);
3178 }
3179
3180 return x;
3181 }
3182 };
3183
3184 template <>
3185 struct Recip_SIMD<short>
3186 {
3187 bool haveSIMD;
3188 Recip_SIMD() { haveSIMD = checkHardwareSupport(CV_CPU_SSE2) || checkHardwareSupport(CV_CPU_NEON); }
3189
3190 int operator() (const short * src2, short * dst, int width, double scale) const
3191 {
3192 int x = 0;
3193
3194 if (!haveSIMD)
3195 return x;
3196
3197 v_float32x4 v_scale = v_setall_f32((float)scale);
3198 v_int16x8 v_zero = v_setzero_s16();
3199
3200 for ( ; x <= width - 8; x += 8)
3201 {
3202 v_int16x8 v_src2 = v_load(src2 + x);
3203
3204 v_int32x4 t0, t1;
3205 v_expand(v_src2, t0, t1);
3206
3207 v_float32x4 f0 = v_cvt_f32(t0);
3208 v_float32x4 f1 = v_cvt_f32(t1);
3209
3210 f0 = v_scale / f0;
3211 f1 = v_scale / f1;
3212
3213 v_int32x4 i0 = v_round(f0), i1 = v_round(f1);
3214 v_int16x8 res = v_pack(i0, i1);
3215
3216 res = v_select(v_src2 == v_zero, v_zero, res);
3217 v_store(dst + x, res);
3218 }
3219
3220 return x;
3221 }
3222 };
3223
3224 template <>
3225 struct Recip_SIMD<int>
3226 {
3227 bool haveSIMD;
3228 Recip_SIMD() { haveSIMD = checkHardwareSupport(CV_CPU_SSE2) || checkHardwareSupport(CV_CPU_NEON); }
3229
3230 int operator() (const int * src2, int * dst, int width, double scale) const
3231 {
3232 int x = 0;
3233
3234 if (!haveSIMD)
3235 return x;
3236
3237 v_float32x4 v_scale = v_setall_f32((float)scale);
3238 v_int32x4 v_zero = v_setzero_s32();
3239
3240 for ( ; x <= width - 8; x += 8)
3241 {
3242 v_int32x4 t0 = v_load(src2 + x);
3243 v_int32x4 t1 = v_load(src2 + x + 4);
3244
3245 v_float32x4 f0 = v_cvt_f32(t0);
3246 v_float32x4 f1 = v_cvt_f32(t1);
3247
3248 f0 = v_scale / f0;
3249 f1 = v_scale / f1;
3250
3251 v_int32x4 res0 = v_round(f0), res1 = v_round(f1);
3252
3253 res0 = v_select(t0 == v_zero, v_zero, res0);
3254 res1 = v_select(t1 == v_zero, v_zero, res1);
3255 v_store(dst + x, res0);
3256 v_store(dst + x + 4, res1);
3257 }
3258
3259 return x;
3260 }
3261 };
3262
3263
3264 template <>
3265 struct Recip_SIMD<float>
3266 {
3267 bool haveSIMD;
3268 Recip_SIMD() { haveSIMD = checkHardwareSupport(CV_CPU_SSE2) || checkHardwareSupport(CV_CPU_NEON); }
3269
3270 int operator() (const float * src2, float * dst, int width, double scale) const
3271 {
3272 int x = 0;
3273
3274 if (!haveSIMD)
3275 return x;
3276
3277 v_float32x4 v_scale = v_setall_f32((float)scale);
3278 v_float32x4 v_zero = v_setzero_f32();
3279
3280 for ( ; x <= width - 8; x += 8)
3281 {
3282 v_float32x4 f0 = v_load(src2 + x);
3283 v_float32x4 f1 = v_load(src2 + x + 4);
3284
3285 v_float32x4 res0 = v_scale / f0;
3286 v_float32x4 res1 = v_scale / f1;
3287
3288 res0 = v_select(f0 == v_zero, v_zero, res0);
3289 res1 = v_select(f1 == v_zero, v_zero, res1);
3290
3291 v_store(dst + x, res0);
3292 v_store(dst + x + 4, res1);
3293 }
3294
3295 return x;
3296 }
3297 };
3298
3299 #if CV_SIMD128_64F
3300
3301 template <>
3302 struct Div_SIMD<double>
3303 {
3304 bool haveSIMD;
3305 Div_SIMD() { haveSIMD = checkHardwareSupport(CV_CPU_SSE2) || checkHardwareSupport(CV_CPU_NEON); }
3306
3307 int operator() (const double * src1, const double * src2, double * dst, int width, double scale) const
3308 {
3309 int x = 0;
3310
3311 if (!haveSIMD)
3312 return x;
3313
3314 v_float64x2 v_scale = v_setall_f64(scale);
3315 v_float64x2 v_zero = v_setzero_f64();
3316
3317 for ( ; x <= width - 4; x += 4)
3318 {
3319 v_float64x2 f0 = v_load(src1 + x);
3320 v_float64x2 f1 = v_load(src1 + x + 2);
3321 v_float64x2 f2 = v_load(src2 + x);
3322 v_float64x2 f3 = v_load(src2 + x + 2);
3323
3324 v_float64x2 res0 = f0 * v_scale / f2;
3325 v_float64x2 res1 = f1 * v_scale / f3;
3326
3327 res0 = v_select(f2 == v_zero, v_zero, res0);  // zero divisor -> 0, matching the scalar fallback
3328 res1 = v_select(f3 == v_zero, v_zero, res1);
3329
3330 v_store(dst + x, res0);
3331 v_store(dst + x + 2, res1);
3332 }
3333
3334 return x;
3335 }
3336 };
3337
3338 template <>
3339 struct Recip_SIMD<double>
3340 {
3341 bool haveSIMD;
3342 Recip_SIMD() { haveSIMD = checkHardwareSupport(CV_CPU_SSE2) || checkHardwareSupport(CV_CPU_NEON); }
3343
3344 int operator() (const double * src2, double * dst, int width, double scale) const
3345 {
3346 int x = 0;
3347
3348 if (!haveSIMD)
3349 return x;
3350
3351 v_float64x2 v_scale = v_setall_f64(scale);
3352 v_float64x2 v_zero = v_setzero_f64();
3353
3354 for ( ; x <= width - 4; x += 4)
3355 {
3356 v_float64x2 f0 = v_load(src2 + x);
3357 v_float64x2 f1 = v_load(src2 + x + 2);
3358
3359 v_float64x2 res0 = v_scale / f0;
3360 v_float64x2 res1 = v_scale / f1;
3361
3362 res0 = v_select(f0 == v_zero, v_zero, res0);
3363 res1 = v_select(f1 == v_zero, v_zero, res1);
3364
3365 v_store(dst + x, res0);
3366 v_store(dst + x + 2, res1);
3367 }
3368
3369 return x;
3370 }
3371 };
3372
3373 #endif
3374
3375 #endif
3376
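// div_i: element-wise scaled division for integer element types. Div_SIMD processes as
// many elements per row as the vector unit allows and returns that count; the scalar
// tail finishes the row, mapping a zero denominator to a zero result.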
3377 template<typename T> static void
3378 div_i( const T* src1, size_t step1, const T* src2, size_t step2,
3379 T* dst, size_t step, Size size, double scale )
3380 {
3381 step1 /= sizeof(src1[0]);
3382 step2 /= sizeof(src2[0]);
3383 step /= sizeof(dst[0]);
3384
3385 Div_SIMD<T> vop;
3386 float scale_f = (float)scale;
3387
3388 for( ; size.height--; src1 += step1, src2 += step2, dst += step )
3389 {
3390 int i = vop(src1, src2, dst, size.width, scale);
3391 for( ; i < size.width; i++ )
3392 {
3393 T num = src1[i], denom = src2[i];
3394 dst[i] = denom != 0 ? saturate_cast<T>(num*scale_f/denom) : (T)0;
3395 }
3396 }
3397 }
3398
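// div_f: the floating-point counterpart of div_i; the scale factor keeps the element
// type T instead of being narrowed to float.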
3399 template<typename T> static void
3400 div_f( const T* src1, size_t step1, const T* src2, size_t step2,
3401 T* dst, size_t step, Size size, double scale )
3402 {
3403 T scale_f = (T)scale;
3404 step1 /= sizeof(src1[0]);
3405 step2 /= sizeof(src2[0]);
3406 step /= sizeof(dst[0]);
3407
3408 Div_SIMD<T> vop;
3409
3410 for( ; size.height--; src1 += step1, src2 += step2, dst += step )
3411 {
3412 int i = vop(src1, src2, dst, size.width, scale);
3413 for( ; i < size.width; i++ )
3414 {
3415 T num = src1[i], denom = src2[i];
3416 dst[i] = denom != 0 ? saturate_cast<T>(num*scale_f/denom) : (T)0;
3417 }
3418 }
3419 }
3420
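// recip_i / recip_f: scaled reciprocal, dst = scale / src2. The first two parameters are
// unused and exist only so these functions match the BinaryFunc signature used by the
// dispatch tables below.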
3421 template<typename T> static void
3422 recip_i( const T*, size_t, const T* src2, size_t step2,
3423 T* dst, size_t step, Size size, double scale )
3424 {
3425 step2 /= sizeof(src2[0]);
3426 step /= sizeof(dst[0]);
3427
3428 Recip_SIMD<T> vop;
3429 float scale_f = (float)scale;
3430
3431 for( ; size.height--; src2 += step2, dst += step )
3432 {
3433 int i = vop(src2, dst, size.width, scale);
3434 for( ; i < size.width; i++ )
3435 {
3436 T denom = src2[i];
3437 dst[i] = denom != 0 ? saturate_cast<T>(scale_f/denom) : (T)0;
3438 }
3439 }
3440 }
3441
3442 template<typename T> static void
3443 recip_f( const T*, size_t, const T* src2, size_t step2,
3444 T* dst, size_t step, Size size, double scale )
3445 {
3446 T scale_f = (T)scale;
3447 step2 /= sizeof(src2[0]);
3448 step /= sizeof(dst[0]);
3449
3450 Recip_SIMD<T> vop;
3451
3452 for( ; size.height--; src2 += step2, dst += step )
3453 {
3454 int i = vop(src2, dst, size.width, scale);
3455 for( ; i < size.width; i++ )
3456 {
3457 T denom = src2[i];
3458 dst[i] = denom != 0 ? saturate_cast<T>(scale_f/denom) : (T)0;
3459 }
3460 }
3461 }
3462
3463
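// Per-depth multiply wrappers. When IPP is available and the scale factor is exactly 1,
// the corresponding IPP primitive is tried first; otherwise (or if IPP fails) the call
// falls through to the generic mul_ template.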
3464 static void mul8u( const uchar* src1, size_t step1, const uchar* src2, size_t step2,
3465 uchar* dst, size_t step, Size sz, void* scale)
3466 {
3467 float fscale = (float)*(const double*)scale;
3468 #if defined HAVE_IPP
3469 CV_IPP_CHECK()
3470 {
3471 if (std::fabs(fscale - 1) <= FLT_EPSILON)
3472 {
3473 if (ippiMul_8u_C1RSfs(src1, (int)step1, src2, (int)step2, dst, (int)step, ippiSize(sz), 0) >= 0)
3474 {
3475 CV_IMPL_ADD(CV_IMPL_IPP);
3476 return;
3477 }
3478 setIppErrorStatus();
3479 }
3480 }
3481 #endif
3482 mul_(src1, step1, src2, step2, dst, step, sz, fscale);
3483 }
3484
3485 static void mul8s( const schar* src1, size_t step1, const schar* src2, size_t step2,
3486 schar* dst, size_t step, Size sz, void* scale)
3487 {
3488 mul_(src1, step1, src2, step2, dst, step, sz, (float)*(const double*)scale);
3489 }
3490
3491 static void mul16u( const ushort* src1, size_t step1, const ushort* src2, size_t step2,
3492 ushort* dst, size_t step, Size sz, void* scale)
3493 {
3494 float fscale = (float)*(const double*)scale;
3495 #if defined HAVE_IPP
3496 CV_IPP_CHECK()
3497 {
3498 if (std::fabs(fscale - 1) <= FLT_EPSILON)
3499 {
3500 if (ippiMul_16u_C1RSfs(src1, (int)step1, src2, (int)step2, dst, (int)step, ippiSize(sz), 0) >= 0)
3501 {
3502 CV_IMPL_ADD(CV_IMPL_IPP);
3503 return;
3504 }
3505 setIppErrorStatus();
3506 }
3507 }
3508 #endif
3509 mul_(src1, step1, src2, step2, dst, step, sz, fscale);
3510 }
3511
3512 static void mul16s( const short* src1, size_t step1, const short* src2, size_t step2,
3513 short* dst, size_t step, Size sz, void* scale)
3514 {
3515 float fscale = (float)*(const double*)scale;
3516 #if defined HAVE_IPP
3517 CV_IPP_CHECK()
3518 {
3519 if (std::fabs(fscale - 1) <= FLT_EPSILON)
3520 {
3521 if (ippiMul_16s_C1RSfs(src1, (int)step1, src2, (int)step2, dst, (int)step, ippiSize(sz), 0) >= 0)
3522 {
3523 CV_IMPL_ADD(CV_IMPL_IPP);
3524 return;
3525 }
3526 setIppErrorStatus();
3527 }
3528 }
3529 #endif
3530 mul_(src1, step1, src2, step2, dst, step, sz, fscale);
3531 }
3532
3533 static void mul32s( const int* src1, size_t step1, const int* src2, size_t step2,
3534 int* dst, size_t step, Size sz, void* scale)
3535 {
3536 mul_(src1, step1, src2, step2, dst, step, sz, *(const double*)scale);
3537 }
3538
3539 static void mul32f( const float* src1, size_t step1, const float* src2, size_t step2,
3540 float* dst, size_t step, Size sz, void* scale)
3541 {
3542 float fscale = (float)*(const double*)scale;
3543 #if defined HAVE_IPP
3544 CV_IPP_CHECK()
3545 {
3546 if (std::fabs(fscale - 1) <= FLT_EPSILON)
3547 {
3548 if (ippiMul_32f_C1R(src1, (int)step1, src2, (int)step2, dst, (int)step, ippiSize(sz)) >= 0)
3549 {
3550 CV_IMPL_ADD(CV_IMPL_IPP);
3551 return;
3552 }
3553 setIppErrorStatus();
3554 }
3555 }
3556 #endif
3557 mul_(src1, step1, src2, step2, dst, step, sz, fscale);
3558 }
3559
3560 static void mul64f( const double* src1, size_t step1, const double* src2, size_t step2,
3561 double* dst, size_t step, Size sz, void* scale)
3562 {
3563 mul_(src1, step1, src2, step2, dst, step, sz, *(const double*)scale);
3564 }
3565
3566 static void div8u( const uchar* src1, size_t step1, const uchar* src2, size_t step2,
3567 uchar* dst, size_t step, Size sz, void* scale)
3568 {
3569 if( src1 )
3570 div_i(src1, step1, src2, step2, dst, step, sz, *(const double*)scale);
3571 else
3572 recip_i(src1, step1, src2, step2, dst, step, sz, *(const double*)scale);
3573 }
3574
3575 static void div8s( const schar* src1, size_t step1, const schar* src2, size_t step2,
3576 schar* dst, size_t step, Size sz, void* scale)
3577 {
3578 div_i(src1, step1, src2, step2, dst, step, sz, *(const double*)scale);
3579 }
3580
3581 static void div16u( const ushort* src1, size_t step1, const ushort* src2, size_t step2,
3582 ushort* dst, size_t step, Size sz, void* scale)
3583 {
3584 div_i(src1, step1, src2, step2, dst, step, sz, *(const double*)scale);
3585 }
3586
3587 static void div16s( const short* src1, size_t step1, const short* src2, size_t step2,
3588 short* dst, size_t step, Size sz, void* scale)
3589 {
3590 div_i(src1, step1, src2, step2, dst, step, sz, *(const double*)scale);
3591 }
3592
3593 static void div32s( const int* src1, size_t step1, const int* src2, size_t step2,
3594 int* dst, size_t step, Size sz, void* scale)
3595 {
3596 div_i(src1, step1, src2, step2, dst, step, sz, *(const double*)scale);
3597 }
3598
3599 static void div32f( const float* src1, size_t step1, const float* src2, size_t step2,
3600 float* dst, size_t step, Size sz, void* scale)
3601 {
3602 div_f(src1, step1, src2, step2, dst, step, sz, *(const double*)scale);
3603 }
3604
3605 static void div64f( const double* src1, size_t step1, const double* src2, size_t step2,
3606 double* dst, size_t step, Size sz, void* scale)
3607 {
3608 div_f(src1, step1, src2, step2, dst, step, sz, *(const double*)scale);
3609 }
3610
3611 static void recip8u( const uchar* src1, size_t step1, const uchar* src2, size_t step2,
3612 uchar* dst, size_t step, Size sz, void* scale)
3613 {
3614 recip_i(src1, step1, src2, step2, dst, step, sz, *(const double*)scale);
3615 }
3616
3617 static void recip8s( const schar* src1, size_t step1, const schar* src2, size_t step2,
3618 schar* dst, size_t step, Size sz, void* scale)
3619 {
3620 recip_i(src1, step1, src2, step2, dst, step, sz, *(const double*)scale);
3621 }
3622
3623 static void recip16u( const ushort* src1, size_t step1, const ushort* src2, size_t step2,
3624 ushort* dst, size_t step, Size sz, void* scale)
3625 {
3626 recip_i(src1, step1, src2, step2, dst, step, sz, *(const double*)scale);
3627 }
3628
3629 static void recip16s( const short* src1, size_t step1, const short* src2, size_t step2,
3630 short* dst, size_t step, Size sz, void* scale)
3631 {
3632 recip_i(src1, step1, src2, step2, dst, step, sz, *(const double*)scale);
3633 }
3634
3635 static void recip32s( const int* src1, size_t step1, const int* src2, size_t step2,
3636 int* dst, size_t step, Size sz, void* scale)
3637 {
3638 recip_i(src1, step1, src2, step2, dst, step, sz, *(const double*)scale);
3639 }
3640
3641 static void recip32f( const float* src1, size_t step1, const float* src2, size_t step2,
3642 float* dst, size_t step, Size sz, void* scale)
3643 {
3644 recip_f(src1, step1, src2, step2, dst, step, sz, *(const double*)scale);
3645 }
3646
3647 static void recip64f( const double* src1, size_t step1, const double* src2, size_t step2,
3648 double* dst, size_t step, Size sz, void* scale)
3649 {
3650 recip_f(src1, step1, src2, step2, dst, step, sz, *(const double*)scale);
3651 }
3652
3653
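// Dispatch tables indexed by matrix depth (CV_8U..CV_64F); arithm_op selects the entry
// matching the source depth at run time.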
3654 static BinaryFunc* getMulTab()
3655 {
3656 static BinaryFunc mulTab[] =
3657 {
3658 (BinaryFunc)mul8u, (BinaryFunc)mul8s, (BinaryFunc)mul16u,
3659 (BinaryFunc)mul16s, (BinaryFunc)mul32s, (BinaryFunc)mul32f,
3660 (BinaryFunc)mul64f, 0
3661 };
3662
3663 return mulTab;
3664 }
3665
3666 static BinaryFunc* getDivTab()
3667 {
3668 static BinaryFunc divTab[] =
3669 {
3670 (BinaryFunc)div8u, (BinaryFunc)div8s, (BinaryFunc)div16u,
3671 (BinaryFunc)div16s, (BinaryFunc)div32s, (BinaryFunc)div32f,
3672 (BinaryFunc)div64f, 0
3673 };
3674
3675 return divTab;
3676 }
3677
3678 static BinaryFunc* getRecipTab()
3679 {
3680 static BinaryFunc recipTab[] =
3681 {
3682 (BinaryFunc)recip8u, (BinaryFunc)recip8s, (BinaryFunc)recip16u,
3683 (BinaryFunc)recip16s, (BinaryFunc)recip32s, (BinaryFunc)recip32f,
3684 (BinaryFunc)recip64f, 0
3685 };
3686
3687 return recipTab;
3688 }
3689
3690 }
3691
3692 void cv::multiply(InputArray src1, InputArray src2,
3693 OutputArray dst, double scale, int dtype)
3694 {
3695 arithm_op(src1, src2, dst, noArray(), dtype, getMulTab(),
3696 true, &scale, std::abs(scale - 1.0) < DBL_EPSILON ? OCL_OP_MUL : OCL_OP_MUL_SCALE);
3697 }
3698
3699 void cv::divide(InputArray src1, InputArray src2,
3700 OutputArray dst, double scale, int dtype)
3701 {
3702 arithm_op(src1, src2, dst, noArray(), dtype, getDivTab(), true, &scale, OCL_OP_DIV_SCALE);
3703 }
3704
3705 void cv::divide(double scale, InputArray src2,
3706 OutputArray dst, int dtype)
3707 {
3708 arithm_op(src2, src2, dst, noArray(), dtype, getRecipTab(), true, &scale, OCL_OP_RECIP_SCALE);
3709 }
3710
3711 /****************************************************************************************\
3712 * addWeighted *
3713 \****************************************************************************************/
3714
3715 namespace cv
3716 {
3717
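// AddWeighted_SIMD<T, WT> computes dst = src1*alpha + src2*beta + gamma for as many
// elements as the target vector unit can handle and returns that count. The generic
// template is a no-op returning 0, so the scalar loop in addWeighted_ does all the work.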
3718 template <typename T, typename WT>
3719 struct AddWeighted_SIMD
3720 {
3721 int operator() (const T *, const T *, T *, int, WT, WT, WT) const
3722 {
3723 return 0;
3724 }
3725 };
3726
3727 #if CV_SSE2
3728
3729 template <>
3730 struct AddWeighted_SIMD<schar, float>
3731 {
3732 AddWeighted_SIMD()
3733 {
3734 haveSSE2 = checkHardwareSupport(CV_CPU_SSE2);
3735 }
3736
3737 int operator() (const schar * src1, const schar * src2, schar * dst, int width, float alpha, float beta, float gamma) const
3738 {
3739 int x = 0;
3740
3741 if (!haveSSE2)
3742 return x;
3743
3744 __m128i v_zero = _mm_setzero_si128();
3745 __m128 v_alpha = _mm_set1_ps(alpha), v_beta = _mm_set1_ps(beta),
3746 v_gamma = _mm_set1_ps(gamma);
3747
3748 for( ; x <= width - 8; x += 8 )
3749 {
3750 __m128i v_src1 = _mm_loadl_epi64((const __m128i *)(src1 + x));
3751 __m128i v_src2 = _mm_loadl_epi64((const __m128i *)(src2 + x));
3752
3753 __m128i v_src1_p = _mm_srai_epi16(_mm_unpacklo_epi8(v_zero, v_src1), 8);
3754 __m128i v_src2_p = _mm_srai_epi16(_mm_unpacklo_epi8(v_zero, v_src2), 8);
3755
3756 __m128 v_dstf0 = _mm_mul_ps(_mm_cvtepi32_ps(_mm_srai_epi32(_mm_unpacklo_epi16(v_zero, v_src1_p), 16)), v_alpha);
3757 v_dstf0 = _mm_add_ps(_mm_add_ps(v_dstf0, v_gamma),
3758 _mm_mul_ps(_mm_cvtepi32_ps(_mm_srai_epi32(_mm_unpacklo_epi16(v_zero, v_src2_p), 16)), v_beta));
3759
3760 __m128 v_dstf1 = _mm_mul_ps(_mm_cvtepi32_ps(_mm_srai_epi32(_mm_unpackhi_epi16(v_zero, v_src1_p), 16)), v_alpha);
3761 v_dstf1 = _mm_add_ps(_mm_add_ps(v_dstf1, v_gamma),
3762 _mm_mul_ps(_mm_cvtepi32_ps(_mm_srai_epi32(_mm_unpackhi_epi16(v_zero, v_src2_p), 16)), v_beta));
3763
3764 __m128i v_dst16 = _mm_packs_epi32(_mm_cvtps_epi32(v_dstf0),
3765 _mm_cvtps_epi32(v_dstf1));
3766
3767 _mm_storel_epi64((__m128i *)(dst + x), _mm_packs_epi16(v_dst16, v_zero));
3768 }
3769
3770 return x;
3771 }
3772
3773 bool haveSSE2;
3774 };
3775
3776 template <>
3777 struct AddWeighted_SIMD<short, float>
3778 {
3779 AddWeighted_SIMD()
3780 {
3781 haveSSE2 = checkHardwareSupport(CV_CPU_SSE2);
3782 }
3783
3784 int operator() (const short * src1, const short * src2, short * dst, int width, float alpha, float beta, float gamma) const
3785 {
3786 int x = 0;
3787
3788 if (!haveSSE2)
3789 return x;
3790
3791 __m128i v_zero = _mm_setzero_si128();
3792 __m128 v_alpha = _mm_set1_ps(alpha), v_beta = _mm_set1_ps(beta),
3793 v_gamma = _mm_set1_ps(gamma);
3794
3795 for( ; x <= width - 8; x += 8 )
3796 {
3797 __m128i v_src1 = _mm_loadu_si128((const __m128i *)(src1 + x));
3798 __m128i v_src2 = _mm_loadu_si128((const __m128i *)(src2 + x));
3799
3800 __m128 v_dstf0 = _mm_mul_ps(_mm_cvtepi32_ps(_mm_srai_epi32(_mm_unpacklo_epi16(v_zero, v_src1), 16)), v_alpha);
3801 v_dstf0 = _mm_add_ps(_mm_add_ps(v_dstf0, v_gamma),
3802 _mm_mul_ps(_mm_cvtepi32_ps(_mm_srai_epi32(_mm_unpacklo_epi16(v_zero, v_src2), 16)), v_beta));
3803
3804 __m128 v_dstf1 = _mm_mul_ps(_mm_cvtepi32_ps(_mm_srai_epi32(_mm_unpackhi_epi16(v_zero, v_src1), 16)), v_alpha);
3805 v_dstf1 = _mm_add_ps(_mm_add_ps(v_dstf1, v_gamma),
3806 _mm_mul_ps(_mm_cvtepi32_ps(_mm_srai_epi32(_mm_unpackhi_epi16(v_zero, v_src2), 16)), v_beta));
3807
3808 _mm_storeu_si128((__m128i *)(dst + x), _mm_packs_epi32(_mm_cvtps_epi32(v_dstf0),
3809 _mm_cvtps_epi32(v_dstf1)));
3810 }
3811
3812 return x;
3813 }
3814
3815 bool haveSSE2;
3816 };
3817
3818 #if CV_SSE4_1
3819
3820 template <>
3821 struct AddWeighted_SIMD<ushort, float>
3822 {
3823 AddWeighted_SIMD()
3824 {
3825 haveSSE4_1 = checkHardwareSupport(CV_CPU_SSE4_1);
3826 }
3827
3828 int operator() (const ushort * src1, const ushort * src2, ushort * dst, int width, float alpha, float beta, float gamma) const
3829 {
3830 int x = 0;
3831
3832 if (!haveSSE4_1)
3833 return x;
3834
3835 __m128i v_zero = _mm_setzero_si128();
3836 __m128 v_alpha = _mm_set1_ps(alpha), v_beta = _mm_set1_ps(beta),
3837 v_gamma = _mm_set1_ps(gamma);
3838
3839 for( ; x <= width - 8; x += 8 )
3840 {
3841 __m128i v_src1 = _mm_loadu_si128((const __m128i *)(src1 + x));
3842 __m128i v_src2 = _mm_loadu_si128((const __m128i *)(src2 + x));
3843
3844 __m128 v_dstf0 = _mm_mul_ps(_mm_cvtepi32_ps(_mm_unpacklo_epi16(v_src1, v_zero)), v_alpha);
3845 v_dstf0 = _mm_add_ps(_mm_add_ps(v_dstf0, v_gamma),
3846 _mm_mul_ps(_mm_cvtepi32_ps(_mm_unpacklo_epi16(v_src2, v_zero)), v_beta));
3847
3848 __m128 v_dstf1 = _mm_mul_ps(_mm_cvtepi32_ps(_mm_unpackhi_epi16(v_src1, v_zero)), v_alpha);
3849 v_dstf1 = _mm_add_ps(_mm_add_ps(v_dstf1, v_gamma),
3850 _mm_mul_ps(_mm_cvtepi32_ps(_mm_unpackhi_epi16(v_src2, v_zero)), v_beta));
3851
3852 _mm_storeu_si128((__m128i *)(dst + x), _mm_packus_epi32(_mm_cvtps_epi32(v_dstf0),
3853 _mm_cvtps_epi32(v_dstf1)));
3854 }
3855
3856 return x;
3857 }
3858
3859 bool haveSSE4_1;
3860 };
3861
3862 #endif
3863
3864 #elif CV_NEON
3865
3866 template <>
3867 struct AddWeighted_SIMD<schar, float>
3868 {
3869 int operator() (const schar * src1, const schar * src2, schar * dst, int width, float alpha, float beta, float gamma) const
3870 {
3871 int x = 0;
3872
3873 float32x4_t g = vdupq_n_f32 (gamma);
3874
3875 for( ; x <= width - 8; x += 8 )
3876 {
3877 int8x8_t in1 = vld1_s8(src1 + x);
3878 int16x8_t in1_16 = vmovl_s8(in1);
3879 float32x4_t in1_f_l = vcvtq_f32_s32(vmovl_s16(vget_low_s16(in1_16)));
3880 float32x4_t in1_f_h = vcvtq_f32_s32(vmovl_s16(vget_high_s16(in1_16)));
3881
3882 int8x8_t in2 = vld1_s8(src2+x);
3883 int16x8_t in2_16 = vmovl_s8(in2);
3884 float32x4_t in2_f_l = vcvtq_f32_s32(vmovl_s16(vget_low_s16(in2_16)));
3885 float32x4_t in2_f_h = vcvtq_f32_s32(vmovl_s16(vget_high_s16(in2_16)));
3886
3887 float32x4_t out_f_l = vaddq_f32(vmulq_n_f32(in1_f_l, alpha), vmulq_n_f32(in2_f_l, beta));
3888 float32x4_t out_f_h = vaddq_f32(vmulq_n_f32(in1_f_h, alpha), vmulq_n_f32(in2_f_h, beta));
3889 out_f_l = vaddq_f32(out_f_l, g);
3890 out_f_h = vaddq_f32(out_f_h, g);
3891
3892 int16x4_t out_16_l = vqmovn_s32(cv_vrndq_s32_f32(out_f_l));
3893 int16x4_t out_16_h = vqmovn_s32(cv_vrndq_s32_f32(out_f_h));
3894
3895 int16x8_t out_16 = vcombine_s16(out_16_l, out_16_h);
3896 int8x8_t out = vqmovn_s16(out_16);
3897
3898 vst1_s8(dst + x, out);
3899 }
3900
3901 return x;
3902 }
3903 };
3904
3905 template <>
3906 struct AddWeighted_SIMD<ushort, float>
3907 {
3908 int operator() (const ushort * src1, const ushort * src2, ushort * dst, int width, float alpha, float beta, float gamma) const
3909 {
3910 int x = 0;
3911
3912 float32x4_t g = vdupq_n_f32(gamma);
3913
3914 for( ; x <= width - 8; x += 8 )
3915 {
3916 uint16x8_t v_src1 = vld1q_u16(src1 + x), v_src2 = vld1q_u16(src2 + x);
3917
3918 float32x4_t v_s1 = vmulq_n_f32(vcvtq_f32_u32(vmovl_u16(vget_low_u16(v_src1))), alpha);
3919 float32x4_t v_s2 = vmulq_n_f32(vcvtq_f32_u32(vmovl_u16(vget_low_u16(v_src2))), beta);
3920 uint16x4_t v_dst1 = vqmovn_u32(cv_vrndq_u32_f32(vaddq_f32(vaddq_f32(v_s1, v_s2), g)));
3921
3922 v_s1 = vmulq_n_f32(vcvtq_f32_u32(vmovl_u16(vget_high_u16(v_src1))), alpha);
3923 v_s2 = vmulq_n_f32(vcvtq_f32_u32(vmovl_u16(vget_high_u16(v_src2))), beta);
3924 uint16x4_t v_dst2 = vqmovn_u32(cv_vrndq_u32_f32(vaddq_f32(vaddq_f32(v_s1, v_s2), g)));
3925
3926 vst1q_u16(dst + x, vcombine_u16(v_dst1, v_dst2));
3927 }
3928
3929 return x;
3930 }
3931 };
3932
3933 template <>
3934 struct AddWeighted_SIMD<short, float>
3935 {
3936 int operator() (const short * src1, const short * src2, short * dst, int width, float alpha, float beta, float gamma) const
3937 {
3938 int x = 0;
3939
3940 float32x4_t g = vdupq_n_f32(gamma);
3941
3942 for( ; x <= width - 8; x += 8 )
3943 {
3944 int16x8_t v_src1 = vld1q_s16(src1 + x), v_src2 = vld1q_s16(src2 + x);
3945
3946 float32x4_t v_s1 = vmulq_n_f32(vcvtq_f32_s32(vmovl_s16(vget_low_s16(v_src1))), alpha);
3947 float32x4_t v_s2 = vmulq_n_f32(vcvtq_f32_s32(vmovl_s16(vget_low_s16(v_src2))), beta);
3948 int16x4_t v_dst1 = vqmovn_s32(cv_vrndq_s32_f32(vaddq_f32(vaddq_f32(v_s1, v_s2), g)));
3949
3950 v_s1 = vmulq_n_f32(vcvtq_f32_s32(vmovl_s16(vget_high_s16(v_src1))), alpha);
3951 v_s2 = vmulq_n_f32(vcvtq_f32_s32(vmovl_s16(vget_high_s16(v_src2))), beta);
3952 int16x4_t v_dst2 = vqmovn_s32(cv_vrndq_s32_f32(vaddq_f32(vaddq_f32(v_s1, v_s2), g)));
3953
3954 vst1q_s16(dst + x, vcombine_s16(v_dst1, v_dst2));
3955 }
3956
3957 return x;
3958 }
3959 };
3960
3961 #endif
3962
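// Generic addWeighted kernel: SIMD front end, optional unrolled scalar middle section,
// plain scalar tail. WT is the wider intermediate type used for the arithmetic.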
3963 template<typename T, typename WT> static void
3964 addWeighted_( const T* src1, size_t step1, const T* src2, size_t step2,
3965 T* dst, size_t step, Size size, void* _scalars )
3966 {
3967 const double* scalars = (const double*)_scalars;
3968 WT alpha = (WT)scalars[0], beta = (WT)scalars[1], gamma = (WT)scalars[2];
3969 step1 /= sizeof(src1[0]);
3970 step2 /= sizeof(src2[0]);
3971 step /= sizeof(dst[0]);
3972
3973 AddWeighted_SIMD<T, WT> vop;
3974
3975 for( ; size.height--; src1 += step1, src2 += step2, dst += step )
3976 {
3977 int x = vop(src1, src2, dst, size.width, alpha, beta, gamma);
3978 #if CV_ENABLE_UNROLLED
3979 for( ; x <= size.width - 4; x += 4 )
3980 {
3981 T t0 = saturate_cast<T>(src1[x]*alpha + src2[x]*beta + gamma);
3982 T t1 = saturate_cast<T>(src1[x+1]*alpha + src2[x+1]*beta + gamma);
3983 dst[x] = t0; dst[x+1] = t1;
3984
3985 t0 = saturate_cast<T>(src1[x+2]*alpha + src2[x+2]*beta + gamma);
3986 t1 = saturate_cast<T>(src1[x+3]*alpha + src2[x+3]*beta + gamma);
3987 dst[x+2] = t0; dst[x+3] = t1;
3988 }
3989 #endif
3990 for( ; x < size.width; x++ )
3991 dst[x] = saturate_cast<T>(src1[x]*alpha + src2[x]*beta + gamma);
3992 }
3993 }
3994
3995
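// addWeighted8u is written out by hand instead of instantiating addWeighted_<uchar, float>,
// giving the common 8-bit case dedicated SSE2/NEON inner loops.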
3996 static void
3997 addWeighted8u( const uchar* src1, size_t step1,
3998 const uchar* src2, size_t step2,
3999 uchar* dst, size_t step, Size size,
4000 void* _scalars )
4001 {
4002 const double* scalars = (const double*)_scalars;
4003 float alpha = (float)scalars[0], beta = (float)scalars[1], gamma = (float)scalars[2];
4004
4005 for( ; size.height--; src1 += step1, src2 += step2, dst += step )
4006 {
4007 int x = 0;
4008
4009 #if CV_SSE2
4010 if( USE_SSE2 )
4011 {
4012 __m128 a4 = _mm_set1_ps(alpha), b4 = _mm_set1_ps(beta), g4 = _mm_set1_ps(gamma);
4013 __m128i z = _mm_setzero_si128();
4014
4015 for( ; x <= size.width - 8; x += 8 )
4016 {
4017 __m128i u = _mm_unpacklo_epi8(_mm_loadl_epi64((const __m128i*)(src1 + x)), z);
4018 __m128i v = _mm_unpacklo_epi8(_mm_loadl_epi64((const __m128i*)(src2 + x)), z);
4019
4020 __m128 u0 = _mm_cvtepi32_ps(_mm_unpacklo_epi16(u, z));
4021 __m128 u1 = _mm_cvtepi32_ps(_mm_unpackhi_epi16(u, z));
4022 __m128 v0 = _mm_cvtepi32_ps(_mm_unpacklo_epi16(v, z));
4023 __m128 v1 = _mm_cvtepi32_ps(_mm_unpackhi_epi16(v, z));
4024
4025 u0 = _mm_add_ps(_mm_mul_ps(u0, a4), _mm_mul_ps(v0, b4));
4026 u1 = _mm_add_ps(_mm_mul_ps(u1, a4), _mm_mul_ps(v1, b4));
4027 u0 = _mm_add_ps(u0, g4); u1 = _mm_add_ps(u1, g4);
4028
4029 u = _mm_packs_epi32(_mm_cvtps_epi32(u0), _mm_cvtps_epi32(u1));
4030 u = _mm_packus_epi16(u, u);
4031
4032 _mm_storel_epi64((__m128i*)(dst + x), u);
4033 }
4034 }
4035 #elif CV_NEON
4036 float32x4_t g = vdupq_n_f32 (gamma);
4037
4038 for( ; x <= size.width - 8; x += 8 )
4039 {
4040 uint8x8_t in1 = vld1_u8(src1+x);
4041 uint16x8_t in1_16 = vmovl_u8(in1);
4042 float32x4_t in1_f_l = vcvtq_f32_u32(vmovl_u16(vget_low_u16(in1_16)));
4043 float32x4_t in1_f_h = vcvtq_f32_u32(vmovl_u16(vget_high_u16(in1_16)));
4044
4045 uint8x8_t in2 = vld1_u8(src2+x);
4046 uint16x8_t in2_16 = vmovl_u8(in2);
4047 float32x4_t in2_f_l = vcvtq_f32_u32(vmovl_u16(vget_low_u16(in2_16)));
4048 float32x4_t in2_f_h = vcvtq_f32_u32(vmovl_u16(vget_high_u16(in2_16)));
4049
4050 float32x4_t out_f_l = vaddq_f32(vmulq_n_f32(in1_f_l, alpha), vmulq_n_f32(in2_f_l, beta));
4051 float32x4_t out_f_h = vaddq_f32(vmulq_n_f32(in1_f_h, alpha), vmulq_n_f32(in2_f_h, beta));
4052 out_f_l = vaddq_f32(out_f_l, g);
4053 out_f_h = vaddq_f32(out_f_h, g);
4054
4055 uint16x4_t out_16_l = vqmovun_s32(cv_vrndq_s32_f32(out_f_l));
4056 uint16x4_t out_16_h = vqmovun_s32(cv_vrndq_s32_f32(out_f_h));
4057
4058 uint16x8_t out_16 = vcombine_u16(out_16_l, out_16_h);
4059 uint8x8_t out = vqmovn_u16(out_16);
4060
4061 vst1_u8(dst+x, out);
4062 }
4063 #endif
4064 #if CV_ENABLE_UNROLLED
4065 for( ; x <= size.width - 4; x += 4 )
4066 {
4067 float t0, t1;
4068 t0 = CV_8TO32F(src1[x])*alpha + CV_8TO32F(src2[x])*beta + gamma;
4069 t1 = CV_8TO32F(src1[x+1])*alpha + CV_8TO32F(src2[x+1])*beta + gamma;
4070
4071 dst[x] = saturate_cast<uchar>(t0);
4072 dst[x+1] = saturate_cast<uchar>(t1);
4073
4074 t0 = CV_8TO32F(src1[x+2])*alpha + CV_8TO32F(src2[x+2])*beta + gamma;
4075 t1 = CV_8TO32F(src1[x+3])*alpha + CV_8TO32F(src2[x+3])*beta + gamma;
4076
4077 dst[x+2] = saturate_cast<uchar>(t0);
4078 dst[x+3] = saturate_cast<uchar>(t1);
4079 }
4080 #endif
4081
4082 for( ; x < size.width; x++ )
4083 {
4084 float t0 = CV_8TO32F(src1[x])*alpha + CV_8TO32F(src2[x])*beta + gamma;
4085 dst[x] = saturate_cast<uchar>(t0);
4086 }
4087 }
4088 }
4089
4090 static void addWeighted8s( const schar* src1, size_t step1, const schar* src2, size_t step2,
4091 schar* dst, size_t step, Size sz, void* scalars )
4092 {
4093 addWeighted_<schar, float>(src1, step1, src2, step2, dst, step, sz, scalars);
4094 }
4095
4096 static void addWeighted16u( const ushort* src1, size_t step1, const ushort* src2, size_t step2,
4097 ushort* dst, size_t step, Size sz, void* scalars )
4098 {
4099 addWeighted_<ushort, float>(src1, step1, src2, step2, dst, step, sz, scalars);
4100 }
4101
4102 static void addWeighted16s( const short* src1, size_t step1, const short* src2, size_t step2,
4103 short* dst, size_t step, Size sz, void* scalars )
4104 {
4105 addWeighted_<short, float>(src1, step1, src2, step2, dst, step, sz, scalars);
4106 }
4107
4108 static void addWeighted32s( const int* src1, size_t step1, const int* src2, size_t step2,
4109 int* dst, size_t step, Size sz, void* scalars )
4110 {
4111 addWeighted_<int, double>(src1, step1, src2, step2, dst, step, sz, scalars);
4112 }
4113
4114 static void addWeighted32f( const float* src1, size_t step1, const float* src2, size_t step2,
4115 float* dst, size_t step, Size sz, void* scalars )
4116 {
4117 addWeighted_<float, double>(src1, step1, src2, step2, dst, step, sz, scalars);
4118 }
4119
4120 static void addWeighted64f( const double* src1, size_t step1, const double* src2, size_t step2,
4121 double* dst, size_t step, Size sz, void* scalars )
4122 {
4123 addWeighted_<double, double>(src1, step1, src2, step2, dst, step, sz, scalars);
4124 }
4125
4126 static BinaryFunc* getAddWeightedTab()
4127 {
4128 static BinaryFunc addWeightedTab[] =
4129 {
4130 (BinaryFunc)GET_OPTIMIZED(addWeighted8u), (BinaryFunc)GET_OPTIMIZED(addWeighted8s), (BinaryFunc)GET_OPTIMIZED(addWeighted16u),
4131 (BinaryFunc)GET_OPTIMIZED(addWeighted16s), (BinaryFunc)GET_OPTIMIZED(addWeighted32s), (BinaryFunc)addWeighted32f,
4132 (BinaryFunc)addWeighted64f, 0
4133 };
4134
4135 return addWeightedTab;
4136 }
4137
4138 }
4139
4140 void cv::addWeighted( InputArray src1, double alpha, InputArray src2,
4141 double beta, double gamma, OutputArray dst, int dtype )
4142 {
4143 double scalars[] = {alpha, beta, gamma};
4144 arithm_op(src1, src2, dst, noArray(), dtype, getAddWeightedTab(), true, scalars, OCL_OP_ADDW);
4145 }
4146
4147
4148 /****************************************************************************************\
4149 * compare *
4150 \****************************************************************************************/
4151
4152 namespace cv
4153 {
4154
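// Cmp_SIMD<T> vectorizes the element-wise comparisons behind cv::compare. Only CMP_GT,
// CMP_LE, CMP_EQ and CMP_NE are implemented; cmp_ rewrites CMP_GE/CMP_LT in terms of
// these before calling the functor.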
4155 template <typename T>
4156 struct Cmp_SIMD
4157 {
4158 explicit Cmp_SIMD(int)
4159 {
4160 }
4161
4162 int operator () (const T *, const T *, uchar *, int) const
4163 {
4164 return 0;
4165 }
4166 };
4167
4168 #if CV_NEON
4169
4170 template <>
4171 struct Cmp_SIMD<schar>
4172 {
4173 explicit Cmp_SIMD(int code_) :
4174 code(code_)
4175 {
4176 CV_Assert(code == CMP_GT || code == CMP_LE ||
4177 code == CMP_EQ || code == CMP_NE);
4178
4179 v_mask = vdupq_n_u8(255);
4180 }
4181
4182 int operator () (const schar * src1, const schar * src2, uchar * dst, int width) const
4183 {
4184 int x = 0;
4185
4186 if (code == CMP_GT)
4187 for ( ; x <= width - 16; x += 16)
4188 vst1q_u8(dst + x, vcgtq_s8(vld1q_s8(src1 + x), vld1q_s8(src2 + x)));
4189 else if (code == CMP_LE)
4190 for ( ; x <= width - 16; x += 16)
4191 vst1q_u8(dst + x, vcleq_s8(vld1q_s8(src1 + x), vld1q_s8(src2 + x)));
4192 else if (code == CMP_EQ)
4193 for ( ; x <= width - 16; x += 16)
4194 vst1q_u8(dst + x, vceqq_s8(vld1q_s8(src1 + x), vld1q_s8(src2 + x)));
4195 else if (code == CMP_NE)
4196 for ( ; x <= width - 16; x += 16)
4197 vst1q_u8(dst + x, veorq_u8(vceqq_s8(vld1q_s8(src1 + x), vld1q_s8(src2 + x)), v_mask));
4198
4199 return x;
4200 }
4201
4202 int code;
4203 uint8x16_t v_mask;
4204 };
4205
4206 template <>
4207 struct Cmp_SIMD<ushort>
4208 {
4209 explicit Cmp_SIMD(int code_) :
4210 code(code_)
4211 {
4212 CV_Assert(code == CMP_GT || code == CMP_LE ||
4213 code == CMP_EQ || code == CMP_NE);
4214
4215 v_mask = vdup_n_u8(255);
4216 }
4217
4218 int operator () (const ushort * src1, const ushort * src2, uchar * dst, int width) const
4219 {
4220 int x = 0;
4221
4222 if (code == CMP_GT)
4223 for ( ; x <= width - 8; x += 8)
4224 {
4225 uint16x8_t v_dst = vcgtq_u16(vld1q_u16(src1 + x), vld1q_u16(src2 + x));
4226 vst1_u8(dst + x, vmovn_u16(v_dst));
4227 }
4228 else if (code == CMP_LE)
4229 for ( ; x <= width - 8; x += 8)
4230 {
4231 uint16x8_t v_dst = vcleq_u16(vld1q_u16(src1 + x), vld1q_u16(src2 + x));
4232 vst1_u8(dst + x, vmovn_u16(v_dst));
4233 }
4234 else if (code == CMP_EQ)
4235 for ( ; x <= width - 8; x += 8)
4236 {
4237 uint16x8_t v_dst = vceqq_u16(vld1q_u16(src1 + x), vld1q_u16(src2 + x));
4238 vst1_u8(dst + x, vmovn_u16(v_dst));
4239 }
4240 else if (code == CMP_NE)
4241 for ( ; x <= width - 8; x += 8)
4242 {
4243 uint16x8_t v_dst = vceqq_u16(vld1q_u16(src1 + x), vld1q_u16(src2 + x));
4244 vst1_u8(dst + x, veor_u8(vmovn_u16(v_dst), v_mask));
4245 }
4246
4247 return x;
4248 }
4249
4250 int code;
4251 uint8x8_t v_mask;
4252 };
4253
4254 template <>
4255 struct Cmp_SIMD<int>
4256 {
4257 explicit Cmp_SIMD(int code_) :
4258 code(code_)
4259 {
4260 CV_Assert(code == CMP_GT || code == CMP_LE ||
4261 code == CMP_EQ || code == CMP_NE);
4262
4263 v_mask = vdup_n_u8(255);
4264 }
4265
4266 int operator () (const int * src1, const int * src2, uchar * dst, int width) const
4267 {
4268 int x = 0;
4269
4270 if (code == CMP_GT)
4271 for ( ; x <= width - 8; x += 8)
4272 {
4273 uint32x4_t v_dst1 = vcgtq_s32(vld1q_s32(src1 + x), vld1q_s32(src2 + x));
4274 uint32x4_t v_dst2 = vcgtq_s32(vld1q_s32(src1 + x + 4), vld1q_s32(src2 + x + 4));
4275 vst1_u8(dst + x, vmovn_u16(vcombine_u16(vmovn_u32(v_dst1), vmovn_u32(v_dst2))));
4276 }
4277 else if (code == CMP_LE)
4278 for ( ; x <= width - 8; x += 8)
4279 {
4280 uint32x4_t v_dst1 = vcleq_s32(vld1q_s32(src1 + x), vld1q_s32(src2 + x));
4281 uint32x4_t v_dst2 = vcleq_s32(vld1q_s32(src1 + x + 4), vld1q_s32(src2 + x + 4));
4282 vst1_u8(dst + x, vmovn_u16(vcombine_u16(vmovn_u32(v_dst1), vmovn_u32(v_dst2))));
4283 }
4284 else if (code == CMP_EQ)
4285 for ( ; x <= width - 8; x += 8)
4286 {
4287 uint32x4_t v_dst1 = vceqq_s32(vld1q_s32(src1 + x), vld1q_s32(src2 + x));
4288 uint32x4_t v_dst2 = vceqq_s32(vld1q_s32(src1 + x + 4), vld1q_s32(src2 + x + 4));
4289 vst1_u8(dst + x, vmovn_u16(vcombine_u16(vmovn_u32(v_dst1), vmovn_u32(v_dst2))));
4290 }
4291 else if (code == CMP_NE)
4292 for ( ; x <= width - 8; x += 8)
4293 {
4294 uint32x4_t v_dst1 = vceqq_s32(vld1q_s32(src1 + x), vld1q_s32(src2 + x));
4295 uint32x4_t v_dst2 = vceqq_s32(vld1q_s32(src1 + x + 4), vld1q_s32(src2 + x + 4));
4296 uint8x8_t v_dst = vmovn_u16(vcombine_u16(vmovn_u32(v_dst1), vmovn_u32(v_dst2)));
4297 vst1_u8(dst + x, veor_u8(v_dst, v_mask));
4298 }
4299
4300 return x;
4301 }
4302
4303 int code;
4304 uint8x8_t v_mask;
4305 };
4306
4307 template <>
4308 struct Cmp_SIMD<float>
4309 {
4310 explicit Cmp_SIMD(int code_) :
4311 code(code_)
4312 {
4313 CV_Assert(code == CMP_GT || code == CMP_LE ||
4314 code == CMP_EQ || code == CMP_NE);
4315
4316 v_mask = vdup_n_u8(255);
4317 }
4318
4319 int operator () (const float * src1, const float * src2, uchar * dst, int width) const
4320 {
4321 int x = 0;
4322
4323 if (code == CMP_GT)
4324 for ( ; x <= width - 8; x += 8)
4325 {
4326 uint32x4_t v_dst1 = vcgtq_f32(vld1q_f32(src1 + x), vld1q_f32(src2 + x));
4327 uint32x4_t v_dst2 = vcgtq_f32(vld1q_f32(src1 + x + 4), vld1q_f32(src2 + x + 4));
4328 vst1_u8(dst + x, vmovn_u16(vcombine_u16(vmovn_u32(v_dst1), vmovn_u32(v_dst2))));
4329 }
4330 else if (code == CMP_LE)
4331 for ( ; x <= width - 8; x += 8)
4332 {
4333 uint32x4_t v_dst1 = vcleq_f32(vld1q_f32(src1 + x), vld1q_f32(src2 + x));
4334 uint32x4_t v_dst2 = vcleq_f32(vld1q_f32(src1 + x + 4), vld1q_f32(src2 + x + 4));
4335 vst1_u8(dst + x, vmovn_u16(vcombine_u16(vmovn_u32(v_dst1), vmovn_u32(v_dst2))));
4336 }
4337 else if (code == CMP_EQ)
4338 for ( ; x <= width - 8; x += 8)
4339 {
4340 uint32x4_t v_dst1 = vceqq_f32(vld1q_f32(src1 + x), vld1q_f32(src2 + x));
4341 uint32x4_t v_dst2 = vceqq_f32(vld1q_f32(src1 + x + 4), vld1q_f32(src2 + x + 4));
4342 vst1_u8(dst + x, vmovn_u16(vcombine_u16(vmovn_u32(v_dst1), vmovn_u32(v_dst2))));
4343 }
4344 else if (code == CMP_NE)
4345 for ( ; x <= width - 8; x += 8)
4346 {
4347 uint32x4_t v_dst1 = vceqq_f32(vld1q_f32(src1 + x), vld1q_f32(src2 + x));
4348 uint32x4_t v_dst2 = vceqq_f32(vld1q_f32(src1 + x + 4), vld1q_f32(src2 + x + 4));
4349 uint8x8_t v_dst = vmovn_u16(vcombine_u16(vmovn_u32(v_dst1), vmovn_u32(v_dst2)));
4350 vst1_u8(dst + x, veor_u8(v_dst, v_mask));
4351 }
4352
4353 return x;
4354 }
4355
4356 int code;
4357 uint8x8_t v_mask;
4358 };
4359
4360 #elif CV_SSE2
4361
4362 template <>
4363 struct Cmp_SIMD<schar>
4364 {
4365 explicit Cmp_SIMD(int code_) :
4366 code(code_)
4367 {
4368 CV_Assert(code == CMP_GT || code == CMP_LE ||
4369 code == CMP_EQ || code == CMP_NE);
4370
4371 haveSSE = checkHardwareSupport(CV_CPU_SSE2);
4372
4373 v_mask = _mm_set1_epi8(-1);
4374 }
4375
4376 int operator () (const schar * src1, const schar * src2, uchar * dst, int width) const
4377 {
4378 int x = 0;
4379
4380 if (!haveSSE)
4381 return x;
4382
4383 if (code == CMP_GT)
4384 for ( ; x <= width - 16; x += 16)
4385 _mm_storeu_si128((__m128i *)(dst + x), _mm_cmpgt_epi8(_mm_loadu_si128((const __m128i *)(src1 + x)),
4386 _mm_loadu_si128((const __m128i *)(src2 + x))));
4387 else if (code == CMP_LE)
4388 for ( ; x <= width - 16; x += 16)
4389 {
4390 __m128i v_gt = _mm_cmpgt_epi8(_mm_loadu_si128((const __m128i *)(src1 + x)),
4391 _mm_loadu_si128((const __m128i *)(src2 + x)));
4392 _mm_storeu_si128((__m128i *)(dst + x), _mm_xor_si128(v_mask, v_gt));
4393 }
4394 else if (code == CMP_EQ)
4395 for ( ; x <= width - 16; x += 16)
4396 _mm_storeu_si128((__m128i *)(dst + x), _mm_cmpeq_epi8(_mm_loadu_si128((const __m128i *)(src1 + x)),
4397 _mm_loadu_si128((const __m128i *)(src2 + x))));
4398 else if (code == CMP_NE)
4399 for ( ; x <= width - 16; x += 16)
4400 {
4401 __m128i v_eq = _mm_cmpeq_epi8(_mm_loadu_si128((const __m128i *)(src1 + x)),
4402 _mm_loadu_si128((const __m128i *)(src2 + x)));
4403 _mm_storeu_si128((__m128i *)(dst + x), _mm_xor_si128(v_mask, v_eq));
4404 }
4405
4406 return x;
4407 }
4408
4409 int code;
4410 __m128i v_mask;
4411 bool haveSSE;
4412 };
4413
4414 template <>
4415 struct Cmp_SIMD<int>
4416 {
4417 explicit Cmp_SIMD(int code_) :
4418 code(code_)
4419 {
4420 CV_Assert(code == CMP_GT || code == CMP_LE ||
4421 code == CMP_EQ || code == CMP_NE);
4422
4423 haveSSE = checkHardwareSupport(CV_CPU_SSE2);
4424
4425 v_mask = _mm_set1_epi32(0xffffffff);
4426 }
4427
4428 int operator () (const int * src1, const int * src2, uchar * dst, int width) const
4429 {
4430 int x = 0;
4431
4432 if (!haveSSE)
4433 return x;
4434
4435 if (code == CMP_GT)
4436 for ( ; x <= width - 8; x += 8)
4437 {
4438 __m128i v_dst0 = _mm_cmpgt_epi32(_mm_loadu_si128((const __m128i *)(src1 + x)),
4439 _mm_loadu_si128((const __m128i *)(src2 + x)));
4440 __m128i v_dst1 = _mm_cmpgt_epi32(_mm_loadu_si128((const __m128i *)(src1 + x + 4)),
4441 _mm_loadu_si128((const __m128i *)(src2 + x + 4)));
4442
4443 _mm_storel_epi64((__m128i *)(dst + x), _mm_packs_epi16(_mm_packs_epi32(v_dst0, v_dst1), v_mask));
4444 }
4445 else if (code == CMP_LE)
4446 for ( ; x <= width - 8; x += 8)
4447 {
4448 __m128i v_dst0 = _mm_cmpgt_epi32(_mm_loadu_si128((const __m128i *)(src1 + x)),
4449 _mm_loadu_si128((const __m128i *)(src2 + x)));
4450 __m128i v_dst1 = _mm_cmpgt_epi32(_mm_loadu_si128((const __m128i *)(src1 + x + 4)),
4451 _mm_loadu_si128((const __m128i *)(src2 + x + 4)));
4452
4453 _mm_storel_epi64((__m128i *)(dst + x), _mm_xor_si128(_mm_packs_epi16(_mm_packs_epi32(v_dst0, v_dst1), v_mask), v_mask));
4454 }
4455 else if (code == CMP_EQ)
4456 for ( ; x <= width - 8; x += 8)
4457 {
4458 __m128i v_dst0 = _mm_cmpeq_epi32(_mm_loadu_si128((const __m128i *)(src1 + x)),
4459 _mm_loadu_si128((const __m128i *)(src2 + x)));
4460 __m128i v_dst1 = _mm_cmpeq_epi32(_mm_loadu_si128((const __m128i *)(src1 + x + 4)),
4461 _mm_loadu_si128((const __m128i *)(src2 + x + 4)));
4462
4463 _mm_storel_epi64((__m128i *)(dst + x), _mm_packs_epi16(_mm_packs_epi32(v_dst0, v_dst1), v_mask));
4464 }
4465 else if (code == CMP_NE)
4466 for ( ; x <= width - 8; x += 8)
4467 {
4468 __m128i v_dst0 = _mm_cmpeq_epi32(_mm_loadu_si128((const __m128i *)(src1 + x)),
4469 _mm_loadu_si128((const __m128i *)(src2 + x)));
4470 __m128i v_dst1 = _mm_cmpeq_epi32(_mm_loadu_si128((const __m128i *)(src1 + x + 4)),
4471 _mm_loadu_si128((const __m128i *)(src2 + x + 4)));
4472
4473 _mm_storel_epi64((__m128i *)(dst + x), _mm_xor_si128(v_mask, _mm_packs_epi16(_mm_packs_epi32(v_dst0, v_dst1), v_mask)));
4474 }
4475
4476 return x;
4477 }
4478
4479 int code;
4480 __m128i v_mask;
4481 bool haveSSE;
4482 };
4483
4484 #endif
4485
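// cmp_: generic comparison kernel. CMP_GE and CMP_LT are reduced to CMP_LE and CMP_GT by
// swapping the operands, so only four codes reach the inner loops. The "-(cond) ^ m"
// idiom turns the boolean result into 0x00/0xFF with an optional inversion mask m.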
4486 template<typename T> static void
4487 cmp_(const T* src1, size_t step1, const T* src2, size_t step2,
4488 uchar* dst, size_t step, Size size, int code)
4489 {
4490 step1 /= sizeof(src1[0]);
4491 step2 /= sizeof(src2[0]);
4492 if( code == CMP_GE || code == CMP_LT )
4493 {
4494 std::swap(src1, src2);
4495 std::swap(step1, step2);
4496 code = code == CMP_GE ? CMP_LE : CMP_GT;
4497 }
4498
4499 Cmp_SIMD<T> vop(code);
4500
4501 if( code == CMP_GT || code == CMP_LE )
4502 {
4503 int m = code == CMP_GT ? 0 : 255;
4504 for( ; size.height--; src1 += step1, src2 += step2, dst += step )
4505 {
4506 int x = vop(src1, src2, dst, size.width);
4507 #if CV_ENABLE_UNROLLED
4508 for( ; x <= size.width - 4; x += 4 )
4509 {
4510 int t0, t1;
4511 t0 = -(src1[x] > src2[x]) ^ m;
4512 t1 = -(src1[x+1] > src2[x+1]) ^ m;
4513 dst[x] = (uchar)t0; dst[x+1] = (uchar)t1;
4514 t0 = -(src1[x+2] > src2[x+2]) ^ m;
4515 t1 = -(src1[x+3] > src2[x+3]) ^ m;
4516 dst[x+2] = (uchar)t0; dst[x+3] = (uchar)t1;
4517 }
4518 #endif
4519 for( ; x < size.width; x++ )
4520 dst[x] = (uchar)(-(src1[x] > src2[x]) ^ m);
4521 }
4522 }
4523 else if( code == CMP_EQ || code == CMP_NE )
4524 {
4525 int m = code == CMP_EQ ? 0 : 255;
4526 for( ; size.height--; src1 += step1, src2 += step2, dst += step )
4527 {
4528 int x = 0;
4529 #if CV_ENABLE_UNROLLED
4530 for( ; x <= size.width - 4; x += 4 )
4531 {
4532 int t0, t1;
4533 t0 = -(src1[x] == src2[x]) ^ m;
4534 t1 = -(src1[x+1] == src2[x+1]) ^ m;
4535 dst[x] = (uchar)t0; dst[x+1] = (uchar)t1;
4536 t0 = -(src1[x+2] == src2[x+2]) ^ m;
4537 t1 = -(src1[x+3] == src2[x+3]) ^ m;
4538 dst[x+2] = (uchar)t0; dst[x+3] = (uchar)t1;
4539 }
4540 #endif
4541 for( ; x < size.width; x++ )
4542 dst[x] = (uchar)(-(src1[x] == src2[x]) ^ m);
4543 }
4544 }
4545 }
4546
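// convert_cmp maps OpenCV comparison codes onto IPP's IppCmpOp; CMP_NE has no IPP
// counterpart and yields -1, which disables the IPP fast path.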
4547 #if ARITHM_USE_IPP
4548 inline static IppCmpOp convert_cmp(int _cmpop)
4549 {
4550 return _cmpop == CMP_EQ ? ippCmpEq :
4551 _cmpop == CMP_GT ? ippCmpGreater :
4552 _cmpop == CMP_GE ? ippCmpGreaterEq :
4553 _cmpop == CMP_LT ? ippCmpLess :
4554 _cmpop == CMP_LE ? ippCmpLessEq :
4555 (IppCmpOp)-1;
4556 }
4557 #endif
4558
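// cmp8u and cmp16s below carry manually vectorized SSE2/NEON loops in place of the
// generic cmp_ call (kept as a comment); the other depths use IPP where possible and
// fall back to cmp_.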
4559 static void cmp8u(const uchar* src1, size_t step1, const uchar* src2, size_t step2,
4560 uchar* dst, size_t step, Size size, void* _cmpop)
4561 {
4562 #if ARITHM_USE_IPP
4563 CV_IPP_CHECK()
4564 {
4565 IppCmpOp op = convert_cmp(*(int *)_cmpop);
4566 if( op >= 0 )
4567 {
4568 fixSteps(size, sizeof(dst[0]), step1, step2, step);
4569 if (0 <= ippiCompare_8u_C1R(src1, (int)step1, src2, (int)step2, dst, (int)step, ippiSize(size), op))
4570 {
4571 CV_IMPL_ADD(CV_IMPL_IPP);
4572 return;
4573 }
4574 setIppErrorStatus();
4575 }
4576 }
4577 #endif
4578 //vz optimized cmp_(src1, step1, src2, step2, dst, step, size, *(int*)_cmpop);
4579 int code = *(int*)_cmpop;
4580 step1 /= sizeof(src1[0]);
4581 step2 /= sizeof(src2[0]);
4582 if( code == CMP_GE || code == CMP_LT )
4583 {
4584 std::swap(src1, src2);
4585 std::swap(step1, step2);
4586 code = code == CMP_GE ? CMP_LE : CMP_GT;
4587 }
4588
4589 if( code == CMP_GT || code == CMP_LE )
4590 {
4591 int m = code == CMP_GT ? 0 : 255;
4592 for( ; size.height--; src1 += step1, src2 += step2, dst += step )
4593 {
4594 int x = 0;
4595 #if CV_SSE2
4596 if( USE_SSE2 )
4597 {
4598 __m128i m128 = code == CMP_GT ? _mm_setzero_si128() : _mm_set1_epi8 (-1);
4599 __m128i c128 = _mm_set1_epi8 (-128);
4600 for( ; x <= size.width - 16; x += 16 )
4601 {
4602 __m128i r00 = _mm_loadu_si128((const __m128i*)(src1 + x));
4603 __m128i r10 = _mm_loadu_si128((const __m128i*)(src2 + x));
4604 // SSE2 has no unsigned 8-bit compare, so bias both operands into the signed range first
4605 r00 = _mm_sub_epi8(r00,c128);
4606 r10 = _mm_sub_epi8(r10,c128);
4607
4608 r00 =_mm_xor_si128(_mm_cmpgt_epi8(r00, r10), m128);
4609 _mm_storeu_si128((__m128i*)(dst + x),r00);
4610
4611 }
4612 }
4613 #elif CV_NEON
4614 uint8x16_t mask = code == CMP_GT ? vdupq_n_u8(0) : vdupq_n_u8(255);
4615
4616 for( ; x <= size.width - 16; x += 16 )
4617 {
4618 vst1q_u8(dst+x, veorq_u8(vcgtq_u8(vld1q_u8(src1+x), vld1q_u8(src2+x)), mask));
4619 }
4620
4621 #endif
4622
4623 for( ; x < size.width; x++ ){
4624 dst[x] = (uchar)(-(src1[x] > src2[x]) ^ m);
4625 }
4626 }
4627 }
4628 else if( code == CMP_EQ || code == CMP_NE )
4629 {
4630 int m = code == CMP_EQ ? 0 : 255;
4631 for( ; size.height--; src1 += step1, src2 += step2, dst += step )
4632 {
4633 int x = 0;
4634 #if CV_SSE2
4635 if( USE_SSE2 )
4636 {
4637 __m128i m128 = code == CMP_EQ ? _mm_setzero_si128() : _mm_set1_epi8 (-1);
4638 for( ; x <= size.width - 16; x += 16 )
4639 {
4640 __m128i r00 = _mm_loadu_si128((const __m128i*)(src1 + x));
4641 __m128i r10 = _mm_loadu_si128((const __m128i*)(src2 + x));
4642 r00 = _mm_xor_si128 ( _mm_cmpeq_epi8 (r00, r10), m128);
4643 _mm_storeu_si128((__m128i*)(dst + x), r00);
4644 }
4645 }
4646 #elif CV_NEON
4647 uint8x16_t mask = code == CMP_EQ ? vdupq_n_u8(0) : vdupq_n_u8(255);
4648
4649 for( ; x <= size.width - 16; x += 16 )
4650 {
4651 vst1q_u8(dst+x, veorq_u8(vceqq_u8(vld1q_u8(src1+x), vld1q_u8(src2+x)), mask));
4652 }
4653 #endif
4654 for( ; x < size.width; x++ )
4655 dst[x] = (uchar)(-(src1[x] == src2[x]) ^ m);
4656 }
4657 }
4658 }
4659
4660 static void cmp8s(const schar* src1, size_t step1, const schar* src2, size_t step2,
4661 uchar* dst, size_t step, Size size, void* _cmpop)
4662 {
4663 cmp_(src1, step1, src2, step2, dst, step, size, *(int*)_cmpop);
4664 }
4665
4666 static void cmp16u(const ushort* src1, size_t step1, const ushort* src2, size_t step2,
4667 uchar* dst, size_t step, Size size, void* _cmpop)
4668 {
4669 #if ARITHM_USE_IPP
4670 CV_IPP_CHECK()
4671 {
4672 IppCmpOp op = convert_cmp(*(int *)_cmpop);
4673 if( op >= 0 )
4674 {
4675 fixSteps(size, sizeof(dst[0]), step1, step2, step);
4676 if (0 <= ippiCompare_16u_C1R(src1, (int)step1, src2, (int)step2, dst, (int)step, ippiSize(size), op))
4677 {
4678 CV_IMPL_ADD(CV_IMPL_IPP);
4679 return;
4680 }
4681 setIppErrorStatus();
4682 }
4683 }
4684 #endif
4685 cmp_(src1, step1, src2, step2, dst, step, size, *(int*)_cmpop);
4686 }
4687
4688 static void cmp16s(const short* src1, size_t step1, const short* src2, size_t step2,
4689 uchar* dst, size_t step, Size size, void* _cmpop)
4690 {
4691 #if ARITHM_USE_IPP
4692 CV_IPP_CHECK()
4693 {
4694 IppCmpOp op = convert_cmp(*(int *)_cmpop);
4695 if( op > 0 )
4696 {
4697 fixSteps(size, sizeof(dst[0]), step1, step2, step);
4698 if (0 <= ippiCompare_16s_C1R(src1, (int)step1, src2, (int)step2, dst, (int)step, ippiSize(size), op))
4699 {
4700 CV_IMPL_ADD(CV_IMPL_IPP);
4701 return;
4702 }
4703 setIppErrorStatus();
4704 }
4705 }
4706 #endif
4707 //vz optimized cmp_(src1, step1, src2, step2, dst, step, size, *(int*)_cmpop);
4708
4709 int code = *(int*)_cmpop;
4710 step1 /= sizeof(src1[0]);
4711 step2 /= sizeof(src2[0]);
4712 if( code == CMP_GE || code == CMP_LT )
4713 {
4714 std::swap(src1, src2);
4715 std::swap(step1, step2);
4716 code = code == CMP_GE ? CMP_LE : CMP_GT;
4717 }
4718
4719 if( code == CMP_GT || code == CMP_LE )
4720 {
4721 int m = code == CMP_GT ? 0 : 255;
4722 for( ; size.height--; src1 += step1, src2 += step2, dst += step )
4723 {
4724 int x = 0;
4725 #if CV_SSE2
4726 if( USE_SSE2 )
4727 {
4728 __m128i m128 = code == CMP_GT ? _mm_setzero_si128() : _mm_set1_epi16 (-1);
4729 for( ; x <= size.width - 16; x += 16 )
4730 {
4731 __m128i r00 = _mm_loadu_si128((const __m128i*)(src1 + x));
4732 __m128i r10 = _mm_loadu_si128((const __m128i*)(src2 + x));
4733 r00 = _mm_xor_si128 ( _mm_cmpgt_epi16 (r00, r10), m128);
4734 __m128i r01 = _mm_loadu_si128((const __m128i*)(src1 + x + 8));
4735 __m128i r11 = _mm_loadu_si128((const __m128i*)(src2 + x + 8));
4736 r01 = _mm_xor_si128 ( _mm_cmpgt_epi16 (r01, r11), m128);
4737 r11 = _mm_packs_epi16(r00, r01);
4738 _mm_storeu_si128((__m128i*)(dst + x), r11);
4739 }
4740 if( x <= size.width-8)
4741 {
4742 __m128i r00 = _mm_loadu_si128((const __m128i*)(src1 + x));
4743 __m128i r10 = _mm_loadu_si128((const __m128i*)(src2 + x));
4744 r00 = _mm_xor_si128 ( _mm_cmpgt_epi16 (r00, r10), m128);
4745 r10 = _mm_packs_epi16(r00, r00);
4746 _mm_storel_epi64((__m128i*)(dst + x), r10);
4747
4748 x += 8;
4749 }
4750 }
4751 #elif CV_NEON
4752 uint8x16_t mask = code == CMP_GT ? vdupq_n_u8(0) : vdupq_n_u8(255);
4753
4754 for( ; x <= size.width - 16; x += 16 )
4755 {
4756 int16x8_t in1 = vld1q_s16(src1 + x);
4757 int16x8_t in2 = vld1q_s16(src2 + x);
4758 uint8x8_t t1 = vmovn_u16(vcgtq_s16(in1, in2));
4759
4760 in1 = vld1q_s16(src1 + x + 8);
4761 in2 = vld1q_s16(src2 + x + 8);
4762 uint8x8_t t2 = vmovn_u16(vcgtq_s16(in1, in2));
4763
4764 vst1q_u8(dst+x, veorq_u8(vcombine_u8(t1, t2), mask));
4765 }
4766 #endif
4767
4768 for( ; x < size.width; x++ ){
4769 dst[x] = (uchar)(-(src1[x] > src2[x]) ^ m);
4770 }
4771 }
4772 }
4773 else if( code == CMP_EQ || code == CMP_NE )
4774 {
4775 int m = code == CMP_EQ ? 0 : 255;
4776 for( ; size.height--; src1 += step1, src2 += step2, dst += step )
4777 {
4778 int x = 0;
4779 #if CV_SSE2
4780 if( USE_SSE2 )
4781 {
4782 __m128i m128 = code == CMP_EQ ? _mm_setzero_si128() : _mm_set1_epi16 (-1);
4783 for( ; x <= size.width - 16; x += 16 )
4784 {
4785 __m128i r00 = _mm_loadu_si128((const __m128i*)(src1 + x));
4786 __m128i r10 = _mm_loadu_si128((const __m128i*)(src2 + x));
4787 r00 = _mm_xor_si128 ( _mm_cmpeq_epi16 (r00, r10), m128);
4788 __m128i r01 = _mm_loadu_si128((const __m128i*)(src1 + x + 8));
4789 __m128i r11 = _mm_loadu_si128((const __m128i*)(src2 + x + 8));
4790 r01 = _mm_xor_si128 ( _mm_cmpeq_epi16 (r01, r11), m128);
4791 r11 = _mm_packs_epi16(r00, r01);
4792 _mm_storeu_si128((__m128i*)(dst + x), r11);
4793 }
4794 if( x <= size.width - 8)
4795 {
4796 __m128i r00 = _mm_loadu_si128((const __m128i*)(src1 + x));
4797 __m128i r10 = _mm_loadu_si128((const __m128i*)(src2 + x));
4798 r00 = _mm_xor_si128 ( _mm_cmpeq_epi16 (r00, r10), m128);
4799 r10 = _mm_packs_epi16(r00, r00);
4800 _mm_storel_epi64((__m128i*)(dst + x), r10);
4801
4802 x += 8;
4803 }
4804 }
4805 #elif CV_NEON
4806 uint8x16_t mask = code == CMP_EQ ? vdupq_n_u8(0) : vdupq_n_u8(255);
4807
4808 for( ; x <= size.width - 16; x += 16 )
4809 {
4810 int16x8_t in1 = vld1q_s16(src1 + x);
4811 int16x8_t in2 = vld1q_s16(src2 + x);
4812 uint8x8_t t1 = vmovn_u16(vceqq_s16(in1, in2));
4813
4814 in1 = vld1q_s16(src1 + x + 8);
4815 in2 = vld1q_s16(src2 + x + 8);
4816 uint8x8_t t2 = vmovn_u16(vceqq_s16(in1, in2));
4817
4818 vst1q_u8(dst+x, veorq_u8(vcombine_u8(t1, t2), mask));
4819 }
4820 #endif
4821 for( ; x < size.width; x++ )
4822 dst[x] = (uchar)(-(src1[x] == src2[x]) ^ m);
4823 }
4824 }
4825 }
4826
4827 static void cmp32s(const int* src1, size_t step1, const int* src2, size_t step2,
4828 uchar* dst, size_t step, Size size, void* _cmpop)
4829 {
4830 cmp_(src1, step1, src2, step2, dst, step, size, *(int*)_cmpop);
4831 }
4832
4833 static void cmp32f(const float* src1, size_t step1, const float* src2, size_t step2,
4834 uchar* dst, size_t step, Size size, void* _cmpop)
4835 {
4836 #if ARITHM_USE_IPP
4837 CV_IPP_CHECK()
4838 {
4839 IppCmpOp op = convert_cmp(*(int *)_cmpop);
4840 if( op >= 0 )
4841 {
4842 fixSteps(size, sizeof(dst[0]), step1, step2, step);
4843 if (0 <= ippiCompare_32f_C1R(src1, (int)step1, src2, (int)step2, dst, (int)step, ippiSize(size), op))
4844 {
4845 CV_IMPL_ADD(CV_IMPL_IPP);
4846 return;
4847 }
4848 setIppErrorStatus();
4849 }
4850 }
4851 #endif
4852 cmp_(src1, step1, src2, step2, dst, step, size, *(int*)_cmpop);
4853 }
4854
4855 static void cmp64f(const double* src1, size_t step1, const double* src2, size_t step2,
4856 uchar* dst, size_t step, Size size, void* _cmpop)
4857 {
4858 cmp_(src1, step1, src2, step2, dst, step, size, *(int*)_cmpop);
4859 }
4860
4861 static BinaryFunc getCmpFunc(int depth)
4862 {
4863 static BinaryFunc cmpTab[] =
4864 {
4865 (BinaryFunc)GET_OPTIMIZED(cmp8u), (BinaryFunc)GET_OPTIMIZED(cmp8s),
4866 (BinaryFunc)GET_OPTIMIZED(cmp16u), (BinaryFunc)GET_OPTIMIZED(cmp16s),
4867 (BinaryFunc)GET_OPTIMIZED(cmp32s),
4868 (BinaryFunc)GET_OPTIMIZED(cmp32f), (BinaryFunc)cmp64f,
4869 0
4870 };
4871
4872 return cmpTab[depth];
4873 }
4874
4875 static double getMinVal(int depth)
4876 {
4877 static const double tab[] = {0, -128, 0, -32768, INT_MIN, -FLT_MAX, -DBL_MAX, 0};
4878 return tab[depth];
4879 }
4880
4881 static double getMaxVal(int depth)
4882 {
4883 static const double tab[] = {255, 127, 65535, 32767, INT_MAX, FLT_MAX, DBL_MAX, 0};
4884 return tab[depth];
4885 }
4886
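// OpenCL implementation of cv::compare. When the second operand is a scalar, it is
// rounded and range-checked on the host first, so out-of-range scalars reduce to a
// simple setTo() on the destination instead of running the kernel.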
4887 #ifdef HAVE_OPENCL
4888
4889 static bool ocl_compare(InputArray _src1, InputArray _src2, OutputArray _dst, int op, bool haveScalar)
4890 {
4891 const ocl::Device& dev = ocl::Device::getDefault();
4892 bool doubleSupport = dev.doubleFPConfig() > 0;
4893 int type1 = _src1.type(), depth1 = CV_MAT_DEPTH(type1), cn = CV_MAT_CN(type1),
4894 type2 = _src2.type(), depth2 = CV_MAT_DEPTH(type2);
4895
4896 if (!doubleSupport && depth1 == CV_64F)
4897 return false;
4898
4899 if (!haveScalar && (!_src1.sameSize(_src2) || type1 != type2))
4900 return false;
4901
4902 int kercn = haveScalar ? cn : ocl::predictOptimalVectorWidth(_src1, _src2, _dst), rowsPerWI = dev.isIntel() ? 4 : 1;
4903 // Workaround for bug with "?:" operator in AMD OpenCL compiler
4904 if (depth1 >= CV_16U)
4905 kercn = 1;
4906
4907 int scalarcn = kercn == 3 ? 4 : kercn;
4908 const char * const operationMap[] = { "==", ">", ">=", "<", "<=", "!=" };
4909 char cvt[40];
4910
4911 String opts = format("-D %s -D srcT1=%s -D dstT=%s -D workT=srcT1 -D cn=%d"
4912 " -D convertToDT=%s -D OP_CMP -D CMP_OPERATOR=%s -D srcT1_C1=%s"
4913 " -D srcT2_C1=%s -D dstT_C1=%s -D workST=%s -D rowsPerWI=%d%s",
4914 haveScalar ? "UNARY_OP" : "BINARY_OP",
4915 ocl::typeToStr(CV_MAKE_TYPE(depth1, kercn)),
4916 ocl::typeToStr(CV_8UC(kercn)), kercn,
4917 ocl::convertTypeStr(depth1, CV_8U, kercn, cvt),
4918 operationMap[op], ocl::typeToStr(depth1),
4919 ocl::typeToStr(depth1), ocl::typeToStr(CV_8U),
4920 ocl::typeToStr(CV_MAKE_TYPE(depth1, scalarcn)), rowsPerWI,
4921 doubleSupport ? " -D DOUBLE_SUPPORT" : "");
4922
4923 ocl::Kernel k("KF", ocl::core::arithm_oclsrc, opts);
4924 if (k.empty())
4925 return false;
4926
4927 UMat src1 = _src1.getUMat();
4928 Size size = src1.size();
4929 _dst.create(size, CV_8UC(cn));
4930 UMat dst = _dst.getUMat();
4931
4932 if (haveScalar)
4933 {
4934 size_t esz = CV_ELEM_SIZE1(type1) * scalarcn;
4935 double buf[4] = { 0, 0, 0, 0 };
4936 Mat src2 = _src2.getMat();
4937
4938 if( depth1 > CV_32S )
4939 convertAndUnrollScalar( src2, depth1, (uchar *)buf, kercn );
4940 else
4941 {
4942 double fval = 0;
4943 getConvertFunc(depth2, CV_64F)(src2.ptr(), 1, 0, 1, (uchar *)&fval, 1, Size(1, 1), 0);
4944 if( fval < getMinVal(depth1) )
4945 return dst.setTo(Scalar::all(op == CMP_GT || op == CMP_GE || op == CMP_NE ? 255 : 0)), true;
4946
4947 if( fval > getMaxVal(depth1) )
4948 return dst.setTo(Scalar::all(op == CMP_LT || op == CMP_LE || op == CMP_NE ? 255 : 0)), true;
4949
4950 int ival = cvRound(fval);
4951 if( fval != ival )
4952 {
4953 if( op == CMP_LT || op == CMP_GE )
4954 ival = cvCeil(fval);
4955 else if( op == CMP_LE || op == CMP_GT )
4956 ival = cvFloor(fval);
4957 else
4958 return dst.setTo(Scalar::all(op == CMP_NE ? 255 : 0)), true;
4959 }
4960 convertAndUnrollScalar(Mat(1, 1, CV_32S, &ival), depth1, (uchar *)buf, kercn);
4961 }
4962
4963 ocl::KernelArg scalararg = ocl::KernelArg(0, 0, 0, 0, buf, esz);
4964
4965 k.args(ocl::KernelArg::ReadOnlyNoSize(src1, cn, kercn),
4966 ocl::KernelArg::WriteOnly(dst, cn, kercn), scalararg);
4967 }
4968 else
4969 {
4970 UMat src2 = _src2.getUMat();
4971
4972 k.args(ocl::KernelArg::ReadOnlyNoSize(src1),
4973 ocl::KernelArg::ReadOnlyNoSize(src2),
4974 ocl::KernelArg::WriteOnly(dst, cn, kercn));
4975 }
4976
4977 size_t globalsize[2] = { dst.cols * cn / kercn, (dst.rows + rowsPerWI - 1) / rowsPerWI };
4978 return k.run(2, globalsize, NULL, false);
4979 }
4980
4981 #endif
4982
4983 }
4984
4985 void cv::compare(InputArray _src1, InputArray _src2, OutputArray _dst, int op)
4986 {
4987 CV_Assert( op == CMP_LT || op == CMP_LE || op == CMP_EQ ||
4988 op == CMP_NE || op == CMP_GE || op == CMP_GT );
4989
4990 bool haveScalar = false;
4991
4992 if ((_src1.isMatx() + _src2.isMatx()) == 1
4993 || !_src1.sameSize(_src2)
4994 || _src1.type() != _src2.type())
4995 {
4996 if (checkScalar(_src1, _src2.type(), _src1.kind(), _src2.kind()))
4997 {
4998 op = op == CMP_LT ? CMP_GT : op == CMP_LE ? CMP_GE :
4999 op == CMP_GE ? CMP_LE : op == CMP_GT ? CMP_LT : op;
5000 // src1 is a scalar; swap it with src2
5001 compare(_src2, _src1, _dst, op);
5002 return;
5003 }
5004 else if( !checkScalar(_src2, _src1.type(), _src2.kind(), _src1.kind()) )
5005 CV_Error( CV_StsUnmatchedSizes,
5006 "The operation is neither 'array op array' (where arrays have the same size and the same type), "
5007 "nor 'array op scalar', nor 'scalar op array'" );
5008 haveScalar = true;
5009 }
5010
5011 CV_OCL_RUN(_src1.dims() <= 2 && _src2.dims() <= 2 && OCL_PERFORMANCE_CHECK(_dst.isUMat()),
5012 ocl_compare(_src1, _src2, _dst, op, haveScalar))
5013
5014 int kind1 = _src1.kind(), kind2 = _src2.kind();
5015 Mat src1 = _src1.getMat(), src2 = _src2.getMat();
5016
5017 if( kind1 == kind2 && src1.dims <= 2 && src2.dims <= 2 && src1.size() == src2.size() && src1.type() == src2.type() )
5018 {
5019 int cn = src1.channels();
5020 _dst.create(src1.size(), CV_8UC(cn));
5021 Mat dst = _dst.getMat();
5022 Size sz = getContinuousSize(src1, src2, dst, src1.channels());
5023 getCmpFunc(src1.depth())(src1.ptr(), src1.step, src2.ptr(), src2.step, dst.ptr(), dst.step, sz, &op);
5024 return;
5025 }
5026
5027 int cn = src1.channels(), depth1 = src1.depth(), depth2 = src2.depth();
5028
5029 _dst.create(src1.dims, src1.size, CV_8UC(cn));
5030 src1 = src1.reshape(1); src2 = src2.reshape(1);
5031 Mat dst = _dst.getMat().reshape(1);
5032
5033 size_t esz = src1.elemSize();
5034 size_t blocksize0 = (size_t)(BLOCK_SIZE + esz-1)/esz;
5035 BinaryFunc func = getCmpFunc(depth1);
5036
5037 if( !haveScalar )
5038 {
5039 const Mat* arrays[] = { &src1, &src2, &dst, 0 };
5040 uchar* ptrs[3];
5041
5042 NAryMatIterator it(arrays, ptrs);
5043 size_t total = it.size;
5044
5045 for( size_t i = 0; i < it.nplanes; i++, ++it )
5046 func( ptrs[0], 0, ptrs[1], 0, ptrs[2], 0, Size((int)total, 1), &op );
5047 }
5048 else
5049 {
5050 const Mat* arrays[] = { &src1, &dst, 0 };
5051 uchar* ptrs[2];
5052
5053 NAryMatIterator it(arrays, ptrs);
5054 size_t total = it.size, blocksize = std::min(total, blocksize0);
5055
5056 AutoBuffer<uchar> _buf(blocksize*esz);
5057 uchar *buf = _buf;
5058
5059 if( depth1 > CV_32S )
5060 convertAndUnrollScalar( src2, depth1, buf, blocksize );
5061 else
5062 {
5063 double fval=0;
5064 getConvertFunc(depth2, CV_64F)(src2.ptr(), 1, 0, 1, (uchar*)&fval, 1, Size(1,1), 0);
5065 if( fval < getMinVal(depth1) )
5066 {
5067 dst = Scalar::all(op == CMP_GT || op == CMP_GE || op == CMP_NE ? 255 : 0);
5068 return;
5069 }
5070
5071 if( fval > getMaxVal(depth1) )
5072 {
5073 dst = Scalar::all(op == CMP_LT || op == CMP_LE || op == CMP_NE ? 255 : 0);
5074 return;
5075 }
5076
5077 int ival = cvRound(fval);
5078 if( fval != ival )
5079 {
5080 if( op == CMP_LT || op == CMP_GE )
5081 ival = cvCeil(fval);
5082 else if( op == CMP_LE || op == CMP_GT )
5083 ival = cvFloor(fval);
5084 else
5085 {
5086 dst = Scalar::all(op == CMP_NE ? 255 : 0);
5087 return;
5088 }
5089 }
5090 convertAndUnrollScalar(Mat(1, 1, CV_32S, &ival), depth1, buf, blocksize);
5091 }
5092
5093 for( size_t i = 0; i < it.nplanes; i++, ++it )
5094 {
5095 for( size_t j = 0; j < total; j += blocksize )
5096 {
5097 int bsz = (int)MIN(total - j, blocksize);
5098 func( ptrs[0], 0, buf, 0, ptrs[1], 0, Size(bsz, 1), &op);
5099 ptrs[0] += bsz*esz;
5100 ptrs[1] += bsz;
5101 }
5102 }
5103 }
5104 }
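// Usage sketch, guarded out of the build (matrix names are hypothetical). The
// result is always an 8-bit mask: 255 where the predicate holds, 0 elsewhere.
// When the second operand is a non-integral scalar and the source has an
// integer depth, the code above rounds it to an equivalent integer threshold
// (cvCeil for CMP_LT/CMP_GE, cvFloor for CMP_LE/CMP_GT), since e.g. x < 3.5 and
// x < 4 select exactly the same integers.
#if 0
static void compareExample()
{
    cv::Mat a = (cv::Mat_<uchar>(1, 4) << 1, 2, 3, 4);
    cv::Mat mask;

    cv::compare(a, 2, mask, cv::CMP_GT);    // mask = [0, 0, 255, 255]
    cv::compare(a, 2.5, mask, cv::CMP_LT);  // same as comparing with 3: [255, 255, 0, 0]
    cv::compare(a, 2.5, mask, cv::CMP_EQ);  // integers are never equal to 2.5: all zeros
}
#endif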
5105
5106 /****************************************************************************************\
5107 * inRange *
5108 \****************************************************************************************/
5109
5110 namespace cv
5111 {
5112
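// Generic (non-vectorized) fallback: it reports zero processed elements, so the
// scalar loop in inRange_ below handles the entire row.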
5113 template <typename T>
5114 struct InRange_SIMD
5115 {
5116 int operator () (const T *, const T *, const T *, uchar *, int) const
5117 {
5118 return 0;
5119 }
5120 };
5121
5122 #if CV_SSE2
5123
5124 template <>
5125 struct InRange_SIMD<uchar>
5126 {
5127 int operator () (const uchar * src1, const uchar * src2, const uchar * src3,
5128 uchar * dst, int len) const
5129 {
5130 int x = 0;
5131
5132 if (USE_SSE2)
5133 {
5134 __m128i v_full = _mm_set1_epi8(-1), v_128 = _mm_set1_epi8(-128);
5135
5136 for ( ; x <= len - 16; x += 16 )
5137 {
5138 __m128i v_src = _mm_add_epi8(_mm_loadu_si128((const __m128i *)(src1 + x)), v_128);
5139 __m128i v_mask1 = _mm_cmpgt_epi8(_mm_add_epi8(_mm_loadu_si128((const __m128i *)(src2 + x)), v_128), v_src);
5140 __m128i v_mask2 = _mm_cmpgt_epi8(v_src, _mm_add_epi8(_mm_loadu_si128((const __m128i *)(src3 + x)), v_128));
5141 _mm_storeu_si128((__m128i *)(dst + x), _mm_andnot_si128(_mm_or_si128(v_mask1, v_mask2), v_full));
5142 }
5143 }
5144
5145 return x;
5146 }
5147 };
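// SSE2 has no unsigned 8-bit comparison, so the specialization above biases both
// operands by 0x80 (adding -128, which for bytes equals xor with 0x80) to map
// unsigned order onto signed order, then uses _mm_cmpgt_epi8. A scalar sketch of
// the same idea, guarded out of the build (helper name hypothetical):
#if 0
static inline uchar inRangeU8Biased(uchar v, uchar lo, uchar hi)
{
    schar vs  = (schar)(v  ^ 0x80);
    schar los = (schar)(lo ^ 0x80);
    schar his = (schar)(hi ^ 0x80);
    // 255 when lo <= v <= hi, 0 otherwise -- same as ~((lo > v) | (v > hi)) per lane
    return (uchar)-(int)(!(los > vs) && !(vs > his));
}
#endif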
5148
5149 template <>
5150 struct InRange_SIMD<schar>
5151 {
5152 int operator () (const schar * src1, const schar * src2, const schar * src3,
5153 uchar * dst, int len) const
5154 {
5155 int x = 0;
5156
5157 if (USE_SSE2)
5158 {
5159 __m128i v_full = _mm_set1_epi8(-1);
5160
5161 for ( ; x <= len - 16; x += 16 )
5162 {
5163 __m128i v_src = _mm_loadu_si128((const __m128i *)(src1 + x));
5164 __m128i v_mask1 = _mm_cmpgt_epi8(_mm_loadu_si128((const __m128i *)(src2 + x)), v_src);
5165 __m128i v_mask2 = _mm_cmpgt_epi8(v_src, _mm_loadu_si128((const __m128i *)(src3 + x)));
5166 _mm_storeu_si128((__m128i *)(dst + x), _mm_andnot_si128(_mm_or_si128(v_mask1, v_mask2), v_full));
5167 }
5168 }
5169
5170 return x;
5171 }
5172 };
5173
5174 template <>
5175 struct InRange_SIMD<ushort>
5176 {
5177 int operator () (const ushort * src1, const ushort * src2, const ushort * src3,
5178 uchar * dst, int len) const
5179 {
5180 int x = 0;
5181
5182 if (USE_SSE2)
5183 {
5184 __m128i v_zero = _mm_setzero_si128(), v_full = _mm_set1_epi16(-1), v_32768 = _mm_set1_epi16(-32768);
5185
5186 for ( ; x <= len - 8; x += 8 )
5187 {
5188 __m128i v_src = _mm_add_epi16(_mm_loadu_si128((const __m128i *)(src1 + x)), v_32768);
5189 __m128i v_mask1 = _mm_cmpgt_epi16(_mm_add_epi16(_mm_loadu_si128((const __m128i *)(src2 + x)), v_32768), v_src);
5190 __m128i v_mask2 = _mm_cmpgt_epi16(v_src, _mm_add_epi16(_mm_loadu_si128((const __m128i *)(src3 + x)), v_32768));
5191 __m128i v_res = _mm_andnot_si128(_mm_or_si128(v_mask1, v_mask2), v_full);
5192 _mm_storel_epi64((__m128i *)(dst + x), _mm_packus_epi16(_mm_srli_epi16(v_res, 8), v_zero));
5193 }
5194 }
5195
5196 return x;
5197 }
5198 };
5199
5200 template <>
5201 struct InRange_SIMD<short>
5202 {
5203 int operator () (const short * src1, const short * src2, const short * src3,
5204 uchar * dst, int len) const
5205 {
5206 int x = 0;
5207
5208 if (USE_SSE2)
5209 {
5210 __m128i v_zero = _mm_setzero_si128(), v_full = _mm_set1_epi16(-1);
5211
5212 for ( ; x <= len - 8; x += 8 )
5213 {
5214 __m128i v_src = _mm_loadu_si128((const __m128i *)(src1 + x));
5215 __m128i v_mask1 = _mm_cmpgt_epi16(_mm_loadu_si128((const __m128i *)(src2 + x)), v_src);
5216 __m128i v_mask2 = _mm_cmpgt_epi16(v_src, _mm_loadu_si128((const __m128i *)(src3 + x)));
5217 __m128i v_res = _mm_andnot_si128(_mm_or_si128(v_mask1, v_mask2), v_full);
5218 _mm_storel_epi64((__m128i *)(dst + x), _mm_packus_epi16(_mm_srli_epi16(v_res, 8), v_zero));
5219 }
5220 }
5221
5222 return x;
5223 }
5224 };
5225
5226 template <>
5227 struct InRange_SIMD<int>
5228 {
5229 int operator () (const int * src1, const int * src2, const int * src3,
5230 uchar * dst, int len) const
5231 {
5232 int x = 0;
5233
5234 if (USE_SSE2)
5235 {
5236 __m128i v_zero = _mm_setzero_si128(), v_full = _mm_set1_epi32(-1);
5237
5238 for ( ; x <= len - 8; x += 8 )
5239 {
5240 __m128i v_src = _mm_loadu_si128((const __m128i *)(src1 + x));
5241 __m128i v_res1 = _mm_or_si128(_mm_cmpgt_epi32(_mm_loadu_si128((const __m128i *)(src2 + x)), v_src),
5242 _mm_cmpgt_epi32(v_src, _mm_loadu_si128((const __m128i *)(src3 + x))));
5243
5244 v_src = _mm_loadu_si128((const __m128i *)(src1 + x + 4));
5245 __m128i v_res2 = _mm_or_si128(_mm_cmpgt_epi32(_mm_loadu_si128((const __m128i *)(src2 + x + 4)), v_src),
5246 _mm_cmpgt_epi32(v_src, _mm_loadu_si128((const __m128i *)(src3 + x + 4))));
5247
5248 __m128i v_res = _mm_packs_epi32(_mm_srli_epi32(_mm_andnot_si128(v_res1, v_full), 16),
5249 _mm_srli_epi32(_mm_andnot_si128(v_res2, v_full), 16));
5250 _mm_storel_epi64((__m128i *)(dst + x), _mm_packus_epi16(v_res, v_zero));
5251 }
5252 }
5253
5254 return x;
5255 }
5256 };
5257
5258 template <>
5259 struct InRange_SIMD<float>
5260 {
5261 int operator () (const float * src1, const float * src2, const float * src3,
5262 uchar * dst, int len) const
5263 {
5264 int x = 0;
5265
5266 if (USE_SSE2)
5267 {
5268 __m128i v_zero = _mm_setzero_si128();
5269
5270 for ( ; x <= len - 8; x += 8 )
5271 {
5272 __m128 v_src = _mm_loadu_ps(src1 + x);
5273 __m128 v_res1 = _mm_and_ps(_mm_cmple_ps(_mm_loadu_ps(src2 + x), v_src),
5274 _mm_cmple_ps(v_src, _mm_loadu_ps(src3 + x)));
5275
5276 v_src = _mm_loadu_ps(src1 + x + 4);
5277 __m128 v_res2 = _mm_and_ps(_mm_cmple_ps(_mm_loadu_ps(src2 + x + 4), v_src),
5278 _mm_cmple_ps(v_src, _mm_loadu_ps(src3 + x + 4)));
5279
5280 __m128i v_res1i = _mm_cvtps_epi32(v_res1), v_res2i = _mm_cvtps_epi32(v_res2);
5281 __m128i v_res = _mm_packs_epi32(_mm_srli_epi32(v_res1i, 16), _mm_srli_epi32(v_res2i, 16));
5282 _mm_storel_epi64((__m128i *)(dst + x), _mm_packus_epi16(v_res, v_zero));
5283 }
5284 }
5285
5286 return x;
5287 }
5288 };
5289
5290 #elif CV_NEON
5291
5292 template <>
5293 struct InRange_SIMD<uchar>
5294 {
5295 int operator () (const uchar * src1, const uchar * src2, const uchar * src3,
5296 uchar * dst, int len) const
5297 {
5298 int x = 0;
5299
5300 for ( ; x <= len - 16; x += 16 )
5301 {
5302 uint8x16_t values = vld1q_u8(src1 + x);
5303 uint8x16_t low = vld1q_u8(src2 + x);
5304 uint8x16_t high = vld1q_u8(src3 + x);
5305
5306 vst1q_u8(dst + x, vandq_u8(vcgeq_u8(values, low), vcgeq_u8(high, values)));
5307 }
5308 return x;
5309 }
5310 };
5311
5312 template <>
5313 struct InRange_SIMD<schar>
5314 {
5315 int operator () (const schar * src1, const schar * src2, const schar * src3,
5316 uchar * dst, int len) const
5317 {
5318 int x = 0;
5319
5320 for ( ; x <= len - 16; x += 16 )
5321 {
5322 int8x16_t values = vld1q_s8(src1 + x);
5323 int8x16_t low = vld1q_s8(src2 + x);
5324 int8x16_t high = vld1q_s8(src3 + x);
5325
5326 vst1q_u8(dst + x, vandq_u8(vcgeq_s8(values, low), vcgeq_s8(high, values)));
5327 }
5328 return x;
5329 }
5330 };
5331
5332 template <>
5333 struct InRange_SIMD<ushort>
5334 {
5335 int operator () (const ushort * src1, const ushort * src2, const ushort * src3,
5336 uchar * dst, int len) const
5337 {
5338 int x = 0;
5339
5340 for ( ; x <= len - 16; x += 16 )
5341 {
5342 uint16x8_t values = vld1q_u16((const uint16_t*)(src1 + x));
5343 uint16x8_t low = vld1q_u16((const uint16_t*)(src2 + x));
5344 uint16x8_t high = vld1q_u16((const uint16_t*)(src3 + x));
5345 uint8x8_t r1 = vmovn_u16(vandq_u16(vcgeq_u16(values, low), vcgeq_u16(high, values)));
5346
5347 values = vld1q_u16((const uint16_t*)(src1 + x + 8));
5348 low = vld1q_u16((const uint16_t*)(src2 + x + 8));
5349 high = vld1q_u16((const uint16_t*)(src3 + x + 8));
5350 uint8x8_t r2 = vmovn_u16(vandq_u16(vcgeq_u16(values, low), vcgeq_u16(high, values)));
5351
5352 vst1q_u8(dst + x, vcombine_u8(r1, r2));
5353 }
5354 return x;
5355 }
5356 };
5357
5358 template <>
5359 struct InRange_SIMD<short>
5360 {
5361 int operator () (const short * src1, const short * src2, const short * src3,
5362 uchar * dst, int len) const
5363 {
5364 int x = 0;
5365
5366 for ( ; x <= len - 16; x += 16 )
5367 {
5368 int16x8_t values = vld1q_s16((const int16_t*)(src1 + x));
5369 int16x8_t low = vld1q_s16((const int16_t*)(src2 + x));
5370 int16x8_t high = vld1q_s16((const int16_t*)(src3 + x));
5371 uint8x8_t r1 = vmovn_u16(vandq_u16(vcgeq_s16(values, low), vcgeq_s16(high, values)));
5372
5373 values = vld1q_s16((const int16_t*)(src1 + x + 8));
5374 low = vld1q_s16((const int16_t*)(src2 + x + 8));
5375 high = vld1q_s16((const int16_t*)(src3 + x + 8));
5376 uint8x8_t r2 = vmovn_u16(vandq_u16(vcgeq_s16(values, low), vcgeq_s16(high, values)));
5377
5378 vst1q_u8(dst + x, vcombine_u8(r1, r2));
5379 }
5380 return x;
5381 }
5382 };
5383
5384 template <>
5385 struct InRange_SIMD<int>
5386 {
5387 int operator () (const int * src1, const int * src2, const int * src3,
5388 uchar * dst, int len) const
5389 {
5390 int x = 0;
5391
5392 for ( ; x <= len - 8; x += 8 )
5393 {
5394 int32x4_t values = vld1q_s32((const int32_t*)(src1 + x));
5395 int32x4_t low = vld1q_s32((const int32_t*)(src2 + x));
5396 int32x4_t high = vld1q_s32((const int32_t*)(src3 + x));
5397
5398 uint16x4_t r1 = vmovn_u32(vandq_u32(vcgeq_s32(values, low), vcgeq_s32(high, values)));
5399
5400 values = vld1q_s32((const int32_t*)(src1 + x + 4));
5401 low = vld1q_s32((const int32_t*)(src2 + x + 4));
5402 high = vld1q_s32((const int32_t*)(src3 + x + 4));
5403
5404 uint16x4_t r2 = vmovn_u32(vandq_u32(vcgeq_s32(values, low), vcgeq_s32(high, values)));
5405
5406 uint16x8_t res_16 = vcombine_u16(r1, r2);
5407
5408 vst1_u8(dst + x, vmovn_u16(res_16));
5409 }
5410 return x;
5411 }
5412 };
5413
5414 template <>
5415 struct InRange_SIMD<float>
5416 {
5417 int operator () (const float * src1, const float * src2, const float * src3,
5418 uchar * dst, int len) const
5419 {
5420 int x = 0;
5421
5422 for ( ; x <= len - 8; x += 8 )
5423 {
5424 float32x4_t values = vld1q_f32((const float32_t*)(src1 + x));
5425 float32x4_t low = vld1q_f32((const float32_t*)(src2 + x));
5426 float32x4_t high = vld1q_f32((const float32_t*)(src3 + x));
5427
5428 uint16x4_t r1 = vmovn_u32(vandq_u32(vcgeq_f32(values, low), vcgeq_f32(high, values)));
5429
5430 values = vld1q_f32((const float32_t*)(src1 + x + 4));
5431 low = vld1q_f32((const float32_t*)(src2 + x + 4));
5432 high = vld1q_f32((const float32_t*)(src3 + x + 4));
5433
5434 uint16x4_t r2 = vmovn_u32(vandq_u32(vcgeq_f32(values, low), vcgeq_f32(high, values)));
5435
5436 uint16x8_t res_16 = vcombine_u16(r1, r2);
5437
5438 vst1_u8(dst + x, vmovn_u16(res_16));
5439 }
5440 return x;
5441 }
5442 };
5443
5444 #endif
5445
5446 template <typename T>
5447 static void inRange_(const T* src1, size_t step1, const T* src2, size_t step2,
5448 const T* src3, size_t step3, uchar* dst, size_t step,
5449 Size size)
5450 {
5451 step1 /= sizeof(src1[0]);
5452 step2 /= sizeof(src2[0]);
5453 step3 /= sizeof(src3[0]);
5454
5455 InRange_SIMD<T> vop;
5456
5457 for( ; size.height--; src1 += step1, src2 += step2, src3 += step3, dst += step )
5458 {
5459 int x = vop(src1, src2, src3, dst, size.width);
5460 #if CV_ENABLE_UNROLLED
5461 for( ; x <= size.width - 4; x += 4 )
5462 {
5463 int t0, t1;
5464 t0 = src2[x] <= src1[x] && src1[x] <= src3[x];
5465 t1 = src2[x+1] <= src1[x+1] && src1[x+1] <= src3[x+1];
5466 dst[x] = (uchar)-t0; dst[x+1] = (uchar)-t1;
5467 t0 = src2[x+2] <= src1[x+2] && src1[x+2] <= src3[x+2];
5468 t1 = src2[x+3] <= src1[x+3] && src1[x+3] <= src3[x+3];
5469 dst[x+2] = (uchar)-t0; dst[x+3] = (uchar)-t1;
5470 }
5471 #endif
5472 for( ; x < size.width; x++ )
5473 dst[x] = (uchar)-(src2[x] <= src1[x] && src1[x] <= src3[x]);
5474 }
5475 }
5476
5477
5478 static void inRange8u(const uchar* src1, size_t step1, const uchar* src2, size_t step2,
5479 const uchar* src3, size_t step3, uchar* dst, size_t step, Size size)
5480 {
5481 inRange_(src1, step1, src2, step2, src3, step3, dst, step, size);
5482 }
5483
5484 static void inRange8s(const schar* src1, size_t step1, const schar* src2, size_t step2,
5485 const schar* src3, size_t step3, uchar* dst, size_t step, Size size)
5486 {
5487 inRange_(src1, step1, src2, step2, src3, step3, dst, step, size);
5488 }
5489
5490 static void inRange16u(const ushort* src1, size_t step1, const ushort* src2, size_t step2,
5491 const ushort* src3, size_t step3, uchar* dst, size_t step, Size size)
5492 {
5493 inRange_(src1, step1, src2, step2, src3, step3, dst, step, size);
5494 }
5495
5496 static void inRange16s(const short* src1, size_t step1, const short* src2, size_t step2,
5497 const short* src3, size_t step3, uchar* dst, size_t step, Size size)
5498 {
5499 inRange_(src1, step1, src2, step2, src3, step3, dst, step, size);
5500 }
5501
5502 static void inRange32s(const int* src1, size_t step1, const int* src2, size_t step2,
5503 const int* src3, size_t step3, uchar* dst, size_t step, Size size)
5504 {
5505 inRange_(src1, step1, src2, step2, src3, step3, dst, step, size);
5506 }
5507
5508 static void inRange32f(const float* src1, size_t step1, const float* src2, size_t step2,
5509 const float* src3, size_t step3, uchar* dst, size_t step, Size size)
5510 {
5511 inRange_(src1, step1, src2, step2, src3, step3, dst, step, size);
5512 }
5513
5514 static void inRange64f(const double* src1, size_t step1, const double* src2, size_t step2,
5515 const double* src3, size_t step3, uchar* dst, size_t step, Size size)
5516 {
5517 inRange_(src1, step1, src2, step2, src3, step3, dst, step, size);
5518 }
5519
5520 static void inRangeReduce(const uchar* src, uchar* dst, size_t len, int cn)
5521 {
5522 int k = cn % 4 ? cn % 4 : 4;
5523 size_t i, j;
5524 if( k == 1 )
5525 for( i = j = 0; i < len; i++, j += cn )
5526 dst[i] = src[j];
5527 else if( k == 2 )
5528 for( i = j = 0; i < len; i++, j += cn )
5529 dst[i] = src[j] & src[j+1];
5530 else if( k == 3 )
5531 for( i = j = 0; i < len; i++, j += cn )
5532 dst[i] = src[j] & src[j+1] & src[j+2];
5533 else
5534 for( i = j = 0; i < len; i++, j += cn )
5535 dst[i] = src[j] & src[j+1] & src[j+2] & src[j+3];
5536
5537 for( ; k < cn; k += 4 )
5538 {
5539 for( i = 0, j = k; i < len; i++, j += cn )
5540 dst[i] &= src[j] & src[j+1] & src[j+2] & src[j+3];
5541 }
5542 }
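// For multi-channel input, inRange_ first produces a per-channel byte mask and
// inRangeReduce then ANDs the cn mask bytes of each pixel into a single CV_8UC1
// value, so a pixel passes only if every channel is within its bounds.
// Worked example (hypothetical values): with cn = 3 and per-channel masks
// {255, 255, 0} for some pixel, the reduced mask byte is 255 & 255 & 0 = 0.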
5543
5544 typedef void (*InRangeFunc)( const uchar* src1, size_t step1, const uchar* src2, size_t step2,
5545 const uchar* src3, size_t step3, uchar* dst, size_t step, Size sz );
5546
5547 static InRangeFunc getInRangeFunc(int depth)
5548 {
5549 static InRangeFunc inRangeTab[] =
5550 {
5551 (InRangeFunc)GET_OPTIMIZED(inRange8u), (InRangeFunc)GET_OPTIMIZED(inRange8s), (InRangeFunc)GET_OPTIMIZED(inRange16u),
5552 (InRangeFunc)GET_OPTIMIZED(inRange16s), (InRangeFunc)GET_OPTIMIZED(inRange32s), (InRangeFunc)GET_OPTIMIZED(inRange32f),
5553 (InRangeFunc)inRange64f, 0
5554 };
5555
5556 return inRangeTab[depth];
5557 }
5558
5559 #ifdef HAVE_OPENCL
5560
5561 static bool ocl_inRange( InputArray _src, InputArray _lowerb,
5562 InputArray _upperb, OutputArray _dst )
5563 {
5564 const ocl::Device & d = ocl::Device::getDefault();
5565 int skind = _src.kind(), lkind = _lowerb.kind(), ukind = _upperb.kind();
5566 Size ssize = _src.size(), lsize = _lowerb.size(), usize = _upperb.size();
5567 int stype = _src.type(), ltype = _lowerb.type(), utype = _upperb.type();
5568 int sdepth = CV_MAT_DEPTH(stype), ldepth = CV_MAT_DEPTH(ltype), udepth = CV_MAT_DEPTH(utype);
5569 int cn = CV_MAT_CN(stype), rowsPerWI = d.isIntel() ? 4 : 1;
5570 bool lbScalar = false, ubScalar = false;
5571
5572 if( (lkind == _InputArray::MATX && skind != _InputArray::MATX) ||
5573 ssize != lsize || stype != ltype )
5574 {
5575 if( !checkScalar(_lowerb, stype, lkind, skind) )
5576 CV_Error( CV_StsUnmatchedSizes,
5577 "The lower boundary is neither an array of the same size and same type as src, nor a scalar");
5578 lbScalar = true;
5579 }
5580
5581 if( (ukind == _InputArray::MATX && skind != _InputArray::MATX) ||
5582 ssize != usize || stype != utype )
5583 {
5584 if( !checkScalar(_upperb, stype, ukind, skind) )
5585 CV_Error( CV_StsUnmatchedSizes,
5586 "The upper boundary is neither an array of the same size and same type as src, nor a scalar");
5587 ubScalar = true;
5588 }
5589
5590 if (lbScalar != ubScalar)
5591 return false;
5592
5593 bool doubleSupport = d.doubleFPConfig() > 0,
5594 haveScalar = lbScalar && ubScalar;
5595
5596 if ( (!doubleSupport && sdepth == CV_64F) ||
5597 (!haveScalar && (sdepth != ldepth || sdepth != udepth)) )
5598 return false;
5599
5600 int kercn = haveScalar ? cn : std::max(std::min(ocl::predictOptimalVectorWidth(_src, _lowerb, _upperb, _dst), 4), cn);
5601 if (kercn % cn != 0)
5602 kercn = cn;
5603 int colsPerWI = kercn / cn;
5604 String opts = format("%s-D cn=%d -D srcT=%s -D srcT1=%s -D dstT=%s -D kercn=%d -D depth=%d%s -D colsPerWI=%d",
5605 haveScalar ? "-D HAVE_SCALAR " : "", cn, ocl::typeToStr(CV_MAKE_TYPE(sdepth, kercn)),
5606 ocl::typeToStr(sdepth), ocl::typeToStr(CV_8UC(colsPerWI)), kercn, sdepth,
5607 doubleSupport ? " -D DOUBLE_SUPPORT" : "", colsPerWI);
5608
5609 ocl::Kernel ker("inrange", ocl::core::inrange_oclsrc, opts);
5610 if (ker.empty())
5611 return false;
5612
5613 _dst.create(ssize, CV_8UC1);
5614 UMat src = _src.getUMat(), dst = _dst.getUMat(), lscalaru, uscalaru;
5615 Mat lscalar, uscalar;
5616
5617 if (lbScalar && ubScalar)
5618 {
5619 lscalar = _lowerb.getMat();
5620 uscalar = _upperb.getMat();
5621
5622 size_t esz = src.elemSize();
5623 size_t blocksize = 36;
5624
5625 AutoBuffer<uchar> _buf(blocksize*(((int)lbScalar + (int)ubScalar)*esz + cn) + 2*cn*sizeof(int) + 128);
5626 uchar *buf = alignPtr(_buf + blocksize*cn, 16);
5627
5628 if( ldepth != sdepth && sdepth < CV_32S )
5629 {
5630 int* ilbuf = (int*)alignPtr(buf + blocksize*esz, 16);
5631 int* iubuf = ilbuf + cn;
5632
5633 BinaryFunc sccvtfunc = getConvertFunc(ldepth, CV_32S);
5634 sccvtfunc(lscalar.ptr(), 1, 0, 1, (uchar*)ilbuf, 1, Size(cn, 1), 0);
5635 sccvtfunc(uscalar.ptr(), 1, 0, 1, (uchar*)iubuf, 1, Size(cn, 1), 0);
5636 int minval = cvRound(getMinVal(sdepth)), maxval = cvRound(getMaxVal(sdepth));
5637
5638 for( int k = 0; k < cn; k++ )
5639 {
5640 if( ilbuf[k] > iubuf[k] || ilbuf[k] > maxval || iubuf[k] < minval )
5641 ilbuf[k] = minval+1, iubuf[k] = minval;
5642 }
5643 lscalar = Mat(cn, 1, CV_32S, ilbuf);
5644 uscalar = Mat(cn, 1, CV_32S, iubuf);
5645 }
5646
5647 lscalar.convertTo(lscalar, stype);
5648 uscalar.convertTo(uscalar, stype);
5649 }
5650 else
5651 {
5652 lscalaru = _lowerb.getUMat();
5653 uscalaru = _upperb.getUMat();
5654 }
5655
5656 ocl::KernelArg srcarg = ocl::KernelArg::ReadOnlyNoSize(src),
5657 dstarg = ocl::KernelArg::WriteOnly(dst, 1, colsPerWI);
5658
5659 if (haveScalar)
5660 {
5661 lscalar.copyTo(lscalaru);
5662 uscalar.copyTo(uscalaru);
5663
5664 ker.args(srcarg, dstarg, ocl::KernelArg::PtrReadOnly(lscalaru),
5665 ocl::KernelArg::PtrReadOnly(uscalaru), rowsPerWI);
5666 }
5667 else
5668 ker.args(srcarg, dstarg, ocl::KernelArg::ReadOnlyNoSize(lscalaru),
5669 ocl::KernelArg::ReadOnlyNoSize(uscalaru), rowsPerWI);
5670
5671 size_t globalsize[2] = { ssize.width / colsPerWI, (ssize.height + rowsPerWI - 1) / rowsPerWI };
5672 return ker.run(2, globalsize, NULL, false);
5673 }
5674
5675 #endif
5676
5677 }
5678
5679 void cv::inRange(InputArray _src, InputArray _lowerb,
5680 InputArray _upperb, OutputArray _dst)
5681 {
5682 CV_OCL_RUN(_src.dims() <= 2 && _lowerb.dims() <= 2 &&
5683 _upperb.dims() <= 2 && OCL_PERFORMANCE_CHECK(_dst.isUMat()),
5684 ocl_inRange(_src, _lowerb, _upperb, _dst))
5685
5686 int skind = _src.kind(), lkind = _lowerb.kind(), ukind = _upperb.kind();
5687 Mat src = _src.getMat(), lb = _lowerb.getMat(), ub = _upperb.getMat();
5688
5689 bool lbScalar = false, ubScalar = false;
5690
5691 if( (lkind == _InputArray::MATX && skind != _InputArray::MATX) ||
5692 src.size != lb.size || src.type() != lb.type() )
5693 {
5694 if( !checkScalar(lb, src.type(), lkind, skind) )
5695 CV_Error( CV_StsUnmatchedSizes,
5696 "The lower boundary is neither an array of the same size and same type as src, nor a scalar");
5697 lbScalar = true;
5698 }
5699
5700 if( (ukind == _InputArray::MATX && skind != _InputArray::MATX) ||
5701 src.size != ub.size || src.type() != ub.type() )
5702 {
5703 if( !checkScalar(ub, src.type(), ukind, skind) )
5704 CV_Error( CV_StsUnmatchedSizes,
5705 "The upper boundary is neither an array of the same size and same type as src, nor a scalar");
5706 ubScalar = true;
5707 }
5708
5709 CV_Assert(lbScalar == ubScalar);
5710
5711 int cn = src.channels(), depth = src.depth();
5712
5713 size_t esz = src.elemSize();
5714 size_t blocksize0 = (size_t)(BLOCK_SIZE + esz-1)/esz;
5715
5716 _dst.create(src.dims, src.size, CV_8UC1);
5717 Mat dst = _dst.getMat();
5718 InRangeFunc func = getInRangeFunc(depth);
5719
5720 const Mat* arrays_sc[] = { &src, &dst, 0 };
5721 const Mat* arrays_nosc[] = { &src, &dst, &lb, &ub, 0 };
5722 uchar* ptrs[4];
5723
5724 NAryMatIterator it(lbScalar && ubScalar ? arrays_sc : arrays_nosc, ptrs);
5725 size_t total = it.size, blocksize = std::min(total, blocksize0);
5726
5727 AutoBuffer<uchar> _buf(blocksize*(((int)lbScalar + (int)ubScalar)*esz + cn) + 2*cn*sizeof(int) + 128);
5728 uchar *buf = _buf, *mbuf = buf, *lbuf = 0, *ubuf = 0;
5729 buf = alignPtr(buf + blocksize*cn, 16);
5730
5731 if( lbScalar && ubScalar )
5732 {
5733 lbuf = buf;
5734 ubuf = buf = alignPtr(buf + blocksize*esz, 16);
5735
5736 CV_Assert( lb.type() == ub.type() );
5737 int scdepth = lb.depth();
5738
5739 if( scdepth != depth && depth < CV_32S )
5740 {
5741 int* ilbuf = (int*)alignPtr(buf + blocksize*esz, 16);
5742 int* iubuf = ilbuf + cn;
5743
5744 BinaryFunc sccvtfunc = getConvertFunc(scdepth, CV_32S);
5745 sccvtfunc(lb.ptr(), 1, 0, 1, (uchar*)ilbuf, 1, Size(cn, 1), 0);
5746 sccvtfunc(ub.ptr(), 1, 0, 1, (uchar*)iubuf, 1, Size(cn, 1), 0);
5747 int minval = cvRound(getMinVal(depth)), maxval = cvRound(getMaxVal(depth));
5748
5749 for( int k = 0; k < cn; k++ )
5750 {
5751 if( ilbuf[k] > iubuf[k] || ilbuf[k] > maxval || iubuf[k] < minval )
5752 ilbuf[k] = minval+1, iubuf[k] = minval;
5753 }
5754 lb = Mat(cn, 1, CV_32S, ilbuf);
5755 ub = Mat(cn, 1, CV_32S, iubuf);
5756 }
5757
5758 convertAndUnrollScalar( lb, src.type(), lbuf, blocksize );
5759 convertAndUnrollScalar( ub, src.type(), ubuf, blocksize );
5760 }
5761
5762 for( size_t i = 0; i < it.nplanes; i++, ++it )
5763 {
5764 for( size_t j = 0; j < total; j += blocksize )
5765 {
5766 int bsz = (int)MIN(total - j, blocksize);
5767 size_t delta = bsz*esz;
5768 uchar *lptr = lbuf, *uptr = ubuf;
5769 if( !lbScalar )
5770 {
5771 lptr = ptrs[2];
5772 ptrs[2] += delta;
5773 }
5774 if( !ubScalar )
5775 {
5776 int idx = !lbScalar ? 3 : 2;
5777 uptr = ptrs[idx];
5778 ptrs[idx] += delta;
5779 }
5780 func( ptrs[0], 0, lptr, 0, uptr, 0, cn == 1 ? ptrs[1] : mbuf, 0, Size(bsz*cn, 1));
5781 if( cn > 1 )
5782 inRangeReduce(mbuf, ptrs[1], bsz, cn);
5783 ptrs[0] += delta;
5784 ptrs[1] += bsz;
5785 }
5786 }
5787 }
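// Usage sketch, guarded out of the build (names are hypothetical). Both bounds
// are inclusive, and for multi-channel sources the output is a single-channel
// mask that is 255 only where every channel lies within its respective bound.
#if 0
static void inRangeExample()
{
    cv::Mat src = (cv::Mat_<uchar>(1, 4) << 10, 50, 128, 200);
    cv::Mat mask;
    cv::inRange(src, cv::Scalar(40), cv::Scalar(150), mask);  // mask = [0, 255, 255, 0]
}
#endif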
5788
5789 /****************************************************************************************\
5790 * Earlier API: cvAdd etc. *
5791 \****************************************************************************************/
5792
5793 CV_IMPL void
5794 cvNot( const CvArr* srcarr, CvArr* dstarr )
5795 {
5796 cv::Mat src = cv::cvarrToMat(srcarr), dst = cv::cvarrToMat(dstarr);
5797 CV_Assert( src.size == dst.size && src.type() == dst.type() );
5798 cv::bitwise_not( src, dst );
5799 }
5800
5801
5802 CV_IMPL void
5803 cvAnd( const CvArr* srcarr1, const CvArr* srcarr2, CvArr* dstarr, const CvArr* maskarr )
5804 {
5805 cv::Mat src1 = cv::cvarrToMat(srcarr1), src2 = cv::cvarrToMat(srcarr2),
5806 dst = cv::cvarrToMat(dstarr), mask;
5807 CV_Assert( src1.size == dst.size && src1.type() == dst.type() );
5808 if( maskarr )
5809 mask = cv::cvarrToMat(maskarr);
5810 cv::bitwise_and( src1, src2, dst, mask );
5811 }
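// The legacy wrappers in this section only validate sizes/types and forward to
// the C++ core via cvarrToMat. A minimal sketch of calling the old interface,
// guarded out of the build (variable names are hypothetical):
#if 0
static void legacyAndExample()
{
    CvMat* a = cvCreateMat(2, 2, CV_8UC1);
    CvMat* b = cvCreateMat(2, 2, CV_8UC1);
    CvMat* r = cvCreateMat(2, 2, CV_8UC1);
    cvSet(a, cvScalar(0xF0));
    cvSet(b, cvScalar(0x3C));
    cvAnd(a, b, r, NULL);              // r is filled with 0x30, as with cv::bitwise_and
    cvReleaseMat(&a); cvReleaseMat(&b); cvReleaseMat(&r);
}
#endif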
5812
5813
5814 CV_IMPL void
5815 cvOr( const CvArr* srcarr1, const CvArr* srcarr2, CvArr* dstarr, const CvArr* maskarr )
5816 {
5817 cv::Mat src1 = cv::cvarrToMat(srcarr1), src2 = cv::cvarrToMat(srcarr2),
5818 dst = cv::cvarrToMat(dstarr), mask;
5819 CV_Assert( src1.size == dst.size && src1.type() == dst.type() );
5820 if( maskarr )
5821 mask = cv::cvarrToMat(maskarr);
5822 cv::bitwise_or( src1, src2, dst, mask );
5823 }
5824
5825
5826 CV_IMPL void
5827 cvXor( const CvArr* srcarr1, const CvArr* srcarr2, CvArr* dstarr, const CvArr* maskarr )
5828 {
5829 cv::Mat src1 = cv::cvarrToMat(srcarr1), src2 = cv::cvarrToMat(srcarr2),
5830 dst = cv::cvarrToMat(dstarr), mask;
5831 CV_Assert( src1.size == dst.size && src1.type() == dst.type() );
5832 if( maskarr )
5833 mask = cv::cvarrToMat(maskarr);
5834 cv::bitwise_xor( src1, src2, dst, mask );
5835 }
5836
5837
5838 CV_IMPL void
5839 cvAndS( const CvArr* srcarr, CvScalar s, CvArr* dstarr, const CvArr* maskarr )
5840 {
5841 cv::Mat src = cv::cvarrToMat(srcarr), dst = cv::cvarrToMat(dstarr), mask;
5842 CV_Assert( src.size == dst.size && src.type() == dst.type() );
5843 if( maskarr )
5844 mask = cv::cvarrToMat(maskarr);
5845 cv::bitwise_and( src, (const cv::Scalar&)s, dst, mask );
5846 }
5847
5848
5849 CV_IMPL void
5850 cvOrS( const CvArr* srcarr, CvScalar s, CvArr* dstarr, const CvArr* maskarr )
5851 {
5852 cv::Mat src = cv::cvarrToMat(srcarr), dst = cv::cvarrToMat(dstarr), mask;
5853 CV_Assert( src.size == dst.size && src.type() == dst.type() );
5854 if( maskarr )
5855 mask = cv::cvarrToMat(maskarr);
5856 cv::bitwise_or( src, (const cv::Scalar&)s, dst, mask );
5857 }
5858
5859
5860 CV_IMPL void
5861 cvXorS( const CvArr* srcarr, CvScalar s, CvArr* dstarr, const CvArr* maskarr )
5862 {
5863 cv::Mat src = cv::cvarrToMat(srcarr), dst = cv::cvarrToMat(dstarr), mask;
5864 CV_Assert( src.size == dst.size && src.type() == dst.type() );
5865 if( maskarr )
5866 mask = cv::cvarrToMat(maskarr);
5867 cv::bitwise_xor( src, (const cv::Scalar&)s, dst, mask );
5868 }
5869
5870
5871 CV_IMPL void cvAdd( const CvArr* srcarr1, const CvArr* srcarr2, CvArr* dstarr, const CvArr* maskarr )
5872 {
5873 cv::Mat src1 = cv::cvarrToMat(srcarr1), src2 = cv::cvarrToMat(srcarr2),
5874 dst = cv::cvarrToMat(dstarr), mask;
5875 CV_Assert( src1.size == dst.size && src1.channels() == dst.channels() );
5876 if( maskarr )
5877 mask = cv::cvarrToMat(maskarr);
5878 cv::add( src1, src2, dst, mask, dst.type() );
5879 }
5880
5881
5882 CV_IMPL void cvSub( const CvArr* srcarr1, const CvArr* srcarr2, CvArr* dstarr, const CvArr* maskarr )
5883 {
5884 cv::Mat src1 = cv::cvarrToMat(srcarr1), src2 = cv::cvarrToMat(srcarr2),
5885 dst = cv::cvarrToMat(dstarr), mask;
5886 CV_Assert( src1.size == dst.size && src1.channels() == dst.channels() );
5887 if( maskarr )
5888 mask = cv::cvarrToMat(maskarr);
5889 cv::subtract( src1, src2, dst, mask, dst.type() );
5890 }
5891
5892
5893 CV_IMPL void cvAddS( const CvArr* srcarr1, CvScalar value, CvArr* dstarr, const CvArr* maskarr )
5894 {
5895 cv::Mat src1 = cv::cvarrToMat(srcarr1),
5896 dst = cv::cvarrToMat(dstarr), mask;
5897 CV_Assert( src1.size == dst.size && src1.channels() == dst.channels() );
5898 if( maskarr )
5899 mask = cv::cvarrToMat(maskarr);
5900 cv::add( src1, (const cv::Scalar&)value, dst, mask, dst.type() );
5901 }
5902
5903
5904 CV_IMPL void cvSubRS( const CvArr* srcarr1, CvScalar value, CvArr* dstarr, const CvArr* maskarr )
5905 {
5906 cv::Mat src1 = cv::cvarrToMat(srcarr1),
5907 dst = cv::cvarrToMat(dstarr), mask;
5908 CV_Assert( src1.size == dst.size && src1.channels() == dst.channels() );
5909 if( maskarr )
5910 mask = cv::cvarrToMat(maskarr);
5911 cv::subtract( (const cv::Scalar&)value, src1, dst, mask, dst.type() );
5912 }
5913
5914
5915 CV_IMPL void cvMul( const CvArr* srcarr1, const CvArr* srcarr2,
5916 CvArr* dstarr, double scale )
5917 {
5918 cv::Mat src1 = cv::cvarrToMat(srcarr1), src2 = cv::cvarrToMat(srcarr2),
5919 dst = cv::cvarrToMat(dstarr);
5920 CV_Assert( src1.size == dst.size && src1.channels() == dst.channels() );
5921 cv::multiply( src1, src2, dst, scale, dst.type() );
5922 }
5923
5924
5925 CV_IMPL void cvDiv( const CvArr* srcarr1, const CvArr* srcarr2,
5926 CvArr* dstarr, double scale )
5927 {
5928 cv::Mat src2 = cv::cvarrToMat(srcarr2),
5929 dst = cv::cvarrToMat(dstarr), mask;
5930 CV_Assert( src2.size == dst.size && src2.channels() == dst.channels() );
5931
5932 if( srcarr1 )
5933 cv::divide( cv::cvarrToMat(srcarr1), src2, dst, scale, dst.type() );
5934 else
5935 cv::divide( scale, src2, dst, dst.type() );
5936 }
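// Behavioral note worth illustrating: passing NULL as the first operand makes
// cvDiv compute the scaled reciprocal, dst = scale / src2, matching the
// cv::divide overload used above. Sketch, guarded out of the build (names are
// hypothetical):
#if 0
static void cvDivReciprocalExample()
{
    CvMat* m = cvCreateMat(1, 3, CV_32FC1);
    CvMat* r = cvCreateMat(1, 3, CV_32FC1);
    cvSet(m, cvScalar(2.0));
    cvDiv(NULL, m, r, 1.0);            // r becomes {0.5f, 0.5f, 0.5f}
    cvReleaseMat(&m); cvReleaseMat(&r);
}
#endif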
5937
5938
5939 CV_IMPL void
5940 cvAddWeighted( const CvArr* srcarr1, double alpha,
5941 const CvArr* srcarr2, double beta,
5942 double gamma, CvArr* dstarr )
5943 {
5944 cv::Mat src1 = cv::cvarrToMat(srcarr1), src2 = cv::cvarrToMat(srcarr2),
5945 dst = cv::cvarrToMat(dstarr);
5946 CV_Assert( src1.size == dst.size && src1.channels() == dst.channels() );
5947 cv::addWeighted( src1, alpha, src2, beta, gamma, dst, dst.type() );
5948 }
5949
5950
5951 CV_IMPL void
5952 cvAbsDiff( const CvArr* srcarr1, const CvArr* srcarr2, CvArr* dstarr )
5953 {
5954 cv::Mat src1 = cv::cvarrToMat(srcarr1), dst = cv::cvarrToMat(dstarr);
5955 CV_Assert( src1.size == dst.size && src1.type() == dst.type() );
5956
5957 cv::absdiff( src1, cv::cvarrToMat(srcarr2), dst );
5958 }
5959
5960
5961 CV_IMPL void
5962 cvAbsDiffS( const CvArr* srcarr1, CvArr* dstarr, CvScalar scalar )
5963 {
5964 cv::Mat src1 = cv::cvarrToMat(srcarr1), dst = cv::cvarrToMat(dstarr);
5965 CV_Assert( src1.size == dst.size && src1.type() == dst.type() );
5966
5967 cv::absdiff( src1, (const cv::Scalar&)scalar, dst );
5968 }
5969
5970
5971 CV_IMPL void
5972 cvInRange( const void* srcarr1, const void* srcarr2,
5973 const void* srcarr3, void* dstarr )
5974 {
5975 cv::Mat src1 = cv::cvarrToMat(srcarr1), dst = cv::cvarrToMat(dstarr);
5976 CV_Assert( src1.size == dst.size && dst.type() == CV_8U );
5977
5978 cv::inRange( src1, cv::cvarrToMat(srcarr2), cv::cvarrToMat(srcarr3), dst );
5979 }
5980
5981
5982 CV_IMPL void
5983 cvInRangeS( const void* srcarr1, CvScalar lowerb, CvScalar upperb, void* dstarr )
5984 {
5985 cv::Mat src1 = cv::cvarrToMat(srcarr1), dst = cv::cvarrToMat(dstarr);
5986 CV_Assert( src1.size == dst.size && dst.type() == CV_8U );
5987
5988 cv::inRange( src1, (const cv::Scalar&)lowerb, (const cv::Scalar&)upperb, dst );
5989 }
5990
5991
5992 CV_IMPL void
5993 cvCmp( const void* srcarr1, const void* srcarr2, void* dstarr, int cmp_op )
5994 {
5995 cv::Mat src1 = cv::cvarrToMat(srcarr1), dst = cv::cvarrToMat(dstarr);
5996 CV_Assert( src1.size == dst.size && dst.type() == CV_8U );
5997
5998 cv::compare( src1, cv::cvarrToMat(srcarr2), dst, cmp_op );
5999 }
6000
6001
6002 CV_IMPL void
6003 cvCmpS( const void* srcarr1, double value, void* dstarr, int cmp_op )
6004 {
6005 cv::Mat src1 = cv::cvarrToMat(srcarr1), dst = cv::cvarrToMat(dstarr);
6006 CV_Assert( src1.size == dst.size && dst.type() == CV_8U );
6007
6008 cv::compare( src1, value, dst, cmp_op );
6009 }
6010
6011
6012 CV_IMPL void
6013 cvMin( const void* srcarr1, const void* srcarr2, void* dstarr )
6014 {
6015 cv::Mat src1 = cv::cvarrToMat(srcarr1), dst = cv::cvarrToMat(dstarr);
6016 CV_Assert( src1.size == dst.size && src1.type() == dst.type() );
6017
6018 cv::min( src1, cv::cvarrToMat(srcarr2), dst );
6019 }
6020
6021
6022 CV_IMPL void
6023 cvMax( const void* srcarr1, const void* srcarr2, void* dstarr )
6024 {
6025 cv::Mat src1 = cv::cvarrToMat(srcarr1), dst = cv::cvarrToMat(dstarr);
6026 CV_Assert( src1.size == dst.size && src1.type() == dst.type() );
6027
6028 cv::max( src1, cv::cvarrToMat(srcarr2), dst );
6029 }
6030
6031
6032 CV_IMPL void
6033 cvMinS( const void* srcarr1, double value, void* dstarr )
6034 {
6035 cv::Mat src1 = cv::cvarrToMat(srcarr1), dst = cv::cvarrToMat(dstarr);
6036 CV_Assert( src1.size == dst.size && src1.type() == dst.type() );
6037
6038 cv::min( src1, value, dst );
6039 }
6040
6041
6042 CV_IMPL void
6043 cvMaxS( const void* srcarr1, double value, void* dstarr )
6044 {
6045 cv::Mat src1 = cv::cvarrToMat(srcarr1), dst = cv::cvarrToMat(dstarr);
6046 CV_Assert( src1.size == dst.size && src1.type() == dst.type() );
6047
6048 cv::max( src1, value, dst );
6049 }
6050
6051 /* End of file. */
6052