1 //M*//////////////////////////////////////////////////////////////////////////////////////
2 //
3 // IMPORTANT: READ BEFORE DOWNLOADING, COPYING, INSTALLING OR USING.
4 //
5 // By downloading, copying, installing or using the software you agree to this license.
6 // If you do not agree to this license, do not download, install,
7 // copy or use the software.
8 //
9 //
10 // License Agreement
11 // For Open Source Computer Vision Library
12 //
13 // Copyright (C) 2000, Intel Corporation, all rights reserved.
14 // Copyright (C) 2013, OpenCV Foundation, all rights reserved.
15 // Third party copyrights are property of their respective owners.
16 //
17 // Redistribution and use in source and binary forms, with or without modification,
18 // are permitted provided that the following conditions are met:
19 //
20 // * Redistribution's of source code must retain the above copyright notice,
21 // this list of conditions and the following disclaimer.
22 //
23 // * Redistribution's in binary form must reproduce the above copyright notice,
24 // this list of conditions and the following disclaimer in the documentation
25 // and/or other materials provided with the distribution.
26 //
27 // * The name of the copyright holders may not be used to endorse or promote products
28 // derived from this software without specific prior written permission.
29 //
30 // This software is provided by the copyright holders and contributors "as is" and
31 // any express or implied warranties, including, but not limited to, the implied
32 // warranties of merchantability and fitness for a particular purpose are disclaimed.
33 // In no event shall the Intel Corporation or contributors be liable for any direct,
34 // indirect, incidental, special, exemplary, or consequential damages
35 // (including, but not limited to, procurement of substitute goods or services;
36 // loss of use, data, or profits; or business interruption) however caused
37 // and on any theory of liability, whether in contract, strict liability,
38 // or tort (including negligence or otherwise) arising in any way out of
39 // the use of this software, even if advised of the possibility of such damage.
40 //
41 //M*/
42
43 /****************************************************************************************\
44 * Very fast SAD-based (Sum-of-Absolute-Differences) stereo correspondence algorithm. *
45 * Contributed by Kurt Konolige *
46 \****************************************************************************************/
47
48 #include "precomp.hpp"
49 #include <stdio.h>
50 #include <limits>
51 #include "opencl_kernels_calib3d.hpp"
52
53 namespace cv
54 {
55
56 struct StereoBMParams
57 {
StereoBMParamscv::StereoBMParams58 StereoBMParams(int _numDisparities=64, int _SADWindowSize=21)
59 {
60 preFilterType = StereoBM::PREFILTER_XSOBEL;
61 preFilterSize = 9;
62 preFilterCap = 31;
63 SADWindowSize = _SADWindowSize;
64 minDisparity = 0;
65 numDisparities = _numDisparities > 0 ? _numDisparities : 64;
66 textureThreshold = 10;
67 uniquenessRatio = 15;
68 speckleRange = speckleWindowSize = 0;
69 roi1 = roi2 = Rect(0,0,0,0);
70 disp12MaxDiff = -1;
71 dispType = CV_16S;
72 }
73
74 int preFilterType;
75 int preFilterSize;
76 int preFilterCap;
77 int SADWindowSize;
78 int minDisparity;
79 int numDisparities;
80 int textureThreshold;
81 int uniquenessRatio;
82 int speckleRange;
83 int speckleWindowSize;
84 Rect roi1, roi2;
85 int disp12MaxDiff;
86 int dispType;
87 };
88
ocl_prefilter_norm(InputArray _input,OutputArray _output,int winsize,int prefilterCap)89 static bool ocl_prefilter_norm(InputArray _input, OutputArray _output, int winsize, int prefilterCap)
90 {
91 ocl::Kernel k("prefilter_norm", ocl::calib3d::stereobm_oclsrc, cv::format("-D WSZ=%d", winsize));
92 if(k.empty())
93 return false;
94
95 int scale_g = winsize*winsize/8, scale_s = (1024 + scale_g)/(scale_g*2);
96 scale_g *= scale_s;
97
98 UMat input = _input.getUMat(), output;
99 _output.create(input.size(), input.type());
100 output = _output.getUMat();
101
102 size_t globalThreads[3] = { input.cols, input.rows, 1 };
103
104 k.args(ocl::KernelArg::PtrReadOnly(input), ocl::KernelArg::PtrWriteOnly(output), input.rows, input.cols,
105 prefilterCap, scale_g, scale_s);
106
107 return k.run(2, globalThreads, NULL, false);
108 }
109
prefilterNorm(const Mat & src,Mat & dst,int winsize,int ftzero,uchar * buf)110 static void prefilterNorm( const Mat& src, Mat& dst, int winsize, int ftzero, uchar* buf )
111 {
112 int x, y, wsz2 = winsize/2;
113 int* vsum = (int*)alignPtr(buf + (wsz2 + 1)*sizeof(vsum[0]), 32);
114 int scale_g = winsize*winsize/8, scale_s = (1024 + scale_g)/(scale_g*2);
115 const int OFS = 256*5, TABSZ = OFS*2 + 256;
116 uchar tab[TABSZ];
117 const uchar* sptr = src.ptr();
118 int srcstep = (int)src.step;
119 Size size = src.size();
120
121 scale_g *= scale_s;
122
123 for( x = 0; x < TABSZ; x++ )
124 tab[x] = (uchar)(x - OFS < -ftzero ? 0 : x - OFS > ftzero ? ftzero*2 : x - OFS + ftzero);
125
126 for( x = 0; x < size.width; x++ )
127 vsum[x] = (ushort)(sptr[x]*(wsz2 + 2));
128
129 for( y = 1; y < wsz2; y++ )
130 {
131 for( x = 0; x < size.width; x++ )
132 vsum[x] = (ushort)(vsum[x] + sptr[srcstep*y + x]);
133 }
134
135 for( y = 0; y < size.height; y++ )
136 {
137 const uchar* top = sptr + srcstep*MAX(y-wsz2-1,0);
138 const uchar* bottom = sptr + srcstep*MIN(y+wsz2,size.height-1);
139 const uchar* prev = sptr + srcstep*MAX(y-1,0);
140 const uchar* curr = sptr + srcstep*y;
141 const uchar* next = sptr + srcstep*MIN(y+1,size.height-1);
142 uchar* dptr = dst.ptr<uchar>(y);
143
144 for( x = 0; x < size.width; x++ )
145 vsum[x] = (ushort)(vsum[x] + bottom[x] - top[x]);
146
147 for( x = 0; x <= wsz2; x++ )
148 {
149 vsum[-x-1] = vsum[0];
150 vsum[size.width+x] = vsum[size.width-1];
151 }
152
153 int sum = vsum[0]*(wsz2 + 1);
154 for( x = 1; x <= wsz2; x++ )
155 sum += vsum[x];
156
157 int val = ((curr[0]*5 + curr[1] + prev[0] + next[0])*scale_g - sum*scale_s) >> 10;
158 dptr[0] = tab[val + OFS];
159
160 for( x = 1; x < size.width-1; x++ )
161 {
162 sum += vsum[x+wsz2] - vsum[x-wsz2-1];
163 val = ((curr[x]*4 + curr[x-1] + curr[x+1] + prev[x] + next[x])*scale_g - sum*scale_s) >> 10;
164 dptr[x] = tab[val + OFS];
165 }
166
167 sum += vsum[x+wsz2] - vsum[x-wsz2-1];
168 val = ((curr[x]*5 + curr[x-1] + prev[x] + next[x])*scale_g - sum*scale_s) >> 10;
169 dptr[x] = tab[val + OFS];
170 }
171 }
172
ocl_prefilter_xsobel(InputArray _input,OutputArray _output,int prefilterCap)173 static bool ocl_prefilter_xsobel(InputArray _input, OutputArray _output, int prefilterCap)
174 {
175 ocl::Kernel k("prefilter_xsobel", ocl::calib3d::stereobm_oclsrc);
176 if(k.empty())
177 return false;
178
179 UMat input = _input.getUMat(), output;
180 _output.create(input.size(), input.type());
181 output = _output.getUMat();
182
183 size_t globalThreads[3] = { input.cols, input.rows, 1 };
184
185 k.args(ocl::KernelArg::PtrReadOnly(input), ocl::KernelArg::PtrWriteOnly(output), input.rows, input.cols, prefilterCap);
186
187 return k.run(2, globalThreads, NULL, false);
188 }
189
190 static void
prefilterXSobel(const Mat & src,Mat & dst,int ftzero)191 prefilterXSobel( const Mat& src, Mat& dst, int ftzero )
192 {
193 int x, y;
194 const int OFS = 256*4, TABSZ = OFS*2 + 256;
195 uchar tab[TABSZ];
196 Size size = src.size();
197
198 for( x = 0; x < TABSZ; x++ )
199 tab[x] = (uchar)(x - OFS < -ftzero ? 0 : x - OFS > ftzero ? ftzero*2 : x - OFS + ftzero);
200 uchar val0 = tab[0 + OFS];
201
202 #if CV_SSE2
203 volatile bool useSIMD = checkHardwareSupport(CV_CPU_SSE2);
204 #endif
205
206 for( y = 0; y < size.height-1; y += 2 )
207 {
208 const uchar* srow1 = src.ptr<uchar>(y);
209 const uchar* srow0 = y > 0 ? srow1 - src.step : size.height > 1 ? srow1 + src.step : srow1;
210 const uchar* srow2 = y < size.height-1 ? srow1 + src.step : size.height > 1 ? srow1 - src.step : srow1;
211 const uchar* srow3 = y < size.height-2 ? srow1 + src.step*2 : srow1;
212 uchar* dptr0 = dst.ptr<uchar>(y);
213 uchar* dptr1 = dptr0 + dst.step;
214
215 dptr0[0] = dptr0[size.width-1] = dptr1[0] = dptr1[size.width-1] = val0;
216 x = 1;
217
218 #if CV_NEON
219 int16x8_t ftz = vdupq_n_s16 ((short) ftzero);
220 uint8x8_t ftz2 = vdup_n_u8 (cv::saturate_cast<uchar>(ftzero*2));
221
222 for(; x <=size.width-9; x += 8 )
223 {
224 uint8x8_t c0 = vld1_u8 (srow0 + x - 1);
225 uint8x8_t c1 = vld1_u8 (srow1 + x - 1);
226 uint8x8_t d0 = vld1_u8 (srow0 + x + 1);
227 uint8x8_t d1 = vld1_u8 (srow1 + x + 1);
228
229 int16x8_t t0 = vreinterpretq_s16_u16 (vsubl_u8 (d0, c0));
230 int16x8_t t1 = vreinterpretq_s16_u16 (vsubl_u8 (d1, c1));
231
232 uint8x8_t c2 = vld1_u8 (srow2 + x - 1);
233 uint8x8_t c3 = vld1_u8 (srow3 + x - 1);
234 uint8x8_t d2 = vld1_u8 (srow2 + x + 1);
235 uint8x8_t d3 = vld1_u8 (srow3 + x + 1);
236
237 int16x8_t t2 = vreinterpretq_s16_u16 (vsubl_u8 (d2, c2));
238 int16x8_t t3 = vreinterpretq_s16_u16 (vsubl_u8 (d3, c3));
239
240 int16x8_t v0 = vaddq_s16 (vaddq_s16 (t2, t0), vaddq_s16 (t1, t1));
241 int16x8_t v1 = vaddq_s16 (vaddq_s16 (t3, t1), vaddq_s16 (t2, t2));
242
243
244 uint8x8_t v0_u8 = vqmovun_s16 (vaddq_s16 (v0, ftz));
245 uint8x8_t v1_u8 = vqmovun_s16 (vaddq_s16 (v1, ftz));
246 v0_u8 = vmin_u8 (v0_u8, ftz2);
247 v1_u8 = vmin_u8 (v1_u8, ftz2);
248 vqmovun_s16 (vaddq_s16 (v1, ftz));
249
250 vst1_u8 (dptr0 + x, v0_u8);
251 vst1_u8 (dptr1 + x, v1_u8);
252 }
253 #elif CV_SSE2
254 if( useSIMD )
255 {
256 __m128i z = _mm_setzero_si128(), ftz = _mm_set1_epi16((short)ftzero),
257 ftz2 = _mm_set1_epi8(cv::saturate_cast<uchar>(ftzero*2));
258 for( ; x <= size.width-9; x += 8 )
259 {
260 __m128i c0 = _mm_unpacklo_epi8(_mm_loadl_epi64((__m128i*)(srow0 + x - 1)), z);
261 __m128i c1 = _mm_unpacklo_epi8(_mm_loadl_epi64((__m128i*)(srow1 + x - 1)), z);
262 __m128i d0 = _mm_unpacklo_epi8(_mm_loadl_epi64((__m128i*)(srow0 + x + 1)), z);
263 __m128i d1 = _mm_unpacklo_epi8(_mm_loadl_epi64((__m128i*)(srow1 + x + 1)), z);
264
265 d0 = _mm_sub_epi16(d0, c0);
266 d1 = _mm_sub_epi16(d1, c1);
267
268 __m128i c2 = _mm_unpacklo_epi8(_mm_loadl_epi64((__m128i*)(srow2 + x - 1)), z);
269 __m128i c3 = _mm_unpacklo_epi8(_mm_loadl_epi64((__m128i*)(srow3 + x - 1)), z);
270 __m128i d2 = _mm_unpacklo_epi8(_mm_loadl_epi64((__m128i*)(srow2 + x + 1)), z);
271 __m128i d3 = _mm_unpacklo_epi8(_mm_loadl_epi64((__m128i*)(srow3 + x + 1)), z);
272
273 d2 = _mm_sub_epi16(d2, c2);
274 d3 = _mm_sub_epi16(d3, c3);
275
276 __m128i v0 = _mm_add_epi16(d0, _mm_add_epi16(d2, _mm_add_epi16(d1, d1)));
277 __m128i v1 = _mm_add_epi16(d1, _mm_add_epi16(d3, _mm_add_epi16(d2, d2)));
278 v0 = _mm_packus_epi16(_mm_add_epi16(v0, ftz), _mm_add_epi16(v1, ftz));
279 v0 = _mm_min_epu8(v0, ftz2);
280
281 _mm_storel_epi64((__m128i*)(dptr0 + x), v0);
282 _mm_storel_epi64((__m128i*)(dptr1 + x), _mm_unpackhi_epi64(v0, v0));
283 }
284 }
285 #endif
286
287 for( ; x < size.width-1; x++ )
288 {
289 int d0 = srow0[x+1] - srow0[x-1], d1 = srow1[x+1] - srow1[x-1],
290 d2 = srow2[x+1] - srow2[x-1], d3 = srow3[x+1] - srow3[x-1];
291 int v0 = tab[d0 + d1*2 + d2 + OFS];
292 int v1 = tab[d1 + d2*2 + d3 + OFS];
293 dptr0[x] = (uchar)v0;
294 dptr1[x] = (uchar)v1;
295 }
296 }
297
298 #if CV_NEON
299 uint8x16_t val0_16 = vdupq_n_u8 (val0);
300 #endif
301
302 for( ; y < size.height; y++ )
303 {
304 uchar* dptr = dst.ptr<uchar>(y);
305 x = 0;
306 #if CV_NEON
307 for(; x <= size.width-16; x+=16 )
308 vst1q_u8 (dptr + x, val0_16);
309 #endif
310 for(; x < size.width; x++ )
311 dptr[x] = val0;
312 }
313 }
314
315
// Number of fractional bits in the fixed-point disparity values: the "filtered" marker is
// computed as (mindisp - 1) << DISPARITY_SHIFT and valid disparities are stored scaled by 16.
316 static const int DISPARITY_SHIFT = 4;
317
318 #if CV_SSE2
// SSE2 implementation of the SAD block-matching search.
// Reads 8-bit pixels from left/right, writes one `short` disparity per pixel into disp
// (fixed point, scaled by 16) and, when cost.data is non-null, the SAD of the selected
// disparity into cost as `short`.
//   state     - supplies SADWindowSize, numDisparities, minDisparity, preFilterCap,
//               textureThreshold and uniquenessRatio
//   buf       - scratch memory that is carved below into sad, hsad0, htext and cbuf0
//   _dy0/_dy1 - number of rows before row 0 / after the last row that are also read through
//               the left/right row pointers; both are clamped to wsz2+1
// SAD accumulators are 16-bit here.
// NOTE(review): disparities are processed 16 at a time with aligned loads/stores, so this
// presumably requires numDisparities to be a multiple of 16 - confirm against the caller.
findStereoCorrespondenceBM_SSE2(const Mat & left,const Mat & right,Mat & disp,Mat & cost,StereoBMParams & state,uchar * buf,int _dy0,int _dy1)319 static void findStereoCorrespondenceBM_SSE2( const Mat& left, const Mat& right,
320 Mat& disp, Mat& cost, StereoBMParams& state,
321 uchar* buf, int _dy0, int _dy1 )
322 {
323 const int ALIGN = 16;
324 int x, y, d;
325 int wsz = state.SADWindowSize, wsz2 = wsz/2;
326 int dy0 = MIN(_dy0, wsz2+1), dy1 = MIN(_dy1, wsz2+1);
327 int ndisp = state.numDisparities;
328 int mindisp = state.minDisparity;
329 int lofs = MAX(ndisp - 1 + mindisp, 0);
330 int rofs = -MIN(ndisp - 1 + mindisp, 0);
331 int width = left.cols, height = left.rows;
332 int width1 = width - rofs - ndisp + 1;
333 int ftzero = state.preFilterCap;
334 int textureThreshold = state.textureThreshold;
335 int uniquenessRatio = state.uniquenessRatio;
// value written for pixels without an accepted disparity
336 short FILTERED = (short)((mindisp - 1) << DISPARITY_SHIFT);
337
338 ushort *sad, *hsad0, *hsad, *hsad_sub;
339 int *htext;
340 uchar *cbuf0, *cbuf;
341 const uchar* lptr0 = left.ptr() + lofs;
342 const uchar* rptr0 = right.ptr() + rofs;
343 const uchar *lptr, *lptr_sub, *rptr;
344 short* dptr = disp.ptr<short>();
345 int sstep = (int)left.step;
346 int dstep = (int)(disp.step/sizeof(dptr[0]));
347 int cstep = (height + dy0 + dy1)*ndisp;
348 short costbuf = 0;
349 int coststep = cost.data ? (int)(cost.step/sizeof(costbuf)) : 0;
350 const int TABSZ = 256;
351 uchar tab[TABSZ];
352 const __m128i d0_8 = _mm_setr_epi16(0,1,2,3,4,5,6,7), dd_8 = _mm_set1_epi16(8);
353
// carve the scratch buffer:
//   sad   - window SAD per disparity for the current pixel
//   hsad0 - per-row horizontal sums of absolute differences, one ndisp-sized slice per row
//   htext - per-row sums of tab[] values of the left image (texture measure)
//   cbuf0 - per-column absolute differences, cstep bytes per column
354 sad = (ushort*)alignPtr(buf + sizeof(sad[0]), ALIGN);
355 hsad0 = (ushort*)alignPtr(sad + ndisp + 1 + dy0*ndisp, ALIGN);
356 htext = (int*)alignPtr((int*)(hsad0 + (height+dy1)*ndisp) + wsz2 + 2, ALIGN);
357 cbuf0 = (uchar*)alignPtr((uchar*)(htext + height + wsz2 + 2) + dy0*ndisp, ALIGN);
358
// tab[v] = |v - ftzero|
359 for( x = 0; x < TABSZ; x++ )
360 tab[x] = (uchar)std::abs(x - ftzero);
361
362 // initialize buffers
363 memset( hsad0 - dy0*ndisp, 0, (height + dy0 + dy1)*ndisp*sizeof(hsad0[0]) );
364 memset( htext - wsz2 - 1, 0, (height + wsz + 1)*sizeof(htext[0]) );
365
// accumulate the columns x in [-wsz2-1, wsz2) into hsad/htext and store their differences
// in cbuf; column indices are clamped to the image
366 for( x = -wsz2-1; x < wsz2; x++ )
367 {
368 hsad = hsad0 - dy0*ndisp; cbuf = cbuf0 + (x + wsz2 + 1)*cstep - dy0*ndisp;
369 lptr = lptr0 + MIN(MAX(x, -lofs), width-lofs-1) - dy0*sstep;
370 rptr = rptr0 + MIN(MAX(x, -rofs), width-rofs-1) - dy0*sstep;
371
372 for( y = -dy0; y < height + dy1; y++, hsad += ndisp, cbuf += ndisp, lptr += sstep, rptr += sstep )
373 {
374 int lval = lptr[0];
375 __m128i lv = _mm_set1_epi8((char)lval), z = _mm_setzero_si128();
376 for( d = 0; d < ndisp; d += 16 )
377 {
378 __m128i rv = _mm_loadu_si128((const __m128i*)(rptr + d));
379 __m128i hsad_l = _mm_load_si128((__m128i*)(hsad + d));
380 __m128i hsad_h = _mm_load_si128((__m128i*)(hsad + d + 8));
// per-byte |lv - rv| built from two saturating subtractions
381 __m128i diff = _mm_adds_epu8(_mm_subs_epu8(lv, rv), _mm_subs_epu8(rv, lv));
382 _mm_store_si128((__m128i*)(cbuf + d), diff);
383 hsad_l = _mm_add_epi16(hsad_l, _mm_unpacklo_epi8(diff,z));
384 hsad_h = _mm_add_epi16(hsad_h, _mm_unpackhi_epi8(diff,z));
385 _mm_store_si128((__m128i*)(hsad + d), hsad_l);
386 _mm_store_si128((__m128i*)(hsad + d + 8), hsad_h);
387 }
388 htext[y] += tab[lval];
389 }
390 }
391
392 // initialize the left and right borders of the disparity map
393 for( y = 0; y < height; y++ )
394 {
395 for( x = 0; x < lofs; x++ )
396 dptr[y*dstep + x] = FILTERED;
397 for( x = lofs + width1; x < width; x++ )
398 dptr[y*dstep + x] = FILTERED;
399 }
400 dptr += lofs;
401
// main loop: one output column per iteration
402 for( x = 0; x < width1; x++, dptr++ )
403 {
404 short* costptr = cost.data ? cost.ptr<short>() + lofs + x : &costbuf;
405 int x0 = x - wsz2 - 1, x1 = x + wsz2;
// cbuf is used as a ring of wsz+1 columns: cbuf_sub is the column leaving the window (x0),
// cbuf the slot that receives the column entering it (x1)
406 const uchar* cbuf_sub = cbuf0 + ((x0 + wsz2 + 1) % (wsz + 1))*cstep - dy0*ndisp;
407 cbuf = cbuf0 + ((x1 + wsz2 + 1) % (wsz + 1))*cstep - dy0*ndisp;
408 hsad = hsad0 - dy0*ndisp;
409 lptr_sub = lptr0 + MIN(MAX(x0, -lofs), width-1-lofs) - dy0*sstep;
410 lptr = lptr0 + MIN(MAX(x1, -lofs), width-1-lofs) - dy0*sstep;
411 rptr = rptr0 + MIN(MAX(x1, -rofs), width-1-rofs) - dy0*sstep;
412
// update every row's horizontal sum: add the new column's differences, subtract the old ones
413 for( y = -dy0; y < height + dy1; y++, cbuf += ndisp, cbuf_sub += ndisp,
414 hsad += ndisp, lptr += sstep, lptr_sub += sstep, rptr += sstep )
415 {
416 int lval = lptr[0];
417 __m128i lv = _mm_set1_epi8((char)lval), z = _mm_setzero_si128();
418 for( d = 0; d < ndisp; d += 16 )
419 {
420 __m128i rv = _mm_loadu_si128((const __m128i*)(rptr + d));
421 __m128i hsad_l = _mm_load_si128((__m128i*)(hsad + d));
422 __m128i hsad_h = _mm_load_si128((__m128i*)(hsad + d + 8));
423 __m128i cbs = _mm_load_si128((const __m128i*)(cbuf_sub + d));
424 __m128i diff = _mm_adds_epu8(_mm_subs_epu8(lv, rv), _mm_subs_epu8(rv, lv));
425 __m128i diff_h = _mm_sub_epi16(_mm_unpackhi_epi8(diff, z), _mm_unpackhi_epi8(cbs, z));
426 _mm_store_si128((__m128i*)(cbuf + d), diff);
427 diff = _mm_sub_epi16(_mm_unpacklo_epi8(diff, z), _mm_unpacklo_epi8(cbs, z));
428 hsad_h = _mm_add_epi16(hsad_h, diff_h);
429 hsad_l = _mm_add_epi16(hsad_l, diff);
430 _mm_store_si128((__m128i*)(hsad + d), hsad_l);
431 _mm_store_si128((__m128i*)(hsad + d + 8), hsad_h);
432 }
433 htext[y] += tab[lval] - tab[lptr_sub[0]];
434 }
435
436 // fill borders
437 for( y = dy1; y <= wsz2; y++ )
438 htext[height+y] = htext[height+dy1-1];
439 for( y = -wsz2-1; y < -dy0; y++ )
440 htext[y] = htext[-dy0];
441
442 // initialize sums
// the first available row (-dy0) is weighted wsz2 + 2 - dy0 times, then rows 1-dy0 .. wsz2-1
// are added once each
443 for( d = 0; d < ndisp; d++ )
444 sad[d] = (ushort)(hsad0[d-ndisp*dy0]*(wsz2 + 2 - dy0));
445
446 hsad = hsad0 + (1 - dy0)*ndisp;
447 for( y = 1 - dy0; y < wsz2; y++, hsad += ndisp )
448 for( d = 0; d < ndisp; d += 16 )
449 {
450 __m128i s0 = _mm_load_si128((__m128i*)(sad + d));
451 __m128i s1 = _mm_load_si128((__m128i*)(sad + d + 8));
452 __m128i t0 = _mm_load_si128((__m128i*)(hsad + d));
453 __m128i t1 = _mm_load_si128((__m128i*)(hsad + d + 8));
454 s0 = _mm_add_epi16(s0, t0);
455 s1 = _mm_add_epi16(s1, t1);
456 _mm_store_si128((__m128i*)(sad + d), s0);
457 _mm_store_si128((__m128i*)(sad + d + 8), s1);
458 }
459 int tsum = 0;
460 for( y = -wsz2-1; y < wsz2; y++ )
461 tsum += htext[y];
462
463 // finally, start the real processing
464 for( y = 0; y < height; y++ )
465 {
466 int minsad = INT_MAX, mind = -1;
// row entering / leaving the vertical window, clamped to the rows that were accumulated
467 hsad = hsad0 + MIN(y + wsz2, height+dy1-1)*ndisp;
468 hsad_sub = hsad0 + MAX(y - wsz2 - 1, -dy0)*ndisp;
469 __m128i minsad8 = _mm_set1_epi16(SHRT_MAX);
470 __m128i mind8 = _mm_set1_epi16(0), d8 = d0_8, mask;
471
// slide the window down by one row and track, per 16-bit lane, the minimum SAD and an
// associated disparity index
472 for( d = 0; d < ndisp; d += 16 )
473 {
474 __m128i u0 = _mm_load_si128((__m128i*)(hsad_sub + d));
475 __m128i u1 = _mm_load_si128((__m128i*)(hsad + d));
476
477 __m128i v0 = _mm_load_si128((__m128i*)(hsad_sub + d + 8));
478 __m128i v1 = _mm_load_si128((__m128i*)(hsad + d + 8));
479
480 __m128i usad8 = _mm_load_si128((__m128i*)(sad + d));
481 __m128i vsad8 = _mm_load_si128((__m128i*)(sad + d + 8));
482
483 u1 = _mm_sub_epi16(u1, u0);
484 v1 = _mm_sub_epi16(v1, v0);
485 usad8 = _mm_add_epi16(usad8, u1);
486 vsad8 = _mm_add_epi16(vsad8, v1);
487
488 mask = _mm_cmpgt_epi16(minsad8, usad8);
489 minsad8 = _mm_min_epi16(minsad8, usad8);
490 mind8 = _mm_max_epi16(mind8, _mm_and_si128(mask, d8));
491
492 _mm_store_si128((__m128i*)(sad + d), usad8);
493 _mm_store_si128((__m128i*)(sad + d + 8), vsad8);
494
495 mask = _mm_cmpgt_epi16(minsad8, vsad8);
496 minsad8 = _mm_min_epi16(minsad8, vsad8);
497
498 d8 = _mm_add_epi16(d8, dd_8);
499 mind8 = _mm_max_epi16(mind8, _mm_and_si128(mask, d8));
500 d8 = _mm_add_epi16(d8, dd_8);
501 }
502
// windows whose texture sum is below the threshold are marked as filtered
503 tsum += htext[y + wsz2] - htext[y - wsz2 - 1];
504 if( tsum < textureThreshold )
505 {
506 dptr[y*dstep] = FILTERED;
507 continue;
508 }
509
// reduce the 8 lanes to a single (minsad, mind) pair
510 ushort CV_DECL_ALIGNED(16) minsad_buf[8], mind_buf[8];
511 _mm_store_si128((__m128i*)minsad_buf, minsad8);
512 _mm_store_si128((__m128i*)mind_buf, mind8);
513 for( d = 0; d < 8; d++ )
514 if(minsad > (int)minsad_buf[d] || (minsad == (int)minsad_buf[d] && mind > mind_buf[d]))
515 {
516 minsad = minsad_buf[d];
517 mind = mind_buf[d];
518 }
519
// uniqueness check: the pixel is filtered when any disparity outside [mind-1, mind+1]
// has a SAD <= thresh
520 if( uniquenessRatio > 0 )
521 {
522 int thresh = minsad + (minsad * uniquenessRatio/100);
523 __m128i thresh8 = _mm_set1_epi16((short)(thresh + 1));
524 __m128i d1 = _mm_set1_epi16((short)(mind-1)), d2 = _mm_set1_epi16((short)(mind+1));
525 __m128i dd_16 = _mm_add_epi16(dd_8, dd_8);
526 d8 = _mm_sub_epi16(d0_8, dd_16);
527
528 for( d = 0; d < ndisp; d += 16 )
529 {
530 __m128i usad8 = _mm_load_si128((__m128i*)(sad + d));
531 __m128i vsad8 = _mm_load_si128((__m128i*)(sad + d + 8));
532 mask = _mm_cmpgt_epi16( thresh8, _mm_min_epi16(usad8,vsad8));
533 d8 = _mm_add_epi16(d8, dd_16);
534 if( !_mm_movemask_epi8(mask) )
535 continue;
536 mask = _mm_cmpgt_epi16( thresh8, usad8);
537 mask = _mm_and_si128(mask, _mm_or_si128(_mm_cmpgt_epi16(d1,d8), _mm_cmpgt_epi16(d8,d2)));
538 if( _mm_movemask_epi8(mask) )
539 break;
540 __m128i t8 = _mm_add_epi16(d8, dd_8);
541 mask = _mm_cmpgt_epi16( thresh8, vsad8);
542 mask = _mm_and_si128(mask, _mm_or_si128(_mm_cmpgt_epi16(d1,t8), _mm_cmpgt_epi16(t8,d2)));
543 if( _mm_movemask_epi8(mask) )
544 break;
545 }
546 if( d < ndisp )
547 {
548 dptr[y*dstep] = FILTERED;
549 continue;
550 }
551 }
552
// interior minimum: add a fractional correction computed from the two neighbouring SADs;
// otherwise store the integer disparity scaled by 16
553 if( 0 < mind && mind < ndisp - 1 )
554 {
555 int p = sad[mind+1], n = sad[mind-1];
556 d = p + n - 2*sad[mind] + std::abs(p - n);
557 dptr[y*dstep] = (short)(((ndisp - mind - 1 + mindisp)*256 + (d != 0 ? (p-n)*256/d : 0) + 15) >> 4);
558 }
559 else
560 dptr[y*dstep] = (short)((ndisp - mind - 1 + mindisp)*16);
561 costptr[y*coststep] = sad[mind];
562 }
563 }
564 }
565 #endif
566
// Generic SAD block-matching search (scalar code with NEON paths under CV_NEON).
// Reads 8-bit pixels from left/right, writes one `short` disparity per pixel into disp
// (fixed point, scaled by 16) and, when cost.data is non-null, the SAD of the selected
// disparity into cost as `int`.
//   state     - supplies SADWindowSize, numDisparities, minDisparity, preFilterCap,
//               textureThreshold and uniquenessRatio
//   buf       - scratch memory that is carved below into sad, hsad0, htext and cbuf0
//   _dy0/_dy1 - number of rows before row 0 / after the last row that are also read through
//               the left/right row pointers; both are clamped to wsz2+1
// SAD accumulators are 32-bit here. The NEON build asserts numDisparities % 8 == 0.
567 static void
findStereoCorrespondenceBM(const Mat & left,const Mat & right,Mat & disp,Mat & cost,const StereoBMParams & state,uchar * buf,int _dy0,int _dy1)568 findStereoCorrespondenceBM( const Mat& left, const Mat& right,
569 Mat& disp, Mat& cost, const StereoBMParams& state,
570 uchar* buf, int _dy0, int _dy1 )
571 {
572
573 const int ALIGN = 16;
574 int x, y, d;
575 int wsz = state.SADWindowSize, wsz2 = wsz/2;
576 int dy0 = MIN(_dy0, wsz2+1), dy1 = MIN(_dy1, wsz2+1);
577 int ndisp = state.numDisparities;
578 int mindisp = state.minDisparity;
579 int lofs = MAX(ndisp - 1 + mindisp, 0);
580 int rofs = -MIN(ndisp - 1 + mindisp, 0);
581 int width = left.cols, height = left.rows;
582 int width1 = width - rofs - ndisp + 1;
583 int ftzero = state.preFilterCap;
584 int textureThreshold = state.textureThreshold;
585 int uniquenessRatio = state.uniquenessRatio;
// value written for pixels without an accepted disparity
586 short FILTERED = (short)((mindisp - 1) << DISPARITY_SHIFT);
587
588 #if CV_NEON
589 CV_Assert (ndisp % 8 == 0);
590 int32_t d0_4_temp [4];
591 for (int i = 0; i < 4; i ++)
592 d0_4_temp[i] = i;
593 int32x4_t d0_4 = vld1q_s32 (d0_4_temp);
594 int32x4_t dd_4 = vdupq_n_s32 (4);
595 #endif
596
597 int *sad, *hsad0, *hsad, *hsad_sub, *htext;
598 uchar *cbuf0, *cbuf;
599 const uchar* lptr0 = left.ptr() + lofs;
600 const uchar* rptr0 = right.ptr() + rofs;
601 const uchar *lptr, *lptr_sub, *rptr;
602 short* dptr = disp.ptr<short>();
603 int sstep = (int)left.step;
604 int dstep = (int)(disp.step/sizeof(dptr[0]));
605 int cstep = (height+dy0+dy1)*ndisp;
606 int costbuf = 0;
607 int coststep = cost.data ? (int)(cost.step/sizeof(costbuf)) : 0;
608 const int TABSZ = 256;
609 uchar tab[TABSZ];
610
// carve the scratch buffer:
//   sad   - window SAD per disparity for the current pixel (sad[-1] and sad[ndisp] are also written)
//   hsad0 - per-row horizontal sums of absolute differences, one ndisp-sized slice per row
//   htext - per-row sums of tab[] values of the left image (texture measure)
//   cbuf0 - per-column absolute differences, cstep bytes per column
611 sad = (int*)alignPtr(buf + sizeof(sad[0]), ALIGN);
612 hsad0 = (int*)alignPtr(sad + ndisp + 1 + dy0*ndisp, ALIGN);
613 htext = (int*)alignPtr((int*)(hsad0 + (height+dy1)*ndisp) + wsz2 + 2, ALIGN);
614 cbuf0 = (uchar*)alignPtr((uchar*)(htext + height + wsz2 + 2) + dy0*ndisp, ALIGN);
615
// tab[v] = |v - ftzero|
616 for( x = 0; x < TABSZ; x++ )
617 tab[x] = (uchar)std::abs(x - ftzero);
618
619 // initialize buffers
620 memset( hsad0 - dy0*ndisp, 0, (height + dy0 + dy1)*ndisp*sizeof(hsad0[0]) );
621 memset( htext - wsz2 - 1, 0, (height + wsz + 1)*sizeof(htext[0]) );
622
// accumulate the columns x in [-wsz2-1, wsz2) into hsad/htext and store their differences
// in cbuf; column indices are clamped to the image
623 for( x = -wsz2-1; x < wsz2; x++ )
624 {
625 hsad = hsad0 - dy0*ndisp; cbuf = cbuf0 + (x + wsz2 + 1)*cstep - dy0*ndisp;
626 lptr = lptr0 + std::min(std::max(x, -lofs), width-lofs-1) - dy0*sstep;
627 rptr = rptr0 + std::min(std::max(x, -rofs), width-rofs-1) - dy0*sstep;
628 for( y = -dy0; y < height + dy1; y++, hsad += ndisp, cbuf += ndisp, lptr += sstep, rptr += sstep )
629 {
630 int lval = lptr[0];
631 #if CV_NEON
632 int16x8_t lv = vdupq_n_s16 ((int16_t)lval);
633
634 for( d = 0; d < ndisp; d += 8 )
635 {
636 int16x8_t rv = vreinterpretq_s16_u16 (vmovl_u8 (vld1_u8 (rptr + d)));
637 int32x4_t hsad_l = vld1q_s32 (hsad + d);
638 int32x4_t hsad_h = vld1q_s32 (hsad + d + 4);
639 int16x8_t diff = vabdq_s16 (lv, rv);
640 vst1_u8 (cbuf + d, vmovn_u16(vreinterpretq_u16_s16(diff)));
641 hsad_l = vaddq_s32 (hsad_l, vmovl_s16(vget_low_s16 (diff)));
642 hsad_h = vaddq_s32 (hsad_h, vmovl_s16(vget_high_s16 (diff)));
643 vst1q_s32 ((hsad + d), hsad_l);
644 vst1q_s32 ((hsad + d + 4), hsad_h);
645 }
646 #else
647 for( d = 0; d < ndisp; d++ )
648 {
649 int diff = std::abs(lval - rptr[d]);
650 cbuf[d] = (uchar)diff;
651 hsad[d] = (int)(hsad[d] + diff);
652 }
653 #endif
654 htext[y] += tab[lval];
655 }
656 }
657
658 // initialize the left and right borders of the disparity map
659 for( y = 0; y < height; y++ )
660 {
661 for( x = 0; x < lofs; x++ )
662 dptr[y*dstep + x] = FILTERED;
663 for( x = lofs + width1; x < width; x++ )
664 dptr[y*dstep + x] = FILTERED;
665 }
666 dptr += lofs;
667
// main loop: one output column per iteration
668 for( x = 0; x < width1; x++, dptr++ )
669 {
670 int* costptr = cost.data ? cost.ptr<int>() + lofs + x : &costbuf;
671 int x0 = x - wsz2 - 1, x1 = x + wsz2;
// cbuf is used as a ring of wsz+1 columns: cbuf_sub is the column leaving the window (x0),
// cbuf the slot that receives the column entering it (x1)
672 const uchar* cbuf_sub = cbuf0 + ((x0 + wsz2 + 1) % (wsz + 1))*cstep - dy0*ndisp;
673 cbuf = cbuf0 + ((x1 + wsz2 + 1) % (wsz + 1))*cstep - dy0*ndisp;
674 hsad = hsad0 - dy0*ndisp;
675 lptr_sub = lptr0 + MIN(MAX(x0, -lofs), width-1-lofs) - dy0*sstep;
676 lptr = lptr0 + MIN(MAX(x1, -lofs), width-1-lofs) - dy0*sstep;
677 rptr = rptr0 + MIN(MAX(x1, -rofs), width-1-rofs) - dy0*sstep;
678
// update every row's horizontal sum: add the new column's differences, subtract the old ones
679 for( y = -dy0; y < height + dy1; y++, cbuf += ndisp, cbuf_sub += ndisp,
680 hsad += ndisp, lptr += sstep, lptr_sub += sstep, rptr += sstep )
681 {
682 int lval = lptr[0];
683 #if CV_NEON
684 int16x8_t lv = vdupq_n_s16 ((int16_t)lval);
685 for( d = 0; d < ndisp; d += 8 )
686 {
687 int16x8_t rv = vreinterpretq_s16_u16 (vmovl_u8 (vld1_u8 (rptr + d)));
688 int32x4_t hsad_l = vld1q_s32 (hsad + d);
689 int32x4_t hsad_h = vld1q_s32 (hsad + d + 4);
690 int16x8_t cbs = vreinterpretq_s16_u16 (vmovl_u8 (vld1_u8 (cbuf_sub + d)));
691 int16x8_t diff = vabdq_s16 (lv, rv);
692 int32x4_t diff_h = vsubl_s16 (vget_high_s16 (diff), vget_high_s16 (cbs));
693 int32x4_t diff_l = vsubl_s16 (vget_low_s16 (diff), vget_low_s16 (cbs));
694 vst1_u8 (cbuf + d, vmovn_u16(vreinterpretq_u16_s16(diff)));
695 hsad_h = vaddq_s32 (hsad_h, diff_h);
696 hsad_l = vaddq_s32 (hsad_l, diff_l);
697 vst1q_s32 ((hsad + d), hsad_l);
698 vst1q_s32 ((hsad + d + 4), hsad_h);
699 }
700 #else
701 for( d = 0; d < ndisp; d++ )
702 {
703 int diff = std::abs(lval - rptr[d]);
704 cbuf[d] = (uchar)diff;
705 hsad[d] = hsad[d] + diff - cbuf_sub[d];
706 }
707 #endif
708 htext[y] += tab[lval] - tab[lptr_sub[0]];
709 }
710
711 // fill borders
712 for( y = dy1; y <= wsz2; y++ )
713 htext[height+y] = htext[height+dy1-1];
714 for( y = -wsz2-1; y < -dy0; y++ )
715 htext[y] = htext[-dy0];
716
717 // initialize sums
// the first available row (-dy0) is weighted wsz2 + 2 - dy0 times, then rows 1-dy0 .. wsz2-1
// are added once each
718 for( d = 0; d < ndisp; d++ )
719 sad[d] = (int)(hsad0[d-ndisp*dy0]*(wsz2 + 2 - dy0));
720
721 hsad = hsad0 + (1 - dy0)*ndisp;
722 for( y = 1 - dy0; y < wsz2; y++, hsad += ndisp )
723 {
724 #if CV_NEON
725 for( d = 0; d <= ndisp-8; d += 8 )
726 {
727 int32x4_t s0 = vld1q_s32 (sad + d);
728 int32x4_t s1 = vld1q_s32 (sad + d + 4);
729 int32x4_t t0 = vld1q_s32 (hsad + d);
730 int32x4_t t1 = vld1q_s32 (hsad + d + 4);
731 s0 = vaddq_s32 (s0, t0);
732 s1 = vaddq_s32 (s1, t1);
733 vst1q_s32 (sad + d, s0);
734 vst1q_s32 (sad + d + 4, s1);
735 }
736 #else
737 for( d = 0; d < ndisp; d++ )
738 sad[d] = (int)(sad[d] + hsad[d]);
739 #endif
740 }
741 int tsum = 0;
742 for( y = -wsz2-1; y < wsz2; y++ )
743 tsum += htext[y];
744
745 // finally, start the real processing
746 for( y = 0; y < height; y++ )
747 {
748 int minsad = INT_MAX, mind = -1;
// row entering / leaving the vertical window, clamped to the rows that were accumulated
749 hsad = hsad0 + MIN(y + wsz2, height+dy1-1)*ndisp;
750 hsad_sub = hsad0 + MAX(y - wsz2 - 1, -dy0)*ndisp;
751 #if CV_NEON
752 int32x4_t minsad4 = vdupq_n_s32 (INT_MAX);
753 int32x4_t mind4 = vdupq_n_s32(0), d4 = d0_4;
754
755 for( d = 0; d <= ndisp-8; d += 8 )
756 {
757 int32x4_t u0 = vld1q_s32 (hsad_sub + d);
758 int32x4_t u1 = vld1q_s32 (hsad + d);
759
760 int32x4_t v0 = vld1q_s32 (hsad_sub + d + 4);
761 int32x4_t v1 = vld1q_s32 (hsad + d + 4);
762
763 int32x4_t usad4 = vld1q_s32(sad + d);
764 int32x4_t vsad4 = vld1q_s32(sad + d + 4);
765
766 u1 = vsubq_s32 (u1, u0);
767 v1 = vsubq_s32 (v1, v0);
768 usad4 = vaddq_s32 (usad4, u1);
769 vsad4 = vaddq_s32 (vsad4, v1);
770
771 uint32x4_t mask = vcgtq_s32 (minsad4, usad4);
772 minsad4 = vminq_s32 (minsad4, usad4);
773 mind4 = vbslq_s32(mask, d4, mind4);
774
775 vst1q_s32 (sad + d, usad4);
776 vst1q_s32 (sad + d + 4, vsad4);
777 d4 = vaddq_s32 (d4, dd_4);
778
779 mask = vcgtq_s32 (minsad4, vsad4);
780 minsad4 = vminq_s32 (minsad4, vsad4);
781 mind4 = vbslq_s32(mask, d4, mind4);
782
783 d4 = vaddq_s32 (d4, dd_4);
784
785 }
// reduce the 4 lanes to one: prefer the smaller SAD, and on equal SAD the smaller index
786 int32x2_t mind4_h = vget_high_s32 (mind4);
787 int32x2_t mind4_l = vget_low_s32 (mind4);
788 int32x2_t minsad4_h = vget_high_s32 (minsad4);
789 int32x2_t minsad4_l = vget_low_s32 (minsad4);
790
791 uint32x2_t mask = vorr_u32 (vclt_s32 (minsad4_h, minsad4_l), vand_u32 (vceq_s32 (minsad4_h, minsad4_l), vclt_s32 (mind4_h, mind4_l)));
792 mind4_h = vbsl_s32 (mask, mind4_h, mind4_l);
793 minsad4_h = vbsl_s32 (mask, minsad4_h, minsad4_l);
794
795 mind4_l = vext_s32 (mind4_h,mind4_h,1);
796 minsad4_l = vext_s32 (minsad4_h,minsad4_h,1);
797
798 mask = vorr_u32 (vclt_s32 (minsad4_h, minsad4_l), vand_u32 (vceq_s32 (minsad4_h, minsad4_l), vclt_s32 (mind4_h, mind4_l)));
799 mind4_h = vbsl_s32 (mask, mind4_h, mind4_l);
800 minsad4_h = vbsl_s32 (mask, minsad4_h, minsad4_l);
801
802 mind = (int) vget_lane_s32 (mind4_h, 0);
803 minsad = sad[mind];
804
805 #else
// slide the window down by one row and pick the first disparity with the smallest SAD
806 for( d = 0; d < ndisp; d++ )
807 {
808 int currsad = sad[d] + hsad[d] - hsad_sub[d];
809 sad[d] = currsad;
810 if( currsad < minsad )
811 {
812 minsad = currsad;
813 mind = d;
814 }
815 }
816 #endif
817
// windows whose texture sum is below the threshold are marked as filtered
818 tsum += htext[y + wsz2] - htext[y - wsz2 - 1];
819 if( tsum < textureThreshold )
820 {
821 dptr[y*dstep] = FILTERED;
822 continue;
823 }
824
// uniqueness check: the pixel is filtered when any disparity outside [mind-1, mind+1]
// has a SAD <= thresh
825 if( uniquenessRatio > 0 )
826 {
827 int thresh = minsad + (minsad * uniquenessRatio/100);
828 for( d = 0; d < ndisp; d++ )
829 {
830 if( (d < mind-1 || d > mind+1) && sad[d] <= thresh)
831 break;
832 }
833 if( d < ndisp )
834 {
835 dptr[y*dstep] = FILTERED;
836 continue;
837 }
838 }
839
840 {
// mirror the SAD values across both ends so that mind == 0 and mind == ndisp-1 can use
// the same neighbour-based fractional correction as interior minima
841 sad[-1] = sad[1];
842 sad[ndisp] = sad[ndisp-2];
843 int p = sad[mind+1], n = sad[mind-1];
844 d = p + n - 2*sad[mind] + std::abs(p - n);
845 dptr[y*dstep] = (short)(((ndisp - mind - 1 + mindisp)*256 + (d != 0 ? (p-n)*256/d : 0) + 15) >> 4);
846 costptr[y*coststep] = sad[mind];
847 }
848 }
849 }
850 }
851
ocl_prefiltering(InputArray left0,InputArray right0,OutputArray left,OutputArray right,StereoBMParams * state)852 static bool ocl_prefiltering(InputArray left0, InputArray right0, OutputArray left, OutputArray right, StereoBMParams* state)
853 {
854 if( state->preFilterType == StereoBM::PREFILTER_NORMALIZED_RESPONSE )
855 {
856 if(!ocl_prefilter_norm( left0, left, state->preFilterSize, state->preFilterCap))
857 return false;
858 if(!ocl_prefilter_norm( right0, right, state->preFilterSize, state->preFilterCap))
859 return false;
860 }
861 else
862 {
863 if(!ocl_prefilter_xsobel( left0, left, state->preFilterCap ))
864 return false;
865 if(!ocl_prefilter_xsobel( right0, right, state->preFilterCap))
866 return false;
867 }
868 return true;
869 }
870
871 struct PrefilterInvoker : public ParallelLoopBody
872 {
PrefilterInvokercv::PrefilterInvoker873 PrefilterInvoker(const Mat& left0, const Mat& right0, Mat& left, Mat& right,
874 uchar* buf0, uchar* buf1, StereoBMParams* _state)
875 {
876 imgs0[0] = &left0; imgs0[1] = &right0;
877 imgs[0] = &left; imgs[1] = &right;
878 buf[0] = buf0; buf[1] = buf1;
879 state = _state;
880 }
881
operator ()cv::PrefilterInvoker882 void operator()( const Range& range ) const
883 {
884 for( int i = range.start; i < range.end; i++ )
885 {
886 if( state->preFilterType == StereoBM::PREFILTER_NORMALIZED_RESPONSE )
887 prefilterNorm( *imgs0[i], *imgs[i], state->preFilterSize, state->preFilterCap, buf[i] );
888 else
889 prefilterXSobel( *imgs0[i], *imgs[i], state->preFilterCap );
890 }
891 }
892
893 const Mat* imgs0[2];
894 Mat* imgs[2];
895 uchar* buf[2];
896 StereoBMParams* state;
897 };
898
ocl_stereobm(InputArray _left,InputArray _right,OutputArray _disp,StereoBMParams * state)899 static bool ocl_stereobm( InputArray _left, InputArray _right,
900 OutputArray _disp, StereoBMParams* state)
901 {
902 int ndisp = state->numDisparities;
903 int mindisp = state->minDisparity;
904 int wsz = state->SADWindowSize;
905 int wsz2 = wsz/2;
906
907 ocl::Device devDef = ocl::Device::getDefault();
908 int sizeX = devDef.isIntel() ? 32 : std::max(11, 27 - devDef.maxComputeUnits()),
909 sizeY = sizeX - 1,
910 N = ndisp * 2;
911
912 cv::String opt = cv::format("-D DEFINE_KERNEL_STEREOBM -D MIN_DISP=%d -D NUM_DISP=%d"
913 " -D BLOCK_SIZE_X=%d -D BLOCK_SIZE_Y=%d -D WSZ=%d",
914 mindisp, ndisp,
915 sizeX, sizeY, wsz);
916 ocl::Kernel k("stereoBM", ocl::calib3d::stereobm_oclsrc, opt);
917 if(k.empty())
918 return false;
919
920 UMat left = _left.getUMat(), right = _right.getUMat();
921 int cols = left.cols, rows = left.rows;
922
923 _disp.create(_left.size(), CV_16S);
924 _disp.setTo((mindisp - 1) << 4);
925 Rect roi = Rect(Point(wsz2 + mindisp + ndisp - 1, wsz2), Point(cols-wsz2-mindisp, rows-wsz2) );
926 UMat disp = (_disp.getUMat())(roi);
927
928 int globalX = (disp.cols + sizeX - 1) / sizeX,
929 globalY = (disp.rows + sizeY - 1) / sizeY;
930 size_t globalThreads[3] = {N, globalX, globalY};
931 size_t localThreads[3] = {N, 1, 1};
932
933 int idx = 0;
934 idx = k.set(idx, ocl::KernelArg::PtrReadOnly(left));
935 idx = k.set(idx, ocl::KernelArg::PtrReadOnly(right));
936 idx = k.set(idx, ocl::KernelArg::WriteOnlyNoSize(disp));
937 idx = k.set(idx, rows);
938 idx = k.set(idx, cols);
939 idx = k.set(idx, state->textureThreshold);
940 idx = k.set(idx, state->uniquenessRatio);
941 return k.run(3, globalThreads, localThreads, false);
942 }
943
944 struct FindStereoCorrespInvoker : public ParallelLoopBody
945 {
FindStereoCorrespInvokercv::FindStereoCorrespInvoker946 FindStereoCorrespInvoker( const Mat& _left, const Mat& _right,
947 Mat& _disp, StereoBMParams* _state,
948 int _nstripes, size_t _stripeBufSize,
949 bool _useShorts, Rect _validDisparityRect,
950 Mat& _slidingSumBuf, Mat& _cost )
951 {
952 left = &_left; right = &_right;
953 disp = &_disp; state = _state;
954 nstripes = _nstripes; stripeBufSize = _stripeBufSize;
955 useShorts = _useShorts;
956 validDisparityRect = _validDisparityRect;
957 slidingSumBuf = &_slidingSumBuf;
958 cost = &_cost;
959 }
960
operator ()cv::FindStereoCorrespInvoker961 void operator()( const Range& range ) const
962 {
963 int cols = left->cols, rows = left->rows;
964 int _row0 = std::min(cvRound(range.start * rows / nstripes), rows);
965 int _row1 = std::min(cvRound(range.end * rows / nstripes), rows);
966 uchar *ptr = slidingSumBuf->ptr() + range.start * stripeBufSize;
967 int FILTERED = (state->minDisparity - 1)*16;
968
969 Rect roi = validDisparityRect & Rect(0, _row0, cols, _row1 - _row0);
970 if( roi.height == 0 )
971 return;
972 int row0 = roi.y;
973 int row1 = roi.y + roi.height;
974
975 Mat part;
976 if( row0 > _row0 )
977 {
978 part = disp->rowRange(_row0, row0);
979 part = Scalar::all(FILTERED);
980 }
981 if( _row1 > row1 )
982 {
983 part = disp->rowRange(row1, _row1);
984 part = Scalar::all(FILTERED);
985 }
986
987 Mat left_i = left->rowRange(row0, row1);
988 Mat right_i = right->rowRange(row0, row1);
989 Mat disp_i = disp->rowRange(row0, row1);
990 Mat cost_i = state->disp12MaxDiff >= 0 ? cost->rowRange(row0, row1) : Mat();
991
992 #if CV_SSE2
993 if( useShorts )
994 findStereoCorrespondenceBM_SSE2( left_i, right_i, disp_i, cost_i, *state, ptr, row0, rows - row1 );
995 else
996 #endif
997 findStereoCorrespondenceBM( left_i, right_i, disp_i, cost_i, *state, ptr, row0, rows - row1 );
998
999 if( state->disp12MaxDiff >= 0 )
1000 validateDisparity( disp_i, cost_i, state->minDisparity, state->numDisparities, state->disp12MaxDiff );
1001
1002 if( roi.x > 0 )
1003 {
1004 part = disp_i.colRange(0, roi.x);
1005 part = Scalar::all(FILTERED);
1006 }
1007 if( roi.x + roi.width < cols )
1008 {
1009 part = disp_i.colRange(roi.x + roi.width, cols);
1010 part = Scalar::all(FILTERED);
1011 }
1012 }
1013
1014 protected:
1015 const Mat *left, *right;
1016 Mat* disp, *slidingSumBuf, *cost;
1017 StereoBMParams *state;
1018
1019 int nstripes;
1020 size_t stripeBufSize;
1021 bool useShorts;
1022 Rect validDisparityRect;
1023 };
1024
1025 class StereoBMImpl : public StereoBM
1026 {
1027 public:
StereoBMImpl()1028 StereoBMImpl()
1029 {
1030 params = StereoBMParams();
1031 }
1032
StereoBMImpl(int _numDisparities,int _SADWindowSize)1033 StereoBMImpl( int _numDisparities, int _SADWindowSize )
1034 {
1035 params = StereoBMParams(_numDisparities, _SADWindowSize);
1036 }
1037
compute(InputArray leftarr,InputArray rightarr,OutputArray disparr)1038 void compute( InputArray leftarr, InputArray rightarr, OutputArray disparr )
1039 {
1040 int dtype = disparr.fixedType() ? disparr.type() : params.dispType;
1041 Size leftsize = leftarr.size();
1042
1043 if (leftarr.size() != rightarr.size())
1044 CV_Error( Error::StsUnmatchedSizes, "All the images must have the same size" );
1045
1046 if (leftarr.type() != CV_8UC1 || rightarr.type() != CV_8UC1)
1047 CV_Error( Error::StsUnsupportedFormat, "Both input images must have CV_8UC1" );
1048
1049 if (dtype != CV_16SC1 && dtype != CV_32FC1)
1050 CV_Error( Error::StsUnsupportedFormat, "Disparity image must have CV_16SC1 or CV_32FC1 format" );
1051
1052 if( params.preFilterType != PREFILTER_NORMALIZED_RESPONSE &&
1053 params.preFilterType != PREFILTER_XSOBEL )
1054 CV_Error( Error::StsOutOfRange, "preFilterType must be = CV_STEREO_BM_NORMALIZED_RESPONSE" );
1055
1056 if( params.preFilterSize < 5 || params.preFilterSize > 255 || params.preFilterSize % 2 == 0 )
1057 CV_Error( Error::StsOutOfRange, "preFilterSize must be odd and be within 5..255" );
1058
1059 if( params.preFilterCap < 1 || params.preFilterCap > 63 )
1060 CV_Error( Error::StsOutOfRange, "preFilterCap must be within 1..63" );
1061
1062 if( params.SADWindowSize < 5 || params.SADWindowSize > 255 || params.SADWindowSize % 2 == 0 ||
1063 params.SADWindowSize >= std::min(leftsize.width, leftsize.height) )
1064 CV_Error( Error::StsOutOfRange, "SADWindowSize must be odd, be within 5..255 and be not larger than image width or height" );
1065
1066 if( params.numDisparities <= 0 || params.numDisparities % 16 != 0 )
1067 CV_Error( Error::StsOutOfRange, "numDisparities must be positive and divisble by 16" );
1068
1069 if( params.textureThreshold < 0 )
1070 CV_Error( Error::StsOutOfRange, "texture threshold must be non-negative" );
1071
1072 if( params.uniquenessRatio < 0 )
1073 CV_Error( Error::StsOutOfRange, "uniqueness ratio must be non-negative" );
1074
1075 int FILTERED = (params.minDisparity - 1) << DISPARITY_SHIFT;
1076
1077 if(ocl::useOpenCL() && disparr.isUMat() && params.textureThreshold == 0)
1078 {
1079 UMat left, right;
1080 if(ocl_prefiltering(leftarr, rightarr, left, right, ¶ms))
1081 {
1082 if(ocl_stereobm(left, right, disparr, ¶ms))
1083 {
1084 if( params.speckleRange >= 0 && params.speckleWindowSize > 0 )
1085 filterSpeckles(disparr.getMat(), FILTERED, params.speckleWindowSize, params.speckleRange, slidingSumBuf);
1086 if (dtype == CV_32F)
1087 disparr.getUMat().convertTo(disparr, CV_32FC1, 1./(1 << DISPARITY_SHIFT), 0);
1088 CV_IMPL_ADD(CV_IMPL_OCL);
1089 return;
1090 }
1091 }
1092 }
1093
1094 Mat left0 = leftarr.getMat(), right0 = rightarr.getMat();
1095 disparr.create(left0.size(), dtype);
1096 Mat disp0 = disparr.getMat();
1097
1098 preFilteredImg0.create( left0.size(), CV_8U );
1099 preFilteredImg1.create( left0.size(), CV_8U );
1100 cost.create( left0.size(), CV_16S );
1101
1102 Mat left = preFilteredImg0, right = preFilteredImg1;
1103
1104 int mindisp = params.minDisparity;
1105 int ndisp = params.numDisparities;
1106
1107 int width = left0.cols;
1108 int height = left0.rows;
1109 int lofs = std::max(ndisp - 1 + mindisp, 0);
1110 int rofs = -std::min(ndisp - 1 + mindisp, 0);
1111 int width1 = width - rofs - ndisp + 1;
1112
1113 if( lofs >= width || rofs >= width || width1 < 1 )
1114 {
1115 disp0 = Scalar::all( FILTERED * ( disp0.type() < CV_32F ? 1 : 1./(1 << DISPARITY_SHIFT) ) );
1116 return;
1117 }
1118
1119 Mat disp = disp0;
1120 if( dtype == CV_32F )
1121 {
1122 dispbuf.create(disp0.size(), CV_16S);
1123 disp = dispbuf;
1124 }
1125
1126 int wsz = params.SADWindowSize;
1127 int bufSize0 = (int)((ndisp + 2)*sizeof(int));
1128 bufSize0 += (int)((height+wsz+2)*ndisp*sizeof(int));
1129 bufSize0 += (int)((height + wsz + 2)*sizeof(int));
1130 bufSize0 += (int)((height+wsz+2)*ndisp*(wsz+2)*sizeof(uchar) + 256);
1131
1132 int bufSize1 = (int)((width + params.preFilterSize + 2) * sizeof(int) + 256);
1133 int bufSize2 = 0;
1134 if( params.speckleRange >= 0 && params.speckleWindowSize > 0 )
1135 bufSize2 = width*height*(sizeof(Point_<short>) + sizeof(int) + sizeof(uchar));
1136
1137 #if CV_SSE2
1138 bool useShorts = params.preFilterCap <= 31 && params.SADWindowSize <= 21 && checkHardwareSupport(CV_CPU_SSE2);
1139 #else
1140 const bool useShorts = false;
1141 #endif
1142
1143 const double SAD_overhead_coeff = 10.0;
1144 double N0 = 8000000 / (useShorts ? 1 : 4); // approx tbb's min number instructions reasonable for one thread
1145 double maxStripeSize = std::min(std::max(N0 / (width * ndisp), (wsz-1) * SAD_overhead_coeff), (double)height);
1146 int nstripes = cvCeil(height / maxStripeSize);
1147 int bufSize = std::max(bufSize0 * nstripes, std::max(bufSize1 * 2, bufSize2));
1148
1149 if( slidingSumBuf.cols < bufSize )
1150 slidingSumBuf.create( 1, bufSize, CV_8U );
1151
1152 uchar *_buf = slidingSumBuf.ptr();
1153
1154 parallel_for_(Range(0, 2), PrefilterInvoker(left0, right0, left, right, _buf, _buf + bufSize1, ¶ms), 1);
1155
1156 Rect validDisparityRect(0, 0, width, height), R1 = params.roi1, R2 = params.roi2;
1157 validDisparityRect = getValidDisparityROI(R1.area() > 0 ? Rect(0, 0, width, height) : validDisparityRect,
1158 R2.area() > 0 ? Rect(0, 0, width, height) : validDisparityRect,
1159 params.minDisparity, params.numDisparities,
1160 params.SADWindowSize);
1161
1162 parallel_for_(Range(0, nstripes),
1163 FindStereoCorrespInvoker(left, right, disp, ¶ms, nstripes,
1164 bufSize0, useShorts, validDisparityRect,
1165 slidingSumBuf, cost));
1166
1167 if( params.speckleRange >= 0 && params.speckleWindowSize > 0 )
1168 filterSpeckles(disp, FILTERED, params.speckleWindowSize, params.speckleRange, slidingSumBuf);
1169
1170 if (disp0.data != disp.data)
1171 disp.convertTo(disp0, disp0.type(), 1./(1 << DISPARITY_SHIFT), 0);
1172 }
1173
getMinDisparity() const1174 int getMinDisparity() const { return params.minDisparity; }
setMinDisparity(int minDisparity)1175 void setMinDisparity(int minDisparity) { params.minDisparity = minDisparity; }
1176
getNumDisparities() const1177 int getNumDisparities() const { return params.numDisparities; }
setNumDisparities(int numDisparities)1178 void setNumDisparities(int numDisparities) { params.numDisparities = numDisparities; }
1179
getBlockSize() const1180 int getBlockSize() const { return params.SADWindowSize; }
setBlockSize(int blockSize)1181 void setBlockSize(int blockSize) { params.SADWindowSize = blockSize; }
1182
getSpeckleWindowSize() const1183 int getSpeckleWindowSize() const { return params.speckleWindowSize; }
setSpeckleWindowSize(int speckleWindowSize)1184 void setSpeckleWindowSize(int speckleWindowSize) { params.speckleWindowSize = speckleWindowSize; }
1185
getSpeckleRange() const1186 int getSpeckleRange() const { return params.speckleRange; }
setSpeckleRange(int speckleRange)1187 void setSpeckleRange(int speckleRange) { params.speckleRange = speckleRange; }
1188
getDisp12MaxDiff() const1189 int getDisp12MaxDiff() const { return params.disp12MaxDiff; }
setDisp12MaxDiff(int disp12MaxDiff)1190 void setDisp12MaxDiff(int disp12MaxDiff) { params.disp12MaxDiff = disp12MaxDiff; }
1191
getPreFilterType() const1192 int getPreFilterType() const { return params.preFilterType; }
setPreFilterType(int preFilterType)1193 void setPreFilterType(int preFilterType) { params.preFilterType = preFilterType; }
1194
getPreFilterSize() const1195 int getPreFilterSize() const { return params.preFilterSize; }
setPreFilterSize(int preFilterSize)1196 void setPreFilterSize(int preFilterSize) { params.preFilterSize = preFilterSize; }
1197
getPreFilterCap() const1198 int getPreFilterCap() const { return params.preFilterCap; }
setPreFilterCap(int preFilterCap)1199 void setPreFilterCap(int preFilterCap) { params.preFilterCap = preFilterCap; }
1200
getTextureThreshold() const1201 int getTextureThreshold() const { return params.textureThreshold; }
setTextureThreshold(int textureThreshold)1202 void setTextureThreshold(int textureThreshold) { params.textureThreshold = textureThreshold; }
1203
getUniquenessRatio() const1204 int getUniquenessRatio() const { return params.uniquenessRatio; }
setUniquenessRatio(int uniquenessRatio)1205 void setUniquenessRatio(int uniquenessRatio) { params.uniquenessRatio = uniquenessRatio; }
1206
getSmallerBlockSize() const1207 int getSmallerBlockSize() const { return 0; }
setSmallerBlockSize(int)1208 void setSmallerBlockSize(int) {}
1209
getROI1() const1210 Rect getROI1() const { return params.roi1; }
setROI1(Rect roi1)1211 void setROI1(Rect roi1) { params.roi1 = roi1; }
1212
getROI2() const1213 Rect getROI2() const { return params.roi2; }
setROI2(Rect roi2)1214 void setROI2(Rect roi2) { params.roi2 = roi2; }
1215
write(FileStorage & fs) const1216 void write(FileStorage& fs) const
1217 {
1218 fs << "name" << name_
1219 << "minDisparity" << params.minDisparity
1220 << "numDisparities" << params.numDisparities
1221 << "blockSize" << params.SADWindowSize
1222 << "speckleWindowSize" << params.speckleWindowSize
1223 << "speckleRange" << params.speckleRange
1224 << "disp12MaxDiff" << params.disp12MaxDiff
1225 << "preFilterType" << params.preFilterType
1226 << "preFilterSize" << params.preFilterSize
1227 << "preFilterCap" << params.preFilterCap
1228 << "textureThreshold" << params.textureThreshold
1229 << "uniquenessRatio" << params.uniquenessRatio;
1230 }
1231
read(const FileNode & fn)1232 void read(const FileNode& fn)
1233 {
1234 FileNode n = fn["name"];
1235 CV_Assert( n.isString() && String(n) == name_ );
1236 params.minDisparity = (int)fn["minDisparity"];
1237 params.numDisparities = (int)fn["numDisparities"];
1238 params.SADWindowSize = (int)fn["blockSize"];
1239 params.speckleWindowSize = (int)fn["speckleWindowSize"];
1240 params.speckleRange = (int)fn["speckleRange"];
1241 params.disp12MaxDiff = (int)fn["disp12MaxDiff"];
1242 params.preFilterType = (int)fn["preFilterType"];
1243 params.preFilterSize = (int)fn["preFilterSize"];
1244 params.preFilterCap = (int)fn["preFilterCap"];
1245 params.textureThreshold = (int)fn["textureThreshold"];
1246 params.uniquenessRatio = (int)fn["uniquenessRatio"];
1247 params.roi1 = params.roi2 = Rect();
1248 }
1249
1250 StereoBMParams params;
1251 Mat preFilteredImg0, preFilteredImg1, cost, dispbuf;
1252 Mat slidingSumBuf;
1253
1254 static const char* name_;
1255 };
1256
1257 const char* StereoBMImpl::name_ = "StereoMatcher.BM";
1258
create(int _numDisparities,int _SADWindowSize)1259 Ptr<StereoBM> StereoBM::create(int _numDisparities, int _SADWindowSize)
1260 {
1261 return makePtr<StereoBMImpl>(_numDisparities, _SADWindowSize);
1262 }
1263
1264 }
1265
1266 /* End of file. */
1267