/*M///////////////////////////////////////////////////////////////////////////////////////
//
//  IMPORTANT: READ BEFORE DOWNLOADING, COPYING, INSTALLING OR USING.
//
//  By downloading, copying, installing or using the software you agree to this license.
//  If you do not agree to this license, do not download, install,
//  copy or use the software.
//
//
//                          License Agreement
//                For Open Source Computer Vision Library
//
// Copyright (C) 2000, Intel Corporation, all rights reserved.
// Copyright (C) 2013, OpenCV Foundation, all rights reserved.
// Third party copyrights are property of their respective owners.
//
// Redistribution and use in source and binary forms, with or without modification,
// are permitted provided that the following conditions are met:
//
//   * Redistribution's of source code must retain the above copyright notice,
//     this list of conditions and the following disclaimer.
//
//   * Redistribution's in binary form must reproduce the above copyright notice,
//     this list of conditions and the following disclaimer in the documentation
//     and/or other materials provided with the distribution.
//
//   * The name of the copyright holders may not be used to endorse or promote products
//     derived from this software without specific prior written permission.
//
// This software is provided by the copyright holders and contributors "as is" and
// any express or implied warranties, including, but not limited to, the implied
// warranties of merchantability and fitness for a particular purpose are disclaimed.
// In no event shall the Intel Corporation or contributors be liable for any direct,
// indirect, incidental, special, exemplary, or consequential damages
// (including, but not limited to, procurement of substitute goods or services;
// loss of use, data, or profits; or business interruption) however caused
// and on any theory of liability, whether in contract, strict liability,
// or tort (including negligence or otherwise) arising in any way out of
// the use of this software, even if advised of the possibility of such damage.
//
//M*/
#include "precomp.hpp"
#include <float.h>
#include <stdio.h>
#include "lkpyramid.hpp"
#include "opencl_kernels_video.hpp"

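// CV_DESCALE(x, n) divides x by 2^n with rounding to nearest: it adds half
// of the divisor before shifting, so e.g. CV_DESCALE(100, 4) == (100 + 8) >> 4 == 6.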
#define CV_DESCALE(x,n) (((x) + (1 << ((n)-1))) >> (n))

namespace
{
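// Compute the x- and y- Scharr derivatives of an 8-bit image, interleaving
// Ix and Iy per pixel in the output. The filter is separable: a vertical pass
// applies the smoothing kernel (3 10 3)^T and the difference kernel (-1 0 1)^T
// into trow0/trow1, and a horizontal pass combines them into d/dx and d/dy.
// The 3-10-3 weights give better rotational symmetry than the Sobel 1-2-1
// kernel. (The "Sharr" spelling is kept for compatibility; the operator is Scharr's.)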
static void calcSharrDeriv(const cv::Mat& src, cv::Mat& dst)
{
    using namespace cv;
    using cv::detail::deriv_type;
    int rows = src.rows, cols = src.cols, cn = src.channels(), colsn = cols*cn, depth = src.depth();
    CV_Assert(depth == CV_8U);
    dst.create(rows, cols, CV_MAKETYPE(DataType<deriv_type>::depth, cn*2));

#ifdef HAVE_TEGRA_OPTIMIZATION
    if (tegra::useTegra() && tegra::calcSharrDeriv(src, dst))
        return;
#endif

    int x, y, delta = (int)alignSize((cols + 2)*cn, 16);
    AutoBuffer<deriv_type> _tempBuf(delta*2 + 64);
    deriv_type *trow0 = alignPtr(_tempBuf + cn, 16), *trow1 = alignPtr(trow0 + delta, 16);

#if CV_SSE2
    __m128i z = _mm_setzero_si128(), c3 = _mm_set1_epi16(3), c10 = _mm_set1_epi16(10);
#endif

#if CV_NEON
    const uint16x8_t q8 = vdupq_n_u16(3);
    const uint8x8_t d18 = vdup_n_u8(10);

    const int16x8_t q8i = vdupq_n_s16(3);
    const int16x8_t q9 = vdupq_n_s16(10);
#endif

    for( y = 0; y < rows; y++ )
    {
        const uchar* srow0 = src.ptr<uchar>(y > 0 ? y-1 : rows > 1 ? 1 : 0);
        const uchar* srow1 = src.ptr<uchar>(y);
        const uchar* srow2 = src.ptr<uchar>(y < rows-1 ? y+1 : rows > 1 ? rows-2 : 0);
        deriv_type* drow = dst.ptr<deriv_type>(y);

        // do vertical convolution
        x = 0;
#if CV_SSE2
        for( ; x <= colsn - 8; x += 8 )
        {
            __m128i s0 = _mm_unpacklo_epi8(_mm_loadl_epi64((const __m128i*)(srow0 + x)), z);
            __m128i s1 = _mm_unpacklo_epi8(_mm_loadl_epi64((const __m128i*)(srow1 + x)), z);
            __m128i s2 = _mm_unpacklo_epi8(_mm_loadl_epi64((const __m128i*)(srow2 + x)), z);
            __m128i t0 = _mm_add_epi16(_mm_mullo_epi16(_mm_add_epi16(s0, s2), c3), _mm_mullo_epi16(s1, c10));
            __m128i t1 = _mm_sub_epi16(s2, s0);
            _mm_store_si128((__m128i*)(trow0 + x), t0);
            _mm_store_si128((__m128i*)(trow1 + x), t1);
        }
#endif

#if CV_NEON
        for( ; x <= colsn - 8; x += 8)
        {
            uint8x8_t d0 = vld1_u8((const uint8_t*)&srow0[x]);
            uint8x8_t d1 = vld1_u8((const uint8_t*)&srow1[x]);
            uint8x8_t d2 = vld1_u8((const uint8_t*)&srow2[x]);
            uint16x8_t q4 = vaddl_u8(d0, d2);
            uint16x8_t q11 = vsubl_u8(d2, d0);
            uint16x8_t q5 = vmulq_u16(q4, q8);
            uint16x8_t q6 = vmull_u8(d1, d18);
            uint16x8_t q10 = vaddq_u16(q6, q5);
            vst1q_u16((uint16_t*)&trow0[x], q10);
            vst1q_u16((uint16_t*)&trow1[x], q11);
        }
#endif

        for( ; x < colsn; x++ )
        {
            int t0 = (srow0[x] + srow2[x])*3 + srow1[x]*10;
            int t1 = srow2[x] - srow0[x];
            trow0[x] = (deriv_type)t0;
            trow1[x] = (deriv_type)t1;
        }

        // make border
        int x0 = (cols > 1 ? 1 : 0)*cn, x1 = (cols > 1 ? cols-2 : 0)*cn;
        for( int k = 0; k < cn; k++ )
        {
            trow0[-cn + k] = trow0[x0 + k]; trow0[colsn + k] = trow0[x1 + k];
            trow1[-cn + k] = trow1[x0 + k]; trow1[colsn + k] = trow1[x1 + k];
        }

        // do horizontal convolution, interleave the results and store them to dst
        x = 0;
#if CV_SSE2
        for( ; x <= colsn - 8; x += 8 )
        {
            __m128i s0 = _mm_loadu_si128((const __m128i*)(trow0 + x - cn));
            __m128i s1 = _mm_loadu_si128((const __m128i*)(trow0 + x + cn));
            __m128i s2 = _mm_loadu_si128((const __m128i*)(trow1 + x - cn));
            __m128i s3 = _mm_load_si128((const __m128i*)(trow1 + x));
            __m128i s4 = _mm_loadu_si128((const __m128i*)(trow1 + x + cn));

            __m128i t0 = _mm_sub_epi16(s1, s0);
            __m128i t1 = _mm_add_epi16(_mm_mullo_epi16(_mm_add_epi16(s2, s4), c3), _mm_mullo_epi16(s3, c10));
            __m128i t2 = _mm_unpacklo_epi16(t0, t1);
            t0 = _mm_unpackhi_epi16(t0, t1);
            // this can probably be replaced with aligned stores if we aligned dst properly.
            _mm_storeu_si128((__m128i*)(drow + x*2), t2);
            _mm_storeu_si128((__m128i*)(drow + x*2 + 8), t0);
        }
#endif

#if CV_NEON
        for( ; x <= colsn - 8; x += 8 )
        {
            int16x8_t q0 = vld1q_s16((const int16_t*)&trow0[x+cn]);
            int16x8_t q1 = vld1q_s16((const int16_t*)&trow0[x-cn]);
            int16x8_t q2 = vld1q_s16((const int16_t*)&trow1[x+cn]);
            int16x8_t q3 = vld1q_s16((const int16_t*)&trow1[x-cn]);
            int16x8_t q5 = vsubq_s16(q0, q1);
            int16x8_t q6 = vaddq_s16(q2, q3);
            int16x8_t q4 = vld1q_s16((const int16_t*)&trow1[x]);
            int16x8_t q7 = vmulq_s16(q6, q8i);
            int16x8_t q10 = vmulq_s16(q4, q9);
            int16x8_t q11 = vaddq_s16(q7, q10);
            int16x4_t d22 = vget_low_s16(q11);
            int16x4_t d23 = vget_high_s16(q11);
            int16x4_t d11 = vget_high_s16(q5);
            int16x4_t d10 = vget_low_s16(q5);
            int16x4x2_t q5x2, q11x2;
            q5x2.val[0] = d10; q5x2.val[1] = d22;
            q11x2.val[0] = d11; q11x2.val[1] = d23;
            vst2_s16((int16_t*)&drow[x*2], q5x2);
            vst2_s16((int16_t*)&drow[(x*2)+8], q11x2);
        }
#endif
        for( ; x < colsn; x++ )
        {
            deriv_type t0 = (deriv_type)(trow0[x+cn] - trow0[x-cn]);
            deriv_type t1 = (deriv_type)((trow1[x+cn] + trow1[x-cn])*3 + trow1[x]*10);
            drow[x*2] = t0; drow[x*2+1] = t1;
        }
    }
}

} // namespace

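// LKTrackerInvoker is the parallel_for_ body that runs one iteration of the
// pyramidal Lucas-Kanade tracker for a range of feature points at a single
// pyramid level. The constructor only stores references to the inputs; the
// per-point work happens in operator().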
cv::detail::LKTrackerInvoker::LKTrackerInvoker(
    const Mat& _prevImg, const Mat& _prevDeriv, const Mat& _nextImg,
    const Point2f* _prevPts, Point2f* _nextPts,
    uchar* _status, float* _err,
    Size _winSize, TermCriteria _criteria,
    int _level, int _maxLevel, int _flags, float _minEigThreshold )
{
    prevImg = &_prevImg;
    prevDeriv = &_prevDeriv;
    nextImg = &_nextImg;
    prevPts = _prevPts;
    nextPts = _nextPts;
    status = _status;
    err = _err;
    winSize = _winSize;
    criteria = _criteria;
    level = _level;
    maxLevel = _maxLevel;
    flags = _flags;
    minEigThreshold = _minEigThreshold;
}

#if defined __arm__ && !CV_NEON
typedef int64 acctype;
typedef int itemtype;
#else
typedef float acctype;
typedef float itemtype;
#endif

void cv::detail::LKTrackerInvoker::operator()(const Range& range) const
{
    Point2f halfWin((winSize.width-1)*0.5f, (winSize.height-1)*0.5f);
    const Mat& I = *prevImg;
    const Mat& J = *nextImg;
    const Mat& derivI = *prevDeriv;

    int j, cn = I.channels(), cn2 = cn*2;
    cv::AutoBuffer<deriv_type> _buf(winSize.area()*(cn + cn2));
    int derivDepth = DataType<deriv_type>::depth;

    Mat IWinBuf(winSize, CV_MAKETYPE(derivDepth, cn), (deriv_type*)_buf);
    Mat derivIWinBuf(winSize, CV_MAKETYPE(derivDepth, cn2), (deriv_type*)_buf + winSize.area()*cn);

    for( int ptidx = range.start; ptidx < range.end; ptidx++ )
    {
        Point2f prevPt = prevPts[ptidx]*(float)(1./(1 << level));
        Point2f nextPt;
        if( level == maxLevel )
        {
            if( flags & OPTFLOW_USE_INITIAL_FLOW )
                nextPt = nextPts[ptidx]*(float)(1./(1 << level));
            else
                nextPt = prevPt;
        }
        else
            nextPt = nextPts[ptidx]*2.f;
        nextPts[ptidx] = nextPt;

        Point2i iprevPt, inextPt;
        prevPt -= halfWin;
        iprevPt.x = cvFloor(prevPt.x);
        iprevPt.y = cvFloor(prevPt.y);

        if( iprevPt.x < -winSize.width || iprevPt.x >= derivI.cols ||
            iprevPt.y < -winSize.height || iprevPt.y >= derivI.rows )
        {
            if( level == 0 )
            {
                if( status )
                    status[ptidx] = false;
                if( err )
                    err[ptidx] = 0;
            }
            continue;
        }

        float a = prevPt.x - iprevPt.x;
        float b = prevPt.y - iprevPt.y;
        const int W_BITS = 14, W_BITS1 = 14;
        const float FLT_SCALE = 1.f/(1 << 20);
        int iw00 = cvRound((1.f - a)*(1.f - b)*(1 << W_BITS));
        int iw01 = cvRound(a*(1.f - b)*(1 << W_BITS));
        int iw10 = cvRound((1.f - a)*b*(1 << W_BITS));
        int iw11 = (1 << W_BITS) - iw00 - iw01 - iw10;
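        // iw00..iw11 are the bilinear interpolation weights of the four
        // neighbouring pixels, in Q14 fixed point; by construction they sum
        // exactly to 1 << W_BITS, so the interpolation introduces no DC bias.
        // E.g. a = b = 0.5 gives iw00 = iw01 = iw10 = iw11 = 4096.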

        int dstep = (int)(derivI.step/derivI.elemSize1());
        int stepI = (int)(I.step/I.elemSize1());
        int stepJ = (int)(J.step/J.elemSize1());
        acctype iA11 = 0, iA12 = 0, iA22 = 0;
        float A11, A12, A22;

#if CV_SSE2
        __m128i qw0 = _mm_set1_epi32(iw00 + (iw01 << 16));
        __m128i qw1 = _mm_set1_epi32(iw10 + (iw11 << 16));
        __m128i z = _mm_setzero_si128();
        __m128i qdelta_d = _mm_set1_epi32(1 << (W_BITS1-1));
        __m128i qdelta = _mm_set1_epi32(1 << (W_BITS1-5-1));
        __m128 qA11 = _mm_setzero_ps(), qA12 = _mm_setzero_ps(), qA22 = _mm_setzero_ps();
#endif

#if CV_NEON
        int CV_DECL_ALIGNED(16) nA11[] = {0, 0, 0, 0}, nA12[] = {0, 0, 0, 0}, nA22[] = {0, 0, 0, 0};
        const int shifter1 = -(W_BITS - 5); // negative so it shifts right
        const int shifter2 = -(W_BITS);

        const int16x4_t d26 = vdup_n_s16((int16_t)iw00);
        const int16x4_t d27 = vdup_n_s16((int16_t)iw01);
        const int16x4_t d28 = vdup_n_s16((int16_t)iw10);
        const int16x4_t d29 = vdup_n_s16((int16_t)iw11);
        const int32x4_t q11 = vdupq_n_s32((int32_t)shifter1);
        const int32x4_t q12 = vdupq_n_s32((int32_t)shifter2);
#endif

        // extract the patch from the first image, compute covariance matrix of derivatives
        int x, y;
        for( y = 0; y < winSize.height; y++ )
        {
            const uchar* src = I.ptr() + (y + iprevPt.y)*stepI + iprevPt.x*cn;
            const deriv_type* dsrc = derivI.ptr<deriv_type>() + (y + iprevPt.y)*dstep + iprevPt.x*cn2;

            deriv_type* Iptr = IWinBuf.ptr<deriv_type>(y);
            deriv_type* dIptr = derivIWinBuf.ptr<deriv_type>(y);

            x = 0;

#if CV_SSE2
            for( ; x <= winSize.width*cn - 4; x += 4, dsrc += 4*2, dIptr += 4*2 )
            {
                __m128i v00, v01, v10, v11, t0, t1;

                v00 = _mm_unpacklo_epi8(_mm_cvtsi32_si128(*(const int*)(src + x)), z);
                v01 = _mm_unpacklo_epi8(_mm_cvtsi32_si128(*(const int*)(src + x + cn)), z);
                v10 = _mm_unpacklo_epi8(_mm_cvtsi32_si128(*(const int*)(src + x + stepI)), z);
                v11 = _mm_unpacklo_epi8(_mm_cvtsi32_si128(*(const int*)(src + x + stepI + cn)), z);

                t0 = _mm_add_epi32(_mm_madd_epi16(_mm_unpacklo_epi16(v00, v01), qw0),
                                   _mm_madd_epi16(_mm_unpacklo_epi16(v10, v11), qw1));
                t0 = _mm_srai_epi32(_mm_add_epi32(t0, qdelta), W_BITS1-5);
                _mm_storel_epi64((__m128i*)(Iptr + x), _mm_packs_epi32(t0,t0));

                v00 = _mm_loadu_si128((const __m128i*)(dsrc));
                v01 = _mm_loadu_si128((const __m128i*)(dsrc + cn2));
                v10 = _mm_loadu_si128((const __m128i*)(dsrc + dstep));
                v11 = _mm_loadu_si128((const __m128i*)(dsrc + dstep + cn2));

                t0 = _mm_add_epi32(_mm_madd_epi16(_mm_unpacklo_epi16(v00, v01), qw0),
                                   _mm_madd_epi16(_mm_unpacklo_epi16(v10, v11), qw1));
                t1 = _mm_add_epi32(_mm_madd_epi16(_mm_unpackhi_epi16(v00, v01), qw0),
                                   _mm_madd_epi16(_mm_unpackhi_epi16(v10, v11), qw1));
                t0 = _mm_srai_epi32(_mm_add_epi32(t0, qdelta_d), W_BITS1);
                t1 = _mm_srai_epi32(_mm_add_epi32(t1, qdelta_d), W_BITS1);
                v00 = _mm_packs_epi32(t0, t1); // Ix0 Iy0 Ix1 Iy1 ...

                _mm_storeu_si128((__m128i*)dIptr, v00);
                t0 = _mm_srai_epi32(v00, 16); // Iy0 Iy1 Iy2 Iy3
                t1 = _mm_srai_epi32(_mm_slli_epi32(v00, 16), 16); // Ix0 Ix1 Ix2 Ix3

                __m128 fy = _mm_cvtepi32_ps(t0);
                __m128 fx = _mm_cvtepi32_ps(t1);

                qA22 = _mm_add_ps(qA22, _mm_mul_ps(fy, fy));
                qA12 = _mm_add_ps(qA12, _mm_mul_ps(fx, fy));
                qA11 = _mm_add_ps(qA11, _mm_mul_ps(fx, fx));
            }
#endif

#if CV_NEON
            for( ; x <= winSize.width*cn - 4; x += 4, dsrc += 4*2, dIptr += 4*2 )
            {
                uint8x8_t d0 = vld1_u8(&src[x]);
                uint8x8_t d2 = vld1_u8(&src[x+cn]);
                uint16x8_t q0 = vmovl_u8(d0);
                uint16x8_t q1 = vmovl_u8(d2);

                int32x4_t q5 = vmull_s16(vget_low_s16(vreinterpretq_s16_u16(q0)), d26);
                int32x4_t q6 = vmull_s16(vget_low_s16(vreinterpretq_s16_u16(q1)), d27);

                uint8x8_t d4 = vld1_u8(&src[x + stepI]);
                uint8x8_t d6 = vld1_u8(&src[x + stepI + cn]);
                uint16x8_t q2 = vmovl_u8(d4);
                uint16x8_t q3 = vmovl_u8(d6);

                int32x4_t q7 = vmull_s16(vget_low_s16(vreinterpretq_s16_u16(q2)), d28);
                int32x4_t q8 = vmull_s16(vget_low_s16(vreinterpretq_s16_u16(q3)), d29);

                q5 = vaddq_s32(q5, q6);
                q7 = vaddq_s32(q7, q8);
                q5 = vaddq_s32(q5, q7);

                int16x4x2_t d0d1 = vld2_s16(dsrc);
                int16x4x2_t d2d3 = vld2_s16(&dsrc[cn2]);

                q5 = vqrshlq_s32(q5, q11);

                int32x4_t q4 = vmull_s16(d0d1.val[0], d26);
                q6 = vmull_s16(d0d1.val[1], d26);

                int16x4_t nd0 = vmovn_s32(q5);

                q7 = vmull_s16(d2d3.val[0], d27);
                q8 = vmull_s16(d2d3.val[1], d27);

                vst1_s16(&Iptr[x], nd0);

                int16x4x2_t d4d5 = vld2_s16(&dsrc[dstep]);
                int16x4x2_t d6d7 = vld2_s16(&dsrc[dstep+cn2]);

                q4 = vaddq_s32(q4, q7);
                q6 = vaddq_s32(q6, q8);

                q7 = vmull_s16(d4d5.val[0], d28);
                int32x4_t nq0 = vmull_s16(d4d5.val[1], d28);
                q8 = vmull_s16(d6d7.val[0], d29);
                int32x4_t q15 = vmull_s16(d6d7.val[1], d29);

                q7 = vaddq_s32(q7, q8);
                nq0 = vaddq_s32(nq0, q15);

                q4 = vaddq_s32(q4, q7);
                q6 = vaddq_s32(q6, nq0);

                int32x4_t nq1 = vld1q_s32(nA12);
                int32x4_t nq2 = vld1q_s32(nA22);
                nq0 = vld1q_s32(nA11);

                q4 = vqrshlq_s32(q4, q12);
                q6 = vqrshlq_s32(q6, q12);

                q7 = vmulq_s32(q4, q4);
                q8 = vmulq_s32(q4, q6);
                q15 = vmulq_s32(q6, q6);

                nq0 = vaddq_s32(nq0, q7);
                nq1 = vaddq_s32(nq1, q8);
                nq2 = vaddq_s32(nq2, q15);

                vst1q_s32(nA11, nq0);
                vst1q_s32(nA12, nq1);
                vst1q_s32(nA22, nq2);

                int16x4_t d8 = vmovn_s32(q4);
                int16x4_t d12 = vmovn_s32(q6);

                int16x4x2_t d8d12;
                d8d12.val[0] = d8; d8d12.val[1] = d12;
                vst2_s16(dIptr, d8d12);
            }
#endif

            for( ; x < winSize.width*cn; x++, dsrc += 2, dIptr += 2 )
            {
                int ival = CV_DESCALE(src[x]*iw00 + src[x+cn]*iw01 +
                                      src[x+stepI]*iw10 + src[x+stepI+cn]*iw11, W_BITS1-5);
                int ixval = CV_DESCALE(dsrc[0]*iw00 + dsrc[cn2]*iw01 +
                                       dsrc[dstep]*iw10 + dsrc[dstep+cn2]*iw11, W_BITS1);
                int iyval = CV_DESCALE(dsrc[1]*iw00 + dsrc[cn2+1]*iw01 + dsrc[dstep+1]*iw10 +
                                       dsrc[dstep+cn2+1]*iw11, W_BITS1);

                Iptr[x] = (short)ival;
                dIptr[0] = (short)ixval;
                dIptr[1] = (short)iyval;

                iA11 += (itemtype)(ixval*ixval);
                iA12 += (itemtype)(ixval*iyval);
                iA22 += (itemtype)(iyval*iyval);
            }
        }

#if CV_SSE2
        float CV_DECL_ALIGNED(16) A11buf[4], A12buf[4], A22buf[4];
        _mm_store_ps(A11buf, qA11);
        _mm_store_ps(A12buf, qA12);
        _mm_store_ps(A22buf, qA22);
        iA11 += A11buf[0] + A11buf[1] + A11buf[2] + A11buf[3];
        iA12 += A12buf[0] + A12buf[1] + A12buf[2] + A12buf[3];
        iA22 += A22buf[0] + A22buf[1] + A22buf[2] + A22buf[3];
#endif

#if CV_NEON
        iA11 += (float)(nA11[0] + nA11[1] + nA11[2] + nA11[3]);
        iA12 += (float)(nA12[0] + nA12[1] + nA12[2] + nA12[3]);
        iA22 += (float)(nA22[0] + nA22[1] + nA22[2] + nA22[3]);
#endif

        A11 = iA11*FLT_SCALE;
        A12 = iA12*FLT_SCALE;
        A22 = iA22*FLT_SCALE;

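        // For the symmetric 2x2 gradient matrix G = [[A11, A12], [A12, A22]],
        // the smaller eigenvalue has the closed form
        // (A11 + A22 - sqrt((A11 - A22)^2 + 4*A12^2)) / 2; dividing by the
        // window area makes minEigThreshold independent of the window size.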
        float D = A11*A22 - A12*A12;
        float minEig = (A22 + A11 - std::sqrt((A11-A22)*(A11-A22) +
                        4.f*A12*A12))/(2*winSize.width*winSize.height);

        if( err && (flags & OPTFLOW_LK_GET_MIN_EIGENVALS) != 0 )
            err[ptidx] = (float)minEig;

        if( minEig < minEigThreshold || D < FLT_EPSILON )
        {
            if( level == 0 && status )
                status[ptidx] = false;
            continue;
        }

        D = 1.f/D;

        nextPt -= halfWin;
        Point2f prevDelta;

        for( j = 0; j < criteria.maxCount; j++ )
        {
            inextPt.x = cvFloor(nextPt.x);
            inextPt.y = cvFloor(nextPt.y);

            if( inextPt.x < -winSize.width || inextPt.x >= J.cols ||
                inextPt.y < -winSize.height || inextPt.y >= J.rows )
            {
                if( level == 0 && status )
                    status[ptidx] = false;
                break;
            }

            a = nextPt.x - inextPt.x;
            b = nextPt.y - inextPt.y;
            iw00 = cvRound((1.f - a)*(1.f - b)*(1 << W_BITS));
            iw01 = cvRound(a*(1.f - b)*(1 << W_BITS));
            iw10 = cvRound((1.f - a)*b*(1 << W_BITS));
            iw11 = (1 << W_BITS) - iw00 - iw01 - iw10;
            acctype ib1 = 0, ib2 = 0;
            float b1, b2;
#if CV_SSE2
            qw0 = _mm_set1_epi32(iw00 + (iw01 << 16));
            qw1 = _mm_set1_epi32(iw10 + (iw11 << 16));
            __m128 qb0 = _mm_setzero_ps(), qb1 = _mm_setzero_ps();
#endif

#if CV_NEON
            int CV_DECL_ALIGNED(16) nB1[] = {0,0,0,0}, nB2[] = {0,0,0,0};

            const int16x4_t d26_2 = vdup_n_s16((int16_t)iw00);
            const int16x4_t d27_2 = vdup_n_s16((int16_t)iw01);
            const int16x4_t d28_2 = vdup_n_s16((int16_t)iw10);
            const int16x4_t d29_2 = vdup_n_s16((int16_t)iw11);
#endif

            for( y = 0; y < winSize.height; y++ )
            {
                const uchar* Jptr = J.ptr() + (y + inextPt.y)*stepJ + inextPt.x*cn;
                const deriv_type* Iptr = IWinBuf.ptr<deriv_type>(y);
                const deriv_type* dIptr = derivIWinBuf.ptr<deriv_type>(y);

                x = 0;

#if CV_SSE2
                for( ; x <= winSize.width*cn - 8; x += 8, dIptr += 8*2 )
                {
                    __m128i diff0 = _mm_loadu_si128((const __m128i*)(Iptr + x)), diff1;
                    __m128i v00 = _mm_unpacklo_epi8(_mm_loadl_epi64((const __m128i*)(Jptr + x)), z);
                    __m128i v01 = _mm_unpacklo_epi8(_mm_loadl_epi64((const __m128i*)(Jptr + x + cn)), z);
                    __m128i v10 = _mm_unpacklo_epi8(_mm_loadl_epi64((const __m128i*)(Jptr + x + stepJ)), z);
                    __m128i v11 = _mm_unpacklo_epi8(_mm_loadl_epi64((const __m128i*)(Jptr + x + stepJ + cn)), z);

                    __m128i t0 = _mm_add_epi32(_mm_madd_epi16(_mm_unpacklo_epi16(v00, v01), qw0),
                                               _mm_madd_epi16(_mm_unpacklo_epi16(v10, v11), qw1));
                    __m128i t1 = _mm_add_epi32(_mm_madd_epi16(_mm_unpackhi_epi16(v00, v01), qw0),
                                               _mm_madd_epi16(_mm_unpackhi_epi16(v10, v11), qw1));
                    t0 = _mm_srai_epi32(_mm_add_epi32(t0, qdelta), W_BITS1-5);
                    t1 = _mm_srai_epi32(_mm_add_epi32(t1, qdelta), W_BITS1-5);
                    diff0 = _mm_subs_epi16(_mm_packs_epi32(t0, t1), diff0);
                    diff1 = _mm_unpackhi_epi16(diff0, diff0);
                    diff0 = _mm_unpacklo_epi16(diff0, diff0); // It0 It0 It1 It1 ...
                    v00 = _mm_loadu_si128((const __m128i*)(dIptr)); // Ix0 Iy0 Ix1 Iy1 ...
                    v01 = _mm_loadu_si128((const __m128i*)(dIptr + 8));
                    v10 = _mm_mullo_epi16(v00, diff0);
                    v11 = _mm_mulhi_epi16(v00, diff0);
                    v00 = _mm_unpacklo_epi16(v10, v11);
                    v10 = _mm_unpackhi_epi16(v10, v11);
                    qb0 = _mm_add_ps(qb0, _mm_cvtepi32_ps(v00));
                    qb1 = _mm_add_ps(qb1, _mm_cvtepi32_ps(v10));
                    v10 = _mm_mullo_epi16(v01, diff1);
                    v11 = _mm_mulhi_epi16(v01, diff1);
                    v00 = _mm_unpacklo_epi16(v10, v11);
                    v10 = _mm_unpackhi_epi16(v10, v11);
                    qb0 = _mm_add_ps(qb0, _mm_cvtepi32_ps(v00));
                    qb1 = _mm_add_ps(qb1, _mm_cvtepi32_ps(v10));
                }
#endif

#if CV_NEON
                for( ; x <= winSize.width*cn - 8; x += 8, dIptr += 8*2 )
                {
                    uint8x8_t d0 = vld1_u8(&Jptr[x]);
                    uint8x8_t d2 = vld1_u8(&Jptr[x+cn]);
                    uint8x8_t d4 = vld1_u8(&Jptr[x+stepJ]);
                    uint8x8_t d6 = vld1_u8(&Jptr[x+stepJ+cn]);

                    uint16x8_t q0 = vmovl_u8(d0);
                    uint16x8_t q1 = vmovl_u8(d2);
                    uint16x8_t q2 = vmovl_u8(d4);
                    uint16x8_t q3 = vmovl_u8(d6);

                    int32x4_t nq4 = vmull_s16(vget_low_s16(vreinterpretq_s16_u16(q0)), d26_2);
                    int32x4_t nq5 = vmull_s16(vget_high_s16(vreinterpretq_s16_u16(q0)), d26_2);

                    int32x4_t nq6 = vmull_s16(vget_low_s16(vreinterpretq_s16_u16(q1)), d27_2);
                    int32x4_t nq7 = vmull_s16(vget_high_s16(vreinterpretq_s16_u16(q1)), d27_2);

                    int32x4_t nq8 = vmull_s16(vget_low_s16(vreinterpretq_s16_u16(q2)), d28_2);
                    int32x4_t nq9 = vmull_s16(vget_high_s16(vreinterpretq_s16_u16(q2)), d28_2);

                    int32x4_t nq10 = vmull_s16(vget_low_s16(vreinterpretq_s16_u16(q3)), d29_2);
                    int32x4_t nq11 = vmull_s16(vget_high_s16(vreinterpretq_s16_u16(q3)), d29_2);

                    nq4 = vaddq_s32(nq4, nq6);
                    nq5 = vaddq_s32(nq5, nq7);
                    nq8 = vaddq_s32(nq8, nq10);
                    nq9 = vaddq_s32(nq9, nq11);

                    int16x8_t q6 = vld1q_s16(&Iptr[x]);

                    nq4 = vaddq_s32(nq4, nq8);
                    nq5 = vaddq_s32(nq5, nq9);

                    nq8 = vmovl_s16(vget_high_s16(q6));
                    nq6 = vmovl_s16(vget_low_s16(q6));

                    nq4 = vqrshlq_s32(nq4, q11);
                    nq5 = vqrshlq_s32(nq5, q11);

                    int16x8x2_t q0q1 = vld2q_s16(dIptr);
                    nq11 = vld1q_s32(nB1);
                    int32x4_t nq15 = vld1q_s32(nB2);

                    nq4 = vsubq_s32(nq4, nq6);
                    nq5 = vsubq_s32(nq5, nq8);

                    int32x4_t nq2 = vmovl_s16(vget_low_s16(q0q1.val[0]));
                    int32x4_t nq3 = vmovl_s16(vget_high_s16(q0q1.val[0]));

                    nq7 = vmovl_s16(vget_low_s16(q0q1.val[1]));
                    nq8 = vmovl_s16(vget_high_s16(q0q1.val[1]));

                    nq9 = vmulq_s32(nq4, nq2);
                    nq10 = vmulq_s32(nq5, nq3);

                    nq4 = vmulq_s32(nq4, nq7);
                    nq5 = vmulq_s32(nq5, nq8);

                    nq9 = vaddq_s32(nq9, nq10);
                    nq4 = vaddq_s32(nq4, nq5);

                    nq11 = vaddq_s32(nq11, nq9);
                    nq15 = vaddq_s32(nq15, nq4);

                    vst1q_s32(nB1, nq11);
                    vst1q_s32(nB2, nq15);
                }
#endif

                for( ; x < winSize.width*cn; x++, dIptr += 2 )
                {
                    int diff = CV_DESCALE(Jptr[x]*iw00 + Jptr[x+cn]*iw01 +
                                          Jptr[x+stepJ]*iw10 + Jptr[x+stepJ+cn]*iw11,
                                          W_BITS1-5) - Iptr[x];
                    ib1 += (itemtype)(diff*dIptr[0]);
                    ib2 += (itemtype)(diff*dIptr[1]);
                }
            }

#if CV_SSE2
            float CV_DECL_ALIGNED(16) bbuf[4];
            _mm_store_ps(bbuf, _mm_add_ps(qb0, qb1));
            ib1 += bbuf[0] + bbuf[2];
            ib2 += bbuf[1] + bbuf[3];
#endif

#if CV_NEON
            ib1 += (float)(nB1[0] + nB1[1] + nB1[2] + nB1[3]);
            ib2 += (float)(nB2[0] + nB2[1] + nB2[2] + nB2[3]);
#endif

            b1 = ib1*FLT_SCALE;
            b2 = ib2*FLT_SCALE;

            Point2f delta( (float)((A12*b2 - A22*b1) * D),
                           (float)((A12*b1 - A11*b2) * D));
            //delta = -delta;
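            // delta is the Cramer's-rule solution of the 2x2 system
            // G * delta = -b, i.e. delta = -inv(G) * (b1, b2)^T, where
            // G = [[A11, A12], [A12, A22]] and D already holds 1/det(G).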

            nextPt += delta;
            nextPts[ptidx] = nextPt + halfWin;

            if( delta.ddot(delta) <= criteria.epsilon )
                break;

            if( j > 0 && std::abs(delta.x + prevDelta.x) < 0.01 &&
                std::abs(delta.y + prevDelta.y) < 0.01 )
            {
                nextPts[ptidx] -= delta*0.5f;
                break;
            }
            prevDelta = delta;
        }

        if( status[ptidx] && err && level == 0 && (flags & OPTFLOW_LK_GET_MIN_EIGENVALS) == 0 )
        {
            Point2f nextPoint = nextPts[ptidx] - halfWin;
            Point inextPoint;

            inextPoint.x = cvFloor(nextPoint.x);
            inextPoint.y = cvFloor(nextPoint.y);

            if( inextPoint.x < -winSize.width || inextPoint.x >= J.cols ||
                inextPoint.y < -winSize.height || inextPoint.y >= J.rows )
            {
                if( status )
                    status[ptidx] = false;
                continue;
            }

            float aa = nextPoint.x - inextPoint.x;
            float bb = nextPoint.y - inextPoint.y;
            iw00 = cvRound((1.f - aa)*(1.f - bb)*(1 << W_BITS));
            iw01 = cvRound(aa*(1.f - bb)*(1 << W_BITS));
            iw10 = cvRound((1.f - aa)*bb*(1 << W_BITS));
            iw11 = (1 << W_BITS) - iw00 - iw01 - iw10;
            float errval = 0.f;

            for( y = 0; y < winSize.height; y++ )
            {
                const uchar* Jptr = J.ptr() + (y + inextPoint.y)*stepJ + inextPoint.x*cn;
                const deriv_type* Iptr = IWinBuf.ptr<deriv_type>(y);

                for( x = 0; x < winSize.width*cn; x++ )
                {
                    int diff = CV_DESCALE(Jptr[x]*iw00 + Jptr[x+cn]*iw01 +
                                          Jptr[x+stepJ]*iw10 + Jptr[x+stepJ+cn]*iw11,
                                          W_BITS1-5) - Iptr[x];
                    errval += std::abs((float)diff);
                }
            }
            err[ptidx] = errval * 1.f/(32*winSize.width*cn*winSize.height);
        }
    }
}

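// Builds a padded image pyramid for calcOpticalFlowPyrLK. Each level is
// surrounded by a winSize border so the tracker can read windows that
// partially leave the image. With withDerivatives, the output alternates
// image and Scharr-derivative levels. Returns the index of the top level
// actually built, which may be smaller than maxLevel for small images.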
int cv::buildOpticalFlowPyramid(InputArray _img, OutputArrayOfArrays pyramid, Size winSize, int maxLevel, bool withDerivatives,
                                int pyrBorder, int derivBorder, bool tryReuseInputImage)
{
    Mat img = _img.getMat();
    CV_Assert(img.depth() == CV_8U && winSize.width > 2 && winSize.height > 2 );
    int pyrstep = withDerivatives ? 2 : 1;

    pyramid.create(1, (maxLevel + 1) * pyrstep, 0 /*type*/, -1, true, 0);

    int derivType = CV_MAKETYPE(DataType<cv::detail::deriv_type>::depth, img.channels() * 2);

    //level 0
    bool lvl0IsSet = false;
    if(tryReuseInputImage && img.isSubmatrix() && (pyrBorder & BORDER_ISOLATED) == 0)
    {
        Size wholeSize;
        Point ofs;
        img.locateROI(wholeSize, ofs);
        if (ofs.x >= winSize.width && ofs.y >= winSize.height
            && ofs.x + img.cols + winSize.width <= wholeSize.width
            && ofs.y + img.rows + winSize.height <= wholeSize.height)
        {
            pyramid.getMatRef(0) = img;
            lvl0IsSet = true;
        }
    }

    if(!lvl0IsSet)
    {
        Mat& temp = pyramid.getMatRef(0);

        if(!temp.empty())
            temp.adjustROI(winSize.height, winSize.height, winSize.width, winSize.width);
        if(temp.type() != img.type() || temp.cols != winSize.width*2 + img.cols || temp.rows != winSize.height * 2 + img.rows)
            temp.create(img.rows + winSize.height*2, img.cols + winSize.width*2, img.type());

        if(pyrBorder == BORDER_TRANSPARENT)
            img.copyTo(temp(Rect(winSize.width, winSize.height, img.cols, img.rows)));
        else
            copyMakeBorder(img, temp, winSize.height, winSize.height, winSize.width, winSize.width, pyrBorder);
        temp.adjustROI(-winSize.height, -winSize.height, -winSize.width, -winSize.width);
    }

    Size sz = img.size();
    Mat prevLevel = pyramid.getMatRef(0);
    Mat thisLevel = prevLevel;

    for(int level = 0; level <= maxLevel; ++level)
    {
        if (level != 0)
        {
            Mat& temp = pyramid.getMatRef(level * pyrstep);

            if(!temp.empty())
                temp.adjustROI(winSize.height, winSize.height, winSize.width, winSize.width);
            if(temp.type() != img.type() || temp.cols != winSize.width*2 + sz.width || temp.rows != winSize.height * 2 + sz.height)
                temp.create(sz.height + winSize.height*2, sz.width + winSize.width*2, img.type());

            thisLevel = temp(Rect(winSize.width, winSize.height, sz.width, sz.height));
            pyrDown(prevLevel, thisLevel, sz);

            if(pyrBorder != BORDER_TRANSPARENT)
                copyMakeBorder(thisLevel, temp, winSize.height, winSize.height, winSize.width, winSize.width, pyrBorder|BORDER_ISOLATED);
            temp.adjustROI(-winSize.height, -winSize.height, -winSize.width, -winSize.width);
        }

        if(withDerivatives)
        {
            Mat& deriv = pyramid.getMatRef(level * pyrstep + 1);

            if(!deriv.empty())
                deriv.adjustROI(winSize.height, winSize.height, winSize.width, winSize.width);
            if(deriv.type() != derivType || deriv.cols != winSize.width*2 + sz.width || deriv.rows != winSize.height * 2 + sz.height)
                deriv.create(sz.height + winSize.height*2, sz.width + winSize.width*2, derivType);

            Mat derivI = deriv(Rect(winSize.width, winSize.height, sz.width, sz.height));
            calcSharrDeriv(thisLevel, derivI);

            if(derivBorder != BORDER_TRANSPARENT)
                copyMakeBorder(derivI, deriv, winSize.height, winSize.height, winSize.width, winSize.width, derivBorder|BORDER_ISOLATED);
            deriv.adjustROI(-winSize.height, -winSize.height, -winSize.width, -winSize.width);
        }

        sz = Size((sz.width+1)/2, (sz.height+1)/2);
        if( sz.width <= winSize.width || sz.height <= winSize.height )
        {
            pyramid.create(1, (level + 1) * pyrstep, 0 /*type*/, -1, true, 0); // check this
            return level;
        }

        prevLevel = thisLevel;
    }

    return maxLevel;
}

namespace cv
{
class PyrLKOpticalFlow
{
    struct dim3
    {
        unsigned int x, y, z;
        dim3() : x(0), y(0), z(0) { }
    };
public:
    PyrLKOpticalFlow()
    {
        winSize = Size(21, 21);
        maxLevel = 3;
        iters = 30;
        derivLambda = 0.5;
        useInitialFlow = false;

        waveSize = 0;
    }

    bool checkParam()
    {
        iters = std::min(std::max(iters, 0), 100);

        derivLambda = std::min(std::max(derivLambda, 0.0), 1.0);
        if (derivLambda < 0)
            return false;
        if (maxLevel < 0 || winSize.width <= 2 || winSize.height <= 2)
            return false;
        calcPatchSize();
        if (patch.x <= 0 || patch.x >= 6 || patch.y <= 0 || patch.y >= 6)
            return false;
        if (!initWaveSize())
            return false;
        return true;
    }

    bool sparse(const UMat &prevImg, const UMat &nextImg, const UMat &prevPts, UMat &nextPts, UMat &status, UMat &err)
    {
        if (!checkParam())
            return false;

        UMat temp1 = (useInitialFlow ? nextPts : prevPts).reshape(1);
        UMat temp2 = nextPts.reshape(1);
        multiply(1.0f / (1 << maxLevel) / 2.0f, temp1, temp2);

        status.setTo(Scalar::all(1));

        // build the image pyramids.
        std::vector<UMat> prevPyr; prevPyr.resize(maxLevel + 1);
        std::vector<UMat> nextPyr; nextPyr.resize(maxLevel + 1);

        // allocate buffers with aligned pitch to be able to use the cl_khr_image2d_from_buffer extension.
        // This is the required pitch alignment in pixels.
        int pitchAlign = (int)ocl::Device::getDefault().imagePitchAlignment();
        if (pitchAlign>0)
        {
            prevPyr[0] = UMat(prevImg.rows,(prevImg.cols+pitchAlign-1)&(-pitchAlign),CV_32FC1).colRange(0,prevImg.cols);
            nextPyr[0] = UMat(nextImg.rows,(nextImg.cols+pitchAlign-1)&(-pitchAlign),CV_32FC1).colRange(0,nextImg.cols);
            for (int level = 1; level <= maxLevel; ++level)
            {
                int cols,rows;
                // allocate buffers with aligned pitch to be able to use the image-on-buffer extension
                cols = (prevPyr[level - 1].cols+1)/2;
                rows = (prevPyr[level - 1].rows+1)/2;
                prevPyr[level] = UMat(rows,(cols+pitchAlign-1)&(-pitchAlign),prevPyr[level-1].type()).colRange(0,cols);
                cols = (nextPyr[level - 1].cols+1)/2;
                rows = (nextPyr[level - 1].rows+1)/2;
                nextPyr[level] = UMat(rows,(cols+pitchAlign-1)&(-pitchAlign),nextPyr[level-1].type()).colRange(0,cols);
            }
        }

        prevImg.convertTo(prevPyr[0], CV_32F);
        nextImg.convertTo(nextPyr[0], CV_32F);

        for (int level = 1; level <= maxLevel; ++level)
        {
            pyrDown(prevPyr[level - 1], prevPyr[level]);
            pyrDown(nextPyr[level - 1], nextPyr[level]);
        }

        // dI/dx ~ Ix, dI/dy ~ Iy
        for (int level = maxLevel; level >= 0; level--)
        {
            if (!lkSparse_run(prevPyr[level], nextPyr[level], prevPts,
                              nextPts, status, err,
                              prevPts.cols, level))
                return false;
        }
        return true;
    }

    Size winSize;
    int maxLevel;
    int iters;
    double derivLambda;
    bool useInitialFlow;

private:
    int waveSize;
    bool initWaveSize()
    {
        waveSize = 1;
        if (isDeviceCPU())
            return true;

        ocl::Kernel kernel;
        if (!kernel.create("lkSparse", cv::ocl::video::pyrlk_oclsrc, ""))
            return false;
        waveSize = (int)kernel.preferedWorkGroupSizeMultiple();
        return true;
    }
    dim3 patch;
    void calcPatchSize()
    {
        dim3 block;

        if (winSize.width > 32 && winSize.width > 2 * winSize.height)
        {
            block.x = 32;
            block.y = 8;
        }
        else
        {
            block.x = 16;
            block.y = 16;
        }

        patch.x = (winSize.width + block.x - 1) / block.x;
        patch.y = (winSize.height + block.y - 1) / block.y;

        block.z = patch.z = 1;
    }

    bool lkSparse_run(UMat &I, UMat &J, const UMat &prevPts, UMat &nextPts, UMat &status, UMat& err,
                      int ptcount, int level)
    {
        size_t localThreads[3] = { 8, 8 };
        size_t globalThreads[3] = { 8 * ptcount, 8 };
        char calcErr = (0 == level) ? 1 : 0;

        cv::String build_options;
        if (isDeviceCPU())
            build_options = " -D CPU";
        else
            build_options = cv::format("-D WAVE_SIZE=%d", waveSize);

        ocl::Kernel kernel;
        if (!kernel.create("lkSparse", cv::ocl::video::pyrlk_oclsrc, build_options))
            return false;

        CV_Assert(I.depth() == CV_32F && J.depth() == CV_32F);
        ocl::Image2D imageI(I, false, ocl::Image2D::canCreateAlias(I));
        ocl::Image2D imageJ(J, false, ocl::Image2D::canCreateAlias(J));

        int idxArg = 0;
        idxArg = kernel.set(idxArg, imageI);                                // image2d_t I
        idxArg = kernel.set(idxArg, imageJ);                                // image2d_t J
        idxArg = kernel.set(idxArg, ocl::KernelArg::PtrReadOnly(prevPts));  // __global const float2* prevPts
        idxArg = kernel.set(idxArg, ocl::KernelArg::PtrReadWrite(nextPts)); // __global float2* nextPts
        idxArg = kernel.set(idxArg, ocl::KernelArg::PtrReadWrite(status));  // __global uchar* status
        idxArg = kernel.set(idxArg, ocl::KernelArg::PtrReadWrite(err));     // __global float* err
        idxArg = kernel.set(idxArg, (int)level);                            // const int level
        idxArg = kernel.set(idxArg, (int)I.rows);                           // const int rows
        idxArg = kernel.set(idxArg, (int)I.cols);                           // const int cols
        idxArg = kernel.set(idxArg, (int)patch.x);                          // int PATCH_X
        idxArg = kernel.set(idxArg, (int)patch.y);                          // int PATCH_Y
        idxArg = kernel.set(idxArg, (int)winSize.width);                    // int c_winSize_x
        idxArg = kernel.set(idxArg, (int)winSize.height);                   // int c_winSize_y
        idxArg = kernel.set(idxArg, (int)iters);                            // int c_iters
        idxArg = kernel.set(idxArg, (char)calcErr);                         // char calcErr
        return kernel.run(2, globalThreads, localThreads, false);
    }
private:
    inline static bool isDeviceCPU()
    {
        return (cv::ocl::Device::TYPE_CPU == cv::ocl::Device::getDefault().type());
    }
};

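// OpenCL fast path for calcOpticalFlowPyrLK. Any unmet precondition (error
// output via min-eigenvalues requested, no image support on the device,
// mismatched sizes, non-8U or multi-channel inputs, badly shaped point
// arrays) makes it return false so the caller falls back to the CPU path.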
static bool ocl_calcOpticalFlowPyrLK(InputArray _prevImg, InputArray _nextImg,
                                     InputArray _prevPts, InputOutputArray _nextPts,
                                     OutputArray _status, OutputArray _err,
                                     Size winSize, int maxLevel,
                                     TermCriteria criteria,
                                     int flags/*, double minEigThreshold*/ )
{
    if (0 != (OPTFLOW_LK_GET_MIN_EIGENVALS & flags))
        return false;
    if (!cv::ocl::Device::getDefault().imageSupport())
        return false;
    if (_nextImg.size() != _prevImg.size())
        return false;
    int typePrev = _prevImg.type();
    int typeNext = _nextImg.type();
    if ((1 != CV_MAT_CN(typePrev)) || (1 != CV_MAT_CN(typeNext)))
        return false;
    if ((0 != CV_MAT_DEPTH(typePrev)) || (0 != CV_MAT_DEPTH(typeNext)))
        return false;

    if (_prevPts.empty() || _prevPts.type() != CV_32FC2 || (!_prevPts.isContinuous()))
        return false;
    if ((1 != _prevPts.size().height) && (1 != _prevPts.size().width))
        return false;
    size_t npoints = _prevPts.total();
    bool useInitialFlow = (0 != (flags & OPTFLOW_USE_INITIAL_FLOW));
    if (useInitialFlow)
    {
        if (_nextPts.empty() || _nextPts.type() != CV_32FC2 || (!_nextPts.isContinuous()))
            return false;
        if ((1 != _nextPts.size().height) && (1 != _nextPts.size().width))
            return false;
        if (_nextPts.total() != npoints)
            return false;
    }
    else
    {
        _nextPts.create(_prevPts.size(), _prevPts.type());
    }

    PyrLKOpticalFlow opticalFlow;
    opticalFlow.winSize = winSize;
    opticalFlow.maxLevel = maxLevel;
    opticalFlow.iters = criteria.maxCount;
    opticalFlow.derivLambda = criteria.epsilon;
    opticalFlow.useInitialFlow = useInitialFlow;

    if (!opticalFlow.checkParam())
        return false;

    UMat umatErr;
    if (_err.needed())
    {
        _err.create((int)npoints, 1, CV_32FC1);
        umatErr = _err.getUMat();
    }
    else
        umatErr.create((int)npoints, 1, CV_32FC1);

    _status.create((int)npoints, 1, CV_8UC1);
    UMat umatNextPts = _nextPts.getUMat();
    UMat umatStatus = _status.getUMat();
    return opticalFlow.sparse(_prevImg.getUMat(), _nextImg.getUMat(), _prevPts.getUMat(), umatNextPts, umatStatus, umatErr);
}
} // namespace cv

void cv::calcOpticalFlowPyrLK( InputArray _prevImg, InputArray _nextImg,
                               InputArray _prevPts, InputOutputArray _nextPts,
                               OutputArray _status, OutputArray _err,
                               Size winSize, int maxLevel,
                               TermCriteria criteria,
                               int flags, double minEigThreshold )
{
    bool use_opencl = ocl::useOpenCL() &&
                      (_prevImg.isUMat() || _nextImg.isUMat()) &&
                      ocl::Image2D::isFormatSupported(CV_32F, 1, false);
    if ( use_opencl && ocl_calcOpticalFlowPyrLK(_prevImg, _nextImg, _prevPts, _nextPts, _status, _err, winSize, maxLevel, criteria, flags/*, minEigThreshold*/))
    {
        CV_IMPL_ADD(CV_IMPL_OCL);
        return;
    }

    Mat prevPtsMat = _prevPts.getMat();
    const int derivDepth = DataType<cv::detail::deriv_type>::depth;

    CV_Assert( maxLevel >= 0 && winSize.width > 2 && winSize.height > 2 );

    int level=0, i, npoints;
    CV_Assert( (npoints = prevPtsMat.checkVector(2, CV_32F, true)) >= 0 );

    if( npoints == 0 )
    {
        _nextPts.release();
        _status.release();
        _err.release();
        return;
    }

    if( !(flags & OPTFLOW_USE_INITIAL_FLOW) )
        _nextPts.create(prevPtsMat.size(), prevPtsMat.type(), -1, true);

    Mat nextPtsMat = _nextPts.getMat();
    CV_Assert( nextPtsMat.checkVector(2, CV_32F, true) == npoints );

    const Point2f* prevPts = prevPtsMat.ptr<Point2f>();
    Point2f* nextPts = nextPtsMat.ptr<Point2f>();

    _status.create((int)npoints, 1, CV_8U, -1, true);
    Mat statusMat = _status.getMat(), errMat;
    CV_Assert( statusMat.isContinuous() );
    uchar* status = statusMat.ptr();
    float* err = 0;

    for( i = 0; i < npoints; i++ )
        status[i] = true;

    if( _err.needed() )
    {
        _err.create((int)npoints, 1, CV_32F, -1, true);
        errMat = _err.getMat();
        CV_Assert( errMat.isContinuous() );
        err = errMat.ptr<float>();
    }

    std::vector<Mat> prevPyr, nextPyr;
    int levels1 = -1;
    int lvlStep1 = 1;
    int levels2 = -1;
    int lvlStep2 = 1;

    if(_prevImg.kind() == _InputArray::STD_VECTOR_MAT)
    {
        _prevImg.getMatVector(prevPyr);

        levels1 = int(prevPyr.size()) - 1;
        CV_Assert(levels1 >= 0);

        if (levels1 % 2 == 1 && prevPyr[0].channels() * 2 == prevPyr[1].channels() && prevPyr[1].depth() == derivDepth)
        {
            lvlStep1 = 2;
            levels1 /= 2;
        }

        // ensure that pyramid has required padding
        if(levels1 > 0)
        {
            Size fullSize;
            Point ofs;
            prevPyr[lvlStep1].locateROI(fullSize, ofs);
            CV_Assert(ofs.x >= winSize.width && ofs.y >= winSize.height
                && ofs.x + prevPyr[lvlStep1].cols + winSize.width <= fullSize.width
                && ofs.y + prevPyr[lvlStep1].rows + winSize.height <= fullSize.height);
        }

        if(levels1 < maxLevel)
            maxLevel = levels1;
    }

    if(_nextImg.kind() == _InputArray::STD_VECTOR_MAT)
    {
        _nextImg.getMatVector(nextPyr);

        levels2 = int(nextPyr.size()) - 1;
        CV_Assert(levels2 >= 0);

        if (levels2 % 2 == 1 && nextPyr[0].channels() * 2 == nextPyr[1].channels() && nextPyr[1].depth() == derivDepth)
        {
            lvlStep2 = 2;
            levels2 /= 2;
        }

        // ensure that pyramid has required padding
        if(levels2 > 0)
        {
            Size fullSize;
            Point ofs;
            nextPyr[lvlStep2].locateROI(fullSize, ofs);
            CV_Assert(ofs.x >= winSize.width && ofs.y >= winSize.height
                && ofs.x + nextPyr[lvlStep2].cols + winSize.width <= fullSize.width
                && ofs.y + nextPyr[lvlStep2].rows + winSize.height <= fullSize.height);
        }

        if(levels2 < maxLevel)
            maxLevel = levels2;
    }

    if (levels1 < 0)
        maxLevel = buildOpticalFlowPyramid(_prevImg, prevPyr, winSize, maxLevel, false);

    if (levels2 < 0)
        maxLevel = buildOpticalFlowPyramid(_nextImg, nextPyr, winSize, maxLevel, false);

    if( (criteria.type & TermCriteria::COUNT) == 0 )
        criteria.maxCount = 30;
    else
        criteria.maxCount = std::min(std::max(criteria.maxCount, 0), 100);
    if( (criteria.type & TermCriteria::EPS) == 0 )
        criteria.epsilon = 0.01;
    else
        criteria.epsilon = std::min(std::max(criteria.epsilon, 0.), 10.);
    criteria.epsilon *= criteria.epsilon;

    // dI/dx ~ Ix, dI/dy ~ Iy
    Mat derivIBuf;
    if(lvlStep1 == 1)
        derivIBuf.create(prevPyr[0].rows + winSize.height*2, prevPyr[0].cols + winSize.width*2, CV_MAKETYPE(derivDepth, prevPyr[0].channels() * 2));

    for( level = maxLevel; level >= 0; level-- )
    {
        Mat derivI;
        if(lvlStep1 == 1)
        {
            Size imgSize = prevPyr[level * lvlStep1].size();
            Mat _derivI( imgSize.height + winSize.height*2,
                imgSize.width + winSize.width*2, derivIBuf.type(), derivIBuf.ptr() );
            derivI = _derivI(Rect(winSize.width, winSize.height, imgSize.width, imgSize.height));
            calcSharrDeriv(prevPyr[level * lvlStep1], derivI);
            copyMakeBorder(derivI, _derivI, winSize.height, winSize.height, winSize.width, winSize.width, BORDER_CONSTANT|BORDER_ISOLATED);
        }
        else
            derivI = prevPyr[level * lvlStep1 + 1];

        CV_Assert(prevPyr[level * lvlStep1].size() == nextPyr[level * lvlStep2].size());
        CV_Assert(prevPyr[level * lvlStep1].type() == nextPyr[level * lvlStep2].type());

#ifdef HAVE_TEGRA_OPTIMIZATION
        typedef tegra::LKTrackerInvoker<cv::detail::LKTrackerInvoker> LKTrackerInvoker;
#else
        typedef cv::detail::LKTrackerInvoker LKTrackerInvoker;
#endif

        parallel_for_(Range(0, npoints), LKTrackerInvoker(prevPyr[level * lvlStep1], derivI,
                                                          nextPyr[level * lvlStep2], prevPts, nextPts,
                                                          status, err,
                                                          winSize, criteria, level, maxLevel,
                                                          flags, (float)minEigThreshold));
    }
}
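
// Illustrative usage sketch, kept compiled out. It assumes the caller
// includes <opencv2/imgproc.hpp> for goodFeaturesToTrack() and line();
// the helper name and parameter values are ours, not part of the library:
#if 0
static cv::Mat trackAndDrawExample(const cv::Mat& prev8u, const cv::Mat& next8u)
{
    // detect up to 500 corners in the first 8-bit, single-channel frame
    std::vector<cv::Point2f> prevPts, nextPts;
    cv::goodFeaturesToTrack(prev8u, prevPts, 500, 0.01, 10.);

    // track them into the second frame (3-level pyramid, 21x21 window)
    std::vector<uchar> status;
    std::vector<float> err;
    cv::calcOpticalFlowPyrLK(prev8u, next8u, prevPts, nextPts, status, err,
                             cv::Size(21, 21), 3);

    // draw the successfully tracked motion vectors
    cv::Mat vis = next8u.clone();
    for( size_t i = 0; i < prevPts.size(); i++ )
        if( status[i] )
            cv::line(vis, prevPts[i], nextPts[i], cv::Scalar::all(255));
    return vis;
}
#endif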

namespace cv
{

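// Solves the least-squares problem sum_i |T(a_i) - b_i|^2 over either a full
// 2x3 affine transform (6 unknowns) or a similarity transform constrained to
// [[c, -s, tx], [s, c, ty]] (4 unknowns: c, s, tx, ty), by accumulating the
// normal equations and solving them with eigenvalue decomposition.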
static void
getRTMatrix( const Point2f* a, const Point2f* b,
             int count, Mat& M, bool fullAffine )
{
    CV_Assert( M.isContinuous() );

    if( fullAffine )
    {
        double sa[6][6]={{0.}}, sb[6]={0.};
        Mat A( 6, 6, CV_64F, &sa[0][0] ), B( 6, 1, CV_64F, sb );
        Mat MM = M.reshape(1, 6);

        for( int i = 0; i < count; i++ )
        {
            sa[0][0] += a[i].x*a[i].x;
            sa[0][1] += a[i].y*a[i].x;
            sa[0][2] += a[i].x;

            sa[1][1] += a[i].y*a[i].y;
            sa[1][2] += a[i].y;

            sa[2][2] += 1;

            sb[0] += a[i].x*b[i].x;
            sb[1] += a[i].y*b[i].x;
            sb[2] += b[i].x;
            sb[3] += a[i].x*b[i].y;
            sb[4] += a[i].y*b[i].y;
            sb[5] += b[i].y;
        }

        sa[3][4] = sa[4][3] = sa[1][0] = sa[0][1];
        sa[3][5] = sa[5][3] = sa[2][0] = sa[0][2];
        sa[4][5] = sa[5][4] = sa[2][1] = sa[1][2];

        sa[3][3] = sa[0][0];
        sa[4][4] = sa[1][1];
        sa[5][5] = sa[2][2];

        solve( A, B, MM, DECOMP_EIG );
    }
    else
    {
        double sa[4][4]={{0.}}, sb[4]={0.}, m[4];
        Mat A( 4, 4, CV_64F, sa ), B( 4, 1, CV_64F, sb );
        Mat MM( 4, 1, CV_64F, m );

        for( int i = 0; i < count; i++ )
        {
            sa[0][0] += a[i].x*a[i].x + a[i].y*a[i].y;
            sa[0][2] += a[i].x;
            sa[0][3] += a[i].y;

            sa[2][1] += -a[i].y;
            sa[2][2] += 1;

            sa[3][0] += a[i].y;
            sa[3][1] += a[i].x;
            sa[3][3] += 1;

            sb[0] += a[i].x*b[i].x + a[i].y*b[i].y;
            sb[1] += a[i].x*b[i].y - a[i].y*b[i].x;
            sb[2] += b[i].x;
            sb[3] += b[i].y;
        }

        sa[1][1] = sa[0][0];
        sa[2][1] = sa[1][2] = -sa[0][3];
        sa[3][1] = sa[1][3] = sa[2][0] = sa[0][2];
        sa[2][2] = sa[3][3] = count;
        sa[3][0] = sa[0][3];

        solve( A, B, MM, DECOMP_EIG );

        double* om = M.ptr<double>();
        om[0] = om[4] = m[0];
        om[1] = -m[1];
        om[3] = m[1];
        om[2] = m[2];
        om[5] = m[3];
    }
}

} // namespace cv

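// Estimates a 2x3 rigid/affine transform between two point sets, or between
// two 8-bit images. For images, a regular grid of points is tracked from A to
// B with pyramidal LK, then RANSAC over random 3-point samples selects an
// inlier set before the final least-squares fit. Returns an empty Mat on failure.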
cv::Mat cv::estimateRigidTransform( InputArray src1, InputArray src2, bool fullAffine )
{
    Mat M(2, 3, CV_64F), A = src1.getMat(), B = src2.getMat();

    const int COUNT = 15;
    const int WIDTH = 160, HEIGHT = 120;
    const int RANSAC_MAX_ITERS = 500;
    const int RANSAC_SIZE0 = 3;
    const double RANSAC_GOOD_RATIO = 0.5;

    std::vector<Point2f> pA, pB;
    std::vector<int> good_idx;
    std::vector<uchar> status;

    double scale = 1.;
    int i, j, k, k1;

    RNG rng((uint64)-1);
    int good_count = 0;

    if( A.size() != B.size() )
        CV_Error( Error::StsUnmatchedSizes, "Both input images must have the same size" );

    if( A.type() != B.type() )
        CV_Error( Error::StsUnmatchedFormats, "Both input images must have the same data type" );

    int count = A.checkVector(2);

    if( count > 0 )
    {
        A.reshape(2, count).convertTo(pA, CV_32F);
        B.reshape(2, count).convertTo(pB, CV_32F);
    }
    else if( A.depth() == CV_8U )
    {
        int cn = A.channels();
        CV_Assert( cn == 1 || cn == 3 || cn == 4 );
        Size sz0 = A.size();
        Size sz1(WIDTH, HEIGHT);

        scale = std::max(1., std::max( (double)sz1.width/sz0.width, (double)sz1.height/sz0.height ));

        sz1.width = cvRound( sz0.width * scale );
        sz1.height = cvRound( sz0.height * scale );

        bool equalSizes = sz1.width == sz0.width && sz1.height == sz0.height;

        if( !equalSizes || cn != 1 )
        {
            Mat sA, sB;

            if( cn != 1 )
            {
                Mat gray;
                cvtColor(A, gray, COLOR_BGR2GRAY);
                resize(gray, sA, sz1, 0., 0., INTER_AREA);
                cvtColor(B, gray, COLOR_BGR2GRAY);
                resize(gray, sB, sz1, 0., 0., INTER_AREA);
            }
            else
            {
                resize(A, sA, sz1, 0., 0., INTER_AREA);
                resize(B, sB, sz1, 0., 0., INTER_AREA);
            }

            A = sA;
            B = sB;
        }

        int count_y = COUNT;
        int count_x = cvRound((double)COUNT*sz1.width/sz1.height);
        count = count_x * count_y;

        pA.resize(count);
        pB.resize(count);
        status.resize(count);

        for( i = 0, k = 0; i < count_y; i++ )
            for( j = 0; j < count_x; j++, k++ )
            {
                pA[k].x = (j+0.5f)*sz1.width/count_x;
                pA[k].y = (i+0.5f)*sz1.height/count_y;
            }

        // find the corresponding points in B
        calcOpticalFlowPyrLK(A, B, pA, pB, status, noArray(), Size(21, 21), 3,
                             TermCriteria(TermCriteria::MAX_ITER,40,0.1));

        // repack the remaining points
        for( i = 0, k = 0; i < count; i++ )
            if( status[i] )
            {
                if( i > k )
                {
                    pA[k] = pA[i];
                    pB[k] = pB[i];
                }
                k++;
            }
        count = k;
        pA.resize(count);
        pB.resize(count);
    }
    else
        CV_Error( Error::StsUnsupportedFormat, "Both input images must have either 8uC1 or 8uC3 type" );

    good_idx.resize(count);

    if( count < RANSAC_SIZE0 )
        return Mat();

    Rect brect = boundingRect(pB);

    // RANSAC stuff:
    // 1. find the consensus
    for( k = 0; k < RANSAC_MAX_ITERS; k++ )
    {
        int idx[RANSAC_SIZE0];
        Point2f a[RANSAC_SIZE0];
        Point2f b[RANSAC_SIZE0];

        // choose 3 random non-collinear points from A & B
        for( i = 0; i < RANSAC_SIZE0; i++ )
        {
            for( k1 = 0; k1 < RANSAC_MAX_ITERS; k1++ )
            {
                idx[i] = rng.uniform(0, count);

                for( j = 0; j < i; j++ )
                {
                    if( idx[j] == idx[i] )
                        break;
                    // check that the points are not too close to each other
                    if( fabs(pA[idx[i]].x - pA[idx[j]].x) +
                        fabs(pA[idx[i]].y - pA[idx[j]].y) < FLT_EPSILON )
                        break;
                    if( fabs(pB[idx[i]].x - pB[idx[j]].x) +
                        fabs(pB[idx[i]].y - pB[idx[j]].y) < FLT_EPSILON )
                        break;
                }

                if( j < i )
                    continue;

                if( i+1 == RANSAC_SIZE0 )
                {
                    // additional check for non-collinear vectors
                    a[0] = pA[idx[0]];
                    a[1] = pA[idx[1]];
                    a[2] = pA[idx[2]];

                    b[0] = pB[idx[0]];
                    b[1] = pB[idx[1]];
                    b[2] = pB[idx[2]];

                    double dax1 = a[1].x - a[0].x, day1 = a[1].y - a[0].y;
                    double dax2 = a[2].x - a[0].x, day2 = a[2].y - a[0].y;
                    double dbx1 = b[1].x - b[0].x, dby1 = b[1].y - b[0].y;
                    double dbx2 = b[2].x - b[0].x, dby2 = b[2].y - b[0].y;
                    const double eps = 0.01;

                    if( fabs(dax1*day2 - day1*dax2) < eps*std::sqrt(dax1*dax1+day1*day1)*std::sqrt(dax2*dax2+day2*day2) ||
                        fabs(dbx1*dby2 - dby1*dbx2) < eps*std::sqrt(dbx1*dbx1+dby1*dby1)*std::sqrt(dbx2*dbx2+dby2*dby2) )
                        continue;
                }
                break;
            }

            if( k1 >= RANSAC_MAX_ITERS )
                break;
        }

        if( i < RANSAC_SIZE0 )
            continue;

        // estimate the transformation using 3 points
        getRTMatrix( a, b, 3, M, fullAffine );

        const double* m = M.ptr<double>();
        for( i = 0, good_count = 0; i < count; i++ )
        {
            if( std::abs( m[0]*pA[i].x + m[1]*pA[i].y + m[2] - pB[i].x ) +
                std::abs( m[3]*pA[i].x + m[4]*pA[i].y + m[5] - pB[i].y ) < std::max(brect.width,brect.height)*0.05 )
                good_idx[good_count++] = i;
        }

        if( good_count >= count*RANSAC_GOOD_RATIO )
            break;
    }

    if( k >= RANSAC_MAX_ITERS )
        return Mat();

    if( good_count < count )
    {
        for( i = 0; i < good_count; i++ )
        {
            j = good_idx[i];
            pA[i] = pA[j];
            pB[i] = pB[j];
        }
    }

    getRTMatrix( &pA[0], &pB[0], good_count, M, fullAffine );
    M.at<double>(0, 2) /= scale;
    M.at<double>(1, 2) /= scale;

    return M;
}

/* End of file. */