1 // This file is auto-generated. Do not edit! 2 3 #include "precomp.hpp" 4 #include "opencl_kernels_video.hpp" 5 6 namespace cv 7 { 8 namespace ocl 9 { 10 namespace video 11 { 12 13 const struct ProgramEntry bgfg_mog2={"bgfg_mog2", 14 "#if CN==1\n" 15 "#define T_MEAN float\n" 16 "#define F_ZERO (0.0f)\n" 17 "#define cnMode 1\n" 18 "#define frameToMean(a, b) (b) = *(a);\n" 19 "#define meanToFrame(a, b) *b = convert_uchar_sat(a);\n" 20 "inline float sum(float val)\n" 21 "{\n" 22 "return val;\n" 23 "}\n" 24 "#else\n" 25 "#define T_MEAN float4\n" 26 "#define F_ZERO (0.0f, 0.0f, 0.0f, 0.0f)\n" 27 "#define cnMode 4\n" 28 "#define meanToFrame(a, b)\\\n" 29 "b[0] = convert_uchar_sat(a.x); \\\n" 30 "b[1] = convert_uchar_sat(a.y); \\\n" 31 "b[2] = convert_uchar_sat(a.z);\n" 32 "#define frameToMean(a, b)\\\n" 33 "b.x = a[0]; \\\n" 34 "b.y = a[1]; \\\n" 35 "b.z = a[2]; \\\n" 36 "b.w = 0.0f;\n" 37 "inline float sum(const float4 val)\n" 38 "{\n" 39 "return (val.x + val.y + val.z);\n" 40 "}\n" 41 "#endif\n" 42 "__kernel void mog2_kernel(__global const uchar* frame, int frame_step, int frame_offset, int frame_row, int frame_col,\n" 43 "__global uchar* modesUsed,\n" 44 "__global uchar* weight,\n" 45 "__global uchar* mean,\n" 46 "__global uchar* variance,\n" 47 "__global uchar* fgmask, int fgmask_step, int fgmask_offset,\n" 48 "float alphaT, float alpha1, float prune,\n" 49 "float c_Tb, float c_TB, float c_Tg, float c_varMin,\n" 50 "float c_varMax, float c_varInit, float c_tau\n" 51 "#ifdef SHADOW_DETECT\n" 52 ", uchar c_shadowVal\n" 53 "#endif\n" 54 ")\n" 55 "{\n" 56 "int x = get_global_id(0);\n" 57 "int y = get_global_id(1);\n" 58 "if( x < frame_col && y < frame_row)\n" 59 "{\n" 60 "__global const uchar* _frame = (frame + mad24(y, frame_step, mad24(x, CN, frame_offset)));\n" 61 "T_MEAN pix;\n" 62 "frameToMean(_frame, pix);\n" 63 "uchar foreground = 255;\n" 64 "bool fitsPDF = false;\n" 65 "int pt_idx = mad24(y, frame_col, x);\n" 66 "int idx_step = frame_row * frame_col;\n" 67 "__global uchar* _modesUsed = modesUsed + pt_idx;\n" 68 "uchar nmodes = _modesUsed[0];\n" 69 "float totalWeight = 0.0f;\n" 70 "__global float* _weight = (__global float*)(weight);\n" 71 "__global float* _variance = (__global float*)(variance);\n" 72 "__global T_MEAN* _mean = (__global T_MEAN*)(mean);\n" 73 "uchar mode = 0;\n" 74 "for (; mode < nmodes; ++mode)\n" 75 "{\n" 76 "int mode_idx = mad24(mode, idx_step, pt_idx);\n" 77 "float c_weight = mad(alpha1, _weight[mode_idx], prune);\n" 78 "float c_var = _variance[mode_idx];\n" 79 "T_MEAN c_mean = _mean[mode_idx];\n" 80 "T_MEAN diff = c_mean - pix;\n" 81 "float dist2 = dot(diff, diff);\n" 82 "if (totalWeight < c_TB && dist2 < c_Tb * c_var)\n" 83 "foreground = 0;\n" 84 "if (dist2 < c_Tg * c_var)\n" 85 "{\n" 86 "fitsPDF = true;\n" 87 "c_weight += alphaT;\n" 88 "float k = alphaT / c_weight;\n" 89 "T_MEAN mean_new = mad((T_MEAN)-k, diff, c_mean);\n" 90 "float variance_new = clamp(mad(k, (dist2 - c_var), c_var), c_varMin, c_varMax);\n" 91 "for (int i = mode; i > 0; --i)\n" 92 "{\n" 93 "int prev_idx = mode_idx - idx_step;\n" 94 "if (c_weight < _weight[prev_idx])\n" 95 "break;\n" 96 "_weight[mode_idx] = _weight[prev_idx];\n" 97 "_variance[mode_idx] = _variance[prev_idx];\n" 98 "_mean[mode_idx] = _mean[prev_idx];\n" 99 "mode_idx = prev_idx;\n" 100 "}\n" 101 "_mean[mode_idx] = mean_new;\n" 102 "_variance[mode_idx] = variance_new;\n" 103 "_weight[mode_idx] = c_weight;\n" 104 "totalWeight += c_weight;\n" 105 "mode ++;\n" 106 "break;\n" 107 "}\n" 108 "if (c_weight < -prune)\n" 109 "c_weight = 0.0f;\n" 110 "_weight[mode_idx] = c_weight;\n" 111 "totalWeight += c_weight;\n" 112 "}\n" 113 "for (; mode < nmodes; ++mode)\n" 114 "{\n" 115 "int mode_idx = mad24(mode, idx_step, pt_idx);\n" 116 "float c_weight = mad(alpha1, _weight[mode_idx], prune);\n" 117 "if (c_weight < -prune)\n" 118 "{\n" 119 "c_weight = 0.0f;\n" 120 "nmodes = mode;\n" 121 "break;\n" 122 "}\n" 123 "_weight[mode_idx] = c_weight;\n" 124 "totalWeight += c_weight;\n" 125 "}\n" 126 "if (0.f < totalWeight)\n" 127 "{\n" 128 "totalWeight = 1.f / totalWeight;\n" 129 "for (int mode = 0; mode < nmodes; ++mode)\n" 130 "_weight[mad24(mode, idx_step, pt_idx)] *= totalWeight;\n" 131 "}\n" 132 "if (!fitsPDF)\n" 133 "{\n" 134 "uchar mode = nmodes == (NMIXTURES) ? (NMIXTURES) - 1 : nmodes++;\n" 135 "int mode_idx = mad24(mode, idx_step, pt_idx);\n" 136 "if (nmodes == 1)\n" 137 "_weight[mode_idx] = 1.f;\n" 138 "else\n" 139 "{\n" 140 "_weight[mode_idx] = alphaT;\n" 141 "for (int i = pt_idx; i < mode_idx; i += idx_step)\n" 142 "_weight[i] *= alpha1;\n" 143 "}\n" 144 "for (int i = nmodes - 1; i > 0; --i)\n" 145 "{\n" 146 "int prev_idx = mode_idx - idx_step;\n" 147 "if (alphaT < _weight[prev_idx])\n" 148 "break;\n" 149 "_weight[mode_idx] = _weight[prev_idx];\n" 150 "_variance[mode_idx] = _variance[prev_idx];\n" 151 "_mean[mode_idx] = _mean[prev_idx];\n" 152 "mode_idx = prev_idx;\n" 153 "}\n" 154 "_mean[mode_idx] = pix;\n" 155 "_variance[mode_idx] = c_varInit;\n" 156 "}\n" 157 "_modesUsed[0] = nmodes;\n" 158 "#ifdef SHADOW_DETECT\n" 159 "if (foreground)\n" 160 "{\n" 161 "float tWeight = 0.0f;\n" 162 "for (uchar mode = 0; mode < nmodes; ++mode)\n" 163 "{\n" 164 "int mode_idx = mad24(mode, idx_step, pt_idx);\n" 165 "T_MEAN c_mean = _mean[mode_idx];\n" 166 "T_MEAN pix_mean = pix * c_mean;\n" 167 "float numerator = sum(pix_mean);\n" 168 "float denominator = dot(c_mean, c_mean);\n" 169 "if (denominator == 0)\n" 170 "break;\n" 171 "if (numerator <= denominator && numerator >= c_tau * denominator)\n" 172 "{\n" 173 "float a = numerator / denominator;\n" 174 "T_MEAN dD = mad(a, c_mean, -pix);\n" 175 "if (dot(dD, dD) < c_Tb * _variance[mode_idx] * a * a)\n" 176 "{\n" 177 "foreground = c_shadowVal;\n" 178 "break;\n" 179 "}\n" 180 "}\n" 181 "tWeight += _weight[mode_idx];\n" 182 "if (tWeight > c_TB)\n" 183 "break;\n" 184 "}\n" 185 "}\n" 186 "#endif\n" 187 "__global uchar* _fgmask = fgmask + mad24(y, fgmask_step, x + fgmask_offset);\n" 188 "*_fgmask = (uchar)foreground;\n" 189 "}\n" 190 "}\n" 191 "__kernel void getBackgroundImage2_kernel(__global const uchar* modesUsed,\n" 192 "__global const uchar* weight,\n" 193 "__global const uchar* mean,\n" 194 "__global uchar* dst, int dst_step, int dst_offset, int dst_row, int dst_col,\n" 195 "float c_TB)\n" 196 "{\n" 197 "int x = get_global_id(0);\n" 198 "int y = get_global_id(1);\n" 199 "if(x < dst_col && y < dst_row)\n" 200 "{\n" 201 "int pt_idx = mad24(y, dst_col, x);\n" 202 "__global const uchar* _modesUsed = modesUsed + pt_idx;\n" 203 "uchar nmodes = _modesUsed[0];\n" 204 "T_MEAN meanVal = (T_MEAN)F_ZERO;\n" 205 "float totalWeight = 0.0f;\n" 206 "__global const float* _weight = (__global const float*)weight;\n" 207 "__global const T_MEAN* _mean = (__global const T_MEAN*)(mean);\n" 208 "int idx_step = dst_row * dst_col;\n" 209 "for (uchar mode = 0; mode < nmodes; ++mode)\n" 210 "{\n" 211 "int mode_idx = mad24(mode, idx_step, pt_idx);\n" 212 "float c_weight = _weight[mode_idx];\n" 213 "T_MEAN c_mean = _mean[mode_idx];\n" 214 "meanVal = mad(c_weight, c_mean, meanVal);\n" 215 "totalWeight += c_weight;\n" 216 "if (totalWeight > c_TB)\n" 217 "break;\n" 218 "}\n" 219 "if (0.f < totalWeight)\n" 220 "meanVal = meanVal / totalWeight;\n" 221 "else\n" 222 "meanVal = (T_MEAN)(0.f);\n" 223 "__global uchar* _dst = dst + mad24(y, dst_step, mad24(x, CN, dst_offset));\n" 224 "meanToFrame(meanVal, _dst);\n" 225 "}\n" 226 "}\n" 227 , "b6e3850899862b7f0ab67cb32f1d52e9"}; 228 ProgramSource bgfg_mog2_oclsrc(bgfg_mog2.programStr); 229 const struct ProgramEntry optical_flow_farneback={"optical_flow_farneback", 230 "#define tx (int)get_local_id(0)\n" 231 "#define ty get_local_id(1)\n" 232 "#define bx get_group_id(0)\n" 233 "#define bdx (int)get_local_size(0)\n" 234 "#define BORDER_SIZE 5\n" 235 "#define MAX_KSIZE_HALF 100\n" 236 "#ifndef polyN\n" 237 "#define polyN 5\n" 238 "#endif\n" 239 "#if USE_DOUBLE\n" 240 "#ifdef cl_amd_fp64\n" 241 "#pragma OPENCL EXTENSION cl_amd_fp64:enable\n" 242 "#elif defined (cl_khr_fp64)\n" 243 "#pragma OPENCL EXTENSION cl_khr_fp64:enable\n" 244 "#endif\n" 245 "#define TYPE double\n" 246 "#define VECTYPE double4\n" 247 "#else\n" 248 "#define TYPE float\n" 249 "#define VECTYPE float4\n" 250 "#endif\n" 251 "__kernel void polynomialExpansion(__global __const float * src, int srcStep,\n" 252 "__global float * dst, int dstStep,\n" 253 "const int rows, const int cols,\n" 254 "__global __const float * c_g,\n" 255 "__global __const float * c_xg,\n" 256 "__global __const float * c_xxg,\n" 257 "__local float * smem,\n" 258 "const VECTYPE ig)\n" 259 "{\n" 260 "const int y = get_global_id(1);\n" 261 "const int x = bx * (bdx - 2*polyN) + tx - polyN;\n" 262 "int xWarped;\n" 263 "__local float *row = smem + tx;\n" 264 "if (y < rows && y >= 0)\n" 265 "{\n" 266 "xWarped = min(max(x, 0), cols - 1);\n" 267 "row[0] = src[mad24(y, srcStep, xWarped)] * c_g[0];\n" 268 "row[bdx] = 0.f;\n" 269 "row[2*bdx] = 0.f;\n" 270 "#pragma unroll\n" 271 "for (int k = 1; k <= polyN; ++k)\n" 272 "{\n" 273 "float t0 = src[mad24(max(y - k, 0), srcStep, xWarped)];\n" 274 "float t1 = src[mad24(min(y + k, rows - 1), srcStep, xWarped)];\n" 275 "row[0] += c_g[k] * (t0 + t1);\n" 276 "row[bdx] += c_xg[k] * (t1 - t0);\n" 277 "row[2*bdx] += c_xxg[k] * (t0 + t1);\n" 278 "}\n" 279 "}\n" 280 "barrier(CLK_LOCAL_MEM_FENCE);\n" 281 "if (y < rows && y >= 0 && tx >= polyN && tx + polyN < bdx && x < cols)\n" 282 "{\n" 283 "TYPE b1 = c_g[0] * row[0];\n" 284 "TYPE b3 = c_g[0] * row[bdx];\n" 285 "TYPE b5 = c_g[0] * row[2*bdx];\n" 286 "TYPE b2 = 0, b4 = 0, b6 = 0;\n" 287 "#pragma unroll\n" 288 "for (int k = 1; k <= polyN; ++k)\n" 289 "{\n" 290 "b1 += (row[k] + row[-k]) * c_g[k];\n" 291 "b4 += (row[k] + row[-k]) * c_xxg[k];\n" 292 "b2 += (row[k] - row[-k]) * c_xg[k];\n" 293 "b3 += (row[k + bdx] + row[-k + bdx]) * c_g[k];\n" 294 "b6 += (row[k + bdx] - row[-k + bdx]) * c_xg[k];\n" 295 "b5 += (row[k + 2*bdx] + row[-k + 2*bdx]) * c_g[k];\n" 296 "}\n" 297 "dst[mad24(y, dstStep, xWarped)] = (float)(b3*ig.s0);\n" 298 "dst[mad24(rows + y, dstStep, xWarped)] = (float)(b2*ig.s0);\n" 299 "dst[mad24(2*rows + y, dstStep, xWarped)] = (float)(b1*ig.s1 + b5*ig.s2);\n" 300 "dst[mad24(3*rows + y, dstStep, xWarped)] = (float)(b1*ig.s1 + b4*ig.s2);\n" 301 "dst[mad24(4*rows + y, dstStep, xWarped)] = (float)(b6*ig.s3);\n" 302 "}\n" 303 "}\n" 304 "inline int idx_row_low(const int y, const int last_row)\n" 305 "{\n" 306 "return abs(y) % (last_row + 1);\n" 307 "}\n" 308 "inline int idx_row_high(const int y, const int last_row)\n" 309 "{\n" 310 "return abs(last_row - abs(last_row - y)) % (last_row + 1);\n" 311 "}\n" 312 "inline int idx_col_low(const int x, const int last_col)\n" 313 "{\n" 314 "return abs(x) % (last_col + 1);\n" 315 "}\n" 316 "inline int idx_col_high(const int x, const int last_col)\n" 317 "{\n" 318 "return abs(last_col - abs(last_col - x)) % (last_col + 1);\n" 319 "}\n" 320 "inline int idx_col(const int x, const int last_col)\n" 321 "{\n" 322 "return idx_col_low(idx_col_high(x, last_col), last_col);\n" 323 "}\n" 324 "__kernel void gaussianBlur(__global const float * src, int srcStep,\n" 325 "__global float * dst, int dstStep, const int rows, const int cols,\n" 326 "__global const float * c_gKer, const int ksizeHalf,\n" 327 "__local float * smem)\n" 328 "{\n" 329 "const int y = get_global_id(1);\n" 330 "const int x = get_global_id(0);\n" 331 "__local float *row = smem + ty * (bdx + 2*ksizeHalf);\n" 332 "if (y < rows)\n" 333 "{\n" 334 "for (int i = tx; i < bdx + 2*ksizeHalf; i += bdx)\n" 335 "{\n" 336 "int xExt = (int)(bx * bdx) + i - ksizeHalf;\n" 337 "xExt = idx_col(xExt, cols - 1);\n" 338 "row[i] = src[mad24(y, srcStep, xExt)] * c_gKer[0];\n" 339 "for (int j = 1; j <= ksizeHalf; ++j)\n" 340 "row[i] += (src[mad24(idx_row_low(y - j, rows - 1), srcStep, xExt)]\n" 341 "+ src[mad24(idx_row_high(y + j, rows - 1), srcStep, xExt)]) * c_gKer[j];\n" 342 "}\n" 343 "}\n" 344 "barrier(CLK_LOCAL_MEM_FENCE);\n" 345 "if (y < rows && y >= 0 && x < cols && x >= 0)\n" 346 "{\n" 347 "row += tx + ksizeHalf;\n" 348 "float res = row[0] * c_gKer[0];\n" 349 "for (int i = 1; i <= ksizeHalf; ++i)\n" 350 "res += (row[-i] + row[i]) * c_gKer[i];\n" 351 "dst[mad24(y, dstStep, x)] = res;\n" 352 "}\n" 353 "}\n" 354 "__kernel void gaussianBlur5(__global const float * src, int srcStep,\n" 355 "__global float * dst, int dstStep,\n" 356 "const int rows, const int cols,\n" 357 "__global const float * c_gKer, const int ksizeHalf,\n" 358 "__local float * smem)\n" 359 "{\n" 360 "const int y = get_global_id(1);\n" 361 "const int x = get_global_id(0);\n" 362 "const int smw = bdx + 2*ksizeHalf;\n" 363 "__local volatile float *row = smem + 5 * ty * smw;\n" 364 "if (y < rows)\n" 365 "{\n" 366 "for (int i = tx; i < bdx + 2*ksizeHalf; i += bdx)\n" 367 "{\n" 368 "int xExt = (int)(bx * bdx) + i - ksizeHalf;\n" 369 "xExt = idx_col(xExt, cols - 1);\n" 370 "#pragma unroll\n" 371 "for (int k = 0; k < 5; ++k)\n" 372 "row[k*smw + i] = src[mad24(k*rows + y, srcStep, xExt)] * c_gKer[0];\n" 373 "for (int j = 1; j <= ksizeHalf; ++j)\n" 374 "#pragma unroll\n" 375 "for (int k = 0; k < 5; ++k)\n" 376 "row[k*smw + i] +=\n" 377 "(src[mad24(k*rows + idx_row_low(y - j, rows - 1), srcStep, xExt)] +\n" 378 "src[mad24(k*rows + idx_row_high(y + j, rows - 1), srcStep, xExt)]) * c_gKer[j];\n" 379 "}\n" 380 "}\n" 381 "barrier(CLK_LOCAL_MEM_FENCE);\n" 382 "if (y < rows && y >= 0 && x < cols && x >= 0)\n" 383 "{\n" 384 "row += tx + ksizeHalf;\n" 385 "float res[5];\n" 386 "#pragma unroll\n" 387 "for (int k = 0; k < 5; ++k)\n" 388 "res[k] = row[k*smw] * c_gKer[0];\n" 389 "for (int i = 1; i <= ksizeHalf; ++i)\n" 390 "#pragma unroll\n" 391 "for (int k = 0; k < 5; ++k)\n" 392 "res[k] += (row[k*smw - i] + row[k*smw + i]) * c_gKer[i];\n" 393 "#pragma unroll\n" 394 "for (int k = 0; k < 5; ++k)\n" 395 "dst[mad24(k*rows + y, dstStep, x)] = res[k];\n" 396 "}\n" 397 "}\n" 398 "__constant float c_border[BORDER_SIZE + 1] = { 0.14f, 0.14f, 0.4472f, 0.4472f, 0.4472f, 1.f };\n" 399 "__kernel void updateMatrices(__global const float * flowx, int xStep,\n" 400 "__global const float * flowy, int yStep,\n" 401 "const int rows, const int cols,\n" 402 "__global const float * R0, int R0Step,\n" 403 "__global const float * R1, int R1Step,\n" 404 "__global float * M, int mStep)\n" 405 "{\n" 406 "const int y = get_global_id(1);\n" 407 "const int x = get_global_id(0);\n" 408 "if (y < rows && y >= 0 && x < cols && x >= 0)\n" 409 "{\n" 410 "float dx = flowx[mad24(y, xStep, x)];\n" 411 "float dy = flowy[mad24(y, yStep, x)];\n" 412 "float fx = x + dx;\n" 413 "float fy = y + dy;\n" 414 "int x1 = convert_int(floor(fx));\n" 415 "int y1 = convert_int(floor(fy));\n" 416 "fx -= x1;\n" 417 "fy -= y1;\n" 418 "float r2, r3, r4, r5, r6;\n" 419 "if (x1 >= 0 && y1 >= 0 && x1 < cols - 1 && y1 < rows - 1)\n" 420 "{\n" 421 "float a00 = (1.f - fx) * (1.f - fy);\n" 422 "float a01 = fx * (1.f - fy);\n" 423 "float a10 = (1.f - fx) * fy;\n" 424 "float a11 = fx * fy;\n" 425 "r2 = a00 * R1[mad24(y1, R1Step, x1)] +\n" 426 "a01 * R1[mad24(y1, R1Step, x1 + 1)] +\n" 427 "a10 * R1[mad24(y1 + 1, R1Step, x1)] +\n" 428 "a11 * R1[mad24(y1 + 1, R1Step, x1 + 1)];\n" 429 "r3 = a00 * R1[mad24(rows + y1, R1Step, x1)] +\n" 430 "a01 * R1[mad24(rows + y1, R1Step, x1 + 1)] +\n" 431 "a10 * R1[mad24(rows + y1 + 1, R1Step, x1)] +\n" 432 "a11 * R1[mad24(rows + y1 + 1, R1Step, x1 + 1)];\n" 433 "r4 = a00 * R1[mad24(2*rows + y1, R1Step, x1)] +\n" 434 "a01 * R1[mad24(2*rows + y1, R1Step, x1 + 1)] +\n" 435 "a10 * R1[mad24(2*rows + y1 + 1, R1Step, x1)] +\n" 436 "a11 * R1[mad24(2*rows + y1 + 1, R1Step, x1 + 1)];\n" 437 "r5 = a00 * R1[mad24(3*rows + y1, R1Step, x1)] +\n" 438 "a01 * R1[mad24(3*rows + y1, R1Step, x1 + 1)] +\n" 439 "a10 * R1[mad24(3*rows + y1 + 1, R1Step, x1)] +\n" 440 "a11 * R1[mad24(3*rows + y1 + 1, R1Step, x1 + 1)];\n" 441 "r6 = a00 * R1[mad24(4*rows + y1, R1Step, x1)] +\n" 442 "a01 * R1[mad24(4*rows + y1, R1Step, x1 + 1)] +\n" 443 "a10 * R1[mad24(4*rows + y1 + 1, R1Step, x1)] +\n" 444 "a11 * R1[mad24(4*rows + y1 + 1, R1Step, x1 + 1)];\n" 445 "r4 = (R0[mad24(2*rows + y, R0Step, x)] + r4) * 0.5f;\n" 446 "r5 = (R0[mad24(3*rows + y, R0Step, x)] + r5) * 0.5f;\n" 447 "r6 = (R0[mad24(4*rows + y, R0Step, x)] + r6) * 0.25f;\n" 448 "}\n" 449 "else\n" 450 "{\n" 451 "r2 = r3 = 0.f;\n" 452 "r4 = R0[mad24(2*rows + y, R0Step, x)];\n" 453 "r5 = R0[mad24(3*rows + y, R0Step, x)];\n" 454 "r6 = R0[mad24(4*rows + y, R0Step, x)] * 0.5f;\n" 455 "}\n" 456 "r2 = (R0[mad24(y, R0Step, x)] - r2) * 0.5f;\n" 457 "r3 = (R0[mad24(rows + y, R0Step, x)] - r3) * 0.5f;\n" 458 "r2 += r4*dy + r6*dx;\n" 459 "r3 += r6*dy + r5*dx;\n" 460 "float scale =\n" 461 "c_border[min(x, BORDER_SIZE)] *\n" 462 "c_border[min(y, BORDER_SIZE)] *\n" 463 "c_border[min(cols - x - 1, BORDER_SIZE)] *\n" 464 "c_border[min(rows - y - 1, BORDER_SIZE)];\n" 465 "r2 *= scale;\n" 466 "r3 *= scale;\n" 467 "r4 *= scale;\n" 468 "r5 *= scale;\n" 469 "r6 *= scale;\n" 470 "M[mad24(y, mStep, x)] = r4*r4 + r6*r6;\n" 471 "M[mad24(rows + y, mStep, x)] = (r4 + r5)*r6;\n" 472 "M[mad24(2*rows + y, mStep, x)] = r5*r5 + r6*r6;\n" 473 "M[mad24(3*rows + y, mStep, x)] = r4*r2 + r6*r3;\n" 474 "M[mad24(4*rows + y, mStep, x)] = r6*r2 + r5*r3;\n" 475 "}\n" 476 "}\n" 477 "__kernel void boxFilter5(__global const float * src, int srcStep,\n" 478 "__global float * dst, int dstStep,\n" 479 "const int rows, const int cols,\n" 480 "const int ksizeHalf,\n" 481 "__local float * smem)\n" 482 "{\n" 483 "const int y = get_global_id(1);\n" 484 "const int x = get_global_id(0);\n" 485 "const float boxAreaInv = 1.f / ((1 + 2*ksizeHalf) * (1 + 2*ksizeHalf));\n" 486 "const int smw = bdx + 2*ksizeHalf;\n" 487 "__local float *row = smem + 5 * ty * smw;\n" 488 "if (y < rows)\n" 489 "{\n" 490 "for (int i = tx; i < bdx + 2*ksizeHalf; i += bdx)\n" 491 "{\n" 492 "int xExt = (int)(bx * bdx) + i - ksizeHalf;\n" 493 "xExt = min(max(xExt, 0), cols - 1);\n" 494 "#pragma unroll\n" 495 "for (int k = 0; k < 5; ++k)\n" 496 "row[k*smw + i] = src[mad24(k*rows + y, srcStep, xExt)];\n" 497 "for (int j = 1; j <= ksizeHalf; ++j)\n" 498 "#pragma unroll\n" 499 "for (int k = 0; k < 5; ++k)\n" 500 "row[k*smw + i] +=\n" 501 "src[mad24(k*rows + max(y - j, 0), srcStep, xExt)] +\n" 502 "src[mad24(k*rows + min(y + j, rows - 1), srcStep, xExt)];\n" 503 "}\n" 504 "}\n" 505 "barrier(CLK_LOCAL_MEM_FENCE);\n" 506 "if (y < rows && y >= 0 && x < cols && x >= 0)\n" 507 "{\n" 508 "row += tx + ksizeHalf;\n" 509 "float res[5];\n" 510 "#pragma unroll\n" 511 "for (int k = 0; k < 5; ++k)\n" 512 "res[k] = row[k*smw];\n" 513 "for (int i = 1; i <= ksizeHalf; ++i)\n" 514 "#pragma unroll\n" 515 "for (int k = 0; k < 5; ++k)\n" 516 "res[k] += row[k*smw - i] + row[k*smw + i];\n" 517 "#pragma unroll\n" 518 "for (int k = 0; k < 5; ++k)\n" 519 "dst[mad24(k*rows + y, dstStep, x)] = res[k] * boxAreaInv;\n" 520 "}\n" 521 "}\n" 522 "__kernel void updateFlow(__global const float * M, int mStep,\n" 523 "__global float * flowx, int xStep,\n" 524 "__global float * flowy, int yStep,\n" 525 "const int rows, const int cols)\n" 526 "{\n" 527 "const int y = get_global_id(1);\n" 528 "const int x = get_global_id(0);\n" 529 "if (y < rows && y >= 0 && x < cols && x >= 0)\n" 530 "{\n" 531 "float g11 = M[mad24(y, mStep, x)];\n" 532 "float g12 = M[mad24(rows + y, mStep, x)];\n" 533 "float g22 = M[mad24(2*rows + y, mStep, x)];\n" 534 "float h1 = M[mad24(3*rows + y, mStep, x)];\n" 535 "float h2 = M[mad24(4*rows + y, mStep, x)];\n" 536 "float detInv = 1.f / (g11*g22 - g12*g12 + 1e-3f);\n" 537 "flowx[mad24(y, xStep, x)] = (g11*h2 - g12*h1) * detInv;\n" 538 "flowy[mad24(y, yStep, x)] = (g22*h1 - g12*h2) * detInv;\n" 539 "}\n" 540 "}\n" 541 , "529300e6242f574f83d11a089cc120c0"}; 542 ProgramSource optical_flow_farneback_oclsrc(optical_flow_farneback.programStr); 543 const struct ProgramEntry optical_flow_tvl1={"optical_flow_tvl1", 544 "__kernel void centeredGradientKernel(__global const float* src_ptr, int src_col, int src_row, int src_step,\n" 545 "__global float* dx, __global float* dy, int d_step)\n" 546 "{\n" 547 "int x = get_global_id(0);\n" 548 "int y = get_global_id(1);\n" 549 "if((x < src_col)&&(y < src_row))\n" 550 "{\n" 551 "int src_x1 = (x + 1) < (src_col -1)? (x + 1) : (src_col - 1);\n" 552 "int src_x2 = (x - 1) > 0 ? (x -1) : 0;\n" 553 "dx[y * d_step+ x] = 0.5f * (src_ptr[y * src_step + src_x1] - src_ptr[y * src_step+ src_x2]);\n" 554 "int src_y1 = (y+1) < (src_row - 1) ? (y + 1) : (src_row - 1);\n" 555 "int src_y2 = (y - 1) > 0 ? (y - 1) : 0;\n" 556 "dy[y * d_step+ x] = 0.5f * (src_ptr[src_y1 * src_step + x] - src_ptr[src_y2 * src_step+ x]);\n" 557 "}\n" 558 "}\n" 559 "inline float bicubicCoeff(float x_)\n" 560 "{\n" 561 "float x = fabs(x_);\n" 562 "if (x <= 1.0f)\n" 563 "return x * x * (1.5f * x - 2.5f) + 1.0f;\n" 564 "else if (x < 2.0f)\n" 565 "return x * (x * (-0.5f * x + 2.5f) - 4.0f) + 2.0f;\n" 566 "else\n" 567 "return 0.0f;\n" 568 "}\n" 569 "__kernel void warpBackwardKernel(__global const float* I0, int I0_step, int I0_col, int I0_row,\n" 570 "image2d_t tex_I1, image2d_t tex_I1x, image2d_t tex_I1y,\n" 571 "__global const float* u1, int u1_step,\n" 572 "__global const float* u2,\n" 573 "__global float* I1w,\n" 574 "__global float* I1wx, \n" 575 "__global float* I1wy, \n" 576 "__global float* grad, \n" 577 "__global float* rho,\n" 578 "int I1w_step,\n" 579 "int u2_step,\n" 580 "int u1_offset_x,\n" 581 "int u1_offset_y,\n" 582 "int u2_offset_x,\n" 583 "int u2_offset_y)\n" 584 "{\n" 585 "int x = get_global_id(0);\n" 586 "int y = get_global_id(1);\n" 587 "if(x < I0_col&&y < I0_row)\n" 588 "{\n" 589 "float u1Val = u1[(y + u1_offset_y) * u1_step + x + u1_offset_x];\n" 590 "float u2Val = u2[(y + u2_offset_y) * u2_step + x + u2_offset_x];\n" 591 "float wx = x + u1Val;\n" 592 "float wy = y + u2Val;\n" 593 "int xmin = ceil(wx - 2.0f);\n" 594 "int xmax = floor(wx + 2.0f);\n" 595 "int ymin = ceil(wy - 2.0f);\n" 596 "int ymax = floor(wy + 2.0f);\n" 597 "float sum = 0.0f;\n" 598 "float sumx = 0.0f;\n" 599 "float sumy = 0.0f;\n" 600 "float wsum = 0.0f;\n" 601 "sampler_t sampleri = CLK_NORMALIZED_COORDS_FALSE | CLK_ADDRESS_CLAMP_TO_EDGE | CLK_FILTER_NEAREST;\n" 602 "for (int cy = ymin; cy <= ymax; ++cy)\n" 603 "{\n" 604 "for (int cx = xmin; cx <= xmax; ++cx)\n" 605 "{\n" 606 "float w = bicubicCoeff(wx - cx) * bicubicCoeff(wy - cy);\n" 607 "int2 cood = (int2)(cx, cy);\n" 608 "sum += w * read_imagef(tex_I1, sampleri, cood).x;\n" 609 "sumx += w * read_imagef(tex_I1x, sampleri, cood).x;\n" 610 "sumy += w * read_imagef(tex_I1y, sampleri, cood).x;\n" 611 "wsum += w;\n" 612 "}\n" 613 "}\n" 614 "float coeff = 1.0f / wsum;\n" 615 "float I1wVal = sum * coeff;\n" 616 "float I1wxVal = sumx * coeff;\n" 617 "float I1wyVal = sumy * coeff;\n" 618 "I1w[y * I1w_step + x] = I1wVal;\n" 619 "I1wx[y * I1w_step + x] = I1wxVal;\n" 620 "I1wy[y * I1w_step + x] = I1wyVal;\n" 621 "float Ix2 = I1wxVal * I1wxVal;\n" 622 "float Iy2 = I1wyVal * I1wyVal;\n" 623 "grad[y * I1w_step + x] = Ix2 + Iy2;\n" 624 "float I0Val = I0[y * I0_step + x];\n" 625 "rho[y * I1w_step + x] = I1wVal - I1wxVal * u1Val - I1wyVal * u2Val - I0Val;\n" 626 "}\n" 627 "}\n" 628 "inline float readImage(__global float *image, int x, int y, int rows, int cols, int elemCntPerRow)\n" 629 "{\n" 630 "int i0 = clamp(x, 0, cols - 1);\n" 631 "int j0 = clamp(y, 0, rows - 1);\n" 632 "return image[j0 * elemCntPerRow + i0];\n" 633 "}\n" 634 "__kernel void warpBackwardKernelNoImage2d(__global const float* I0, int I0_step, int I0_col, int I0_row,\n" 635 "__global const float* tex_I1, __global const float* tex_I1x, __global const float* tex_I1y,\n" 636 "__global const float* u1, int u1_step,\n" 637 "__global const float* u2,\n" 638 "__global float* I1w,\n" 639 "__global float* I1wx, \n" 640 "__global float* I1wy, \n" 641 "__global float* grad, \n" 642 "__global float* rho,\n" 643 "int I1w_step,\n" 644 "int u2_step,\n" 645 "int I1_step,\n" 646 "int I1x_step)\n" 647 "{\n" 648 "int x = get_global_id(0);\n" 649 "int y = get_global_id(1);\n" 650 "if(x < I0_col&&y < I0_row)\n" 651 "{\n" 652 "float u1Val = u1[y * u1_step + x];\n" 653 "float u2Val = u2[y * u2_step + x];\n" 654 "float wx = x + u1Val;\n" 655 "float wy = y + u2Val;\n" 656 "int xmin = ceil(wx - 2.0f);\n" 657 "int xmax = floor(wx + 2.0f);\n" 658 "int ymin = ceil(wy - 2.0f);\n" 659 "int ymax = floor(wy + 2.0f);\n" 660 "float sum = 0.0f;\n" 661 "float sumx = 0.0f;\n" 662 "float sumy = 0.0f;\n" 663 "float wsum = 0.0f;\n" 664 "for (int cy = ymin; cy <= ymax; ++cy)\n" 665 "{\n" 666 "for (int cx = xmin; cx <= xmax; ++cx)\n" 667 "{\n" 668 "float w = bicubicCoeff(wx - cx) * bicubicCoeff(wy - cy);\n" 669 "int2 cood = (int2)(cx, cy);\n" 670 "sum += w * readImage(tex_I1, cood.x, cood.y, I0_col, I0_row, I1_step);\n" 671 "sumx += w * readImage(tex_I1x, cood.x, cood.y, I0_col, I0_row, I1x_step);\n" 672 "sumy += w * readImage(tex_I1y, cood.x, cood.y, I0_col, I0_row, I1x_step);\n" 673 "wsum += w;\n" 674 "}\n" 675 "}\n" 676 "float coeff = 1.0f / wsum;\n" 677 "float I1wVal = sum * coeff;\n" 678 "float I1wxVal = sumx * coeff;\n" 679 "float I1wyVal = sumy * coeff;\n" 680 "I1w[y * I1w_step + x] = I1wVal;\n" 681 "I1wx[y * I1w_step + x] = I1wxVal;\n" 682 "I1wy[y * I1w_step + x] = I1wyVal;\n" 683 "float Ix2 = I1wxVal * I1wxVal;\n" 684 "float Iy2 = I1wyVal * I1wyVal;\n" 685 "grad[y * I1w_step + x] = Ix2 + Iy2;\n" 686 "float I0Val = I0[y * I0_step + x];\n" 687 "rho[y * I1w_step + x] = I1wVal - I1wxVal * u1Val - I1wyVal * u2Val - I0Val;\n" 688 "}\n" 689 "}\n" 690 "__kernel void estimateDualVariablesKernel(__global const float* u1, int u1_col, int u1_row, int u1_step,\n" 691 "__global const float* u2,\n" 692 "__global float* p11, int p11_step,\n" 693 "__global float* p12,\n" 694 "__global float* p21,\n" 695 "__global float* p22,\n" 696 "float taut,\n" 697 "int u2_step,\n" 698 "int u1_offset_x,\n" 699 "int u1_offset_y,\n" 700 "int u2_offset_x,\n" 701 "int u2_offset_y)\n" 702 "{\n" 703 "int x = get_global_id(0);\n" 704 "int y = get_global_id(1);\n" 705 "if(x < u1_col && y < u1_row)\n" 706 "{\n" 707 "int src_x1 = (x + 1) < (u1_col - 1) ? (x + 1) : (u1_col - 1);\n" 708 "float u1x = u1[(y + u1_offset_y) * u1_step + src_x1 + u1_offset_x] - u1[(y + u1_offset_y) * u1_step + x + u1_offset_x];\n" 709 "int src_y1 = (y + 1) < (u1_row - 1) ? (y + 1) : (u1_row - 1);\n" 710 "float u1y = u1[(src_y1 + u1_offset_y) * u1_step + x + u1_offset_x] - u1[(y + u1_offset_y) * u1_step + x + u1_offset_x];\n" 711 "int src_x2 = (x + 1) < (u1_col - 1) ? (x + 1) : (u1_col - 1);\n" 712 "float u2x = u2[(y + u2_offset_y) * u2_step + src_x2 + u2_offset_x] - u2[(y + u2_offset_y) * u2_step + x + u2_offset_x];\n" 713 "int src_y2 = (y + 1) < (u1_row - 1) ? (y + 1) : (u1_row - 1);\n" 714 "float u2y = u2[(src_y2 + u2_offset_y) * u2_step + x + u2_offset_x] - u2[(y + u2_offset_y) * u2_step + x + u2_offset_x];\n" 715 "float g1 = hypot(u1x, u1y);\n" 716 "float g2 = hypot(u2x, u2y);\n" 717 "float ng1 = 1.0f + taut * g1;\n" 718 "float ng2 = 1.0f + taut * g2;\n" 719 "p11[y * p11_step + x] = (p11[y * p11_step + x] + taut * u1x) / ng1;\n" 720 "p12[y * p11_step + x] = (p12[y * p11_step + x] + taut * u1y) / ng1;\n" 721 "p21[y * p11_step + x] = (p21[y * p11_step + x] + taut * u2x) / ng2;\n" 722 "p22[y * p11_step + x] = (p22[y * p11_step + x] + taut * u2y) / ng2;\n" 723 "}\n" 724 "}\n" 725 "inline float divergence(__global const float* v1, __global const float* v2, int y, int x, int v1_step, int v2_step)\n" 726 "{\n" 727 "if (x > 0 && y > 0)\n" 728 "{\n" 729 "float v1x = v1[y * v1_step + x] - v1[y * v1_step + x - 1];\n" 730 "float v2y = v2[y * v2_step + x] - v2[(y - 1) * v2_step + x];\n" 731 "return v1x + v2y;\n" 732 "}\n" 733 "else\n" 734 "{\n" 735 "if (y > 0)\n" 736 "return v1[y * v1_step + 0] + v2[y * v2_step + 0] - v2[(y - 1) * v2_step + 0];\n" 737 "else\n" 738 "{\n" 739 "if (x > 0)\n" 740 "return v1[0 * v1_step + x] - v1[0 * v1_step + x - 1] + v2[0 * v2_step + x];\n" 741 "else\n" 742 "return v1[0 * v1_step + 0] + v2[0 * v2_step + 0];\n" 743 "}\n" 744 "}\n" 745 "}\n" 746 "__kernel void estimateUKernel(__global const float* I1wx, int I1wx_col, int I1wx_row, int I1wx_step,\n" 747 "__global const float* I1wy, \n" 748 "__global const float* grad, \n" 749 "__global const float* rho_c, \n" 750 "__global const float* p11, \n" 751 "__global const float* p12, \n" 752 "__global const float* p21, \n" 753 "__global const float* p22, \n" 754 "__global float* u1, int u1_step,\n" 755 "__global float* u2,\n" 756 "__global float* error, float l_t, float theta, int u2_step,\n" 757 "int u1_offset_x,\n" 758 "int u1_offset_y,\n" 759 "int u2_offset_x,\n" 760 "int u2_offset_y,\n" 761 "char calc_error)\n" 762 "{\n" 763 "int x = get_global_id(0);\n" 764 "int y = get_global_id(1);\n" 765 "if(x < I1wx_col && y < I1wx_row)\n" 766 "{\n" 767 "float I1wxVal = I1wx[y * I1wx_step + x];\n" 768 "float I1wyVal = I1wy[y * I1wx_step + x];\n" 769 "float gradVal = grad[y * I1wx_step + x];\n" 770 "float u1OldVal = u1[(y + u1_offset_y) * u1_step + x + u1_offset_x];\n" 771 "float u2OldVal = u2[(y + u2_offset_y) * u2_step + x + u2_offset_x];\n" 772 "float rho = rho_c[y * I1wx_step + x] + (I1wxVal * u1OldVal + I1wyVal * u2OldVal);\n" 773 "float d1 = 0.0f;\n" 774 "float d2 = 0.0f;\n" 775 "if (rho < -l_t * gradVal)\n" 776 "{\n" 777 "d1 = l_t * I1wxVal;\n" 778 "d2 = l_t * I1wyVal;\n" 779 "}\n" 780 "else if (rho > l_t * gradVal)\n" 781 "{\n" 782 "d1 = -l_t * I1wxVal;\n" 783 "d2 = -l_t * I1wyVal;\n" 784 "}\n" 785 "else if (gradVal > 1.192092896e-07f)\n" 786 "{\n" 787 "float fi = -rho / gradVal;\n" 788 "d1 = fi * I1wxVal;\n" 789 "d2 = fi * I1wyVal;\n" 790 "}\n" 791 "float v1 = u1OldVal + d1;\n" 792 "float v2 = u2OldVal + d2;\n" 793 "float div_p1 = divergence(p11, p12, y, x, I1wx_step, I1wx_step);\n" 794 "float div_p2 = divergence(p21, p22, y, x, I1wx_step, I1wx_step);\n" 795 "float u1NewVal = v1 + theta * div_p1;\n" 796 "float u2NewVal = v2 + theta * div_p2;\n" 797 "u1[(y + u1_offset_y) * u1_step + x + u1_offset_x] = u1NewVal;\n" 798 "u2[(y + u2_offset_y) * u2_step + x + u2_offset_x] = u2NewVal;\n" 799 "if(calc_error)\n" 800 "{\n" 801 "float n1 = (u1OldVal - u1NewVal) * (u1OldVal - u1NewVal);\n" 802 "float n2 = (u2OldVal - u2NewVal) * (u2OldVal - u2NewVal);\n" 803 "error[y * I1wx_step + x] = n1 + n2;\n" 804 "}\n" 805 "}\n" 806 "}\n" 807 , "a9d306a49b405703820fae23312ebd28"}; 808 ProgramSource optical_flow_tvl1_oclsrc(optical_flow_tvl1.programStr); 809 const struct ProgramEntry pyrlk={"pyrlk", 810 "#define GRIDSIZE 3\n" 811 "#define LSx 8\n" 812 "#define LSy 8\n" 813 "#define LM_W (LSx*GRIDSIZE+2)\n" 814 "#define LM_H (LSy*GRIDSIZE+2)\n" 815 "#define BUFFER (LSx*LSy)\n" 816 "#define BUFFER2 BUFFER>>1\n" 817 "#ifndef WAVE_SIZE\n" 818 "#define WAVE_SIZE 1\n" 819 "#endif\n" 820 "#ifdef CPU\n" 821 "inline void reduce3(float val1, float val2, float val3, __local float* smem1, __local float* smem2, __local float* smem3, int tid)\n" 822 "{\n" 823 "smem1[tid] = val1;\n" 824 "smem2[tid] = val2;\n" 825 "smem3[tid] = val3;\n" 826 "barrier(CLK_LOCAL_MEM_FENCE);\n" 827 "for(int i = BUFFER2; i > 0; i >>= 1)\n" 828 "{\n" 829 "if(tid < i)\n" 830 "{\n" 831 "smem1[tid] += smem1[tid + i];\n" 832 "smem2[tid] += smem2[tid + i];\n" 833 "smem3[tid] += smem3[tid + i];\n" 834 "}\n" 835 "barrier(CLK_LOCAL_MEM_FENCE);\n" 836 "}\n" 837 "}\n" 838 "inline void reduce2(float val1, float val2, volatile __local float* smem1, volatile __local float* smem2, int tid)\n" 839 "{\n" 840 "smem1[tid] = val1;\n" 841 "smem2[tid] = val2;\n" 842 "barrier(CLK_LOCAL_MEM_FENCE);\n" 843 "for(int i = BUFFER2; i > 0; i >>= 1)\n" 844 "{\n" 845 "if(tid < i)\n" 846 "{\n" 847 "smem1[tid] += smem1[tid + i];\n" 848 "smem2[tid] += smem2[tid + i];\n" 849 "}\n" 850 "barrier(CLK_LOCAL_MEM_FENCE);\n" 851 "}\n" 852 "}\n" 853 "inline void reduce1(float val1, volatile __local float* smem1, int tid)\n" 854 "{\n" 855 "smem1[tid] = val1;\n" 856 "barrier(CLK_LOCAL_MEM_FENCE);\n" 857 "for(int i = BUFFER2; i > 0; i >>= 1)\n" 858 "{\n" 859 "if(tid < i)\n" 860 "{\n" 861 "smem1[tid] += smem1[tid + i];\n" 862 "}\n" 863 "barrier(CLK_LOCAL_MEM_FENCE);\n" 864 "}\n" 865 "}\n" 866 "#else\n" 867 "inline void reduce3(float val1, float val2, float val3,\n" 868 "__local volatile float* smem1, __local volatile float* smem2, __local volatile float* smem3, int tid)\n" 869 "{\n" 870 "smem1[tid] = val1;\n" 871 "smem2[tid] = val2;\n" 872 "smem3[tid] = val3;\n" 873 "barrier(CLK_LOCAL_MEM_FENCE);\n" 874 "if (tid < 32)\n" 875 "{\n" 876 "smem1[tid] += smem1[tid + 32];\n" 877 "smem2[tid] += smem2[tid + 32];\n" 878 "smem3[tid] += smem3[tid + 32];\n" 879 "#if WAVE_SIZE < 32\n" 880 "}\n" 881 "barrier(CLK_LOCAL_MEM_FENCE);\n" 882 "if (tid < 16)\n" 883 "{\n" 884 "#endif\n" 885 "smem1[tid] += smem1[tid + 16];\n" 886 "smem2[tid] += smem2[tid + 16];\n" 887 "smem3[tid] += smem3[tid + 16];\n" 888 "#if WAVE_SIZE <16\n" 889 "}\n" 890 "barrier(CLK_LOCAL_MEM_FENCE);\n" 891 "if (tid<1)\n" 892 "{\n" 893 "#endif\n" 894 "local float8* m1 = (local float8*)smem1;\n" 895 "local float8* m2 = (local float8*)smem2;\n" 896 "local float8* m3 = (local float8*)smem3;\n" 897 "float8 t1 = m1[0]+m1[1];\n" 898 "float8 t2 = m2[0]+m2[1];\n" 899 "float8 t3 = m3[0]+m3[1];\n" 900 "float4 t14 = t1.lo + t1.hi;\n" 901 "float4 t24 = t2.lo + t2.hi;\n" 902 "float4 t34 = t3.lo + t3.hi;\n" 903 "smem1[0] = t14.x+t14.y+t14.z+t14.w;\n" 904 "smem2[0] = t24.x+t24.y+t24.z+t24.w;\n" 905 "smem3[0] = t34.x+t34.y+t34.z+t34.w;\n" 906 "}\n" 907 "barrier(CLK_LOCAL_MEM_FENCE);\n" 908 "}\n" 909 "inline void reduce2(float val1, float val2, __local volatile float* smem1, __local volatile float* smem2, int tid)\n" 910 "{\n" 911 "smem1[tid] = val1;\n" 912 "smem2[tid] = val2;\n" 913 "barrier(CLK_LOCAL_MEM_FENCE);\n" 914 "if (tid < 32)\n" 915 "{\n" 916 "smem1[tid] += smem1[tid + 32];\n" 917 "smem2[tid] += smem2[tid + 32];\n" 918 "#if WAVE_SIZE < 32\n" 919 "}\n" 920 "barrier(CLK_LOCAL_MEM_FENCE);\n" 921 "if (tid < 16)\n" 922 "{\n" 923 "#endif\n" 924 "smem1[tid] += smem1[tid + 16];\n" 925 "smem2[tid] += smem2[tid + 16];\n" 926 "#if WAVE_SIZE <16\n" 927 "}\n" 928 "barrier(CLK_LOCAL_MEM_FENCE);\n" 929 "if (tid<1)\n" 930 "{\n" 931 "#endif\n" 932 "local float8* m1 = (local float8*)smem1;\n" 933 "local float8* m2 = (local float8*)smem2;\n" 934 "float8 t1 = m1[0]+m1[1];\n" 935 "float8 t2 = m2[0]+m2[1];\n" 936 "float4 t14 = t1.lo + t1.hi;\n" 937 "float4 t24 = t2.lo + t2.hi;\n" 938 "smem1[0] = t14.x+t14.y+t14.z+t14.w;\n" 939 "smem2[0] = t24.x+t24.y+t24.z+t24.w;\n" 940 "}\n" 941 "barrier(CLK_LOCAL_MEM_FENCE);\n" 942 "}\n" 943 "inline void reduce1(float val1, __local volatile float* smem1, int tid)\n" 944 "{\n" 945 "smem1[tid] = val1;\n" 946 "barrier(CLK_LOCAL_MEM_FENCE);\n" 947 "if (tid < 32)\n" 948 "{\n" 949 "smem1[tid] += smem1[tid + 32];\n" 950 "#if WAVE_SIZE < 32\n" 951 "}\n" 952 "barrier(CLK_LOCAL_MEM_FENCE);\n" 953 "if (tid < 16)\n" 954 "{\n" 955 "#endif\n" 956 "smem1[tid] += smem1[tid + 16];\n" 957 "#if WAVE_SIZE <16\n" 958 "}\n" 959 "barrier(CLK_LOCAL_MEM_FENCE);\n" 960 "if (tid<1)\n" 961 "{\n" 962 "#endif\n" 963 "local float8* m1 = (local float8*)smem1;\n" 964 "float8 t1 = m1[0]+m1[1];\n" 965 "float4 t14 = t1.lo + t1.hi;\n" 966 "smem1[0] = t14.x+t14.y+t14.z+t14.w;\n" 967 "}\n" 968 "barrier(CLK_LOCAL_MEM_FENCE);\n" 969 "}\n" 970 "#endif\n" 971 "#define SCALE (1.0f / (1 << 20))\n" 972 "#define THRESHOLD 0.01f\n" 973 "__constant sampler_t sampler = CLK_NORMALIZED_COORDS_FALSE | CLK_ADDRESS_CLAMP_TO_EDGE | CLK_FILTER_LINEAR;\n" 974 "#define VAL(_y,_x,_yy,_xx) (IPatchLocal[(yid+((_y)*LSy)+1+(_yy))*LM_W+(xid+((_x)*LSx)+1+(_xx))])\n" 975 "inline void SetPatch(local float* IPatchLocal, int TileY, int TileX,\n" 976 "float* Pch, float* Dx, float* Dy,\n" 977 "float* A11, float* A12, float* A22, float w)\n" 978 "{\n" 979 "unsigned int xid=get_local_id(0);\n" 980 "unsigned int yid=get_local_id(1);\n" 981 "*Pch = VAL(TileY,TileX,0,0);\n" 982 "float dIdx = (3.0f*VAL(TileY,TileX,-1,1)+10.0f*VAL(TileY,TileX,0,1)+3.0f*VAL(TileY,TileX,+1,1))-(3.0f*VAL(TileY,TileX,-1,-1)+10.0f*VAL(TileY,TileX,0,-1)+3.0f*VAL(TileY,TileX,+1,-1));\n" 983 "float dIdy = (3.0f*VAL(TileY,TileX,1,-1)+10.0f*VAL(TileY,TileX,1,0)+3.0f*VAL(TileY,TileX,1,+1))-(3.0f*VAL(TileY,TileX,-1,-1)+10.0f*VAL(TileY,TileX,-1,0)+3.0f*VAL(TileY,TileX,-1,+1));\n" 984 "dIdx *= w;\n" 985 "dIdy *= w;\n" 986 "*Dx = dIdx;\n" 987 "*Dy = dIdy;\n" 988 "*A11 += dIdx * dIdx;\n" 989 "*A12 += dIdx * dIdy;\n" 990 "*A22 += dIdy * dIdy;\n" 991 "}\n" 992 "#undef VAL\n" 993 "inline void GetPatch(image2d_t J, float x, float y,\n" 994 "float* Pch, float* Dx, float* Dy,\n" 995 "float* b1, float* b2)\n" 996 "{\n" 997 "float J_val = read_imagef(J, sampler, (float2)(x, y)).x;\n" 998 "float diff = (J_val - *Pch) * 32.0f;\n" 999 "*b1 += diff**Dx;\n" 1000 "*b2 += diff**Dy;\n" 1001 "}\n" 1002 "inline void GetError(image2d_t J, const float x, const float y, const float* Pch, float* errval)\n" 1003 "{\n" 1004 "float diff = read_imagef(J, sampler, (float2)(x,y)).x-*Pch;\n" 1005 "*errval += fabs(diff);\n" 1006 "}\n" 1007 "#define READI(_y,_x) IPatchLocal[(yid+((_y)*LSy))*LM_W+(xid+((_x)*LSx))] = read_imagef(I, sampler, (float2)(Point.x + xid+(_x)*LSx + 0.5f-1, Point.y + yid+(_y)*LSy+ 0.5f-1)).x;\n" 1008 "void ReadPatchIToLocalMem(image2d_t I, float2 Point, local float* IPatchLocal)\n" 1009 "{\n" 1010 "unsigned int xid=get_local_id(0);\n" 1011 "unsigned int yid=get_local_id(1);\n" 1012 "READI(0,0);READI(0,1);READI(0,2);\n" 1013 "READI(1,0);READI(1,1);READI(1,2);\n" 1014 "READI(2,0);READI(2,1);READI(2,2);\n" 1015 "if(xid<2)\n" 1016 "{\n" 1017 "READI(0,3);\n" 1018 "READI(1,3);\n" 1019 "READI(2,3);\n" 1020 "}\n" 1021 "if(yid<2)\n" 1022 "{\n" 1023 "READI(3,0);READI(3,1);READI(3,2);\n" 1024 "}\n" 1025 "if(yid<2 && xid<2)\n" 1026 "{\n" 1027 "READI(3,3);\n" 1028 "}\n" 1029 "barrier(CLK_LOCAL_MEM_FENCE);\n" 1030 "}\n" 1031 "#undef READI\n" 1032 "__attribute__((reqd_work_group_size(LSx, LSy, 1)))\n" 1033 "__kernel void lkSparse(image2d_t I, image2d_t J,\n" 1034 "__global const float2* prevPts, __global float2* nextPts, __global uchar* status, __global float* err,\n" 1035 "const int level, const int rows, const int cols, int PATCH_X, int PATCH_Y, int c_winSize_x, int c_winSize_y, int c_iters, char calcErr)\n" 1036 "{\n" 1037 "__local float smem1[BUFFER];\n" 1038 "__local float smem2[BUFFER];\n" 1039 "__local float smem3[BUFFER];\n" 1040 "unsigned int xid=get_local_id(0);\n" 1041 "unsigned int yid=get_local_id(1);\n" 1042 "unsigned int gid=get_group_id(0);\n" 1043 "unsigned int xsize=get_local_size(0);\n" 1044 "unsigned int ysize=get_local_size(1);\n" 1045 "int xBase, yBase, k;\n" 1046 "float wx = ((xid+2*xsize)<c_winSize_x)?1:0;\n" 1047 "float wy = ((yid+2*ysize)<c_winSize_y)?1:0;\n" 1048 "float2 c_halfWin = (float2)((c_winSize_x - 1)>>1, (c_winSize_y - 1)>>1);\n" 1049 "const int tid = mad24(yid, xsize, xid);\n" 1050 "float2 prevPt = prevPts[gid] / (float2)(1 << level);\n" 1051 "if (prevPt.x < 0 || prevPt.x >= cols || prevPt.y < 0 || prevPt.y >= rows)\n" 1052 "{\n" 1053 "if (tid == 0 && level == 0)\n" 1054 "{\n" 1055 "status[gid] = 0;\n" 1056 "}\n" 1057 "return;\n" 1058 "}\n" 1059 "prevPt -= c_halfWin;\n" 1060 "float A11 = 0;\n" 1061 "float A12 = 0;\n" 1062 "float A22 = 0;\n" 1063 "float I_patch[GRIDSIZE][GRIDSIZE];\n" 1064 "float dIdx_patch[GRIDSIZE][GRIDSIZE];\n" 1065 "float dIdy_patch[GRIDSIZE][GRIDSIZE];\n" 1066 "local float IPatchLocal[LM_W*LM_H];\n" 1067 "ReadPatchIToLocalMem(I,prevPt,IPatchLocal);\n" 1068 "{\n" 1069 "SetPatch(IPatchLocal, 0, 0,\n" 1070 "&I_patch[0][0], &dIdx_patch[0][0], &dIdy_patch[0][0],\n" 1071 "&A11, &A12, &A22,1);\n" 1072 "SetPatch(IPatchLocal, 0, 1,\n" 1073 "&I_patch[0][1], &dIdx_patch[0][1], &dIdy_patch[0][1],\n" 1074 "&A11, &A12, &A22,1);\n" 1075 "SetPatch(IPatchLocal, 0, 2,\n" 1076 "&I_patch[0][2], &dIdx_patch[0][2], &dIdy_patch[0][2],\n" 1077 "&A11, &A12, &A22,wx);\n" 1078 "}\n" 1079 "{\n" 1080 "SetPatch(IPatchLocal, 1, 0,\n" 1081 "&I_patch[1][0], &dIdx_patch[1][0], &dIdy_patch[1][0],\n" 1082 "&A11, &A12, &A22,1);\n" 1083 "SetPatch(IPatchLocal, 1,1,\n" 1084 "&I_patch[1][1], &dIdx_patch[1][1], &dIdy_patch[1][1],\n" 1085 "&A11, &A12, &A22,1);\n" 1086 "SetPatch(IPatchLocal, 1,2,\n" 1087 "&I_patch[1][2], &dIdx_patch[1][2], &dIdy_patch[1][2],\n" 1088 "&A11, &A12, &A22,wx);\n" 1089 "}\n" 1090 "{\n" 1091 "SetPatch(IPatchLocal, 2,0,\n" 1092 "&I_patch[2][0], &dIdx_patch[2][0], &dIdy_patch[2][0],\n" 1093 "&A11, &A12, &A22,wy);\n" 1094 "SetPatch(IPatchLocal, 2,1,\n" 1095 "&I_patch[2][1], &dIdx_patch[2][1], &dIdy_patch[2][1],\n" 1096 "&A11, &A12, &A22,wy);\n" 1097 "SetPatch(IPatchLocal, 2,2,\n" 1098 "&I_patch[2][2], &dIdx_patch[2][2], &dIdy_patch[2][2],\n" 1099 "&A11, &A12, &A22,wx*wy);\n" 1100 "}\n" 1101 "reduce3(A11, A12, A22, smem1, smem2, smem3, tid);\n" 1102 "A11 = smem1[0];\n" 1103 "A12 = smem2[0];\n" 1104 "A22 = smem3[0];\n" 1105 "barrier(CLK_LOCAL_MEM_FENCE);\n" 1106 "float D = A11 * A22 - A12 * A12;\n" 1107 "if (D < 1.192092896e-07f)\n" 1108 "{\n" 1109 "if (tid == 0 && level == 0)\n" 1110 "status[gid] = 0;\n" 1111 "return;\n" 1112 "}\n" 1113 "A11 /= D;\n" 1114 "A12 /= D;\n" 1115 "A22 /= D;\n" 1116 "prevPt = nextPts[gid] * 2.0f - c_halfWin;\n" 1117 "for (k = 0; k < c_iters; ++k)\n" 1118 "{\n" 1119 "if (prevPt.x < -c_halfWin.x || prevPt.x >= cols || prevPt.y < -c_halfWin.y || prevPt.y >= rows)\n" 1120 "{\n" 1121 "if (tid == 0 && level == 0)\n" 1122 "status[gid] = 0;\n" 1123 "break;\n" 1124 "}\n" 1125 "float b1 = 0;\n" 1126 "float b2 = 0;\n" 1127 "yBase=yid;\n" 1128 "{\n" 1129 "xBase=xid;\n" 1130 "GetPatch(J, prevPt.x + xBase + 0.5f, prevPt.y + yBase + 0.5f,\n" 1131 "&I_patch[0][0], &dIdx_patch[0][0], &dIdy_patch[0][0],\n" 1132 "&b1, &b2);\n" 1133 "xBase+=xsize;\n" 1134 "GetPatch(J, prevPt.x + xBase + 0.5f, prevPt.y + yBase + 0.5f,\n" 1135 "&I_patch[0][1], &dIdx_patch[0][1], &dIdy_patch[0][1],\n" 1136 "&b1, &b2);\n" 1137 "xBase+=xsize;\n" 1138 "GetPatch(J, prevPt.x + xBase + 0.5f, prevPt.y + yBase + 0.5f,\n" 1139 "&I_patch[0][2], &dIdx_patch[0][2], &dIdy_patch[0][2],\n" 1140 "&b1, &b2);\n" 1141 "}\n" 1142 "yBase+=ysize;\n" 1143 "{\n" 1144 "xBase=xid;\n" 1145 "GetPatch(J, prevPt.x + xBase + 0.5f, prevPt.y + yBase + 0.5f,\n" 1146 "&I_patch[1][0], &dIdx_patch[1][0], &dIdy_patch[1][0],\n" 1147 "&b1, &b2);\n" 1148 "xBase+=xsize;\n" 1149 "GetPatch(J, prevPt.x + xBase + 0.5f, prevPt.y + yBase + 0.5f,\n" 1150 "&I_patch[1][1], &dIdx_patch[1][1], &dIdy_patch[1][1],\n" 1151 "&b1, &b2);\n" 1152 "xBase+=xsize;\n" 1153 "GetPatch(J, prevPt.x + xBase + 0.5f, prevPt.y + yBase + 0.5f,\n" 1154 "&I_patch[1][2], &dIdx_patch[1][2], &dIdy_patch[1][2],\n" 1155 "&b1, &b2);\n" 1156 "}\n" 1157 "yBase+=ysize;\n" 1158 "{\n" 1159 "xBase=xid;\n" 1160 "GetPatch(J, prevPt.x + xBase + 0.5f, prevPt.y + yBase + 0.5f,\n" 1161 "&I_patch[2][0], &dIdx_patch[2][0], &dIdy_patch[2][0],\n" 1162 "&b1, &b2);\n" 1163 "xBase+=xsize;\n" 1164 "GetPatch(J, prevPt.x + xBase + 0.5f, prevPt.y + yBase + 0.5f,\n" 1165 "&I_patch[2][1], &dIdx_patch[2][1], &dIdy_patch[2][1],\n" 1166 "&b1, &b2);\n" 1167 "xBase+=xsize;\n" 1168 "GetPatch(J, prevPt.x + xBase + 0.5f, prevPt.y + yBase + 0.5f,\n" 1169 "&I_patch[2][2], &dIdx_patch[2][2], &dIdy_patch[2][2],\n" 1170 "&b1, &b2);\n" 1171 "}\n" 1172 "reduce2(b1, b2, smem1, smem2, tid);\n" 1173 "b1 = smem1[0];\n" 1174 "b2 = smem2[0];\n" 1175 "barrier(CLK_LOCAL_MEM_FENCE);\n" 1176 "float2 delta;\n" 1177 "delta.x = A12 * b2 - A22 * b1;\n" 1178 "delta.y = A12 * b1 - A11 * b2;\n" 1179 "prevPt += delta;\n" 1180 "if (fabs(delta.x) < THRESHOLD && fabs(delta.y) < THRESHOLD)\n" 1181 "break;\n" 1182 "}\n" 1183 "D = 0.0f;\n" 1184 "if (calcErr)\n" 1185 "{\n" 1186 "yBase=yid;\n" 1187 "{\n" 1188 "xBase=xid;\n" 1189 "GetError(J, prevPt.x + xBase + 0.5f, prevPt.y + yBase + 0.5f,\n" 1190 "&I_patch[0][0], &D);\n" 1191 "xBase+=xsize;\n" 1192 "GetError(J, prevPt.x + xBase + 0.5f, prevPt.y + yBase + 0.5f,\n" 1193 "&I_patch[0][1], &D);\n" 1194 "xBase+=xsize;\n" 1195 "if(xBase<c_winSize_x)\n" 1196 "GetError(J, prevPt.x + xBase + 0.5f, prevPt.y + yBase + 0.5f,\n" 1197 "&I_patch[0][2], &D);\n" 1198 "}\n" 1199 "yBase+=ysize;\n" 1200 "{\n" 1201 "xBase=xid;\n" 1202 "GetError(J, prevPt.x + xBase + 0.5f, prevPt.y + yBase + 0.5f,\n" 1203 "&I_patch[1][0], &D);\n" 1204 "xBase+=xsize;\n" 1205 "GetError(J, prevPt.x + xBase + 0.5f, prevPt.y + yBase + 0.5f,\n" 1206 "&I_patch[1][1], &D);\n" 1207 "xBase+=xsize;\n" 1208 "if(xBase<c_winSize_x)\n" 1209 "GetError(J, prevPt.x + xBase + 0.5f, prevPt.y + yBase + 0.5f,\n" 1210 "&I_patch[1][2], &D);\n" 1211 "}\n" 1212 "yBase+=ysize;\n" 1213 "if(yBase<c_winSize_y)\n" 1214 "{\n" 1215 "xBase=xid;\n" 1216 "GetError(J, prevPt.x + xBase + 0.5f, prevPt.y + yBase + 0.5f,\n" 1217 "&I_patch[2][0], &D);\n" 1218 "xBase+=xsize;\n" 1219 "GetError(J, prevPt.x + xBase + 0.5f, prevPt.y + yBase + 0.5f,\n" 1220 "&I_patch[2][1], &D);\n" 1221 "xBase+=xsize;\n" 1222 "if(xBase<c_winSize_x)\n" 1223 "GetError(J, prevPt.x + xBase + 0.5f, prevPt.y + yBase + 0.5f,\n" 1224 "&I_patch[2][2], &D);\n" 1225 "}\n" 1226 "reduce1(D, smem1, tid);\n" 1227 "}\n" 1228 "if (tid == 0)\n" 1229 "{\n" 1230 "prevPt += c_halfWin;\n" 1231 "nextPts[gid] = prevPt;\n" 1232 "if (calcErr)\n" 1233 "err[gid] = smem1[0] / (float)(c_winSize_x * c_winSize_y);\n" 1234 "}\n" 1235 "}\n" 1236 , "b7099fcbc60bd5528dacc491eadd88c1"}; 1237 ProgramSource pyrlk_oclsrc(pyrlk.programStr); 1238 } 1239 }} 1240