1/*M/////////////////////////////////////////////////////////////////////////////////////// 2// 3// IMPORTANT: READ BEFORE DOWNLOADING, COPYING, INSTALLING OR USING. 4// 5// By downloading, copying, installing or using the software you agree to this license. 6// If you do not agree to this license, do not download, install, 7// copy or use the software. 8// 9// 10// License Agreement 11// For Open Source Computer Vision Library 12// 13// Copyright (C) 2010-2012, Multicoreware, Inc., all rights reserved. 14// Copyright (C) 2010-2012, Advanced Micro Devices, Inc., all rights reserved. 15// Third party copyrights are property of their respective owners. 16// 17// @Authors 18// Sen Liu, swjtuls1987@126.com 19// 20// Redistribution and use in source and binary forms, with or without modification, 21// are permitted provided that the following conditions are met: 22// 23// * Redistribution's of source code must retain the above copyright notice, 24// this list of conditions and the following disclaimer. 25// 26// * Redistribution's in binary form must reproduce the above copyright notice, 27// this list of conditions and the following disclaimer in the documentation 28// and/or other materials provided with the distribution. 29// 30// * The name of the copyright holders may not be used to endorse or promote products 31// derived from this software without specific prior written permission. 32// 33// This software is provided by the copyright holders and contributors as is and 34// any express or implied warranties, including, but not limited to, the implied 35// warranties of merchantability and fitness for a particular purpose are disclaimed. 36// In no event shall the Intel Corporation or contributors be liable for any direct, 37// indirect, incidental, special, exemplary, or consequential damages 38// (including, but not limited to, procurement of substitute goods or services; 39// loss of use, data, or profits; or business interruption) however caused 40// and on any theory of liability, whether in contract, strict liability, 41// or tort (including negligence or otherwise) arising in any way out of 42// the use of this software, even if advised of the possibility of such damage. 43// 44//M*/ 45 46 47#define tx (int)get_local_id(0) 48#define ty get_local_id(1) 49#define bx get_group_id(0) 50#define bdx (int)get_local_size(0) 51 52#define BORDER_SIZE 5 53#define MAX_KSIZE_HALF 100 54 55#ifndef polyN 56#define polyN 5 57#endif 58 59#if USE_DOUBLE 60#ifdef cl_amd_fp64 61#pragma OPENCL EXTENSION cl_amd_fp64:enable 62#elif defined (cl_khr_fp64) 63#pragma OPENCL EXTENSION cl_khr_fp64:enable 64#endif 65#define TYPE double 66#define VECTYPE double4 67#else 68#define TYPE float 69#define VECTYPE float4 70#endif 71 72__kernel void polynomialExpansion(__global __const float * src, int srcStep, 73 __global float * dst, int dstStep, 74 const int rows, const int cols, 75 __global __const float * c_g, 76 __global __const float * c_xg, 77 __global __const float * c_xxg, 78 __local float * smem, 79 const VECTYPE ig) 80{ 81 const int y = get_global_id(1); 82 const int x = bx * (bdx - 2*polyN) + tx - polyN; 83 84 int xWarped; 85 __local float *row = smem + tx; 86 87 if (y < rows && y >= 0) 88 { 89 xWarped = min(max(x, 0), cols - 1); 90 91 row[0] = src[mad24(y, srcStep, xWarped)] * c_g[0]; 92 row[bdx] = 0.f; 93 row[2*bdx] = 0.f; 94 95#pragma unroll 96 for (int k = 1; k <= polyN; ++k) 97 { 98 float t0 = src[mad24(max(y - k, 0), srcStep, xWarped)]; 99 float t1 = src[mad24(min(y + k, rows - 1), srcStep, xWarped)]; 100 101 row[0] += c_g[k] * (t0 + t1); 102 row[bdx] += c_xg[k] * (t1 - t0); 103 row[2*bdx] += c_xxg[k] * (t0 + t1); 104 } 105 } 106 107 barrier(CLK_LOCAL_MEM_FENCE); 108 109 if (y < rows && y >= 0 && tx >= polyN && tx + polyN < bdx && x < cols) 110 { 111 TYPE b1 = c_g[0] * row[0]; 112 TYPE b3 = c_g[0] * row[bdx]; 113 TYPE b5 = c_g[0] * row[2*bdx]; 114 TYPE b2 = 0, b4 = 0, b6 = 0; 115 116#pragma unroll 117 for (int k = 1; k <= polyN; ++k) 118 { 119 b1 += (row[k] + row[-k]) * c_g[k]; 120 b4 += (row[k] + row[-k]) * c_xxg[k]; 121 b2 += (row[k] - row[-k]) * c_xg[k]; 122 b3 += (row[k + bdx] + row[-k + bdx]) * c_g[k]; 123 b6 += (row[k + bdx] - row[-k + bdx]) * c_xg[k]; 124 b5 += (row[k + 2*bdx] + row[-k + 2*bdx]) * c_g[k]; 125 } 126 127 dst[mad24(y, dstStep, xWarped)] = (float)(b3*ig.s0); 128 dst[mad24(rows + y, dstStep, xWarped)] = (float)(b2*ig.s0); 129 dst[mad24(2*rows + y, dstStep, xWarped)] = (float)(b1*ig.s1 + b5*ig.s2); 130 dst[mad24(3*rows + y, dstStep, xWarped)] = (float)(b1*ig.s1 + b4*ig.s2); 131 dst[mad24(4*rows + y, dstStep, xWarped)] = (float)(b6*ig.s3); 132 } 133} 134 135inline int idx_row_low(const int y, const int last_row) 136{ 137 return abs(y) % (last_row + 1); 138} 139 140inline int idx_row_high(const int y, const int last_row) 141{ 142 return abs(last_row - abs(last_row - y)) % (last_row + 1); 143} 144 145inline int idx_col_low(const int x, const int last_col) 146{ 147 return abs(x) % (last_col + 1); 148} 149 150inline int idx_col_high(const int x, const int last_col) 151{ 152 return abs(last_col - abs(last_col - x)) % (last_col + 1); 153} 154 155inline int idx_col(const int x, const int last_col) 156{ 157 return idx_col_low(idx_col_high(x, last_col), last_col); 158} 159 160__kernel void gaussianBlur(__global const float * src, int srcStep, 161 __global float * dst, int dstStep, const int rows, const int cols, 162 __global const float * c_gKer, const int ksizeHalf, 163 __local float * smem) 164{ 165 const int y = get_global_id(1); 166 const int x = get_global_id(0); 167 168 __local float *row = smem + ty * (bdx + 2*ksizeHalf); 169 170 if (y < rows) 171 { 172 // Vertical pass 173 for (int i = tx; i < bdx + 2*ksizeHalf; i += bdx) 174 { 175 int xExt = (int)(bx * bdx) + i - ksizeHalf; 176 xExt = idx_col(xExt, cols - 1); 177 row[i] = src[mad24(y, srcStep, xExt)] * c_gKer[0]; 178 for (int j = 1; j <= ksizeHalf; ++j) 179 row[i] += (src[mad24(idx_row_low(y - j, rows - 1), srcStep, xExt)] 180 + src[mad24(idx_row_high(y + j, rows - 1), srcStep, xExt)]) * c_gKer[j]; 181 } 182 } 183 184 barrier(CLK_LOCAL_MEM_FENCE); 185 186 if (y < rows && y >= 0 && x < cols && x >= 0) 187 { 188 // Horizontal pass 189 row += tx + ksizeHalf; 190 float res = row[0] * c_gKer[0]; 191 for (int i = 1; i <= ksizeHalf; ++i) 192 res += (row[-i] + row[i]) * c_gKer[i]; 193 194 dst[mad24(y, dstStep, x)] = res; 195 } 196} 197 198__kernel void gaussianBlur5(__global const float * src, int srcStep, 199 __global float * dst, int dstStep, 200 const int rows, const int cols, 201 __global const float * c_gKer, const int ksizeHalf, 202 __local float * smem) 203{ 204 const int y = get_global_id(1); 205 const int x = get_global_id(0); 206 207 const int smw = bdx + 2*ksizeHalf; // shared memory "cols" 208 __local volatile float *row = smem + 5 * ty * smw; 209 210 if (y < rows) 211 { 212 // Vertical pass 213 for (int i = tx; i < bdx + 2*ksizeHalf; i += bdx) 214 { 215 int xExt = (int)(bx * bdx) + i - ksizeHalf; 216 xExt = idx_col(xExt, cols - 1); 217 218#pragma unroll 219 for (int k = 0; k < 5; ++k) 220 row[k*smw + i] = src[mad24(k*rows + y, srcStep, xExt)] * c_gKer[0]; 221 222 for (int j = 1; j <= ksizeHalf; ++j) 223#pragma unroll 224 for (int k = 0; k < 5; ++k) 225 row[k*smw + i] += 226 (src[mad24(k*rows + idx_row_low(y - j, rows - 1), srcStep, xExt)] + 227 src[mad24(k*rows + idx_row_high(y + j, rows - 1), srcStep, xExt)]) * c_gKer[j]; 228 } 229 } 230 231 barrier(CLK_LOCAL_MEM_FENCE); 232 233 if (y < rows && y >= 0 && x < cols && x >= 0) 234 { 235 // Horizontal pass 236 237 row += tx + ksizeHalf; 238 float res[5]; 239 240#pragma unroll 241 for (int k = 0; k < 5; ++k) 242 res[k] = row[k*smw] * c_gKer[0]; 243 244 for (int i = 1; i <= ksizeHalf; ++i) 245#pragma unroll 246 for (int k = 0; k < 5; ++k) 247 res[k] += (row[k*smw - i] + row[k*smw + i]) * c_gKer[i]; 248 249#pragma unroll 250 for (int k = 0; k < 5; ++k) 251 dst[mad24(k*rows + y, dstStep, x)] = res[k]; 252 } 253} 254__constant float c_border[BORDER_SIZE + 1] = { 0.14f, 0.14f, 0.4472f, 0.4472f, 0.4472f, 1.f }; 255 256__kernel void updateMatrices(__global const float * flowx, int xStep, 257 __global const float * flowy, int yStep, 258 const int rows, const int cols, 259 __global const float * R0, int R0Step, 260 __global const float * R1, int R1Step, 261 __global float * M, int mStep) 262{ 263 const int y = get_global_id(1); 264 const int x = get_global_id(0); 265 266 if (y < rows && y >= 0 && x < cols && x >= 0) 267 { 268 float dx = flowx[mad24(y, xStep, x)]; 269 float dy = flowy[mad24(y, yStep, x)]; 270 float fx = x + dx; 271 float fy = y + dy; 272 273 int x1 = convert_int(floor(fx)); 274 int y1 = convert_int(floor(fy)); 275 fx -= x1; 276 fy -= y1; 277 278 float r2, r3, r4, r5, r6; 279 280 if (x1 >= 0 && y1 >= 0 && x1 < cols - 1 && y1 < rows - 1) 281 { 282 float a00 = (1.f - fx) * (1.f - fy); 283 float a01 = fx * (1.f - fy); 284 float a10 = (1.f - fx) * fy; 285 float a11 = fx * fy; 286 287 r2 = a00 * R1[mad24(y1, R1Step, x1)] + 288 a01 * R1[mad24(y1, R1Step, x1 + 1)] + 289 a10 * R1[mad24(y1 + 1, R1Step, x1)] + 290 a11 * R1[mad24(y1 + 1, R1Step, x1 + 1)]; 291 292 r3 = a00 * R1[mad24(rows + y1, R1Step, x1)] + 293 a01 * R1[mad24(rows + y1, R1Step, x1 + 1)] + 294 a10 * R1[mad24(rows + y1 + 1, R1Step, x1)] + 295 a11 * R1[mad24(rows + y1 + 1, R1Step, x1 + 1)]; 296 297 r4 = a00 * R1[mad24(2*rows + y1, R1Step, x1)] + 298 a01 * R1[mad24(2*rows + y1, R1Step, x1 + 1)] + 299 a10 * R1[mad24(2*rows + y1 + 1, R1Step, x1)] + 300 a11 * R1[mad24(2*rows + y1 + 1, R1Step, x1 + 1)]; 301 302 r5 = a00 * R1[mad24(3*rows + y1, R1Step, x1)] + 303 a01 * R1[mad24(3*rows + y1, R1Step, x1 + 1)] + 304 a10 * R1[mad24(3*rows + y1 + 1, R1Step, x1)] + 305 a11 * R1[mad24(3*rows + y1 + 1, R1Step, x1 + 1)]; 306 307 r6 = a00 * R1[mad24(4*rows + y1, R1Step, x1)] + 308 a01 * R1[mad24(4*rows + y1, R1Step, x1 + 1)] + 309 a10 * R1[mad24(4*rows + y1 + 1, R1Step, x1)] + 310 a11 * R1[mad24(4*rows + y1 + 1, R1Step, x1 + 1)]; 311 312 r4 = (R0[mad24(2*rows + y, R0Step, x)] + r4) * 0.5f; 313 r5 = (R0[mad24(3*rows + y, R0Step, x)] + r5) * 0.5f; 314 r6 = (R0[mad24(4*rows + y, R0Step, x)] + r6) * 0.25f; 315 } 316 else 317 { 318 r2 = r3 = 0.f; 319 r4 = R0[mad24(2*rows + y, R0Step, x)]; 320 r5 = R0[mad24(3*rows + y, R0Step, x)]; 321 r6 = R0[mad24(4*rows + y, R0Step, x)] * 0.5f; 322 } 323 324 r2 = (R0[mad24(y, R0Step, x)] - r2) * 0.5f; 325 r3 = (R0[mad24(rows + y, R0Step, x)] - r3) * 0.5f; 326 327 r2 += r4*dy + r6*dx; 328 r3 += r6*dy + r5*dx; 329 330 float scale = 331 c_border[min(x, BORDER_SIZE)] * 332 c_border[min(y, BORDER_SIZE)] * 333 c_border[min(cols - x - 1, BORDER_SIZE)] * 334 c_border[min(rows - y - 1, BORDER_SIZE)]; 335 336 r2 *= scale; 337 r3 *= scale; 338 r4 *= scale; 339 r5 *= scale; 340 r6 *= scale; 341 342 M[mad24(y, mStep, x)] = r4*r4 + r6*r6; 343 M[mad24(rows + y, mStep, x)] = (r4 + r5)*r6; 344 M[mad24(2*rows + y, mStep, x)] = r5*r5 + r6*r6; 345 M[mad24(3*rows + y, mStep, x)] = r4*r2 + r6*r3; 346 M[mad24(4*rows + y, mStep, x)] = r6*r2 + r5*r3; 347 } 348} 349 350__kernel void boxFilter5(__global const float * src, int srcStep, 351 __global float * dst, int dstStep, 352 const int rows, const int cols, 353 const int ksizeHalf, 354 __local float * smem) 355{ 356 const int y = get_global_id(1); 357 const int x = get_global_id(0); 358 359 const float boxAreaInv = 1.f / ((1 + 2*ksizeHalf) * (1 + 2*ksizeHalf)); 360 const int smw = bdx + 2*ksizeHalf; // shared memory "width" 361 __local float *row = smem + 5 * ty * smw; 362 363 if (y < rows) 364 { 365 // Vertical pass 366 for (int i = tx; i < bdx + 2*ksizeHalf; i += bdx) 367 { 368 int xExt = (int)(bx * bdx) + i - ksizeHalf; 369 xExt = min(max(xExt, 0), cols - 1); 370 371#pragma unroll 372 for (int k = 0; k < 5; ++k) 373 row[k*smw + i] = src[mad24(k*rows + y, srcStep, xExt)]; 374 375 for (int j = 1; j <= ksizeHalf; ++j) 376#pragma unroll 377 for (int k = 0; k < 5; ++k) 378 row[k*smw + i] += 379 src[mad24(k*rows + max(y - j, 0), srcStep, xExt)] + 380 src[mad24(k*rows + min(y + j, rows - 1), srcStep, xExt)]; 381 } 382 } 383 384 barrier(CLK_LOCAL_MEM_FENCE); 385 386 if (y < rows && y >= 0 && x < cols && x >= 0) 387 { 388 // Horizontal pass 389 390 row += tx + ksizeHalf; 391 float res[5]; 392 393#pragma unroll 394 for (int k = 0; k < 5; ++k) 395 res[k] = row[k*smw]; 396 397 for (int i = 1; i <= ksizeHalf; ++i) 398#pragma unroll 399 for (int k = 0; k < 5; ++k) 400 res[k] += row[k*smw - i] + row[k*smw + i]; 401 402#pragma unroll 403 for (int k = 0; k < 5; ++k) 404 dst[mad24(k*rows + y, dstStep, x)] = res[k] * boxAreaInv; 405 } 406} 407 408__kernel void updateFlow(__global const float * M, int mStep, 409 __global float * flowx, int xStep, 410 __global float * flowy, int yStep, 411 const int rows, const int cols) 412{ 413 const int y = get_global_id(1); 414 const int x = get_global_id(0); 415 416 if (y < rows && y >= 0 && x < cols && x >= 0) 417 { 418 float g11 = M[mad24(y, mStep, x)]; 419 float g12 = M[mad24(rows + y, mStep, x)]; 420 float g22 = M[mad24(2*rows + y, mStep, x)]; 421 float h1 = M[mad24(3*rows + y, mStep, x)]; 422 float h2 = M[mad24(4*rows + y, mStep, x)]; 423 424 float detInv = 1.f / (g11*g22 - g12*g12 + 1e-3f); 425 426 flowx[mad24(y, xStep, x)] = (g11*h2 - g12*h1) * detInv; 427 flowy[mad24(y, yStep, x)] = (g22*h1 - g12*h2) * detInv; 428 } 429} 430