1 // This file is auto-generated. Do not edit! 2 3 #include "precomp.hpp" 4 #include "opencl_kernels_features2d.hpp" 5 6 namespace cv 7 { 8 namespace ocl 9 { 10 namespace features2d 11 { 12 13 const struct ProgramEntry brute_force_match={"brute_force_match", 14 "#pragma OPENCL EXTENSION cl_khr_global_int32_base_atomics:enable\n" 15 "#define MAX_FLOAT 3.40282e+038f\n" 16 "#ifndef T\n" 17 "#define T float\n" 18 "#endif\n" 19 "#ifndef BLOCK_SIZE\n" 20 "#define BLOCK_SIZE 16\n" 21 "#endif\n" 22 "#ifndef MAX_DESC_LEN\n" 23 "#define MAX_DESC_LEN 64\n" 24 "#endif\n" 25 "#define BLOCK_SIZE_ODD (BLOCK_SIZE + 1)\n" 26 "#ifndef SHARED_MEM_SZ\n" 27 "# if (BLOCK_SIZE < MAX_DESC_LEN)\n" 28 "# define SHARED_MEM_SZ (kercn * (BLOCK_SIZE * MAX_DESC_LEN + BLOCK_SIZE * BLOCK_SIZE))\n" 29 "# else\n" 30 "# define SHARED_MEM_SZ (kercn * 2 * BLOCK_SIZE_ODD * BLOCK_SIZE)\n" 31 "# endif\n" 32 "#endif\n" 33 "#ifndef DIST_TYPE\n" 34 "#define DIST_TYPE 2\n" 35 "#endif\n" 36 "#if (DIST_TYPE == 2)\n" 37 "# ifdef T_FLOAT\n" 38 "typedef float result_type;\n" 39 "# if (8 == kercn)\n" 40 "typedef float8 value_type;\n" 41 "# define DIST(x, y) {value_type d = fabs((x) - (y)); result += d.s0 + d.s1 + d.s2 + d.s3 + d.s4 + d.s5 + d.s6 + d.s7;}\n" 42 "# elif (4 == kercn)\n" 43 "typedef float4 value_type;\n" 44 "# define DIST(x, y) {value_type d = fabs((x) - (y)); result += d.s0 + d.s1 + d.s2 + d.s3;}\n" 45 "# else\n" 46 "typedef float value_type;\n" 47 "# define DIST(x, y) result += fabs((x) - (y))\n" 48 "# endif\n" 49 "# else\n" 50 "typedef int result_type;\n" 51 "# if (8 == kercn)\n" 52 "typedef int8 value_type;\n" 53 "# define DIST(x, y) {value_type d = abs((x) - (y)); result += d.s0 + d.s1 + d.s2 + d.s3 + d.s4 + d.s5 + d.s6 + d.s7;}\n" 54 "# elif (4 == kercn)\n" 55 "typedef int4 value_type;\n" 56 "# define DIST(x, y) {value_type d = abs((x) - (y)); result += d.s0 + d.s1 + d.s2 + d.s3;}\n" 57 "# else\n" 58 "typedef int value_type;\n" 59 "# define DIST(x, y) result += abs((x) - (y))\n" 60 "# endif\n" 61 "# endif\n" 62 "# define DIST_RES(x) (x)\n" 63 "#elif (DIST_TYPE == 4)\n" 64 "typedef float result_type;\n" 65 "# if (8 == kercn)\n" 66 "typedef float8 value_type;\n" 67 "# define DIST(x, y) {value_type d = ((x) - (y)); result += dot(d.s0123, d.s0123) + dot(d.s4567, d.s4567);}\n" 68 "# elif (4 == kercn)\n" 69 "typedef float4 value_type;\n" 70 "# define DIST(x, y) {value_type d = ((x) - (y)); result += dot(d, d);}\n" 71 "# else\n" 72 "typedef float value_type;\n" 73 "# define DIST(x, y) {value_type d = ((x) - (y)); result = mad(d, d, result);}\n" 74 "# endif\n" 75 "# define DIST_RES(x) sqrt(x)\n" 76 "#elif (DIST_TYPE == 6)\n" 77 "# if (8 == kercn)\n" 78 "typedef int8 value_type;\n" 79 "# elif (4 == kercn)\n" 80 "typedef int4 value_type;\n" 81 "# else\n" 82 "typedef int value_type;\n" 83 "# endif\n" 84 "typedef int result_type;\n" 85 "# define DIST(x, y) result += popcount( (x) ^ (y) )\n" 86 "# define DIST_RES(x) (x)\n" 87 "#endif\n" 88 "inline result_type reduce_block(\n" 89 "__local value_type *s_query,\n" 90 "__local value_type *s_train,\n" 91 "int lidx,\n" 92 "int lidy\n" 93 ")\n" 94 "{\n" 95 "result_type result = 0;\n" 96 "#pragma unroll\n" 97 "for (int j = 0 ; j < BLOCK_SIZE ; j++)\n" 98 "{\n" 99 "DIST(s_query[lidy * BLOCK_SIZE_ODD + j], s_train[j * BLOCK_SIZE_ODD + lidx]);\n" 100 "}\n" 101 "return DIST_RES(result);\n" 102 "}\n" 103 "inline result_type reduce_block_match(\n" 104 "__local value_type *s_query,\n" 105 "__local value_type *s_train,\n" 106 "int lidx,\n" 107 "int lidy\n" 108 ")\n" 109 "{\n" 110 "result_type result = 0;\n" 111 "#pragma unroll\n" 112 "for (int j = 0 ; j < BLOCK_SIZE ; j++)\n" 113 "{\n" 114 "DIST(s_query[lidy * BLOCK_SIZE_ODD + j], s_train[j * BLOCK_SIZE_ODD + lidx]);\n" 115 "}\n" 116 "return result;\n" 117 "}\n" 118 "inline result_type reduce_multi_block(\n" 119 "__local value_type *s_query,\n" 120 "__local value_type *s_train,\n" 121 "int block_index,\n" 122 "int lidx,\n" 123 "int lidy\n" 124 ")\n" 125 "{\n" 126 "result_type result = 0;\n" 127 "#pragma unroll\n" 128 "for (int j = 0 ; j < BLOCK_SIZE ; j++)\n" 129 "{\n" 130 "DIST(s_query[lidy * MAX_DESC_LEN + block_index * BLOCK_SIZE + j], s_train[j * BLOCK_SIZE + lidx]);\n" 131 "}\n" 132 "return result;\n" 133 "}\n" 134 "__kernel void BruteForceMatch_Match(\n" 135 "__global T *query,\n" 136 "__global T *train,\n" 137 "__global int *bestTrainIdx,\n" 138 "__global float *bestDistance,\n" 139 "int query_rows,\n" 140 "int query_cols,\n" 141 "int train_rows,\n" 142 "int train_cols,\n" 143 "int step\n" 144 ")\n" 145 "{\n" 146 "const int lidx = get_local_id(0);\n" 147 "const int lidy = get_local_id(1);\n" 148 "const int groupidx = get_group_id(0);\n" 149 "const int queryIdx = mad24(BLOCK_SIZE, groupidx, lidy);\n" 150 "const int queryOffset = min(queryIdx, query_rows - 1) * step;\n" 151 "__global TN *query_vec = (__global TN *)(query + queryOffset);\n" 152 "query_cols /= kercn;\n" 153 "__local float sharebuffer[SHARED_MEM_SZ];\n" 154 "__local value_type *s_query = (__local value_type *)sharebuffer;\n" 155 "#if 0 < MAX_DESC_LEN\n" 156 "__local value_type *s_train = (__local value_type *)sharebuffer + BLOCK_SIZE * MAX_DESC_LEN;\n" 157 "#pragma unroll\n" 158 "for (int i = 0; i < MAX_DESC_LEN / BLOCK_SIZE; i++)\n" 159 "{\n" 160 "const int loadx = mad24(BLOCK_SIZE, i, lidx);\n" 161 "s_query[mad24(MAX_DESC_LEN, lidy, loadx)] = loadx < query_cols ? query_vec[loadx] : 0;\n" 162 "}\n" 163 "#else\n" 164 "__local value_type *s_train = (__local value_type *)sharebuffer + BLOCK_SIZE_ODD * BLOCK_SIZE;\n" 165 "const int s_query_i = mad24(BLOCK_SIZE_ODD, lidy, lidx);\n" 166 "const int s_train_i = mad24(BLOCK_SIZE_ODD, lidx, lidy);\n" 167 "#endif\n" 168 "float myBestDistance = MAX_FLOAT;\n" 169 "int myBestTrainIdx = -1;\n" 170 "for (int t = 0, endt = (train_rows + BLOCK_SIZE - 1) / BLOCK_SIZE; t < endt; t++)\n" 171 "{\n" 172 "result_type result = 0;\n" 173 "const int trainOffset = min(mad24(BLOCK_SIZE, t, lidy), train_rows - 1) * step;\n" 174 "__global TN *train_vec = (__global TN *)(train + trainOffset);\n" 175 "#if 0 < MAX_DESC_LEN\n" 176 "#pragma unroll\n" 177 "for (int i = 0; i < MAX_DESC_LEN / BLOCK_SIZE; i++)\n" 178 "{\n" 179 "const int loadx = mad24(BLOCK_SIZE, i, lidx);\n" 180 "s_train[mad24(BLOCK_SIZE, lidx, lidy)] = loadx < train_cols ? train_vec[loadx] : 0;\n" 181 "barrier(CLK_LOCAL_MEM_FENCE);\n" 182 "result += reduce_multi_block(s_query, s_train, i, lidx, lidy);\n" 183 "barrier(CLK_LOCAL_MEM_FENCE);\n" 184 "}\n" 185 "#else\n" 186 "for (int i = 0, endq = (query_cols + BLOCK_SIZE - 1) / BLOCK_SIZE; i < endq; i++)\n" 187 "{\n" 188 "const int loadx = mad24(i, BLOCK_SIZE, lidx);\n" 189 "if (loadx < query_cols)\n" 190 "{\n" 191 "s_query[s_query_i] = query_vec[loadx];\n" 192 "s_train[s_train_i] = train_vec[loadx];\n" 193 "}\n" 194 "else\n" 195 "{\n" 196 "s_query[s_query_i] = 0;\n" 197 "s_train[s_train_i] = 0;\n" 198 "}\n" 199 "barrier(CLK_LOCAL_MEM_FENCE);\n" 200 "result += reduce_block_match(s_query, s_train, lidx, lidy);\n" 201 "barrier(CLK_LOCAL_MEM_FENCE);\n" 202 "}\n" 203 "#endif\n" 204 "result = DIST_RES(result);\n" 205 "const int trainIdx = mad24(BLOCK_SIZE, t, lidx);\n" 206 "if (queryIdx < query_rows && trainIdx < train_rows && result < myBestDistance )\n" 207 "{\n" 208 "myBestDistance = result;\n" 209 "myBestTrainIdx = trainIdx;\n" 210 "}\n" 211 "}\n" 212 "barrier(CLK_LOCAL_MEM_FENCE);\n" 213 "__local float *s_distance = (__local float *)sharebuffer;\n" 214 "__local int *s_trainIdx = (__local int *)(sharebuffer + BLOCK_SIZE_ODD * BLOCK_SIZE);\n" 215 "s_distance += lidy * BLOCK_SIZE_ODD;\n" 216 "s_trainIdx += lidy * BLOCK_SIZE_ODD;\n" 217 "s_distance[lidx] = myBestDistance;\n" 218 "s_trainIdx[lidx] = myBestTrainIdx;\n" 219 "barrier(CLK_LOCAL_MEM_FENCE);\n" 220 "#pragma unroll\n" 221 "for (int k = 0 ; k < BLOCK_SIZE; k++)\n" 222 "{\n" 223 "if (myBestDistance > s_distance[k])\n" 224 "{\n" 225 "myBestDistance = s_distance[k];\n" 226 "myBestTrainIdx = s_trainIdx[k];\n" 227 "}\n" 228 "}\n" 229 "if (queryIdx < query_rows && lidx == 0)\n" 230 "{\n" 231 "bestTrainIdx[queryIdx] = myBestTrainIdx;\n" 232 "bestDistance[queryIdx] = myBestDistance;\n" 233 "}\n" 234 "}\n" 235 "__kernel void BruteForceMatch_RadiusMatch(\n" 236 "__global T *query,\n" 237 "__global T *train,\n" 238 "float maxDistance,\n" 239 "__global int *bestTrainIdx,\n" 240 "__global float *bestDistance,\n" 241 "__global int *nMatches,\n" 242 "int query_rows,\n" 243 "int query_cols,\n" 244 "int train_rows,\n" 245 "int train_cols,\n" 246 "int bestTrainIdx_cols,\n" 247 "int step,\n" 248 "int ostep\n" 249 ")\n" 250 "{\n" 251 "const int lidx = get_local_id(0);\n" 252 "const int lidy = get_local_id(1);\n" 253 "const int groupidx = get_group_id(0);\n" 254 "const int groupidy = get_group_id(1);\n" 255 "const int queryIdx = mad24(BLOCK_SIZE, groupidy, lidy);\n" 256 "const int queryOffset = min(queryIdx, query_rows - 1) * step;\n" 257 "__global TN *query_vec = (__global TN *)(query + queryOffset);\n" 258 "const int trainIdx = mad24(BLOCK_SIZE, groupidx, lidx);\n" 259 "const int trainOffset = min(mad24(BLOCK_SIZE, groupidx, lidy), train_rows - 1) * step;\n" 260 "__global TN *train_vec = (__global TN *)(train + trainOffset);\n" 261 "query_cols /= kercn;\n" 262 "__local float sharebuffer[SHARED_MEM_SZ];\n" 263 "__local value_type *s_query = (__local value_type *)sharebuffer;\n" 264 "__local value_type *s_train = (__local value_type *)sharebuffer + BLOCK_SIZE_ODD * BLOCK_SIZE;\n" 265 "result_type result = 0;\n" 266 "const int s_query_i = mad24(BLOCK_SIZE_ODD, lidy, lidx);\n" 267 "const int s_train_i = mad24(BLOCK_SIZE_ODD, lidx, lidy);\n" 268 "for (int i = 0 ; i < (query_cols + BLOCK_SIZE - 1) / BLOCK_SIZE ; ++i)\n" 269 "{\n" 270 "const int loadx = mad24(BLOCK_SIZE, i, lidx);\n" 271 "if (loadx < query_cols)\n" 272 "{\n" 273 "s_query[s_query_i] = query_vec[loadx];\n" 274 "s_train[s_train_i] = train_vec[loadx];\n" 275 "}\n" 276 "else\n" 277 "{\n" 278 "s_query[s_query_i] = 0;\n" 279 "s_train[s_train_i] = 0;\n" 280 "}\n" 281 "barrier(CLK_LOCAL_MEM_FENCE);\n" 282 "result += reduce_block(s_query, s_train, lidx, lidy);\n" 283 "barrier(CLK_LOCAL_MEM_FENCE);\n" 284 "}\n" 285 "if (queryIdx < query_rows && trainIdx < train_rows && convert_float(result) < maxDistance)\n" 286 "{\n" 287 "int ind = atom_inc(nMatches + queryIdx);\n" 288 "if(ind < bestTrainIdx_cols)\n" 289 "{\n" 290 "bestTrainIdx[mad24(queryIdx, ostep, ind)] = trainIdx;\n" 291 "bestDistance[mad24(queryIdx, ostep, ind)] = result;\n" 292 "}\n" 293 "}\n" 294 "}\n" 295 "__kernel void BruteForceMatch_knnMatch(\n" 296 "__global T *query,\n" 297 "__global T *train,\n" 298 "__global int2 *bestTrainIdx,\n" 299 "__global float2 *bestDistance,\n" 300 "int query_rows,\n" 301 "int query_cols,\n" 302 "int train_rows,\n" 303 "int train_cols,\n" 304 "int step\n" 305 ")\n" 306 "{\n" 307 "const int lidx = get_local_id(0);\n" 308 "const int lidy = get_local_id(1);\n" 309 "const int groupidx = get_group_id(0);\n" 310 "const int queryIdx = mad24(BLOCK_SIZE, groupidx, lidy);\n" 311 "const int queryOffset = min(queryIdx, query_rows - 1) * step;\n" 312 "__global TN *query_vec = (__global TN *)(query + queryOffset);\n" 313 "query_cols /= kercn;\n" 314 "__local float sharebuffer[SHARED_MEM_SZ];\n" 315 "__local value_type *s_query = (__local value_type *)sharebuffer;\n" 316 "#if 0 < MAX_DESC_LEN\n" 317 "__local value_type *s_train = (__local value_type *)sharebuffer + BLOCK_SIZE * MAX_DESC_LEN;\n" 318 "#pragma unroll\n" 319 "for (int i = 0 ; i < MAX_DESC_LEN / BLOCK_SIZE; i ++)\n" 320 "{\n" 321 "int loadx = mad24(BLOCK_SIZE, i, lidx);\n" 322 "s_query[mad24(MAX_DESC_LEN, lidy, loadx)] = loadx < query_cols ? query_vec[loadx] : 0;\n" 323 "}\n" 324 "#else\n" 325 "__local value_type *s_train = (__local value_type *)sharebuffer + BLOCK_SIZE_ODD * BLOCK_SIZE;\n" 326 "const int s_query_i = mad24(BLOCK_SIZE_ODD, lidy, lidx);\n" 327 "const int s_train_i = mad24(BLOCK_SIZE_ODD, lidx, lidy);\n" 328 "#endif\n" 329 "float myBestDistance1 = MAX_FLOAT;\n" 330 "float myBestDistance2 = MAX_FLOAT;\n" 331 "int myBestTrainIdx1 = -1;\n" 332 "int myBestTrainIdx2 = -1;\n" 333 "for (int t = 0, endt = (train_rows + BLOCK_SIZE - 1) / BLOCK_SIZE; t < endt ; t++)\n" 334 "{\n" 335 "result_type result = 0;\n" 336 "int trainOffset = min(mad24(BLOCK_SIZE, t, lidy), train_rows - 1) * step;\n" 337 "__global TN *train_vec = (__global TN *)(train + trainOffset);\n" 338 "#if 0 < MAX_DESC_LEN\n" 339 "#pragma unroll\n" 340 "for (int i = 0 ; i < MAX_DESC_LEN / BLOCK_SIZE ; i++)\n" 341 "{\n" 342 "const int loadx = mad24(BLOCK_SIZE, i, lidx);\n" 343 "s_train[mad24(BLOCK_SIZE, lidx, lidy)] = loadx < train_cols ? train_vec[loadx] : 0;\n" 344 "barrier(CLK_LOCAL_MEM_FENCE);\n" 345 "result += reduce_multi_block(s_query, s_train, i, lidx, lidy);\n" 346 "barrier(CLK_LOCAL_MEM_FENCE);\n" 347 "}\n" 348 "#else\n" 349 "for (int i = 0, endq = (query_cols + BLOCK_SIZE -1) / BLOCK_SIZE; i < endq ; i++)\n" 350 "{\n" 351 "const int loadx = mad24(BLOCK_SIZE, i, lidx);\n" 352 "if (loadx < query_cols)\n" 353 "{\n" 354 "s_query[s_query_i] = query_vec[loadx];\n" 355 "s_train[s_train_i] = train_vec[loadx];\n" 356 "}\n" 357 "else\n" 358 "{\n" 359 "s_query[s_query_i] = 0;\n" 360 "s_train[s_train_i] = 0;\n" 361 "}\n" 362 "barrier(CLK_LOCAL_MEM_FENCE);\n" 363 "result += reduce_block_match(s_query, s_train, lidx, lidy);\n" 364 "barrier(CLK_LOCAL_MEM_FENCE);\n" 365 "}\n" 366 "#endif\n" 367 "result = DIST_RES(result);\n" 368 "const int trainIdx = mad24(BLOCK_SIZE, t, lidx);\n" 369 "if (queryIdx < query_rows && trainIdx < train_rows)\n" 370 "{\n" 371 "if (result < myBestDistance1)\n" 372 "{\n" 373 "myBestDistance2 = myBestDistance1;\n" 374 "myBestTrainIdx2 = myBestTrainIdx1;\n" 375 "myBestDistance1 = result;\n" 376 "myBestTrainIdx1 = trainIdx;\n" 377 "}\n" 378 "else if (result < myBestDistance2)\n" 379 "{\n" 380 "myBestDistance2 = result;\n" 381 "myBestTrainIdx2 = trainIdx;\n" 382 "}\n" 383 "}\n" 384 "}\n" 385 "barrier(CLK_LOCAL_MEM_FENCE);\n" 386 "__local float *s_distance = (__local float *)sharebuffer;\n" 387 "__local int *s_trainIdx = (__local int *)(sharebuffer + BLOCK_SIZE_ODD * BLOCK_SIZE);\n" 388 "s_distance += lidy * BLOCK_SIZE_ODD;\n" 389 "s_trainIdx += lidy * BLOCK_SIZE_ODD;\n" 390 "s_distance[lidx] = myBestDistance1;\n" 391 "s_trainIdx[lidx] = myBestTrainIdx1;\n" 392 "float bestDistance1 = MAX_FLOAT;\n" 393 "float bestDistance2 = MAX_FLOAT;\n" 394 "int bestTrainIdx1 = -1;\n" 395 "int bestTrainIdx2 = -1;\n" 396 "barrier(CLK_LOCAL_MEM_FENCE);\n" 397 "if (lidx == 0)\n" 398 "{\n" 399 "for (int i = 0 ; i < BLOCK_SIZE ; i++)\n" 400 "{\n" 401 "float val = s_distance[i];\n" 402 "if (val < bestDistance1)\n" 403 "{\n" 404 "bestDistance2 = bestDistance1;\n" 405 "bestTrainIdx2 = bestTrainIdx1;\n" 406 "bestDistance1 = val;\n" 407 "bestTrainIdx1 = s_trainIdx[i];\n" 408 "}\n" 409 "else if (val < bestDistance2)\n" 410 "{\n" 411 "bestDistance2 = val;\n" 412 "bestTrainIdx2 = s_trainIdx[i];\n" 413 "}\n" 414 "}\n" 415 "}\n" 416 "barrier(CLK_LOCAL_MEM_FENCE);\n" 417 "s_distance[lidx] = myBestDistance2;\n" 418 "s_trainIdx[lidx] = myBestTrainIdx2;\n" 419 "barrier(CLK_LOCAL_MEM_FENCE);\n" 420 "if (lidx == 0)\n" 421 "{\n" 422 "for (int i = 0 ; i < BLOCK_SIZE ; i++)\n" 423 "{\n" 424 "float val = s_distance[i];\n" 425 "if (val < bestDistance2)\n" 426 "{\n" 427 "bestDistance2 = val;\n" 428 "bestTrainIdx2 = s_trainIdx[i];\n" 429 "}\n" 430 "}\n" 431 "}\n" 432 "myBestDistance1 = bestDistance1;\n" 433 "myBestDistance2 = bestDistance2;\n" 434 "myBestTrainIdx1 = bestTrainIdx1;\n" 435 "myBestTrainIdx2 = bestTrainIdx2;\n" 436 "if (queryIdx < query_rows && lidx == 0)\n" 437 "{\n" 438 "bestTrainIdx[queryIdx] = (int2)(myBestTrainIdx1, myBestTrainIdx2);\n" 439 "bestDistance[queryIdx] = (float2)(myBestDistance1, myBestDistance2);\n" 440 "}\n" 441 "}\n" 442 , "35c3a1e231d446e4088561e3604fb94f"}; 443 ProgramSource brute_force_match_oclsrc(brute_force_match.programStr); 444 const struct ProgramEntry fast={"fast", 445 "inline int cornerScore(__global const uchar* img, int step)\n" 446 "{\n" 447 "int k, tofs, v = img[0], a0 = 0, b0;\n" 448 "int d[16];\n" 449 "#define LOAD2(idx, ofs) \\\n" 450 "tofs = ofs; d[idx] = (short)(v - img[tofs]); d[idx+8] = (short)(v - img[-tofs])\n" 451 "LOAD2(0, 3);\n" 452 "LOAD2(1, -step+3);\n" 453 "LOAD2(2, -step*2+2);\n" 454 "LOAD2(3, -step*3+1);\n" 455 "LOAD2(4, -step*3);\n" 456 "LOAD2(5, -step*3-1);\n" 457 "LOAD2(6, -step*2-2);\n" 458 "LOAD2(7, -step-3);\n" 459 "#pragma unroll\n" 460 "for( k = 0; k < 16; k += 2 )\n" 461 "{\n" 462 "int a = min((int)d[(k+1)&15], (int)d[(k+2)&15]);\n" 463 "a = min(a, (int)d[(k+3)&15]);\n" 464 "a = min(a, (int)d[(k+4)&15]);\n" 465 "a = min(a, (int)d[(k+5)&15]);\n" 466 "a = min(a, (int)d[(k+6)&15]);\n" 467 "a = min(a, (int)d[(k+7)&15]);\n" 468 "a = min(a, (int)d[(k+8)&15]);\n" 469 "a0 = max(a0, min(a, (int)d[k&15]));\n" 470 "a0 = max(a0, min(a, (int)d[(k+9)&15]));\n" 471 "}\n" 472 "b0 = -a0;\n" 473 "#pragma unroll\n" 474 "for( k = 0; k < 16; k += 2 )\n" 475 "{\n" 476 "int b = max((int)d[(k+1)&15], (int)d[(k+2)&15]);\n" 477 "b = max(b, (int)d[(k+3)&15]);\n" 478 "b = max(b, (int)d[(k+4)&15]);\n" 479 "b = max(b, (int)d[(k+5)&15]);\n" 480 "b = max(b, (int)d[(k+6)&15]);\n" 481 "b = max(b, (int)d[(k+7)&15]);\n" 482 "b = max(b, (int)d[(k+8)&15]);\n" 483 "b0 = min(b0, max(b, (int)d[k]));\n" 484 "b0 = min(b0, max(b, (int)d[(k+9)&15]));\n" 485 "}\n" 486 "return -b0-1;\n" 487 "}\n" 488 "__kernel\n" 489 "void FAST_findKeypoints(\n" 490 "__global const uchar * _img, int step, int img_offset,\n" 491 "int img_rows, int img_cols,\n" 492 "volatile __global int* kp_loc,\n" 493 "int max_keypoints, int threshold )\n" 494 "{\n" 495 "int j = get_global_id(0) + 3;\n" 496 "int i = get_global_id(1) + 3;\n" 497 "if (i < img_rows - 3 && j < img_cols - 3)\n" 498 "{\n" 499 "__global const uchar* img = _img + mad24(i, step, j + img_offset);\n" 500 "int v = img[0], t0 = v - threshold, t1 = v + threshold;\n" 501 "int k, tofs, v0, v1;\n" 502 "int m0 = 0, m1 = 0;\n" 503 "#define UPDATE_MASK(idx, ofs) \\\n" 504 "tofs = ofs; v0 = img[tofs]; v1 = img[-tofs]; \\\n" 505 "m0 |= ((v0 < t0) << idx) | ((v1 < t0) << (8 + idx)); \\\n" 506 "m1 |= ((v0 > t1) << idx) | ((v1 > t1) << (8 + idx))\n" 507 "UPDATE_MASK(0, 3);\n" 508 "if( (m0 | m1) == 0 )\n" 509 "return;\n" 510 "UPDATE_MASK(2, -step*2+2);\n" 511 "UPDATE_MASK(4, -step*3);\n" 512 "UPDATE_MASK(6, -step*2-2);\n" 513 "#define EVEN_MASK (1+4+16+64)\n" 514 "if( ((m0 | (m0 >> 8)) & EVEN_MASK) != EVEN_MASK &&\n" 515 "((m1 | (m1 >> 8)) & EVEN_MASK) != EVEN_MASK )\n" 516 "return;\n" 517 "UPDATE_MASK(1, -step+3);\n" 518 "UPDATE_MASK(3, -step*3+1);\n" 519 "UPDATE_MASK(5, -step*3-1);\n" 520 "UPDATE_MASK(7, -step-3);\n" 521 "if( ((m0 | (m0 >> 8)) & 255) != 255 &&\n" 522 "((m1 | (m1 >> 8)) & 255) != 255 )\n" 523 "return;\n" 524 "m0 |= m0 << 16;\n" 525 "m1 |= m1 << 16;\n" 526 "#define CHECK0(i) ((m0 & (511 << i)) == (511 << i))\n" 527 "#define CHECK1(i) ((m1 & (511 << i)) == (511 << i))\n" 528 "if( CHECK0(0) + CHECK0(1) + CHECK0(2) + CHECK0(3) +\n" 529 "CHECK0(4) + CHECK0(5) + CHECK0(6) + CHECK0(7) +\n" 530 "CHECK0(8) + CHECK0(9) + CHECK0(10) + CHECK0(11) +\n" 531 "CHECK0(12) + CHECK0(13) + CHECK0(14) + CHECK0(15) +\n" 532 "CHECK1(0) + CHECK1(1) + CHECK1(2) + CHECK1(3) +\n" 533 "CHECK1(4) + CHECK1(5) + CHECK1(6) + CHECK1(7) +\n" 534 "CHECK1(8) + CHECK1(9) + CHECK1(10) + CHECK1(11) +\n" 535 "CHECK1(12) + CHECK1(13) + CHECK1(14) + CHECK1(15) == 0 )\n" 536 "return;\n" 537 "{\n" 538 "int idx = atomic_inc(kp_loc);\n" 539 "if( idx < max_keypoints )\n" 540 "{\n" 541 "kp_loc[1 + 2*idx] = j;\n" 542 "kp_loc[2 + 2*idx] = i;\n" 543 "}\n" 544 "}\n" 545 "}\n" 546 "}\n" 547 "__kernel\n" 548 "void FAST_nonmaxSupression(\n" 549 "__global const int* kp_in, volatile __global int* kp_out,\n" 550 "__global const uchar * _img, int step, int img_offset,\n" 551 "int rows, int cols, int counter, int max_keypoints)\n" 552 "{\n" 553 "const int idx = get_global_id(0);\n" 554 "if (idx < counter)\n" 555 "{\n" 556 "int x = kp_in[1 + 2*idx];\n" 557 "int y = kp_in[2 + 2*idx];\n" 558 "__global const uchar* img = _img + mad24(y, step, x + img_offset);\n" 559 "int s = cornerScore(img, step);\n" 560 "if( (x < 4 || s > cornerScore(img-1, step)) +\n" 561 "(y < 4 || s > cornerScore(img-step, step)) != 2 )\n" 562 "return;\n" 563 "if( (x >= cols - 4 || s > cornerScore(img+1, step)) +\n" 564 "(y >= rows - 4 || s > cornerScore(img+step, step)) +\n" 565 "(x < 4 || y < 4 || s > cornerScore(img-step-1, step)) +\n" 566 "(x >= cols - 4 || y < 4 || s > cornerScore(img-step+1, step)) +\n" 567 "(x < 4 || y >= rows - 4 || s > cornerScore(img+step-1, step)) +\n" 568 "(x >= cols - 4 || y >= rows - 4 || s > cornerScore(img+step+1, step)) == 6)\n" 569 "{\n" 570 "int new_idx = atomic_inc(kp_out);\n" 571 "if( new_idx < max_keypoints )\n" 572 "{\n" 573 "kp_out[1 + 3*new_idx] = x;\n" 574 "kp_out[2 + 3*new_idx] = y;\n" 575 "kp_out[3 + 3*new_idx] = s;\n" 576 "}\n" 577 "}\n" 578 "}\n" 579 "}\n" 580 , "f5e6f463f21a7ed77bd4d2c753478305"}; 581 ProgramSource fast_oclsrc(fast.programStr); 582 const struct ProgramEntry orb={"orb", 583 "#define LAYERINFO_SIZE 1\n" 584 "#define LAYERINFO_OFS 0\n" 585 "#define KEYPOINT_SIZE 3\n" 586 "#define ORIENTED_KEYPOINT_SIZE 4\n" 587 "#define KEYPOINT_X 0\n" 588 "#define KEYPOINT_Y 1\n" 589 "#define KEYPOINT_Z 2\n" 590 "#define KEYPOINT_ANGLE 3\n" 591 "#ifdef ORB_RESPONSES\n" 592 "__kernel void\n" 593 "ORB_HarrisResponses(__global const uchar* imgbuf, int imgstep, int imgoffset0,\n" 594 "__global const int* layerinfo, __global const int* keypoints,\n" 595 "__global float* responses, int nkeypoints )\n" 596 "{\n" 597 "int idx = get_global_id(0);\n" 598 "if( idx < nkeypoints )\n" 599 "{\n" 600 "__global const int* kpt = keypoints + idx*KEYPOINT_SIZE;\n" 601 "__global const int* layer = layerinfo + kpt[KEYPOINT_Z]*LAYERINFO_SIZE;\n" 602 "__global const uchar* img = imgbuf + imgoffset0 + layer[LAYERINFO_OFS] +\n" 603 "(kpt[KEYPOINT_Y] - blockSize/2)*imgstep + (kpt[KEYPOINT_X] - blockSize/2);\n" 604 "int i, j;\n" 605 "int a = 0, b = 0, c = 0;\n" 606 "for( i = 0; i < blockSize; i++, img += imgstep-blockSize )\n" 607 "{\n" 608 "for( j = 0; j < blockSize; j++, img++ )\n" 609 "{\n" 610 "int Ix = (img[1] - img[-1])*2 + img[-imgstep+1] - img[-imgstep-1] + img[imgstep+1] - img[imgstep-1];\n" 611 "int Iy = (img[imgstep] - img[-imgstep])*2 + img[imgstep-1] - img[-imgstep-1] + img[imgstep+1] - img[-imgstep+1];\n" 612 "a += Ix*Ix;\n" 613 "b += Iy*Iy;\n" 614 "c += Ix*Iy;\n" 615 "}\n" 616 "}\n" 617 "responses[idx] = ((float)a * b - (float)c * c - HARRIS_K * (float)(a + b) * (a + b))*scale_sq_sq;\n" 618 "}\n" 619 "}\n" 620 "#endif\n" 621 "#ifdef ORB_ANGLES\n" 622 "#define _DBL_EPSILON 2.2204460492503131e-16f\n" 623 "#define atan2_p1 (0.9997878412794807f*57.29577951308232f)\n" 624 "#define atan2_p3 (-0.3258083974640975f*57.29577951308232f)\n" 625 "#define atan2_p5 (0.1555786518463281f*57.29577951308232f)\n" 626 "#define atan2_p7 (-0.04432655554792128f*57.29577951308232f)\n" 627 "inline float fastAtan2( float y, float x )\n" 628 "{\n" 629 "float ax = fabs(x), ay = fabs(y);\n" 630 "float a, c, c2;\n" 631 "if( ax >= ay )\n" 632 "{\n" 633 "c = ay/(ax + _DBL_EPSILON);\n" 634 "c2 = c*c;\n" 635 "a = (((atan2_p7*c2 + atan2_p5)*c2 + atan2_p3)*c2 + atan2_p1)*c;\n" 636 "}\n" 637 "else\n" 638 "{\n" 639 "c = ax/(ay + _DBL_EPSILON);\n" 640 "c2 = c*c;\n" 641 "a = 90.f - (((atan2_p7*c2 + atan2_p5)*c2 + atan2_p3)*c2 + atan2_p1)*c;\n" 642 "}\n" 643 "if( x < 0 )\n" 644 "a = 180.f - a;\n" 645 "if( y < 0 )\n" 646 "a = 360.f - a;\n" 647 "return a;\n" 648 "}\n" 649 "__kernel void\n" 650 "ORB_ICAngle(__global const uchar* imgbuf, int imgstep, int imgoffset0,\n" 651 "__global const int* layerinfo, __global const int* keypoints,\n" 652 "__global float* responses, const __global int* u_max,\n" 653 "int nkeypoints, int half_k )\n" 654 "{\n" 655 "int idx = get_global_id(0);\n" 656 "if( idx < nkeypoints )\n" 657 "{\n" 658 "__global const int* kpt = keypoints + idx*KEYPOINT_SIZE;\n" 659 "__global const int* layer = layerinfo + kpt[KEYPOINT_Z]*LAYERINFO_SIZE;\n" 660 "__global const uchar* center = imgbuf + imgoffset0 + layer[LAYERINFO_OFS] +\n" 661 "kpt[KEYPOINT_Y]*imgstep + kpt[KEYPOINT_X];\n" 662 "int u, v, m_01 = 0, m_10 = 0;\n" 663 "for( u = -half_k; u <= half_k; u++ )\n" 664 "m_10 += u * center[u];\n" 665 "for( v = 1; v <= half_k; v++ )\n" 666 "{\n" 667 "int v_sum = 0;\n" 668 "int d = u_max[v];\n" 669 "for( u = -d; u <= d; u++ )\n" 670 "{\n" 671 "int val_plus = center[u + v*imgstep], val_minus = center[u - v*imgstep];\n" 672 "v_sum += (val_plus - val_minus);\n" 673 "m_10 += u * (val_plus + val_minus);\n" 674 "}\n" 675 "m_01 += v * v_sum;\n" 676 "}\n" 677 "responses[idx] = fastAtan2((float)m_01, (float)m_10);\n" 678 "}\n" 679 "}\n" 680 "#endif\n" 681 "#ifdef ORB_DESCRIPTORS\n" 682 "__kernel void\n" 683 "ORB_computeDescriptor(__global const uchar* imgbuf, int imgstep, int imgoffset0,\n" 684 "__global const int* layerinfo, __global const int* keypoints,\n" 685 "__global uchar* _desc, const __global int* pattern,\n" 686 "int nkeypoints, int dsize )\n" 687 "{\n" 688 "int idx = get_global_id(0);\n" 689 "if( idx < nkeypoints )\n" 690 "{\n" 691 "int i;\n" 692 "__global const int* kpt = keypoints + idx*ORIENTED_KEYPOINT_SIZE;\n" 693 "__global const int* layer = layerinfo + kpt[KEYPOINT_Z]*LAYERINFO_SIZE;\n" 694 "__global const uchar* center = imgbuf + imgoffset0 + layer[LAYERINFO_OFS] +\n" 695 "kpt[KEYPOINT_Y]*imgstep + kpt[KEYPOINT_X];\n" 696 "float angle = as_float(kpt[KEYPOINT_ANGLE]);\n" 697 "angle *= 0.01745329251994329547f;\n" 698 "float cosa;\n" 699 "float sina = sincos(angle, &cosa);\n" 700 "__global uchar* desc = _desc + idx*dsize;\n" 701 "#define GET_VALUE(idx) \\\n" 702 "center[mad24(convert_int_rte(pattern[(idx)*2] * sina + pattern[(idx)*2+1] * cosa), imgstep, \\\n" 703 "convert_int_rte(pattern[(idx)*2] * cosa - pattern[(idx)*2+1] * sina))]\n" 704 "for( i = 0; i < dsize; i++ )\n" 705 "{\n" 706 "int val;\n" 707 "#if WTA_K == 2\n" 708 "int t0, t1;\n" 709 "t0 = GET_VALUE(0); t1 = GET_VALUE(1);\n" 710 "val = t0 < t1;\n" 711 "t0 = GET_VALUE(2); t1 = GET_VALUE(3);\n" 712 "val |= (t0 < t1) << 1;\n" 713 "t0 = GET_VALUE(4); t1 = GET_VALUE(5);\n" 714 "val |= (t0 < t1) << 2;\n" 715 "t0 = GET_VALUE(6); t1 = GET_VALUE(7);\n" 716 "val |= (t0 < t1) << 3;\n" 717 "t0 = GET_VALUE(8); t1 = GET_VALUE(9);\n" 718 "val |= (t0 < t1) << 4;\n" 719 "t0 = GET_VALUE(10); t1 = GET_VALUE(11);\n" 720 "val |= (t0 < t1) << 5;\n" 721 "t0 = GET_VALUE(12); t1 = GET_VALUE(13);\n" 722 "val |= (t0 < t1) << 6;\n" 723 "t0 = GET_VALUE(14); t1 = GET_VALUE(15);\n" 724 "val |= (t0 < t1) << 7;\n" 725 "pattern += 16*2;\n" 726 "#elif WTA_K == 3\n" 727 "int t0, t1, t2;\n" 728 "t0 = GET_VALUE(0); t1 = GET_VALUE(1); t2 = GET_VALUE(2);\n" 729 "val = t2 > t1 ? (t2 > t0 ? 2 : 0) : (t1 > t0);\n" 730 "t0 = GET_VALUE(3); t1 = GET_VALUE(4); t2 = GET_VALUE(5);\n" 731 "val |= (t2 > t1 ? (t2 > t0 ? 2 : 0) : (t1 > t0)) << 2;\n" 732 "t0 = GET_VALUE(6); t1 = GET_VALUE(7); t2 = GET_VALUE(8);\n" 733 "val |= (t2 > t1 ? (t2 > t0 ? 2 : 0) : (t1 > t0)) << 4;\n" 734 "t0 = GET_VALUE(9); t1 = GET_VALUE(10); t2 = GET_VALUE(11);\n" 735 "val |= (t2 > t1 ? (t2 > t0 ? 2 : 0) : (t1 > t0)) << 6;\n" 736 "pattern += 12*2;\n" 737 "#elif WTA_K == 4\n" 738 "int t0, t1, t2, t3, k;\n" 739 "int a, b;\n" 740 "t0 = GET_VALUE(0); t1 = GET_VALUE(1);\n" 741 "t2 = GET_VALUE(2); t3 = GET_VALUE(3);\n" 742 "a = 0, b = 2;\n" 743 "if( t1 > t0 ) t0 = t1, a = 1;\n" 744 "if( t3 > t2 ) t2 = t3, b = 3;\n" 745 "k = t0 > t2 ? a : b;\n" 746 "val = k;\n" 747 "t0 = GET_VALUE(4); t1 = GET_VALUE(5);\n" 748 "t2 = GET_VALUE(6); t3 = GET_VALUE(7);\n" 749 "a = 0, b = 2;\n" 750 "if( t1 > t0 ) t0 = t1, a = 1;\n" 751 "if( t3 > t2 ) t2 = t3, b = 3;\n" 752 "k = t0 > t2 ? a : b;\n" 753 "val |= k << 2;\n" 754 "t0 = GET_VALUE(8); t1 = GET_VALUE(9);\n" 755 "t2 = GET_VALUE(10); t3 = GET_VALUE(11);\n" 756 "a = 0, b = 2;\n" 757 "if( t1 > t0 ) t0 = t1, a = 1;\n" 758 "if( t3 > t2 ) t2 = t3, b = 3;\n" 759 "k = t0 > t2 ? a : b;\n" 760 "val |= k << 4;\n" 761 "t0 = GET_VALUE(12); t1 = GET_VALUE(13);\n" 762 "t2 = GET_VALUE(14); t3 = GET_VALUE(15);\n" 763 "a = 0, b = 2;\n" 764 "if( t1 > t0 ) t0 = t1, a = 1;\n" 765 "if( t3 > t2 ) t2 = t3, b = 3;\n" 766 "k = t0 > t2 ? a : b;\n" 767 "val |= k << 6;\n" 768 "pattern += 16*2;\n" 769 "#else\n" 770 "#error \"unknown/undefined WTA_K value; should be 2, 3 or 4\"\n" 771 "#endif\n" 772 "desc[i] = (uchar)val;\n" 773 "}\n" 774 "}\n" 775 "}\n" 776 "#endif\n" 777 , "a7c2cfaeda19907b637211b1cc91d253"}; 778 ProgramSource orb_oclsrc(orb.programStr); 779 } 780 }} 781