1/* 2 * This file is part of FFmpeg. 3 * 4 * FFmpeg is free software; you can redistribute it and/or 5 * modify it under the terms of the GNU Lesser General Public 6 * License as published by the Free Software Foundation; either 7 * version 2.1 of the License, or (at your option) any later version. 8 * 9 * FFmpeg is distributed in the hope that it will be useful, 10 * but WITHOUT ANY WARRANTY; without even the implied warranty of 11 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU 12 * Lesser General Public License for more details. 13 * 14 * You should have received a copy of the GNU Lesser General Public 15 * License along with FFmpeg; if not, write to the Free Software 16 * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA 17 * 18 * Copyright (C) 2000, Intel Corporation, all rights reserved. 19 * Copyright (C) 2013, OpenCV Foundation, all rights reserved. 20 * Third party copyrights are property of their respective owners. 21 * 22 * Redistribution and use in source and binary forms, with or without modification, 23 * are permitted provided that the following conditions are met: 24 * 25 * * Redistribution's of source code must retain the above copyright notice, 26 * this list of conditions and the following disclaimer. 27 * 28 * * Redistribution's in binary form must reproduce the above copyright notice, 29 * this list of conditions and the following disclaimer in the documentation 30 * and/or other materials provided with the distribution. 31 * 32 * * The name of the copyright holders may not be used to endorse or promote products 33 * derived from this software without specific prior written permission. 34 * 35 * This software is provided by the copyright holders and contributors "as is" and 36 * any express or implied warranties, including, but not limited to, the implied 37 * warranties of merchantability and fitness for a particular purpose are disclaimed. 38 * In no event shall the Intel Corporation or contributors be liable for any direct, 39 * indirect, incidental, special, exemplary, or consequential damages 40 * (including, but not limited to, procurement of substitute goods or services; 41 * loss of use, data, or profits; or business interruption) however caused 42 * and on any theory of liability, whether in contract, strict liability, 43 * or tort (including negligence or otherwise) arising in any way out of 44 * the use of this software, even if advised of the possibility of such damage. 45 */ 46 47#define HARRIS_THRESHOLD 3.0f 48// Block size over which to compute harris response 49// 50// Note that changing this will require fiddling with the local array sizes in 51// harris_response 52#define HARRIS_RADIUS 2 53#define DISTANCE_THRESHOLD 80 54 55// Sub-pixel refinement window for feature points 56#define REFINE_WIN_HALF_W 5 57#define REFINE_WIN_HALF_H 5 58#define REFINE_WIN_W 11 // REFINE_WIN_HALF_W * 2 + 1 59#define REFINE_WIN_H 11 60 61// Non-maximum suppression window size 62#define NONMAX_WIN 30 63#define NONMAX_WIN_HALF 15 // NONMAX_WIN / 2 64 65typedef struct PointPair { 66 // Previous frame 67 float2 p1; 68 // Current frame 69 float2 p2; 70} PointPair; 71 72typedef struct SmoothedPointPair { 73 // Non-smoothed point in current frame 74 int2 p1; 75 // Smoothed point in current frame 76 float2 p2; 77} SmoothedPointPair; 78 79typedef struct MotionVector { 80 PointPair p; 81 // Used to mark vectors as potential outliers 82 int should_consider; 83} MotionVector; 84 85const sampler_t sampler = CLK_NORMALIZED_COORDS_FALSE | 86 CLK_ADDRESS_CLAMP_TO_EDGE | 87 CLK_FILTER_NEAREST; 88 89const sampler_t sampler_linear = CLK_NORMALIZED_COORDS_FALSE | 90 CLK_ADDRESS_CLAMP_TO_EDGE | 91 CLK_FILTER_LINEAR; 92 93const sampler_t sampler_linear_mirror = CLK_NORMALIZED_COORDS_TRUE | 94 CLK_ADDRESS_MIRRORED_REPEAT | 95 CLK_FILTER_LINEAR; 96 97// Writes to a 1D array at loc, treating it as a 2D array with the same 98// dimensions as the global work size. 99static void write_to_1d_arrf(__global float *buf, int2 loc, float val) { 100 buf[loc.x + loc.y * get_global_size(0)] = val; 101} 102 103static void write_to_1d_arrul8(__global ulong8 *buf, int2 loc, ulong8 val) { 104 buf[loc.x + loc.y * get_global_size(0)] = val; 105} 106 107static void write_to_1d_arrvec(__global MotionVector *buf, int2 loc, MotionVector val) { 108 buf[loc.x + loc.y * get_global_size(0)] = val; 109} 110 111static void write_to_1d_arrf2(__global float2 *buf, int2 loc, float2 val) { 112 buf[loc.x + loc.y * get_global_size(0)] = val; 113} 114 115static ulong8 read_from_1d_arrul8(__global const ulong8 *buf, int2 loc) { 116 return buf[loc.x + loc.y * get_global_size(0)]; 117} 118 119static float2 read_from_1d_arrf2(__global const float2 *buf, int2 loc) { 120 return buf[loc.x + loc.y * get_global_size(0)]; 121} 122 123// Returns the grayscale value at the given point. 124static float pixel_grayscale(__read_only image2d_t src, int2 loc) { 125 float4 pixel = read_imagef(src, sampler, loc); 126 return (pixel.x + pixel.y + pixel.z) / 3.0f; 127} 128 129static float convolve( 130 __local const float *grayscale, 131 int local_idx_x, 132 int local_idx_y, 133 float mask[3][3] 134) { 135 float ret = 0; 136 137 // These loops touch each pixel surrounding loc as well as loc itself 138 for (int i = 1, i2 = 0; i >= -1; --i, ++i2) { 139 for (int j = -1, j2 = 0; j <= 1; ++j, ++j2) { 140 ret += mask[i2][j2] * grayscale[(local_idx_x + 3 + j) + (local_idx_y + 3 + i) * 14]; 141 } 142 } 143 144 return ret; 145} 146 147// Sums dx * dy for all pixels within radius of loc 148static float sum_deriv_prod( 149 __local const float *grayscale, 150 float mask_x[3][3], 151 float mask_y[3][3] 152) { 153 float ret = 0; 154 155 for (int i = HARRIS_RADIUS; i >= -HARRIS_RADIUS; --i) { 156 for (int j = -HARRIS_RADIUS; j <= HARRIS_RADIUS; ++j) { 157 ret += convolve(grayscale, get_local_id(0) + j, get_local_id(1) + i, mask_x) * 158 convolve(grayscale, get_local_id(0) + j, get_local_id(1) + i, mask_y); 159 } 160 } 161 162 return ret; 163} 164 165// Sums d<>^2 (determined by mask) for all pixels within radius of loc 166static float sum_deriv_pow(__local const float *grayscale, float mask[3][3]) 167{ 168 float ret = 0; 169 170 for (int i = HARRIS_RADIUS; i >= -HARRIS_RADIUS; --i) { 171 for (int j = -HARRIS_RADIUS; j <= HARRIS_RADIUS; ++j) { 172 float deriv = convolve(grayscale, get_local_id(0) + j, get_local_id(1) + i, mask); 173 ret += deriv * deriv; 174 } 175 } 176 177 return ret; 178} 179 180// Fills a box with the given radius and pixel around loc 181static void draw_box(__write_only image2d_t dst, int2 loc, float4 pixel, int radius) 182{ 183 for (int i = -radius; i <= radius; ++i) { 184 for (int j = -radius; j <= radius; ++j) { 185 write_imagef( 186 dst, 187 (int2)( 188 // Clamp to avoid writing outside image bounds 189 clamp(loc.x + i, 0, get_image_dim(dst).x - 1), 190 clamp(loc.y + j, 0, get_image_dim(dst).y - 1) 191 ), 192 pixel 193 ); 194 } 195 } 196} 197 198// Converts the src image to grayscale 199__kernel void grayscale( 200 __read_only image2d_t src, 201 __write_only image2d_t grayscale 202) { 203 int2 loc = (int2)(get_global_id(0), get_global_id(1)); 204 write_imagef(grayscale, loc, (float4)(pixel_grayscale(src, loc), 0.0f, 0.0f, 1.0f)); 205} 206 207// This kernel computes the harris response for the given grayscale src image 208// within the given radius and writes it to harris_buf 209__kernel void harris_response( 210 __read_only image2d_t grayscale, 211 __global float *harris_buf 212) { 213 int2 loc = (int2)(get_global_id(0), get_global_id(1)); 214 215 if (loc.x > get_image_width(grayscale) - 1 || loc.y > get_image_height(grayscale) - 1) { 216 write_to_1d_arrf(harris_buf, loc, 0); 217 return; 218 } 219 220 float scale = 1.0f / ((1 << 2) * HARRIS_RADIUS * 255.0f); 221 222 float sobel_mask_x[3][3] = { 223 {-1, 0, 1}, 224 {-2, 0, 2}, 225 {-1, 0, 1} 226 }; 227 228 float sobel_mask_y[3][3] = { 229 { 1, 2, 1}, 230 { 0, 0, 0}, 231 {-1, -2, -1} 232 }; 233 234 // 8 x 8 local work + 3 pixels around each side (needed to accomodate for the 235 // block size radius of 2) 236 __local float grayscale_data[196]; 237 238 int idx = get_group_id(0) * get_local_size(0); 239 int idy = get_group_id(1) * get_local_size(1); 240 241 for (int i = idy - 3, it = 0; i < idy + (int)get_local_size(1) + 3; i++, it++) { 242 for (int j = idx - 3, jt = 0; j < idx + (int)get_local_size(0) + 3; j++, jt++) { 243 grayscale_data[jt + it * 14] = read_imagef(grayscale, sampler, (int2)(j, i)).x; 244 } 245 } 246 247 barrier(CLK_LOCAL_MEM_FENCE); 248 249 float sumdxdy = sum_deriv_prod(grayscale_data, sobel_mask_x, sobel_mask_y); 250 float sumdx2 = sum_deriv_pow(grayscale_data, sobel_mask_x); 251 float sumdy2 = sum_deriv_pow(grayscale_data, sobel_mask_y); 252 253 float trace = sumdx2 + sumdy2; 254 // r = det(M) - k(trace(M))^2 255 // k usually between 0.04 to 0.06 256 float r = (sumdx2 * sumdy2 - sumdxdy * sumdxdy) - 0.04f * (trace * trace) * pown(scale, 4); 257 258 // Threshold the r value 259 harris_buf[loc.x + loc.y * get_image_width(grayscale)] = r * step(HARRIS_THRESHOLD, r); 260} 261 262// Gets a patch centered around a float coordinate from a grayscale image using 263// bilinear interpolation 264static void get_rect_sub_pix( 265 __read_only image2d_t grayscale, 266 float *buffer, 267 int size_x, 268 int size_y, 269 float2 center 270) { 271 float2 offset = ((float2)(size_x, size_y) - 1.0f) * 0.5f; 272 273 for (int i = 0; i < size_y; i++) { 274 for (int j = 0; j < size_x; j++) { 275 buffer[i * size_x + j] = read_imagef( 276 grayscale, 277 sampler_linear, 278 (float2)(j, i) + center - offset 279 ).x * 255.0f; 280 } 281 } 282} 283 284// Refines detected features at a sub-pixel level 285// 286// This function is ported from OpenCV 287static float2 corner_sub_pix( 288 __read_only image2d_t grayscale, 289 float2 feature, 290 float *mask 291) { 292 float2 init = feature; 293 int src_width = get_global_size(0); 294 int src_height = get_global_size(1); 295 296 const int max_iters = 40; 297 const float eps = 0.001f * 0.001f; 298 int i, j, k; 299 300 int iter = 0; 301 float err = 0; 302 float subpix[(REFINE_WIN_W + 2) * (REFINE_WIN_H + 2)]; 303 const float flt_epsilon = 0x1.0p-23f; 304 305 do { 306 float2 feature_tmp; 307 float a = 0, b = 0, c = 0, bb1 = 0, bb2 = 0; 308 309 get_rect_sub_pix(grayscale, subpix, REFINE_WIN_W + 2, REFINE_WIN_H + 2, feature); 310 float *subpix_ptr = subpix; 311 subpix_ptr += REFINE_WIN_W + 2 + 1; 312 313 // process gradient 314 for (i = 0, k = 0; i < REFINE_WIN_H; i++, subpix_ptr += REFINE_WIN_W + 2) { 315 float py = i - REFINE_WIN_HALF_H; 316 317 for (j = 0; j < REFINE_WIN_W; j++, k++) { 318 float m = mask[k]; 319 float tgx = subpix_ptr[j + 1] - subpix_ptr[j - 1]; 320 float tgy = subpix_ptr[j + REFINE_WIN_W + 2] - subpix_ptr[j - REFINE_WIN_W - 2]; 321 float gxx = tgx * tgx * m; 322 float gxy = tgx * tgy * m; 323 float gyy = tgy * tgy * m; 324 float px = j - REFINE_WIN_HALF_W; 325 326 a += gxx; 327 b += gxy; 328 c += gyy; 329 330 bb1 += gxx * px + gxy * py; 331 bb2 += gxy * px + gyy * py; 332 } 333 } 334 335 float det = a * c - b * b; 336 if (fabs(det) <= flt_epsilon * flt_epsilon) { 337 break; 338 } 339 340 // 2x2 matrix inversion 341 float scale = 1.0f / det; 342 feature_tmp.x = (float)(feature.x + (c * scale * bb1) - (b * scale * bb2)); 343 feature_tmp.y = (float)(feature.y - (b * scale * bb1) + (a * scale * bb2)); 344 err = dot(feature_tmp - feature, feature_tmp - feature); 345 346 feature = feature_tmp; 347 if (feature.x < 0 || feature.x >= src_width || feature.y < 0 || feature.y >= src_height) { 348 break; 349 } 350 } while (++iter < max_iters && err > eps); 351 352 // Make sure new point isn't too far from the initial point (indicates poor convergence) 353 if (fabs(feature.x - init.x) > REFINE_WIN_HALF_W || fabs(feature.y - init.y) > REFINE_WIN_HALF_H) { 354 feature = init; 355 } 356 357 return feature; 358} 359 360// Performs non-maximum suppression on the harris response and writes the resulting 361// feature locations to refined_features. 362// 363// Assumes that refined_features and the global work sizes are set up such that the image 364// is split up into a grid of 32x32 blocks where each block has a single slot in the 365// refined_features buffer. This kernel finds the best corner in each block (if the 366// block has any) and writes it to the corresponding slot in the buffer. 367// 368// If subpixel_refine is true, the features are additionally refined at a sub-pixel 369// level for increased precision. 370__kernel void refine_features( 371 __read_only image2d_t grayscale, 372 __global const float *harris_buf, 373 __global float2 *refined_features, 374 int subpixel_refine 375) { 376 int2 loc = (int2)(get_global_id(0), get_global_id(1)); 377 // The location in the grayscale buffer rather than the compacted grid 378 int2 loc_i = (int2)(loc.x * 32, loc.y * 32); 379 380 float new_val; 381 float max_val = 0; 382 float2 loc_max = (float2)(-1, -1); 383 384 int end_x = min(loc_i.x + 32, (int)get_image_dim(grayscale).x - 1); 385 int end_y = min(loc_i.y + 32, (int)get_image_dim(grayscale).y - 1); 386 387 for (int i = loc_i.x; i < end_x; ++i) { 388 for (int j = loc_i.y; j < end_y; ++j) { 389 new_val = harris_buf[i + j * get_image_dim(grayscale).x]; 390 391 if (new_val > max_val) { 392 max_val = new_val; 393 loc_max = (float2)(i, j); 394 } 395 } 396 } 397 398 if (max_val == 0) { 399 // There are no features in this part of the frame 400 write_to_1d_arrf2(refined_features, loc, loc_max); 401 return; 402 } 403 404 if (subpixel_refine) { 405 float mask[REFINE_WIN_H * REFINE_WIN_W]; 406 for (int i = 0; i < REFINE_WIN_H; i++) { 407 float y = (float)(i - REFINE_WIN_HALF_H) / REFINE_WIN_HALF_H; 408 float vy = exp(-y * y); 409 410 for (int j = 0; j < REFINE_WIN_W; j++) { 411 float x = (float)(j - REFINE_WIN_HALF_W) / REFINE_WIN_HALF_W; 412 mask[i * REFINE_WIN_W + j] = (float)(vy * exp(-x * x)); 413 } 414 } 415 416 loc_max = corner_sub_pix(grayscale, loc_max, mask); 417 } 418 419 write_to_1d_arrf2(refined_features, loc, loc_max); 420} 421 422// Extracts BRIEF descriptors from the grayscale src image for the given features 423// using the provided sampler. 424__kernel void brief_descriptors( 425 __read_only image2d_t grayscale, 426 __global const float2 *refined_features, 427 // for 512 bit descriptors 428 __global ulong8 *desc_buf, 429 __global const PointPair *brief_pattern 430) { 431 int2 loc = (int2)(get_global_id(0), get_global_id(1)); 432 float2 feature = read_from_1d_arrf2(refined_features, loc); 433 434 // There was no feature in this part of the frame 435 if (feature.x == -1) { 436 write_to_1d_arrul8(desc_buf, loc, (ulong8)(0)); 437 return; 438 } 439 440 ulong8 desc = 0; 441 ulong *p = &desc; 442 443 for (int i = 0; i < 8; ++i) { 444 for (int j = 0; j < 64; ++j) { 445 PointPair pair = brief_pattern[j * (i + 1)]; 446 float l1 = read_imagef(grayscale, sampler_linear, feature + pair.p1).x; 447 float l2 = read_imagef(grayscale, sampler_linear, feature + pair.p2).x; 448 449 if (l1 < l2) { 450 p[i] |= 1UL << j; 451 } 452 } 453 } 454 455 write_to_1d_arrul8(desc_buf, loc, desc); 456} 457 458// Given buffers with descriptors for the current and previous frame, determines 459// which ones match, writing correspondences to matches_buf. 460// 461// Feature and descriptor buffers are assumed to be compacted (each element sourced 462// from a 32x32 block in the frame being processed). 463__kernel void match_descriptors( 464 __global const float2 *prev_refined_features, 465 __global const float2 *refined_features, 466 __global const ulong8 *desc_buf, 467 __global const ulong8 *prev_desc_buf, 468 __global MotionVector *matches_buf 469) { 470 int2 loc = (int2)(get_global_id(0), get_global_id(1)); 471 ulong8 desc = read_from_1d_arrul8(desc_buf, loc); 472 const int search_radius = 3; 473 474 MotionVector invalid_vector = (MotionVector) { 475 (PointPair) { 476 (float2)(-1, -1), 477 (float2)(-1, -1) 478 }, 479 0 480 }; 481 482 if (desc.s0 == 0 && desc.s1 == 0) { 483 // There was no feature in this part of the frame 484 write_to_1d_arrvec( 485 matches_buf, 486 loc, 487 invalid_vector 488 ); 489 return; 490 } 491 492 int2 start = max(loc - search_radius, 0); 493 int2 end = min(loc + search_radius, (int2)(get_global_size(0) - 1, get_global_size(1) - 1)); 494 495 for (int i = start.x; i < end.x; ++i) { 496 for (int j = start.y; j < end.y; ++j) { 497 int2 prev_point = (int2)(i, j); 498 int total_dist = 0; 499 500 ulong8 prev_desc = read_from_1d_arrul8(prev_desc_buf, prev_point); 501 502 if (prev_desc.s0 == 0 && prev_desc.s1 == 0) { 503 continue; 504 } 505 506 ulong *prev_desc_p = &prev_desc; 507 ulong *desc_p = &desc; 508 509 for (int i = 0; i < 8; i++) { 510 total_dist += popcount(desc_p[i] ^ prev_desc_p[i]); 511 } 512 513 if (total_dist < DISTANCE_THRESHOLD) { 514 write_to_1d_arrvec( 515 matches_buf, 516 loc, 517 (MotionVector) { 518 (PointPair) { 519 read_from_1d_arrf2(prev_refined_features, prev_point), 520 read_from_1d_arrf2(refined_features, loc) 521 }, 522 1 523 } 524 ); 525 526 return; 527 } 528 } 529 } 530 531 // There is no found match for this point 532 write_to_1d_arrvec( 533 matches_buf, 534 loc, 535 invalid_vector 536 ); 537} 538 539// Returns the position of the given point after the transform is applied 540static float2 transformed_point(float2 p, __global const float *transform) { 541 float2 ret; 542 543 ret.x = p.x * transform[0] + p.y * transform[1] + transform[2]; 544 ret.y = p.x * transform[3] + p.y * transform[4] + transform[5]; 545 546 return ret; 547} 548 549 550// Performs the given transform on the src image 551__kernel void transform( 552 __read_only image2d_t src, 553 __write_only image2d_t dst, 554 __global const float *transform 555) { 556 int2 loc = (int2)(get_global_id(0), get_global_id(1)); 557 float2 norm = convert_float2(get_image_dim(src)); 558 559 write_imagef( 560 dst, 561 loc, 562 read_imagef( 563 src, 564 sampler_linear_mirror, 565 transformed_point((float2)(loc.x, loc.y), transform) / norm 566 ) 567 ); 568} 569 570// Returns the new location of the given point using the given crop bounding box 571// and the width and height of the original frame. 572static float2 cropped_point( 573 float2 p, 574 float2 top_left, 575 float2 bottom_right, 576 int2 orig_dim 577) { 578 float2 ret; 579 580 float crop_width = bottom_right.x - top_left.x; 581 float crop_height = bottom_right.y - top_left.y; 582 583 float width_norm = p.x / (float)orig_dim.x; 584 float height_norm = p.y / (float)orig_dim.y; 585 586 ret.x = (width_norm * crop_width) + top_left.x; 587 ret.y = (height_norm * crop_height) + ((float)orig_dim.y - bottom_right.y); 588 589 return ret; 590} 591 592// Upscales the given cropped region to the size of the original frame 593__kernel void crop_upscale( 594 __read_only image2d_t src, 595 __write_only image2d_t dst, 596 float2 top_left, 597 float2 bottom_right 598) { 599 int2 loc = (int2)(get_global_id(0), get_global_id(1)); 600 601 write_imagef( 602 dst, 603 loc, 604 read_imagef( 605 src, 606 sampler_linear, 607 cropped_point((float2)(loc.x, loc.y), top_left, bottom_right, get_image_dim(dst)) 608 ) 609 ); 610} 611 612// Draws boxes to represent the given point matches and uses the given transform 613// and crop info to make sure their positions are accurate on the transformed frame. 614// 615// model_matches is an array of three points that were used by the RANSAC process 616// to generate the given transform 617__kernel void draw_debug_info( 618 __write_only image2d_t dst, 619 __global const MotionVector *matches, 620 __global const MotionVector *model_matches, 621 int num_model_matches, 622 __global const float *transform 623) { 624 int loc = get_global_id(0); 625 MotionVector vec = matches[loc]; 626 // Black box: matched point that RANSAC considered an outlier 627 float4 big_rect_color = (float4)(0.1f, 0.1f, 0.1f, 1.0f); 628 629 if (vec.should_consider) { 630 // Green box: matched point that RANSAC considered an inlier 631 big_rect_color = (float4)(0.0f, 1.0f, 0.0f, 1.0f); 632 } 633 634 for (int i = 0; i < num_model_matches; i++) { 635 if (vec.p.p2.x == model_matches[i].p.p2.x && vec.p.p2.y == model_matches[i].p.p2.y) { 636 // Orange box: point used to calculate model 637 big_rect_color = (float4)(1.0f, 0.5f, 0.0f, 1.0f); 638 } 639 } 640 641 float2 transformed_p1 = transformed_point(vec.p.p1, transform); 642 float2 transformed_p2 = transformed_point(vec.p.p2, transform); 643 644 draw_box(dst, (int2)(transformed_p2.x, transformed_p2.y), big_rect_color, 5); 645 // Small light blue box: the point in the previous frame 646 draw_box(dst, (int2)(transformed_p1.x, transformed_p1.y), (float4)(0.0f, 0.3f, 0.7f, 1.0f), 3); 647} 648