/*
 * This file is part of FFmpeg.
 *
 * FFmpeg is free software; you can redistribute it and/or
 * modify it under the terms of the GNU Lesser General Public
 * License as published by the Free Software Foundation; either
 * version 2.1 of the License, or (at your option) any later version.
 *
 * FFmpeg is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
 * Lesser General Public License for more details.
 *
 * You should have received a copy of the GNU Lesser General Public
 * License along with FFmpeg; if not, write to the Free Software
 * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
 *
 * Copyright (C) 2000-2008, Intel Corporation, all rights reserved.
 * Copyright (C) 2009, Willow Garage Inc., all rights reserved.
 * Copyright (C) 2013, OpenCV Foundation, all rights reserved.
 * Third party copyrights are property of their respective owners.
 *
 * Redistribution and use in source and binary forms, with or without modification,
 * are permitted provided that the following conditions are met:
 *
 *   * Redistribution's of source code must retain the above copyright notice,
 *     this list of conditions and the following disclaimer.
 *
 *   * Redistribution's in binary form must reproduce the above copyright notice,
 *     this list of conditions and the following disclaimer in the documentation
 *     and/or other materials provided with the distribution.
 *
 *   * The name of the copyright holders may not be used to endorse or promote products
 *     derived from this software without specific prior written permission.
 *
 * This software is provided by the copyright holders and contributors "as is" and
 * any express or implied warranties, including, but not limited to, the implied
 * warranties of merchantability and fitness for a particular purpose are disclaimed.
 * In no event shall the Intel Corporation or contributors be liable for any direct,
 * indirect, incidental, special, exemplary, or consequential damages
 * (including, but not limited to, procurement of substitute goods or services;
 * loss of use, data, or profits; or business interruption) however caused
 * and on any theory of liability, whether in contract, strict liability,
 * or tort (including negligence or otherwise) arising in any way out of
 * the use of this software, even if advised of the possibility of such damage.
 */

#include <float.h>
#include "libavutil/lfg.h"
#include "libavutil/opt.h"
#include "libavutil/imgutils.h"
#include "libavutil/mem.h"
#include "libavutil/fifo.h"
#include "libavutil/common.h"
#include "libavutil/avassert.h"
#include "libavutil/pixfmt.h"
#include "avfilter.h"
#include "framequeue.h"
#include "filters.h"
#include "transform.h"
#include "formats.h"
#include "internal.h"
#include "opencl.h"
#include "opencl_source.h"
#include "video.h"

/*
This filter matches feature points between frames (dealing with outliers) and then
uses the matches to estimate an affine transform between frames. This transform is
decomposed into various values (translation, scale, rotation) and the values are
summed relative to the start of the video to obtain an absolute camera position
72 for each frame. This "camera path" is then smoothed via a gaussian filter, resulting
73 in a new path that is turned back into an affine transform and applied to each
74 frame to render it.
75 
76 High-level overview:
77 
78 All of the work to extract motion data from frames occurs in queue_frame. Motion data
79 is buffered in a smoothing window, so queue_frame simply computes the absolute camera
80 positions and places them in ringbuffers.
81 
82 filter_frame is responsible for looking at the absolute camera positions currently
83 in the ringbuffers, applying the gaussian filter, and then transforming the frames.
84 */
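
/*
An illustrative invocation (a sketch following the usual pattern for FFmpeg
OpenCL filters; the device name and pixel format here are placeholders, not
canonical usage):

    ffmpeg -init_hw_device opencl=gpu -filter_hw_device gpu -i shaky.mp4 \
        -vf 'format=nv12,hwupload,deshake_opencl,hwdownload,format=nv12' out.mp4
*/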

// Number of bits for BRIEF descriptors
#define BREIFN 512
// Size of the patch from which a BRIEF descriptor is extracted
// This is the size used in OpenCV
#define BRIEF_PATCH_SIZE 31
#define BRIEF_PATCH_SIZE_HALF (BRIEF_PATCH_SIZE / 2)

#define MATCHES_CONTIG_SIZE 2000

#define ROUNDED_UP_DIV(a, b) (((a) + (b) - 1) / (b))
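// e.g. ROUNDED_UP_DIV(1080, 32) == 34, since 1080 / 32 == 33.75 rounds up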

typedef struct PointPair {
    // Previous frame
    cl_float2 p1;
    // Current frame
    cl_float2 p2;
} PointPair;

typedef struct MotionVector {
    PointPair p;
    // Used to mark vectors as potential outliers
    cl_int should_consider;
} MotionVector;

// Denotes the indices for the different types of motion in the ringbuffers array
enum RingbufferIndices {
    RingbufX,
    RingbufY,
    RingbufRot,
    RingbufScaleX,
    RingbufScaleY,

    // Should always be last
    RingbufCount
};

// Struct that holds data for drawing point match debug data
typedef struct DebugMatches {
    MotionVector *matches;
    // The points used to calculate the affine transform for a frame
    MotionVector model_matches[3];

    int num_matches;
    // For cases where we couldn't calculate a model
    int num_model_matches;
} DebugMatches;

// Groups together the ringbuffers that store absolute distortion / position values
// for each frame
typedef struct AbsoluteFrameMotion {
    // Array with the various ringbuffers, indexed via the RingbufferIndices enum
    AVFifo *ringbuffers[RingbufCount];

    // Offset to get to the current frame being processed
    // (not in bytes)
    int curr_frame_offset;
    // Keeps track of where the start and end of contiguous motion data is (to
    // deal with cases where no motion data is found between two frames)
    int data_start_offset;
    int data_end_offset;

    AVFifo *debug_matches;
} AbsoluteFrameMotion;

// Takes care of freeing the arrays within the DebugMatches inside of the
// debug_matches ringbuffer and then freeing the buffer itself.
static void free_debug_matches(AbsoluteFrameMotion *afm) {
    DebugMatches dm;

    if (!afm->debug_matches) {
        return;
    }

    while (av_fifo_read(afm->debug_matches, &dm, 1) >= 0)
        av_freep(&dm.matches);

    av_fifo_freep2(&afm->debug_matches);
}

// Stores the translation, scale, rotation, and skew deltas between two frames
typedef struct FrameDelta {
    cl_float2 translation;
    float rotation;
    cl_float2 scale;
    cl_float2 skew;
} FrameDelta;

typedef struct SimilarityMatrix {
    // The 2x3 similarity matrix
    double matrix[6];
} SimilarityMatrix;

typedef struct CropInfo {
    // The top left corner of the bounding box for the crop
    cl_float2 top_left;
    // The bottom right corner of the bounding box for the crop
    cl_float2 bottom_right;
} CropInfo;

// Returned from function that determines start and end values for iteration
// around the current frame in a ringbuffer
typedef struct IterIndices {
    int start;
    int end;
} IterIndices;

typedef struct DeshakeOpenCLContext {
    OpenCLFilterContext ocf;
    // Whether or not the above `OpenCLFilterContext` has been initialized
    int initialized;

    // These variables are used in the activate callback
    int64_t duration;
    int eof;

    // State for random number generation
    AVLFG alfg;

    // FIFO frame queue used to buffer future frames for processing
    FFFrameQueue fq;
    // Ringbuffers for frame positions
    AbsoluteFrameMotion abs_motion;

    // The number of frames' motion to consider before and after the frame we are
    // smoothing
    int smooth_window;
    // The number of the frame we are currently processing
    int curr_frame;

    // Stores a 1d array of normalised gaussian kernel values for convolution
    float *gauss_kernel;

    // Buffer for error values used in RANSAC code
    float *ransac_err;

    // Information regarding how to crop the smoothed luminance (or RGB) planes
    CropInfo crop_y;
    // Information regarding how to crop the smoothed chroma planes
    CropInfo crop_uv;

    // Whether or not we are processing YUV input (as opposed to RGB)
    int is_yuv;
    // The underlying format of the hardware surfaces
    int sw_format;

    // Buffer to copy `matches` into for the CPU to work with
    MotionVector *matches_host;
    MotionVector *matches_contig_host;

    MotionVector *inliers;

    cl_command_queue command_queue;
    cl_kernel kernel_grayscale;
    cl_kernel kernel_harris_response;
    cl_kernel kernel_refine_features;
    cl_kernel kernel_brief_descriptors;
    cl_kernel kernel_match_descriptors;
    cl_kernel kernel_transform;
    cl_kernel kernel_crop_upscale;

    // Stores a frame converted to grayscale
    cl_mem grayscale;
    // Stores the harris response for a frame (measure of "cornerness" for each pixel)
    cl_mem harris_buf;

    // Detected features after non-maximum suppression and sub-pixel refinement
    cl_mem refined_features;
    // Saved from the previous frame
    cl_mem prev_refined_features;

    // BRIEF sampling pattern that is randomly initialized
    cl_mem brief_pattern;
    // Feature point descriptors for the current frame
    cl_mem descriptors;
    // Feature point descriptors for the previous frame
    cl_mem prev_descriptors;
    // Vectors between points in current and previous frame
    cl_mem matches;
    cl_mem matches_contig;
    // Holds the matrix to transform luminance (or RGB) with
    cl_mem transform_y;
    // Holds the matrix to transform chroma with
    cl_mem transform_uv;

    // Configurable options

    int tripod_mode;
    int debug_on;
    int should_crop;

    // Whether or not feature points should be refined at a sub-pixel level
    cl_int refine_features;
    // If the user sets a value other than the default, 0, this percentage is
    // translated into a sigma value ranging from 0.5 to 40.0
    float smooth_percent;
    // This number is multiplied by the video frame rate to determine the size
    // of the smooth window
    float smooth_window_multiplier;

    // Debug stuff

    cl_kernel kernel_draw_debug_info;
    cl_mem debug_matches;
    cl_mem debug_model_matches;

    // These store the total time spent executing the different kernels in nanoseconds
    unsigned long long grayscale_time;
    unsigned long long harris_response_time;
    unsigned long long refine_features_time;
    unsigned long long brief_descriptors_time;
    unsigned long long match_descriptors_time;
    unsigned long long transform_time;
    unsigned long long crop_upscale_time;

    // Time spent copying matched features from the device to the host
    unsigned long long read_buf_time;
} DeshakeOpenCLContext;

// Returns a random uniformly-distributed number in [low, high)
static int rand_in(int low, int high, AVLFG *alfg) {
    return (av_lfg_get(alfg) % (high - low)) + low;
}

// Returns the average execution time for an event given the total time and the
// number of frames processed.
static double averaged_event_time_ms(unsigned long long total_time, int num_frames) {
    return (double)total_time / (double)num_frames / 1000000.0;
}

// The following code is loosely ported from OpenCV

// Estimates affine transform from 3 point pairs
// model is a 2x3 matrix:
//      a b c
//      d e f
static void run_estimate_kernel(const MotionVector *point_pairs, double *model)
{
    // src points
    double x1 = point_pairs[0].p.p1.s[0];
    double y1 = point_pairs[0].p.p1.s[1];
    double x2 = point_pairs[1].p.p1.s[0];
    double y2 = point_pairs[1].p.p1.s[1];
    double x3 = point_pairs[2].p.p1.s[0];
    double y3 = point_pairs[2].p.p1.s[1];

    // dest points
    double X1 = point_pairs[0].p.p2.s[0];
    double Y1 = point_pairs[0].p.p2.s[1];
    double X2 = point_pairs[1].p.p2.s[0];
    double Y2 = point_pairs[1].p.p2.s[1];
    double X3 = point_pairs[2].p.p2.s[0];
    double Y3 = point_pairs[2].p.p2.s[1];

    double d = 1.0 / ( x1*(y2-y3) + x2*(y3-y1) + x3*(y1-y2) );

    model[0] = d * ( X1*(y2-y3) + X2*(y3-y1) + X3*(y1-y2) );
    model[1] = d * ( X1*(x3-x2) + X2*(x1-x3) + X3*(x2-x1) );
    model[2] = d * ( X1*(x2*y3 - x3*y2) + X2*(x3*y1 - x1*y3) + X3*(x1*y2 - x2*y1) );

    model[3] = d * ( Y1*(y2-y3) + Y2*(y3-y1) + Y3*(y1-y2) );
    model[4] = d * ( Y1*(x3-x2) + Y2*(x1-x3) + Y3*(x2-x1) );
    model[5] = d * ( Y1*(x2*y3 - x3*y2) + Y2*(x3*y1 - x1*y3) + Y3*(x1*y2 - x2*y1) );
}
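
// Quick sanity check for the solver above: a pure translation by (2, 3),
// e.g. src (0,0), (1,0), (0,1) -> dst (2,3), (3,3), (2,4), gives d = 1 and
// model = { 1, 0, 2, 0, 1, 3 }, i.e. the identity plus the expected shift.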

// Checks that the 3 points in the given array are not collinear
static int points_not_collinear(const cl_float2 **points)
{
    int j, k, i = 2;

    for (j = 0; j < i; j++) {
        double dx1 = points[j]->s[0] - points[i]->s[0];
        double dy1 = points[j]->s[1] - points[i]->s[1];

        for (k = 0; k < j; k++) {
            double dx2 = points[k]->s[0] - points[i]->s[0];
            double dy2 = points[k]->s[1] - points[i]->s[1];

            // Assuming a 3840 x 2160 video with a point at (0, 0) and one at
            // (3839, 2159), this prevents a third point from being within roughly
            // 0.5 of a pixel of the line connecting the two on both axes
            if (fabs(dx2*dy1 - dy2*dx1) <= 1.0) {
                return 0;
            }
        }
    }

    return 1;
}

// Checks a subset of 3 point pairs to make sure that the points are not collinear
// and not too close to each other
static int check_subset(const MotionVector *pairs_subset)
{
    const cl_float2 *prev_points[] = {
        &pairs_subset[0].p.p1,
        &pairs_subset[1].p.p1,
        &pairs_subset[2].p.p1
    };

    const cl_float2 *curr_points[] = {
        &pairs_subset[0].p.p2,
        &pairs_subset[1].p.p2,
        &pairs_subset[2].p.p2
    };

    return points_not_collinear(prev_points) && points_not_collinear(curr_points);
}

// Selects a random subset of 3 points from point_pairs and places them in pairs_subset
static int get_subset(
    AVLFG *alfg,
    const MotionVector *point_pairs,
    const int num_point_pairs,
    MotionVector *pairs_subset,
    int max_attempts
) {
    int idx[3];
    int i = 0, j, iters = 0;

    for (; iters < max_attempts; iters++) {
        for (i = 0; i < 3 && iters < max_attempts;) {
            int idx_i = 0;

            for (;;) {
                idx_i = idx[i] = rand_in(0, num_point_pairs, alfg);

                for (j = 0; j < i; j++) {
                    if (idx_i == idx[j]) {
                        break;
                    }
                }

                if (j == i) {
                    break;
                }
            }

            pairs_subset[i] = point_pairs[idx[i]];
            i++;
        }

        if (i == 3 && !check_subset(pairs_subset)) {
            continue;
        }
        break;
    }

    return i == 3 && iters < max_attempts;
}

// Computes the error for each of the given points based on the given model.
static void compute_error(
    const MotionVector *point_pairs,
    const int num_point_pairs,
    const double *model,
    float *err
) {
    double F0 = model[0], F1 = model[1], F2 = model[2];
    double F3 = model[3], F4 = model[4], F5 = model[5];

    for (int i = 0; i < num_point_pairs; i++) {
        const cl_float2 *f = &point_pairs[i].p.p1;
        const cl_float2 *t = &point_pairs[i].p.p2;

        double a = F0*f->s[0] + F1*f->s[1] + F2 - t->s[0];
        double b = F3*f->s[0] + F4*f->s[1] + F5 - t->s[1];

        err[i] = a*a + b*b;
    }
}

// Determines which of the given point matches are inliers for the given model
// based on the specified threshold.
//
// err must be an array of num_point_pairs length
static int find_inliers(
    MotionVector *point_pairs,
    const int num_point_pairs,
    const double *model,
    float *err,
    double thresh
) {
    float t = (float)(thresh * thresh);
    int i, n = num_point_pairs, num_inliers = 0;

    compute_error(point_pairs, num_point_pairs, model, err);

    for (i = 0; i < n; i++) {
        if (err[i] <= t) {
            // This is an inlier
            point_pairs[i].should_consider = 1;
            num_inliers += 1;
        } else {
            point_pairs[i].should_consider = 0;
        }
    }

    return num_inliers;
}

// Determines the number of iterations required to achieve the desired confidence level.
//
// The equation used to determine the number of iterations to do is:
// 1 - confidence = (1 - inlier_probability^num_points)^num_iters
//
// Solving for num_iters:
//
// num_iters = log(1 - confidence) / log(1 - inlier_probability^num_points)
//
// A more in-depth explanation can be found at https://en.wikipedia.org/wiki/Random_sample_consensus
// under the 'Parameters' heading
static int ransac_update_num_iters(double confidence, double num_outliers, int max_iters)
{
    double num, denom;

    confidence   = av_clipd(confidence, 0.0, 1.0);
    num_outliers = av_clipd(num_outliers, 0.0, 1.0);

    // avoid inf's & nan's
    num = FFMAX(1.0 - confidence, DBL_MIN);
    denom = 1.0 - pow(1.0 - num_outliers, 3);
    if (denom < DBL_MIN) {
        return 0;
    }

    num = log(num);
    denom = log(denom);

    return denom >= 0 || -num >= max_iters * (-denom) ? max_iters : (int)round(num / denom);
}
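
// For instance, with confidence = 0.97 and an outlier ratio of 0.5, the inlier
// probability of a 3-point sample is 0.5^3 = 0.125, so
// num_iters = log(0.03) / log(1 - 0.125) ~= -3.51 / -0.134 ~= 26 iterations.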

// Estimates an affine transform between the given pairs of points using RANdom
// SAmple Consensus
static int estimate_affine_2d(
    DeshakeOpenCLContext *deshake_ctx,
    MotionVector *point_pairs,
    DebugMatches *debug_matches,
    const int num_point_pairs,
    double *model_out,
    const double threshold,
    const int max_iters,
    const double confidence
) {
    int result = 0;
    double best_model[6], model[6];
    MotionVector pairs_subset[3], best_pairs[3];

    int iter, niters = FFMAX(max_iters, 1);
    int good_count, max_good_count = 0;

    // We need at least 3 points to build a model from
    if (num_point_pairs < 3) {
        return 0;
    } else if (num_point_pairs == 3) {
        // There are only 3 points, so RANSAC doesn't apply here
        run_estimate_kernel(point_pairs, model_out);

        for (int i = 0; i < 3; ++i) {
            point_pairs[i].should_consider = 1;
        }

        return 1;
    }

    for (iter = 0; iter < niters; ++iter) {
        int found = get_subset(&deshake_ctx->alfg, point_pairs, num_point_pairs, pairs_subset, 10000);

        if (!found) {
            if (iter == 0) {
                return 0;
            }

            break;
        }

        run_estimate_kernel(pairs_subset, model);
        good_count = find_inliers(point_pairs, num_point_pairs, model, deshake_ctx->ransac_err, threshold);

        if (good_count > FFMAX(max_good_count, 2)) {
            for (int mi = 0; mi < 6; ++mi) {
                best_model[mi] = model[mi];
            }

            for (int pi = 0; pi < 3; pi++) {
                best_pairs[pi] = pairs_subset[pi];
            }

            max_good_count = good_count;
            niters = ransac_update_num_iters(
                confidence,
                (double)(num_point_pairs - good_count) / num_point_pairs,
                niters
            );
        }
    }

    if (max_good_count > 0) {
        for (int mi = 0; mi < 6; ++mi) {
            model_out[mi] = best_model[mi];
        }

        for (int pi = 0; pi < 3; ++pi) {
            debug_matches->model_matches[pi] = best_pairs[pi];
        }
        debug_matches->num_model_matches = 3;

        // Find the inliers again for the best model for debugging
        find_inliers(point_pairs, num_point_pairs, best_model, deshake_ctx->ransac_err, threshold);
        result = 1;
    }

    return result;
}
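
// A minimal usage sketch (illustrative only; `num_vectors` and the threshold /
// iteration / confidence constants below are placeholders, not the values the
// real call site in this filter uses):
//
//     double model[6];
//     DebugMatches dm = { 0 };
//     int ok = estimate_affine_2d(deshake_ctx, deshake_ctx->matches_contig_host,
//                                 &dm, num_vectors, model,
//                                 1.5,    // inlier threshold (pixels)
//                                 3000,   // max RANSAC iterations
//                                 0.95);  // desired confidence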

// "Wiggles" the first point in best_pairs around a tiny bit in order to decrease the
// total error
static void optimize_model(
    DeshakeOpenCLContext *deshake_ctx,
    MotionVector *best_pairs,
    MotionVector *inliers,
    const int num_inliers,
    float best_err,
    double *model_out
) {
    float move_x_val = 0.01;
    float move_y_val = 0.01;
    int move_x = 1;
    float old_move_x_val = 0;
    double model[6];
    int last_changed = 0;

    for (int iters = 0; iters < 200; iters++) {
        float total_err = 0;

        if (move_x) {
            best_pairs[0].p.p2.s[0] += move_x_val;
        } else {
            best_pairs[0].p.p2.s[1] += move_y_val;
        }

        run_estimate_kernel(best_pairs, model);
        compute_error(inliers, num_inliers, model, deshake_ctx->ransac_err);

        for (int j = 0; j < num_inliers; j++) {
            total_err += deshake_ctx->ransac_err[j];
        }

        if (total_err < best_err) {
            for (int mi = 0; mi < 6; ++mi) {
                model_out[mi] = model[mi];
            }

            best_err = total_err;
            last_changed = iters;
        } else {
            // Undo the change
            if (move_x) {
                best_pairs[0].p.p2.s[0] -= move_x_val;
            } else {
                best_pairs[0].p.p2.s[1] -= move_y_val;
            }

            if (iters - last_changed > 4) {
                // We've already improved the model as much as we can
                break;
            }

            old_move_x_val = move_x_val;

            if (move_x) {
                move_x_val *= -1;
            } else {
                move_y_val *= -1;
            }

            if (old_move_x_val < 0) {
                move_x = 0;
            } else {
                move_x = 1;
            }
        }
    }
}

// Uses a process similar to that of RANSAC to find a transform that minimizes
// the total error for a set of point matches determined to be inliers
//
// (Pick random subsets, compute model, find total error, iterate until error
// is minimized.)
static int minimize_error(
    DeshakeOpenCLContext *deshake_ctx,
    MotionVector *inliers,
    DebugMatches *debug_matches,
    const int num_inliers,
    double *model_out,
    const int max_iters
) {
    int result = 0;
    float best_err = FLT_MAX;
    double best_model[6], model[6];
    MotionVector pairs_subset[3], best_pairs[3];

    for (int i = 0; i < max_iters; i++) {
        float total_err = 0;
        int found = get_subset(&deshake_ctx->alfg, inliers, num_inliers, pairs_subset, 10000);

        if (!found) {
            if (i == 0) {
                return 0;
            }

            break;
        }

        run_estimate_kernel(pairs_subset, model);
        compute_error(inliers, num_inliers, model, deshake_ctx->ransac_err);

        for (int j = 0; j < num_inliers; j++) {
            total_err += deshake_ctx->ransac_err[j];
        }

        if (total_err < best_err) {
            for (int mi = 0; mi < 6; ++mi) {
                best_model[mi] = model[mi];
            }

            for (int pi = 0; pi < 3; pi++) {
                best_pairs[pi] = pairs_subset[pi];
            }

            best_err = total_err;
        }
    }

    for (int mi = 0; mi < 6; ++mi) {
        model_out[mi] = best_model[mi];
    }

    for (int pi = 0; pi < 3; ++pi) {
        debug_matches->model_matches[pi] = best_pairs[pi];
    }
    debug_matches->num_model_matches = 3;
    result = 1;

    optimize_model(deshake_ctx, best_pairs, inliers, num_inliers, best_err, model_out);
    return result;
}

// End code from OpenCV

// Decomposes a similarity matrix into translation, rotation, scale, and skew
//
// See http://frederic-wang.fr/decomposition-of-2d-transform-matrices.html
static FrameDelta decompose_transform(double *model)
{
    FrameDelta ret;

    double a = model[0];
    double c = model[1];
    double e = model[2];
    double b = model[3];
    double d = model[4];
    double f = model[5];
    double delta = a * d - b * c;

    memset(&ret, 0, sizeof(ret));

    ret.translation.s[0] = e;
    ret.translation.s[1] = f;

    // This is the QR method
    if (a != 0 || b != 0) {
        double r = hypot(a, b);

        ret.rotation = FFSIGN(b) * acos(a / r);
        ret.scale.s[0] = r;
        ret.scale.s[1] = delta / r;
        ret.skew.s[0] = atan((a * c + b * d) / (r * r));
        ret.skew.s[1] = 0;
    } else if (c != 0 || d != 0) {
        double s = sqrt(c * c + d * d);

        ret.rotation = M_PI / 2 - FFSIGN(d) * acos(-c / s);
        ret.scale.s[0] = delta / s;
        ret.scale.s[1] = s;
        ret.skew.s[0] = 0;
        ret.skew.s[1] = atan((a * c + b * d) / (s * s));
    } // otherwise there is only translation

    return ret;
}
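
// Sanity check: for a model of a pure rotation by theta plus a translation
// (tx, ty), i.e. { cos(theta), -sin(theta), tx, sin(theta), cos(theta), ty },
// delta = 1 and r = hypot(a, b) = 1, so the QR branch above recovers
// rotation = theta, scale = (1, 1), skew = (0, 0), translation = (tx, ty).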

// Move valid vectors from the 2d buffer into a 1d buffer where they are contiguous
static int make_vectors_contig(
    DeshakeOpenCLContext *deshake_ctx,
    int size_y,
    int size_x
) {
    int num_vectors = 0;

    for (int i = 0; i < size_y; ++i) {
        for (int j = 0; j < size_x; ++j) {
            MotionVector v = deshake_ctx->matches_host[j + i * size_x];

            if (v.should_consider) {
                deshake_ctx->matches_contig_host[num_vectors] = v;
                ++num_vectors;
            }

            // Make sure we do not exceed the amount of space we allocated for these vectors
            if (num_vectors == MATCHES_CONTIG_SIZE - 1) {
                return num_vectors;
            }
        }
    }
    return num_vectors;
}

// Returns the gaussian kernel value for the given x coordinate and sigma value
static float gaussian_for(int x, float sigma) {
    return 1.0f / expf(((float)x * (float)x) / (2.0f * sigma * sigma));
}

// Makes a normalized gaussian kernel of the given length for the given sigma
// and places it in gauss_kernel
static void make_gauss_kernel(float *gauss_kernel, float length, float sigma)
{
    float gauss_sum = 0;
    int window_half = length / 2;

    for (int i = 0; i < length; ++i) {
        float val = gaussian_for(i - window_half, sigma);

        gauss_sum += val;
        gauss_kernel[i] = val;
    }

    // Normalize the gaussian values
    for (int i = 0; i < length; ++i) {
        gauss_kernel[i] /= gauss_sum;
    }
}
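
// For example, length = 5 and sigma = 1.0 give raw values of roughly
// (0.135, 0.607, 1.0, 0.607, 0.135), which normalize to
// (0.054, 0.244, 0.403, 0.244, 0.054).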

// Returns indices to start and end iteration at in order to iterate over a
// window of the given length centered at the current frame in a ringbuffer
//
// Always returns numbers that result in a window of that length, even if that
// means specifying negative indices or indices past the end of the values in the
// ringbuffers. Make sure you clip indices appropriately within your loop.
static IterIndices start_end_for(DeshakeOpenCLContext *deshake_ctx, int length) {
    IterIndices indices;

    indices.start = deshake_ctx->abs_motion.curr_frame_offset - (length / 2);
    indices.end = deshake_ctx->abs_motion.curr_frame_offset + (length / 2) + (length % 2);

    return indices;
}

// Sets val to the value in the given ringbuffer at the given offset, taking care of
// clipping the offset into the appropriate range
static void ringbuf_float_at(
    DeshakeOpenCLContext *deshake_ctx,
    AVFifo *values,
    float *val,
    int offset
) {
    int clip_start, clip_end, offset_clipped;
    if (deshake_ctx->abs_motion.data_end_offset != -1) {
        clip_end = deshake_ctx->abs_motion.data_end_offset;
    } else {
        // This expression represents the last valid index in the buffer,
        // which we use repeatedly at the end of the video.
        clip_end = deshake_ctx->smooth_window - av_fifo_can_write(values) - 1;
    }

    if (deshake_ctx->abs_motion.data_start_offset != -1) {
        clip_start = deshake_ctx->abs_motion.data_start_offset;
    } else {
        // Negative indices will occur at the start of the video, and we want
        // them to be clipped to 0 in order to repeatedly use the position of
        // the first frame.
        clip_start = 0;
    }

    offset_clipped = av_clip(
        offset,
        clip_start,
        clip_end
    );

    av_fifo_peek(values, val, 1, offset_clipped);
}

// Returns smoothed current frame value of the given buffer of floats based on the
// given Gaussian kernel and its length (also the window length, centered around the
// current frame) and the "maximum value" of the motion.
//
// This "maximum value" should be the width / height of the image in the case of
// translation and an empirically chosen constant for rotation / scale.
//
// The sigma used to generate the final gaussian kernel that smooths the camera
// path is either fixed (set by the user via deshake_ctx->smooth_percent) or
// adaptively chosen.
static float smooth(
    DeshakeOpenCLContext *deshake_ctx,
    float *gauss_kernel,
    int length,
    float max_val,
    AVFifo *values
) {
    float new_large_s = 0, new_small_s = 0, new_best = 0, old, diff_between,
          percent_of_max, inverted_percent;
    IterIndices indices = start_end_for(deshake_ctx, length);
    float large_sigma = 40.0f;
    float small_sigma = 2.0f;
    float best_sigma;

    if (deshake_ctx->smooth_percent) {
        best_sigma = (large_sigma - 0.5f) * deshake_ctx->smooth_percent + 0.5f;
    } else {
        // Strategy to adaptively smooth trajectory:
        //
        // 1. Smooth path with large and small sigma values
        // 2. Take the absolute value of the difference between them
        // 3. Get a percentage by putting the difference over the "max value"
        // 4. Invert the percentage
        // 5. Calculate a new sigma value weighted towards the larger sigma value
        // 6. Determine final smoothed trajectory value using that sigma

        make_gauss_kernel(gauss_kernel, length, large_sigma);
        for (int i = indices.start, j = 0; i < indices.end; ++i, ++j) {
            ringbuf_float_at(deshake_ctx, values, &old, i);
            new_large_s += old * gauss_kernel[j];
        }

        make_gauss_kernel(gauss_kernel, length, small_sigma);
        for (int i = indices.start, j = 0; i < indices.end; ++i, ++j) {
            ringbuf_float_at(deshake_ctx, values, &old, i);
            new_small_s += old * gauss_kernel[j];
        }

        diff_between = fabsf(new_large_s - new_small_s);
        percent_of_max = diff_between / max_val;
        inverted_percent = 1 - percent_of_max;
        best_sigma = large_sigma * powf(inverted_percent, 40);
    }

    make_gauss_kernel(gauss_kernel, length, best_sigma);
    for (int i = indices.start, j = 0; i < indices.end; ++i, ++j) {
        ringbuf_float_at(deshake_ctx, values, &old, i);
        new_best += old * gauss_kernel[j];
    }

    return new_best;
}
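
// Concretely: if the large- and small-sigma trajectories differ by 5% of
// max_val, then best_sigma = 40 * 0.95^40 ~= 5.1, so shaky stretches are
// smoothed much more gently, while perfectly steady footage (a difference
// near zero) approaches the full sigma of 40.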

// Returns the position of the given point after the transform is applied
static cl_float2 transformed_point(float x, float y, float *transform) {
    cl_float2 ret;

    ret.s[0] = x * transform[0] + y * transform[1] + transform[2];
    ret.s[1] = x * transform[3] + y * transform[4] + transform[5];

    return ret;
}

// Creates an affine transform that scales from the center of a frame
static void transform_center_scale(
    float x_shift,
    float y_shift,
    float angle,
    float scale_x,
    float scale_y,
    float center_w,
    float center_h,
    float *matrix
) {
    cl_float2 center_s;
    float center_s_w, center_s_h;

    ff_get_matrix(
        0,
        0,
        0,
        scale_x,
        scale_y,
        matrix
    );

    center_s = transformed_point(center_w, center_h, matrix);
    center_s_w = center_w - center_s.s[0];
    center_s_h = center_h - center_s.s[1];

    ff_get_matrix(
        x_shift + center_s_w,
        y_shift + center_s_h,
        angle,
        scale_x,
        scale_y,
        matrix
    );
}
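
// For example, scaling a 100x100 frame by 2x about its center: the first
// ff_get_matrix() call scales about the origin, sending the center (50, 50)
// to (100, 100), so the measured offset of (-50, -50) is folded into the
// shift of the second call and the center maps back onto itself.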

// Determines the crop necessary to eliminate black borders from a smoothed frame
// and updates target crop accordingly
static void update_needed_crop(
    CropInfo* crop,
    float *transform,
    float frame_width,
    float frame_height
) {
    float new_width, new_height, adjusted_width, adjusted_height, adjusted_x, adjusted_y;

    cl_float2 top_left = transformed_point(0, 0, transform);
    cl_float2 top_right = transformed_point(frame_width, 0, transform);
    cl_float2 bottom_left = transformed_point(0, frame_height, transform);
    cl_float2 bottom_right = transformed_point(frame_width, frame_height, transform);
    float ar_h = frame_height / frame_width;
    float ar_w = frame_width / frame_height;

    if (crop->bottom_right.s[0] == 0) {
        // The crop hasn't been set to the original size of the plane
        crop->bottom_right.s[0] = frame_width;
        crop->bottom_right.s[1] = frame_height;
    }

    crop->top_left.s[0] = FFMAX3(
        crop->top_left.s[0],
        top_left.s[0],
        bottom_left.s[0]
    );

    crop->top_left.s[1] = FFMAX3(
        crop->top_left.s[1],
        top_left.s[1],
        top_right.s[1]
    );

    crop->bottom_right.s[0] = FFMIN3(
        crop->bottom_right.s[0],
        bottom_right.s[0],
        top_right.s[0]
    );

    crop->bottom_right.s[1] = FFMIN3(
        crop->bottom_right.s[1],
        bottom_right.s[1],
        bottom_left.s[1]
    );

    // Make sure our potentially new bounding box has the same aspect ratio
    new_height = crop->bottom_right.s[1] - crop->top_left.s[1];
    new_width = crop->bottom_right.s[0] - crop->top_left.s[0];

    adjusted_width = new_height * ar_w;
    adjusted_x = crop->bottom_right.s[0] - adjusted_width;

    if (adjusted_x >= crop->top_left.s[0]) {
        crop->top_left.s[0] = adjusted_x;
    } else {
        adjusted_height = new_width * ar_h;
        adjusted_y = crop->bottom_right.s[1] - adjusted_height;
        crop->top_left.s[1] = adjusted_y;
    }
}

static av_cold void deshake_opencl_uninit(AVFilterContext *avctx)
{
    DeshakeOpenCLContext *ctx = avctx->priv;
    cl_int cle;

    for (int i = 0; i < RingbufCount; i++)
        av_fifo_freep2(&ctx->abs_motion.ringbuffers[i]);

    if (ctx->debug_on)
        free_debug_matches(&ctx->abs_motion);

    if (ctx->gauss_kernel)
        av_freep(&ctx->gauss_kernel);

    if (ctx->ransac_err)
        av_freep(&ctx->ransac_err);

    if (ctx->matches_host)
        av_freep(&ctx->matches_host);

    if (ctx->matches_contig_host)
        av_freep(&ctx->matches_contig_host);

    if (ctx->inliers)
        av_freep(&ctx->inliers);

    ff_framequeue_free(&ctx->fq);

    CL_RELEASE_KERNEL(ctx->kernel_grayscale);
    CL_RELEASE_KERNEL(ctx->kernel_harris_response);
    CL_RELEASE_KERNEL(ctx->kernel_refine_features);
    CL_RELEASE_KERNEL(ctx->kernel_brief_descriptors);
    CL_RELEASE_KERNEL(ctx->kernel_match_descriptors);
    CL_RELEASE_KERNEL(ctx->kernel_crop_upscale);
    if (ctx->debug_on)
        CL_RELEASE_KERNEL(ctx->kernel_draw_debug_info);

    CL_RELEASE_QUEUE(ctx->command_queue);

    if (!ctx->is_yuv)
        CL_RELEASE_MEMORY(ctx->grayscale);
    CL_RELEASE_MEMORY(ctx->harris_buf);
    CL_RELEASE_MEMORY(ctx->refined_features);
    CL_RELEASE_MEMORY(ctx->prev_refined_features);
    CL_RELEASE_MEMORY(ctx->brief_pattern);
    CL_RELEASE_MEMORY(ctx->descriptors);
    CL_RELEASE_MEMORY(ctx->prev_descriptors);
    CL_RELEASE_MEMORY(ctx->matches);
    CL_RELEASE_MEMORY(ctx->matches_contig);
    CL_RELEASE_MEMORY(ctx->transform_y);
    CL_RELEASE_MEMORY(ctx->transform_uv);
    if (ctx->debug_on) {
        CL_RELEASE_MEMORY(ctx->debug_matches);
        CL_RELEASE_MEMORY(ctx->debug_model_matches);
    }

    ff_opencl_filter_uninit(avctx);
}

static int deshake_opencl_init(AVFilterContext *avctx)
{
    DeshakeOpenCLContext *ctx = avctx->priv;
    AVFilterLink *outlink = avctx->outputs[0];
    AVFilterLink *inlink = avctx->inputs[0];
    // Pointer to the host-side pattern buffer to be initialized and then copied
    // to the GPU
    PointPair *pattern_host = NULL;
    cl_int cle;
    int err;
    cl_ulong8 zeroed_ulong8;
    FFFrameQueueGlobal fqg;
    cl_image_format grayscale_format;
    cl_image_desc grayscale_desc;
    cl_command_queue_properties queue_props;

    const enum AVPixelFormat disallowed_formats[14] = {
        AV_PIX_FMT_GBRP,
        AV_PIX_FMT_GBRP9BE,
        AV_PIX_FMT_GBRP9LE,
        AV_PIX_FMT_GBRP10BE,
        AV_PIX_FMT_GBRP10LE,
        AV_PIX_FMT_GBRP16BE,
        AV_PIX_FMT_GBRP16LE,
        AV_PIX_FMT_GBRAP,
        AV_PIX_FMT_GBRAP16BE,
        AV_PIX_FMT_GBRAP16LE,
        AV_PIX_FMT_GBRAP12BE,
        AV_PIX_FMT_GBRAP12LE,
        AV_PIX_FMT_GBRAP10BE,
        AV_PIX_FMT_GBRAP10LE
    };

    // Number of 32x32 blocks the frame is divided into; used to size the
    // per-block feature, descriptor, and match arrays
    const int image_grid_32 = ROUNDED_UP_DIV(outlink->h, 32) * ROUNDED_UP_DIV(outlink->w, 32);

    const int descriptor_buf_size = image_grid_32 * (BREIFN / 8);
    const int features_buf_size = image_grid_32 * sizeof(cl_float2);

    const AVHWFramesContext *hw_frames_ctx = (AVHWFramesContext*)inlink->hw_frames_ctx->data;
    const AVPixFmtDescriptor *desc = av_pix_fmt_desc_get(hw_frames_ctx->sw_format);

    av_assert0(hw_frames_ctx);
    av_assert0(desc);

    ff_framequeue_global_init(&fqg);
    ff_framequeue_init(&ctx->fq, &fqg);
    ctx->eof = 0;
    ctx->smooth_window = (int)(av_q2d(avctx->inputs[0]->frame_rate) * ctx->smooth_window_multiplier);
    ctx->curr_frame = 0;

    memset(&zeroed_ulong8, 0, sizeof(cl_ulong8));

    ctx->gauss_kernel = av_malloc_array(ctx->smooth_window, sizeof(float));
    if (!ctx->gauss_kernel) {
        err = AVERROR(ENOMEM);
        goto fail;
    }

    ctx->ransac_err = av_malloc_array(MATCHES_CONTIG_SIZE, sizeof(float));
    if (!ctx->ransac_err) {
        err = AVERROR(ENOMEM);
        goto fail;
    }

    for (int i = 0; i < RingbufCount; i++) {
        ctx->abs_motion.ringbuffers[i] = av_fifo_alloc2(ctx->smooth_window,
            sizeof(float), 0);

        if (!ctx->abs_motion.ringbuffers[i]) {
            err = AVERROR(ENOMEM);
            goto fail;
        }
    }

    if (ctx->debug_on) {
        ctx->abs_motion.debug_matches = av_fifo_alloc2(
            ctx->smooth_window / 2,
            sizeof(DebugMatches), 0
        );

        if (!ctx->abs_motion.debug_matches) {
            err = AVERROR(ENOMEM);
            goto fail;
        }
    }

    ctx->abs_motion.curr_frame_offset = 0;
    ctx->abs_motion.data_start_offset = -1;
    ctx->abs_motion.data_end_offset = -1;

    pattern_host = av_malloc_array(BREIFN, sizeof(PointPair));
    if (!pattern_host) {
        err = AVERROR(ENOMEM);
        goto fail;
    }

    ctx->matches_host = av_malloc_array(image_grid_32, sizeof(MotionVector));
    if (!ctx->matches_host) {
        err = AVERROR(ENOMEM);
        goto fail;
    }

    ctx->matches_contig_host = av_malloc_array(MATCHES_CONTIG_SIZE, sizeof(MotionVector));
    if (!ctx->matches_contig_host) {
        err = AVERROR(ENOMEM);
        goto fail;
    }

    ctx->inliers = av_malloc_array(MATCHES_CONTIG_SIZE, sizeof(MotionVector));
    if (!ctx->inliers) {
        err = AVERROR(ENOMEM);
        goto fail;
    }

    // Initialize the patch pattern used for building BRIEF descriptors
    av_lfg_init(&ctx->alfg, 234342424);
    for (int i = 0; i < BREIFN; ++i) {
        PointPair pair;

        for (int j = 0; j < 2; ++j) {
            pair.p1.s[j] = rand_in(-BRIEF_PATCH_SIZE_HALF, BRIEF_PATCH_SIZE_HALF + 1, &ctx->alfg);
            pair.p2.s[j] = rand_in(-BRIEF_PATCH_SIZE_HALF, BRIEF_PATCH_SIZE_HALF + 1, &ctx->alfg);
        }

        pattern_host[i] = pair;
    }

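    // Each pattern pair later contributes one bit to a frame's BRIEF
    // descriptors: in the usual BRIEF construction, bit i records how the
    // image intensities at pair.p1 and pair.p2 compare within the patch
    // around a feature point (the comparison itself happens in the
    // brief_descriptors OpenCL kernel, which is not part of this file).
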
    for (int i = 0; i < 14; i++) {
        if (ctx->sw_format == disallowed_formats[i]) {
            av_log(avctx, AV_LOG_ERROR, "unsupported format in deshake_opencl.\n");
            err = AVERROR(ENOSYS);
            goto fail;
        }
    }

    if (desc->flags & AV_PIX_FMT_FLAG_RGB) {
        ctx->is_yuv = 0;
    } else {
        ctx->is_yuv = 1;
    }
    ctx->sw_format = hw_frames_ctx->sw_format;

    err = ff_opencl_filter_load_program(avctx, &ff_opencl_source_deshake, 1);
    if (err < 0)
        goto fail;

    if (ctx->debug_on) {
        queue_props = CL_QUEUE_PROFILING_ENABLE;
    } else {
        queue_props = 0;
    }
    ctx->command_queue = clCreateCommandQueue(
        ctx->ocf.hwctx->context,
        ctx->ocf.hwctx->device_id,
        queue_props,
        &cle
    );
    CL_FAIL_ON_ERROR(AVERROR(EIO), "Failed to create OpenCL command queue %d.\n", cle);

    CL_CREATE_KERNEL(ctx, grayscale);
    CL_CREATE_KERNEL(ctx, harris_response);
    CL_CREATE_KERNEL(ctx, refine_features);
    CL_CREATE_KERNEL(ctx, brief_descriptors);
    CL_CREATE_KERNEL(ctx, match_descriptors);
    CL_CREATE_KERNEL(ctx, transform);
    CL_CREATE_KERNEL(ctx, crop_upscale);
    if (ctx->debug_on)
        CL_CREATE_KERNEL(ctx, draw_debug_info);

    if (!ctx->is_yuv) {
        grayscale_format.image_channel_order = CL_R;
        grayscale_format.image_channel_data_type = CL_FLOAT;

        grayscale_desc = (cl_image_desc) {
            .image_type = CL_MEM_OBJECT_IMAGE2D,
            .image_width = outlink->w,
            .image_height = outlink->h,
            .image_depth = 0,
            .image_array_size = 0,
            .image_row_pitch = 0,
            .image_slice_pitch = 0,
            .num_mip_levels = 0,
            .num_samples = 0,
            .buffer = NULL,
        };

        ctx->grayscale = clCreateImage(
            ctx->ocf.hwctx->context,
            0,
            &grayscale_format,
            &grayscale_desc,
            NULL,
            &cle
        );
        CL_FAIL_ON_ERROR(AVERROR(EIO), "Failed to create grayscale image: %d.\n", cle);
    }

    CL_CREATE_BUFFER(ctx, harris_buf, outlink->h * outlink->w * sizeof(float));
    CL_CREATE_BUFFER(ctx, refined_features, features_buf_size);
    CL_CREATE_BUFFER(ctx, prev_refined_features, features_buf_size);
    CL_CREATE_BUFFER_FLAGS(
        ctx,
        brief_pattern,
        CL_MEM_READ_WRITE | CL_MEM_COPY_HOST_PTR,
        BREIFN * sizeof(PointPair),
        pattern_host
    );
    CL_CREATE_BUFFER(ctx, descriptors, descriptor_buf_size);
    CL_CREATE_BUFFER(ctx, prev_descriptors, descriptor_buf_size);
    CL_CREATE_BUFFER(ctx, matches, image_grid_32 * sizeof(MotionVector));
    CL_CREATE_BUFFER(ctx, matches_contig, MATCHES_CONTIG_SIZE * sizeof(MotionVector));
    CL_CREATE_BUFFER(ctx, transform_y, 9 * sizeof(float));
    CL_CREATE_BUFFER(ctx, transform_uv, 9 * sizeof(float));
    if (ctx->debug_on) {
        CL_CREATE_BUFFER(ctx, debug_matches, MATCHES_CONTIG_SIZE * sizeof(MotionVector));
        CL_CREATE_BUFFER(ctx, debug_model_matches, 3 * sizeof(MotionVector));
    }

    ctx->initialized = 1;
    av_freep(&pattern_host);

    return 0;

fail:
    av_freep(&pattern_host);
    return err;
}

// Logs debug information about the transform data
static void transform_debug(AVFilterContext *avctx, float *new_vals, float *old_vals, int curr_frame) {
    av_log(avctx, AV_LOG_VERBOSE,
        "Frame %d:\n"
        "\tframe moved from: %f x, %f y\n"
        "\t              to: %f x, %f y\n"
        "\t    rotated from: %f degrees\n"
        "\t              to: %f degrees\n"
        "\t     scaled from: %f x, %f y\n"
        "\t              to: %f x, %f y\n"
        "\n"
        "\tframe moved by: %f x, %f y\n"
        "\t    rotated by: %f degrees\n"
        "\t     scaled by: %f x, %f y\n",
        curr_frame,
        old_vals[RingbufX], old_vals[RingbufY],
        new_vals[RingbufX], new_vals[RingbufY],
        old_vals[RingbufRot] * (180.0 / M_PI),
        new_vals[RingbufRot] * (180.0 / M_PI),
        old_vals[RingbufScaleX], old_vals[RingbufScaleY],
        new_vals[RingbufScaleX], new_vals[RingbufScaleY],
        old_vals[RingbufX] - new_vals[RingbufX], old_vals[RingbufY] - new_vals[RingbufY],
        old_vals[RingbufRot] * (180.0 / M_PI) - new_vals[RingbufRot] * (180.0 / M_PI),
        new_vals[RingbufScaleX] / old_vals[RingbufScaleX], new_vals[RingbufScaleY] / old_vals[RingbufScaleY]
    );
}
1366 
1367 // Uses the buffered motion information to determine a transform that smooths the
1368 // given frame and applies it
filter_frame(AVFilterLink * link,AVFrame * input_frame)1369 static int filter_frame(AVFilterLink *link, AVFrame *input_frame)
1370 {
1371     AVFilterContext *avctx = link->dst;
1372     AVFilterLink *outlink = avctx->outputs[0];
1373     DeshakeOpenCLContext *deshake_ctx = avctx->priv;
1374     AVFrame *cropped_frame = NULL, *transformed_frame = NULL;
1375     int err;
1376     cl_int cle;
1377     float new_vals[RingbufCount];
1378     float old_vals[RingbufCount];
1379     // Luma (in the case of YUV) transform, or just the transform in the case of RGB
1380     float transform_y[9];
1381     // Chroma transform
1382     float transform_uv[9];
1383     // Luma crop transform (or RGB)
1384     float transform_crop_y[9];
1385     // Chroma crop transform
1386     float transform_crop_uv[9];
1387     float transform_debug_rgb[9];
1388     size_t global_work[2];
1389     int64_t duration;
1390     cl_mem src, transformed, dst;
1391     cl_mem transforms[3];
1392     CropInfo crops[3];
1393     cl_event transform_event, crop_upscale_event;
1394     DebugMatches debug_matches;
1395     cl_int num_model_matches;
1396 
1397     const float center_w = (float)input_frame->width / 2;
1398     const float center_h = (float)input_frame->height / 2;
1399 
1400     const AVPixFmtDescriptor *desc = av_pix_fmt_desc_get(deshake_ctx->sw_format);
1401     const int chroma_width  = AV_CEIL_RSHIFT(input_frame->width, desc->log2_chroma_w);
1402     const int chroma_height = AV_CEIL_RSHIFT(input_frame->height, desc->log2_chroma_h);
1403 
1404     const float center_w_chroma = (float)chroma_width / 2;
1405     const float center_h_chroma = (float)chroma_height / 2;
1406 
1407     const float luma_w_over_chroma_w = ((float)input_frame->width / (float)chroma_width);
1408     const float luma_h_over_chroma_h = ((float)input_frame->height / (float)chroma_height);
1409 
1410     if (deshake_ctx->debug_on) {
1411         av_fifo_read(
1412             deshake_ctx->abs_motion.debug_matches,
1413             &debug_matches, 1);
1414     }
1415 
1416     if (input_frame->pkt_duration) {
1417         duration = input_frame->pkt_duration;
1418     } else {
1419         duration = av_rescale_q(1, av_inv_q(outlink->frame_rate), outlink->time_base);
1420     }
1421     deshake_ctx->duration = input_frame->pts + duration;
1422 
1423     // Get the absolute transform data for this frame
1424     for (int i = 0; i < RingbufCount; i++) {
1425         av_fifo_peek(deshake_ctx->abs_motion.ringbuffers[i],
1426                      &old_vals[i], 1,
1427                      deshake_ctx->abs_motion.curr_frame_offset);
1428     }
1429 
1430     if (deshake_ctx->tripod_mode) {
1431         // If tripod mode is turned on we simply undo all motion relative to the
1432         // first frame
1433 
1434         new_vals[RingbufX] = 0.0f;
1435         new_vals[RingbufY] = 0.0f;
1436         new_vals[RingbufRot] = 0.0f;
1437         new_vals[RingbufScaleX] = 1.0f;
1438         new_vals[RingbufScaleY] = 1.0f;
1439     } else {
1440         // Tripod mode is off and we need to smooth a moving camera
1441 
1442         new_vals[RingbufX] = smooth(
1443             deshake_ctx,
1444             deshake_ctx->gauss_kernel,
1445             deshake_ctx->smooth_window,
1446             input_frame->width,
1447             deshake_ctx->abs_motion.ringbuffers[RingbufX]
1448         );
1449         new_vals[RingbufY] = smooth(
1450             deshake_ctx,
1451             deshake_ctx->gauss_kernel,
1452             deshake_ctx->smooth_window,
1453             input_frame->height,
1454             deshake_ctx->abs_motion.ringbuffers[RingbufY]
1455         );
1456         new_vals[RingbufRot] = smooth(
1457             deshake_ctx,
1458             deshake_ctx->gauss_kernel,
1459             deshake_ctx->smooth_window,
1460             M_PI / 4,
1461             deshake_ctx->abs_motion.ringbuffers[RingbufRot]
1462         );
1463         new_vals[RingbufScaleX] = smooth(
1464             deshake_ctx,
1465             deshake_ctx->gauss_kernel,
1466             deshake_ctx->smooth_window,
1467             2.0f,
1468             deshake_ctx->abs_motion.ringbuffers[RingbufScaleX]
1469         );
1470         new_vals[RingbufScaleY] = smooth(
1471             deshake_ctx,
1472             deshake_ctx->gauss_kernel,
1473             deshake_ctx->smooth_window,
1474             2.0f,
1475             deshake_ctx->abs_motion.ringbuffers[RingbufScaleY]
1476         );
1477     }
1478 
1479     transform_center_scale(
1480         old_vals[RingbufX] - new_vals[RingbufX],
1481         old_vals[RingbufY] - new_vals[RingbufY],
1482         old_vals[RingbufRot] - new_vals[RingbufRot],
1483         new_vals[RingbufScaleX] / old_vals[RingbufScaleX],
1484         new_vals[RingbufScaleY] / old_vals[RingbufScaleY],
1485         center_w,
1486         center_h,
1487         transform_y
1488     );
1489 
1490     transform_center_scale(
1491         (old_vals[RingbufX] - new_vals[RingbufX]) / luma_w_over_chroma_w,
1492         (old_vals[RingbufY] - new_vals[RingbufY]) / luma_h_over_chroma_h,
1493         old_vals[RingbufRot] - new_vals[RingbufRot],
1494         new_vals[RingbufScaleX] / old_vals[RingbufScaleX],
1495         new_vals[RingbufScaleY] / old_vals[RingbufScaleY],
1496         center_w_chroma,
1497         center_h_chroma,
1498         transform_uv
1499     );

    CL_BLOCKING_WRITE_BUFFER(deshake_ctx->command_queue, deshake_ctx->transform_y, 9 * sizeof(float), transform_y, NULL);
    CL_BLOCKING_WRITE_BUFFER(deshake_ctx->command_queue, deshake_ctx->transform_uv, 9 * sizeof(float), transform_uv, NULL);

    if (deshake_ctx->debug_on)
        transform_debug(avctx, new_vals, old_vals, deshake_ctx->curr_frame);

    cropped_frame = ff_get_video_buffer(outlink, outlink->w, outlink->h);
    if (!cropped_frame) {
        err = AVERROR(ENOMEM);
        goto fail;
    }

    transformed_frame = ff_get_video_buffer(outlink, outlink->w, outlink->h);
    if (!transformed_frame) {
        err = AVERROR(ENOMEM);
        goto fail;
    }

    transforms[0] = deshake_ctx->transform_y;
    transforms[1] = transforms[2] = deshake_ctx->transform_uv;
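    /*
     * Plane 0 is luma and uses the luma-space matrix; planes 1 and 2 are the
     * subsampled chroma planes and share the chroma-space matrix computed
     * above.
     */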

    for (int p = 0; p < FF_ARRAY_ELEMS(transformed_frame->data); p++) {
        // Transform all of the planes appropriately
        src = (cl_mem)input_frame->data[p];
        transformed = (cl_mem)transformed_frame->data[p];

        if (!transformed)
            break;

        err = ff_opencl_filter_work_size_from_image(avctx, global_work, input_frame, p, 0);
        if (err < 0)
            goto fail;

        CL_RUN_KERNEL_WITH_ARGS(
            deshake_ctx->command_queue,
            deshake_ctx->kernel_transform,
            global_work,
            NULL,
            &transform_event,
            { sizeof(cl_mem), &src },
            { sizeof(cl_mem), &transformed },
            { sizeof(cl_mem), &transforms[p] },
        );
    }

    if (deshake_ctx->debug_on && !deshake_ctx->is_yuv && debug_matches.num_matches > 0) {
        CL_BLOCKING_WRITE_BUFFER(
            deshake_ctx->command_queue,
            deshake_ctx->debug_matches,
            debug_matches.num_matches * sizeof(MotionVector),
            debug_matches.matches,
            NULL
        );

        CL_BLOCKING_WRITE_BUFFER(
            deshake_ctx->command_queue,
            deshake_ctx->debug_model_matches,
            debug_matches.num_model_matches * sizeof(MotionVector),
            debug_matches.model_matches,
            NULL
        );

        num_model_matches = debug_matches.num_model_matches;

        // Invert the transform
        transform_center_scale(
            new_vals[RingbufX] - old_vals[RingbufX],
            new_vals[RingbufY] - old_vals[RingbufY],
            new_vals[RingbufRot] - old_vals[RingbufRot],
            old_vals[RingbufScaleX] / new_vals[RingbufScaleX],
            old_vals[RingbufScaleY] / new_vals[RingbufScaleY],
            center_w,
            center_h,
            transform_debug_rgb
        );

        CL_BLOCKING_WRITE_BUFFER(deshake_ctx->command_queue, deshake_ctx->transform_y, 9 * sizeof(float), transform_debug_rgb, NULL);

        transformed = (cl_mem)transformed_frame->data[0];
        CL_RUN_KERNEL_WITH_ARGS(
            deshake_ctx->command_queue,
            deshake_ctx->kernel_draw_debug_info,
            (size_t[]){ debug_matches.num_matches },
            NULL,
            NULL,
            { sizeof(cl_mem), &transformed },
            { sizeof(cl_mem), &deshake_ctx->debug_matches },
            { sizeof(cl_mem), &deshake_ctx->debug_model_matches },
            { sizeof(cl_int), &num_model_matches },
            { sizeof(cl_mem), &deshake_ctx->transform_y }
        );
    }

    if (deshake_ctx->should_crop) {
        // Generate transforms for cropping
        transform_center_scale(
            (old_vals[RingbufX] - new_vals[RingbufX]) / 5,
            (old_vals[RingbufY] - new_vals[RingbufY]) / 5,
            (old_vals[RingbufRot] - new_vals[RingbufRot]) / 5,
            new_vals[RingbufScaleX] / old_vals[RingbufScaleX],
            new_vals[RingbufScaleY] / old_vals[RingbufScaleY],
            center_w,
            center_h,
            transform_crop_y
        );
        update_needed_crop(&deshake_ctx->crop_y, transform_crop_y, input_frame->width, input_frame->height);

        transform_center_scale(
            (old_vals[RingbufX] - new_vals[RingbufX]) / (5 * luma_w_over_chroma_w),
            (old_vals[RingbufY] - new_vals[RingbufY]) / (5 * luma_h_over_chroma_h),
            (old_vals[RingbufRot] - new_vals[RingbufRot]) / 5,
            new_vals[RingbufScaleX] / old_vals[RingbufScaleX],
            new_vals[RingbufScaleY] / old_vals[RingbufScaleY],
            center_w_chroma,
            center_h_chroma,
            transform_crop_uv
        );
        update_needed_crop(&deshake_ctx->crop_uv, transform_crop_uv, chroma_width, chroma_height);

        crops[0] = deshake_ctx->crop_y;
        crops[1] = crops[2] = deshake_ctx->crop_uv;

        for (int p = 0; p < FF_ARRAY_ELEMS(cropped_frame->data); p++) {
            // Crop all of the planes appropriately
            dst = (cl_mem)cropped_frame->data[p];
            transformed = (cl_mem)transformed_frame->data[p];

            if (!dst)
                break;

            err = ff_opencl_filter_work_size_from_image(avctx, global_work, input_frame, p, 0);
            if (err < 0)
                goto fail;

            CL_RUN_KERNEL_WITH_ARGS(
                deshake_ctx->command_queue,
                deshake_ctx->kernel_crop_upscale,
                global_work,
                NULL,
                &crop_upscale_event,
                { sizeof(cl_mem), &transformed },
                { sizeof(cl_mem), &dst },
                { sizeof(cl_float2), &crops[p].top_left },
                { sizeof(cl_float2), &crops[p].bottom_right },
            );
        }
    }

    if (deshake_ctx->curr_frame < deshake_ctx->smooth_window / 2) {
        // This means we are somewhere at the start of the video. We need to
        // increment the current frame offset until it reaches the center of
        // the ringbuffers (as the current frame will be located there for
        // the rest of the video).
        //
        // The end of the video is taken care of by draining motion data
        // one-by-one out of the buffer, causing the (at that point fixed)
        // offset to move towards later frames' data.
        ++deshake_ctx->abs_motion.curr_frame_offset;
    }
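
    /*
     * Illustration (hypothetical numbers): with a smooth window of 30 frames
     * the offset ramps up over roughly the first 15 input frames as the
     * ringbuffers fill; from then on the current frame sits at the center of
     * the window, with half a window of past and half a window of future
     * motion data around it.
     */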

    if (deshake_ctx->abs_motion.data_end_offset != -1) {
        // Keep the end offset in sync with the frame it's supposed to be
        // positioned at
        --deshake_ctx->abs_motion.data_end_offset;

        if (deshake_ctx->abs_motion.data_end_offset == deshake_ctx->abs_motion.curr_frame_offset - 1) {
            // The end offset would be the start of the new video sequence; flip to
            // start offset
            deshake_ctx->abs_motion.data_end_offset = -1;
            deshake_ctx->abs_motion.data_start_offset = deshake_ctx->abs_motion.curr_frame_offset;
        }
    } else if (deshake_ctx->abs_motion.data_start_offset != -1) {
        // Keep the start offset in sync with the frame it's supposed to be
        // positioned at
        --deshake_ctx->abs_motion.data_start_offset;
    }

    if (deshake_ctx->debug_on) {
        deshake_ctx->transform_time += ff_opencl_get_event_time(transform_event);
        if (deshake_ctx->should_crop) {
            deshake_ctx->crop_upscale_time += ff_opencl_get_event_time(crop_upscale_event);
        }
    }

    ++deshake_ctx->curr_frame;

    if (deshake_ctx->debug_on)
        av_freep(&debug_matches.matches);

    if (deshake_ctx->should_crop) {
        err = av_frame_copy_props(cropped_frame, input_frame);
        if (err < 0)
            goto fail;

        av_frame_free(&transformed_frame);
        av_frame_free(&input_frame);
        return ff_filter_frame(outlink, cropped_frame);

    } else {
        err = av_frame_copy_props(transformed_frame, input_frame);
        if (err < 0)
            goto fail;

        av_frame_free(&cropped_frame);
        av_frame_free(&input_frame);
        return ff_filter_frame(outlink, transformed_frame);
    }

fail:
    clFinish(deshake_ctx->command_queue);

    if (deshake_ctx->debug_on)
        if (debug_matches.matches)
            av_freep(&debug_matches.matches);

    av_frame_free(&input_frame);
    av_frame_free(&transformed_frame);
    av_frame_free(&cropped_frame);
    return err;
}

// Add the given frame to the frame queue to eventually be processed.
//
// Also determines the motion from the previous frame and updates the stored
// motion information accordingly.
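//
// The per-frame feature pipeline below runs: grayscale conversion (for RGB
// input) -> harris_response -> refine_features -> brief_descriptors ->
// match_descriptors (against the previous frame), after which an affine
// model is fit to the matches with RANSAC and refined by minimize_error.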
static int queue_frame(AVFilterLink *link, AVFrame *input_frame)
{
    AVFilterContext *avctx = link->dst;
    DeshakeOpenCLContext *deshake_ctx = avctx->priv;
    int err;
    int num_vectors;
    int num_inliers = 0;
    cl_int cle;
    FrameDelta relative;
    SimilarityMatrix model;
    size_t global_work[2];
    size_t harris_global_work[2];
    size_t grid_32_global_work[2];
    int grid_32_h, grid_32_w;
    size_t local_work[2];
    cl_mem src, temp;
    float prev_vals[5];
    float new_vals[5];
    cl_event grayscale_event, harris_response_event, refine_features_event,
             brief_event, match_descriptors_event, read_buf_event;
    DebugMatches debug_matches;

    num_vectors = 0;

    local_work[0] = 8;
    local_work[1] = 8;

    err = ff_opencl_filter_work_size_from_image(avctx, global_work, input_frame, 0, 0);
    if (err < 0)
        goto fail;

    err = ff_opencl_filter_work_size_from_image(avctx, harris_global_work, input_frame, 0, 8);
    if (err < 0)
        goto fail;

    err = ff_opencl_filter_work_size_from_image(avctx, grid_32_global_work, input_frame, 0, 32);
    if (err < 0)
        goto fail;

    // We want a single work-item for each 32x32 block of pixels in the input frame
    grid_32_global_work[0] /= 32;
    grid_32_global_work[1] /= 32;

    grid_32_h = ROUNDED_UP_DIV(input_frame->height, 32);
    grid_32_w = ROUNDED_UP_DIV(input_frame->width, 32);
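
    /*
     * E.g. for a 1920x1080 frame this yields a grid of
     * ROUNDED_UP_DIV(1920, 32) x ROUNDED_UP_DIV(1080, 32) = 60 x 34 blocks,
     * i.e. 2040 work-items, one per 32x32 block of pixels.
     */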

    if (deshake_ctx->is_yuv) {
        deshake_ctx->grayscale = (cl_mem)input_frame->data[0];
    } else {
        src = (cl_mem)input_frame->data[0];

        CL_RUN_KERNEL_WITH_ARGS(
            deshake_ctx->command_queue,
            deshake_ctx->kernel_grayscale,
            global_work,
            NULL,
            &grayscale_event,
            { sizeof(cl_mem), &src },
            { sizeof(cl_mem), &deshake_ctx->grayscale }
        );
    }

    CL_RUN_KERNEL_WITH_ARGS(
        deshake_ctx->command_queue,
        deshake_ctx->kernel_harris_response,
        harris_global_work,
        local_work,
        &harris_response_event,
        { sizeof(cl_mem), &deshake_ctx->grayscale },
        { sizeof(cl_mem), &deshake_ctx->harris_buf }
    );

    CL_RUN_KERNEL_WITH_ARGS(
        deshake_ctx->command_queue,
        deshake_ctx->kernel_refine_features,
        grid_32_global_work,
        NULL,
        &refine_features_event,
        { sizeof(cl_mem), &deshake_ctx->grayscale },
        { sizeof(cl_mem), &deshake_ctx->harris_buf },
        { sizeof(cl_mem), &deshake_ctx->refined_features },
        { sizeof(cl_int), &deshake_ctx->refine_features }
    );

    CL_RUN_KERNEL_WITH_ARGS(
        deshake_ctx->command_queue,
        deshake_ctx->kernel_brief_descriptors,
        grid_32_global_work,
        NULL,
        &brief_event,
        { sizeof(cl_mem), &deshake_ctx->grayscale },
        { sizeof(cl_mem), &deshake_ctx->refined_features },
        { sizeof(cl_mem), &deshake_ctx->descriptors },
        { sizeof(cl_mem), &deshake_ctx->brief_pattern }
    );

    if (!av_fifo_can_read(deshake_ctx->abs_motion.ringbuffers[RingbufX])) {
        // This is the first frame we've been given to queue, meaning there is
        // no previous frame to match descriptors to

        goto no_motion_data;
    }

    CL_RUN_KERNEL_WITH_ARGS(
        deshake_ctx->command_queue,
        deshake_ctx->kernel_match_descriptors,
        grid_32_global_work,
        NULL,
        &match_descriptors_event,
        { sizeof(cl_mem), &deshake_ctx->prev_refined_features },
        { sizeof(cl_mem), &deshake_ctx->refined_features },
        { sizeof(cl_mem), &deshake_ctx->descriptors },
        { sizeof(cl_mem), &deshake_ctx->prev_descriptors },
        { sizeof(cl_mem), &deshake_ctx->matches }
    );

    cle = clEnqueueReadBuffer(
        deshake_ctx->command_queue,
        deshake_ctx->matches,
        CL_TRUE,
        0,
        grid_32_h * grid_32_w * sizeof(MotionVector),
        deshake_ctx->matches_host,
        0,
        NULL,
        &read_buf_event
    );
    CL_FAIL_ON_ERROR(AVERROR(EIO), "Failed to read matches to host: %d.\n", cle);

    num_vectors = make_vectors_contig(deshake_ctx, grid_32_h, grid_32_w);

    if (num_vectors < 10) {
        // Not enough matches to get reliable motion data for this frame
        //
        // From this point on all data is relative to this frame rather than the
        // original frame. We have to make sure that we don't mix values that were
        // relative to the original frame with the new values relative to this
        // frame when doing the gaussian smoothing. We keep track of where the old
        // values end using the data_end_offset field in order to accomplish
        // that goal.
        //
        // If no motion data is present for multiple frames in a short window of
        // time, we leave the end where it was to avoid mixing 0s in with the
        // old data (and just treat them all as part of the new values).
        if (deshake_ctx->abs_motion.data_end_offset == -1) {
            deshake_ctx->abs_motion.data_end_offset =
                av_fifo_can_read(deshake_ctx->abs_motion.ringbuffers[RingbufX]) - 1;
        }

        goto no_motion_data;
    }

    if (!estimate_affine_2d(
        deshake_ctx,
        deshake_ctx->matches_contig_host,
        &debug_matches,
        num_vectors,
        model.matrix,
        10.0,
        3000,
        0.999999999999
    )) {
        goto no_motion_data;
    }
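
    /*
     * The trailing parameters above appear to be the RANSAC inlier
     * threshold (10.0), iteration cap (3000), and target confidence
     * (0.999999999999, i.e. ~1 - 1e-12). Under the standard RANSAC bound,
     * the iterations needed to reach confidence p with inlier ratio w and
     * sample size s is
     *
     *     k = log(1 - p) / log(1 - w^s)
     *
     * e.g. with w = 0.5 and s = 3: k = ln(1e-12) / ln(1 - 0.125) ~= 207,
     * comfortably below the 3000-iteration cap (illustrative numbers; the
     * actual sample size is whatever estimate_affine_2d draws per
     * iteration).
     */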

    for (int i = 0; i < num_vectors; i++) {
        if (deshake_ctx->matches_contig_host[i].should_consider) {
            deshake_ctx->inliers[num_inliers] = deshake_ctx->matches_contig_host[i];
            num_inliers++;
        }
    }

    if (!minimize_error(
        deshake_ctx,
        deshake_ctx->inliers,
        &debug_matches,
        num_inliers,
        model.matrix,
        400
    )) {
        goto no_motion_data;
    }

    relative = decompose_transform(model.matrix);

    // Get the absolute transform data for the previous frame
    for (int i = 0; i < RingbufCount; i++) {
        av_fifo_peek(
            deshake_ctx->abs_motion.ringbuffers[i],
            &prev_vals[i], 1,
            av_fifo_can_read(deshake_ctx->abs_motion.ringbuffers[i]) - 1);
    }

    new_vals[RingbufX]      = prev_vals[RingbufX] + relative.translation.s[0];
    new_vals[RingbufY]      = prev_vals[RingbufY] + relative.translation.s[1];
    new_vals[RingbufRot]    = prev_vals[RingbufRot] + relative.rotation;
    new_vals[RingbufScaleX] = prev_vals[RingbufScaleX] / relative.scale.s[0];
    new_vals[RingbufScaleY] = prev_vals[RingbufScaleY] / relative.scale.s[1];

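    /*
     * Worked example (illustrative numbers): if the previous absolute
     * rotation was 0.020 rad and this frame rotated a further -0.005 rad
     * relative to it, the new absolute rotation is 0.015 rad. Translation
     * and rotation accumulate additively, while scale accumulates by
     * dividing out the per-frame relative scale.
     */
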
    if (deshake_ctx->debug_on) {
        if (!deshake_ctx->is_yuv) {
            deshake_ctx->grayscale_time     += ff_opencl_get_event_time(grayscale_event);
        }
        deshake_ctx->harris_response_time   += ff_opencl_get_event_time(harris_response_event);
        deshake_ctx->refine_features_time   += ff_opencl_get_event_time(refine_features_event);
        deshake_ctx->brief_descriptors_time += ff_opencl_get_event_time(brief_event);
        deshake_ctx->match_descriptors_time += ff_opencl_get_event_time(match_descriptors_event);
        deshake_ctx->read_buf_time          += ff_opencl_get_event_time(read_buf_event);
    }

    goto end;

no_motion_data:
    new_vals[RingbufX]      = 0.0f;
    new_vals[RingbufY]      = 0.0f;
    new_vals[RingbufRot]    = 0.0f;
    new_vals[RingbufScaleX] = 1.0f;
    new_vals[RingbufScaleY] = 1.0f;

    for (int i = 0; i < num_vectors; i++) {
        deshake_ctx->matches_contig_host[i].should_consider = 0;
    }
    debug_matches.num_model_matches = 0;

    if (deshake_ctx->debug_on) {
        av_log(avctx, AV_LOG_VERBOSE,
            "\n[ALERT] No motion data found in queue_frame, motion reset to 0\n\n"
        );
    }

end:
    // Swap the descriptor buffers (we don't need the previous frame's descriptors
    // again, so we can use that space for the next frame's descriptors)
    temp = deshake_ctx->prev_descriptors;
    deshake_ctx->prev_descriptors = deshake_ctx->descriptors;
    deshake_ctx->descriptors = temp;

    // Same for the refined features
    temp = deshake_ctx->prev_refined_features;
    deshake_ctx->prev_refined_features = deshake_ctx->refined_features;
    deshake_ctx->refined_features = temp;

    if (deshake_ctx->debug_on) {
        if (num_vectors == 0) {
            debug_matches.matches = NULL;
        } else {
            debug_matches.matches = av_malloc_array(num_vectors, sizeof(MotionVector));

            if (!debug_matches.matches) {
                err = AVERROR(ENOMEM);
                goto fail;
            }
        }

        for (int i = 0; i < num_vectors; i++) {
            debug_matches.matches[i] = deshake_ctx->matches_contig_host[i];
        }
        debug_matches.num_matches = num_vectors;

        av_fifo_write(
            deshake_ctx->abs_motion.debug_matches,
            &debug_matches, 1);
    }

    for (int i = 0; i < RingbufCount; i++) {
        av_fifo_write(deshake_ctx->abs_motion.ringbuffers[i], &new_vals[i], 1);
    }

    return ff_framequeue_add(&deshake_ctx->fq, input_frame);

fail:
    clFinish(deshake_ctx->command_queue);
    av_frame_free(&input_frame);
    return err;
}

static int activate(AVFilterContext *ctx)
{
    AVFilterLink *inlink = ctx->inputs[0];
    AVFilterLink *outlink = ctx->outputs[0];
    DeshakeOpenCLContext *deshake_ctx = ctx->priv;
    AVFrame *frame = NULL;
    int ret, status;
    int64_t pts;

    FF_FILTER_FORWARD_STATUS_BACK(outlink, inlink);

    if (!deshake_ctx->eof) {
        ret = ff_inlink_consume_frame(inlink, &frame);
        if (ret < 0)
            return ret;
        if (ret > 0) {
            if (!frame->hw_frames_ctx)
                return AVERROR(EINVAL);

            if (!deshake_ctx->initialized) {
                ret = deshake_opencl_init(ctx);
                if (ret < 0)
                    return ret;
            }

            // If there is no more space in the ringbuffers, remove the oldest
            // values to make room for the new ones
            if (!av_fifo_can_write(deshake_ctx->abs_motion.ringbuffers[RingbufX])) {
                for (int i = 0; i < RingbufCount; i++) {
                    av_fifo_drain2(deshake_ctx->abs_motion.ringbuffers[i], 1);
                }
            }
            ret = queue_frame(inlink, frame);
            if (ret < 0)
                return ret;
            if (ret >= 0) {
                // See if we have enough buffered frames to process one
                //
                // "enough" is half the smooth window of queued frames into the future
                if (ff_framequeue_queued_frames(&deshake_ctx->fq) >= deshake_ctx->smooth_window / 2) {
                    return filter_frame(inlink, ff_framequeue_take(&deshake_ctx->fq));
                }
            }
        }
    }

    if (!deshake_ctx->eof && ff_inlink_acknowledge_status(inlink, &status, &pts)) {
        if (status == AVERROR_EOF) {
            deshake_ctx->eof = 1;
        }
    }

    if (deshake_ctx->eof) {
        // Finish processing the rest of the frames in the queue.
        while (ff_framequeue_queued_frames(&deshake_ctx->fq) != 0) {
            for (int i = 0; i < RingbufCount; i++) {
                av_fifo_drain2(deshake_ctx->abs_motion.ringbuffers[i], 1);
            }

            ret = filter_frame(inlink, ff_framequeue_take(&deshake_ctx->fq));
            if (ret < 0) {
                return ret;
            }
        }

        if (deshake_ctx->debug_on) {
            av_log(ctx, AV_LOG_VERBOSE,
                "Average kernel execution times:\n"
                "\t        grayscale: %0.3f ms\n"
                "\t  harris_response: %0.3f ms\n"
                "\t  refine_features: %0.3f ms\n"
                "\tbrief_descriptors: %0.3f ms\n"
                "\tmatch_descriptors: %0.3f ms\n"
                "\t        transform: %0.3f ms\n"
                "\t     crop_upscale: %0.3f ms\n"
                "Average buffer read times:\n"
                "\t     features buf: %0.3f ms\n",
                averaged_event_time_ms(deshake_ctx->grayscale_time, deshake_ctx->curr_frame),
                averaged_event_time_ms(deshake_ctx->harris_response_time, deshake_ctx->curr_frame),
                averaged_event_time_ms(deshake_ctx->refine_features_time, deshake_ctx->curr_frame),
                averaged_event_time_ms(deshake_ctx->brief_descriptors_time, deshake_ctx->curr_frame),
                averaged_event_time_ms(deshake_ctx->match_descriptors_time, deshake_ctx->curr_frame),
                averaged_event_time_ms(deshake_ctx->transform_time, deshake_ctx->curr_frame),
                averaged_event_time_ms(deshake_ctx->crop_upscale_time, deshake_ctx->curr_frame),
                averaged_event_time_ms(deshake_ctx->read_buf_time, deshake_ctx->curr_frame)
            );
        }

        ff_outlink_set_status(outlink, AVERROR_EOF, deshake_ctx->duration);
        return 0;
    }

    if (!deshake_ctx->eof) {
        FF_FILTER_FORWARD_WANTED(outlink, inlink);
    }

    return FFERROR_NOT_READY;
}

static const AVFilterPad deshake_opencl_inputs[] = {
    {
        .name = "default",
        .type = AVMEDIA_TYPE_VIDEO,
        .config_props = &ff_opencl_filter_config_input,
    },
};

static const AVFilterPad deshake_opencl_outputs[] = {
    {
        .name = "default",
        .type = AVMEDIA_TYPE_VIDEO,
        .config_props = &ff_opencl_filter_config_output,
    },
};

#define OFFSET(x) offsetof(DeshakeOpenCLContext, x)
#define FLAGS AV_OPT_FLAG_FILTERING_PARAM|AV_OPT_FLAG_VIDEO_PARAM

static const AVOption deshake_opencl_options[] = {
    {
        "tripod", "simulates a tripod by preventing any camera movement whatsoever "
        "from the original frame",
        OFFSET(tripod_mode), AV_OPT_TYPE_BOOL, {.i64 = 0}, 0, 1, FLAGS
    },
    {
        "debug", "turn on additional debugging information",
        OFFSET(debug_on), AV_OPT_TYPE_BOOL, {.i64 = 0}, 0, 1, FLAGS
    },
    {
        "adaptive_crop", "attempt to subtly crop borders to reduce mirrored content",
        OFFSET(should_crop), AV_OPT_TYPE_BOOL, {.i64 = 1}, 0, 1, FLAGS
    },
    {
        "refine_features", "refine feature point locations at a sub-pixel level",
        OFFSET(refine_features), AV_OPT_TYPE_BOOL, {.i64 = 1}, 0, 1, FLAGS
    },
    {
        "smooth_strength", "smoothing strength (0 attempts to adaptively determine optimal strength)",
        OFFSET(smooth_percent), AV_OPT_TYPE_FLOAT, {.dbl = 0.0f}, 0.0f, 1.0f, FLAGS
    },
    {
        "smooth_window_multiplier", "multiplier for number of frames to buffer for motion data",
        OFFSET(smooth_window_multiplier), AV_OPT_TYPE_FLOAT, {.dbl = 2.0}, 0.1, 10.0, FLAGS
    },
    { NULL }
};
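
/*
 * Example usage (illustrative command line; the device name, pixel format
 * and option values depend on the local setup):
 *
 *     ffmpeg -init_hw_device opencl=gpu -filter_hw_device gpu -i in.mp4 \
 *         -vf 'format=nv12,hwupload,deshake_opencl=tripod=0:adaptive_crop=1,hwdownload,format=nv12' \
 *         out.mp4
 */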

AVFILTER_DEFINE_CLASS(deshake_opencl);

const AVFilter ff_vf_deshake_opencl = {
    .name           = "deshake_opencl",
    .description    = NULL_IF_CONFIG_SMALL("Feature-point based video stabilization filter"),
    .priv_size      = sizeof(DeshakeOpenCLContext),
    .priv_class     = &deshake_opencl_class,
    .init           = &ff_opencl_filter_init,
    .uninit         = &deshake_opencl_uninit,
    .activate       = activate,
    FILTER_INPUTS(deshake_opencl_inputs),
    FILTER_OUTPUTS(deshake_opencl_outputs),
    FILTER_SINGLE_PIXFMT(AV_PIX_FMT_OPENCL),
    .flags_internal = FF_FILTER_FLAG_HWFRAME_AWARE
};