/*
 * Copyright (c) 2019 Guo Yejun
 *
 * This file is part of FFmpeg.
 *
 * FFmpeg is free software; you can redistribute it and/or
 * modify it under the terms of the GNU Lesser General Public
 * License as published by the Free Software Foundation; either
 * version 2.1 of the License, or (at your option) any later version.
 *
 * FFmpeg is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
 * Lesser General Public License for more details.
 *
 * You should have received a copy of the GNU Lesser General Public
 * License along with FFmpeg; if not, write to the Free Software
 * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
 */

/**
 * @file
 * implementing a generic image processing filter using deep learning networks.
 */
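
/*
 * A sketch of a typical invocation (the model file "example.pb" and the
 * tensor names "dnn_in"/"dnn_out" are illustrative; the model, input and
 * output options come from the common DNN options and depend entirely on
 * the network being loaded):
 *
 *   ffmpeg -i input.png \
 *          -vf dnn_processing=dnn_backend=tensorflow:model=example.pb:input=dnn_in:output=dnn_out \
 *          output.png
 */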
25
26 #include "libavformat/avio.h"
27 #include "libavutil/opt.h"
28 #include "libavutil/pixdesc.h"
29 #include "libavutil/avassert.h"
30 #include "libavutil/imgutils.h"
31 #include "filters.h"
32 #include "dnn_filter_common.h"
33 #include "formats.h"
34 #include "internal.h"
35 #include "libswscale/swscale.h"
36 #include "libavutil/time.h"
37
38 typedef struct DnnProcessingContext {
39 const AVClass *class;
40 DnnContext dnnctx;
41 struct SwsContext *sws_uv_scale;
42 int sws_uv_height;
43 } DnnProcessingContext;
44
45 #define OFFSET(x) offsetof(DnnProcessingContext, dnnctx.x)
46 #define FLAGS AV_OPT_FLAG_FILTERING_PARAM | AV_OPT_FLAG_VIDEO_PARAM
47 static const AVOption dnn_processing_options[] = {
48 { "dnn_backend", "DNN backend", OFFSET(backend_type), AV_OPT_TYPE_INT, { .i64 = 0 }, INT_MIN, INT_MAX, FLAGS, "backend" },
49 { "native", "native backend flag", 0, AV_OPT_TYPE_CONST, { .i64 = 0 }, 0, 0, FLAGS, "backend" },
50 #if (CONFIG_LIBTENSORFLOW == 1)
51 { "tensorflow", "tensorflow backend flag", 0, AV_OPT_TYPE_CONST, { .i64 = 1 }, 0, 0, FLAGS, "backend" },
52 #endif
53 #if (CONFIG_LIBOPENVINO == 1)
54 { "openvino", "openvino backend flag", 0, AV_OPT_TYPE_CONST, { .i64 = 2 }, 0, 0, FLAGS, "backend" },
55 #endif
56 DNN_COMMON_OPTIONS
57 { NULL }
58 };
59
60 AVFILTER_DEFINE_CLASS(dnn_processing);
61
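/* Initialize the common DNN context for per-frame processing
 * (DFT_PROCESS_FRAME); this loads the model with the backend
 * selected via the options above. */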
static av_cold int init(AVFilterContext *context)
{
    DnnProcessingContext *ctx = context->priv;
    return ff_dnn_init(&ctx->dnnctx, DFT_PROCESS_FRAME, context);
}

static const enum AVPixelFormat pix_fmts[] = {
    AV_PIX_FMT_RGB24, AV_PIX_FMT_BGR24,
    AV_PIX_FMT_GRAY8, AV_PIX_FMT_GRAYF32,
    AV_PIX_FMT_YUV420P, AV_PIX_FMT_YUV422P,
    AV_PIX_FMT_YUV444P, AV_PIX_FMT_YUV410P, AV_PIX_FMT_YUV411P,
    AV_PIX_FMT_NV12,
    AV_PIX_FMT_NONE
};

#define LOG_FORMAT_CHANNEL_MISMATCH()                       \
    av_log(ctx, AV_LOG_ERROR,                               \
           "the frame's format %s does not match "          \
           "the model input channel %d\n",                  \
           av_get_pix_fmt_name(fmt),                        \
           model_input->channels);

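/* Check the input link against the model's input description: the frame size
 * must match when the model fixes it (a scale filter is expected upstream),
 * the data type must be DNN_FLOAT, and the channel count must fit the pixel
 * format: 3 for packed RGB/BGR, 1 for gray and YUV formats (for YUV, only
 * the Y plane is fed through the network). */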
static int check_modelinput_inlink(const DNNData *model_input, const AVFilterLink *inlink)
{
    AVFilterContext *ctx = inlink->dst;
    enum AVPixelFormat fmt = inlink->format;

    // the design is to add an explicit scale filter before this filter
    if (model_input->height != -1 && model_input->height != inlink->h) {
        av_log(ctx, AV_LOG_ERROR, "the model requires frame height %d but got %d\n",
               model_input->height, inlink->h);
        return AVERROR(EIO);
    }
    if (model_input->width != -1 && model_input->width != inlink->w) {
        av_log(ctx, AV_LOG_ERROR, "the model requires frame width %d but got %d\n",
               model_input->width, inlink->w);
        return AVERROR(EIO);
    }
    if (model_input->dt != DNN_FLOAT) {
        avpriv_report_missing_feature(ctx, "data type rather than DNN_FLOAT");
        return AVERROR(EIO);
    }

    switch (fmt) {
    case AV_PIX_FMT_RGB24:
    case AV_PIX_FMT_BGR24:
        if (model_input->channels != 3) {
            LOG_FORMAT_CHANNEL_MISMATCH();
            return AVERROR(EIO);
        }
        return 0;
    case AV_PIX_FMT_GRAY8:
    case AV_PIX_FMT_GRAYF32:
    case AV_PIX_FMT_YUV420P:
    case AV_PIX_FMT_YUV422P:
    case AV_PIX_FMT_YUV444P:
    case AV_PIX_FMT_YUV410P:
    case AV_PIX_FMT_YUV411P:
    case AV_PIX_FMT_NV12:
        if (model_input->channels != 1) {
            LOG_FORMAT_CHANNEL_MISMATCH();
            return AVERROR(EIO);
        }
        return 0;
    default:
        avpriv_report_missing_feature(ctx, "%s", av_get_pix_fmt_name(fmt));
        return AVERROR(EIO);
    }

    return 0;
}

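/* Query the model's input description and validate it against the input link. */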
static int config_input(AVFilterLink *inlink)
{
    AVFilterContext *context = inlink->dst;
    DnnProcessingContext *ctx = context->priv;
    int result;
    DNNData model_input;
    int check;

    result = ff_dnn_get_input(&ctx->dnnctx, &model_input);
    if (result != 0) {
        av_log(ctx, AV_LOG_ERROR, "could not get input from the model\n");
        return result;
    }

    check = check_modelinput_inlink(&model_input, inlink);
    if (check != 0) {
        return check;
    }

    return 0;
}

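/* True for non-RGB 3-component formats, i.e. planar YUV; note this also
 * matches the semi-planar AV_PIX_FMT_NV12. */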
static av_always_inline int isPlanarYUV(enum AVPixelFormat pix_fmt)
{
    const AVPixFmtDescriptor *desc = av_pix_fmt_desc_get(pix_fmt);
    av_assert0(desc);
    return !(desc->flags & AV_PIX_FMT_FLAG_RGB) && desc->nb_components == 3;
}

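/* The network processes only the Y plane of YUV input, so if the model
 * changes the frame size, the chroma planes must be rescaled separately.
 * For NV12 the interleaved UV plane is scaled in a single pass by viewing
 * it as a 2-channel AV_PIX_FMT_YA8 surface; for fully planar formats each
 * chroma plane is scaled as AV_PIX_FMT_GRAY8. */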
static int prepare_uv_scale(AVFilterLink *outlink)
{
    AVFilterContext *context = outlink->src;
    DnnProcessingContext *ctx = context->priv;
    AVFilterLink *inlink = context->inputs[0];
    enum AVPixelFormat fmt = inlink->format;

    if (isPlanarYUV(fmt)) {
        if (inlink->w != outlink->w || inlink->h != outlink->h) {
            if (fmt == AV_PIX_FMT_NV12) {
                ctx->sws_uv_scale = sws_getContext(inlink->w >> 1, inlink->h >> 1, AV_PIX_FMT_YA8,
                                                   outlink->w >> 1, outlink->h >> 1, AV_PIX_FMT_YA8,
                                                   SWS_BICUBIC, NULL, NULL, NULL);
                ctx->sws_uv_height = inlink->h >> 1;
            } else {
                const AVPixFmtDescriptor *desc = av_pix_fmt_desc_get(fmt);
                int sws_src_h = AV_CEIL_RSHIFT(inlink->h, desc->log2_chroma_h);
                int sws_src_w = AV_CEIL_RSHIFT(inlink->w, desc->log2_chroma_w);
                int sws_dst_h = AV_CEIL_RSHIFT(outlink->h, desc->log2_chroma_h);
                int sws_dst_w = AV_CEIL_RSHIFT(outlink->w, desc->log2_chroma_w);
                ctx->sws_uv_scale = sws_getContext(sws_src_w, sws_src_h, AV_PIX_FMT_GRAY8,
                                                   sws_dst_w, sws_dst_h, AV_PIX_FMT_GRAY8,
                                                   SWS_BICUBIC, NULL, NULL, NULL);
                ctx->sws_uv_height = sws_src_h;
            }
        }
    }

    return 0;
}

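/* Determine the output frame size by asking the DNN layer (which may run
 * the model once), then set up chroma scaling for YUV formats. */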
static int config_output(AVFilterLink *outlink)
{
    AVFilterContext *context = outlink->src;
    DnnProcessingContext *ctx = context->priv;
    int result;
    AVFilterLink *inlink = context->inputs[0];

    // do a trial run in case the dnn model resizes the frame
    result = ff_dnn_get_output(&ctx->dnnctx, inlink->w, inlink->h, &outlink->w, &outlink->h);
    if (result != 0) {
        av_log(ctx, AV_LOG_ERROR, "could not get output from the model\n");
        return result;
    }

    prepare_uv_scale(outlink);

    return 0;
}

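/* Fill the chroma planes of the output frame: copy them unchanged when the
 * frame size is preserved, otherwise rescale them with the swscale context
 * created in prepare_uv_scale(). */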
static int copy_uv_planes(DnnProcessingContext *ctx, AVFrame *out, const AVFrame *in)
{
    const AVPixFmtDescriptor *desc;
    int uv_height;

    if (!ctx->sws_uv_scale) {
        av_assert0(in->height == out->height && in->width == out->width);
        desc = av_pix_fmt_desc_get(in->format);
        uv_height = AV_CEIL_RSHIFT(in->height, desc->log2_chroma_h);
        for (int i = 1; i < 3; ++i) {
            int bytewidth = av_image_get_linesize(in->format, in->width, i);
            if (bytewidth < 0) {
                return AVERROR(EINVAL);
            }
            av_image_copy_plane(out->data[i], out->linesize[i],
                                in->data[i], in->linesize[i],
                                bytewidth, uv_height);
        }
    } else if (in->format == AV_PIX_FMT_NV12) {
        sws_scale(ctx->sws_uv_scale, (const uint8_t **)(in->data + 1), in->linesize + 1,
                  0, ctx->sws_uv_height, out->data + 1, out->linesize + 1);
    } else {
        sws_scale(ctx->sws_uv_scale, (const uint8_t **)(in->data + 1), in->linesize + 1,
                  0, ctx->sws_uv_height, out->data + 1, out->linesize + 1);
        sws_scale(ctx->sws_uv_scale, (const uint8_t **)(in->data + 2), in->linesize + 2,
                  0, ctx->sws_uv_height, out->data + 2, out->linesize + 2);
    }

    return 0;
}

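/* Ask the backend to flush pending tasks, then poll until every async
 * result has been retrieved and forwarded downstream. */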
static int flush_frame(AVFilterLink *outlink, int64_t pts, int64_t *out_pts)
{
    DnnProcessingContext *ctx = outlink->src->priv;
    int ret;
    DNNAsyncStatusType async_state;

    ret = ff_dnn_flush(&ctx->dnnctx);
    if (ret != 0) {
        return -1;
    }

    do {
        AVFrame *in_frame = NULL;
        AVFrame *out_frame = NULL;
        async_state = ff_dnn_get_result(&ctx->dnnctx, &in_frame, &out_frame);
        if (out_frame) {
            if (isPlanarYUV(in_frame->format))
                copy_uv_planes(ctx, out_frame, in_frame);
            av_frame_free(&in_frame);
            // read the pts before handing the frame to ff_filter_frame,
            // which takes ownership of it
            if (out_pts)
                *out_pts = out_frame->pts + pts;
            ret = ff_filter_frame(outlink, out_frame);
            if (ret < 0)
                return ret;
        }
        av_usleep(5000);
    } while (async_state >= DAST_NOT_READY);

    return 0;
}

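/* Activation callback: feed every queued input frame to the backend, then
 * forward all finished results; on EOF, flush the backend and propagate
 * the status downstream with an updated timestamp. */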
static int activate(AVFilterContext *filter_ctx)
{
    AVFilterLink *inlink = filter_ctx->inputs[0];
    AVFilterLink *outlink = filter_ctx->outputs[0];
    DnnProcessingContext *ctx = filter_ctx->priv;
    AVFrame *in = NULL, *out = NULL;
    int64_t pts;
    int ret, status;
    int got_frame = 0;
    int async_state;

    FF_FILTER_FORWARD_STATUS_BACK(outlink, inlink);

    do {
        // drain all input frames
        ret = ff_inlink_consume_frame(inlink, &in);
        if (ret < 0)
            return ret;
        if (ret > 0) {
            out = ff_get_video_buffer(outlink, outlink->w, outlink->h);
            if (!out) {
                av_frame_free(&in);
                return AVERROR(ENOMEM);
            }
            av_frame_copy_props(out, in);
            if (ff_dnn_execute_model(&ctx->dnnctx, in, out) != 0) {
                return AVERROR(EIO);
            }
        }
    } while (ret > 0);

    // drain all processed frames
    do {
        AVFrame *in_frame = NULL;
        AVFrame *out_frame = NULL;
        async_state = ff_dnn_get_result(&ctx->dnnctx, &in_frame, &out_frame);
        if (out_frame) {
            if (isPlanarYUV(in_frame->format))
                copy_uv_planes(ctx, out_frame, in_frame);
            av_frame_free(&in_frame);
            ret = ff_filter_frame(outlink, out_frame);
            if (ret < 0)
                return ret;
            got_frame = 1;
        }
    } while (async_state == DAST_SUCCESS);

    // if a frame was output, let the next filter be scheduled
    if (got_frame)
        return 0;

    if (ff_inlink_acknowledge_status(inlink, &status, &pts)) {
        if (status == AVERROR_EOF) {
            int64_t out_pts = pts;
            ret = flush_frame(outlink, pts, &out_pts);
            ff_outlink_set_status(outlink, status, out_pts);
            return ret;
        }
    }

    FF_FILTER_FORWARD_WANTED(outlink, inlink);

    return 0;
}

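/* Free the chroma swscale context and the DNN backend resources. */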
static av_cold void uninit(AVFilterContext *ctx)
{
    DnnProcessingContext *context = ctx->priv;

    sws_freeContext(context->sws_uv_scale);
    ff_dnn_uninit(&context->dnnctx);
}

static const AVFilterPad dnn_processing_inputs[] = {
    {
        .name         = "default",
        .type         = AVMEDIA_TYPE_VIDEO,
        .config_props = config_input,
    },
};

static const AVFilterPad dnn_processing_outputs[] = {
    {
        .name         = "default",
        .type         = AVMEDIA_TYPE_VIDEO,
        .config_props = config_output,
    },
};

const AVFilter ff_vf_dnn_processing = {
    .name          = "dnn_processing",
    .description   = NULL_IF_CONFIG_SMALL("Apply DNN processing filter to the input."),
    .priv_size     = sizeof(DnnProcessingContext),
    .init          = init,
    .uninit        = uninit,
    FILTER_INPUTS(dnn_processing_inputs),
    FILTER_OUTPUTS(dnn_processing_outputs),
    FILTER_PIXFMTS_ARRAY(pix_fmts),
    .priv_class    = &dnn_processing_class,
    .activate      = activate,
};