• Home
  • Line#
  • Scopes#
  • Navigate#
  • Raw
  • Download
1 /*
2  * This file is part of FFmpeg.
3  *
4  * FFmpeg is free software; you can redistribute it and/or
5  * modify it under the terms of the GNU Lesser General Public
6  * License as published by the Free Software Foundation; either
7  * version 2.1 of the License, or (at your option) any later version.
8  *
9  * FFmpeg is distributed in the hope that it will be useful,
10  * but WITHOUT ANY WARRANTY; without even the implied warranty of
11  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
12  * Lesser General Public License for more details.
13  *
14  * You should have received a copy of the GNU Lesser General Public
15  * License along with FFmpeg; if not, write to the Free Software
16  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
17  */
18 
19 /**
20  * @file
21  * Implementation of a classification filter using deep learning networks.
22  */
23 
24 #include "libavformat/avio.h"
25 #include "libavutil/opt.h"
26 #include "libavutil/pixdesc.h"
27 #include "libavutil/avassert.h"
28 #include "libavutil/imgutils.h"
29 #include "filters.h"
30 #include "dnn_filter_common.h"
31 #include "formats.h"
32 #include "internal.h"
33 #include "libavutil/time.h"
34 #include "libavutil/avstring.h"
35 #include "libavutil/detection_bbox.h"
36 
/**
 * Private context of the dnn_classify filter.
 *
 * The option table in this file writes into it through OFFSET() (fields of
 * the embedded dnnctx) and OFFSET2() (fields declared directly here).
 */
37 typedef struct DnnClassifyContext {
38     const AVClass *class;
39     DnnContext dnnctx;       ///< DNN state handed to the ff_dnn_* helpers
40     float confidence;        ///< "confidence" option; best scores below it are not recorded
41     char *labels_filename;   ///< "labels" option; file read by read_classify_label_file()
42     char *target;            ///< "target" option; passed as-is to ff_dnn_execute_model_classification()
43     char **labels;           ///< label strings loaded from labels_filename, indexed by class id
44     int label_count;         ///< number of entries in labels
45 } DnnClassifyContext;
46 
/* OFFSET() addresses a member of the embedded dnnctx, OFFSET2() a member
 * declared directly in DnnClassifyContext. */
47 #define OFFSET(x) offsetof(DnnClassifyContext, dnnctx.x)
48 #define OFFSET2(x) offsetof(DnnClassifyContext, x)
49 #define FLAGS AV_OPT_FLAG_FILTERING_PARAM | AV_OPT_FLAG_VIDEO_PARAM
/* User-visible options. "dnn_backend" defaults to 2, the same value the
 * named constant "openvino" carries; that constant is only offered when
 * CONFIG_LIBOPENVINO is 1. DNN_COMMON_OPTIONS is expanded from a macro
 * defined outside this file. */
50 static const AVOption dnn_classify_options[] = {
51     { "dnn_backend", "DNN backend",                OFFSET(backend_type),     AV_OPT_TYPE_INT,       { .i64 = 2 },    INT_MIN, INT_MAX, FLAGS, "backend" },
52 #if (CONFIG_LIBOPENVINO == 1)
53     { "openvino",    "openvino backend flag",      0,                        AV_OPT_TYPE_CONST,     { .i64 = 2 },    0, 0, FLAGS, "backend" },
54 #endif
55     DNN_COMMON_OPTIONS
56     { "confidence",  "threshold of confidence",    OFFSET2(confidence),      AV_OPT_TYPE_FLOAT,     { .dbl = 0.5 },  0, 1, FLAGS},
57     { "labels",      "path to labels file",        OFFSET2(labels_filename), AV_OPT_TYPE_STRING,    { .str = NULL }, 0, 0, FLAGS },
58     { "target",      "which one to be classified", OFFSET2(target),          AV_OPT_TYPE_STRING,    { .str = NULL }, 0, 0, FLAGS },
59     { NULL }
60 };
61 
/* Defines dnn_classify_class, which the filter definition below references
 * as its priv_class. */
62 AVFILTER_DEFINE_CLASS(dnn_classify);
63 
/**
 * Classification post-processing callback: select the best-scoring class
 * from the network output and append it to one detection bounding box.
 *
 * @param frame      frame expected to carry AV_FRAME_DATA_DETECTION_BBOXES
 *                   side data
 * @param output     network output; output->data is read as
 *                   output->channels float scores
 * @param bbox_index index of the bounding box the scores belong to
 * @param filter_ctx filter instance whose priv is a DnnClassifyContext
 * @return 0 on success (also when nothing is recorded because the best score
 *         is below the threshold or the box has no free classification
 *         slot), -1 on invalid output, missing side data or an out-of-range
 *         bbox_index
 */
static int dnn_classify_post_proc(AVFrame *frame, DNNData *output, uint32_t bbox_index, AVFilterContext *filter_ctx)
{
    DnnClassifyContext *ctx = filter_ctx->priv;
    float conf_threshold = ctx->confidence;
    AVDetectionBBoxHeader *header;
    AVDetectionBBox *bbox;
    float *classifications;
    uint32_t label_id;
    float confidence;
    AVFrameSideData *sd;

    if (output->channels <= 0) {
        return -1;
    }

    sd = av_frame_get_side_data(frame, AV_FRAME_DATA_DETECTION_BBOXES);
    if (!sd) {
        av_log(filter_ctx, AV_LOG_ERROR, "Cannot get side data in dnn_classify_post_proc\n");
        return -1;
    }
    header = (AVDetectionBBoxHeader *)sd->data;

    /* Never touch a box the side data does not contain. */
    if (bbox_index >= header->nb_bboxes) {
        av_log(filter_ctx, AV_LOG_ERROR, "bbox index %u is out of range, only %u bboxes available\n",
               (unsigned)bbox_index, (unsigned)header->nb_bboxes);
        return -1;
    }

    /* Record this model in the header's source string once per frame,
     * i.e. when the first box is handled. */
    if (bbox_index == 0) {
        av_strlcat(header->source, ", ", sizeof(header->source));
        av_strlcat(header->source, ctx->dnnctx.model_filename, sizeof(header->source));
    }

    /* argmax over the per-class scores */
    classifications = output->data;
    label_id = 0;
    confidence = classifications[0];
    for (int i = 1; i < output->channels; i++) {
        if (classifications[i] > confidence) {
            label_id = i;
            confidence = classifications[i];
        }
    }

    if (confidence < conf_threshold) {
        return 0;
    }

    bbox = av_get_detection_bbox(header, bbox_index);

    /* The per-box classification arrays have a fixed capacity; writing at
     * classify_count once they are full would run past them. */
    if (bbox->classify_count >= sizeof(bbox->classify_labels) / sizeof(bbox->classify_labels[0])) {
        av_log(filter_ctx, AV_LOG_WARNING, "bbox %u has no free classification slot, result dropped\n",
               (unsigned)bbox_index);
        return 0;
    }

    /* Confidence is stored as a rational with four decimal digits. */
    bbox->classify_confidences[bbox->classify_count] = av_make_q((int)(confidence * 10000), 10000);

    if (ctx->labels && label_id < ctx->label_count) {
        av_strlcpy(bbox->classify_labels[bbox->classify_count], ctx->labels[label_id], sizeof(bbox->classify_labels[bbox->classify_count]));
    } else {
        /* No label text available: fall back to the numeric class id.
         * label_id is unsigned, so it is printed with %u. */
        snprintf(bbox->classify_labels[bbox->classify_count], sizeof(bbox->classify_labels[bbox->classify_count]), "%u", (unsigned)label_id);
    }

    bbox->classify_count++;

    return 0;
}
118 
/**
 * Release every label string held by the context, then the label array
 * itself, leaving the context with zero labels.
 *
 * @param ctx filter private context whose labels/label_count are reset
 */
static void free_classify_labels(DnnClassifyContext *ctx)
{
    int idx = 0;

    while (idx < ctx->label_count) {
        av_freep(&ctx->labels[idx]);
        idx++;
    }

    ctx->label_count = 0;
    av_freep(&ctx->labels);
}
127 
/**
 * Load the labels file named by ctx->labels_filename, one label per line.
 *
 * Trailing '\n', '\r' and ' ' characters are stripped from each line, lines
 * that end up empty are skipped, and every remaining line is appended to
 * ctx->labels. Labels appended before a failure stay in the context.
 *
 * @param context filter instance whose priv is a DnnClassifyContext
 * @return 0 on success, AVERROR(EINVAL) if the file cannot be opened or a
 *         label is too long, AVERROR(ENOMEM) on allocation failure
 */
static int read_classify_label_file(AVFilterContext *context)
{
    DnnClassifyContext *ctx = context->priv;
    char buf[256];
    int ret = 0;
    FILE *file = avpriv_fopen_utf8(ctx->labels_filename, "r");

    if (!file) {
        av_log(context, AV_LOG_ERROR, "failed to open file %s\n", ctx->labels_filename);
        return AVERROR(EINVAL);
    }

    while (fgets(buf, sizeof(buf), file)) {
        char *label;
        int line_len = strlen(buf);

        /* strip trailing line terminators and spaces */
        while (line_len > 0 &&
               (buf[line_len - 1] == '\n' ||
                buf[line_len - 1] == '\r' ||
                buf[line_len - 1] == ' ')) {
            buf[--line_len] = '\0';
        }

        if (!line_len)  // empty line
            continue;

        if (line_len >= AV_DETECTION_BBOX_LABEL_NAME_MAX_SIZE) {
            av_log(context, AV_LOG_ERROR, "label %s too long\n", buf);
            ret = AVERROR(EINVAL);
            break;
        }

        label = av_strdup(buf);
        if (!label) {
            av_log(context, AV_LOG_ERROR, "failed to allocate memory for label %s\n", buf);
            ret = AVERROR(ENOMEM);
            break;
        }

        if (av_dynarray_add_nofree(&ctx->labels, &ctx->label_count, label) < 0) {
            av_log(context, AV_LOG_ERROR, "failed to do av_dynarray_add\n");
            /* the array did not take the string, so it is still ours to free */
            av_freep(&label);
            ret = AVERROR(ENOMEM);
            break;
        }
    }

    fclose(file);
    return ret;
}
185 
dnn_classify_init(AVFilterContext * context)186 static av_cold int dnn_classify_init(AVFilterContext *context)
187 {
188     DnnClassifyContext *ctx = context->priv;
189     int ret = ff_dnn_init(&ctx->dnnctx, DFT_ANALYTICS_CLASSIFY, context);
190     if (ret < 0)
191         return ret;
192     ff_dnn_set_classify_post_proc(&ctx->dnnctx, dnn_classify_post_proc);
193 
194     if (ctx->labels_filename) {
195         return read_classify_label_file(context);
196     }
197     return 0;
198 }
199 
/* Pixel formats passed to FILTER_PIXFMTS_ARRAY() in the filter definition;
 * the list is terminated by AV_PIX_FMT_NONE. */
200 static const enum AVPixelFormat pix_fmts[] = {
201     AV_PIX_FMT_RGB24, AV_PIX_FMT_BGR24,
202     AV_PIX_FMT_GRAY8, AV_PIX_FMT_GRAYF32,
203     AV_PIX_FMT_YUV420P, AV_PIX_FMT_YUV422P,
204     AV_PIX_FMT_YUV444P, AV_PIX_FMT_YUV410P, AV_PIX_FMT_YUV411P,
205     AV_PIX_FMT_NV12,
206     AV_PIX_FMT_NONE
207 };
208 
/**
 * Flush the DNN context and forward every frame it still delivers.
 *
 * @param outlink link the finished frames are sent on
 * @param pts     value added to a forwarded frame's pts to form *out_pts
 * @param out_pts if non-NULL, receives (pts of the last forwarded frame + pts);
 *                left untouched when no frame is forwarded
 * @return 0 on success, -1 if ff_dnn_flush() fails, or the negative value
 *         returned by ff_filter_frame()
 */
static int dnn_classify_flush_frame(AVFilterLink *outlink, int64_t pts, int64_t *out_pts)
{
    DnnClassifyContext *ctx = outlink->src->priv;
    int ret;
    DNNAsyncStatusType async_state;

    ret = ff_dnn_flush(&ctx->dnnctx);
    if (ret != 0) {
        return -1;
    }

    do {
        AVFrame *in_frame = NULL;
        AVFrame *out_frame = NULL;
        async_state = ff_dnn_get_result(&ctx->dnnctx, &in_frame, &out_frame);
        if (async_state == DAST_SUCCESS) {
            /* ff_filter_frame() takes ownership of the frame and may free
             * it, so the pts has to be read before the hand-off. */
            const int64_t frame_pts = in_frame->pts;

            ret = ff_filter_frame(outlink, in_frame);
            if (ret < 0)
                return ret;
            if (out_pts)
                *out_pts = frame_pts + pts;
        }
        /* Poll again after 5 ms. NOTE(review): the loop condition relies on
         * the ordering of DNNAsyncStatusType values, which is declared
         * outside this file. */
        av_usleep(5000);
    } while (async_state >= DAST_NOT_READY);

    return 0;
}
236 
/**
 * Filter activate callback.
 *
 * Submits every queued input frame for classification, forwards every frame
 * whose result is ready, and on input EOF flushes the DNN context before
 * setting the status on the output link.
 *
 * @param filter_ctx filter instance whose priv is a DnnClassifyContext
 * @return 0 on success, AVERROR(EIO) if a frame cannot be submitted, or a
 *         negative value propagated from the link/flush helpers
 */
static int dnn_classify_activate(AVFilterContext *filter_ctx)
{
    AVFilterLink *inlink    = filter_ctx->inputs[0];
    AVFilterLink *outlink   = filter_ctx->outputs[0];
    DnnClassifyContext *ctx = filter_ctx->priv;
    int frames_sent = 0;
    int64_t pts;
    int status;
    int ret;

    /* NOTE(review): the FF_FILTER_FORWARD_* macros come from another header
     * and are expected to be able to return from this function. */
    FF_FILTER_FORWARD_STATUS_BACK(outlink, inlink);

    /* submit every frame currently queued on the input */
    for (;;) {
        AVFrame *in = NULL;

        ret = ff_inlink_consume_frame(inlink, &in);
        if (ret < 0)
            return ret;
        if (ret == 0)
            break;

        /* NOTE(review): `in` is not freed on this failure path; confirm
         * whether the DNN layer releases it when submission fails. */
        if (ff_dnn_execute_model_classification(&ctx->dnnctx, in, NULL, ctx->target) != 0)
            return AVERROR(EIO);
    }

    /* forward every frame whose classification has completed */
    for (;;) {
        AVFrame *in_frame  = NULL;
        AVFrame *out_frame = NULL;

        if (ff_dnn_get_result(&ctx->dnnctx, &in_frame, &out_frame) != DAST_SUCCESS)
            break;

        ret = ff_filter_frame(outlink, in_frame);
        if (ret < 0)
            return ret;
        frames_sent = 1;
    }

    /* something was output: let the next filter run first */
    if (frames_sent)
        return 0;

    if (ff_inlink_acknowledge_status(inlink, &status, &pts) && status == AVERROR_EOF) {
        int64_t out_pts = pts;

        ret = dnn_classify_flush_frame(outlink, pts, &out_pts);
        ff_outlink_set_status(outlink, status, out_pts);
        return ret;
    }

    FF_FILTER_FORWARD_WANTED(outlink, inlink);

    return 0;
}
292 
dnn_classify_uninit(AVFilterContext * context)293 static av_cold void dnn_classify_uninit(AVFilterContext *context)
294 {
295     DnnClassifyContext *ctx = context->priv;
296     ff_dnn_uninit(&ctx->dnnctx);
297     free_classify_labels(ctx);
298 }
299 
/* The filter's single input pad: video, named "default". */
300 static const AVFilterPad dnn_classify_inputs[] = {
301     {
302         .name         = "default",
303         .type         = AVMEDIA_TYPE_VIDEO,
304     },
305 };
306 
/* The filter's single output pad: video, named "default". */
307 static const AVFilterPad dnn_classify_outputs[] = {
308     {
309         .name = "default",
310         .type = AVMEDIA_TYPE_VIDEO,
311     },
312 };
313 
/* Filter definition for "dnn_classify": ties together the private context
 * size, the init/uninit/activate callbacks, the pads, the pixel format list
 * and the option class defined in this file. */
314 const AVFilter ff_vf_dnn_classify = {
315     .name          = "dnn_classify",
316     .description   = NULL_IF_CONFIG_SMALL("Apply DNN classify filter to the input."),
317     .priv_size     = sizeof(DnnClassifyContext),
318     .init          = dnn_classify_init,
319     .uninit        = dnn_classify_uninit,
320     FILTER_INPUTS(dnn_classify_inputs),
321     FILTER_OUTPUTS(dnn_classify_outputs),
322     FILTER_PIXFMTS_ARRAY(pix_fmts),
323     .priv_class    = &dnn_classify_class,
324     .activate      = dnn_classify_activate,
325 };
326