1 /*
2 * Copyright (c) 2001 Heikki Leinonen
3 * Copyright (c) 2001 Chris Bagwell
4 * Copyright (c) 2003 Donnie Smith
5 * Copyright (c) 2014 Paul B Mahol
6 *
7 * This file is part of FFmpeg.
8 *
9 * FFmpeg is free software; you can redistribute it and/or
10 * modify it under the terms of the GNU Lesser General Public
11 * License as published by the Free Software Foundation; either
12 * version 2.1 of the License, or (at your option) any later version.
13 *
14 * FFmpeg is distributed in the hope that it will be useful,
15 * but WITHOUT ANY WARRANTY; without even the implied warranty of
16 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
17 * Lesser General Public License for more details.
18 *
19 * You should have received a copy of the GNU Lesser General Public
20 * License along with FFmpeg; if not, write to the Free Software
21 * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
22 */
23
24 #include <float.h> /* DBL_MAX */
25
26 #include "libavutil/audio_fifo.h"
27 #include "libavutil/avassert.h"
28 #include "libavutil/opt.h"
29 #include "libavutil/timestamp.h"
30 #include "audio.h"
31 #include "formats.h"
32 #include "avfilter.h"
33 #include "internal.h"
34
/* Measure selected by the "detection" option. */
enum SilenceDetect {
    D_PEAK = 0, /* work on absolute sample values */
    D_RMS  = 1, /* work on squared sample values */
};
39
/* How the per-channel threshold results are combined (start_mode / stop_mode). */
enum ThresholdMode {
    T_ANY = 0, /* one channel above the threshold is enough (results are OR-ed) */
    T_ALL = 1, /* every channel must be above the threshold (results are AND-ed) */
};
44
/* States of the per-frame state machine run by filter_frame(). */
enum SilenceMode {
    SILENCE_TRIM       = 0, /* scanning input for the configured start periods */
    SILENCE_TRIM_FLUSH = 1, /* emitting what was held back while trimming */
    SILENCE_COPY       = 2, /* passing samples through, optionally watching for stop silence */
    SILENCE_COPY_FLUSH = 3, /* emitting the samples held in stop_holdoff */
    SILENCE_STOP       = 4, /* nothing more is written to the output queue */
};
52
/**
 * Private context of the silenceremove filter.
 *
 * Fields ending in _opt are filled in by the AVOption table; config_input()
 * rescales them with the input sample rate and AV_TIME_BASE into the
 * same-named fields without the suffix.
 */
typedef struct SilenceRemoveContext {
    const AVClass *class;

    enum SilenceMode mode;          /* current state of the filter_frame() state machine */

    int start_periods;              /* "start_periods" option; non-zero starts the filter in SILENCE_TRIM */
    int64_t start_duration;         /* start_duration_opt rescaled by config_input() */
    int64_t start_duration_opt;     /* "start_duration" option */
    double start_threshold;         /* "start_threshold" option, compared against compute() */
    int64_t start_silence;          /* start_silence_opt rescaled by config_input() */
    int64_t start_silence_opt;      /* "start_silence" option */
    int start_mode;                 /* "start_mode" option: T_ANY or T_ALL */

    int stop_periods;               /* "stop_periods" option; init() stores the absolute value */
    int64_t stop_duration;          /* stop_duration_opt rescaled by config_input() */
    int64_t stop_duration_opt;      /* "stop_duration" option */
    double stop_threshold;          /* "stop_threshold" option, compared against compute() */
    int64_t stop_silence;           /* stop_silence_opt rescaled by config_input() */
    int64_t stop_silence_opt;       /* "stop_silence" option */
    int stop_mode;                  /* "stop_mode" option: T_ANY or T_ALL */

    int64_t window_duration_opt;    /* "window" option */

    AVFrame *start_holdoff;         /* above-threshold samples held while trimming */
    AVFrame *start_silence_hold;    /* ring buffer of the most recent below-threshold samples (trim) */
    size_t start_holdoff_offset;    /* read position in start_holdoff */
    size_t start_holdoff_end;       /* number of samples stored in start_holdoff */
    size_t start_silence_offset;    /* next write position in start_silence_hold */
    size_t start_silence_end;       /* valid samples in start_silence_hold, capped at start_silence */
    int start_found_periods;        /* periods counted so far while trimming */

    AVFrame *stop_holdoff;          /* below-threshold samples held while copying */
    AVFrame *stop_silence_hold;     /* ring buffer of the most recent below-threshold samples (copy) */
    size_t stop_holdoff_offset;     /* read position in stop_holdoff */
    size_t stop_holdoff_end;        /* number of samples stored in stop_holdoff */
    size_t stop_silence_offset;     /* next write position in stop_silence_hold */
    size_t stop_silence_end;        /* valid samples in stop_silence_hold, capped at stop_silence */
    int stop_found_periods;         /* periods counted so far while copying */

    AVFrame *window;                /* per-sample contributions currently included in sum */
    int window_offset;              /* window slot replaced by the next sample; wraps at window_duration */
    int64_t window_duration;        /* window length in samples, at least 1 */
    double sum;                     /* running sum maintained by the update() callbacks (one value for all channels) */

    int one_period;                 /* sample counter; a period is only counted when it is >= 1 */
    int restart;                    /* set by init() when stop_periods was given as a negative value */
    int64_t next_pts;               /* pts assigned to the next output frame */

    int detection;                  /* "detection" option: D_PEAK or D_RMS */
    /* Sample-format specific callbacks selected in config_input(). */
    void (*update)(struct SilenceRemoveContext *s, AVFrame *frame, int ch, int offset);
    double (*compute)(struct SilenceRemoveContext *s, AVFrame *frame, int ch, int offset);
    void (*copy)(struct SilenceRemoveContext *s, AVFrame *out, AVFrame *in,
                 int ch, int out_offset, int in_offset);

    AVAudioFifo *fifo;              /* samples queued for output; drained at the end of filter_frame() */
} SilenceRemoveContext;
109
/* Byte offset of field x inside SilenceRemoveContext, for the AVOption table. */
#define OFFSET(x) offsetof(SilenceRemoveContext, x)
/*
 * Flags shared by every option of this filter. The replacement list is
 * parenthesized so that AF stays a single operand wherever it is expanded.
 */
#define AF (AV_OPT_FLAG_FILTERING_PARAM|AV_OPT_FLAG_AUDIO_PARAM)
112
/*
 * User-visible options. Duration options are stored in the *_opt fields and
 * rescaled in config_input(). "start_mode" and "stop_mode" share the "mode"
 * unit and therefore the "any"/"all" named constants. "stop_periods" accepts
 * negative values; init() turns them into a positive count plus the restart
 * flag.
 */
static const AVOption silenceremove_options[] = {
    { "start_periods", "set periods of silence parts to skip from start", OFFSET(start_periods), AV_OPT_TYPE_INT, {.i64=0}, 0, 9000, AF },
    { "start_duration", "set start duration of non-silence part", OFFSET(start_duration_opt), AV_OPT_TYPE_DURATION, {.i64=0}, 0, INT32_MAX, AF },
    { "start_threshold", "set threshold for start silence detection", OFFSET(start_threshold), AV_OPT_TYPE_DOUBLE, {.dbl=0}, 0, DBL_MAX, AF },
    { "start_silence", "set start duration of silence part to keep", OFFSET(start_silence_opt), AV_OPT_TYPE_DURATION, {.i64=0}, 0, INT32_MAX, AF },
    { "start_mode", "set which channel will trigger trimming from start", OFFSET(start_mode), AV_OPT_TYPE_INT, {.i64=T_ANY}, T_ANY, T_ALL, AF, "mode" },
    { "any", 0, 0, AV_OPT_TYPE_CONST, {.i64=T_ANY}, 0, 0, AF, "mode" },
    { "all", 0, 0, AV_OPT_TYPE_CONST, {.i64=T_ALL}, 0, 0, AF, "mode" },
    { "stop_periods", "set periods of silence parts to skip from end", OFFSET(stop_periods), AV_OPT_TYPE_INT, {.i64=0}, -9000, 9000, AF },
    { "stop_duration", "set stop duration of non-silence part", OFFSET(stop_duration_opt), AV_OPT_TYPE_DURATION, {.i64=0}, 0, INT32_MAX, AF },
    { "stop_threshold", "set threshold for stop silence detection", OFFSET(stop_threshold), AV_OPT_TYPE_DOUBLE, {.dbl=0}, 0, DBL_MAX, AF },
    { "stop_silence", "set stop duration of silence part to keep", OFFSET(stop_silence_opt), AV_OPT_TYPE_DURATION, {.i64=0}, 0, INT32_MAX, AF },
    { "stop_mode", "set which channel will trigger trimming from end", OFFSET(stop_mode), AV_OPT_TYPE_INT, {.i64=T_ANY}, T_ANY, T_ALL, AF, "mode" },
    { "detection", "set how silence is detected", OFFSET(detection), AV_OPT_TYPE_INT, {.i64=D_RMS}, D_PEAK,D_RMS, AF, "detection" },
    { "peak", "use absolute values of samples", 0, AV_OPT_TYPE_CONST, {.i64=D_PEAK},0, 0, AF, "detection" },
    { "rms", "use squared values of samples", 0, AV_OPT_TYPE_CONST, {.i64=D_RMS}, 0, 0, AF, "detection" },
    { "window", "set duration of window for silence detection", OFFSET(window_duration_opt), AV_OPT_TYPE_DURATION, {.i64=20000}, 0, 100000000, AF },
    { NULL }
};

/* Defines silenceremove_class, referenced by ff_af_silenceremove.priv_class. */
AVFILTER_DEFINE_CLASS(silenceremove);
134
/**
 * Copy a single sample of channel ch between two interleaved double frames.
 *
 * @param s          filter context (not used here)
 * @param out        destination frame
 * @param in         source frame
 * @param ch         channel index
 * @param out_offset sample position in the destination
 * @param in_offset  sample position in the source
 */
static void copy_double(SilenceRemoveContext *s, AVFrame *out, AVFrame *in,
                        int ch, int out_offset, int in_offset)
{
    const double *input = (const double *)in->data[0];
    double *output = (double *)out->data[0];
    const int from = in->ch_layout.nb_channels * in_offset + ch;
    const int to = out->ch_layout.nb_channels * out_offset + ch;

    output[to] = input[from];
}
144
/**
 * Copy a single sample of channel ch between two planar double frames.
 *
 * @param s          filter context (not used here)
 * @param out        destination frame
 * @param in         source frame
 * @param ch         channel (plane) index
 * @param out_offset sample position in the destination plane
 * @param in_offset  sample position in the source plane
 */
static void copy_doublep(SilenceRemoveContext *s, AVFrame *out, AVFrame *in,
                         int ch, int out_offset, int in_offset)
{
    const double *input = (const double *)in->extended_data[ch];
    double *output = (double *)out->extended_data[ch];

    output[out_offset] = input[in_offset];
}
154
/**
 * Copy a single sample of channel ch between two interleaved float frames.
 *
 * @param s          filter context (not used here)
 * @param out        destination frame
 * @param in         source frame
 * @param ch         channel index
 * @param out_offset sample position in the destination
 * @param in_offset  sample position in the source
 */
static void copy_float(SilenceRemoveContext *s, AVFrame *out, AVFrame *in,
                       int ch, int out_offset, int in_offset)
{
    const float *input = (const float *)in->data[0];
    float *output = (float *)out->data[0];
    const int from = in->ch_layout.nb_channels * in_offset + ch;
    const int to = out->ch_layout.nb_channels * out_offset + ch;

    output[to] = input[from];
}
164
/**
 * Copy a single sample of channel ch between two planar float frames.
 *
 * @param s          filter context (not used here)
 * @param out        destination frame
 * @param in         source frame
 * @param ch         channel (plane) index
 * @param out_offset sample position in the destination plane
 * @param in_offset  sample position in the source plane
 */
static void copy_floatp(SilenceRemoveContext *s, AVFrame *out, AVFrame *in,
                        int ch, int out_offset, int in_offset)
{
    const float *input = (const float *)in->extended_data[ch];
    float *output = (float *)out->extended_data[ch];

    output[out_offset] = input[in_offset];
}
174
/**
 * Peak measure for interleaved double input, without modifying any state.
 *
 * Takes the running sum, removes the window entry at window_offset for this
 * channel (clamping at zero), adds |sample| and divides by the window length.
 *
 * @return the value to compare against the start/stop threshold
 */
static double compute_peak_double(SilenceRemoveContext *s, AVFrame *frame, int ch, int offset)
{
    const int nb_channels = frame->ch_layout.nb_channels;
    const double *in = (const double *)frame->data[0];
    const double *win = (const double *)s->window->data[0];
    const double incoming = in[nb_channels * offset + ch];
    const double outgoing = win[nb_channels * s->window_offset + ch];
    double total = fmax(s->sum - outgoing, 0.);

    total += fabs(incoming);

    return total / s->window_duration;
}
190
/**
 * Commit one interleaved double sample to the peak window.
 *
 * The window entry at window_offset for this channel is subtracted from the
 * running sum (clamped at zero), overwritten with |sample| and added back.
 */
static void update_peak_double(SilenceRemoveContext *s, AVFrame *frame, int ch, int offset)
{
    const int nb_channels = frame->ch_layout.nb_channels;
    const double *in = (const double *)frame->data[0];
    const double incoming = in[nb_channels * offset + ch];
    double *slot = (double *)s->window->data[0] + nb_channels * s->window_offset + ch;

    s->sum = fmax(s->sum - *slot, 0.);
    *slot = fabs(incoming);
    s->sum += *slot;
}
203
/**
 * Peak measure for interleaved float input, without modifying any state.
 *
 * The running sum is narrowed to float first; the rest of the arithmetic
 * (remove the outgoing window entry, clamp at zero, add |sample|, divide by
 * the window length) is carried out in single precision.
 *
 * @return the value to compare against the start/stop threshold
 */
static double compute_peak_float(SilenceRemoveContext *s, AVFrame *frame, int ch, int offset)
{
    const int nb_channels = frame->ch_layout.nb_channels;
    const float *in = (const float *)frame->data[0];
    const float *win = (const float *)s->window->data[0];
    const float incoming = in[nb_channels * offset + ch];
    const float outgoing = win[nb_channels * s->window_offset + ch];
    float total = s->sum;

    total = fmaxf(total - outgoing, 0.f);
    total += fabsf(incoming);

    return total / s->window_duration;
}
219
/**
 * Commit one interleaved float sample to the peak window.
 *
 * The outgoing window entry is subtracted from the running sum, the result is
 * clamped at zero through fmaxf() (so it passes through single precision),
 * then the entry is overwritten with |sample| and added back.
 */
static void update_peak_float(SilenceRemoveContext *s, AVFrame *frame, int ch, int offset)
{
    const int nb_channels = frame->ch_layout.nb_channels;
    const float *in = (const float *)frame->data[0];
    const float incoming = in[nb_channels * offset + ch];
    float *slot = (float *)s->window->data[0] + nb_channels * s->window_offset + ch;

    s->sum = fmaxf(s->sum - *slot, 0.f);
    *slot = fabsf(incoming);
    s->sum += *slot;
}
232
/**
 * RMS measure for interleaved double input, without modifying any state.
 *
 * Takes the running sum, removes the window entry at window_offset for this
 * channel (clamping at zero), adds sample^2 and returns the square root of
 * the result divided by the window length.
 *
 * @return the value to compare against the start/stop threshold
 */
static double compute_rms_double(SilenceRemoveContext *s, AVFrame *frame, int ch, int offset)
{
    const int nb_channels = frame->ch_layout.nb_channels;
    const double *in = (const double *)frame->data[0];
    const double *win = (const double *)s->window->data[0];
    const double incoming = in[nb_channels * offset + ch];
    const double outgoing = win[nb_channels * s->window_offset + ch];
    double total = fmax(s->sum - outgoing, 0.);

    total += incoming * incoming;

    av_assert2(total >= 0.);
    return sqrt(total / s->window_duration);
}
249
/**
 * Commit one interleaved double sample to the RMS window.
 *
 * The window entry at window_offset for this channel is subtracted from the
 * running sum (clamped at zero), overwritten with sample^2 and added back.
 */
static void update_rms_double(SilenceRemoveContext *s, AVFrame *frame, int ch, int offset)
{
    const int nb_channels = frame->ch_layout.nb_channels;
    const double *in = (const double *)frame->data[0];
    const double incoming = in[nb_channels * offset + ch];
    double *slot = (double *)s->window->data[0] + nb_channels * s->window_offset + ch;

    s->sum = fmax(s->sum - *slot, 0.);
    *slot = incoming * incoming;
    s->sum += *slot;
}
262
/**
 * RMS measure for interleaved float input, without modifying any state.
 *
 * The running sum is narrowed to float first; removing the outgoing window
 * entry, clamping at zero, adding sample^2 and taking the square root of the
 * mean are all done in single precision.
 *
 * @return the value to compare against the start/stop threshold
 */
static double compute_rms_float(SilenceRemoveContext *s, AVFrame *frame, int ch, int offset)
{
    const int nb_channels = frame->ch_layout.nb_channels;
    const float *in = (const float *)frame->data[0];
    const float *win = (const float *)s->window->data[0];
    const float incoming = in[nb_channels * offset + ch];
    const float outgoing = win[nb_channels * s->window_offset + ch];
    float total = s->sum;

    total = fmaxf(total - outgoing, 0.f);
    total += incoming * incoming;

    av_assert2(total >= 0.f);
    return sqrtf(total / s->window_duration);
}
279
/**
 * Commit one interleaved float sample to the RMS window.
 *
 * The outgoing window entry is subtracted from the running sum, the result is
 * clamped at zero through fmaxf() (so it passes through single precision),
 * then the entry is overwritten with sample^2 and added back.
 */
static void update_rms_float(SilenceRemoveContext *s, AVFrame *frame, int ch, int offset)
{
    const int nb_channels = frame->ch_layout.nb_channels;
    const float *in = (const float *)frame->data[0];
    const float incoming = in[nb_channels * offset + ch];
    float *slot = (float *)s->window->data[0] + nb_channels * s->window_offset + ch;

    s->sum = fmaxf(s->sum - *slot, 0.f);
    *slot = incoming * incoming;
    s->sum += *slot;
}
292
/**
 * Peak measure for planar double input, without modifying any state.
 *
 * Takes the running sum, removes this channel's window entry at
 * window_offset (clamping at zero), adds |sample| and divides by the window
 * length.
 *
 * @return the value to compare against the start/stop threshold
 */
static double compute_peak_doublep(SilenceRemoveContext *s, AVFrame *frame, int ch, int offset)
{
    const double *plane = (const double *)frame->extended_data[ch];
    const double *win = (const double *)s->window->extended_data[ch];
    const double incoming = plane[offset];
    const double outgoing = win[s->window_offset];
    double total = fmax(s->sum - outgoing, 0.);

    total += fabs(incoming);

    return total / s->window_duration;
}
308
/**
 * Commit one planar double sample to the peak window.
 *
 * This channel's window entry at window_offset is subtracted from the running
 * sum (clamped at zero), overwritten with |sample| and added back.
 */
static void update_peak_doublep(SilenceRemoveContext *s, AVFrame *frame, int ch, int offset)
{
    const double *plane = (const double *)frame->extended_data[ch];
    const double incoming = plane[offset];
    double *slot = (double *)s->window->extended_data[ch] + s->window_offset;

    s->sum = fmax(s->sum - *slot, 0.);
    *slot = fabs(incoming);
    s->sum += *slot;
}
321
/**
 * Peak measure for planar float input, without modifying any state.
 *
 * The running sum is narrowed to float first; removing the outgoing window
 * entry, clamping at zero, adding |sample| and dividing by the window length
 * are done in single precision.
 *
 * @return the value to compare against the start/stop threshold
 */
static double compute_peak_floatp(SilenceRemoveContext *s, AVFrame *frame, int ch, int offset)
{
    const float *plane = (const float *)frame->extended_data[ch];
    const float *win = (const float *)s->window->extended_data[ch];
    const float incoming = plane[offset];
    const float outgoing = win[s->window_offset];
    float total = s->sum;

    total = fmaxf(total - outgoing, 0.f);
    total += fabsf(incoming);

    return total / s->window_duration;
}
337
/**
 * Commit one planar float sample to the peak window.
 *
 * The outgoing window entry is subtracted from the running sum, the result is
 * clamped at zero through fmaxf() (so it passes through single precision),
 * then the entry is overwritten with |sample| and added back.
 */
static void update_peak_floatp(SilenceRemoveContext *s, AVFrame *frame, int ch, int offset)
{
    const float *plane = (const float *)frame->extended_data[ch];
    const float incoming = plane[offset];
    float *slot = (float *)s->window->extended_data[ch] + s->window_offset;

    s->sum = fmaxf(s->sum - *slot, 0.f);
    *slot = fabsf(incoming);
    s->sum += *slot;
}
350
/**
 * RMS measure for planar double input, without modifying any state.
 *
 * Takes the running sum, removes this channel's window entry at
 * window_offset (clamping at zero), adds sample^2 and returns the square root
 * of the result divided by the window length.
 *
 * @return the value to compare against the start/stop threshold
 */
static double compute_rms_doublep(SilenceRemoveContext *s, AVFrame *frame, int ch, int offset)
{
    const double *plane = (const double *)frame->extended_data[ch];
    const double *win = (const double *)s->window->extended_data[ch];
    const double incoming = plane[offset];
    const double outgoing = win[s->window_offset];
    double total = fmax(s->sum - outgoing, 0.);

    total += incoming * incoming;

    av_assert2(total >= 0.);
    return sqrt(total / s->window_duration);
}
367
/**
 * Commit one planar double sample to the RMS window.
 *
 * This channel's window entry at window_offset is subtracted from the running
 * sum (clamped at zero), overwritten with sample^2 and added back.
 */
static void update_rms_doublep(SilenceRemoveContext *s, AVFrame *frame, int ch, int offset)
{
    const double *plane = (const double *)frame->extended_data[ch];
    const double incoming = plane[offset];
    double *slot = (double *)s->window->extended_data[ch] + s->window_offset;

    s->sum = fmax(s->sum - *slot, 0.);
    *slot = incoming * incoming;
    s->sum += *slot;
}
380
/**
 * RMS measure for planar float input, without modifying any state.
 *
 * The running sum is narrowed to float first; removing the outgoing window
 * entry, clamping at zero, adding sample^2 and taking the square root of the
 * mean are all done in single precision.
 *
 * @return the value to compare against the start/stop threshold
 */
static double compute_rms_floatp(SilenceRemoveContext *s, AVFrame *frame, int ch, int offset)
{
    const float *plane = (const float *)frame->extended_data[ch];
    const float *win = (const float *)s->window->extended_data[ch];
    const float incoming = plane[offset];
    const float outgoing = win[s->window_offset];
    float total = s->sum;

    total = fmaxf(total - outgoing, 0.f);
    total += incoming * incoming;

    av_assert2(total >= 0.f);
    return sqrtf(total / s->window_duration);
}
397
/**
 * Commit one planar float sample to the RMS window.
 *
 * The outgoing window entry is subtracted from the running sum, the result is
 * clamped at zero through fmaxf() (so it passes through single precision),
 * then the entry is overwritten with sample^2 and added back.
 */
static void update_rms_floatp(SilenceRemoveContext *s, AVFrame *frame, int ch, int offset)
{
    const float *plane = (const float *)frame->extended_data[ch];
    const float incoming = plane[offset];
    float *slot = (float *)s->window->extended_data[ch] + s->window_offset;

    s->sum = fmaxf(s->sum - *slot, 0.f);
    *slot = incoming * incoming;
    s->sum += *slot;
}
410
init(AVFilterContext * ctx)411 static av_cold int init(AVFilterContext *ctx)
412 {
413 SilenceRemoveContext *s = ctx->priv;
414
415 if (s->stop_periods < 0) {
416 s->stop_periods = -s->stop_periods;
417 s->restart = 1;
418 }
419
420 return 0;
421 }
422
/**
 * Reset the detection window: zero the running sum and the write position
 * and fill all window_duration samples of the window frame with silence.
 */
static void clear_window(SilenceRemoveContext *s)
{
    AVFrame *win = s->window;

    s->sum = 0;
    s->window_offset = 0;

    av_samples_set_silence(win->extended_data, 0, s->window_duration,
                           win->ch_layout.nb_channels, win->format);
}
431
/**
 * Input link configuration.
 *
 * Rescales the duration options with the input sample rate, allocates the
 * detection window and the four hold buffers (each at least one sample long)
 * from the output link, picks the initial mode, selects the
 * copy/update/compute callbacks matching the sample format and detection
 * method, and allocates the output FIFO.
 *
 * @return 0 on success, AVERROR(ENOMEM) when an allocation fails,
 *         AVERROR_BUG for a sample format that has no callbacks
 */
static int config_input(AVFilterLink *inlink)
{
    AVFilterContext *ctx = inlink->dst;
    SilenceRemoveContext *s = ctx->priv;
    AVFilterLink *outlink = ctx->outputs[0];
    const int rate = inlink->sample_rate;
    const int peak = s->detection == D_PEAK;
    const int rms = s->detection == D_RMS;

    s->next_pts = AV_NOPTS_VALUE;

    /* Detection window, never shorter than one sample. */
    s->window_duration = av_rescale(s->window_duration_opt, rate, AV_TIME_BASE);
    if (s->window_duration < 1)
        s->window_duration = 1;
    s->window = ff_get_audio_buffer(outlink, s->window_duration);
    if (!s->window)
        return AVERROR(ENOMEM);

    clear_window(s);

    s->start_duration = av_rescale(s->start_duration_opt, rate, AV_TIME_BASE);
    s->start_silence = av_rescale(s->start_silence_opt, rate, AV_TIME_BASE);
    s->stop_duration = av_rescale(s->stop_duration_opt, rate, AV_TIME_BASE);
    s->stop_silence = av_rescale(s->stop_silence_opt, rate, AV_TIME_BASE);

    /* Buffers used while trimming the start. */
    s->start_holdoff = ff_get_audio_buffer(outlink, FFMAX(s->start_duration, 1));
    if (!s->start_holdoff)
        return AVERROR(ENOMEM);

    s->start_silence_hold = ff_get_audio_buffer(outlink, FFMAX(s->start_silence, 1));
    if (!s->start_silence_hold)
        return AVERROR(ENOMEM);

    s->start_found_periods = 0;
    s->start_holdoff_end = 0;
    s->start_holdoff_offset = 0;

    /* Buffers used while watching for the stop condition. */
    s->stop_holdoff = ff_get_audio_buffer(outlink, FFMAX(s->stop_duration, 1));
    if (!s->stop_holdoff)
        return AVERROR(ENOMEM);

    s->stop_silence_hold = ff_get_audio_buffer(outlink, FFMAX(s->stop_silence, 1));
    if (!s->stop_silence_hold)
        return AVERROR(ENOMEM);

    s->stop_found_periods = 0;
    s->stop_holdoff_end = 0;
    s->stop_holdoff_offset = 0;

    /* Without start periods there is nothing to trim. */
    s->mode = s->start_periods ? SILENCE_TRIM : SILENCE_COPY;

    switch (inlink->format) {
    case AV_SAMPLE_FMT_DBL:
        s->copy = copy_double;
        if (peak) {
            s->update = update_peak_double;
            s->compute = compute_peak_double;
        } else if (rms) {
            s->update = update_rms_double;
            s->compute = compute_rms_double;
        }
        break;
    case AV_SAMPLE_FMT_FLT:
        s->copy = copy_float;
        if (peak) {
            s->update = update_peak_float;
            s->compute = compute_peak_float;
        } else if (rms) {
            s->update = update_rms_float;
            s->compute = compute_rms_float;
        }
        break;
    case AV_SAMPLE_FMT_DBLP:
        s->copy = copy_doublep;
        if (peak) {
            s->update = update_peak_doublep;
            s->compute = compute_peak_doublep;
        } else if (rms) {
            s->update = update_rms_doublep;
            s->compute = compute_rms_doublep;
        }
        break;
    case AV_SAMPLE_FMT_FLTP:
        s->copy = copy_floatp;
        if (peak) {
            s->update = update_peak_floatp;
            s->compute = compute_peak_floatp;
        } else if (rms) {
            s->update = update_rms_floatp;
            s->compute = compute_rms_floatp;
        }
        break;
    default:
        return AVERROR_BUG;
    }

    s->fifo = av_audio_fifo_alloc(inlink->format, inlink->ch_layout.nb_channels, 1024);

    return s->fifo ? 0 : AVERROR(ENOMEM);
}
552
/**
 * Queue the samples collected in out and, optionally, the kept stop silence.
 *
 * If *nb_samples_written is non-zero, that many samples of out are written to
 * the FIFO and the counter is reset. out is freed in every case. When
 * flush_silence is set and stop_silence_hold contains samples, they are
 * unrolled from the ring buffer into a temporary frame (the part starting at
 * the write position first, then the part before it), written to the FIFO,
 * and the ring buffer is marked empty. A failed allocation of the temporary
 * frame makes the function return without queuing the silence.
 *
 * @param s                  filter context
 * @param out                frame holding the collected samples; freed here
 * @param outlink            output link used for allocation and layout/format
 * @param nb_samples_written number of valid samples in out; reset to 0 once queued
 * @param flush_silence      non-zero to also queue the kept stop silence
 */
static void flush(SilenceRemoveContext *s,
                  AVFrame *out, AVFilterLink *outlink,
                  int *nb_samples_written, int flush_silence)
{
    const int nb_channels = outlink->ch_layout.nb_channels;
    const int pending = *nb_samples_written;
    AVFrame *kept;

    if (pending) {
        out->nb_samples = pending;

        av_audio_fifo_write(s->fifo, (void **)out->extended_data, out->nb_samples);
        *nb_samples_written = 0;
    }

    av_frame_free(&out);

    if (!flush_silence || s->stop_silence_end <= 0)
        return;

    kept = ff_get_audio_buffer(outlink, s->stop_silence_end);
    if (!kept)
        return;

    /* Part of the ring buffer from the write position up to its end. */
    if (s->stop_silence_offset < s->stop_silence_end)
        av_samples_copy(kept->extended_data, s->stop_silence_hold->extended_data,
                        0, s->stop_silence_offset,
                        s->stop_silence_end - s->stop_silence_offset,
                        nb_channels, outlink->format);

    /* Part of the ring buffer before the write position. */
    if (s->stop_silence_offset > 0)
        av_samples_copy(kept->extended_data, s->stop_silence_hold->extended_data,
                        s->stop_silence_end - s->stop_silence_offset,
                        0, s->stop_silence_offset,
                        nb_channels, outlink->format);

    s->stop_silence_end = 0;
    s->stop_silence_offset = 0;

    av_audio_fifo_write(s->fifo, (void **)kept->extended_data, kept->nb_samples);
    av_frame_free(&kept);
}
595
/**
 * Run one input frame through the trim/copy state machine.
 *
 * The state is kept in s->mode and the states are chained with gotos, so one
 * call can pass through several of them. Samples selected for output are
 * written to s->fifo; after the state machine has finished, the FIFO content
 * is sent downstream as a single frame stamped with s->next_pts. The input
 * frame is freed on every path through this function.
 *
 * @param inlink input link the frame arrived on
 * @param in     input frame
 * @return 0 or the result of ff_filter_frame() on success,
 *         AVERROR(ENOMEM) when a frame cannot be allocated
 */
static int filter_frame(AVFilterLink *inlink, AVFrame *in)
{
    AVFilterContext *ctx = inlink->dst;
    AVFilterLink *outlink = ctx->outputs[0];
    SilenceRemoveContext *s = ctx->priv;
    int nbs, nb_samples_read, nb_samples_written;
    int i, j, threshold, ret = 0;
    AVFrame *out;

    /* Positions in the input frame and in the current output frame. */
    nb_samples_read = nb_samples_written = 0;

    /* The first frame seeds the output timestamps. */
    if (s->next_pts == AV_NOPTS_VALUE)
        s->next_pts = in->pts;

    switch (s->mode) {
    case SILENCE_TRIM:
silence_trim:
        nbs = in->nb_samples - nb_samples_read;
        if (!nbs)
            break;

        /* i only counts iterations; the input position is nb_samples_read. */
        for (i = 0; i < nbs; i++) {
            /*
             * compute() evaluates the window as it would look with this
             * sample included; update() commits it afterwards.
             */
            if (s->start_mode == T_ANY) {
                threshold = 0;
                for (j = 0; j < outlink->ch_layout.nb_channels; j++) {
                    threshold |= s->compute(s, in, j, nb_samples_read) > s->start_threshold;
                }
            } else {
                threshold = 1;
                for (j = 0; j < outlink->ch_layout.nb_channels; j++) {
                    threshold &= s->compute(s, in, j, nb_samples_read) > s->start_threshold;
                }
            }

            if (threshold) {
                /* Above the threshold: hold the sample back in start_holdoff. */
                for (j = 0; j < outlink->ch_layout.nb_channels; j++) {
                    s->update(s, in, j, nb_samples_read);
                    s->copy(s, s->start_holdoff, in, j, s->start_holdoff_end, nb_samples_read);
                }

                s->window_offset++;
                if (s->window_offset >= s->window_duration)
                    s->window_offset = 0;
                s->start_holdoff_end++;
                nb_samples_read++;

                if (s->start_holdoff_end >= s->start_duration) {
                    /* A period is only counted when one_period is non-zero. */
                    s->start_found_periods += s->one_period >= 1;
                    s->one_period = 0;
                    if (s->start_found_periods >= s->start_periods) {
                        s->mode = SILENCE_TRIM_FLUSH;
                        goto silence_trim_flush;
                    }

                    /* More periods required: drop everything held so far. */
                    s->start_holdoff_offset = 0;
                    s->start_holdoff_end = 0;
                    s->start_silence_offset = 0;
                    s->start_silence_end = 0;
                }
            } else {
                /* Below the threshold: whatever was held back is discarded. */
                s->start_holdoff_end = 0;
                s->one_period++;

                for (j = 0; j < outlink->ch_layout.nb_channels; j++) {
                    s->update(s, in, j, nb_samples_read);
                    if (s->start_silence)
                        s->copy(s, s->start_silence_hold, in, j, s->start_silence_offset, nb_samples_read);
                }

                s->window_offset++;
                if (s->window_offset >= s->window_duration)
                    s->window_offset = 0;
                nb_samples_read++;
                s->start_silence_offset++;

                /* start_silence_hold is a ring buffer of start_silence samples. */
                if (s->start_silence) {
                    s->start_silence_end = FFMIN(s->start_silence_end + 1, s->start_silence);
                    if (s->start_silence_offset >= s->start_silence)
                        s->start_silence_offset = 0;
                }
            }
        }
        break;

    case SILENCE_TRIM_FLUSH:
silence_trim_flush:
        nbs = s->start_holdoff_end - s->start_holdoff_offset;
        if (!nbs)
            break;

        /* Room for the kept silence followed by the held samples. */
        out = ff_get_audio_buffer(outlink, nbs + s->start_silence_end);
        if (!out) {
            av_frame_free(&in);
            return AVERROR(ENOMEM);
        }

        if (s->start_silence_end > 0) {
            /* Unroll the ring buffer: from the write position to the end ... */
            if (s->start_silence_offset < s->start_silence_end) {
                av_samples_copy(out->extended_data, s->start_silence_hold->extended_data, 0,
                                s->start_silence_offset,
                                s->start_silence_end - s->start_silence_offset,
                                outlink->ch_layout.nb_channels, outlink->format);
            }

            /* ... then the part before the write position. */
            if (s->start_silence_offset > 0) {
                av_samples_copy(out->extended_data, s->start_silence_hold->extended_data,
                                s->start_silence_end - s->start_silence_offset,
                                0, s->start_silence_offset,
                                outlink->ch_layout.nb_channels, outlink->format);
            }
        }

        av_samples_copy(out->extended_data, s->start_holdoff->extended_data,
                        s->start_silence_end,
                        s->start_holdoff_offset, nbs,
                        outlink->ch_layout.nb_channels, outlink->format);

        s->start_holdoff_offset += nbs;

        av_audio_fifo_write(s->fifo, (void **)out->extended_data, out->nb_samples);
        av_frame_free(&out);

        if (s->start_holdoff_offset == s->start_holdoff_end) {
            s->start_holdoff_offset = 0;
            s->start_holdoff_end = 0;
            s->start_silence_offset = 0;
            s->start_silence_end = 0;
            s->mode = SILENCE_COPY;
            goto silence_copy;
        }
        break;

    case SILENCE_COPY:
silence_copy:
        nbs = in->nb_samples - nb_samples_read;
        if (!nbs)
            break;

        out = ff_get_audio_buffer(outlink, nbs);
        if (!out) {
            av_frame_free(&in);
            return AVERROR(ENOMEM);
        }

        if (s->stop_periods) {
            /* i only counts iterations; the input position is nb_samples_read. */
            for (i = 0; i < nbs; i++) {
                if (s->stop_mode == T_ANY) {
                    threshold = 0;
                    for (j = 0; j < outlink->ch_layout.nb_channels; j++) {
                        threshold |= s->compute(s, in, j, nb_samples_read) > s->stop_threshold;
                    }
                } else {
                    threshold = 1;
                    for (j = 0; j < outlink->ch_layout.nb_channels; j++) {
                        threshold &= s->compute(s, in, j, nb_samples_read) > s->stop_threshold;
                    }
                }

                if (threshold && s->stop_holdoff_end && !s->stop_silence) {
                    /*
                     * Samples are being held and no stop silence is kept:
                     * queue what was collected (flush() frees out) and emit
                     * the held samples. The current sample is not consumed
                     * here.
                     */
                    s->mode = SILENCE_COPY_FLUSH;
                    flush(s, out, outlink, &nb_samples_written, 0);
                    s->one_period++;
                    goto silence_copy_flush;
                } else if (threshold) {
                    /* Above the threshold: copy the sample to the output frame. */
                    for (j = 0; j < outlink->ch_layout.nb_channels; j++) {
                        s->update(s, in, j, nb_samples_read);
                        s->copy(s, out, in, j, nb_samples_written, nb_samples_read);
                    }

                    s->window_offset++;
                    if (s->window_offset >= s->window_duration)
                        s->window_offset = 0;
                    nb_samples_read++;
                    nb_samples_written++;
                    s->one_period++;
                } else if (!threshold) {
                    /*
                     * Below the threshold: hold the sample in stop_holdoff
                     * and, if stop silence is kept, also in the
                     * stop_silence_hold ring buffer.
                     */
                    for (j = 0; j < outlink->ch_layout.nb_channels; j++) {
                        s->update(s, in, j, nb_samples_read);
                        if (s->stop_silence)
                            s->copy(s, s->stop_silence_hold, in, j, s->stop_silence_offset, nb_samples_read);

                        s->copy(s, s->stop_holdoff, in, j, s->stop_holdoff_end, nb_samples_read);
                    }

                    if (s->stop_silence) {
                        s->stop_silence_offset++;
                        s->stop_silence_end = FFMIN(s->stop_silence_end + 1, s->stop_silence);
                        if (s->stop_silence_offset >= s->stop_silence) {
                            s->stop_silence_offset = 0;
                        }
                    }

                    s->window_offset++;
                    if (s->window_offset >= s->window_duration)
                        s->window_offset = 0;
                    nb_samples_read++;
                    s->stop_holdoff_end++;

                    if (s->stop_holdoff_end >= s->stop_duration) {
                        /* A period is only counted when one_period is non-zero. */
                        s->stop_found_periods += s->one_period >= 1;
                        s->one_period = 0;
                        if (s->stop_found_periods >= s->stop_periods) {
                            /* The held samples are dropped. */
                            s->stop_holdoff_offset = 0;
                            s->stop_holdoff_end = 0;

                            if (!s->restart) {
                                s->mode = SILENCE_STOP;
                                flush(s, out, outlink, &nb_samples_written, 1);
                                goto silence_stop;
                            } else {
                                /* Restart: reset the start state and trim again. */
                                s->stop_found_periods = 0;
                                s->start_found_periods = 0;
                                s->start_holdoff_offset = 0;
                                s->start_holdoff_end = 0;
                                s->start_silence_offset = 0;
                                s->start_silence_end = 0;
                                clear_window(s);
                                s->mode = SILENCE_TRIM;
                                flush(s, out, outlink, &nb_samples_written, 1);
                                goto silence_trim;
                            }
                        }
                        /* Not enough periods yet: emit the held samples. */
                        s->mode = SILENCE_COPY_FLUSH;
                        flush(s, out, outlink, &nb_samples_written, 0);
                        goto silence_copy_flush;
                    }
                }
            }
            /* Whole frame consumed: queue what was collected. */
            s->one_period++;
            flush(s, out, outlink, &nb_samples_written, 0);
        } else {
            /* No stop detection requested: pass the remaining samples through. */
            av_samples_copy(out->extended_data, in->extended_data,
                            nb_samples_written,
                            nb_samples_read, nbs,
                            outlink->ch_layout.nb_channels, outlink->format);

            av_audio_fifo_write(s->fifo, (void **)out->extended_data, out->nb_samples);
            av_frame_free(&out);
        }
        break;

    case SILENCE_COPY_FLUSH:
silence_copy_flush:
        nbs = s->stop_holdoff_end - s->stop_holdoff_offset;
        if (!nbs)
            break;

        out = ff_get_audio_buffer(outlink, nbs);
        if (!out) {
            av_frame_free(&in);
            return AVERROR(ENOMEM);
        }

        av_samples_copy(out->extended_data, s->stop_holdoff->extended_data, 0,
                        s->stop_holdoff_offset, nbs,
                        outlink->ch_layout.nb_channels, outlink->format);

        s->stop_holdoff_offset += nbs;

        av_audio_fifo_write(s->fifo, (void **)out->extended_data, out->nb_samples);
        av_frame_free(&out);

        if (s->stop_holdoff_offset == s->stop_holdoff_end) {
            s->stop_holdoff_offset = 0;
            s->stop_holdoff_end = 0;
            s->stop_silence_offset = 0;
            s->stop_silence_end = 0;
            s->mode = SILENCE_COPY;
            goto silence_copy;
        }
        break;
    case SILENCE_STOP:
silence_stop:
        /* Remaining input is not processed. */
        break;
    default:
        ret = AVERROR_BUG;
    }

    av_frame_free(&in);

    /* Send everything queued during this call downstream as one frame. */
    if (av_audio_fifo_size(s->fifo) > 0) {
        out = ff_get_audio_buffer(outlink, av_audio_fifo_size(s->fifo));
        if (!out)
            return AVERROR(ENOMEM);

        av_audio_fifo_read(s->fifo, (void **)out->extended_data, out->nb_samples);
        out->pts = s->next_pts;
        s->next_pts += av_rescale_q(out->nb_samples,
                                    (AVRational){1, outlink->sample_rate},
                                    outlink->time_base);

        ret = ff_filter_frame(outlink, out);
    }

    return ret;
}
892
/**
 * Output request callback.
 *
 * Forwards the request to the input. When the input reports AVERROR_EOF
 * while the filter is in SILENCE_COPY or SILENCE_COPY_FLUSH, the samples
 * still held in stop_holdoff (if any) are sent downstream and the filter
 * switches to SILENCE_STOP.
 *
 * @return the result of ff_request_frame(), the result of ff_filter_frame()
 *         when held samples were sent, or AVERROR(ENOMEM)
 */
static int request_frame(AVFilterLink *outlink)
{
    AVFilterContext *ctx = outlink->src;
    SilenceRemoveContext *s = ctx->priv;
    int ret = ff_request_frame(ctx->inputs[0]);
    int held;

    if (ret != AVERROR_EOF)
        return ret;
    if (s->mode != SILENCE_COPY_FLUSH && s->mode != SILENCE_COPY)
        return ret;

    held = s->stop_holdoff_end - s->stop_holdoff_offset;
    if (held) {
        AVFrame *tail = ff_get_audio_buffer(outlink, held);

        if (!tail)
            return AVERROR(ENOMEM);

        av_samples_copy(tail->extended_data, s->stop_holdoff->extended_data, 0,
                        s->stop_holdoff_offset, held,
                        outlink->ch_layout.nb_channels, outlink->format);

        tail->pts = s->next_pts;
        s->next_pts += av_rescale_q(tail->nb_samples,
                                    (AVRational){1, outlink->sample_rate},
                                    outlink->time_base);

        ret = ff_filter_frame(outlink, tail);
    }
    s->mode = SILENCE_STOP;

    return ret;
}
925
uninit(AVFilterContext * ctx)926 static av_cold void uninit(AVFilterContext *ctx)
927 {
928 SilenceRemoveContext *s = ctx->priv;
929
930 av_frame_free(&s->start_holdoff);
931 av_frame_free(&s->start_silence_hold);
932 av_frame_free(&s->stop_holdoff);
933 av_frame_free(&s->stop_silence_hold);
934 av_frame_free(&s->window);
935
936 av_audio_fifo_free(s->fifo);
937 s->fifo = NULL;
938 }
939
/* Single audio input: configured by config_input(), frames go to filter_frame(). */
static const AVFilterPad silenceremove_inputs[] = {
    {
        .name = "default",
        .type = AVMEDIA_TYPE_AUDIO,
        .config_props = config_input,
        .filter_frame = filter_frame,
    },
};
948
/* Single audio output; request_frame() emits the held samples at end of stream. */
static const AVFilterPad silenceremove_outputs[] = {
    {
        .name = "default",
        .type = AVMEDIA_TYPE_AUDIO,
        .request_frame = request_frame,
    },
};
956
/*
 * Filter definition. The advertised sample formats are exactly the four that
 * config_input() has callbacks for.
 */
const AVFilter ff_af_silenceremove = {
    .name = "silenceremove",
    .description = NULL_IF_CONFIG_SMALL("Remove silence."),
    .priv_size = sizeof(SilenceRemoveContext),
    .priv_class = &silenceremove_class,
    .init = init,
    .uninit = uninit,
    FILTER_INPUTS(silenceremove_inputs),
    FILTER_OUTPUTS(silenceremove_outputs),
    FILTER_SAMPLEFMTS(AV_SAMPLE_FMT_FLT, AV_SAMPLE_FMT_FLTP,
                      AV_SAMPLE_FMT_DBL, AV_SAMPLE_FMT_DBLP),
};
969