/*
 * Copyright (c) 2019 Paul B Mahol
 *
 * This file is part of FFmpeg.
 *
 * FFmpeg is free software; you can redistribute it and/or
 * modify it under the terms of the GNU Lesser General Public
 * License as published by the Free Software Foundation; either
 * version 2.1 of the License, or (at your option) any later version.
 *
 * FFmpeg is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
 * Lesser General Public License for more details.
 *
 * You should have received a copy of the GNU Lesser General Public
 * License along with FFmpeg; if not, write to the Free Software
 * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
 */
20
21 #include <pocketsphinx/pocketsphinx.h>
22
23 #include "libavutil/avstring.h"
24 #include "libavutil/channel_layout.h"
25 #include "libavutil/opt.h"
26 #include "audio.h"
27 #include "avfilter.h"
28 #include "internal.h"
29
/**
 * Private state for the asr (automatic speech recognition) filter.
 * Filled from the AVOptions below, then used to drive a pocketsphinx
 * decoder instance over the incoming audio samples.
 */
typedef struct ASRContext {
    const AVClass *class;

    int rate;        // requested input sampling rate ("rate" option)
    char *hmm;       // directory containing acoustic model files
    char *dict;      // pronunciation dictionary file
    char *lm;        // language model file
    char *lmctl;     // language model set (control file)
    char *lmname;    // which language model from the set to use
    char *logfn;     // destination for pocketsphinx log messages

    ps_decoder_t *ps;   // pocketsphinx decoder handle, created in asr_init()
    cmd_ln_t *config;   // pocketsphinx command-line config the decoder was built from

    int utt_started;    // nonzero while speech is being detected inside an utterance
} ASRContext;
46
#define OFFSET(x) offsetof(ASRContext, x)
#define FLAGS AV_OPT_FLAG_AUDIO_PARAM | AV_OPT_FLAG_FILTERING_PARAM
/* User-visible options; string options map directly onto pocketsphinx
 * command-line arguments in asr_init(). "logfn" defaults to /dev/null so
 * pocketsphinx's own logging stays silent unless explicitly requested. */
static const AVOption asr_options[] = {
    { "rate",  "set sampling rate",                            OFFSET(rate),   AV_OPT_TYPE_INT,    {.i64=16000}, 0, INT_MAX, .flags = FLAGS },
    { "hmm",   "set directory containing acoustic model files", OFFSET(hmm),    AV_OPT_TYPE_STRING, {.str=NULL},  .flags = FLAGS },
    { "dict",  "set pronunciation dictionary",                 OFFSET(dict),   AV_OPT_TYPE_STRING, {.str=NULL},  .flags = FLAGS },
    { "lm",    "set language model file",                      OFFSET(lm),     AV_OPT_TYPE_STRING, {.str=NULL},  .flags = FLAGS },
    { "lmctl", "set language model set",                       OFFSET(lmctl),  AV_OPT_TYPE_STRING, {.str=NULL},  .flags = FLAGS },
    { "lmname","set which language model to use",              OFFSET(lmname), AV_OPT_TYPE_STRING, {.str=NULL},  .flags = FLAGS },
    { "logfn", "set output for log messages",                  OFFSET(logfn),  AV_OPT_TYPE_STRING, {.str="/dev/null"}, .flags = FLAGS },
    { NULL }
};

AVFILTER_DEFINE_CLASS(asr);
61
/**
 * Feed one audio frame to the pocketsphinx decoder and, when an utterance
 * ends (voice activity stops), attach the recognized text to the frame as
 * "lavfi.asr.text" metadata. The frame itself is passed through untouched.
 */
static int filter_frame(AVFilterLink *inlink, AVFrame *in)
{
    AVFilterContext *ctx = inlink->dst;
    ASRContext *s = ctx->priv;
    AVDictionary **metadata = &in->metadata;
    int speech_now;

    /* Hand the raw signed 16-bit mono samples to the decoder. */
    ps_process_raw(s->ps, (const int16_t *)in->data[0], in->nb_samples, 0, 0);

    /* Act only on voice-activity edges: start of speech arms the utterance,
     * end of speech finalizes it and exports the hypothesis. */
    speech_now = ps_get_in_speech(s->ps);
    if (speech_now != s->utt_started) {
        if (speech_now) {
            s->utt_started = 1;
        } else {
            const char *hyp;

            ps_end_utt(s->ps);
            hyp = ps_get_hyp(s->ps, NULL);
            if (hyp != NULL)
                av_dict_set(metadata, "lavfi.asr.text", hyp, 0);
            ps_start_utt(s->ps);
            s->utt_started = 0;
        }
    }

    return ff_filter_frame(ctx->outputs[0], in);
}
85
/**
 * Input configuration callback: arm the first utterance so the decoder is
 * ready before the first frame arrives in filter_frame().
 */
static int config_input(AVFilterLink *inlink)
{
    ASRContext *s = inlink->dst->priv;

    ps_start_utt(s->ps);
    return 0;
}
95
asr_init(AVFilterContext * ctx)96 static av_cold int asr_init(AVFilterContext *ctx)
97 {
98 ASRContext *s = ctx->priv;
99 const float frate = s->rate;
100 char *rate = av_asprintf("%f", frate);
101 const char *argv[] = { "-logfn", s->logfn,
102 "-hmm", s->hmm,
103 "-lm", s->lm,
104 "-lmctl", s->lmctl,
105 "-lmname", s->lmname,
106 "-dict", s->dict,
107 "-samprate", rate,
108 NULL };
109
110 s->config = cmd_ln_parse_r(NULL, ps_args(), 14, (char **)argv, 0);
111 av_free(rate);
112 if (!s->config)
113 return AVERROR(ENOMEM);
114
115 ps_default_search_args(s->config);
116 s->ps = ps_init(s->config);
117 if (!s->ps)
118 return AVERROR(ENOMEM);
119
120 return 0;
121 }
122
/**
 * Negotiate formats: pocketsphinx consumes raw signed 16-bit mono audio at
 * the user-configured sample rate, so constrain all links accordingly.
 */
static int query_formats(AVFilterContext *ctx)
{
    ASRContext *s = ctx->priv;
    const int sample_rates[] = { s->rate, -1 };
    AVFilterFormats *formats = NULL;
    AVFilterChannelLayouts *layouts = NULL;
    int ret;

    ret = ff_add_format(&formats, AV_SAMPLE_FMT_S16);
    if (ret < 0)
        return ret;
    ret = ff_set_common_formats(ctx, formats);
    if (ret < 0)
        return ret;

    ret = ff_add_channel_layout(&layouts, &(AVChannelLayout)AV_CHANNEL_LAYOUT_MONO);
    if (ret < 0)
        return ret;
    ret = ff_set_common_channel_layouts(ctx, layouts);
    if (ret < 0)
        return ret;

    return ff_set_common_samplerates_from_list(ctx, sample_rates);
}
141
/**
 * Filter uninit: release the pocketsphinx decoder and its configuration.
 * The decoder is freed first; NOTE(review): it was created from s->config,
 * so this ordering looks deliberate — keep it.
 * Both pointers are NULLed so a repeated uninit is harmless.
 */
static av_cold void asr_uninit(AVFilterContext *ctx)
{
    ASRContext *s = ctx->priv;

    ps_free(s->ps);
    s->ps = NULL;
    cmd_ln_free_r(s->config);
    s->config = NULL;
}
151
/* Single audio input: frames are analyzed in filter_frame() and the
 * first utterance is armed in config_input(). */
static const AVFilterPad asr_inputs[] = {
    {
        .name = "default",
        .type = AVMEDIA_TYPE_AUDIO,
        .filter_frame = filter_frame,
        .config_props = config_input,
    },
};
160
/* Single audio output: the input frame is passed through unchanged
 * (only metadata is added). */
static const AVFilterPad asr_outputs[] = {
    {
        .name = "default",
        .type = AVMEDIA_TYPE_AUDIO,
    },
};
167
/* Filter descriptor. AVFILTER_FLAG_METADATA_ONLY advertises that the
 * filter only annotates frames with metadata and never alters the audio. */
const AVFilter ff_af_asr = {
    .name          = "asr",
    .description   = NULL_IF_CONFIG_SMALL("Automatic Speech Recognition."),
    .priv_size     = sizeof(ASRContext),
    .priv_class    = &asr_class,
    .init          = asr_init,
    .uninit        = asr_uninit,
    .flags         = AVFILTER_FLAG_METADATA_ONLY,
    FILTER_INPUTS(asr_inputs),
    FILTER_OUTPUTS(asr_outputs),
    FILTER_QUERY_FUNC(query_formats),
};
180