• Home
  • Line#
  • Scopes#
  • Navigate#
  • Raw
  • Download
1 /*
2  * Windows Media Audio Voice decoder.
3  * Copyright (c) 2009 Ronald S. Bultje
4  *
5  * This file is part of FFmpeg.
6  *
7  * FFmpeg is free software; you can redistribute it and/or
8  * modify it under the terms of the GNU Lesser General Public
9  * License as published by the Free Software Foundation; either
10  * version 2.1 of the License, or (at your option) any later version.
11  *
12  * FFmpeg is distributed in the hope that it will be useful,
13  * but WITHOUT ANY WARRANTY; without even the implied warranty of
14  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
15  * Lesser General Public License for more details.
16  *
17  * You should have received a copy of the GNU Lesser General Public
18  * License along with FFmpeg; if not, write to the Free Software
19  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
20  */
21 
22 /**
23  * @file
24  * @brief Windows Media Audio Voice compatible decoder
25  * @author Ronald S. Bultje <rsbultje@gmail.com>
26  */
27 
28 #include <math.h>
29 
30 #include "libavutil/channel_layout.h"
31 #include "libavutil/float_dsp.h"
32 #include "libavutil/mem_internal.h"
33 #include "libavutil/thread.h"
34 #include "avcodec.h"
35 #include "codec_internal.h"
36 #include "internal.h"
37 #include "get_bits.h"
38 #include "put_bits.h"
39 #include "wmavoice_data.h"
40 #include "celp_filters.h"
41 #include "acelp_vectors.h"
42 #include "acelp_filters.h"
43 #include "lsp.h"
44 #include "dct.h"
45 #include "rdft.h"
46 #include "sinewin.h"
47 
48 #define MAX_BLOCKS           8   ///< maximum number of blocks per frame
49 #define MAX_LSPS             16  ///< maximum filter order
50 #define MAX_LSPS_ALIGN16     16  ///< same as #MAX_LSPS; needs to be multiple
51                                  ///< of 16 for ASM input buffer alignment
52 #define MAX_FRAMES           3   ///< maximum number of frames per superframe
53 #define MAX_FRAMESIZE        160 ///< maximum number of samples per frame
54 #define MAX_SIGNAL_HISTORY   416 ///< maximum excitation signal history
55 #define MAX_SFRAMESIZE       (MAX_FRAMESIZE * MAX_FRAMES)
56                                  ///< maximum number of samples per superframe
57 #define SFRAME_CACHE_MAXSIZE 256 ///< maximum cache size for frame data that
58                                  ///< was split over two packets
59 #define VLC_NBITS            6   ///< number of bits to read per VLC iteration
60 
61 /**
62  * Frame type VLC coding.
63  */
64 static VLC frame_type_vlc;
65 
66 /**
67  * Adaptive codebook types.
68  */
69 enum {
70     ACB_TYPE_NONE       = 0, ///< no adaptive codebook (only hardcoded fixed)
71     ACB_TYPE_ASYMMETRIC = 1, ///< adaptive codebook with per-frame pitch, which
72                              ///< we interpolate to get a per-sample pitch.
73                              ///< Signal is generated using an asymmetric sinc
74                              ///< window function
75                              ///< @note see #wmavoice_ipol1_coeffs
76     ACB_TYPE_HAMMING    = 2  ///< Per-block pitch with signal generation using
77                              ///< a Hamming sinc window function
78                              ///< @note see #wmavoice_ipol2_coeffs
79 };
80 
81 /**
82  * Fixed codebook types.
83  */
84 enum {
85     FCB_TYPE_SILENCE    = 0, ///< comfort noise during silence
86                              ///< generated from a hardcoded (fixed) codebook
87                              ///< with per-frame (low) gain values
88     FCB_TYPE_HARDCODED  = 1, ///< hardcoded (fixed) codebook with per-block
89                              ///< gain values
90     FCB_TYPE_AW_PULSES  = 2, ///< Pitch-adaptive window (AW) pulse signals,
91                              ///< used in particular for low-bitrate streams
92     FCB_TYPE_EXC_PULSES = 3, ///< Innovation (fixed) codebook pulse sets in
93                              ///< combinations of either single pulses or
94                              ///< pulse pairs
95 };
96 
97 /**
98  * Description of frame types.
99  */
100 static const struct frame_type_desc {
101     uint8_t n_blocks;     ///< amount of blocks per frame (each block
102                           ///< (contains 160/#n_blocks samples)
103     uint8_t log_n_blocks; ///< log2(#n_blocks)
104     uint8_t acb_type;     ///< Adaptive codebook type (ACB_TYPE_*)
105     uint8_t fcb_type;     ///< Fixed codebook type (FCB_TYPE_*)
106     uint8_t dbl_pulses;   ///< how many pulse vectors have pulse pairs
107                           ///< (rather than just one single pulse)
108                           ///< only if #fcb_type == #FCB_TYPE_EXC_PULSES
109 } frame_descs[17] = {
110     { 1, 0, ACB_TYPE_NONE,       FCB_TYPE_SILENCE,    0 },
111     { 2, 1, ACB_TYPE_NONE,       FCB_TYPE_HARDCODED,  0 },
112     { 2, 1, ACB_TYPE_ASYMMETRIC, FCB_TYPE_AW_PULSES,  0 },
113     { 2, 1, ACB_TYPE_ASYMMETRIC, FCB_TYPE_EXC_PULSES, 2 },
114     { 2, 1, ACB_TYPE_ASYMMETRIC, FCB_TYPE_EXC_PULSES, 5 },
115     { 4, 2, ACB_TYPE_ASYMMETRIC, FCB_TYPE_EXC_PULSES, 0 },
116     { 4, 2, ACB_TYPE_ASYMMETRIC, FCB_TYPE_EXC_PULSES, 2 },
117     { 4, 2, ACB_TYPE_ASYMMETRIC, FCB_TYPE_EXC_PULSES, 5 },
118     { 2, 1, ACB_TYPE_HAMMING,    FCB_TYPE_EXC_PULSES, 0 },
119     { 2, 1, ACB_TYPE_HAMMING,    FCB_TYPE_EXC_PULSES, 2 },
120     { 2, 1, ACB_TYPE_HAMMING,    FCB_TYPE_EXC_PULSES, 5 },
121     { 4, 2, ACB_TYPE_HAMMING,    FCB_TYPE_EXC_PULSES, 0 },
122     { 4, 2, ACB_TYPE_HAMMING,    FCB_TYPE_EXC_PULSES, 2 },
123     { 4, 2, ACB_TYPE_HAMMING,    FCB_TYPE_EXC_PULSES, 5 },
124     { 8, 3, ACB_TYPE_HAMMING,    FCB_TYPE_EXC_PULSES, 0 },
125     { 8, 3, ACB_TYPE_HAMMING,    FCB_TYPE_EXC_PULSES, 2 },
126     { 8, 3, ACB_TYPE_HAMMING,    FCB_TYPE_EXC_PULSES, 5 }
127 };
128 
129 /**
130  * WMA Voice decoding context.
131  */
132 typedef struct WMAVoiceContext {
133     /**
134      * @name Global values specified in the stream header / extradata or used all over.
135      * @{
136      */
137     GetBitContext gb;             ///< packet bitreader. During decoder init,
138                                   ///< it contains the extradata from the
139                                   ///< demuxer. During decoding, it contains
140                                   ///< packet data.
141     int8_t vbm_tree[25];          ///< converts VLC codes to frame type
142 
143     int spillover_bitsize;        ///< number of bits used to specify
144                                   ///< #spillover_nbits in the packet header
145                                   ///< = ceil(log2(ctx->block_align << 3))
146     int history_nsamples;         ///< number of samples in history for signal
147                                   ///< prediction (through ACB)
148 
149     /* postfilter specific values */
150     int do_apf;                   ///< whether to apply the averaged
151                                   ///< projection filter (APF)
152     int denoise_strength;         ///< strength of denoising in Wiener filter
153                                   ///< [0-11]
154     int denoise_tilt_corr;        ///< Whether to apply tilt correction to the
155                                   ///< Wiener filter coefficients (postfilter)
156     int dc_level;                 ///< Predicted amount of DC noise, based
157                                   ///< on which a DC removal filter is used
158 
159     int lsps;                     ///< number of LSPs per frame [10 or 16]
160     int lsp_q_mode;               ///< defines quantizer defaults [0, 1]
161     int lsp_def_mode;             ///< defines different sets of LSP defaults
162                                   ///< [0, 1]
163 
164     int min_pitch_val;            ///< base value for pitch parsing code
165     int max_pitch_val;            ///< max value + 1 for pitch parsing
166     int pitch_nbits;              ///< number of bits used to specify the
167                                   ///< pitch value in the frame header
168     int block_pitch_nbits;        ///< number of bits used to specify the
169                                   ///< first block's pitch value
170     int block_pitch_range;        ///< range of the block pitch
171     int block_delta_pitch_nbits;  ///< number of bits used to specify the
172                                   ///< delta pitch between this and the last
173                                   ///< block's pitch value, used in all but
174                                   ///< first block
175     int block_delta_pitch_hrange; ///< 1/2 range of the delta (full range is
176                                   ///< from -this to +this-1)
177     uint16_t block_conv_table[4]; ///< boundaries for block pitch unit/scale
178                                   ///< conversion
179 
180     /**
181      * @}
182      *
183      * @name Packet values specified in the packet header or related to a packet.
184      *
185      * A packet is considered to be a single unit of data provided to this
186      * decoder by the demuxer.
187      * @{
188      */
189     int spillover_nbits;          ///< number of bits of the previous packet's
190                                   ///< last superframe preceding this
191                                   ///< packet's first full superframe (useful
192                                   ///< for re-synchronization also)
193     int has_residual_lsps;        ///< if set, superframes contain one set of
194                                   ///< LSPs that cover all frames, encoded as
195                                   ///< independent and residual LSPs; if not
196                                   ///< set, each frame contains its own, fully
197                                   ///< independent, LSPs
198     int skip_bits_next;           ///< number of bits to skip at the next call
199                                   ///< to #wmavoice_decode_packet() (since
200                                   ///< they're part of the previous superframe)
201 
202     uint8_t sframe_cache[SFRAME_CACHE_MAXSIZE + AV_INPUT_BUFFER_PADDING_SIZE];
203                                   ///< cache for superframe data split over
204                                   ///< multiple packets
205     int sframe_cache_size;        ///< set to >0 if we have data from an
206                                   ///< (incomplete) superframe from a previous
207                                   ///< packet that spilled over in the current
208                                   ///< packet; specifies the amount of bits in
209                                   ///< #sframe_cache
210     PutBitContext pb;             ///< bitstream writer for #sframe_cache
211 
212     /**
213      * @}
214      *
215      * @name Frame and superframe values
216      * Superframe and frame data - these can change from frame to frame,
217      * although some of them do in that case serve as a cache / history for
218      * the next frame or superframe.
219      * @{
220      */
221     double prev_lsps[MAX_LSPS];   ///< LSPs of the last frame of the previous
222                                   ///< superframe
223     int last_pitch_val;           ///< pitch value of the previous frame
224     int last_acb_type;            ///< frame type [0-2] of the previous frame
225     int pitch_diff_sh16;          ///< ((cur_pitch_val - #last_pitch_val)
226                                   ///< << 16) / #MAX_FRAMESIZE
227     float silence_gain;           ///< set for use in blocks if #ACB_TYPE_NONE
228 
229     int aw_idx_is_ext;            ///< whether the AW index was encoded in
230                                   ///< 8 bits (instead of 6)
231     int aw_pulse_range;           ///< the range over which #aw_pulse_set1()
232                                   ///< can apply the pulse, relative to the
233                                   ///< value in aw_first_pulse_off. The exact
234                                   ///< position of the first AW-pulse is within
235                                   ///< [pulse_off, pulse_off + this], and
236                                   ///< depends on bitstream values; [16 or 24]
237     int aw_n_pulses[2];           ///< number of AW-pulses in each block; note
238                                   ///< that this number can be negative (in
239                                   ///< which case it basically means "zero")
240     int aw_first_pulse_off[2];    ///< index of first sample to which to
241                                   ///< apply AW-pulses, or -0xff if unset
242     int aw_next_pulse_off_cache;  ///< the position (relative to start of the
243                                   ///< second block) at which pulses should
244                                   ///< start to be positioned, serves as a
245                                   ///< cache for pitch-adaptive window pulses
246                                   ///< between blocks
247 
248     int frame_cntr;               ///< current frame index [0 - 0xFFFE]; is
249                                   ///< only used for comfort noise in #pRNG()
250     int nb_superframes;           ///< number of superframes in current packet
251     float gain_pred_err[6];       ///< cache for gain prediction
252     float excitation_history[MAX_SIGNAL_HISTORY];
253                                   ///< cache of the signal of previous
254                                   ///< superframes, used as a history for
255                                   ///< signal generation
256     float synth_history[MAX_LSPS]; ///< see #excitation_history
257     /**
258      * @}
259      *
260      * @name Postfilter values
261      *
262      * Variables used for postfilter implementation, mostly history for
263      * smoothing and so on, and context variables for FFT/iFFT.
264      * @{
265      */
266     RDFTContext rdft, irdft;      ///< contexts for FFT-calculation in the
267                                   ///< postfilter (for denoise filter)
268     DCTContext dct, dst;          ///< contexts for phase shift (in Hilbert
269                                   ///< transform, part of postfilter)
270     float sin[511], cos[511];     ///< 8-bit cosine/sine windows over [-pi,pi]
271                                   ///< range
272     float postfilter_agc;         ///< gain control memory, used in
273                                   ///< #adaptive_gain_control()
274     float dcf_mem[2];             ///< DC filter history
275     float zero_exc_pf[MAX_SIGNAL_HISTORY + MAX_SFRAMESIZE];
276                                   ///< zero filter output (i.e. excitation)
277                                   ///< by postfilter
278     float denoise_filter_cache[MAX_FRAMESIZE];
279     int   denoise_filter_cache_size; ///< samples in #denoise_filter_cache
280     DECLARE_ALIGNED(32, float, tilted_lpcs_pf)[0x80];
281                                   ///< aligned buffer for LPC tilting
282     DECLARE_ALIGNED(32, float, denoise_coeffs_pf)[0x80];
283                                   ///< aligned buffer for denoise coefficients
284     DECLARE_ALIGNED(32, float, synth_filter_out_buf)[0x80 + MAX_LSPS_ALIGN16];
285                                   ///< aligned buffer for postfilter speech
286                                   ///< synthesis
287     /**
288      * @}
289      */
290 } WMAVoiceContext;
291 
292 /**
293  * Set up the variable bit mode (VBM) tree from container extradata.
294  * @param gb bit I/O context.
295  *           The bit context (s->gb) should be loaded with byte 23-46 of the
296  *           container extradata (i.e. the ones containing the VBM tree).
297  * @param vbm_tree pointer to array to which the decoded VBM tree will be
298  *                 written.
299  * @return 0 on success, <0 on error.
300  */
decode_vbmtree(GetBitContext * gb,int8_t vbm_tree[25])301 static av_cold int decode_vbmtree(GetBitContext *gb, int8_t vbm_tree[25])
302 {
303     int cntr[8] = { 0 }, n, res;
304 
305     memset(vbm_tree, 0xff, sizeof(vbm_tree[0]) * 25);
306     for (n = 0; n < 17; n++) {
307         res = get_bits(gb, 3);
308         if (cntr[res] > 3) // should be >= 3 + (res == 7))
309             return -1;
310         vbm_tree[res * 3 + cntr[res]++] = n;
311     }
312     return 0;
313 }
314 
wmavoice_init_static_data(void)315 static av_cold void wmavoice_init_static_data(void)
316 {
317     static const uint8_t bits[] = {
318          2,  2,  2,  4,  4,  4,
319          6,  6,  6,  8,  8,  8,
320         10, 10, 10, 12, 12, 12,
321         14, 14, 14, 14
322     };
323     static const uint16_t codes[] = {
324           0x0000, 0x0001, 0x0002,        //              00/01/10
325           0x000c, 0x000d, 0x000e,        //           11+00/01/10
326           0x003c, 0x003d, 0x003e,        //         1111+00/01/10
327           0x00fc, 0x00fd, 0x00fe,        //       111111+00/01/10
328           0x03fc, 0x03fd, 0x03fe,        //     11111111+00/01/10
329           0x0ffc, 0x0ffd, 0x0ffe,        //   1111111111+00/01/10
330           0x3ffc, 0x3ffd, 0x3ffe, 0x3fff // 111111111111+xx
331     };
332 
333     INIT_VLC_STATIC(&frame_type_vlc, VLC_NBITS, sizeof(bits),
334                     bits, 1, 1, codes, 2, 2, 132);
335 }
336 
wmavoice_flush(AVCodecContext * ctx)337 static av_cold void wmavoice_flush(AVCodecContext *ctx)
338 {
339     WMAVoiceContext *s = ctx->priv_data;
340     int n;
341 
342     s->postfilter_agc    = 0;
343     s->sframe_cache_size = 0;
344     s->skip_bits_next    = 0;
345     for (n = 0; n < s->lsps; n++)
346         s->prev_lsps[n] = M_PI * (n + 1.0) / (s->lsps + 1.0);
347     memset(s->excitation_history, 0,
348            sizeof(*s->excitation_history) * MAX_SIGNAL_HISTORY);
349     memset(s->synth_history,      0,
350            sizeof(*s->synth_history)      * MAX_LSPS);
351     memset(s->gain_pred_err,      0,
352            sizeof(s->gain_pred_err));
353 
354     if (s->do_apf) {
355         memset(&s->synth_filter_out_buf[MAX_LSPS_ALIGN16 - s->lsps], 0,
356                sizeof(*s->synth_filter_out_buf) * s->lsps);
357         memset(s->dcf_mem,              0,
358                sizeof(*s->dcf_mem)              * 2);
359         memset(s->zero_exc_pf,          0,
360                sizeof(*s->zero_exc_pf)          * s->history_nsamples);
361         memset(s->denoise_filter_cache, 0, sizeof(s->denoise_filter_cache));
362     }
363 }
364 
365 /**
366  * Set up decoder with parameters from demuxer (extradata etc.).
367  */
wmavoice_decode_init(AVCodecContext * ctx)368 static av_cold int wmavoice_decode_init(AVCodecContext *ctx)
369 {
370     static AVOnce init_static_once = AV_ONCE_INIT;
371     int n, flags, pitch_range, lsp16_flag, ret;
372     WMAVoiceContext *s = ctx->priv_data;
373 
374     ff_thread_once(&init_static_once, wmavoice_init_static_data);
375 
376     /**
377      * Extradata layout:
378      * - byte  0-18: WMAPro-in-WMAVoice extradata (see wmaprodec.c),
379      * - byte 19-22: flags field (annoyingly in LE; see below for known
380      *               values),
381      * - byte 23-46: variable bitmode tree (really just 17 * 3 bits,
382      *               rest is 0).
383      */
384     if (ctx->extradata_size != 46) {
385         av_log(ctx, AV_LOG_ERROR,
386                "Invalid extradata size %d (should be 46)\n",
387                ctx->extradata_size);
388         return AVERROR_INVALIDDATA;
389     }
390     if (ctx->block_align <= 0 || ctx->block_align > (1<<22)) {
391         av_log(ctx, AV_LOG_ERROR, "Invalid block alignment %d.\n", ctx->block_align);
392         return AVERROR_INVALIDDATA;
393     }
394 
395     flags                = AV_RL32(ctx->extradata + 18);
396     s->spillover_bitsize = 3 + av_ceil_log2(ctx->block_align);
397     s->do_apf            =    flags & 0x1;
398     if (s->do_apf) {
399         if ((ret = ff_rdft_init(&s->rdft,  7,  DFT_R2C)) < 0 ||
400             (ret = ff_rdft_init(&s->irdft, 7, IDFT_C2R)) < 0 ||
401             (ret = ff_dct_init (&s->dct,   6,    DCT_I)) < 0 ||
402             (ret = ff_dct_init (&s->dst,   6,    DST_I)) < 0)
403             return ret;
404 
405         ff_sine_window_init(s->cos, 256);
406         memcpy(&s->sin[255], s->cos, 256 * sizeof(s->cos[0]));
407         for (n = 0; n < 255; n++) {
408             s->sin[n]       = -s->sin[510 - n];
409             s->cos[510 - n] =  s->cos[n];
410         }
411     }
412     s->denoise_strength  =   (flags >> 2) & 0xF;
413     if (s->denoise_strength >= 12) {
414         av_log(ctx, AV_LOG_ERROR,
415                "Invalid denoise filter strength %d (max=11)\n",
416                s->denoise_strength);
417         return AVERROR_INVALIDDATA;
418     }
419     s->denoise_tilt_corr = !!(flags & 0x40);
420     s->dc_level          =   (flags >> 7) & 0xF;
421     s->lsp_q_mode        = !!(flags & 0x2000);
422     s->lsp_def_mode      = !!(flags & 0x4000);
423     lsp16_flag           =    flags & 0x1000;
424     if (lsp16_flag) {
425         s->lsps               = 16;
426     } else {
427         s->lsps               = 10;
428     }
429     for (n = 0; n < s->lsps; n++)
430         s->prev_lsps[n] = M_PI * (n + 1.0) / (s->lsps + 1.0);
431 
432     init_get_bits(&s->gb, ctx->extradata + 22, (ctx->extradata_size - 22) << 3);
433     if (decode_vbmtree(&s->gb, s->vbm_tree) < 0) {
434         av_log(ctx, AV_LOG_ERROR, "Invalid VBM tree; broken extradata?\n");
435         return AVERROR_INVALIDDATA;
436     }
437 
438     if (ctx->sample_rate >= INT_MAX / (256 * 37))
439         return AVERROR_INVALIDDATA;
440 
441     s->min_pitch_val    = ((ctx->sample_rate << 8)      /  400 + 50) >> 8;
442     s->max_pitch_val    = ((ctx->sample_rate << 8) * 37 / 2000 + 50) >> 8;
443     pitch_range         = s->max_pitch_val - s->min_pitch_val;
444     if (pitch_range <= 0) {
445         av_log(ctx, AV_LOG_ERROR, "Invalid pitch range; broken extradata?\n");
446         return AVERROR_INVALIDDATA;
447     }
448     s->pitch_nbits      = av_ceil_log2(pitch_range);
449     s->last_pitch_val   = 40;
450     s->last_acb_type    = ACB_TYPE_NONE;
451     s->history_nsamples = s->max_pitch_val + 8;
452 
453     if (s->min_pitch_val < 1 || s->history_nsamples > MAX_SIGNAL_HISTORY) {
454         int min_sr = ((((1 << 8) - 50) * 400) + 0xFF) >> 8,
455             max_sr = ((((MAX_SIGNAL_HISTORY - 8) << 8) + 205) * 2000 / 37) >> 8;
456 
457         av_log(ctx, AV_LOG_ERROR,
458                "Unsupported samplerate %d (min=%d, max=%d)\n",
459                ctx->sample_rate, min_sr, max_sr); // 322-22097 Hz
460 
461         return AVERROR(ENOSYS);
462     }
463 
464     s->block_conv_table[0]      = s->min_pitch_val;
465     s->block_conv_table[1]      = (pitch_range * 25) >> 6;
466     s->block_conv_table[2]      = (pitch_range * 44) >> 6;
467     s->block_conv_table[3]      = s->max_pitch_val - 1;
468     s->block_delta_pitch_hrange = (pitch_range >> 3) & ~0xF;
469     if (s->block_delta_pitch_hrange <= 0) {
470         av_log(ctx, AV_LOG_ERROR, "Invalid delta pitch hrange; broken extradata?\n");
471         return AVERROR_INVALIDDATA;
472     }
473     s->block_delta_pitch_nbits  = 1 + av_ceil_log2(s->block_delta_pitch_hrange);
474     s->block_pitch_range        = s->block_conv_table[2] +
475                                   s->block_conv_table[3] + 1 +
476                                   2 * (s->block_conv_table[1] - 2 * s->min_pitch_val);
477     s->block_pitch_nbits        = av_ceil_log2(s->block_pitch_range);
478 
479     av_channel_layout_uninit(&ctx->ch_layout);
480     ctx->ch_layout = (AVChannelLayout)AV_CHANNEL_LAYOUT_MONO;
481     ctx->sample_fmt             = AV_SAMPLE_FMT_FLT;
482 
483     return 0;
484 }
485 
486 /**
487  * @name Postfilter functions
488  * Postfilter functions (gain control, wiener denoise filter, DC filter,
489  * kalman smoothening, plus surrounding code to wrap it)
490  * @{
491  */
492 /**
493  * Adaptive gain control (as used in postfilter).
494  *
495  * Identical to #ff_adaptive_gain_control() in acelp_vectors.c, except
496  * that the energy here is calculated using sum(abs(...)), whereas the
497  * other codecs (e.g. AMR-NB, SIPRO) use sqrt(dotproduct(...)).
498  *
499  * @param out output buffer for filtered samples
500  * @param in input buffer containing the samples as they are after the
501  *           postfilter steps so far
502  * @param speech_synth input buffer containing speech synth before postfilter
503  * @param size input buffer size
504  * @param alpha exponential filter factor
505  * @param gain_mem pointer to filter memory (single float)
506  */
adaptive_gain_control(float * out,const float * in,const float * speech_synth,int size,float alpha,float * gain_mem)507 static void adaptive_gain_control(float *out, const float *in,
508                                   const float *speech_synth,
509                                   int size, float alpha, float *gain_mem)
510 {
511     int i;
512     float speech_energy = 0.0, postfilter_energy = 0.0, gain_scale_factor;
513     float mem = *gain_mem;
514 
515     for (i = 0; i < size; i++) {
516         speech_energy     += fabsf(speech_synth[i]);
517         postfilter_energy += fabsf(in[i]);
518     }
519     gain_scale_factor = postfilter_energy == 0.0 ? 0.0 :
520                         (1.0 - alpha) * speech_energy / postfilter_energy;
521 
522     for (i = 0; i < size; i++) {
523         mem = alpha * mem + gain_scale_factor;
524         out[i] = in[i] * mem;
525     }
526 
527     *gain_mem = mem;
528 }
529 
530 /**
531  * Kalman smoothing function.
532  *
533  * This function looks back pitch +/- 3 samples back into history to find
534  * the best fitting curve (that one giving the optimal gain of the two
535  * signals, i.e. the highest dot product between the two), and then
536  * uses that signal history to smoothen the output of the speech synthesis
537  * filter.
538  *
539  * @param s WMA Voice decoding context
540  * @param pitch pitch of the speech signal
541  * @param in input speech signal
542  * @param out output pointer for smoothened signal
543  * @param size input/output buffer size
544  *
545  * @returns -1 if no smoothening took place, e.g. because no optimal
546  *          fit could be found, or 0 on success.
547  */
kalman_smoothen(WMAVoiceContext * s,int pitch,const float * in,float * out,int size)548 static int kalman_smoothen(WMAVoiceContext *s, int pitch,
549                            const float *in, float *out, int size)
550 {
551     int n;
552     float optimal_gain = 0, dot;
553     const float *ptr = &in[-FFMAX(s->min_pitch_val, pitch - 3)],
554                 *end = &in[-FFMIN(s->max_pitch_val, pitch + 3)],
555                 *best_hist_ptr = NULL;
556 
557     /* find best fitting point in history */
558     do {
559         dot = avpriv_scalarproduct_float_c(in, ptr, size);
560         if (dot > optimal_gain) {
561             optimal_gain  = dot;
562             best_hist_ptr = ptr;
563         }
564     } while (--ptr >= end);
565 
566     if (optimal_gain <= 0)
567         return -1;
568     dot = avpriv_scalarproduct_float_c(best_hist_ptr, best_hist_ptr, size);
569     if (dot <= 0) // would be 1.0
570         return -1;
571 
572     if (optimal_gain <= dot) {
573         dot = dot / (dot + 0.6 * optimal_gain); // 0.625-1.000
574     } else
575         dot = 0.625;
576 
577     /* actual smoothing */
578     for (n = 0; n < size; n++)
579         out[n] = best_hist_ptr[n] + dot * (in[n] - best_hist_ptr[n]);
580 
581     return 0;
582 }
583 
584 /**
585  * Get the tilt factor of a formant filter from its transfer function
586  * @see #tilt_factor() in amrnbdec.c, which does essentially the same,
587  *      but somehow (??) it does a speech synthesis filter in the
588  *      middle, which is missing here
589  *
590  * @param lpcs LPC coefficients
591  * @param n_lpcs Size of LPC buffer
592  * @returns the tilt factor
593  */
tilt_factor(const float * lpcs,int n_lpcs)594 static float tilt_factor(const float *lpcs, int n_lpcs)
595 {
596     float rh0, rh1;
597 
598     rh0 = 1.0     + avpriv_scalarproduct_float_c(lpcs,  lpcs,    n_lpcs);
599     rh1 = lpcs[0] + avpriv_scalarproduct_float_c(lpcs, &lpcs[1], n_lpcs - 1);
600 
601     return rh1 / rh0;
602 }
603 
604 /**
605  * Derive denoise filter coefficients (in real domain) from the LPCs.
606  */
calc_input_response(WMAVoiceContext * s,float * lpcs,int fcb_type,float * coeffs,int remainder)607 static void calc_input_response(WMAVoiceContext *s, float *lpcs,
608                                 int fcb_type, float *coeffs, int remainder)
609 {
610     float last_coeff, min = 15.0, max = -15.0;
611     float irange, angle_mul, gain_mul, range, sq;
612     int n, idx;
613 
614     /* Create frequency power spectrum of speech input (i.e. RDFT of LPCs) */
615     s->rdft.rdft_calc(&s->rdft, lpcs);
616 #define log_range(var, assign) do { \
617         float tmp = log10f(assign);  var = tmp; \
618         max       = FFMAX(max, tmp); min = FFMIN(min, tmp); \
619     } while (0)
620     log_range(last_coeff,  lpcs[1]         * lpcs[1]);
621     for (n = 1; n < 64; n++)
622         log_range(lpcs[n], lpcs[n * 2]     * lpcs[n * 2] +
623                            lpcs[n * 2 + 1] * lpcs[n * 2 + 1]);
624     log_range(lpcs[0],     lpcs[0]         * lpcs[0]);
625 #undef log_range
626     range    = max - min;
627     lpcs[64] = last_coeff;
628 
629     /* Now, use this spectrum to pick out these frequencies with higher
630      * (relative) power/energy (which we then take to be "not noise"),
631      * and set up a table (still in lpc[]) of (relative) gains per frequency.
632      * These frequencies will be maintained, while others ("noise") will be
633      * decreased in the filter output. */
634     irange    = 64.0 / range; // so irange*(max-value) is in the range [0, 63]
635     gain_mul  = range * (fcb_type == FCB_TYPE_HARDCODED ? (5.0 / 13.0) :
636                                                           (5.0 / 14.7));
637     angle_mul = gain_mul * (8.0 * M_LN10 / M_PI);
638     for (n = 0; n <= 64; n++) {
639         float pwr;
640 
641         idx = lrint((max - lpcs[n]) * irange - 1);
642         idx = FFMAX(0, idx);
643         pwr = wmavoice_denoise_power_table[s->denoise_strength][idx];
644         lpcs[n] = angle_mul * pwr;
645 
646         /* 70.57 =~ 1/log10(1.0331663) */
647         idx = av_clipf((pwr * gain_mul - 0.0295) * 70.570526123, 0, INT_MAX / 2);
648 
649         if (idx > 127) { // fall back if index falls outside table range
650             coeffs[n] = wmavoice_energy_table[127] *
651                         powf(1.0331663, idx - 127);
652         } else
653             coeffs[n] = wmavoice_energy_table[FFMAX(0, idx)];
654     }
655 
656     /* calculate the Hilbert transform of the gains, which we do (since this
657      * is a sine input) by doing a phase shift (in theory, H(sin())=cos()).
658      * Hilbert_Transform(RDFT(x)) = Laplace_Transform(x), which calculates the
659      * "moment" of the LPCs in this filter. */
660     s->dct.dct_calc(&s->dct, lpcs);
661     s->dst.dct_calc(&s->dst, lpcs);
662 
663     /* Split out the coefficient indexes into phase/magnitude pairs */
664     idx = 255 + av_clip(lpcs[64],               -255, 255);
665     coeffs[0]  = coeffs[0]  * s->cos[idx];
666     idx = 255 + av_clip(lpcs[64] - 2 * lpcs[63], -255, 255);
667     last_coeff = coeffs[64] * s->cos[idx];
668     for (n = 63;; n--) {
669         idx = 255 + av_clip(-lpcs[64] - 2 * lpcs[n - 1], -255, 255);
670         coeffs[n * 2 + 1] = coeffs[n] * s->sin[idx];
671         coeffs[n * 2]     = coeffs[n] * s->cos[idx];
672 
673         if (!--n) break;
674 
675         idx = 255 + av_clip( lpcs[64] - 2 * lpcs[n - 1], -255, 255);
676         coeffs[n * 2 + 1] = coeffs[n] * s->sin[idx];
677         coeffs[n * 2]     = coeffs[n] * s->cos[idx];
678     }
679     coeffs[1] = last_coeff;
680 
681     /* move into real domain */
682     s->irdft.rdft_calc(&s->irdft, coeffs);
683 
684     /* tilt correction and normalize scale */
685     memset(&coeffs[remainder], 0, sizeof(coeffs[0]) * (128 - remainder));
686     if (s->denoise_tilt_corr) {
687         float tilt_mem = 0;
688 
689         coeffs[remainder - 1] = 0;
690         ff_tilt_compensation(&tilt_mem,
691                              -1.8 * tilt_factor(coeffs, remainder - 1),
692                              coeffs, remainder);
693     }
694     sq = (1.0 / 64.0) * sqrtf(1 / avpriv_scalarproduct_float_c(coeffs, coeffs,
695                                                                remainder));
696     for (n = 0; n < remainder; n++)
697         coeffs[n] *= sq;
698 }
699 
700 /**
701  * This function applies a Wiener filter on the (noisy) speech signal as
702  * a means to denoise it.
703  *
704  * - take RDFT of LPCs to get the power spectrum of the noise + speech;
705  * - using this power spectrum, calculate (for each frequency) the Wiener
706  *    filter gain, which depends on the frequency power and desired level
707  *    of noise subtraction (when set too high, this leads to artifacts)
708  *    We can do this symmetrically over the X-axis (so 0-4kHz is the inverse
709  *    of 4-8kHz);
710  * - by doing a phase shift, calculate the Hilbert transform of this array
711  *    of per-frequency filter-gains to get the filtering coefficients;
712  * - smoothen/normalize/de-tilt these filter coefficients as desired;
713  * - take RDFT of noisy sound, apply the coefficients and take its IRDFT
714  *    to get the denoised speech signal;
715  * - the leftover (i.e. output of the IRDFT on denoised speech data beyond
716  *    the frame boundary) are saved and applied to subsequent frames by an
717  *    overlap-add method (otherwise you get clicking-artifacts).
718  *
719  * @param s WMA Voice decoding context
720  * @param fcb_type Frame (codebook) type
721  * @param synth_pf input: the noisy speech signal, output: denoised speech
722  *                 data; should be 16-byte aligned (for ASM purposes)
723  * @param size size of the speech data
724  * @param lpcs LPCs used to synthesize this frame's speech data
725  */
wiener_denoise(WMAVoiceContext * s,int fcb_type,float * synth_pf,int size,const float * lpcs)726 static void wiener_denoise(WMAVoiceContext *s, int fcb_type,
727                            float *synth_pf, int size,
728                            const float *lpcs)
729 {
730     int remainder, lim, n;
731 
732     if (fcb_type != FCB_TYPE_SILENCE) {
733         float *tilted_lpcs = s->tilted_lpcs_pf,
734               *coeffs = s->denoise_coeffs_pf, tilt_mem = 0;
735 
736         tilted_lpcs[0]           = 1.0;
737         memcpy(&tilted_lpcs[1], lpcs, sizeof(lpcs[0]) * s->lsps);
738         memset(&tilted_lpcs[s->lsps + 1], 0,
739                sizeof(tilted_lpcs[0]) * (128 - s->lsps - 1));
740         ff_tilt_compensation(&tilt_mem, 0.7 * tilt_factor(lpcs, s->lsps),
741                              tilted_lpcs, s->lsps + 2);
742 
743         /* The IRDFT output (127 samples for 7-bit filter) beyond the frame
744          * size is applied to the next frame. All input beyond this is zero,
745          * and thus all output beyond this will go towards zero, hence we can
746          * limit to min(size-1, 127-size) as a performance consideration. */
747         remainder = FFMIN(127 - size, size - 1);
748         calc_input_response(s, tilted_lpcs, fcb_type, coeffs, remainder);
749 
750         /* apply coefficients (in frequency spectrum domain), i.e. complex
751          * number multiplication */
752         memset(&synth_pf[size], 0, sizeof(synth_pf[0]) * (128 - size));
753         s->rdft.rdft_calc(&s->rdft, synth_pf);
754         s->rdft.rdft_calc(&s->rdft, coeffs);
755         synth_pf[0] *= coeffs[0];
756         synth_pf[1] *= coeffs[1];
757         for (n = 1; n < 64; n++) {
758             float v1 = synth_pf[n * 2], v2 = synth_pf[n * 2 + 1];
759             synth_pf[n * 2]     = v1 * coeffs[n * 2] - v2 * coeffs[n * 2 + 1];
760             synth_pf[n * 2 + 1] = v2 * coeffs[n * 2] + v1 * coeffs[n * 2 + 1];
761         }
762         s->irdft.rdft_calc(&s->irdft, synth_pf);
763     }
764 
765     /* merge filter output with the history of previous runs */
766     if (s->denoise_filter_cache_size) {
767         lim = FFMIN(s->denoise_filter_cache_size, size);
768         for (n = 0; n < lim; n++)
769             synth_pf[n] += s->denoise_filter_cache[n];
770         s->denoise_filter_cache_size -= lim;
771         memmove(s->denoise_filter_cache, &s->denoise_filter_cache[size],
772                 sizeof(s->denoise_filter_cache[0]) * s->denoise_filter_cache_size);
773     }
774 
775     /* move remainder of filter output into a cache for future runs */
776     if (fcb_type != FCB_TYPE_SILENCE) {
777         lim = FFMIN(remainder, s->denoise_filter_cache_size);
778         for (n = 0; n < lim; n++)
779             s->denoise_filter_cache[n] += synth_pf[size + n];
780         if (lim < remainder) {
781             memcpy(&s->denoise_filter_cache[lim], &synth_pf[size + lim],
782                    sizeof(s->denoise_filter_cache[0]) * (remainder - lim));
783             s->denoise_filter_cache_size = remainder;
784         }
785     }
786 }
787 
788 /**
789  * Averaging projection filter, the postfilter used in WMAVoice.
790  *
791  * This uses the following steps:
792  * - A zero-synthesis filter (generate excitation from synth signal)
793  * - Kalman smoothing on excitation, based on pitch
794  * - Re-synthesized smoothened output
795  * - Iterative Wiener denoise filter
796  * - Adaptive gain filter
797  * - DC filter
798  *
799  * @param s WMAVoice decoding context
800  * @param synth Speech synthesis output (before postfilter)
801  * @param samples Output buffer for filtered samples
802  * @param size Buffer size of synth & samples
803  * @param lpcs Generated LPCs used for speech synthesis
804  * @param zero_exc_pf destination for zero synthesis filter (16-byte aligned)
805  * @param fcb_type Frame type (silence, hardcoded, AW-pulses or FCB-pulses)
806  * @param pitch Pitch of the input signal
807  */
postfilter(WMAVoiceContext * s,const float * synth,float * samples,int size,const float * lpcs,float * zero_exc_pf,int fcb_type,int pitch)808 static void postfilter(WMAVoiceContext *s, const float *synth,
809                        float *samples,    int size,
810                        const float *lpcs, float *zero_exc_pf,
811                        int fcb_type,      int pitch)
812 {
813     float synth_filter_in_buf[MAX_FRAMESIZE / 2],
814           *synth_pf = &s->synth_filter_out_buf[MAX_LSPS_ALIGN16],
815           *synth_filter_in = zero_exc_pf;
816 
817     av_assert0(size <= MAX_FRAMESIZE / 2);
818 
819     /* generate excitation from input signal */
820     ff_celp_lp_zero_synthesis_filterf(zero_exc_pf, lpcs, synth, size, s->lsps);
821 
822     if (fcb_type >= FCB_TYPE_AW_PULSES &&
823         !kalman_smoothen(s, pitch, zero_exc_pf, synth_filter_in_buf, size))
824         synth_filter_in = synth_filter_in_buf;
825 
826     /* re-synthesize speech after smoothening, and keep history */
827     ff_celp_lp_synthesis_filterf(synth_pf, lpcs,
828                                  synth_filter_in, size, s->lsps);
829     memcpy(&synth_pf[-s->lsps], &synth_pf[size - s->lsps],
830            sizeof(synth_pf[0]) * s->lsps);
831 
832     wiener_denoise(s, fcb_type, synth_pf, size, lpcs);
833 
834     adaptive_gain_control(samples, synth_pf, synth, size, 0.99,
835                           &s->postfilter_agc);
836 
837     if (s->dc_level > 8) {
838         /* remove ultra-low frequency DC noise / highpass filter;
839          * coefficients are identical to those used in SIPR decoding,
840          * and very closely resemble those used in AMR-NB decoding. */
841         ff_acelp_apply_order_2_transfer_function(samples, samples,
842             (const float[2]) { -1.99997,      1.0 },
843             (const float[2]) { -1.9330735188, 0.93589198496 },
844             0.93980580475, s->dcf_mem, size);
845     }
846 }
847 /**
848  * @}
849  */
850 
851 /**
852  * Dequantize LSPs
853  * @param lsps output pointer to the array that will hold the LSPs
854  * @param num number of LSPs to be dequantized
855  * @param values quantized values, contains n_stages values
856  * @param sizes range (i.e. max value) of each quantized value
857  * @param n_stages number of dequantization runs
858  * @param table dequantization table to be used
859  * @param mul_q LSF multiplier
860  * @param base_q base (lowest) LSF values
861  */
dequant_lsps(double * lsps,int num,const uint16_t * values,const uint16_t * sizes,int n_stages,const uint8_t * table,const double * mul_q,const double * base_q)862 static void dequant_lsps(double *lsps, int num,
863                          const uint16_t *values,
864                          const uint16_t *sizes,
865                          int n_stages, const uint8_t *table,
866                          const double *mul_q,
867                          const double *base_q)
868 {
869     int n, m;
870 
871     memset(lsps, 0, num * sizeof(*lsps));
872     for (n = 0; n < n_stages; n++) {
873         const uint8_t *t_off = &table[values[n] * num];
874         double base = base_q[n], mul = mul_q[n];
875 
876         for (m = 0; m < num; m++)
877             lsps[m] += base + mul * t_off[m];
878 
879         table += sizes[n] * num;
880     }
881 }
882 
883 /**
884  * @name LSP dequantization routines
885  * LSP dequantization routines, for 10/16LSPs and independent/residual coding.
886  * lsp10i() consumes 24 bits; lsp10r() consumes an additional 24 bits;
887  * lsp16i() consumes 34 bits; lsp16r() consumes an additional 26 bits.
888  * @{
889  */
890 /**
891  * Parse 10 independently-coded LSPs.
892  */
dequant_lsp10i(GetBitContext * gb,double * lsps)893 static void dequant_lsp10i(GetBitContext *gb, double *lsps)
894 {
895     static const uint16_t vec_sizes[4] = { 256, 64, 32, 32 };
896     static const double mul_lsf[4] = {
897         5.2187144800e-3,    1.4626986422e-3,
898         9.6179549166e-4,    1.1325736225e-3
899     };
900     static const double base_lsf[4] = {
901         M_PI * -2.15522e-1, M_PI * -6.1646e-2,
902         M_PI * -3.3486e-2,  M_PI * -5.7408e-2
903     };
904     uint16_t v[4];
905 
906     v[0] = get_bits(gb, 8);
907     v[1] = get_bits(gb, 6);
908     v[2] = get_bits(gb, 5);
909     v[3] = get_bits(gb, 5);
910 
911     dequant_lsps(lsps, 10, v, vec_sizes, 4, wmavoice_dq_lsp10i,
912                  mul_lsf, base_lsf);
913 }
914 
915 /**
916  * Parse 10 independently-coded LSPs, and then derive the tables to
917  * generate LSPs for the other frames from them (residual coding).
918  */
dequant_lsp10r(GetBitContext * gb,double * i_lsps,const double * old,double * a1,double * a2,int q_mode)919 static void dequant_lsp10r(GetBitContext *gb,
920                            double *i_lsps, const double *old,
921                            double *a1, double *a2, int q_mode)
922 {
923     static const uint16_t vec_sizes[3] = { 128, 64, 64 };
924     static const double mul_lsf[3] = {
925         2.5807601174e-3,    1.2354460219e-3,   1.1763821673e-3
926     };
927     static const double base_lsf[3] = {
928         M_PI * -1.07448e-1, M_PI * -5.2706e-2, M_PI * -5.1634e-2
929     };
930     const float (*ipol_tab)[2][10] = q_mode ?
931         wmavoice_lsp10_intercoeff_b : wmavoice_lsp10_intercoeff_a;
932     uint16_t interpol, v[3];
933     int n;
934 
935     dequant_lsp10i(gb, i_lsps);
936 
937     interpol = get_bits(gb, 5);
938     v[0]     = get_bits(gb, 7);
939     v[1]     = get_bits(gb, 6);
940     v[2]     = get_bits(gb, 6);
941 
942     for (n = 0; n < 10; n++) {
943         double delta = old[n] - i_lsps[n];
944         a1[n]        = ipol_tab[interpol][0][n] * delta + i_lsps[n];
945         a1[10 + n]   = ipol_tab[interpol][1][n] * delta + i_lsps[n];
946     }
947 
948     dequant_lsps(a2, 20, v, vec_sizes, 3, wmavoice_dq_lsp10r,
949                  mul_lsf, base_lsf);
950 }
951 
952 /**
953  * Parse 16 independently-coded LSPs.
954  */
dequant_lsp16i(GetBitContext * gb,double * lsps)955 static void dequant_lsp16i(GetBitContext *gb, double *lsps)
956 {
957     static const uint16_t vec_sizes[5] = { 256, 64, 128, 64, 128 };
958     static const double mul_lsf[5] = {
959         3.3439586280e-3,    6.9908173703e-4,
960         3.3216608306e-3,    1.0334960326e-3,
961         3.1899104283e-3
962     };
963     static const double base_lsf[5] = {
964         M_PI * -1.27576e-1, M_PI * -2.4292e-2,
965         M_PI * -1.28094e-1, M_PI * -3.2128e-2,
966         M_PI * -1.29816e-1
967     };
968     uint16_t v[5];
969 
970     v[0] = get_bits(gb, 8);
971     v[1] = get_bits(gb, 6);
972     v[2] = get_bits(gb, 7);
973     v[3] = get_bits(gb, 6);
974     v[4] = get_bits(gb, 7);
975 
976     dequant_lsps( lsps,     5,  v,     vec_sizes,    2,
977                  wmavoice_dq_lsp16i1,  mul_lsf,     base_lsf);
978     dequant_lsps(&lsps[5],  5, &v[2], &vec_sizes[2], 2,
979                  wmavoice_dq_lsp16i2, &mul_lsf[2], &base_lsf[2]);
980     dequant_lsps(&lsps[10], 6, &v[4], &vec_sizes[4], 1,
981                  wmavoice_dq_lsp16i3, &mul_lsf[4], &base_lsf[4]);
982 }
983 
984 /**
985  * Parse 16 independently-coded LSPs, and then derive the tables to
986  * generate LSPs for the other frames from them (residual coding).
987  */
dequant_lsp16r(GetBitContext * gb,double * i_lsps,const double * old,double * a1,double * a2,int q_mode)988 static void dequant_lsp16r(GetBitContext *gb,
989                            double *i_lsps, const double *old,
990                            double *a1, double *a2, int q_mode)
991 {
992     static const uint16_t vec_sizes[3] = { 128, 128, 128 };
993     static const double mul_lsf[3] = {
994         1.2232979501e-3,   1.4062241527e-3,   1.6114744851e-3
995     };
996     static const double base_lsf[3] = {
997         M_PI * -5.5830e-2, M_PI * -5.2908e-2, M_PI * -5.4776e-2
998     };
999     const float (*ipol_tab)[2][16] = q_mode ?
1000         wmavoice_lsp16_intercoeff_b : wmavoice_lsp16_intercoeff_a;
1001     uint16_t interpol, v[3];
1002     int n;
1003 
1004     dequant_lsp16i(gb, i_lsps);
1005 
1006     interpol = get_bits(gb, 5);
1007     v[0]     = get_bits(gb, 7);
1008     v[1]     = get_bits(gb, 7);
1009     v[2]     = get_bits(gb, 7);
1010 
1011     for (n = 0; n < 16; n++) {
1012         double delta = old[n] - i_lsps[n];
1013         a1[n]        = ipol_tab[interpol][0][n] * delta + i_lsps[n];
1014         a1[16 + n]   = ipol_tab[interpol][1][n] * delta + i_lsps[n];
1015     }
1016 
1017     dequant_lsps( a2,     10,  v,     vec_sizes,    1,
1018                  wmavoice_dq_lsp16r1,  mul_lsf,     base_lsf);
1019     dequant_lsps(&a2[10], 10, &v[1], &vec_sizes[1], 1,
1020                  wmavoice_dq_lsp16r2, &mul_lsf[1], &base_lsf[1]);
1021     dequant_lsps(&a2[20], 12, &v[2], &vec_sizes[2], 1,
1022                  wmavoice_dq_lsp16r3, &mul_lsf[2], &base_lsf[2]);
1023 }
1024 
1025 /**
1026  * @}
1027  * @name Pitch-adaptive window coding functions
1028  * The next few functions are for pitch-adaptive window coding.
1029  * @{
1030  */
1031 /**
1032  * Parse the offset of the first pitch-adaptive window pulses, and
1033  * the distribution of pulses between the two blocks in this frame.
1034  * @param s WMA Voice decoding context private data
1035  * @param gb bit I/O context
1036  * @param pitch pitch for each block in this frame
1037  */
aw_parse_coords(WMAVoiceContext * s,GetBitContext * gb,const int * pitch)1038 static void aw_parse_coords(WMAVoiceContext *s, GetBitContext *gb,
1039                             const int *pitch)
1040 {
1041     static const int16_t start_offset[94] = {
1042         -11,  -9,  -7,  -5,  -3,  -1,   1,   3,   5,   7,   9,  11,
1043          13,  15,  18,  17,  19,  20,  21,  22,  23,  24,  25,  26,
1044          27,  28,  29,  30,  31,  32,  33,  35,  37,  39,  41,  43,
1045          45,  47,  49,  51,  53,  55,  57,  59,  61,  63,  65,  67,
1046          69,  71,  73,  75,  77,  79,  81,  83,  85,  87,  89,  91,
1047          93,  95,  97,  99, 101, 103, 105, 107, 109, 111, 113, 115,
1048         117, 119, 121, 123, 125, 127, 129, 131, 133, 135, 137, 139,
1049         141, 143, 145, 147, 149, 151, 153, 155, 157, 159
1050     };
1051     int bits, offset;
1052 
1053     /* position of pulse */
1054     s->aw_idx_is_ext = 0;
1055     if ((bits = get_bits(gb, 6)) >= 54) {
1056         s->aw_idx_is_ext = 1;
1057         bits += (bits - 54) * 3 + get_bits(gb, 2);
1058     }
1059 
1060     /* for a repeated pulse at pulse_off with a pitch_lag of pitch[], count
1061      * the distribution of the pulses in each block contained in this frame. */
1062     s->aw_pulse_range        = FFMIN(pitch[0], pitch[1]) > 32 ? 24 : 16;
1063     for (offset = start_offset[bits]; offset < 0; offset += pitch[0]) ;
1064     s->aw_n_pulses[0]        = (pitch[0] - 1 + MAX_FRAMESIZE / 2 - offset) / pitch[0];
1065     s->aw_first_pulse_off[0] = offset - s->aw_pulse_range / 2;
1066     offset                  += s->aw_n_pulses[0] * pitch[0];
1067     s->aw_n_pulses[1]        = (pitch[1] - 1 + MAX_FRAMESIZE - offset) / pitch[1];
1068     s->aw_first_pulse_off[1] = offset - (MAX_FRAMESIZE + s->aw_pulse_range) / 2;
1069 
1070     /* if continuing from a position before the block, reset position to
1071      * start of block (when corrected for the range over which it can be
1072      * spread in aw_pulse_set1()). */
1073     if (start_offset[bits] < MAX_FRAMESIZE / 2) {
1074         while (s->aw_first_pulse_off[1] - pitch[1] + s->aw_pulse_range > 0)
1075             s->aw_first_pulse_off[1] -= pitch[1];
1076         if (start_offset[bits] < 0)
1077             while (s->aw_first_pulse_off[0] - pitch[0] + s->aw_pulse_range > 0)
1078                 s->aw_first_pulse_off[0] -= pitch[0];
1079     }
1080 }
1081 
1082 /**
1083  * Apply second set of pitch-adaptive window pulses.
1084  * @param s WMA Voice decoding context private data
1085  * @param gb bit I/O context
1086  * @param block_idx block index in frame [0, 1]
1087  * @param fcb structure containing fixed codebook vector info
1088  * @return -1 on error, 0 otherwise
1089  */
aw_pulse_set2(WMAVoiceContext * s,GetBitContext * gb,int block_idx,AMRFixed * fcb)1090 static int aw_pulse_set2(WMAVoiceContext *s, GetBitContext *gb,
1091                          int block_idx, AMRFixed *fcb)
1092 {
1093     uint16_t use_mask_mem[9]; // only 5 are used, rest is padding
1094     uint16_t *use_mask = use_mask_mem + 2;
1095     /* in this function, idx is the index in the 80-bit (+ padding) use_mask
1096      * bit-array. Since use_mask consists of 16-bit values, the lower 4 bits
1097      * of idx are the position of the bit within a particular item in the
1098      * array (0 being the most significant bit, and 15 being the least
1099      * significant bit), and the remainder (>> 4) is the index in the
1100      * use_mask[]-array. This is faster and uses less memory than using a
1101      * 80-byte/80-int array. */
1102     int pulse_off = s->aw_first_pulse_off[block_idx],
1103         pulse_start, n, idx, range, aidx, start_off = 0;
1104 
1105     /* set offset of first pulse to within this block */
1106     if (s->aw_n_pulses[block_idx] > 0)
1107         while (pulse_off + s->aw_pulse_range < 1)
1108             pulse_off += fcb->pitch_lag;
1109 
1110     /* find range per pulse */
1111     if (s->aw_n_pulses[0] > 0) {
1112         if (block_idx == 0) {
1113             range = 32;
1114         } else /* block_idx = 1 */ {
1115             range = 8;
1116             if (s->aw_n_pulses[block_idx] > 0)
1117                 pulse_off = s->aw_next_pulse_off_cache;
1118         }
1119     } else
1120         range = 16;
1121     pulse_start = s->aw_n_pulses[block_idx] > 0 ? pulse_off - range / 2 : 0;
1122 
1123     /* aw_pulse_set1() already applies pulses around pulse_off (to be exactly,
1124      * in the range of [pulse_off, pulse_off + s->aw_pulse_range], and thus
1125      * we exclude that range from being pulsed again in this function. */
1126     memset(&use_mask[-2], 0, 2 * sizeof(use_mask[0]));
1127     memset( use_mask,   -1, 5 * sizeof(use_mask[0]));
1128     memset(&use_mask[5], 0, 2 * sizeof(use_mask[0]));
1129     if (s->aw_n_pulses[block_idx] > 0)
1130         for (idx = pulse_off; idx < MAX_FRAMESIZE / 2; idx += fcb->pitch_lag) {
1131             int excl_range         = s->aw_pulse_range; // always 16 or 24
1132             uint16_t *use_mask_ptr = &use_mask[idx >> 4];
1133             int first_sh           = 16 - (idx & 15);
1134             *use_mask_ptr++       &= 0xFFFFu << first_sh;
1135             excl_range            -= first_sh;
1136             if (excl_range >= 16) {
1137                 *use_mask_ptr++    = 0;
1138                 *use_mask_ptr     &= 0xFFFF >> (excl_range - 16);
1139             } else
1140                 *use_mask_ptr     &= 0xFFFF >> excl_range;
1141         }
1142 
1143     /* find the 'aidx'th offset that is not excluded */
1144     aidx = get_bits(gb, s->aw_n_pulses[0] > 0 ? 5 - 2 * block_idx : 4);
1145     for (n = 0; n <= aidx; pulse_start++) {
1146         for (idx = pulse_start; idx < 0; idx += fcb->pitch_lag) ;
1147         if (idx >= MAX_FRAMESIZE / 2) { // find from zero
1148             if (use_mask[0])      idx = 0x0F;
1149             else if (use_mask[1]) idx = 0x1F;
1150             else if (use_mask[2]) idx = 0x2F;
1151             else if (use_mask[3]) idx = 0x3F;
1152             else if (use_mask[4]) idx = 0x4F;
1153             else return -1;
1154             idx -= av_log2_16bit(use_mask[idx >> 4]);
1155         }
1156         if (use_mask[idx >> 4] & (0x8000 >> (idx & 15))) {
1157             use_mask[idx >> 4] &= ~(0x8000 >> (idx & 15));
1158             n++;
1159             start_off = idx;
1160         }
1161     }
1162 
1163     fcb->x[fcb->n] = start_off;
1164     fcb->y[fcb->n] = get_bits1(gb) ? -1.0 : 1.0;
1165     fcb->n++;
1166 
1167     /* set offset for next block, relative to start of that block */
1168     n = (MAX_FRAMESIZE / 2 - start_off) % fcb->pitch_lag;
1169     s->aw_next_pulse_off_cache = n ? fcb->pitch_lag - n : 0;
1170     return 0;
1171 }
1172 
1173 /**
1174  * Apply first set of pitch-adaptive window pulses.
1175  * @param s WMA Voice decoding context private data
1176  * @param gb bit I/O context
1177  * @param block_idx block index in frame [0, 1]
1178  * @param fcb storage location for fixed codebook pulse info
1179  */
aw_pulse_set1(WMAVoiceContext * s,GetBitContext * gb,int block_idx,AMRFixed * fcb)1180 static void aw_pulse_set1(WMAVoiceContext *s, GetBitContext *gb,
1181                           int block_idx, AMRFixed *fcb)
1182 {
1183     int val = get_bits(gb, 12 - 2 * (s->aw_idx_is_ext && !block_idx));
1184     float v;
1185 
1186     if (s->aw_n_pulses[block_idx] > 0) {
1187         int n, v_mask, i_mask, sh, n_pulses;
1188 
1189         if (s->aw_pulse_range == 24) { // 3 pulses, 1:sign + 3:index each
1190             n_pulses = 3;
1191             v_mask   = 8;
1192             i_mask   = 7;
1193             sh       = 4;
1194         } else { // 4 pulses, 1:sign + 2:index each
1195             n_pulses = 4;
1196             v_mask   = 4;
1197             i_mask   = 3;
1198             sh       = 3;
1199         }
1200 
1201         for (n = n_pulses - 1; n >= 0; n--, val >>= sh) {
1202             fcb->y[fcb->n] = (val & v_mask) ? -1.0 : 1.0;
1203             fcb->x[fcb->n] = (val & i_mask) * n_pulses + n +
1204                                  s->aw_first_pulse_off[block_idx];
1205             while (fcb->x[fcb->n] < 0)
1206                 fcb->x[fcb->n] += fcb->pitch_lag;
1207             if (fcb->x[fcb->n] < MAX_FRAMESIZE / 2)
1208                 fcb->n++;
1209         }
1210     } else {
1211         int num2 = (val & 0x1FF) >> 1, delta, idx;
1212 
1213         if (num2 < 1 * 79)      { delta = 1; idx = num2 + 1; }
1214         else if (num2 < 2 * 78) { delta = 3; idx = num2 + 1 - 1 * 77; }
1215         else if (num2 < 3 * 77) { delta = 5; idx = num2 + 1 - 2 * 76; }
1216         else                    { delta = 7; idx = num2 + 1 - 3 * 75; }
1217         v = (val & 0x200) ? -1.0 : 1.0;
1218 
1219         fcb->no_repeat_mask |= 3 << fcb->n;
1220         fcb->x[fcb->n]       = idx - delta;
1221         fcb->y[fcb->n]       = v;
1222         fcb->x[fcb->n + 1]   = idx;
1223         fcb->y[fcb->n + 1]   = (val & 1) ? -v : v;
1224         fcb->n              += 2;
1225     }
1226 }
1227 
1228 /**
1229  * @}
1230  *
1231  * Generate a random number from frame_cntr and block_idx, which will live
1232  * in the range [0, 1000 - block_size] (so it can be used as an index in a
1233  * table of size 1000 of which you want to read block_size entries).
1234  *
1235  * @param frame_cntr current frame number
1236  * @param block_num current block index
1237  * @param block_size amount of entries we want to read from a table
1238  *                   that has 1000 entries
1239  * @return a (non-)random number in the [0, 1000 - block_size] range.
1240  */
pRNG(int frame_cntr,int block_num,int block_size)1241 static int pRNG(int frame_cntr, int block_num, int block_size)
1242 {
1243     /* array to simplify the calculation of z:
1244      * y = (x % 9) * 5 + 6;
1245      * z = (49995 * x) / y;
1246      * Since y only has 9 values, we can remove the division by using a
1247      * LUT and using FASTDIV-style divisions. For each of the 9 values
1248      * of y, we can rewrite z as:
1249      * z = x * (49995 / y) + x * ((49995 % y) / y)
1250      * In this table, each col represents one possible value of y, the
1251      * first number is 49995 / y, and the second is the FASTDIV variant
1252      * of 49995 % y / y. */
1253     static const unsigned int div_tbl[9][2] = {
1254         { 8332,  3 * 715827883U }, // y =  6
1255         { 4545,  0 * 390451573U }, // y = 11
1256         { 3124, 11 * 268435456U }, // y = 16
1257         { 2380, 15 * 204522253U }, // y = 21
1258         { 1922, 23 * 165191050U }, // y = 26
1259         { 1612, 23 * 138547333U }, // y = 31
1260         { 1388, 27 * 119304648U }, // y = 36
1261         { 1219, 16 * 104755300U }, // y = 41
1262         { 1086, 39 *  93368855U }  // y = 46
1263     };
1264     unsigned int z, y, x = MUL16(block_num, 1877) + frame_cntr;
1265     if (x >= 0xFFFF) x -= 0xFFFF;   // max value of x is 8*1877+0xFFFE=0x13AA6,
1266                                     // so this is effectively a modulo (%)
1267     y = x - 9 * MULH(477218589, x); // x % 9
1268     z = (uint16_t) (x * div_tbl[y][0] + UMULH(x, div_tbl[y][1]));
1269                                     // z = x * 49995 / (y * 5 + 6)
1270     return z % (1000 - block_size);
1271 }
1272 
1273 /**
1274  * Parse hardcoded signal for a single block.
1275  * @note see #synth_block().
1276  */
synth_block_hardcoded(WMAVoiceContext * s,GetBitContext * gb,int block_idx,int size,const struct frame_type_desc * frame_desc,float * excitation)1277 static void synth_block_hardcoded(WMAVoiceContext *s, GetBitContext *gb,
1278                                  int block_idx, int size,
1279                                  const struct frame_type_desc *frame_desc,
1280                                  float *excitation)
1281 {
1282     float gain;
1283     int n, r_idx;
1284 
1285     av_assert0(size <= MAX_FRAMESIZE);
1286 
1287     /* Set the offset from which we start reading wmavoice_std_codebook */
1288     if (frame_desc->fcb_type == FCB_TYPE_SILENCE) {
1289         r_idx = pRNG(s->frame_cntr, block_idx, size);
1290         gain  = s->silence_gain;
1291     } else /* FCB_TYPE_HARDCODED */ {
1292         r_idx = get_bits(gb, 8);
1293         gain  = wmavoice_gain_universal[get_bits(gb, 6)];
1294     }
1295 
1296     /* Clear gain prediction parameters */
1297     memset(s->gain_pred_err, 0, sizeof(s->gain_pred_err));
1298 
1299     /* Apply gain to hardcoded codebook and use that as excitation signal */
1300     for (n = 0; n < size; n++)
1301         excitation[n] = wmavoice_std_codebook[r_idx + n] * gain;
1302 }
1303 
1304 /**
1305  * Parse FCB/ACB signal for a single block.
1306  * @note see #synth_block().
1307  */
synth_block_fcb_acb(WMAVoiceContext * s,GetBitContext * gb,int block_idx,int size,int block_pitch_sh2,const struct frame_type_desc * frame_desc,float * excitation)1308 static void synth_block_fcb_acb(WMAVoiceContext *s, GetBitContext *gb,
1309                                 int block_idx, int size,
1310                                 int block_pitch_sh2,
1311                                 const struct frame_type_desc *frame_desc,
1312                                 float *excitation)
1313 {
1314     static const float gain_coeff[6] = {
1315         0.8169, -0.06545, 0.1726, 0.0185, -0.0359, 0.0458
1316     };
1317     float pulses[MAX_FRAMESIZE / 2], pred_err, acb_gain, fcb_gain;
1318     int n, idx, gain_weight;
1319     AMRFixed fcb;
1320 
1321     av_assert0(size <= MAX_FRAMESIZE / 2);
1322     memset(pulses, 0, sizeof(*pulses) * size);
1323 
1324     fcb.pitch_lag      = block_pitch_sh2 >> 2;
1325     fcb.pitch_fac      = 1.0;
1326     fcb.no_repeat_mask = 0;
1327     fcb.n              = 0;
1328 
1329     /* For the other frame types, this is where we apply the innovation
1330      * (fixed) codebook pulses of the speech signal. */
1331     if (frame_desc->fcb_type == FCB_TYPE_AW_PULSES) {
1332         aw_pulse_set1(s, gb, block_idx, &fcb);
1333         if (aw_pulse_set2(s, gb, block_idx, &fcb)) {
1334             /* Conceal the block with silence and return.
1335              * Skip the correct amount of bits to read the next
1336              * block from the correct offset. */
1337             int r_idx = pRNG(s->frame_cntr, block_idx, size);
1338 
1339             for (n = 0; n < size; n++)
1340                 excitation[n] =
1341                     wmavoice_std_codebook[r_idx + n] * s->silence_gain;
1342             skip_bits(gb, 7 + 1);
1343             return;
1344         }
1345     } else /* FCB_TYPE_EXC_PULSES */ {
1346         int offset_nbits = 5 - frame_desc->log_n_blocks;
1347 
1348         fcb.no_repeat_mask = -1;
1349         /* similar to ff_decode_10_pulses_35bits(), but with single pulses
1350          * (instead of double) for a subset of pulses */
1351         for (n = 0; n < 5; n++) {
1352             float sign;
1353             int pos1, pos2;
1354 
1355             sign           = get_bits1(gb) ? 1.0 : -1.0;
1356             pos1           = get_bits(gb, offset_nbits);
1357             fcb.x[fcb.n]   = n + 5 * pos1;
1358             fcb.y[fcb.n++] = sign;
1359             if (n < frame_desc->dbl_pulses) {
1360                 pos2           = get_bits(gb, offset_nbits);
1361                 fcb.x[fcb.n]   = n + 5 * pos2;
1362                 fcb.y[fcb.n++] = (pos1 < pos2) ? -sign : sign;
1363             }
1364         }
1365     }
1366     ff_set_fixed_vector(pulses, &fcb, 1.0, size);
1367 
1368     /* Calculate gain for adaptive & fixed codebook signal.
1369      * see ff_amr_set_fixed_gain(). */
1370     idx = get_bits(gb, 7);
1371     fcb_gain = expf(avpriv_scalarproduct_float_c(s->gain_pred_err,
1372                                                  gain_coeff, 6) -
1373                     5.2409161640 + wmavoice_gain_codebook_fcb[idx]);
1374     acb_gain = wmavoice_gain_codebook_acb[idx];
1375     pred_err = av_clipf(wmavoice_gain_codebook_fcb[idx],
1376                         -2.9957322736 /* log(0.05) */,
1377                          1.6094379124 /* log(5.0)  */);
1378 
1379     gain_weight = 8 >> frame_desc->log_n_blocks;
1380     memmove(&s->gain_pred_err[gain_weight], s->gain_pred_err,
1381             sizeof(*s->gain_pred_err) * (6 - gain_weight));
1382     for (n = 0; n < gain_weight; n++)
1383         s->gain_pred_err[n] = pred_err;
1384 
1385     /* Calculation of adaptive codebook */
1386     if (frame_desc->acb_type == ACB_TYPE_ASYMMETRIC) {
1387         int len;
1388         for (n = 0; n < size; n += len) {
1389             int next_idx_sh16;
1390             int abs_idx    = block_idx * size + n;
1391             int pitch_sh16 = (s->last_pitch_val << 16) +
1392                              s->pitch_diff_sh16 * abs_idx;
1393             int pitch      = (pitch_sh16 + 0x6FFF) >> 16;
1394             int idx_sh16   = ((pitch << 16) - pitch_sh16) * 8 + 0x58000;
1395             idx            = idx_sh16 >> 16;
1396             if (s->pitch_diff_sh16) {
1397                 if (s->pitch_diff_sh16 > 0) {
1398                     next_idx_sh16 = (idx_sh16) &~ 0xFFFF;
1399                 } else
1400                     next_idx_sh16 = (idx_sh16 + 0x10000) &~ 0xFFFF;
1401                 len = av_clip((idx_sh16 - next_idx_sh16) / s->pitch_diff_sh16 / 8,
1402                               1, size - n);
1403             } else
1404                 len = size;
1405 
1406             ff_acelp_interpolatef(&excitation[n], &excitation[n - pitch],
1407                                   wmavoice_ipol1_coeffs, 17,
1408                                   idx, 9, len);
1409         }
1410     } else /* ACB_TYPE_HAMMING */ {
1411         int block_pitch = block_pitch_sh2 >> 2;
1412         idx             = block_pitch_sh2 & 3;
1413         if (idx) {
1414             ff_acelp_interpolatef(excitation, &excitation[-block_pitch],
1415                                   wmavoice_ipol2_coeffs, 4,
1416                                   idx, 8, size);
1417         } else
1418             av_memcpy_backptr((uint8_t *) excitation, sizeof(float) * block_pitch,
1419                               sizeof(float) * size);
1420     }
1421 
1422     /* Interpolate ACB/FCB and use as excitation signal */
1423     ff_weighted_vector_sumf(excitation, excitation, pulses,
1424                             acb_gain, fcb_gain, size);
1425 }
1426 
1427 /**
1428  * Parse data in a single block.
1429  *
1430  * @param s WMA Voice decoding context private data
1431  * @param gb bit I/O context
1432  * @param block_idx index of the to-be-read block
1433  * @param size amount of samples to be read in this block
1434  * @param block_pitch_sh2 pitch for this block << 2
1435  * @param lsps LSPs for (the end of) this frame
1436  * @param prev_lsps LSPs for the last frame
1437  * @param frame_desc frame type descriptor
1438  * @param excitation target memory for the ACB+FCB interpolated signal
1439  * @param synth target memory for the speech synthesis filter output
1440  * @return 0 on success, <0 on error.
1441  */
synth_block(WMAVoiceContext * s,GetBitContext * gb,int block_idx,int size,int block_pitch_sh2,const double * lsps,const double * prev_lsps,const struct frame_type_desc * frame_desc,float * excitation,float * synth)1442 static void synth_block(WMAVoiceContext *s, GetBitContext *gb,
1443                         int block_idx, int size,
1444                         int block_pitch_sh2,
1445                         const double *lsps, const double *prev_lsps,
1446                         const struct frame_type_desc *frame_desc,
1447                         float *excitation, float *synth)
1448 {
1449     double i_lsps[MAX_LSPS];
1450     float lpcs[MAX_LSPS];
1451     float fac;
1452     int n;
1453 
1454     if (frame_desc->acb_type == ACB_TYPE_NONE)
1455         synth_block_hardcoded(s, gb, block_idx, size, frame_desc, excitation);
1456     else
1457         synth_block_fcb_acb(s, gb, block_idx, size, block_pitch_sh2,
1458                             frame_desc, excitation);
1459 
1460     /* convert interpolated LSPs to LPCs */
1461     fac = (block_idx + 0.5) / frame_desc->n_blocks;
1462     for (n = 0; n < s->lsps; n++) // LSF -> LSP
1463         i_lsps[n] = cos(prev_lsps[n] + fac * (lsps[n] - prev_lsps[n]));
1464     ff_acelp_lspd2lpc(i_lsps, lpcs, s->lsps >> 1);
1465 
1466     /* Speech synthesis */
1467     ff_celp_lp_synthesis_filterf(synth, lpcs, excitation, size, s->lsps);
1468 }
1469 
1470 /**
1471  * Synthesize output samples for a single frame.
1472  *
1473  * @param ctx WMA Voice decoder context
1474  * @param gb bit I/O context (s->gb or one for cross-packet superframes)
1475  * @param frame_idx Frame number within superframe [0-2]
1476  * @param samples pointer to output sample buffer, has space for at least 160
1477  *                samples
1478  * @param lsps LSP array
1479  * @param prev_lsps array of previous frame's LSPs
1480  * @param excitation target buffer for excitation signal
1481  * @param synth target buffer for synthesized speech data
1482  * @return 0 on success, <0 on error.
1483  */
synth_frame(AVCodecContext * ctx,GetBitContext * gb,int frame_idx,float * samples,const double * lsps,const double * prev_lsps,float * excitation,float * synth)1484 static int synth_frame(AVCodecContext *ctx, GetBitContext *gb, int frame_idx,
1485                        float *samples,
1486                        const double *lsps, const double *prev_lsps,
1487                        float *excitation, float *synth)
1488 {
1489     WMAVoiceContext *s = ctx->priv_data;
1490     int n, n_blocks_x2, log_n_blocks_x2, av_uninit(cur_pitch_val);
1491     int pitch[MAX_BLOCKS], av_uninit(last_block_pitch);
1492 
1493     /* Parse frame type ("frame header"), see frame_descs */
1494     int bd_idx = s->vbm_tree[get_vlc2(gb, frame_type_vlc.table, 6, 3)], block_nsamples;
1495 
1496     pitch[0] = INT_MAX;
1497 
1498     if (bd_idx < 0) {
1499         av_log(ctx, AV_LOG_ERROR,
1500                "Invalid frame type VLC code, skipping\n");
1501         return AVERROR_INVALIDDATA;
1502     }
1503 
1504     block_nsamples = MAX_FRAMESIZE / frame_descs[bd_idx].n_blocks;
1505 
1506     /* Pitch calculation for ACB_TYPE_ASYMMETRIC ("pitch-per-frame") */
1507     if (frame_descs[bd_idx].acb_type == ACB_TYPE_ASYMMETRIC) {
1508         /* Pitch is provided per frame, which is interpreted as the pitch of
1509          * the last sample of the last block of this frame. We can interpolate
1510          * the pitch of other blocks (and even pitch-per-sample) by gradually
1511          * incrementing/decrementing prev_frame_pitch to cur_pitch_val. */
1512         n_blocks_x2      = frame_descs[bd_idx].n_blocks << 1;
1513         log_n_blocks_x2  = frame_descs[bd_idx].log_n_blocks + 1;
1514         cur_pitch_val    = s->min_pitch_val + get_bits(gb, s->pitch_nbits);
1515         cur_pitch_val    = FFMIN(cur_pitch_val, s->max_pitch_val - 1);
1516         if (s->last_acb_type == ACB_TYPE_NONE ||
1517             20 * abs(cur_pitch_val - s->last_pitch_val) >
1518                 (cur_pitch_val + s->last_pitch_val))
1519             s->last_pitch_val = cur_pitch_val;
1520 
1521         /* pitch per block */
1522         for (n = 0; n < frame_descs[bd_idx].n_blocks; n++) {
1523             int fac = n * 2 + 1;
1524 
1525             pitch[n] = (MUL16(fac,                 cur_pitch_val) +
1526                         MUL16((n_blocks_x2 - fac), s->last_pitch_val) +
1527                         frame_descs[bd_idx].n_blocks) >> log_n_blocks_x2;
1528         }
1529 
1530         /* "pitch-diff-per-sample" for calculation of pitch per sample */
1531         s->pitch_diff_sh16 =
1532             (cur_pitch_val - s->last_pitch_val) * (1 << 16) / MAX_FRAMESIZE;
1533     }
1534 
1535     /* Global gain (if silence) and pitch-adaptive window coordinates */
1536     switch (frame_descs[bd_idx].fcb_type) {
1537     case FCB_TYPE_SILENCE:
1538         s->silence_gain = wmavoice_gain_silence[get_bits(gb, 8)];
1539         break;
1540     case FCB_TYPE_AW_PULSES:
1541         aw_parse_coords(s, gb, pitch);
1542         break;
1543     }
1544 
1545     for (n = 0; n < frame_descs[bd_idx].n_blocks; n++) {
1546         int bl_pitch_sh2;
1547 
1548         /* Pitch calculation for ACB_TYPE_HAMMING ("pitch-per-block") */
1549         switch (frame_descs[bd_idx].acb_type) {
1550         case ACB_TYPE_HAMMING: {
1551             /* Pitch is given per block. Per-block pitches are encoded as an
1552              * absolute value for the first block, and then delta values
1553              * relative to this value) for all subsequent blocks. The scale of
1554              * this pitch value is semi-logarithmic compared to its use in the
1555              * decoder, so we convert it to normal scale also. */
1556             int block_pitch,
1557                 t1 = (s->block_conv_table[1] - s->block_conv_table[0]) << 2,
1558                 t2 = (s->block_conv_table[2] - s->block_conv_table[1]) << 1,
1559                 t3 =  s->block_conv_table[3] - s->block_conv_table[2] + 1;
1560 
1561             if (n == 0) {
1562                 block_pitch = get_bits(gb, s->block_pitch_nbits);
1563             } else
1564                 block_pitch = last_block_pitch - s->block_delta_pitch_hrange +
1565                                  get_bits(gb, s->block_delta_pitch_nbits);
1566             /* Convert last_ so that any next delta is within _range */
1567             last_block_pitch = av_clip(block_pitch,
1568                                        s->block_delta_pitch_hrange,
1569                                        s->block_pitch_range -
1570                                            s->block_delta_pitch_hrange);
1571 
1572             /* Convert semi-log-style scale back to normal scale */
1573             if (block_pitch < t1) {
1574                 bl_pitch_sh2 = (s->block_conv_table[0] << 2) + block_pitch;
1575             } else {
1576                 block_pitch -= t1;
1577                 if (block_pitch < t2) {
1578                     bl_pitch_sh2 =
1579                         (s->block_conv_table[1] << 2) + (block_pitch << 1);
1580                 } else {
1581                     block_pitch -= t2;
1582                     if (block_pitch < t3) {
1583                         bl_pitch_sh2 =
1584                             (s->block_conv_table[2] + block_pitch) << 2;
1585                     } else
1586                         bl_pitch_sh2 = s->block_conv_table[3] << 2;
1587                 }
1588             }
1589             pitch[n] = bl_pitch_sh2 >> 2;
1590             break;
1591         }
1592 
1593         case ACB_TYPE_ASYMMETRIC: {
1594             bl_pitch_sh2 = pitch[n] << 2;
1595             break;
1596         }
1597 
1598         default: // ACB_TYPE_NONE has no pitch
1599             bl_pitch_sh2 = 0;
1600             break;
1601         }
1602 
1603         synth_block(s, gb, n, block_nsamples, bl_pitch_sh2,
1604                     lsps, prev_lsps, &frame_descs[bd_idx],
1605                     &excitation[n * block_nsamples],
1606                     &synth[n * block_nsamples]);
1607     }
1608 
1609     /* Averaging projection filter, if applicable. Else, just copy samples
1610      * from synthesis buffer */
1611     if (s->do_apf) {
1612         double i_lsps[MAX_LSPS];
1613         float lpcs[MAX_LSPS];
1614 
1615         if(frame_descs[bd_idx].fcb_type >= FCB_TYPE_AW_PULSES && pitch[0] == INT_MAX)
1616             return AVERROR_INVALIDDATA;
1617 
1618         for (n = 0; n < s->lsps; n++) // LSF -> LSP
1619             i_lsps[n] = cos(0.5 * (prev_lsps[n] + lsps[n]));
1620         ff_acelp_lspd2lpc(i_lsps, lpcs, s->lsps >> 1);
1621         postfilter(s, synth, samples, 80, lpcs,
1622                    &s->zero_exc_pf[s->history_nsamples + MAX_FRAMESIZE * frame_idx],
1623                    frame_descs[bd_idx].fcb_type, pitch[0]);
1624 
1625         for (n = 0; n < s->lsps; n++) // LSF -> LSP
1626             i_lsps[n] = cos(lsps[n]);
1627         ff_acelp_lspd2lpc(i_lsps, lpcs, s->lsps >> 1);
1628         postfilter(s, &synth[80], &samples[80], 80, lpcs,
1629                    &s->zero_exc_pf[s->history_nsamples + MAX_FRAMESIZE * frame_idx + 80],
1630                    frame_descs[bd_idx].fcb_type, pitch[0]);
1631     } else
1632         memcpy(samples, synth, 160 * sizeof(synth[0]));
1633 
1634     /* Cache values for next frame */
1635     s->frame_cntr++;
1636     if (s->frame_cntr >= 0xFFFF) s->frame_cntr -= 0xFFFF; // i.e. modulo (%)
1637     s->last_acb_type = frame_descs[bd_idx].acb_type;
1638     switch (frame_descs[bd_idx].acb_type) {
1639     case ACB_TYPE_NONE:
1640         s->last_pitch_val = 0;
1641         break;
1642     case ACB_TYPE_ASYMMETRIC:
1643         s->last_pitch_val = cur_pitch_val;
1644         break;
1645     case ACB_TYPE_HAMMING:
1646         s->last_pitch_val = pitch[frame_descs[bd_idx].n_blocks - 1];
1647         break;
1648     }
1649 
1650     return 0;
1651 }
1652 
1653 /**
1654  * Ensure minimum value for first item, maximum value for last value,
1655  * proper spacing between each value and proper ordering.
1656  *
1657  * @param lsps array of LSPs
1658  * @param num size of LSP array
1659  *
1660  * @note basically a double version of #ff_acelp_reorder_lsf(), might be
1661  *       useful to put in a generic location later on. Parts are also
1662  *       present in #ff_set_min_dist_lsf() + #ff_sort_nearly_sorted_floats(),
1663  *       which is in float.
1664  */
stabilize_lsps(double * lsps,int num)1665 static void stabilize_lsps(double *lsps, int num)
1666 {
1667     int n, m, l;
1668 
1669     /* set minimum value for first, maximum value for last and minimum
1670      * spacing between LSF values.
1671      * Very similar to ff_set_min_dist_lsf(), but in double. */
1672     lsps[0]       = FFMAX(lsps[0],       0.0015 * M_PI);
1673     for (n = 1; n < num; n++)
1674         lsps[n]   = FFMAX(lsps[n],       lsps[n - 1] + 0.0125 * M_PI);
1675     lsps[num - 1] = FFMIN(lsps[num - 1], 0.9985 * M_PI);
1676 
1677     /* reorder (looks like one-time / non-recursed bubblesort).
1678      * Very similar to ff_sort_nearly_sorted_floats(), but in double. */
1679     for (n = 1; n < num; n++) {
1680         if (lsps[n] < lsps[n - 1]) {
1681             for (m = 1; m < num; m++) {
1682                 double tmp = lsps[m];
1683                 for (l = m - 1; l >= 0; l--) {
1684                     if (lsps[l] <= tmp) break;
1685                     lsps[l + 1] = lsps[l];
1686                 }
1687                 lsps[l + 1] = tmp;
1688             }
1689             break;
1690         }
1691     }
1692 }
1693 
1694 /**
1695  * Synthesize output samples for a single superframe. If we have any data
1696  * cached in s->sframe_cache, that will be used instead of whatever is loaded
1697  * in s->gb.
1698  *
1699  * WMA Voice superframes contain 3 frames, each containing 160 audio samples,
1700  * to give a total of 480 samples per frame. See #synth_frame() for frame
1701  * parsing. In addition to 3 frames, superframes can also contain the LSPs
1702  * (if these are globally specified for all frames (residually); they can
1703  * also be specified individually per-frame. See the s->has_residual_lsps
1704  * option), and can specify the number of samples encoded in this superframe
1705  * (if less than 480), usually used to prevent blanks at track boundaries.
1706  *
1707  * @param ctx WMA Voice decoder context
1708  * @return 0 on success, <0 on error or 1 if there was not enough data to
1709  *         fully parse the superframe
1710  */
synth_superframe(AVCodecContext * ctx,AVFrame * frame,int * got_frame_ptr)1711 static int synth_superframe(AVCodecContext *ctx, AVFrame *frame,
1712                             int *got_frame_ptr)
1713 {
1714     WMAVoiceContext *s = ctx->priv_data;
1715     GetBitContext *gb = &s->gb, s_gb;
1716     int n, res, n_samples = MAX_SFRAMESIZE;
1717     double lsps[MAX_FRAMES][MAX_LSPS];
1718     const double *mean_lsf = s->lsps == 16 ?
1719         wmavoice_mean_lsf16[s->lsp_def_mode] : wmavoice_mean_lsf10[s->lsp_def_mode];
1720     float excitation[MAX_SIGNAL_HISTORY + MAX_SFRAMESIZE + 12];
1721     float synth[MAX_LSPS + MAX_SFRAMESIZE];
1722     float *samples;
1723 
1724     memcpy(synth,      s->synth_history,
1725            s->lsps             * sizeof(*synth));
1726     memcpy(excitation, s->excitation_history,
1727            s->history_nsamples * sizeof(*excitation));
1728 
1729     if (s->sframe_cache_size > 0) {
1730         gb = &s_gb;
1731         init_get_bits(gb, s->sframe_cache, s->sframe_cache_size);
1732         s->sframe_cache_size = 0;
1733     }
1734 
1735     /* First bit is speech/music bit, it differentiates between WMAVoice
1736      * speech samples (the actual codec) and WMAVoice music samples, which
1737      * are really WMAPro-in-WMAVoice-superframes. I've never seen those in
1738      * the wild yet. */
1739     if (!get_bits1(gb)) {
1740         avpriv_request_sample(ctx, "WMAPro-in-WMAVoice");
1741         return AVERROR_PATCHWELCOME;
1742     }
1743 
1744     /* (optional) nr. of samples in superframe; always <= 480 and >= 0 */
1745     if (get_bits1(gb)) {
1746         if ((n_samples = get_bits(gb, 12)) > MAX_SFRAMESIZE) {
1747             av_log(ctx, AV_LOG_ERROR,
1748                    "Superframe encodes > %d samples (%d), not allowed\n",
1749                    MAX_SFRAMESIZE, n_samples);
1750             return AVERROR_INVALIDDATA;
1751         }
1752     }
1753 
1754     /* Parse LSPs, if global for the superframe (can also be per-frame). */
1755     if (s->has_residual_lsps) {
1756         double prev_lsps[MAX_LSPS], a1[MAX_LSPS * 2], a2[MAX_LSPS * 2];
1757 
1758         for (n = 0; n < s->lsps; n++)
1759             prev_lsps[n] = s->prev_lsps[n] - mean_lsf[n];
1760 
1761         if (s->lsps == 10) {
1762             dequant_lsp10r(gb, lsps[2], prev_lsps, a1, a2, s->lsp_q_mode);
1763         } else /* s->lsps == 16 */
1764             dequant_lsp16r(gb, lsps[2], prev_lsps, a1, a2, s->lsp_q_mode);
1765 
1766         for (n = 0; n < s->lsps; n++) {
1767             lsps[0][n]  = mean_lsf[n] + (a1[n]           - a2[n * 2]);
1768             lsps[1][n]  = mean_lsf[n] + (a1[s->lsps + n] - a2[n * 2 + 1]);
1769             lsps[2][n] += mean_lsf[n];
1770         }
1771         for (n = 0; n < 3; n++)
1772             stabilize_lsps(lsps[n], s->lsps);
1773     }
1774 
1775     /* synth_superframe can run multiple times per packet
1776      * free potential previous frame */
1777     av_frame_unref(frame);
1778 
1779     /* get output buffer */
1780     frame->nb_samples = MAX_SFRAMESIZE;
1781     if ((res = ff_get_buffer(ctx, frame, 0)) < 0)
1782         return res;
1783     frame->nb_samples = n_samples;
1784     samples = (float *)frame->data[0];
1785 
1786     /* Parse frames, optionally preceded by per-frame (independent) LSPs. */
1787     for (n = 0; n < 3; n++) {
1788         if (!s->has_residual_lsps) {
1789             int m;
1790 
1791             if (s->lsps == 10) {
1792                 dequant_lsp10i(gb, lsps[n]);
1793             } else /* s->lsps == 16 */
1794                 dequant_lsp16i(gb, lsps[n]);
1795 
1796             for (m = 0; m < s->lsps; m++)
1797                 lsps[n][m] += mean_lsf[m];
1798             stabilize_lsps(lsps[n], s->lsps);
1799         }
1800 
1801         if ((res = synth_frame(ctx, gb, n,
1802                                &samples[n * MAX_FRAMESIZE],
1803                                lsps[n], n == 0 ? s->prev_lsps : lsps[n - 1],
1804                                &excitation[s->history_nsamples + n * MAX_FRAMESIZE],
1805                                &synth[s->lsps + n * MAX_FRAMESIZE]))) {
1806             *got_frame_ptr = 0;
1807             return res;
1808         }
1809     }
1810 
1811     /* Statistics? FIXME - we don't check for length, a slight overrun
1812      * will be caught by internal buffer padding, and anything else
1813      * will be skipped, not read. */
1814     if (get_bits1(gb)) {
1815         res = get_bits(gb, 4);
1816         skip_bits(gb, 10 * (res + 1));
1817     }
1818 
1819     if (get_bits_left(gb) < 0) {
1820         wmavoice_flush(ctx);
1821         return AVERROR_INVALIDDATA;
1822     }
1823 
1824     *got_frame_ptr = 1;
1825 
1826     /* Update history */
1827     memcpy(s->prev_lsps,           lsps[2],
1828            s->lsps             * sizeof(*s->prev_lsps));
1829     memcpy(s->synth_history,      &synth[MAX_SFRAMESIZE],
1830            s->lsps             * sizeof(*synth));
1831     memcpy(s->excitation_history, &excitation[MAX_SFRAMESIZE],
1832            s->history_nsamples * sizeof(*excitation));
1833     if (s->do_apf)
1834         memmove(s->zero_exc_pf,       &s->zero_exc_pf[MAX_SFRAMESIZE],
1835                 s->history_nsamples * sizeof(*s->zero_exc_pf));
1836 
1837     return 0;
1838 }
1839 
1840 /**
1841  * Parse the packet header at the start of each packet (input data to this
1842  * decoder).
1843  *
1844  * @param s WMA Voice decoding context private data
1845  * @return <0 on error, nb_superframes on success.
1846  */
parse_packet_header(WMAVoiceContext * s)1847 static int parse_packet_header(WMAVoiceContext *s)
1848 {
1849     GetBitContext *gb = &s->gb;
1850     unsigned int res, n_superframes = 0;
1851 
1852     skip_bits(gb, 4);          // packet sequence number
1853     s->has_residual_lsps = get_bits1(gb);
1854     do {
1855         if (get_bits_left(gb) < 6 + s->spillover_bitsize)
1856             return AVERROR_INVALIDDATA;
1857 
1858         res = get_bits(gb, 6); // number of superframes per packet
1859                                // (minus first one if there is spillover)
1860         n_superframes += res;
1861     } while (res == 0x3F);
1862     s->spillover_nbits   = get_bits(gb, s->spillover_bitsize);
1863 
1864     return get_bits_left(gb) >= 0 ? n_superframes : AVERROR_INVALIDDATA;
1865 }
1866 
1867 /**
1868  * Copy (unaligned) bits from gb/data/size to pb.
1869  *
1870  * @param pb target buffer to copy bits into
1871  * @param data source buffer to copy bits from
1872  * @param size size of the source data, in bytes
1873  * @param gb bit I/O context specifying the current position in the source.
1874  *           data. This function might use this to align the bit position to
1875  *           a whole-byte boundary before calling #ff_copy_bits() on aligned
1876  *           source data
1877  * @param nbits the amount of bits to copy from source to target
1878  *
1879  * @note after calling this function, the current position in the input bit
1880  *       I/O context is undefined.
1881  */
copy_bits(PutBitContext * pb,const uint8_t * data,int size,GetBitContext * gb,int nbits)1882 static void copy_bits(PutBitContext *pb,
1883                       const uint8_t *data, int size,
1884                       GetBitContext *gb, int nbits)
1885 {
1886     int rmn_bytes, rmn_bits;
1887 
1888     rmn_bits = rmn_bytes = get_bits_left(gb);
1889     if (rmn_bits < nbits)
1890         return;
1891     if (nbits > put_bits_left(pb))
1892         return;
1893     rmn_bits &= 7; rmn_bytes >>= 3;
1894     if ((rmn_bits = FFMIN(rmn_bits, nbits)) > 0)
1895         put_bits(pb, rmn_bits, get_bits(gb, rmn_bits));
1896     ff_copy_bits(pb, data + size - rmn_bytes,
1897                  FFMIN(nbits - rmn_bits, rmn_bytes << 3));
1898 }
1899 
1900 /**
1901  * Packet decoding: a packet is anything that the (ASF) demuxer contains,
1902  * and we expect that the demuxer / application provides it to us as such
1903  * (else you'll probably get garbage as output). Every packet has a size of
1904  * ctx->block_align bytes, starts with a packet header (see
1905  * #parse_packet_header()), and then a series of superframes. Superframe
1906  * boundaries may exceed packets, i.e. superframes can split data over
1907  * multiple (two) packets.
1908  *
1909  * For more information about frames, see #synth_superframe().
1910  */
wmavoice_decode_packet(AVCodecContext * ctx,AVFrame * frame,int * got_frame_ptr,AVPacket * avpkt)1911 static int wmavoice_decode_packet(AVCodecContext *ctx, AVFrame *frame,
1912                                   int *got_frame_ptr, AVPacket *avpkt)
1913 {
1914     WMAVoiceContext *s = ctx->priv_data;
1915     GetBitContext *gb = &s->gb;
1916     int size, res, pos;
1917 
1918     /* Packets are sometimes a multiple of ctx->block_align, with a packet
1919      * header at each ctx->block_align bytes. However, FFmpeg's ASF demuxer
1920      * feeds us ASF packets, which may concatenate multiple "codec" packets
1921      * in a single "muxer" packet, so we artificially emulate that by
1922      * capping the packet size at ctx->block_align. */
1923     for (size = avpkt->size; size > ctx->block_align; size -= ctx->block_align);
1924     init_get_bits8(&s->gb, avpkt->data, size);
1925 
1926     /* size == ctx->block_align is used to indicate whether we are dealing with
1927      * a new packet or a packet of which we already read the packet header
1928      * previously. */
1929     if (!(size % ctx->block_align)) { // new packet header
1930         if (!size) {
1931             s->spillover_nbits = 0;
1932             s->nb_superframes = 0;
1933         } else {
1934             if ((res = parse_packet_header(s)) < 0)
1935                 return res;
1936             s->nb_superframes = res;
1937         }
1938 
1939         /* If the packet header specifies a s->spillover_nbits, then we want
1940          * to push out all data of the previous packet (+ spillover) before
1941          * continuing to parse new superframes in the current packet. */
1942         if (s->sframe_cache_size > 0) {
1943             int cnt = get_bits_count(gb);
1944             if (cnt + s->spillover_nbits > avpkt->size * 8) {
1945                 s->spillover_nbits = avpkt->size * 8 - cnt;
1946             }
1947             copy_bits(&s->pb, avpkt->data, size, gb, s->spillover_nbits);
1948             flush_put_bits(&s->pb);
1949             s->sframe_cache_size += s->spillover_nbits;
1950             if ((res = synth_superframe(ctx, frame, got_frame_ptr)) == 0 &&
1951                 *got_frame_ptr) {
1952                 cnt += s->spillover_nbits;
1953                 s->skip_bits_next = cnt & 7;
1954                 res = cnt >> 3;
1955                 return res;
1956             } else
1957                 skip_bits_long (gb, s->spillover_nbits - cnt +
1958                                 get_bits_count(gb)); // resync
1959         } else if (s->spillover_nbits) {
1960             skip_bits_long(gb, s->spillover_nbits);  // resync
1961         }
1962     } else if (s->skip_bits_next)
1963         skip_bits(gb, s->skip_bits_next);
1964 
1965     /* Try parsing superframes in current packet */
1966     s->sframe_cache_size = 0;
1967     s->skip_bits_next = 0;
1968     pos = get_bits_left(gb);
1969     if (s->nb_superframes-- == 0) {
1970         *got_frame_ptr = 0;
1971         return size;
1972     } else if (s->nb_superframes > 0) {
1973         if ((res = synth_superframe(ctx, frame, got_frame_ptr)) < 0) {
1974             return res;
1975         } else if (*got_frame_ptr) {
1976             int cnt = get_bits_count(gb);
1977             s->skip_bits_next = cnt & 7;
1978             res = cnt >> 3;
1979             return res;
1980         }
1981     } else if ((s->sframe_cache_size = pos) > 0) {
1982         /* ... cache it for spillover in next packet */
1983         init_put_bits(&s->pb, s->sframe_cache, SFRAME_CACHE_MAXSIZE);
1984         copy_bits(&s->pb, avpkt->data, size, gb, s->sframe_cache_size);
1985         // FIXME bad - just copy bytes as whole and add use the
1986         // skip_bits_next field
1987     }
1988 
1989     return size;
1990 }
1991 
wmavoice_decode_end(AVCodecContext * ctx)1992 static av_cold int wmavoice_decode_end(AVCodecContext *ctx)
1993 {
1994     WMAVoiceContext *s = ctx->priv_data;
1995 
1996     if (s->do_apf) {
1997         ff_rdft_end(&s->rdft);
1998         ff_rdft_end(&s->irdft);
1999         ff_dct_end(&s->dct);
2000         ff_dct_end(&s->dst);
2001     }
2002 
2003     return 0;
2004 }
2005 
2006 const FFCodec ff_wmavoice_decoder = {
2007     .p.name           = "wmavoice",
2008     .p.long_name      = NULL_IF_CONFIG_SMALL("Windows Media Audio Voice"),
2009     .p.type           = AVMEDIA_TYPE_AUDIO,
2010     .p.id             = AV_CODEC_ID_WMAVOICE,
2011     .priv_data_size   = sizeof(WMAVoiceContext),
2012     .init             = wmavoice_decode_init,
2013     .close            = wmavoice_decode_end,
2014     FF_CODEC_DECODE_CB(wmavoice_decode_packet),
2015     .p.capabilities   = AV_CODEC_CAP_SUBFRAMES | AV_CODEC_CAP_DR1 | AV_CODEC_CAP_DELAY,
2016     .caps_internal    = FF_CODEC_CAP_INIT_THREADSAFE | FF_CODEC_CAP_INIT_CLEANUP,
2017     .flush            = wmavoice_flush,
2018 };
2019