FFmpeg
 All Data Structures Namespaces Files Functions Variables Typedefs Enumerations Enumerator Macros Groups Pages
wmavoice.c
Go to the documentation of this file.
1 /*
2  * Windows Media Audio Voice decoder.
3  * Copyright (c) 2009 Ronald S. Bultje
4  *
5  * This file is part of FFmpeg.
6  *
7  * FFmpeg is free software; you can redistribute it and/or
8  * modify it under the terms of the GNU Lesser General Public
9  * License as published by the Free Software Foundation; either
10  * version 2.1 of the License, or (at your option) any later version.
11  *
12  * FFmpeg is distributed in the hope that it will be useful,
13  * but WITHOUT ANY WARRANTY; without even the implied warranty of
14  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
15  * Lesser General Public License for more details.
16  *
17  * You should have received a copy of the GNU Lesser General Public
18  * License along with FFmpeg; if not, write to the Free Software
19  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
20  */
21 
22 /**
23  * @file
24  * @brief Windows Media Audio Voice compatible decoder
25  * @author Ronald S. Bultje <rsbultje@gmail.com>
26  */
27 
28 #include <math.h>
29 
31 #include "libavutil/float_dsp.h"
32 #include "libavutil/mem.h"
33 #include "avcodec.h"
34 #include "internal.h"
35 #include "get_bits.h"
36 #include "put_bits.h"
37 #include "wmavoice_data.h"
38 #include "celp_filters.h"
39 #include "acelp_vectors.h"
40 #include "acelp_filters.h"
41 #include "lsp.h"
42 #include "dct.h"
43 #include "rdft.h"
44 #include "sinewin.h"
45 
/** Maximum number of blocks per frame. */
#define MAX_BLOCKS           8

/** Maximum filter order. */
#define MAX_LSPS             16

/**
 * Same value as #MAX_LSPS; it needs to be a multiple of 16 for ASM input
 * buffer alignment.
 */
#define MAX_LSPS_ALIGN16     16

/** Maximum number of frames per superframe. */
#define MAX_FRAMES           3

/** Maximum number of samples per frame. */
#define MAX_FRAMESIZE        160

/** Maximum excitation signal history. */
#define MAX_SIGNAL_HISTORY   416

/** Maximum number of samples per superframe. */
#define MAX_SFRAMESIZE       (MAX_FRAMESIZE * MAX_FRAMES)

/** Maximum cache size for frame data that was split over two packets. */
#define SFRAME_CACHE_MAXSIZE 256

/** Number of bits to read per VLC iteration. */
#define VLC_NBITS            6
58 
59 /**
60  * Frame type VLC coding.
61  */
63 
/**
 * Adaptive codebook (ACB) types.
 */
enum {
    /** No adaptive codebook (only the hardcoded fixed codebook is used). */
    ACB_TYPE_NONE       = 0,

    /**
     * Adaptive codebook with a per-frame pitch, which is interpolated to
     * obtain a per-sample pitch. The signal is generated using an
     * asymmetric sinc window function.
     * @note see #wmavoice_ipol1_coeffs
     */
    ACB_TYPE_ASYMMETRIC = 1,

    /**
     * Per-block pitch, with signal generation using a Hamming sinc window
     * function.
     * @note see #wmavoice_ipol2_coeffs
     */
    ACB_TYPE_HAMMING    = 2
};
78 
/**
 * Fixed codebook (FCB) types.
 */
enum {
    /**
     * Comfort noise during silence, generated from a hardcoded (fixed)
     * codebook with per-frame (low) gain values.
     */
    FCB_TYPE_SILENCE    = 0,

    /** Hardcoded (fixed) codebook with per-block gain values. */
    FCB_TYPE_HARDCODED  = 1,

    /**
     * Pitch-adaptive window (AW) pulse signals, used in particular for
     * low-bitrate streams.
     */
    FCB_TYPE_AW_PULSES  = 2,

    /**
     * Innovation (fixed) codebook pulse sets, in combinations of either
     * single pulses or pulse pairs.
     */
    FCB_TYPE_EXC_PULSES = 3,
};
94 
95 /**
96  * Description of frame types.
 *
 * Each initializer row lists, in member order: n_blocks, log_n_blocks,
 * acb_type, fcb_type and dbl_pulses.
97  */
98 static const struct frame_type_desc {
99  uint8_t n_blocks; ///< number of blocks per frame (each block
100  ///< contains 160/#n_blocks samples)
101  uint8_t log_n_blocks; ///< log2(#n_blocks)
102  uint8_t acb_type; ///< Adaptive codebook type (ACB_TYPE_*)
103  uint8_t fcb_type; ///< Fixed codebook type (FCB_TYPE_*)
104  uint8_t dbl_pulses; ///< how many pulse vectors have pulse pairs
105  ///< (rather than just one single pulse);
106  ///< only if #fcb_type == #FCB_TYPE_EXC_PULSES
107 } frame_descs[17] = {
108  { 1, 0, ACB_TYPE_NONE, FCB_TYPE_SILENCE, 0 },
109  { 2, 1, ACB_TYPE_NONE, FCB_TYPE_HARDCODED, 0 },
 /* NOTE(review): only two of the 17 declared entries are visible here; the
  * embedded numbering jumps from 109 to 125, so the remaining initializers
  * appear to have been dropped from this listing. As written, the other 15
  * entries would be zero-initialized. Restore them from the upstream file. */
125 };
126 
127 /**
128  * WMA Voice decoding context.
129  */
130 typedef struct WMAVoiceContext {
131  /**
132  * @name Global values specified in the stream header / extradata or used all over.
133  * @{
134  */
135  GetBitContext gb; ///< packet bitreader. During decoder init,
136  ///< it contains the extradata from the
137  ///< demuxer. During decoding, it contains
138  ///< packet data.
139  int8_t vbm_tree[25]; ///< converts VLC codes to frame type
140 
141  int spillover_bitsize; ///< number of bits used to specify
142  ///< #spillover_nbits in the packet header
143  ///< = ceil(log2(ctx->block_align << 3))
144  int history_nsamples; ///< number of samples in history for signal
145  ///< prediction (through ACB)
146 
147  /* postfilter specific values */
148  int do_apf; ///< whether to apply the averaged
149  ///< projection filter (APF)
150  int denoise_strength; ///< strength of denoising in Wiener filter
151  ///< [0-11]
152  int denoise_tilt_corr; ///< Whether to apply tilt correction to the
153  ///< Wiener filter coefficients (postfilter)
154  int dc_level; ///< Predicted amount of DC noise, based
155  ///< on which a DC removal filter is used
156 
157  int lsps; ///< number of LSPs per frame [10 or 16]
158  int lsp_q_mode; ///< defines quantizer defaults [0, 1]
159  int lsp_def_mode; ///< defines different sets of LSP defaults
160  ///< [0, 1]
161 
162  int min_pitch_val; ///< base value for pitch parsing code
163  int max_pitch_val; ///< max value + 1 for pitch parsing
164  int pitch_nbits; ///< number of bits used to specify the
165  ///< pitch value in the frame header
166  int block_pitch_nbits; ///< number of bits used to specify the
167  ///< first block's pitch value
168  int block_pitch_range; ///< range of the block pitch
169  int block_delta_pitch_nbits; ///< number of bits used to specify the
170  ///< delta pitch between this and the last
171  ///< block's pitch value, used in all but
172  ///< first block
173  int block_delta_pitch_hrange; ///< 1/2 range of the delta (full range is
174  ///< from -this to +this-1)
175  uint16_t block_conv_table[4]; ///< boundaries for block pitch unit/scale
176  ///< conversion
177 
178  /**
179  * @}
180  *
181  * @name Packet values specified in the packet header or related to a packet.
182  *
183  * A packet is considered to be a single unit of data provided to this
184  * decoder by the demuxer.
185  * @{
186  */
187  int spillover_nbits; ///< number of bits of the previous packet's
188  ///< last superframe preceding this
189  ///< packet's first full superframe (useful
190  ///< for re-synchronization also)
191  int has_residual_lsps; ///< if set, superframes contain one set of
192  ///< LSPs that cover all frames, encoded as
193  ///< independent and residual LSPs; if not
194  ///< set, each frame contains its own, fully
195  ///< independent, LSPs
196  int skip_bits_next; ///< number of bits to skip at the next call
197  ///< to #wmavoice_decode_packet() (since
198  ///< they're part of the previous superframe)
199 
201  ///< cache for superframe data split over
202  ///< multiple packets
203  int sframe_cache_size; ///< set to >0 if we have data from an
204  ///< (incomplete) superframe from a previous
205  ///< packet that spilled over in the current
206  ///< packet; specifies the amount of bits in
207  ///< #sframe_cache
208  PutBitContext pb; ///< bitstream writer for #sframe_cache
209 
210  /**
211  * @}
212  *
213  * @name Frame and superframe values
214  * Superframe and frame data - these can change from frame to frame,
215  * although some of them do in that case serve as a cache / history for
216  * the next frame or superframe.
217  * @{
218  */
219  double prev_lsps[MAX_LSPS]; ///< LSPs of the last frame of the previous
220  ///< superframe
221  int last_pitch_val; ///< pitch value of the previous frame
222  int last_acb_type; ///< frame type [0-2] of the previous frame
223  int pitch_diff_sh16; ///< ((cur_pitch_val - #last_pitch_val)
224  ///< << 16) / #MAX_FRAMESIZE
225  float silence_gain; ///< set for use in blocks if #ACB_TYPE_NONE
226 
227  int aw_idx_is_ext; ///< whether the AW index was encoded in
228  ///< 8 bits (instead of 6)
229  int aw_pulse_range; ///< the range over which #aw_pulse_set1()
230  ///< can apply the pulse, relative to the
231  ///< value in aw_first_pulse_off. The exact
232  ///< position of the first AW-pulse is within
233  ///< [pulse_off, pulse_off + this], and
234  ///< depends on bitstream values; [16 or 24]
235  int aw_n_pulses[2]; ///< number of AW-pulses in each block; note
236  ///< that this number can be negative (in
237  ///< which case it basically means "zero")
238  int aw_first_pulse_off[2]; ///< index of first sample to which to
239  ///< apply AW-pulses, or -0xff if unset
240  int aw_next_pulse_off_cache; ///< the position (relative to start of the
241  ///< second block) at which pulses should
242  ///< start to be positioned, serves as a
243  ///< cache for pitch-adaptive window pulses
244  ///< between blocks
245 
246  int frame_cntr; ///< current frame index [0 - 0xFFFE]; is
247  ///< only used for comfort noise in #pRNG()
248  int nb_superframes; ///< number of superframes in current packet
249  float gain_pred_err[6]; ///< cache for gain prediction
251  ///< cache of the signal of previous
252  ///< superframes, used as a history for
253  ///< signal generation
254  float synth_history[MAX_LSPS]; ///< see #excitation_history
255  /**
256  * @}
257  *
258  * @name Postfilter values
259  *
260  * Variables used for postfilter implementation, mostly history for
261  * smoothing and so on, and context variables for FFT/iFFT.
262  * @{
263  */
264  RDFTContext rdft, irdft; ///< contexts for FFT-calculation in the
265  ///< postfilter (for denoise filter)
266  DCTContext dct, dst; ///< contexts for phase shift (in Hilbert
267  ///< transform, part of postfilter)
268  float sin[511], cos[511]; ///< 8-bit cosine/sine windows over [-pi,pi]
269  ///< range
270  float postfilter_agc; ///< gain control memory, used in
271  ///< #adaptive_gain_control()
272  float dcf_mem[2]; ///< DC filter history
274  ///< zero filter output (i.e. excitation)
275  ///< by postfilter
277  int denoise_filter_cache_size; ///< samples in #denoise_filter_cache
278  DECLARE_ALIGNED(32, float, tilted_lpcs_pf)[0x80];
279  ///< aligned buffer for LPC tilting
281  ///< aligned buffer for denoise coefficients
283  ///< aligned buffer for postfilter speech
284  ///< synthesis
285  /**
286  * @}
287  */
289 
290 /**
291  * Set up the variable bit mode (VBM) tree from container extradata.
292  * @param gb bit I/O context.
293  * The bit context (s->gb) should be loaded with byte 23-46 of the
294  * container extradata (i.e. the ones containing the VBM tree).
295  * @param vbm_tree pointer to array to which the decoded VBM tree will be
296  * written.
297  * @return 0 on success, <0 on error.
298  */
299 static av_cold int decode_vbmtree(GetBitContext *gb, int8_t vbm_tree[25])
300 {
301  int cntr[8] = { 0 }, n, res;
302 
303  memset(vbm_tree, 0xff, sizeof(vbm_tree[0]) * 25);
304  for (n = 0; n < 17; n++) {
305  res = get_bits(gb, 3);
306  if (cntr[res] > 3) // should be >= 3 + (res == 7))
307  return -1;
308  vbm_tree[res * 3 + cntr[res]++] = n;
309  }
310  return 0;
311 }
312 
314 {
315  static const uint8_t bits[] = {
316  2, 2, 2, 4, 4, 4,
317  6, 6, 6, 8, 8, 8,
318  10, 10, 10, 12, 12, 12,
319  14, 14, 14, 14
320  };
321  static const uint16_t codes[] = {
322  0x0000, 0x0001, 0x0002, // 00/01/10
323  0x000c, 0x000d, 0x000e, // 11+00/01/10
324  0x003c, 0x003d, 0x003e, // 1111+00/01/10
325  0x00fc, 0x00fd, 0x00fe, // 111111+00/01/10
326  0x03fc, 0x03fd, 0x03fe, // 11111111+00/01/10
327  0x0ffc, 0x0ffd, 0x0ffe, // 1111111111+00/01/10
328  0x3ffc, 0x3ffd, 0x3ffe, 0x3fff // 111111111111+xx
329  };
330 
331  INIT_VLC_STATIC(&frame_type_vlc, VLC_NBITS, sizeof(bits),
332  bits, 1, 1, codes, 2, 2, 132);
333 }
334 
336 {
337  WMAVoiceContext *s = ctx->priv_data;
338  int n;
339 
340  s->postfilter_agc = 0;
341  s->sframe_cache_size = 0;
342  s->skip_bits_next = 0;
343  for (n = 0; n < s->lsps; n++)
344  s->prev_lsps[n] = M_PI * (n + 1.0) / (s->lsps + 1.0);
345  memset(s->excitation_history, 0,
346  sizeof(*s->excitation_history) * MAX_SIGNAL_HISTORY);
347  memset(s->synth_history, 0,
348  sizeof(*s->synth_history) * MAX_LSPS);
349  memset(s->gain_pred_err, 0,
350  sizeof(s->gain_pred_err));
351 
352  if (s->do_apf) {
353  memset(&s->synth_filter_out_buf[MAX_LSPS_ALIGN16 - s->lsps], 0,
354  sizeof(*s->synth_filter_out_buf) * s->lsps);
355  memset(s->dcf_mem, 0,
356  sizeof(*s->dcf_mem) * 2);
357  memset(s->zero_exc_pf, 0,
358  sizeof(*s->zero_exc_pf) * s->history_nsamples);
359  memset(s->denoise_filter_cache, 0, sizeof(s->denoise_filter_cache));
360  }
361 }
362 
363 /**
364  * Set up decoder with parameters from demuxer (extradata etc.).
365  */
367 {
368  int n, flags, pitch_range, lsp16_flag;
369  WMAVoiceContext *s = ctx->priv_data;
370 
371  /**
372  * Extradata layout:
373  * - byte 0-18: WMAPro-in-WMAVoice extradata (see wmaprodec.c),
374  * - byte 19-22: flags field (annoyingly in LE; see below for known
375  * values),
376  * - byte 23-46: variable bitmode tree (really just 17 * 3 bits,
377  * rest is 0).
378  */
379  if (ctx->extradata_size != 46) {
380  av_log(ctx, AV_LOG_ERROR,
381  "Invalid extradata size %d (should be 46)\n",
382  ctx->extradata_size);
383  return AVERROR_INVALIDDATA;
384  }
385  if (ctx->block_align <= 0) {
386  av_log(ctx, AV_LOG_ERROR, "Invalid block alignment %d.\n", ctx->block_align);
387  return AVERROR_INVALIDDATA;
388  }
389 
390  flags = AV_RL32(ctx->extradata + 18);
391  s->spillover_bitsize = 3 + av_ceil_log2(ctx->block_align);
392  s->do_apf = flags & 0x1;
393  if (s->do_apf) {
394  ff_rdft_init(&s->rdft, 7, DFT_R2C);
395  ff_rdft_init(&s->irdft, 7, IDFT_C2R);
396  ff_dct_init(&s->dct, 6, DCT_I);
397  ff_dct_init(&s->dst, 6, DST_I);
398 
399  ff_sine_window_init(s->cos, 256);
400  memcpy(&s->sin[255], s->cos, 256 * sizeof(s->cos[0]));
401  for (n = 0; n < 255; n++) {
402  s->sin[n] = -s->sin[510 - n];
403  s->cos[510 - n] = s->cos[n];
404  }
405  }
406  s->denoise_strength = (flags >> 2) & 0xF;
407  if (s->denoise_strength >= 12) {
408  av_log(ctx, AV_LOG_ERROR,
409  "Invalid denoise filter strength %d (max=11)\n",
410  s->denoise_strength);
411  return AVERROR_INVALIDDATA;
412  }
413  s->denoise_tilt_corr = !!(flags & 0x40);
414  s->dc_level = (flags >> 7) & 0xF;
415  s->lsp_q_mode = !!(flags & 0x2000);
416  s->lsp_def_mode = !!(flags & 0x4000);
417  lsp16_flag = flags & 0x1000;
418  if (lsp16_flag) {
419  s->lsps = 16;
420  } else {
421  s->lsps = 10;
422  }
423  for (n = 0; n < s->lsps; n++)
424  s->prev_lsps[n] = M_PI * (n + 1.0) / (s->lsps + 1.0);
425 
426  init_get_bits(&s->gb, ctx->extradata + 22, (ctx->extradata_size - 22) << 3);
427  if (decode_vbmtree(&s->gb, s->vbm_tree) < 0) {
428  av_log(ctx, AV_LOG_ERROR, "Invalid VBM tree; broken extradata?\n");
429  return AVERROR_INVALIDDATA;
430  }
431 
432  s->min_pitch_val = ((ctx->sample_rate << 8) / 400 + 50) >> 8;
433  s->max_pitch_val = ((ctx->sample_rate << 8) * 37 / 2000 + 50) >> 8;
434  pitch_range = s->max_pitch_val - s->min_pitch_val;
435  if (pitch_range <= 0) {
436  av_log(ctx, AV_LOG_ERROR, "Invalid pitch range; broken extradata?\n");
437  return AVERROR_INVALIDDATA;
438  }
439  s->pitch_nbits = av_ceil_log2(pitch_range);
440  s->last_pitch_val = 40;
442  s->history_nsamples = s->max_pitch_val + 8;
443 
445  int min_sr = ((((1 << 8) - 50) * 400) + 0xFF) >> 8,
446  max_sr = ((((MAX_SIGNAL_HISTORY - 8) << 8) + 205) * 2000 / 37) >> 8;
447 
448  av_log(ctx, AV_LOG_ERROR,
449  "Unsupported samplerate %d (min=%d, max=%d)\n",
450  ctx->sample_rate, min_sr, max_sr); // 322-22097 Hz
451 
452  return AVERROR(ENOSYS);
453  }
454 
455  s->block_conv_table[0] = s->min_pitch_val;
456  s->block_conv_table[1] = (pitch_range * 25) >> 6;
457  s->block_conv_table[2] = (pitch_range * 44) >> 6;
458  s->block_conv_table[3] = s->max_pitch_val - 1;
459  s->block_delta_pitch_hrange = (pitch_range >> 3) & ~0xF;
460  if (s->block_delta_pitch_hrange <= 0) {
461  av_log(ctx, AV_LOG_ERROR, "Invalid delta pitch hrange; broken extradata?\n");
462  return AVERROR_INVALIDDATA;
463  }
464  s->block_delta_pitch_nbits = 1 + av_ceil_log2(s->block_delta_pitch_hrange);
466  s->block_conv_table[3] + 1 +
467  2 * (s->block_conv_table[1] - 2 * s->min_pitch_val);
468  s->block_pitch_nbits = av_ceil_log2(s->block_pitch_range);
469 
470  ctx->channels = 1;
473 
474  return 0;
475 }
476 
477 /**
478  * @name Postfilter functions
479  * Postfilter functions (gain control, wiener denoise filter, DC filter,
480  * kalman smoothening, plus surrounding code to wrap it)
481  * @{
482  */
/**
 * Adaptive gain control (as used in the postfilter).
 *
 * Identical to #ff_adaptive_gain_control() in acelp_vectors.c, except
 * that the energy here is calculated using sum(abs(...)), whereas the
 * other codecs (e.g. AMR-NB, SIPRO) use sqrt(dotproduct(...)).
 *
 * The per-sample gain is an exponentially smoothed version of
 * (1 - alpha) * sum(|speech_synth|) / sum(|in|); if sum(|in|) is zero,
 * the target term is zero and the gain memory simply decays by alpha.
 *
 * @param out          output buffer for the gain-corrected samples
 * @param in           input buffer containing the samples as they are
 *                     after the postfilter steps so far
 * @param speech_synth input buffer containing speech synth before postfilter
 * @param size         number of samples in each of the buffers
 * @param alpha        exponential filter factor
 * @param gain_mem     pointer to filter memory (single float); read on
 *                     entry and updated with the last gain on exit
 */
static void adaptive_gain_control(float *out, const float *in,
                                  const float *speech_synth,
                                  int size, float alpha, float *gain_mem)
{
    float synth_sum = 0.0, filtered_sum = 0.0, target;
    float gain = *gain_mem;
    int k;

    /* energies are approximated by sums of absolute values */
    for (k = 0; k < size; k++) {
        synth_sum    += fabsf(speech_synth[k]);
        filtered_sum += fabsf(in[k]);
    }

    if (filtered_sum == 0.0)
        target = 0.0;
    else
        target = (1.0 - alpha) * synth_sum / filtered_sum;

    /* smooth the gain sample by sample while applying it */
    for (k = 0; k < size; k++) {
        gain   = alpha * gain + target;
        out[k] = in[k] * gain;
    }

    *gain_mem = gain;
}
520 
521 /**
522  * Kalman smoothing function.
523  *
524  * This function looks back pitch +/- 3 samples back into history to find
525  * the best fitting curve (that one giving the optimal gain of the two
526  * signals, i.e. the highest dot product between the two), and then
527  * uses that signal history to smoothen the output of the speech synthesis
528  * filter.
529  *
530  * @param s WMA Voice decoding context
531  * @param pitch pitch of the speech signal
532  * @param in input speech signal
533  * @param out output pointer for smoothened signal
534  * @param size input/output buffer size
535  *
536  * @returns -1 if no smoothening took place, e.g. because no optimal
537  * fit could be found, or 0 on success.
538  */
539 static int kalman_smoothen(WMAVoiceContext *s, int pitch,
540  const float *in, float *out, int size)
541 {
542  int n;
543  float optimal_gain = 0, dot;
544  const float *ptr = &in[-FFMAX(s->min_pitch_val, pitch - 3)],
545  *end = &in[-FFMIN(s->max_pitch_val, pitch + 3)],
546  *best_hist_ptr = NULL;
547 
548  /* find best fitting point in history */
549  do {
550  dot = avpriv_scalarproduct_float_c(in, ptr, size);
551  if (dot > optimal_gain) {
552  optimal_gain = dot;
553  best_hist_ptr = ptr;
554  }
555  } while (--ptr >= end);
556 
557  if (optimal_gain <= 0)
558  return -1;
559  dot = avpriv_scalarproduct_float_c(best_hist_ptr, best_hist_ptr, size);
560  if (dot <= 0) // would be 1.0
561  return -1;
562 
563  if (optimal_gain <= dot) {
564  dot = dot / (dot + 0.6 * optimal_gain); // 0.625-1.000
565  } else
566  dot = 0.625;
567 
568  /* actual smoothing */
569  for (n = 0; n < size; n++)
570  out[n] = best_hist_ptr[n] + dot * (in[n] - best_hist_ptr[n]);
571 
572  return 0;
573 }
574 
/**
 * Get the tilt factor of a formant filter from its transfer function.
 *
 * The result is (lpcs[0] + sum(lpcs[i] * lpcs[i + 1])) divided by
 * (1 + sum(lpcs[i] * lpcs[i])).
 *
 * @see #tilt_factor() in amrnbdec.c, which does essentially the same,
 *      but somehow (??) it does a speech synthesis filter in the
 *      middle, which is missing here
 *
 * @param lpcs   LPC coefficients
 * @param n_lpcs Size of LPC buffer
 * @returns the tilt factor
 */
static float tilt_factor(const float *lpcs, int n_lpcs)
{
    const float energy =
        1.0 + avpriv_scalarproduct_float_c(lpcs, lpcs, n_lpcs);
    const float lag1 =
        lpcs[0] + avpriv_scalarproduct_float_c(lpcs, lpcs + 1, n_lpcs - 1);

    return lag1 / energy;
}
594 
595 /**
596  * Derive denoise filter coefficients (in real domain) from the LPCs.
 *
 * @param s         WMA Voice decoding context; its RDFT/IRDFT and DCT/DST
 *                  contexts, sin/cos tables and denoise_tilt_corr flag are
 *                  used
 * @param lpcs      in/out work buffer: it is transformed in place by the
 *                  RDFT and then overwritten with intermediate values, so
 *                  its input contents are destroyed; indexes up to 127
 *                  are accessed
 * @param fcb_type  fixed codebook type; #FCB_TYPE_HARDCODED selects a
 *                  gain multiplier of 5/13, all other types 5/14.7
 * @param coeffs    output buffer for the filter coefficients; indexes up
 *                  to 127 are written
 * @param remainder number of leading coefficients that are kept and
 *                  normalized; entries from remainder up to 127 are
 *                  zeroed
597  */
598 static void calc_input_response(WMAVoiceContext *s, float *lpcs,
599  int fcb_type, float *coeffs, int remainder)
600 {
601  float last_coeff, min = 15.0, max = -15.0;
602  float irange, angle_mul, gain_mul, range, sq;
603  int n, idx;
604 
605  /* Create frequency power spectrum of speech input (i.e. RDFT of LPCs) */
606  s->rdft.rdft_calc(&s->rdft, lpcs);
 /* log_range() stores log10(assign) in var and widens [min, max] to
  * include that value */
607 #define log_range(var, assign) do { \
608  float tmp = log10f(assign); var = tmp; \
609  max = FFMAX(max, tmp); min = FFMIN(min, tmp); \
610  } while (0)
611  log_range(last_coeff, lpcs[1] * lpcs[1]);
612  for (n = 1; n < 64; n++)
613  log_range(lpcs[n], lpcs[n * 2] * lpcs[n * 2] +
614  lpcs[n * 2 + 1] * lpcs[n * 2 + 1]);
615  log_range(lpcs[0], lpcs[0] * lpcs[0]);
616 #undef log_range
617  range = max - min;
618  lpcs[64] = last_coeff;
619 
620  /* Now, use this spectrum to pick out these frequencies with higher
621  * (relative) power/energy (which we then take to be "not noise"),
622  * and set up a table (still in lpc[]) of (relative) gains per frequency.
623  * These frequencies will be maintained, while others ("noise") will be
624  * decreased in the filter output. */
625  irange = 64.0 / range; // so irange*(max-value) is in the range [0, 63]
626  gain_mul = range * (fcb_type == FCB_TYPE_HARDCODED ? (5.0 / 13.0) :
627  (5.0 / 14.7));
628  angle_mul = gain_mul * (8.0 * M_LN10 / M_PI);
629  for (n = 0; n <= 64; n++) {
630  float pwr;
631 
632  idx = FFMAX(0, lrint((max - lpcs[n]) * irange) - 1);
 /* NOTE(review): pwr is read below without any visible assignment. The
  * embedded numbering skips from 632 to 634, so the statement that sets
  * it (presumably a lookup based on idx) appears to be missing from this
  * listing -- confirm against the upstream file before building. */
634  lpcs[n] = angle_mul * pwr;
635 
636  /* 70.57 =~ 1/log10(1.0331663) */
637  idx = (pwr * gain_mul - 0.0295) * 70.570526123;
638  if (idx > 127) { // fall back if index falls outside table range
639  coeffs[n] = wmavoice_energy_table[127] *
640  powf(1.0331663, idx - 127);
641  } else
642  coeffs[n] = wmavoice_energy_table[FFMAX(0, idx)];
643  }
644 
645  /* calculate the Hilbert transform of the gains, which we do (since this
646  * is a sine input) by doing a phase shift (in theory, H(sin())=cos()).
647  * Hilbert_Transform(RDFT(x)) = Laplace_Transform(x), which calculates the
648  * "moment" of the LPCs in this filter. */
649  s->dct.dct_calc(&s->dct, lpcs);
650  s->dst.dct_calc(&s->dst, lpcs);
651 
652  /* Split out the coefficient indexes into phase/magnitude pairs */
653  idx = 255 + av_clip(lpcs[64], -255, 255);
654  coeffs[0] = coeffs[0] * s->cos[idx];
655  idx = 255 + av_clip(lpcs[64] - 2 * lpcs[63], -255, 255);
656  last_coeff = coeffs[64] * s->cos[idx];
 /* n is decremented twice per iteration (once in the body, once in the
  * for statement); the two halves differ in the sign applied to lpcs[64] */
657  for (n = 63;; n--) {
658  idx = 255 + av_clip(-lpcs[64] - 2 * lpcs[n - 1], -255, 255);
659  coeffs[n * 2 + 1] = coeffs[n] * s->sin[idx];
660  coeffs[n * 2] = coeffs[n] * s->cos[idx];
661 
662  if (!--n) break;
663 
664  idx = 255 + av_clip( lpcs[64] - 2 * lpcs[n - 1], -255, 255);
665  coeffs[n * 2 + 1] = coeffs[n] * s->sin[idx];
666  coeffs[n * 2] = coeffs[n] * s->cos[idx];
667  }
668  coeffs[1] = last_coeff;
669 
670  /* move into real domain */
671  s->irdft.rdft_calc(&s->irdft, coeffs);
672 
673  /* tilt correction and normalize scale */
674  memset(&coeffs[remainder], 0, sizeof(coeffs[0]) * (128 - remainder));
675  if (s->denoise_tilt_corr) {
676  float tilt_mem = 0;
677 
678  coeffs[remainder - 1] = 0;
679  ff_tilt_compensation(&tilt_mem,
680  -1.8 * tilt_factor(coeffs, remainder - 1),
681  coeffs, remainder);
682  }
 /* scale so that the first remainder coefficients have an L2 norm of 1/64 */
683  sq = (1.0 / 64.0) * sqrtf(1 / avpriv_scalarproduct_float_c(coeffs, coeffs,
684  remainder));
685  for (n = 0; n < remainder; n++)
686  coeffs[n] *= sq;
687 }
688 
689 /**
690  * This function applies a Wiener filter on the (noisy) speech signal as
691  * a means to denoise it.
692  *
693  * - take RDFT of LPCs to get the power spectrum of the noise + speech;
694  * - using this power spectrum, calculate (for each frequency) the Wiener
695  * filter gain, which depends on the frequency power and desired level
696  * of noise subtraction (when set too high, this leads to artifacts)
697  * We can do this symmetrically over the X-axis (so 0-4kHz is the inverse
698  * of 4-8kHz);
699  * - by doing a phase shift, calculate the Hilbert transform of this array
700  * of per-frequency filter-gains to get the filtering coefficients;
701  * - smoothen/normalize/de-tilt these filter coefficients as desired;
702  * - take RDFT of noisy sound, apply the coefficients and take its IRDFT
703  * to get the denoised speech signal;
704  * - the leftover (i.e. output of the IRDFT on denoised speech data beyond
705  * the frame boundary) are saved and applied to subsequent frames by an
706  * overlap-add method (otherwise you get clicking-artifacts).
707  *
708  * @param s WMA Voice decoding context
709  * @param fcb_type Frame (codebook) type
710  * @param synth_pf input: the noisy speech signal, output: denoised speech
711  * data; should be 16-byte aligned (for ASM purposes)
712  * @param size size of the speech data
713  * @param lpcs LPCs used to synthesize this frame's speech data
714  */
715 static void wiener_denoise(WMAVoiceContext *s, int fcb_type,
716  float *synth_pf, int size,
717  const float *lpcs)
718 {
719  int remainder, lim, n;
720 
721  if (fcb_type != FCB_TYPE_SILENCE) {
722  float *tilted_lpcs = s->tilted_lpcs_pf,
723  *coeffs = s->denoise_coeffs_pf, tilt_mem = 0;
724 
725  tilted_lpcs[0] = 1.0;
726  memcpy(&tilted_lpcs[1], lpcs, sizeof(lpcs[0]) * s->lsps);
727  memset(&tilted_lpcs[s->lsps + 1], 0,
728  sizeof(tilted_lpcs[0]) * (128 - s->lsps - 1));
729  ff_tilt_compensation(&tilt_mem, 0.7 * tilt_factor(lpcs, s->lsps),
730  tilted_lpcs, s->lsps + 2);
731 
732  /* The IRDFT output (127 samples for 7-bit filter) beyond the frame
733  * size is applied to the next frame. All input beyond this is zero,
734  * and thus all output beyond this will go towards zero, hence we can
735  * limit to min(size-1, 127-size) as a performance consideration. */
736  remainder = FFMIN(127 - size, size - 1);
737  calc_input_response(s, tilted_lpcs, fcb_type, coeffs, remainder);
738 
739  /* apply coefficients (in frequency spectrum domain), i.e. complex
740  * number multiplication */
741  memset(&synth_pf[size], 0, sizeof(synth_pf[0]) * (128 - size));
742  s->rdft.rdft_calc(&s->rdft, synth_pf);
743  s->rdft.rdft_calc(&s->rdft, coeffs);
744  synth_pf[0] *= coeffs[0];
745  synth_pf[1] *= coeffs[1];
746  for (n = 1; n < 64; n++) {
747  float v1 = synth_pf[n * 2], v2 = synth_pf[n * 2 + 1];
748  synth_pf[n * 2] = v1 * coeffs[n * 2] - v2 * coeffs[n * 2 + 1];
749  synth_pf[n * 2 + 1] = v2 * coeffs[n * 2] + v1 * coeffs[n * 2 + 1];
750  }
751  s->irdft.rdft_calc(&s->irdft, synth_pf);
752  }
753 
754  /* merge filter output with the history of previous runs */
755  if (s->denoise_filter_cache_size) {
756  lim = FFMIN(s->denoise_filter_cache_size, size);
757  for (n = 0; n < lim; n++)
758  synth_pf[n] += s->denoise_filter_cache[n];
759  s->denoise_filter_cache_size -= lim;
760  memmove(s->denoise_filter_cache, &s->denoise_filter_cache[size],
762  }
763 
764  /* move remainder of filter output into a cache for future runs */
765  if (fcb_type != FCB_TYPE_SILENCE) {
766  lim = FFMIN(remainder, s->denoise_filter_cache_size);
767  for (n = 0; n < lim; n++)
768  s->denoise_filter_cache[n] += synth_pf[size + n];
769  if (lim < remainder) {
770  memcpy(&s->denoise_filter_cache[lim], &synth_pf[size + lim],
771  sizeof(s->denoise_filter_cache[0]) * (remainder - lim));
772  s->denoise_filter_cache_size = remainder;
773  }
774  }
775 }
776 
777 /**
778  * Averaging projection filter, the postfilter used in WMAVoice.
779  *
780  * This uses the following steps:
781  * - A zero-synthesis filter (generate excitation from synth signal)
782  * - Kalman smoothing on excitation, based on pitch
783  * - Re-synthesized smoothened output
784  * - Iterative Wiener denoise filter
785  * - Adaptive gain filter
786  * - DC filter
787  *
 * Kalman smoothing is only attempted for frame types
 * >= #FCB_TYPE_AW_PULSES; if it is skipped or fails, the unsmoothed
 * excitation is re-synthesized instead. The DC filter step only runs
 * when the context's dc_level is above 8.
 *
788  * @param s WMAVoice decoding context
789  * @param synth Speech synthesis output (before postfilter)
790  * @param samples Output buffer for filtered samples
791  * @param size Buffer size of synth & samples; asserted to be at most
 *             MAX_FRAMESIZE / 2
792  * @param lpcs Generated LPCs used for speech synthesis
793  * @param zero_exc_pf destination for zero synthesis filter (16-byte aligned)
794  * @param fcb_type Frame type (silence, hardcoded, AW-pulses or FCB-pulses)
795  * @param pitch Pitch of the input signal
796  */
797 static void postfilter(WMAVoiceContext *s, const float *synth,
798  float *samples, int size,
799  const float *lpcs, float *zero_exc_pf,
800  int fcb_type, int pitch)
801 {
802  float synth_filter_in_buf[MAX_FRAMESIZE / 2],
803  *synth_pf = &s->synth_filter_out_buf[MAX_LSPS_ALIGN16],
804  *synth_filter_in = zero_exc_pf;
805 
806  av_assert0(size <= MAX_FRAMESIZE / 2);
807 
808  /* generate excitation from input signal */
809  ff_celp_lp_zero_synthesis_filterf(zero_exc_pf, lpcs, synth, size, s->lsps);
810 
 /* kalman_smoothen() returns 0 on success, in which case the smoothed
  * copy replaces the raw excitation as synthesis filter input */
811  if (fcb_type >= FCB_TYPE_AW_PULSES &&
812  !kalman_smoothen(s, pitch, zero_exc_pf, synth_filter_in_buf, size))
813  synth_filter_in = synth_filter_in_buf;
814 
815  /* re-synthesize speech after smoothening, and keep history */
816  ff_celp_lp_synthesis_filterf(synth_pf, lpcs,
817  synth_filter_in, size, s->lsps);
 /* copy the last lsps output samples in front of synth_pf, where they
  * are available as history for the next call */
818  memcpy(&synth_pf[-s->lsps], &synth_pf[size - s->lsps],
819  sizeof(synth_pf[0]) * s->lsps);
820 
821  wiener_denoise(s, fcb_type, synth_pf, size, lpcs);
822 
823  adaptive_gain_control(samples, synth_pf, synth, size, 0.99,
824  &s->postfilter_agc);
825 
826  if (s->dc_level > 8) {
827  /* remove ultra-low frequency DC noise / highpass filter;
828  * coefficients are identical to those used in SIPR decoding,
829  * and very closely resemble those used in AMR-NB decoding. */
 /* NOTE(review): the opening line of the filter call (callee name and
  * leading arguments) appears to be missing from this listing; the
  * embedded numbering skips from 829 to 831 and only the trailing
  * arguments are left below. Restore it from the upstream file. */
831  (const float[2]) { -1.99997, 1.0 },
832  (const float[2]) { -1.9330735188, 0.93589198496 },
833  0.93980580475, s->dcf_mem, size);
834  }
835 }
836 /**
837  * @}
838  */
839 
/**
 * Dequantize LSPs.
 *
 * The output is cleared first; every stage then adds
 * base_q[stage] + mul_q[stage] * entry to each LSP, where the entries are
 * taken from the row selected by values[stage] in that stage's part of
 * the table. The parts of consecutive stages are stored back to back,
 * each one holding sizes[stage] rows of num entries.
 *
 * @param lsps     output pointer to the array that will hold the LSPs
 * @param num      number of LSPs to be dequantized
 * @param values   quantized values, contains n_stages values
 * @param sizes    range (i.e. max value) of each quantized value
 * @param n_stages number of dequantization runs
 * @param table    dequantization table to be used
 * @param mul_q    LSF multiplier
 * @param base_q   base (lowest) LSF values
 */
static void dequant_lsps(double *lsps, int num,
                         const uint16_t *values,
                         const uint16_t *sizes,
                         int n_stages, const uint8_t *table,
                         const double *mul_q,
                         const double *base_q)
{
    const uint8_t *stage_tab = table;
    int stage, k;

    memset(lsps, 0, num * sizeof(*lsps));

    for (stage = 0; stage < n_stages; stage++) {
        const double offset = base_q[stage];
        const double scale  = mul_q[stage];
        const uint8_t *row  = &stage_tab[values[stage] * num];

        for (k = 0; k < num; k++)
            lsps[k] += offset + scale * row[k];

        /* advance to the table part of the next stage */
        stage_tab += sizes[stage] * num;
    }
}
871 
872 /**
873  * @name LSP dequantization routines
874  * LSP dequantization routines, for 10/16LSPs and independent/residual coding.
875  * lsp10i() consumes 24 bits; lsp10r() consumes an additional 24 bits;
876  * lsp16i() consumes 34 bits; lsp16r() consumes an additional 26 bits.
877  * @{
878  */
879 /**
880  * Parse 10 independently-coded LSPs.
881  */
882 static void dequant_lsp10i(GetBitContext *gb, double *lsps)
883 {
884  static const uint16_t vec_sizes[4] = { 256, 64, 32, 32 };
885  static const double mul_lsf[4] = {
886  5.2187144800e-3, 1.4626986422e-3,
887  9.6179549166e-4, 1.1325736225e-3
888  };
889  static const double base_lsf[4] = {
890  M_PI * -2.15522e-1, M_PI * -6.1646e-2,
891  M_PI * -3.3486e-2, M_PI * -5.7408e-2
892  };
893  uint16_t v[4];
894 
895  v[0] = get_bits(gb, 8);
896  v[1] = get_bits(gb, 6);
897  v[2] = get_bits(gb, 5);
898  v[3] = get_bits(gb, 5);
899 
900  dequant_lsps(lsps, 10, v, vec_sizes, 4, wmavoice_dq_lsp10i,
901  mul_lsf, base_lsf);
902 }
903 
904 /**
905  * Parse 10 independently-coded LSPs, and then derive the tables to
906  * generate LSPs for the other frames from them (residual coding).
907  */
/* NOTE(review): listing line 908 is absent from this copy; it should carry
 * the return type, the function name and the first parameter. The body hands
 * `gb` to get_bits(), so a bit reader named gb is expected there - TODO
 * restore from upstream.
 *
 * Visible behaviour: i_lsps receives the 10 independently coded values,
 * a1 receives 2 x 10 values that move i_lsps towards `old`, and a2 receives
 * 20 dequantized values. */
909  double *i_lsps, const double *old,
910  double *a1, double *a2, int q_mode)
911 {
912  static const uint16_t vec_sizes[3] = { 128, 64, 64 };
913  static const double mul_lsf[3] = {
914  2.5807601174e-3, 1.2354460219e-3, 1.1763821673e-3
915  };
916  static const double base_lsf[3] = {
917  M_PI * -1.07448e-1, M_PI * -5.2706e-2, M_PI * -5.1634e-2
918  };
919  const float (*ipol_tab)[2][10] = q_mode ?
/* NOTE(review): listing line 920 is absent; it should name the two
 * interpolation tables q_mode chooses between - TODO restore from upstream. */
921  uint16_t interpol, v[3];
922  int n;
923 
/* first 24 bits: the independently coded set */
924  dequant_lsp10i(gb, i_lsps);
925 
/* 5 + 7 + 6 + 6 = 24 further bits: interpolation row and 3 stage indices */
926  interpol = get_bits(gb, 5);
927  v[0] = get_bits(gb, 7);
928  v[1] = get_bits(gb, 6);
929  v[2] = get_bits(gb, 6);
930 
/* a1[0..9] and a1[10..19]: i_lsps plus a per-coefficient fraction of
 * (old - i_lsps), the fractions coming from the two halves of the selected
 * ipol_tab row */
931  for (n = 0; n < 10; n++) {
932  double delta = old[n] - i_lsps[n];
933  a1[n] = ipol_tab[interpol][0][n] * delta + i_lsps[n];
934  a1[10 + n] = ipol_tab[interpol][1][n] * delta + i_lsps[n];
935  }
936 
/* a2: 20 values accumulated over 3 stages */
937  dequant_lsps(a2, 20, v, vec_sizes, 3, wmavoice_dq_lsp10r,
938  mul_lsf, base_lsf);
939 }
940 
941 /**
942  * Parse 16 independently-coded LSPs.
943  */
944 static void dequant_lsp16i(GetBitContext *gb, double *lsps)
945 {
946  static const uint16_t vec_sizes[5] = { 256, 64, 128, 64, 128 };
947  static const double mul_lsf[5] = {
948  3.3439586280e-3, 6.9908173703e-4,
949  3.3216608306e-3, 1.0334960326e-3,
950  3.1899104283e-3
951  };
952  static const double base_lsf[5] = {
953  M_PI * -1.27576e-1, M_PI * -2.4292e-2,
954  M_PI * -1.28094e-1, M_PI * -3.2128e-2,
955  M_PI * -1.29816e-1
956  };
957  uint16_t v[5];
958 
959  v[0] = get_bits(gb, 8);
960  v[1] = get_bits(gb, 6);
961  v[2] = get_bits(gb, 7);
962  v[3] = get_bits(gb, 6);
963  v[4] = get_bits(gb, 7);
964 
965  dequant_lsps( lsps, 5, v, vec_sizes, 2,
966  wmavoice_dq_lsp16i1, mul_lsf, base_lsf);
967  dequant_lsps(&lsps[5], 5, &v[2], &vec_sizes[2], 2,
968  wmavoice_dq_lsp16i2, &mul_lsf[2], &base_lsf[2]);
969  dequant_lsps(&lsps[10], 6, &v[4], &vec_sizes[4], 1,
970  wmavoice_dq_lsp16i3, &mul_lsf[4], &base_lsf[4]);
971 }
972 
973 /**
974  * Parse 16 independently-coded LSPs, and then derive the tables to
975  * generate LSPs for the other frames from them (residual coding).
976  */
/* NOTE(review): listing line 977 is absent from this copy; it should carry
 * the return type, the function name and the first parameter. The body hands
 * `gb` to get_bits(), so a bit reader named gb is expected there - TODO
 * restore from upstream.
 *
 * Visible behaviour: i_lsps receives the 16 independently coded values,
 * a1 receives 2 x 16 values that move i_lsps towards `old`, and a2 receives
 * 10 + 10 + 12 = 32 dequantized values. */
978  double *i_lsps, const double *old,
979  double *a1, double *a2, int q_mode)
980 {
981  static const uint16_t vec_sizes[3] = { 128, 128, 128 };
982  static const double mul_lsf[3] = {
983  1.2232979501e-3, 1.4062241527e-3, 1.6114744851e-3
984  };
985  static const double base_lsf[3] = {
986  M_PI * -5.5830e-2, M_PI * -5.2908e-2, M_PI * -5.4776e-2
987  };
988  const float (*ipol_tab)[2][16] = q_mode ?
/* NOTE(review): listing line 989 is absent; it should name the two
 * interpolation tables q_mode chooses between - TODO restore from upstream. */
990  uint16_t interpol, v[3];
991  int n;
992 
/* first 34 bits: the independently coded set */
993  dequant_lsp16i(gb, i_lsps);
994 
/* 5 + 7 + 7 + 7 = 26 further bits: interpolation row and 3 indices */
995  interpol = get_bits(gb, 5);
996  v[0] = get_bits(gb, 7);
997  v[1] = get_bits(gb, 7);
998  v[2] = get_bits(gb, 7);
999 
/* a1[0..15] and a1[16..31]: i_lsps plus a per-coefficient fraction of
 * (old - i_lsps), the fractions coming from the two halves of the selected
 * ipol_tab row */
1000  for (n = 0; n < 16; n++) {
1001  double delta = old[n] - i_lsps[n];
1002  a1[n] = ipol_tab[interpol][0][n] * delta + i_lsps[n];
1003  a1[16 + n] = ipol_tab[interpol][1][n] * delta + i_lsps[n];
1004  }
1005 
/* a2 is filled in three single-stage groups of 10, 10 and 12 values, each
 * with its own index, table, multiplier and base */
1006  dequant_lsps( a2, 10, v, vec_sizes, 1,
1007  wmavoice_dq_lsp16r1, mul_lsf, base_lsf);
1008  dequant_lsps(&a2[10], 10, &v[1], &vec_sizes[1], 1,
1009  wmavoice_dq_lsp16r2, &mul_lsf[1], &base_lsf[1]);
1010  dequant_lsps(&a2[20], 12, &v[2], &vec_sizes[2], 1,
1011  wmavoice_dq_lsp16r3, &mul_lsf[2], &base_lsf[2]);
1012 }
1013 
1014 /**
1015  * @}
1016  * @name Pitch-adaptive window coding functions
1017  * The next few functions are for pitch-adaptive window coding.
1018  * @{
1019  */
1020 /**
1021  * Parse the offset of the first pitch-adaptive window pulses, and
1022  * the distribution of pulses between the two blocks in this frame.
1023  * @param s WMA Voice decoding context private data
1024  * @param gb bit I/O context
1025  * @param pitch pitch for each block in this frame
1026  */
1028  const int *pitch)
1029 {
1030  static const int16_t start_offset[94] = {
1031  -11, -9, -7, -5, -3, -1, 1, 3, 5, 7, 9, 11,
1032  13, 15, 18, 17, 19, 20, 21, 22, 23, 24, 25, 26,
1033  27, 28, 29, 30, 31, 32, 33, 35, 37, 39, 41, 43,
1034  45, 47, 49, 51, 53, 55, 57, 59, 61, 63, 65, 67,
1035  69, 71, 73, 75, 77, 79, 81, 83, 85, 87, 89, 91,
1036  93, 95, 97, 99, 101, 103, 105, 107, 109, 111, 113, 115,
1037  117, 119, 121, 123, 125, 127, 129, 131, 133, 135, 137, 139,
1038  141, 143, 145, 147, 149, 151, 153, 155, 157, 159
1039  };
1040  int bits, offset;
1041 
1042  /* position of pulse */
1043  s->aw_idx_is_ext = 0;
1044  if ((bits = get_bits(gb, 6)) >= 54) {
1045  s->aw_idx_is_ext = 1;
1046  bits += (bits - 54) * 3 + get_bits(gb, 2);
1047  }
1048 
1049  /* for a repeated pulse at pulse_off with a pitch_lag of pitch[], count
1050  * the distribution of the pulses in each block contained in this frame. */
1051  s->aw_pulse_range = FFMIN(pitch[0], pitch[1]) > 32 ? 24 : 16;
1052  for (offset = start_offset[bits]; offset < 0; offset += pitch[0]) ;
1053  s->aw_n_pulses[0] = (pitch[0] - 1 + MAX_FRAMESIZE / 2 - offset) / pitch[0];
1054  s->aw_first_pulse_off[0] = offset - s->aw_pulse_range / 2;
1055  offset += s->aw_n_pulses[0] * pitch[0];
1056  s->aw_n_pulses[1] = (pitch[1] - 1 + MAX_FRAMESIZE - offset) / pitch[1];
1057  s->aw_first_pulse_off[1] = offset - (MAX_FRAMESIZE + s->aw_pulse_range) / 2;
1058 
1059  /* if continuing from a position before the block, reset position to
1060  * start of block (when corrected for the range over which it can be
1061  * spread in aw_pulse_set1()). */
1062  if (start_offset[bits] < MAX_FRAMESIZE / 2) {
1063  while (s->aw_first_pulse_off[1] - pitch[1] + s->aw_pulse_range > 0)
1064  s->aw_first_pulse_off[1] -= pitch[1];
1065  if (start_offset[bits] < 0)
1066  while (s->aw_first_pulse_off[0] - pitch[0] + s->aw_pulse_range > 0)
1067  s->aw_first_pulse_off[0] -= pitch[0];
1068  }
1069 }
1070 
1071 /**
1072  * Apply second set of pitch-adaptive window pulses.
1073  * @param s WMA Voice decoding context private data
1074  * @param gb bit I/O context
1075  * @param block_idx block index in frame [0, 1]
1076  * @param fcb structure containing fixed codebook vector info
1077  * @return -1 on error, 0 otherwise
1078  */
1080  int block_idx, AMRFixed *fcb)
1081 {
1082  uint16_t use_mask_mem[9]; // only 5 are used, rest is padding
1083  uint16_t *use_mask = use_mask_mem + 2;
1084  /* in this function, idx is the index in the 80-bit (+ padding) use_mask
1085  * bit-array. Since use_mask consists of 16-bit values, the lower 4 bits
1086  * of idx are the position of the bit within a particular item in the
1087  * array (0 being the most significant bit, and 15 being the least
1088  * significant bit), and the remainder (>> 4) is the index in the
1089  * use_mask[]-array. This is faster and uses less memory than using a
1090  * 80-byte/80-int array. */
1091  int pulse_off = s->aw_first_pulse_off[block_idx],
1092  pulse_start, n, idx, range, aidx, start_off = 0;
1093 
1094  /* set offset of first pulse to within this block */
1095  if (s->aw_n_pulses[block_idx] > 0)
1096  while (pulse_off + s->aw_pulse_range < 1)
1097  pulse_off += fcb->pitch_lag;
1098 
1099  /* find range per pulse */
1100  if (s->aw_n_pulses[0] > 0) {
1101  if (block_idx == 0) {
1102  range = 32;
1103  } else /* block_idx = 1 */ {
1104  range = 8;
1105  if (s->aw_n_pulses[block_idx] > 0)
1106  pulse_off = s->aw_next_pulse_off_cache;
1107  }
1108  } else
1109  range = 16;
1110  pulse_start = s->aw_n_pulses[block_idx] > 0 ? pulse_off - range / 2 : 0;
1111 
1112  /* aw_pulse_set1() already applies pulses around pulse_off (to be exactly,
1113  * in the range of [pulse_off, pulse_off + s->aw_pulse_range], and thus
1114  * we exclude that range from being pulsed again in this function. */
1115  memset(&use_mask[-2], 0, 2 * sizeof(use_mask[0]));
1116  memset( use_mask, -1, 5 * sizeof(use_mask[0]));
1117  memset(&use_mask[5], 0, 2 * sizeof(use_mask[0]));
1118  if (s->aw_n_pulses[block_idx] > 0)
1119  for (idx = pulse_off; idx < MAX_FRAMESIZE / 2; idx += fcb->pitch_lag) {
1120  int excl_range = s->aw_pulse_range; // always 16 or 24
1121  uint16_t *use_mask_ptr = &use_mask[idx >> 4];
1122  int first_sh = 16 - (idx & 15);
1123  *use_mask_ptr++ &= 0xFFFFu << first_sh;
1124  excl_range -= first_sh;
1125  if (excl_range >= 16) {
1126  *use_mask_ptr++ = 0;
1127  *use_mask_ptr &= 0xFFFF >> (excl_range - 16);
1128  } else
1129  *use_mask_ptr &= 0xFFFF >> excl_range;
1130  }
1131 
1132  /* find the 'aidx'th offset that is not excluded */
1133  aidx = get_bits(gb, s->aw_n_pulses[0] > 0 ? 5 - 2 * block_idx : 4);
1134  for (n = 0; n <= aidx; pulse_start++) {
1135  for (idx = pulse_start; idx < 0; idx += fcb->pitch_lag) ;
1136  if (idx >= MAX_FRAMESIZE / 2) { // find from zero
1137  if (use_mask[0]) idx = 0x0F;
1138  else if (use_mask[1]) idx = 0x1F;
1139  else if (use_mask[2]) idx = 0x2F;
1140  else if (use_mask[3]) idx = 0x3F;
1141  else if (use_mask[4]) idx = 0x4F;
1142  else return -1;
1143  idx -= av_log2_16bit(use_mask[idx >> 4]);
1144  }
1145  if (use_mask[idx >> 4] & (0x8000 >> (idx & 15))) {
1146  use_mask[idx >> 4] &= ~(0x8000 >> (idx & 15));
1147  n++;
1148  start_off = idx;
1149  }
1150  }
1151 
1152  fcb->x[fcb->n] = start_off;
1153  fcb->y[fcb->n] = get_bits1(gb) ? -1.0 : 1.0;
1154  fcb->n++;
1155 
1156  /* set offset for next block, relative to start of that block */
1157  n = (MAX_FRAMESIZE / 2 - start_off) % fcb->pitch_lag;
1158  s->aw_next_pulse_off_cache = n ? fcb->pitch_lag - n : 0;
1159  return 0;
1160 }
1161 
1162 /**
1163  * Apply first set of pitch-adaptive window pulses.
1164  * @param s WMA Voice decoding context private data
1165  * @param gb bit I/O context
1166  * @param block_idx block index in frame [0, 1]
1167  * @param fcb storage location for fixed codebook pulse info
1168  */
1170  int block_idx, AMRFixed *fcb)
1171 {
1172  int val = get_bits(gb, 12 - 2 * (s->aw_idx_is_ext && !block_idx));
1173  float v;
1174 
1175  if (s->aw_n_pulses[block_idx] > 0) {
1176  int n, v_mask, i_mask, sh, n_pulses;
1177 
1178  if (s->aw_pulse_range == 24) { // 3 pulses, 1:sign + 3:index each
1179  n_pulses = 3;
1180  v_mask = 8;
1181  i_mask = 7;
1182  sh = 4;
1183  } else { // 4 pulses, 1:sign + 2:index each
1184  n_pulses = 4;
1185  v_mask = 4;
1186  i_mask = 3;
1187  sh = 3;
1188  }
1189 
1190  for (n = n_pulses - 1; n >= 0; n--, val >>= sh) {
1191  fcb->y[fcb->n] = (val & v_mask) ? -1.0 : 1.0;
1192  fcb->x[fcb->n] = (val & i_mask) * n_pulses + n +
1193  s->aw_first_pulse_off[block_idx];
1194  while (fcb->x[fcb->n] < 0)
1195  fcb->x[fcb->n] += fcb->pitch_lag;
1196  if (fcb->x[fcb->n] < MAX_FRAMESIZE / 2)
1197  fcb->n++;
1198  }
1199  } else {
1200  int num2 = (val & 0x1FF) >> 1, delta, idx;
1201 
1202  if (num2 < 1 * 79) { delta = 1; idx = num2 + 1; }
1203  else if (num2 < 2 * 78) { delta = 3; idx = num2 + 1 - 1 * 77; }
1204  else if (num2 < 3 * 77) { delta = 5; idx = num2 + 1 - 2 * 76; }
1205  else { delta = 7; idx = num2 + 1 - 3 * 75; }
1206  v = (val & 0x200) ? -1.0 : 1.0;
1207 
1208  fcb->no_repeat_mask |= 3 << fcb->n;
1209  fcb->x[fcb->n] = idx - delta;
1210  fcb->y[fcb->n] = v;
1211  fcb->x[fcb->n + 1] = idx;
1212  fcb->y[fcb->n + 1] = (val & 1) ? -v : v;
1213  fcb->n += 2;
1214  }
1215 }
1216 
/**
 * @}
 *
 * Generate a random number from frame_cntr and block_idx, which will live
 * in the range [0, 1000 - block_size] (so it can be used as an index in a
 * table of size 1000 of which you want to read block_size entries).
 *
 * The result is fully determined by its arguments.
 *
 * @param frame_cntr current frame number
 * @param block_num current block index
 * @param block_size amount of entries we want to read from a table
 *                   that has 1000 entries
 * @return a (non-)random number in the [0, 1000 - block_size] range.
 */
static int pRNG(int frame_cntr, int block_num, int block_size)
{
    /* array to simplify the calculation of z:
     * y = (x % 9) * 5 + 6;
     * z = (49995 * x) / y;
     * Since y only has 9 values, we can remove the division by using a
     * LUT and using FASTDIV-style divisions. For each of the 9 values
     * of y, we can rewrite z as:
     * z = x * (49995 / y) + x * ((49995 % y) / y)
     * In this table, each row represents one possible value of y, the
     * first number is 49995 / y, and the second is the FASTDIV variant
     * of 49995 % y / y. */
    static const unsigned int div_tbl[9][2] = {
        { 8332,  3 * 715827883U }, // y =  6
        { 4545,  0 * 390451573U }, // y = 11
        { 3124, 11 * 268435456U }, // y = 16
        { 2380, 15 * 204522253U }, // y = 21
        { 1922, 23 * 165191050U }, // y = 26
        { 1612, 23 * 138547333U }, // y = 31
        { 1388, 27 * 119304648U }, // y = 36
        { 1219, 16 * 104755300U }, // y = 41
        { 1086, 39 *  93368855U }  // y = 46
    };
    unsigned int z, y, x = MUL16(block_num, 1877) + frame_cntr;
    if (x >= 0xFFFF) x -= 0xFFFF;   // max value of x is 8*1877+0xFFFE=0x13AA6,
                                    // so this is effectively a modulo (%)
    /* here y is the row index x % 9, not the divisor from the comment */
    y = x - 9 * MULH(477218589, x); // x % 9
    z = (uint16_t) (x * div_tbl[y][0] + UMULH(x, div_tbl[y][1]));
                                    // z = x * 49995 / (y * 5 + 6)
    return z % (1000 - block_size);
}
1261 
1262 /**
1263  * Parse hardcoded signal for a single block.
1264  * @note see #synth_block().
1265  */
1267  int block_idx, int size,
1268  const struct frame_type_desc *frame_desc,
1269  float *excitation)
1270 {
1271  float gain;
1272  int n, r_idx;
1273 
1274  av_assert0(size <= MAX_FRAMESIZE);
1275 
1276  /* Set the offset from which we start reading wmavoice_std_codebook */
1277  if (frame_desc->fcb_type == FCB_TYPE_SILENCE) {
1278  r_idx = pRNG(s->frame_cntr, block_idx, size);
1279  gain = s->silence_gain;
1280  } else /* FCB_TYPE_HARDCODED */ {
1281  r_idx = get_bits(gb, 8);
1282  gain = wmavoice_gain_universal[get_bits(gb, 6)];
1283  }
1284 
1285  /* Clear gain prediction parameters */
1286  memset(s->gain_pred_err, 0, sizeof(s->gain_pred_err));
1287 
1288  /* Apply gain to hardcoded codebook and use that as excitation signal */
1289  for (n = 0; n < size; n++)
1290  excitation[n] = wmavoice_std_codebook[r_idx + n] * gain;
1291 }
1292 
1293 /**
1294  * Parse FCB/ACB signal for a single block.
1295  * @note see #synth_block().
1296  */
/* NOTE(review): listing line 1297 is absent from this copy; it should carry
 * the return type, the function name and the first parameters. The call in
 * synth_block() passes (s, gb, block_idx, size, block_pitch_sh2, frame_desc,
 * excitation) - TODO restore from upstream.
 *
 * Visible behaviour: builds the fixed-codebook pulse vector, reads a 7-bit
 * gain index, updates s->gain_pred_err, fills excitation[] with the adaptive
 * codebook contribution taken from earlier excitation samples, and finally
 * mixes both with acb_gain / fcb_gain. */
1298  int block_idx, int size,
1299  int block_pitch_sh2,
1300  const struct frame_type_desc *frame_desc,
1301  float *excitation)
1302 {
1303  static const float gain_coeff[6] = {
1304  0.8169, -0.06545, 0.1726, 0.0185, -0.0359, 0.0458
1305  };
1306  float pulses[MAX_FRAMESIZE / 2], pred_err, acb_gain, fcb_gain;
1307  int n, idx, gain_weight;
1308  AMRFixed fcb;
1309 
/* pulses[] holds MAX_FRAMESIZE / 2 entries, hence the bound on size */
1310  av_assert0(size <= MAX_FRAMESIZE / 2);
1311  memset(pulses, 0, sizeof(*pulses) * size);
1312 
/* block_pitch_sh2 is the pitch << 2; only the integer part is used here */
1313  fcb.pitch_lag = block_pitch_sh2 >> 2;
1314  fcb.pitch_fac = 1.0;
1315  fcb.no_repeat_mask = 0;
1316  fcb.n = 0;
1317 
1318  /* For the other frame types, this is where we apply the innovation
1319  * (fixed) codebook pulses of the speech signal. */
1320  if (frame_desc->fcb_type == FCB_TYPE_AW_PULSES) {
1321  aw_pulse_set1(s, gb, block_idx, &fcb);
1322  if (aw_pulse_set2(s, gb, block_idx, &fcb)) {
1323  /* Conceal the block with silence and return.
1324  * Skip the correct amount of bits to read the next
1325  * block from the correct offset. */
1326  int r_idx = pRNG(s->frame_cntr, block_idx, size);
1327 
1328  for (n = 0; n < size; n++)
1329  excitation[n] =
1330  wmavoice_std_codebook[r_idx + n] * s->silence_gain;
/* NOTE(review): 7 + 1 appears to stand for the 7-bit gain index read further
 * below plus the sign bit that aw_pulse_set2() did not consume on its error
 * path - confirm. */
1331  skip_bits(gb, 7 + 1);
1332  return;
1333  }
1334  } else /* FCB_TYPE_EXC_PULSES */ {
1335  int offset_nbits = 5 - frame_desc->log_n_blocks;
1336 
1337  fcb.no_repeat_mask = -1;
1338  /* similar to ff_decode_10_pulses_35bits(), but with single pulses
1339  * (instead of double) for a subset of pulses */
/* five tracks; track n can only hold positions n, n + 5, n + 10, ... */
1340  for (n = 0; n < 5; n++) {
1341  float sign;
1342  int pos1, pos2;
1343 
1344  sign = get_bits1(gb) ? 1.0 : -1.0;
1345  pos1 = get_bits(gb, offset_nbits);
1346  fcb.x[fcb.n] = n + 5 * pos1;
1347  fcb.y[fcb.n++] = sign;
/* the first dbl_pulses tracks carry a second pulse whose sign is flipped
 * when it lies after the first one */
1348  if (n < frame_desc->dbl_pulses) {
1349  pos2 = get_bits(gb, offset_nbits);
1350  fcb.x[fcb.n] = n + 5 * pos2;
1351  fcb.y[fcb.n++] = (pos1 < pos2) ? -sign : sign;
1352  }
1353  }
1354  }
1355  ff_set_fixed_vector(pulses, &fcb, 1.0, size);
1356 
1357  /* Calculate gain for adaptive & fixed codebook signal.
1358  * see ff_amr_set_fixed_gain(). */
1359  idx = get_bits(gb, 7);
/* NOTE(review): listing line 1360 is absent; the statement that assigns
 * fcb_gain starts there (fcb_gain is consumed at the end of the function but
 * no assignment is visible). The two lines below are its tail - TODO restore
 * from upstream. */
1361  gain_coeff, 6) -
1362  5.2409161640 + wmavoice_gain_codebook_fcb[idx]);
1363  acb_gain = wmavoice_gain_codebook_acb[idx];
1364  pred_err = av_clipf(wmavoice_gain_codebook_fcb[idx],
1365  -2.9957322736 /* log(0.05) */,
1366  1.6094379124 /* log(5.0) */);
1367 
/* shift the 6-entry history by gain_weight slots and fill the freed slots
 * with the new prediction error.
 * NOTE(review): the memmove length is (6 - gain_weight) entries, so this
 * relies on gain_weight <= 6, i.e. log_n_blocks >= 1 for every frame type
 * that reaches this function - confirm against frame_descs. */
1368  gain_weight = 8 >> frame_desc->log_n_blocks;
1369  memmove(&s->gain_pred_err[gain_weight], s->gain_pred_err,
1370  sizeof(*s->gain_pred_err) * (6 - gain_weight));
1371  for (n = 0; n < gain_weight; n++)
1372  s->gain_pred_err[n] = pred_err;
1373 
1374  /* Calculation of adaptive codebook */
1375  if (frame_desc->acb_type == ACB_TYPE_ASYMMETRIC) {
1376  int len;
/* the block is processed in runs of len samples; pitch and fractional index
 * are recomputed from the per-sample pitch slope at the start of each run */
1377  for (n = 0; n < size; n += len) {
1378  int next_idx_sh16;
1379  int abs_idx = block_idx * size + n;
1380  int pitch_sh16 = (s->last_pitch_val << 16) +
1381  s->pitch_diff_sh16 * abs_idx;
1382  int pitch = (pitch_sh16 + 0x6FFF) >> 16;
1383  int idx_sh16 = ((pitch << 16) - pitch_sh16) * 8 + 0x58000;
1384  idx = idx_sh16 >> 16;
1385  if (s->pitch_diff_sh16) {
1386  if (s->pitch_diff_sh16 > 0) {
1387  next_idx_sh16 = (idx_sh16) &~ 0xFFFF;
1388  } else
1389  next_idx_sh16 = (idx_sh16 + 0x10000) &~ 0xFFFF;
/* run length: samples until idx_sh16 crosses into the next integer index,
 * clipped to [1, samples left in the block] */
1390  len = av_clip((idx_sh16 - next_idx_sh16) / s->pitch_diff_sh16 / 8,
1391  1, size - n);
1392  } else
1393  len = size;
1394 
/* source is `pitch` samples back in the same excitation buffer */
1395  ff_acelp_interpolatef(&excitation[n], &excitation[n - pitch],
/* NOTE(review): listing line 1396 is absent; it should hold the middle
 * arguments of this call - TODO restore from upstream. */
1397  idx, 9, len);
1398  }
1399  } else /* ACB_TYPE_HAMMING */ {
1400  int block_pitch = block_pitch_sh2 >> 2;
/* the two low bits of block_pitch_sh2 are the fractional pitch */
1401  idx = block_pitch_sh2 & 3;
1402  if (idx) {
1403  ff_acelp_interpolatef(excitation, &excitation[-block_pitch],
/* NOTE(review): listing line 1404 is absent; it should hold the middle
 * arguments of this call - TODO restore from upstream. */
1405  idx, 8, size);
1406  } else
1407  av_memcpy_backptr((uint8_t *) excitation, sizeof(float) * block_pitch,
1408  sizeof(float) * size);
1409  }
1410 
1411  /* Interpolate ACB/FCB and use as excitation signal */
1412  ff_weighted_vector_sumf(excitation, excitation, pulses,
1413  acb_gain, fcb_gain, size);
1414 }
1415 
1416 /**
1417  * Parse data in a single block.
1418  *
1419  * @param s WMA Voice decoding context private data
1420  * @param gb bit I/O context
1421  * @param block_idx index of the to-be-read block
1422  * @param size amount of samples to be read in this block
1423  * @param block_pitch_sh2 pitch for this block << 2
1424  * @param lsps LSPs for (the end of) this frame
1425  * @param prev_lsps LSPs for the last frame
1426  * @param frame_desc frame type descriptor
1427  * @param excitation target memory for the ACB+FCB interpolated signal
1428  * @param synth target memory for the speech synthesis filter output
1429  * @return 0 on success, <0 on error.
1430  */
1432  int block_idx, int size,
1433  int block_pitch_sh2,
1434  const double *lsps, const double *prev_lsps,
1435  const struct frame_type_desc *frame_desc,
1436  float *excitation, float *synth)
1437 {
1438  double i_lsps[MAX_LSPS];
1439  float lpcs[MAX_LSPS];
1440  float fac;
1441  int n;
1442 
1443  if (frame_desc->acb_type == ACB_TYPE_NONE)
1444  synth_block_hardcoded(s, gb, block_idx, size, frame_desc, excitation);
1445  else
1446  synth_block_fcb_acb(s, gb, block_idx, size, block_pitch_sh2,
1447  frame_desc, excitation);
1448 
1449  /* convert interpolated LSPs to LPCs */
1450  fac = (block_idx + 0.5) / frame_desc->n_blocks;
1451  for (n = 0; n < s->lsps; n++) // LSF -> LSP
1452  i_lsps[n] = cos(prev_lsps[n] + fac * (lsps[n] - prev_lsps[n]));
1453  ff_acelp_lspd2lpc(i_lsps, lpcs, s->lsps >> 1);
1454 
1455  /* Speech synthesis */
1456  ff_celp_lp_synthesis_filterf(synth, lpcs, excitation, size, s->lsps);
1457 }
1458 
1459 /**
1460  * Synthesize output samples for a single frame.
1461  *
1462  * @param ctx WMA Voice decoder context
1463  * @param gb bit I/O context (s->gb or one for cross-packet superframes)
1464  * @param frame_idx Frame number within superframe [0-2]
1465  * @param samples pointer to output sample buffer, has space for at least 160
1466  * samples
1467  * @param lsps LSP array
1468  * @param prev_lsps array of previous frame's LSPs
1469  * @param excitation target buffer for excitation signal
1470  * @param synth target buffer for synthesized speech data
1471  * @return 0 on success, <0 on error.
1472  */
/* NOTE(review): this copy of the function lacks listing lines 1525, 1552,
 * 1555 and 1557; each gap is marked below - TODO restore from upstream. */
1473 static int synth_frame(AVCodecContext *ctx, GetBitContext *gb, int frame_idx,
1474  float *samples,
1475  const double *lsps, const double *prev_lsps,
1476  float *excitation, float *synth)
1477 {
1478  WMAVoiceContext *s = ctx->priv_data;
1479  int n, n_blocks_x2, log_n_blocks_x2, av_uninit(cur_pitch_val);
1480  int pitch[MAX_BLOCKS], av_uninit(last_block_pitch);
1481 
1482  /* Parse frame type ("frame header"), see frame_descs */
1483  int bd_idx = s->vbm_tree[get_vlc2(gb, frame_type_vlc.table, 6, 3)], block_nsamples;
1484 
1485  if (bd_idx < 0) {
1486  av_log(ctx, AV_LOG_ERROR,
1487  "Invalid frame type VLC code, skipping\n");
1488  return AVERROR_INVALIDDATA;
1489  }
1490 
1491  block_nsamples = MAX_FRAMESIZE / frame_descs[bd_idx].n_blocks;
1492 
1493  /* Pitch calculation for ACB_TYPE_ASYMMETRIC ("pitch-per-frame") */
1494  if (frame_descs[bd_idx].acb_type == ACB_TYPE_ASYMMETRIC) {
1495  /* Pitch is provided per frame, which is interpreted as the pitch of
1496  * the last sample of the last block of this frame. We can interpolate
1497  * the pitch of other blocks (and even pitch-per-sample) by gradually
1498  * incrementing/decrementing prev_frame_pitch to cur_pitch_val. */
1499  n_blocks_x2 = frame_descs[bd_idx].n_blocks << 1;
1500  log_n_blocks_x2 = frame_descs[bd_idx].log_n_blocks + 1;
1501  cur_pitch_val = s->min_pitch_val + get_bits(gb, s->pitch_nbits);
1502  cur_pitch_val = FFMIN(cur_pitch_val, s->max_pitch_val - 1);
/* no interpolation from the previous pitch when the previous frame had no
 * ACB, or when the jump is larger than (cur + last) / 20 */
1503  if (s->last_acb_type == ACB_TYPE_NONE ||
1504  20 * abs(cur_pitch_val - s->last_pitch_val) >
1505  (cur_pitch_val + s->last_pitch_val))
1506  s->last_pitch_val = cur_pitch_val;
1507 
1508  /* pitch per block */
1509  for (n = 0; n < frame_descs[bd_idx].n_blocks; n++) {
/* weight of cur_pitch_val at the centre of block n, in units of
 * 1 / (2 * n_blocks); n_blocks is added for rounding before the shift */
1510  int fac = n * 2 + 1;
1511 
1512  pitch[n] = (MUL16(fac, cur_pitch_val) +
1513  MUL16((n_blocks_x2 - fac), s->last_pitch_val) +
1514  frame_descs[bd_idx].n_blocks) >> log_n_blocks_x2;
1515  }
1516 
1517  /* "pitch-diff-per-sample" for calculation of pitch per sample */
1518  s->pitch_diff_sh16 =
1519  ((cur_pitch_val - s->last_pitch_val) << 16) / MAX_FRAMESIZE;
1520  }
1521 
1522  /* Global gain (if silence) and pitch-adaptive window coordinates */
1523  switch (frame_descs[bd_idx].fcb_type) {
1524  case FCB_TYPE_SILENCE:
/* NOTE(review): listing line 1525 is absent; per the comment above, the
 * global (silence) gain is expected to be read here - TODO restore. */
1526  break;
1527  case FCB_TYPE_AW_PULSES:
/* NOTE(review): pitch[] is only filled above for ACB_TYPE_ASYMMETRIC frames;
 * presumably every FCB_TYPE_AW_PULSES frame type uses that ACB type -
 * confirm against frame_descs. */
1528  aw_parse_coords(s, gb, pitch);
1529  break;
1530  }
1531 
1532  for (n = 0; n < frame_descs[bd_idx].n_blocks; n++) {
1533  int bl_pitch_sh2;
1534 
1535  /* Pitch calculation for ACB_TYPE_HAMMING ("pitch-per-block") */
1536  switch (frame_descs[bd_idx].acb_type) {
1537  case ACB_TYPE_HAMMING: {
1538  /* Pitch is given per block. Per-block pitches are encoded as an
1539  * absolute value for the first block, and then delta values
1540  * relative to this value) for all subsequent blocks. The scale of
1541  * this pitch value is semi-logarithmic compared to its use in the
1542  * decoder, so we convert it to normal scale also. */
/* t1, t2 and t3 are the code-space widths of the three segments, which use
 * quarter-, half- and full-sample steps respectively */
1543  int block_pitch,
1544  t1 = (s->block_conv_table[1] - s->block_conv_table[0]) << 2,
1545  t2 = (s->block_conv_table[2] - s->block_conv_table[1]) << 1,
1546  t3 = s->block_conv_table[3] - s->block_conv_table[2] + 1;
1547 
1548  if (n == 0) {
1549  block_pitch = get_bits(gb, s->block_pitch_nbits);
1550  } else
1551  block_pitch = last_block_pitch - s->block_delta_pitch_hrange +
/* NOTE(review): listing line 1552 is absent; it should hold the last operand
 * of the sum above (the delta read from the bitstream) - TODO restore. */
1553  /* Convert last_ so that any next delta is within _range */
1554  last_block_pitch = av_clip(block_pitch,
/* NOTE(review): listing lines 1555 and 1557 are absent; they should hold the
 * lower clip bound and the tail of the upper bound - TODO restore. */
1556  s->block_pitch_range -
1558 
1559  /* Convert semi-log-style scale back to normal scale */
1560  if (block_pitch < t1) {
1561  bl_pitch_sh2 = (s->block_conv_table[0] << 2) + block_pitch;
1562  } else {
1563  block_pitch -= t1;
1564  if (block_pitch < t2) {
1565  bl_pitch_sh2 =
1566  (s->block_conv_table[1] << 2) + (block_pitch << 1);
1567  } else {
1568  block_pitch -= t2;
1569  if (block_pitch < t3) {
1570  bl_pitch_sh2 =
1571  (s->block_conv_table[2] + block_pitch) << 2;
1572  } else
1573  bl_pitch_sh2 = s->block_conv_table[3] << 2;
1574  }
1575  }
/* integer pitch, kept for the postfilter and for last_pitch_val below */
1576  pitch[n] = bl_pitch_sh2 >> 2;
1577  break;
1578  }
1579 
1580  case ACB_TYPE_ASYMMETRIC: {
1581  bl_pitch_sh2 = pitch[n] << 2;
1582  break;
1583  }
1584 
1585  default: // ACB_TYPE_NONE has no pitch
1586  bl_pitch_sh2 = 0;
1587  break;
1588  }
1589 
1590  synth_block(s, gb, n, block_nsamples, bl_pitch_sh2,
1591  lsps, prev_lsps, &frame_descs[bd_idx],
1592  &excitation[n * block_nsamples],
1593  &synth[n * block_nsamples]);
1594  }
1595 
1596  /* Averaging projection filter, if applicable. Else, just copy samples
1597  * from synthesis buffer */
1598  if (s->do_apf) {
1599  double i_lsps[MAX_LSPS];
1600  float lpcs[MAX_LSPS];
1601 
/* first 80 samples: LPCs from the mean of previous and current LSPs.
 * NOTE(review): pitch[0] is not written for ACB_TYPE_NONE frames in the
 * visible code - confirm postfilter() does not use it for those. */
1602  for (n = 0; n < s->lsps; n++) // LSF -> LSP
1603  i_lsps[n] = cos(0.5 * (prev_lsps[n] + lsps[n]));
1604  ff_acelp_lspd2lpc(i_lsps, lpcs, s->lsps >> 1);
1605  postfilter(s, synth, samples, 80, lpcs,
1606  &s->zero_exc_pf[s->history_nsamples + MAX_FRAMESIZE * frame_idx],
1607  frame_descs[bd_idx].fcb_type, pitch[0]);
1608 
/* last 80 samples: LPCs from the current LSPs only */
1609  for (n = 0; n < s->lsps; n++) // LSF -> LSP
1610  i_lsps[n] = cos(lsps[n]);
1611  ff_acelp_lspd2lpc(i_lsps, lpcs, s->lsps >> 1);
1612  postfilter(s, &synth[80], &samples[80], 80, lpcs,
1613  &s->zero_exc_pf[s->history_nsamples + MAX_FRAMESIZE * frame_idx + 80],
1614  frame_descs[bd_idx].fcb_type, pitch[0]);
1615  } else
1616  memcpy(samples, synth, 160 * sizeof(synth[0]));
1617 
1618  /* Cache values for next frame */
1619  s->frame_cntr++;
1620  if (s->frame_cntr >= 0xFFFF) s->frame_cntr -= 0xFFFF; // i.e. modulo (%)
1621  s->last_acb_type = frame_descs[bd_idx].acb_type;
1622  switch (frame_descs[bd_idx].acb_type) {
1623  case ACB_TYPE_NONE:
1624  s->last_pitch_val = 0;
1625  break;
1626  case ACB_TYPE_ASYMMETRIC:
1627  s->last_pitch_val = cur_pitch_val;
1628  break;
1629  case ACB_TYPE_HAMMING:
1630  s->last_pitch_val = pitch[frame_descs[bd_idx].n_blocks - 1];
1631  break;
1632  }
1633 
1634  return 0;
1635 }
1636 
/**
 * Ensure minimum value for first item, maximum value for last value,
 * proper spacing between each value and proper ordering.
 *
 * The array is modified in place. Nothing is done when @p num is not
 * positive.
 *
 * @param lsps array of LSPs
 * @param num size of LSP array
 *
 * @note basically a double version of #ff_acelp_reorder_lsf(), might be
 *       useful to put in a generic location later on. Parts are also
 *       present in #ff_set_min_dist_lsf() + #ff_sort_nearly_sorted_floats(),
 *       which is in float.
 */
static void stabilize_lsps(double *lsps, int num)
{
    int n, m, l;

    /* keeps the lsps[0] and lsps[num - 1] accesses below in bounds */
    if (num <= 0)
        return;

    /* set minimum value for first, maximum value for last and minimum
     * spacing between LSF values.
     * Very similar to ff_set_min_dist_lsf(), but in double. */
    lsps[0]       = FFMAX(lsps[0],       0.0015 * M_PI);
    for (n = 1; n < num; n++)
        lsps[n]   = FFMAX(lsps[n],       lsps[n - 1] + 0.0125 * M_PI);
    lsps[num - 1] = FFMIN(lsps[num - 1], 0.9985 * M_PI);

    /* reorder (looks like one-time / non-recursed bubblesort).
     * Very similar to ff_sort_nearly_sorted_floats(), but in double.
     * The outer loop only looks for the first out-of-order pair; when one
     * is found, a single full insertion sort is run and the scan stops. */
    for (n = 1; n < num; n++) {
        if (lsps[n] < lsps[n - 1]) {
            for (m = 1; m < num; m++) {
                double tmp = lsps[m];
                for (l = m - 1; l >= 0; l--) {
                    if (lsps[l] <= tmp) break;
                    lsps[l + 1] = lsps[l];
                }
                lsps[l + 1] = tmp;
            }
            break;
        }
    }
}
1677 
1678 /**
1679  * Synthesize output samples for a single superframe. If we have any data
1680  * cached in s->sframe_cache, that will be used instead of whatever is loaded
1681  * in s->gb.
1682  *
1683  * WMA Voice superframes contain 3 frames, each containing 160 audio samples,
1684  * to give a total of 480 samples per superframe. See #synth_frame() for frame
1685  * parsing. In addition to 3 frames, superframes can also contain the LSPs
1686  * (if these are globally specified for all frames (residually); they can
1687  * also be specified individually per-frame. See the s->has_residual_lsps
1688  * option), and can specify the number of samples encoded in this superframe
1689  * (if less than 480), usually used to prevent blanks at track boundaries.
1690  *
1691  * @param ctx WMA Voice decoder context
1692  * @return 0 on success, <0 on error or 1 if there was not enough data to
1693  * fully parse the superframe
1694  */
/* NOTE(review): listing line 1695 is absent from this copy; it should carry
 * the return type, the function name and the leading parameters. The body
 * uses `ctx` and `frame`, neither of which is declared in the visible lines -
 * TODO restore from upstream. Listing lines 1703 and 1715 are missing as
 * well and are marked below. */
1696  int *got_frame_ptr)
1697 {
1698  WMAVoiceContext *s = ctx->priv_data;
1699  GetBitContext *gb = &s->gb, s_gb;
1700  int n, res, n_samples = MAX_SFRAMESIZE;
1701  double lsps[MAX_FRAMES][MAX_LSPS];
1702  const double *mean_lsf = s->lsps == 16 ?
/* NOTE(review): listing line 1703 is absent; it should name the two mean-LSF
 * tables chosen between for 16 and 10 LSPs - TODO restore. */
1704  float excitation[MAX_SIGNAL_HISTORY + MAX_SFRAMESIZE + 12];
1705  float synth[MAX_LSPS + MAX_SFRAMESIZE];
1706  float *samples;
1707 
/* both work buffers start with the history saved by the previous call */
1708  memcpy(synth, s->synth_history,
1709  s->lsps * sizeof(*synth));
1710  memcpy(excitation, s->excitation_history,
1711  s->history_nsamples * sizeof(*excitation));
1712 
1713  if (s->sframe_cache_size > 0) {
1714  gb = &s_gb;
/* NOTE(review): listing line 1715 is absent; s_gb is never visibly
 * initialised, so the set-up of the reader over the cached data is expected
 * here - TODO restore. */
1716  s->sframe_cache_size = 0;
1717  }
1718 
1719  /* First bit is speech/music bit, it differentiates between WMAVoice
1720  * speech samples (the actual codec) and WMAVoice music samples, which
1721  * are really WMAPro-in-WMAVoice-superframes. I've never seen those in
1722  * the wild yet. */
1723  if (!get_bits1(gb)) {
1724  avpriv_request_sample(ctx, "WMAPro-in-WMAVoice");
1725  return AVERROR_PATCHWELCOME;
1726  }
1727 
1728  /* (optional) nr. of samples in superframe; always <= 480 and >= 0 */
1729  if (get_bits1(gb)) {
1730  if ((n_samples = get_bits(gb, 12)) > MAX_SFRAMESIZE) {
1731  av_log(ctx, AV_LOG_ERROR,
1732  "Superframe encodes > %d samples (%d), not allowed\n",
1733  MAX_SFRAMESIZE, n_samples);
1734  return AVERROR_INVALIDDATA;
1735  }
1736  }
1737 
1738  /* Parse LSPs, if global for the superframe (can also be per-frame). */
1739  if (s->has_residual_lsps) {
1740  double prev_lsps[MAX_LSPS], a1[MAX_LSPS * 2], a2[MAX_LSPS * 2];
1741 
/* the residual decoders work on mean-removed values */
1742  for (n = 0; n < s->lsps; n++)
1743  prev_lsps[n] = s->prev_lsps[n] - mean_lsf[n];
1744 
1745  if (s->lsps == 10) {
1746  dequant_lsp10r(gb, lsps[2], prev_lsps, a1, a2, s->lsp_q_mode);
1747  } else /* s->lsps == 16 */
1748  dequant_lsp16r(gb, lsps[2], prev_lsps, a1, a2, s->lsp_q_mode);
1749 
/* frames 0 and 1: interpolated value (a1) minus residual (a2, interleaved
 * per coefficient); frame 2: the independently coded set */
1750  for (n = 0; n < s->lsps; n++) {
1751  lsps[0][n] = mean_lsf[n] + (a1[n] - a2[n * 2]);
1752  lsps[1][n] = mean_lsf[n] + (a1[s->lsps + n] - a2[n * 2 + 1]);
1753  lsps[2][n] += mean_lsf[n];
1754  }
1755  for (n = 0; n < 3; n++)
1756  stabilize_lsps(lsps[n], s->lsps);
1757  }
1758 
1759  /* get output buffer */
/* the buffer is requested for a full superframe; the reported sample count
 * is then set to the number actually encoded */
1760  frame->nb_samples = MAX_SFRAMESIZE;
1761  if ((res = ff_get_buffer(ctx, frame, 0)) < 0)
1762  return res;
1763  frame->nb_samples = n_samples;
1764  samples = (float *)frame->data[0];
1765 
1766  /* Parse frames, optionally preceded by per-frame (independent) LSPs. */
1767  for (n = 0; n < 3; n++) {
1768  if (!s->has_residual_lsps) {
1769  int m;
1770 
1771  if (s->lsps == 10) {
1772  dequant_lsp10i(gb, lsps[n]);
1773  } else /* s->lsps == 16 */
1774  dequant_lsp16i(gb, lsps[n]);
1775 
1776  for (m = 0; m < s->lsps; m++)
1777  lsps[n][m] += mean_lsf[m];
1778  stabilize_lsps(lsps[n], s->lsps);
1779  }
1780 
/* frame 0 interpolates from the LSPs kept from the previous superframe,
 * later frames from the frame before them; output pointers skip the history
 * prefix of each work buffer */
1781  if ((res = synth_frame(ctx, gb, n,
1782  &samples[n * MAX_FRAMESIZE],
1783  lsps[n], n == 0 ? s->prev_lsps : lsps[n - 1],
1784  &excitation[s->history_nsamples + n * MAX_FRAMESIZE],
1785  &synth[s->lsps + n * MAX_FRAMESIZE]))) {
1786  *got_frame_ptr = 0;
1787  return res;
1788  }
1789  }
1790 
1791  /* Statistics? FIXME - we don't check for length, a slight overrun
1792  * will be caught by internal buffer padding, and anything else
1793  * will be skipped, not read. */
1794  if (get_bits1(gb)) {
1795  res = get_bits(gb, 4);
1796  skip_bits(gb, 10 * (res + 1));
1797  }
1798 
/* reading past the end of the input invalidates the superframe */
1799  if (get_bits_left(gb) < 0) {
1800  wmavoice_flush(ctx);
1801  return AVERROR_INVALIDDATA;
1802  }
1803 
1804  *got_frame_ptr = 1;
1805 
1806  /* Update history */
/* the tail of each work buffer becomes the history for the next call */
1807  memcpy(s->prev_lsps, lsps[2],
1808  s->lsps * sizeof(*s->prev_lsps));
1809  memcpy(s->synth_history, &synth[MAX_SFRAMESIZE],
1810  s->lsps * sizeof(*synth));
1811  memcpy(s->excitation_history, &excitation[MAX_SFRAMESIZE],
1812  s->history_nsamples * sizeof(*excitation));
1813  if (s->do_apf)
1814  memmove(s->zero_exc_pf, &s->zero_exc_pf[MAX_SFRAMESIZE],
1815  s->history_nsamples * sizeof(*s->zero_exc_pf));
1816 
1817  return 0;
1818 }
1819 
1820 /**
1821  * Parse the packet header at the start of each packet (input data to this
1822  * decoder).
1823  *
 * Fields, in the order they are read below: a 4-bit packet sequence number
 * (skipped), one bit stored in s->has_residual_lsps, then one or more 6-bit
 * superframe counts that are summed up. A count equal to 0x3F means that
 * another 6-bit count follows.
 *
1824  * @param s WMA Voice decoding context private data
1825  * @return <0 on error, nb_superframes on success.
 *         The error is AVERROR_INVALIDDATA, returned when get_bits_left()
 *         is negative once the fields above have been read.
1826  */
/* NOTE(review): the embedded listing numbers jump from 1826 to 1828, so the
 * line carrying this function's signature seems to be missing from this
 * extract - restore it from the upstream file. */
1828 {
1829  GetBitContext *gb = &s->gb;
1830  unsigned int res, n_superframes = 0;
1831 
1832  skip_bits(gb, 4); // packet sequence number
1833  s->has_residual_lsps = get_bits1(gb);
/* Accumulate 6-bit counts; keep reading while the field is saturated. */
1834  do {
1835  res = get_bits(gb, 6); // number of superframes per packet
1836  // (minus first one if there is spillover)
1837  n_superframes += res;
1838  } while (res == 0x3F);
/* NOTE(review): listing number 1839 is absent here; a statement may have
 * been dropped by the extraction - verify against the upstream file. */
1840 
/* n_superframes is unsigned but is returned through the same expression as
 * the (negative) error code. */
1841  return get_bits_left(gb) >= 0 ? n_superframes : AVERROR_INVALIDDATA;
1842 }
1843 
1844 /**
1845  * Copy (unaligned) bits from gb/data/size to pb.
1846  *
1847  * @param pb target buffer to copy bits into
1848  * @param data source buffer to copy bits from
1849  * @param size size of the source data, in bytes
1850  * @param gb bit I/O context specifying the current position in the source.
1851  * data. This function might use this to align the bit position to
1852  * a whole-byte boundary before calling #avpriv_copy_bits() on aligned
1853  * source data
1854  * @param nbits the amount of bits to copy from source to target
1855  *
1856  * @note after calling this function, the current position in the input bit
1857  * I/O context is undefined.
1858  */
1859 static void copy_bits(PutBitContext *pb,
1860  const uint8_t *data, int size,
1861  GetBitContext *gb, int nbits)
1862 {
1863  int rmn_bytes, rmn_bits;
1864 
1865  rmn_bits = rmn_bytes = get_bits_left(gb);
1866  if (rmn_bits < nbits)
1867  return;
1868  if (nbits > pb->size_in_bits - put_bits_count(pb))
1869  return;
1870  rmn_bits &= 7; rmn_bytes >>= 3;
1871  if ((rmn_bits = FFMIN(rmn_bits, nbits)) > 0)
1872  put_bits(pb, rmn_bits, get_bits(gb, rmn_bits));
1873  avpriv_copy_bits(pb, data + size - rmn_bytes,
1874  FFMIN(nbits - rmn_bits, rmn_bytes << 3));
1875 }
1876 
1877 /**
1878  * Packet decoding: a packet is anything that the (ASF) demuxer contains,
1879  * and we expect that the demuxer / application provides it to us as such
1880  * (else you'll probably get garbage as output). Every packet has a size of
1881  * ctx->block_align bytes, starts with a packet header (see
1882  * #parse_packet_header()), and then a series of superframes. Superframe
1883  * boundaries may exceed packets, i.e. superframes can split data over
1884  * multiple (two) packets.
1885  *
1886  * For more information about frames, see #synth_superframe().
 *
 * Return value, as implemented below: a negative code propagated from
 * #parse_packet_header() or from the #synth_superframe() call in the main
 * parsing path; otherwise a byte count. After a superframe was output the
 * count is the bit position divided by 8, with the remaining (cnt & 7) bits
 * remembered in s->skip_bits_next for the next call; in all other cases it
 * is the capped packet size. *got_frame_ptr is set to 0 explicitly when no
 * superframes are left in the packet.
1887  */
/* NOTE(review): the embedded listing numbers jump from 1887 to 1889, so the
 * first line of the signature (function name and leading parameters) seems
 * to be missing from this extract - restore it from the upstream file. */
1889  int *got_frame_ptr, AVPacket *avpkt)
1890 {
1891  WMAVoiceContext *s = ctx->priv_data;
1892  GetBitContext *gb = &s->gb;
1893  int size, res, pos;
1894 
1895  /* Packets are sometimes a multiple of ctx->block_align, with a packet
1896  * header at each ctx->block_align bytes. However, FFmpeg's ASF demuxer
1897  * feeds us ASF packets, which may concatenate multiple "codec" packets
1898  * in a single "muxer" packet, so we artificially emulate that by
1899  * capping the packet size at ctx->block_align. */
/* The loop body is intentionally empty: it only reduces size.
 * NOTE(review): this loop and the modulo below assume
 * ctx->block_align > 0 - presumably validated at init time; confirm. */
1900  for (size = avpkt->size; size > ctx->block_align; size -= ctx->block_align);
1901  init_get_bits(&s->gb, avpkt->data, size << 3);
1902 
1903  /* size == ctx->block_align is used to indicate whether we are dealing with
1904  * a new packet or a packet of which we already read the packet header
1905  * previously. */
1906  if (!(size % ctx->block_align)) { // new packet header
/* size == 0 also takes this branch: no header is parsed then and the
 * spillover / superframe counters are simply cleared. */
1907  if (!size) {
1908  s->spillover_nbits = 0;
1909  s->nb_superframes = 0;
1910  } else {
1911  if ((res = parse_packet_header(s)) < 0)
1912  return res;
1913  s->nb_superframes = res;
1914  }
1915 
1916  /* If the packet header specifies a s->spillover_nbits, then we want
1917  * to push out all data of the previous packet (+ spillover) before
1918  * continuing to parse new superframes in the current packet. */
1919  if (s->sframe_cache_size > 0) {
1920  int cnt = get_bits_count(gb);
/* Clamp the spillover so that it cannot reach past the end of avpkt. */
1921  if (cnt + s->spillover_nbits > avpkt->size * 8) {
1922  s->spillover_nbits = avpkt->size * 8 - cnt;
1923  }
1924  copy_bits(&s->pb, avpkt->data, size, gb, s->spillover_nbits);
1925  flush_put_bits(&s->pb);
/* NOTE(review): listing number 1926 is absent here; a statement may have
 * been dropped by the extraction - verify against the upstream file. */
1927  if ((res = synth_superframe(ctx, data, got_frame_ptr)) == 0 &&
1928  *got_frame_ptr) {
1929  cnt += s->spillover_nbits;
1930  s->skip_bits_next = cnt & 7;
1931  res = cnt >> 3;
1932  return res;
1933  } else
/* A failure or missing frame is not returned here; the reader is moved to
 * cnt + s->spillover_nbits and parsing continues below. */
1934  skip_bits_long (gb, s->spillover_nbits - cnt +
1935  get_bits_count(gb)); // resync
1936  } else if (s->spillover_nbits) {
1937  skip_bits_long(gb, s->spillover_nbits); // resync
1938  }
1939  } else if (s->skip_bits_next)
1940  skip_bits(gb, s->skip_bits_next);
1941 
1942  /* Try parsing superframes in current packet */
1943  s->sframe_cache_size = 0;
1944  s->skip_bits_next = 0;
1945  pos = get_bits_left(gb);
/* The post-decrement means the tests below see the count of superframes
 * remaining AFTER the current one. */
1946  if (s->nb_superframes-- == 0) {
1947  *got_frame_ptr = 0;
1948  return size;
1949  } else if (s->nb_superframes > 0) {
1950  if ((res = synth_superframe(ctx, data, got_frame_ptr)) < 0) {
1951  return res;
1952  } else if (*got_frame_ptr) {
1953  int cnt = get_bits_count(gb);
1954  s->skip_bits_next = cnt & 7;
1955  res = cnt >> 3;
1956  return res;
1957  }
1958  } else if ((s->sframe_cache_size = pos) > 0) {
1959  /* ... cache it for spillover in next packet */
/* NOTE(review): listing number 1960 is absent here; a statement may have
 * been dropped by the extraction - verify against the upstream file. */
1961  copy_bits(&s->pb, avpkt->data, size, gb, s->sframe_cache_size);
1962  // FIXME bad - just copy whole bytes and use the
1963  // skip_bits_next field
1964  }
1965 
1966  return size;
1967 }
1968 
/* Decoder teardown: when s->do_apf is set, release the two RDFT contexts
 * (rdft, irdft) and the two DCT contexts (dct, dst) held in the private
 * context; always returns 0.
 *
 * NOTE(review): listing number 1969 is absent, so the signature line of this
 * function is missing from this extract. It is presumably
 * wmavoice_decode_end(), which the codec table below registers as .close -
 * restore the line from the upstream file. */
1970 {
1971  WMAVoiceContext *s = ctx->priv_data;
1972 
/* The transform contexts are only torn down when do_apf is set. */
1973  if (s->do_apf) {
1974  ff_rdft_end(&s->rdft);
1975  ff_rdft_end(&s->irdft);
1976  ff_dct_end(&s->dct);
1977  ff_dct_end(&s->dst);
1978  }
1979 
1980  return 0;
1981 }
1982 
/* Codec registration table for the "wmavoice" audio decoder.
 *
 * NOTE(review): listing number 1983 is absent, so the line that declares the
 * object and opens this initializer is missing from this extract; listing
 * numbers 1989, 1992 and 1993 are absent as well, so some initializer
 * entries may have been dropped - restore them from the upstream file. */
1984  .name = "wmavoice",
1985  .long_name = NULL_IF_CONFIG_SMALL("Windows Media Audio Voice"),
1986  .type = AVMEDIA_TYPE_AUDIO,
1987  .id = AV_CODEC_ID_WMAVOICE,
1988  .priv_data_size = sizeof(WMAVoiceContext),
1990  .init_static_data = wmavoice_init_static_data,
1991  .close = wmavoice_decode_end,
1994  .flush = wmavoice_flush,
1995 };
RDFTContext rdft
Definition: wmavoice.c:264
Description of frame types.
Definition: wmavoice.c:98
static void aw_pulse_set1(WMAVoiceContext *s, GetBitContext *gb, int block_idx, AMRFixed *fcb)
Apply first set of pitch-adaptive window pulses.
Definition: wmavoice.c:1169
av_cold void ff_rdft_end(RDFTContext *s)
Definition: rdft.c:132
static const uint8_t wmavoice_dq_lsp16r2[0x500]
#define NULL
Definition: coverity.c:32
const char const char void * val
Definition: avisynth_c.h:771
int do_apf
whether to apply the averaged projection filter (APF)
Definition: wmavoice.c:148
const char * s
Definition: avisynth_c.h:768
#define AVERROR_INVALIDDATA
Invalid data found when processing input.
Definition: error.h:59
static int pRNG(int frame_cntr, int block_num, int block_size)
Generate a random number from frame_cntr and block_idx, which will live in the range [0...
Definition: wmavoice.c:1230
static av_cold int decode_vbmtree(GetBitContext *gb, int8_t vbm_tree[25])
Set up the variable bit mode (VBM) tree from container extradata.
Definition: wmavoice.c:299
void ff_celp_lp_synthesis_filterf(float *out, const float *filter_coeffs, const float *in, int buffer_length, int filter_length)
LP synthesis filter.
Definition: celp_filters.c:84
float gain_pred_err[6]
cache for gain prediction
Definition: wmavoice.c:249
Per-block pitch with signal generation using a Hamming sinc window function.
Definition: wmavoice.c:74
This structure describes decoded (raw) audio or video data.
Definition: frame.h:187
void(* dct_calc)(struct DCTContext *s, FFTSample *data)
Definition: dct.h:38
int aw_next_pulse_off_cache
the position (relative to start of the second block) at which pulses should start to be positioned...
Definition: wmavoice.c:240
int nb_superframes
number of superframes in current packet
Definition: wmavoice.c:248
ptrdiff_t const GLvoid * data
Definition: opengl_enc.c:101
static void flush(AVCodecContext *avctx)
float postfilter_agc
gain control memory, used in adaptive_gain_control()
Definition: wmavoice.c:270
void ff_acelp_apply_order_2_transfer_function(float *out, const float *in, const float zero_coeffs[2], const float pole_coeffs[2], float gain, float mem[2], int n)
Apply an order 2 rational transfer function in-place.
static void put_bits(Jpeg2000EncoderContext *s, int val, int n)
put n times val bit
Definition: j2kenc.c:206
static unsigned int get_bits(GetBitContext *s, int n)
Read 1-25 bits.
Definition: get_bits.h:261
static void postfilter(WMAVoiceContext *s, const float *synth, float *samples, int size, const float *lpcs, float *zero_exc_pf, int fcb_type, int pitch)
Averaging projection filter, the postfilter used in WMAVoice.
Definition: wmavoice.c:797
Memory handling functions.
void ff_weighted_vector_sumf(float *out, const float *in_a, const float *in_b, float weight_coeff_a, float weight_coeff_b, int length)
float implementation of weighted sum of two vectors.
static void skip_bits_long(GetBitContext *s, int n)
Definition: get_bits.h:204
static av_cold int init(AVCodecContext *avctx)
Definition: avrndec.c:35
adaptive codebook with per-frame pitch, which we interpolate to get a per-sample pitch.
Definition: wmavoice.c:69
#define INIT_VLC_STATIC(vlc, bits, a, b, c, d, e, f, g, static_size)
Definition: vlc.h:75
float synth_filter_out_buf[0x80+MAX_LSPS_ALIGN16]
aligned buffer for postfilter speech synthesis
Definition: wmavoice.c:282
static void aw_parse_coords(WMAVoiceContext *s, GetBitContext *gb, const int *pitch)
Parse the offset of the first pitch-adaptive window pulses, and the distribution of pulses between th...
Definition: wmavoice.c:1027
static const int8_t pulses[4]
Number of non-zero pulses in the MP-MLQ excitation.
Definition: g723_1.h:720
int x[10]
Definition: acelp_vectors.h:55
int size
Definition: avcodec.h:1658
int aw_n_pulses[2]
number of AW-pulses in each block; note that this number can be negative (in which case it basically ...
Definition: wmavoice.c:235
static int interpol(MBContext *s, uint32_t *color, int x, int y, int linesize)
void avpriv_copy_bits(PutBitContext *pb, const uint8_t *src, int length)
Copy the content of src to the bitstream.
Definition: bitstream.c:64
static void stabilize_lsps(double *lsps, int num)
Ensure minimum value for first item, maximum value for last value, proper spacing between each value ...
Definition: wmavoice.c:1649
static const float wmavoice_gain_codebook_fcb[128]
static const uint8_t wmavoice_dq_lsp16i1[0x640]
#define a1
Definition: regdef.h:47
static const uint8_t wmavoice_dq_lsp16r1[0x500]
int spillover_nbits
number of bits of the previous packet's last superframe preceding this packet's first full superframe...
Definition: wmavoice.c:187
void ff_set_fixed_vector(float *out, const AMRFixed *in, float scale, int size)
Add fixed vector to an array from a sparse representation.
int block_pitch_nbits
number of bits used to specify the first block's pitch value
Definition: wmavoice.c:166
static const uint8_t wmavoice_dq_lsp16i3[0x300]
float pitch_fac
Definition: acelp_vectors.h:59
static int synth_frame(AVCodecContext *ctx, GetBitContext *gb, int frame_idx, float *samples, const double *lsps, const double *prev_lsps, float *excitation, float *synth)
Synthesize output samples for a single frame.
Definition: wmavoice.c:1473
static void calc_input_response(WMAVoiceContext *s, float *lpcs, int fcb_type, float *coeffs, int remainder)
Derive denoise filter coefficients (in real domain) from the LPCs.
Definition: wmavoice.c:598
static void dequant_lsp10i(GetBitContext *gb, double *lsps)
Parse 10 independently-coded LSPs.
Definition: wmavoice.c:882
int av_log2_16bit(unsigned v)
Definition: intmath.c:31
AVCodec.
Definition: avcodec.h:3681
#define MAX_LSPS_ALIGN16
same as MAX_LSPS; needs to be multiple
Definition: wmavoice.c:48
int block_align
number of bytes per packet if constant and known or 0 Used by some WAV based audio codecs...
Definition: avcodec.h:2531
static int aw_pulse_set2(WMAVoiceContext *s, GetBitContext *gb, int block_idx, AMRFixed *fcb)
Apply second set of pitch-adaptive window pulses.
Definition: wmavoice.c:1079
static const float wmavoice_ipol1_coeffs[17 *9]
static const uint8_t wmavoice_dq_lsp16i2[0x3c0]
#define AV_CODEC_CAP_DELAY
Encoder or decoder requires flushing with NULL input at the end in order to give the complete and cor...
Definition: avcodec.h:1019
#define av_assert0(cond)
assert() equivalent, that is always enabled.
Definition: avassert.h:37
int spillover_bitsize
number of bits used to specify spillover_nbits in the packet header = ceil(log2(ctx->block_align << 3...
Definition: wmavoice.c:141
float avpriv_scalarproduct_float_c(const float *v1, const float *v2, int len)
Return the scalar product of two vectors.
Definition: float_dsp.c:108
void void avpriv_request_sample(void *avc, const char *msg,...) av_printf_format(2
Log a generic warning message about a missing feature.
int block_delta_pitch_nbits
number of bits used to specify the delta pitch between this and the last block's pitch value...
Definition: wmavoice.c:169
uint8_t bits
Definition: crc.c:296
enum AVSampleFormat sample_fmt
audio sample format
Definition: avcodec.h:2502
int mem
Definition: avisynth_c.h:821
uint8_t
#define av_cold
Definition: attributes.h:82
Sparse representation for the algebraic codebook (fixed) vector.
Definition: acelp_vectors.h:53
static const uint8_t wmavoice_dq_lsp16r3[0x600]
float delta
DCTContext dct
Definition: wmavoice.c:266
static const float wmavoice_gain_codebook_acb[128]
uint8_t log_n_blocks
log2(n_blocks)
Definition: wmavoice.c:101
comfort noise during silence generated from a hardcoded (fixed) codebook with per-frame (low) gain va...
Definition: wmavoice.c:83
int aw_first_pulse_off[2]
index of first sample to which to apply AW-pulses, or -0xff if unset
Definition: wmavoice.c:238
static av_cold int end(AVCodecContext *avctx)
Definition: avrndec.c:90
int has_residual_lsps
if set, superframes contain one set of LSPs that cover all frames, encoded as independent and residua...
Definition: wmavoice.c:191
float tilted_lpcs_pf[0x80]
aligned buffer for LPC tilting
Definition: wmavoice.c:278
uint8_t * extradata
some codecs need / can use extradata like Huffman tables.
Definition: avcodec.h:1847
static float tilt_factor(const float *lpcs, int n_lpcs)
Get the tilt factor of a formant filter from its transfer function.
Definition: wmavoice.c:585
static const uint8_t wmavoice_dq_lsp10r[0x1400]
static AVFrame * frame
static void dequant_lsps(double *lsps, int num, const uint16_t *values, const uint16_t *sizes, int n_stages, const uint8_t *table, const double *mul_q, const double *base_q)
Dequantize LSPs.
Definition: wmavoice.c:851
#define DECLARE_ALIGNED(n, t, v)
Declare a variable that is aligned in memory.
Definition: mem.h:104
static const float wmavoice_ipol2_coeffs[32]
Hamming-window sinc function (num = 32, x = [ 0, 31 ]): (0.54 + 0.46 * cos(2 * M_PI * x / (num - 1)))...
uint8_t * data
Definition: avcodec.h:1657
static int get_bits_count(const GetBitContext *s)
Definition: get_bits.h:199
static int flags
Definition: log.c:57
Pitch-adaptive window (AW) pulse signals, used in particular for low-bitrate streams.
Definition: wmavoice.c:88
float dcf_mem[2]
DC filter history.
Definition: wmavoice.c:272
void av_memcpy_backptr(uint8_t *dst, int back, int cnt)
Overlapping memcpy() implementation.
Definition: mem.c:400
bitstream reader API header.
static av_cold void wmavoice_flush(AVCodecContext *ctx)
Definition: wmavoice.c:335
float synth_history[MAX_LSPS]
see excitation_history
Definition: wmavoice.c:254
no adaptive codebook (only hardcoded fixed)
Definition: wmavoice.c:68
ptrdiff_t size
Definition: opengl_enc.c:101
double prev_lsps[MAX_LSPS]
LSPs of the last frame of the previous superframe.
Definition: wmavoice.c:219
static void copy_bits(PutBitContext *pb, const uint8_t *data, int size, GetBitContext *gb, int nbits)
Copy (unaligned) bits from gb/data/size to pb.
Definition: wmavoice.c:1859
#define av_log(a,...)
#define expf(x)
Definition: libm.h:283
#define U(x)
Definition: vp56_arith.h:37
static int get_bits_left(GetBitContext *gb)
Definition: get_bits.h:587
int size_in_bits
Definition: put_bits.h:39
static double alpha(void *priv, double x, double y)
Definition: vf_geq.c:99
#define AV_LOG_ERROR
Something went wrong and cannot losslessly be recovered.
Definition: log.h:176
static const double wmavoice_mean_lsf16[2][16]
int sframe_cache_size
set to >0 if we have data from an (incomplete) superframe from a previous packet that spilled over in...
Definition: wmavoice.c:203
static const float wmavoice_lsp10_intercoeff_b[32][2][10]
int block_pitch_range
range of the block pitch
Definition: wmavoice.c:168
static const float wmavoice_std_codebook[1000]
static const int sizes[][2]
Definition: img2dec.c:50
int last_acb_type
frame type [0-2] of the previous frame
Definition: wmavoice.c:222
#define AVERROR(e)
Definition: error.h:43
static const struct endianess table[]
#define NULL_IF_CONFIG_SMALL(x)
Return NULL if CONFIG_SMALL is true, otherwise the argument without modification. ...
Definition: internal.h:179
static const float wmavoice_gain_silence[256]
int denoise_filter_cache_size
samples in denoise_filter_cache
Definition: wmavoice.c:277
int history_nsamples
number of samples in history for signal prediction (through ACB)
Definition: wmavoice.c:144
static const uint8_t wmavoice_dq_lsp10i[0xf00]
Definition: wmavoice_data.h:33
static const float wmavoice_lsp10_intercoeff_a[32][2][10]
#define t1
Definition: regdef.h:29
static const float wmavoice_energy_table[128]
LUT for 1.071575641632 * pow(1.0331663, n - 127)
Innovation (fixed) codebook pulse sets in combinations of either single pulses or pulse pairs...
Definition: wmavoice.c:90
Windows Media Voice (WMAVoice) tables.
Definition: avfft.h:73
const char * name
Name of the codec implementation.
Definition: avcodec.h:3688
int no_repeat_mask
Definition: acelp_vectors.h:57
int denoise_tilt_corr
Whether to apply tilt correction to the Wiener filter coefficients (postfilter)
Definition: wmavoice.c:152
int aw_idx_is_ext
whether the AW index was encoded in 8 bits (instead of 6)
Definition: wmavoice.c:227
#define t3
Definition: regdef.h:31
static const uint8_t offset[127][2]
Definition: vf_spp.c:92
#define FFMAX(a, b)
Definition: common.h:94
uint16_t block_conv_table[4]
boundaries for block pitch unit/scale conversion
Definition: wmavoice.c:175
#define MUL16(ra, rb)
Definition: mathops.h:88
DCTContext dst
contexts for phase shift (in Hilbert transform, part of postfilter)
Definition: wmavoice.c:266
int lsp_def_mode
defines different sets of LSP defaults [0, 1]
Definition: wmavoice.c:159
Definition: vlc.h:26
uint64_t channel_layout
Audio channel layout.
Definition: avcodec.h:2545
void(* rdft_calc)(struct RDFTContext *s, FFTSample *z)
Definition: rdft.h:60
static int put_bits_count(PutBitContext *s)
Definition: put_bits.h:85
#define powf(x, y)
Definition: libm.h:50
int skip_bits_next
number of bits to skip at the next call to wmavoice_decode_packet() (since they're part of the previo...
Definition: wmavoice.c:196
static void dequant_lsp16r(GetBitContext *gb, double *i_lsps, const double *old, double *a1, double *a2, int q_mode)
Parse 16 independently-coded LSPs, and then derive the tables to generate LSPs for the other frames f...
Definition: wmavoice.c:977
int min_pitch_val
base value for pitch parsing code
Definition: wmavoice.c:162
WMA Voice decoding context.
Definition: wmavoice.c:130
static void wiener_denoise(WMAVoiceContext *s, int fcb_type, float *synth_pf, int size, const float *lpcs)
This function applies a Wiener filter on the (noisy) speech signal as a means to denoise it...
Definition: wmavoice.c:715
int denoise_strength
strength of denoising in Wiener filter [0-11]
Definition: wmavoice.c:150
uint8_t sframe_cache[SFRAME_CACHE_MAXSIZE+AV_INPUT_BUFFER_PADDING_SIZE]
cache for superframe data split over multiple packets
Definition: wmavoice.c:200
audio channel layout utility functions
Definition: avfft.h:97
#define FFMIN(a, b)
Definition: common.h:96
#define log_range(var, assign)
#define MAX_LSPS
maximum filter order
Definition: wmavoice.c:47
static VLC frame_type_vlc
Frame type VLC coding.
Definition: wmavoice.c:62
int pitch_nbits
number of bits used to specify the pitch value in the frame header
Definition: wmavoice.c:164
#define MAX_BLOCKS
maximum number of blocks per frame
Definition: wmavoice.c:46
float denoise_coeffs_pf[0x80]
aligned buffer for denoise coefficients
Definition: wmavoice.c:280
static void dequant_lsp10r(GetBitContext *gb, double *i_lsps, const double *old, double *a1, double *a2, int q_mode)
Parse 10 independently-coded LSPs, and then derive the tables to generate LSPs for the other frames f...
Definition: wmavoice.c:908
float y[10]
Definition: acelp_vectors.h:56
AVFormatContext * ctx
Definition: movenc.c:48
static av_always_inline unsigned UMULH(unsigned a, unsigned b)
Definition: mathops.h:68
#define a2
Definition: regdef.h:48
Definition: dct.h:32
float sin[511]
Definition: wmavoice.c:268
static av_always_inline int get_vlc2(GetBitContext *s, VLC_TYPE(*table)[2], int bits, int max_depth)
Parse a vlc code.
Definition: get_bits.h:554
Definition: avfft.h:72
int n
Definition: avisynth_c.h:684
static int kalman_smoothen(WMAVoiceContext *s, int pitch, const float *in, float *out, int size)
Kalman smoothing function.
Definition: wmavoice.c:539
void ff_tilt_compensation(float *mem, float tilt, float *samples, int size)
Apply tilt compensation filter, 1 - tilt * z-1.
static const float wmavoice_gain_universal[64]
void ff_acelp_lspd2lpc(const double *lsp, float *lpc, int lp_half_order)
Reconstruct LPC coefficients from the line spectral pair frequencies.
Definition: lsp.c:209
static av_cold int wmavoice_decode_init(AVCodecContext *ctx)
Set up decoder with parameters from demuxer (extradata etc.).
Definition: wmavoice.c:366
#define AVERROR_PATCHWELCOME
Not yet implemented in FFmpeg, patches welcome.
Definition: error.h:62
static const uint8_t last_coeff[3]
Definition: qdm2data.h:257
static const struct frame_type_desc frame_descs[17]
float denoise_filter_cache[MAX_FRAMESIZE]
Definition: wmavoice.c:276
Libavcodec external API header.
int sample_rate
samples per second
Definition: avcodec.h:2494
void AAC_RENAME() ff_sine_window_init(INTFLOAT *window, int n)
Generate a sine window.
static int wmavoice_decode_packet(AVCodecContext *ctx, void *data, int *got_frame_ptr, AVPacket *avpkt)
Packet decoding: a packet is anything that the (ASF) demuxer contains, and we expect that the demuxer...
Definition: wmavoice.c:1888
main external API structure.
Definition: avcodec.h:1732
static int parse_packet_header(WMAVoiceContext *s)
Parse the packet header at the start of each packet (input data to this decoder). ...
Definition: wmavoice.c:1827
int ff_get_buffer(AVCodecContext *avctx, AVFrame *frame, int flags)
Get a buffer for a frame.
Definition: utils.c:953
uint8_t pi<< 24) CONV_FUNC_GROUP(AV_SAMPLE_FMT_FLT, float, AV_SAMPLE_FMT_U8, uint8_t,(*(constuint8_t *) pi-0x80)*(1.0f/(1<< 7))) CONV_FUNC_GROUP(AV_SAMPLE_FMT_DBL, double, AV_SAMPLE_FMT_U8, uint8_t,(*(constuint8_t *) pi-0x80)*(1.0/(1<< 7))) CONV_FUNC_GROUP(AV_SAMPLE_FMT_U8, uint8_t, AV_SAMPLE_FMT_S16, int16_t,(*(constint16_t *) pi >>8)+0x80) CONV_FUNC_GROUP(AV_SAMPLE_FMT_FLT, float, AV_SAMPLE_FMT_S16, int16_t,*(constint16_t *) pi *(1.0f/(1<< 15))) CONV_FUNC_GROUP(AV_SAMPLE_FMT_DBL, double, AV_SAMPLE_FMT_S16, int16_t,*(constint16_t *) pi *(1.0/(1<< 15))) CONV_FUNC_GROUP(AV_SAMPLE_FMT_U8, uint8_t, AV_SAMPLE_FMT_S32, int32_t,(*(constint32_t *) pi >>24)+0x80) CONV_FUNC_GROUP(AV_SAMPLE_FMT_FLT, float, AV_SAMPLE_FMT_S32, int32_t,*(constint32_t *) pi *(1.0f/(1U<< 31))) CONV_FUNC_GROUP(AV_SAMPLE_FMT_DBL, double, AV_SAMPLE_FMT_S32, int32_t,*(constint32_t *) pi *(1.0/(1U<< 31))) CONV_FUNC_GROUP(AV_SAMPLE_FMT_U8, uint8_t, AV_SAMPLE_FMT_FLT, float, av_clip_uint8(lrintf(*(constfloat *) pi *(1<< 7))+0x80)) CONV_FUNC_GROUP(AV_SAMPLE_FMT_S16, int16_t, AV_SAMPLE_FMT_FLT, float, av_clip_int16(lrintf(*(constfloat *) pi *(1<< 15)))) CONV_FUNC_GROUP(AV_SAMPLE_FMT_S32, int32_t, AV_SAMPLE_FMT_FLT, float, av_clipl_int32(llrintf(*(constfloat *) pi *(1U<< 31)))) CONV_FUNC_GROUP(AV_SAMPLE_FMT_U8, uint8_t, AV_SAMPLE_FMT_DBL, double, av_clip_uint8(lrint(*(constdouble *) pi *(1<< 7))+0x80)) CONV_FUNC_GROUP(AV_SAMPLE_FMT_S16, int16_t, AV_SAMPLE_FMT_DBL, double, av_clip_int16(lrint(*(constdouble *) pi *(1<< 15)))) CONV_FUNC_GROUP(AV_SAMPLE_FMT_S32, int32_t, AV_SAMPLE_FMT_DBL, double, av_clipl_int32(llrint(*(constdouble *) pi *(1U<< 31))))#defineSET_CONV_FUNC_GROUP(ofmt, ifmt) staticvoidset_generic_function(AudioConvert *ac){}voidff_audio_convert_free(AudioConvert **ac){if(!*ac) return;ff_dither_free(&(*ac) ->dc);av_freep(ac);}AudioConvert *ff_audio_convert_alloc(AVAudioResampleContext *avr, enumAVSampleFormatout_fmt, enumAVSampleFormatin_fmt, intchannels, intsample_rate, 
intapply_map){AudioConvert *ac;intin_planar, out_planar;ac=av_mallocz(sizeof(*ac));if(!ac) returnNULL;ac->avr=avr;ac->out_fmt=out_fmt;ac->in_fmt=in_fmt;ac->channels=channels;ac->apply_map=apply_map;if(avr->dither_method!=AV_RESAMPLE_DITHER_NONE &&av_get_packed_sample_fmt(out_fmt)==AV_SAMPLE_FMT_S16 &&av_get_bytes_per_sample(in_fmt)>2){ac->dc=ff_dither_alloc(avr, out_fmt, in_fmt, channels, sample_rate, apply_map);if(!ac->dc){av_free(ac);returnNULL;}returnac;}in_planar=ff_sample_fmt_is_planar(in_fmt, channels);out_planar=ff_sample_fmt_is_planar(out_fmt, channels);if(in_planar==out_planar){ac->func_type=CONV_FUNC_TYPE_FLAT;ac->planes=in_planar?ac->channels:1;}elseif(in_planar) ac->func_type=CONV_FUNC_TYPE_INTERLEAVE;elseac->func_type=CONV_FUNC_TYPE_DEINTERLEAVE;set_generic_function(ac);if(ARCH_AARCH64) ff_audio_convert_init_aarch64(ac);if(ARCH_ARM) ff_audio_convert_init_arm(ac);if(ARCH_X86) ff_audio_convert_init_x86(ac);returnac;}intff_audio_convert(AudioConvert *ac, AudioData *out, AudioData *in){intuse_generic=1;intlen=in->nb_samples;intp;if(ac->dc){av_log(ac->avr, AV_LOG_TRACE,"%dsamples-audio_convert:%sto%s(dithered)\n", len, av_get_sample_fmt_name(ac->in_fmt), av_get_sample_fmt_name(ac->out_fmt));returnff_convert_dither(ac-> in
AVCodec ff_wmavoice_decoder
Definition: wmavoice.c:1983
int8_t vbm_tree[25]
converts VLC codes to frame type
Definition: wmavoice.c:139
int extradata_size
Definition: avcodec.h:1848
static unsigned int get_bits1(GetBitContext *s)
Definition: get_bits.h:313
static void synth_block(WMAVoiceContext *s, GetBitContext *gb, int block_idx, int size, int block_pitch_sh2, const double *lsps, const double *prev_lsps, const struct frame_type_desc *frame_desc, float *excitation, float *synth)
Parse data in a single block.
Definition: wmavoice.c:1431
static av_cold int wmavoice_decode_end(AVCodecContext *ctx)
Definition: wmavoice.c:1969
static void skip_bits(GetBitContext *s, int n)
Definition: get_bits.h:306
av_cold int ff_dct_init(DCTContext *s, int nbits, enum DCTTransformType inverse)
Set up DCT.
Definition: dct.c:177
#define AV_CODEC_CAP_SUBFRAMES
Codec can output multiple frames per AVPacket Normally demuxers return one frame at a time...
Definition: avcodec.h:1044
int pitch_diff_sh16
((cur_pitch_val - last_pitch_val) << 16) / MAX_FRAMESIZE
Definition: wmavoice.c:223
static int init_get_bits(GetBitContext *s, const uint8_t *buffer, int bit_size)
Initialize GetBitContext.
Definition: get_bits.h:425
#define MAX_SFRAMESIZE
maximum number of samples per superframe
Definition: wmavoice.c:53
int lsp_q_mode
defines quantizer defaults [0, 1]
Definition: wmavoice.c:158
int frame_cntr
current frame index [0 - 0xFFFE]; is only used for comfort noise in pRNG()
Definition: wmavoice.c:246
void ff_celp_lp_zero_synthesis_filterf(float *out, const float *filter_coeffs, const float *in, int buffer_length, int filter_length)
LP zero synthesis filter.
Definition: celp_filters.c:199
#define u(width,...)
static void adaptive_gain_control(float *out, const float *in, const float *speech_synth, int size, float alpha, float *gain_mem)
Adaptive gain control (as used in postfilter).
Definition: wmavoice.c:498
static const float mean_lsf[10]
Definition: siprdata.h:27
#define SFRAME_CACHE_MAXSIZE
maximum cache size for frame data that
Definition: wmavoice.c:55
uint8_t fcb_type
Fixed codebook type (FCB_TYPE_*)
Definition: wmavoice.c:103
static void dequant_lsp16i(GetBitContext *gb, double *lsps)
Parse 16 independently-coded LSPs.
Definition: wmavoice.c:944
RDFTContext irdft
contexts for FFT-calculation in the postfilter (for denoise filter)
Definition: wmavoice.c:264
uint8_t * data[AV_NUM_DATA_POINTERS]
pointer to the picture/channel planes.
Definition: frame.h:201
static int synth_superframe(AVCodecContext *ctx, AVFrame *frame, int *got_frame_ptr)
Synthesize output samples for a single superframe.
Definition: wmavoice.c:1695
#define M_LN10
Definition: mathematics.h:43
hardcoded (fixed) codebook with per-block gain values
Definition: wmavoice.c:86
static void synth_block_hardcoded(WMAVoiceContext *s, GetBitContext *gb, int block_idx, int size, const struct frame_type_desc *frame_desc, float *excitation)
Parse hardcoded signal for a single block.
Definition: wmavoice.c:1266
uint8_t n_blocks
amount of blocks per frame (each block (contains 160/n_blocks samples)
Definition: wmavoice.c:99
common internal api header.
static void flush_put_bits(PutBitContext *s)
Pad the end of the output stream with zeros.
Definition: put_bits.h:101
if(ret< 0)
Definition: vf_mcdeint.c:282
static av_cold void wmavoice_init_static_data(AVCodec *codec)
Definition: wmavoice.c:313
int pitch_lag
Definition: acelp_vectors.h:58
float excitation_history[MAX_SIGNAL_HISTORY]
cache of the signal of previous superframes, used as a history for signal generation ...
Definition: wmavoice.c:250
static void init_put_bits(PutBitContext *s, uint8_t *buffer, int buffer_size)
Initialize the PutBitContext s.
Definition: put_bits.h:48
int last_pitch_val
pitch value of the previous frame
Definition: wmavoice.c:221
#define AV_INPUT_BUFFER_PADDING_SIZE
Required number of additionally allocated bytes at the end of the input bitstream for decoding...
Definition: avcodec.h:769
void * priv_data
Definition: avcodec.h:1774
#define MAX_FRAMESIZE
maximum number of samples per frame
Definition: wmavoice.c:51
float silence_gain
set for use in blocks if ACB_TYPE_NONE
Definition: wmavoice.c:225
static const double wmavoice_mean_lsf10[2][10]
static const int16_t coeffs[]
int len
int channels
number of audio channels
Definition: avcodec.h:2495
VLC_TYPE(* table)[2]
code, bits
Definition: vlc.h:28
#define lrint
Definition: tablegen.h:53
av_cold void ff_dct_end(DCTContext *s)
Definition: dct.c:220
void ff_acelp_interpolatef(float *out, const float *in, const float *filter_coeffs, int precision, int frac_pos, int filter_length, int length)
Floating point version of ff_acelp_interpolate()
Definition: acelp_filters.c:78
int block_delta_pitch_hrange
1/2 range of the delta (full range is from -this to +this-1)
Definition: wmavoice.c:173
int max_pitch_val
max value + 1 for pitch parsing
Definition: wmavoice.c:163
#define av_uninit(x)
Definition: attributes.h:148
int lsps
number of LSPs per frame [10 or 16]
Definition: wmavoice.c:157
FILE * out
Definition: movenc.c:54
#define MAX_FRAMES
maximum number of frames per superframe
Definition: wmavoice.c:50
static const float wmavoice_lsp16_intercoeff_b[32][2][16]
static int decode(AVCodecContext *avctx, AVFrame *frame, int *got_frame, AVPacket *pkt)
Definition: ffmpeg.c:2257
PutBitContext pb
bitstream writer for sframe_cache
Definition: wmavoice.c:208
#define M_PI
Definition: mathematics.h:52
uint8_t acb_type
Adaptive codebook type (ACB_TYPE_*)
Definition: wmavoice.c:102
static const float wmavoice_denoise_power_table[12][64]
LUT for f(x,y) = pow((y + 6.9) / 64, 0.025 * (x + 1)).
int dc_level
Predicted amount of DC noise, based on which a DC removal filter is used.
Definition: wmavoice.c:154
#define VLC_NBITS
number of bits to read per VLC iteration
Definition: wmavoice.c:57
static const float wmavoice_lsp16_intercoeff_a[32][2][16]
Definition: avfft.h:96
float cos[511]
8-bit cosine/sine windows over [-pi,pi] range
Definition: wmavoice.c:268
#define AV_CH_LAYOUT_MONO
av_cold int ff_rdft_init(RDFTContext *s, int nbits, enum RDFTransformType trans)
Set up a real FFT.
Definition: rdft.c:99
int aw_pulse_range
the range over which aw_pulse_set1() can apply the pulse, relative to the value in aw_first_pulse_off...
Definition: wmavoice.c:229
float min
uint64_t_TMPL AV_WL64 unsigned int_TMPL AV_RL32
Definition: bytestream.h:87
This structure stores compressed data.
Definition: avcodec.h:1634
int nb_samples
number of audio samples (per channel) described by this frame
Definition: frame.h:244
float zero_exc_pf[MAX_SIGNAL_HISTORY+MAX_SFRAMESIZE]
zero filter output (i.e. excitation) by postfilter
Definition: wmavoice.c:273
#define AV_CODEC_CAP_DR1
Codec uses get_buffer() for allocating buffers and supports custom allocators.
Definition: avcodec.h:994
for(j=16;j >0;--j)
static void synth_block_fcb_acb(WMAVoiceContext *s, GetBitContext *gb, int block_idx, int size, int block_pitch_sh2, const struct frame_type_desc *frame_desc, float *excitation)
Parse FCB/ACB signal for a single block.
Definition: wmavoice.c:1297
uint8_t dbl_pulses
how many pulse vectors have pulse pairs (rather than just one single pulse) only if fcb_type == FCB_T...
Definition: wmavoice.c:104
#define t2
Definition: regdef.h:30
#define MAX_SIGNAL_HISTORY
maximum excitation signal history
Definition: wmavoice.c:52
#define MULH
Definition: mathops.h:42
GetBitContext gb
packet bitreader.
Definition: wmavoice.c:135
static uint8_t tmp[11]
Definition: aes_ctr.c:26
bitstream writer API