FFmpeg
tx_float_init.c
Go to the documentation of this file.
1 /*
2  * This file is part of FFmpeg.
3  *
4  * FFmpeg is free software; you can redistribute it and/or
5  * modify it under the terms of the GNU Lesser General Public
6  * License as published by the Free Software Foundation; either
7  * version 2.1 of the License, or (at your option) any later version.
8  *
9  * FFmpeg is distributed in the hope that it will be useful,
10  * but WITHOUT ANY WARRANTY; without even the implied warranty of
11  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
12  * Lesser General Public License for more details.
13  *
14  * You should have received a copy of the GNU Lesser General Public
15  * License along with FFmpeg; if not, write to the Free Software
16  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
17  */
18 
19 #define TX_FLOAT
20 #include "libavutil/tx_priv.h"
21 #include "libavutil/attributes.h"
22 #include "libavutil/x86/cpu.h"
23 
24 #include "config.h"
25 
26 TX_DECL_FN(fft2, sse3)
27 TX_DECL_FN(fft4_fwd, sse2)
28 TX_DECL_FN(fft4_inv, sse2)
29 TX_DECL_FN(fft8, sse3)
30 TX_DECL_FN(fft8_ns, sse3)
31 TX_DECL_FN(fft8, avx)
32 TX_DECL_FN(fft8_ns, avx)
33 TX_DECL_FN(fft15, avx2)
34 TX_DECL_FN(fft15_ns, avx2)
35 TX_DECL_FN(fft16, avx)
36 TX_DECL_FN(fft16_ns, avx)
37 TX_DECL_FN(fft16, fma3)
38 TX_DECL_FN(fft16_ns, fma3)
39 TX_DECL_FN(fft32, avx)
40 TX_DECL_FN(fft32_ns, avx)
41 TX_DECL_FN(fft32, fma3)
42 TX_DECL_FN(fft32_ns, fma3)
43 TX_DECL_FN(fft_sr, avx)
44 TX_DECL_FN(fft_sr_ns, avx)
45 TX_DECL_FN(fft_sr, fma3)
46 TX_DECL_FN(fft_sr_ns, fma3)
47 TX_DECL_FN(fft_sr, avx2)
48 TX_DECL_FN(fft_sr_ns, avx2)
49 
50 TX_DECL_FN(fft_pfa_15xM, avx2)
51 TX_DECL_FN(fft_pfa_15xM_ns, avx2)
52 
53 TX_DECL_FN(mdct_inv, avx2)
54 
55 TX_DECL_FN(fft2_asm, sse3)
56 TX_DECL_FN(fft4_fwd_asm, sse2)
57 TX_DECL_FN(fft4_inv_asm, sse2)
58 TX_DECL_FN(fft8_asm, sse3)
59 TX_DECL_FN(fft8_asm, avx)
60 TX_DECL_FN(fft16_asm, avx)
61 TX_DECL_FN(fft16_asm, fma3)
62 TX_DECL_FN(fft32_asm, avx)
63 TX_DECL_FN(fft32_asm, fma3)
64 TX_DECL_FN(fft_sr_asm, avx)
65 TX_DECL_FN(fft_sr_asm, fma3)
66 TX_DECL_FN(fft_sr_asm, avx2)
67 
68 TX_DECL_FN(fft_pfa_15xM_asm, avx2)
69 
70 #define DECL_INIT_FN(basis, interleave) \
71 static av_cold int b ##basis## _i ##interleave(AVTXContext *s, \
72  const FFTXCodelet *cd, \
73  uint64_t flags, \
74  FFTXCodeletOptions *opts, \
75  int len, int inv, \
76  const void *scale) \
77 { \
78  ff_tx_init_tabs_float(len); \
79  if (cd->max_len == 2) \
80  return ff_tx_gen_ptwo_revtab(s, opts); \
81  else \
82  return ff_tx_gen_split_radix_parity_revtab(s, len, inv, opts, \
83  basis, interleave); \
84 }
85 
86 DECL_INIT_FN(8, 0)
87 DECL_INIT_FN(8, 2)
88 
89 static av_cold int factor_init(AVTXContext *s, const FFTXCodelet *cd,
90  uint64_t flags, FFTXCodeletOptions *opts,
91  int len, int inv, const void *scale)
92 {
93  int ret;
94 
95  /* The transformations below are performed in the gather domain,
96  * so override the option and let the infrastructure convert the map
97  * to SCATTER if needed. */
99 
100  TX_TAB(ff_tx_init_tabs)(len);
101 
102  if (len == 15)
103  ret = ff_tx_gen_pfa_input_map(s, &sub_opts, 3, 5);
104  else
105  ret = ff_tx_gen_default_map(s, &sub_opts);
106 
107  if (ret < 0)
108  return ret;
109 
110  if (len == 15) {
111  int cnt = 0, tmp[15];
112 
113  /* Special permutation to simplify loads in the pre-permuted version */
114  memcpy(tmp, s->map, 15*sizeof(*tmp));
115  for (int i = 1; i < 15; i += 3) {
116  s->map[cnt] = tmp[i];
117  cnt++;
118  }
119  for (int i = 2; i < 15; i += 3) {
120  s->map[cnt] = tmp[i];
121  cnt++;
122  }
123  for (int i = 0; i < 15; i += 3) {
124  s->map[cnt] = tmp[i];
125  cnt++;
126  }
127  memmove(&s->map[7], &s->map[6], 4*sizeof(int));
128  memmove(&s->map[3], &s->map[1], 4*sizeof(int));
129  s->map[1] = tmp[2];
130  s->map[2] = tmp[0];
131  }
132 
133  return 0;
134 }
135 
136 static av_cold int m_inv_init(AVTXContext *s, const FFTXCodelet *cd,
137  uint64_t flags, FFTXCodeletOptions *opts,
138  int len, int inv, const void *scale)
139 {
140  int ret;
141  FFTXCodeletOptions sub_opts = { .map_dir = FF_TX_MAP_GATHER };
142 
143  s->scale_d = *((SCALE_TYPE *)scale);
144  s->scale_f = s->scale_d;
145 
146  flags &= ~FF_TX_OUT_OF_PLACE; /* We want the subtransform to be */
147  flags |= AV_TX_INPLACE; /* in-place */
148  flags |= FF_TX_PRESHUFFLE; /* This function handles the permute step */
149  flags |= FF_TX_ASM_CALL; /* We want an assembly function, not C */
150 
151  if ((ret = ff_tx_init_subtx(s, TX_TYPE(FFT), flags, &sub_opts, len >> 1,
152  inv, scale)))
153  return ret;
154 
155  s->map = av_malloc(len*sizeof(*s->map));
156  if (!s->map)
157  return AVERROR(ENOMEM);
158 
159  memcpy(s->map, s->sub->map, (len >> 1)*sizeof(*s->map));
160  /* Invert lookup table for unstrided path */
161  for (int i = 0; i < (len >> 1); i++)
162  s->map[(len >> 1) + s->map[i]] = i;
163 
164  if ((ret = ff_tx_mdct_gen_exp_float(s, s->map)))
165  return ret;
166 
167  return 0;
168 }
169 
171  const FFTXCodelet *cd,
172  uint64_t flags,
174  int len, int inv,
175  const void *scale)
176 {
177  int ret;
178  int sub_len = len / cd->factors[0];
179  FFTXCodeletOptions sub_opts = { .map_dir = FF_TX_MAP_SCATTER };
180 
181  flags &= ~FF_TX_OUT_OF_PLACE; /* We want the subtransform to be */
182  flags |= AV_TX_INPLACE; /* in-place */
183  flags |= FF_TX_PRESHUFFLE; /* This function handles the permute step */
184  flags |= FF_TX_ASM_CALL; /* We want an assembly function, not C */
185 
186  if ((ret = ff_tx_init_subtx(s, TX_TYPE(FFT), flags, &sub_opts,
187  sub_len, inv, scale)))
188  return ret;
189 
190  if ((ret = ff_tx_gen_compound_mapping(s, opts, s->inv, cd->factors[0], sub_len)))
191  return ret;
192 
193  if (cd->factors[0] == 15) {
194  int tmp[15];
195 
196  /* Our 15-point transform is also a compound one, so embed its input map */
197  TX_EMBED_INPUT_PFA_MAP(s->map, len, 3, 5);
198 
199  /* Special permutation to simplify loads in the pre-permuted version */
200  for (int k = 0; k < s->sub[0].len; k++) {
201  int cnt = 0;
202  memcpy(tmp, &s->map[k*15], 15*sizeof(*tmp));
203  for (int i = 1; i < 15; i += 3) {
204  s->map[k*15 + cnt] = tmp[i];
205  cnt++;
206  }
207  for (int i = 2; i < 15; i += 3) {
208  s->map[k*15 + cnt] = tmp[i];
209  cnt++;
210  }
211  for (int i = 0; i < 15; i += 3) {
212  s->map[k*15 + cnt] = tmp[i];
213  cnt++;
214  }
215  memmove(&s->map[k*15 + 7], &s->map[k*15 + 6], 4*sizeof(int));
216  memmove(&s->map[k*15 + 3], &s->map[k*15 + 1], 4*sizeof(int));
217  s->map[k*15 + 1] = tmp[2];
218  s->map[k*15 + 2] = tmp[0];
219  }
220  }
221 
222  if (!(s->tmp = av_malloc(len*sizeof(*s->tmp))))
223  return AVERROR(ENOMEM);
224 
225  TX_TAB(ff_tx_init_tabs)(len / sub_len);
226 
227  return 0;
228 }
229 
231  TX_DEF(fft2, FFT, 2, 2, 2, 0, 128, NULL, sse3, SSE3, AV_TX_INPLACE, 0),
232  TX_DEF(fft2_asm, FFT, 2, 2, 2, 0, 192, b8_i0, sse3, SSE3,
234  TX_DEF(fft2, FFT, 2, 2, 2, 0, 192, b8_i0, sse3, SSE3, AV_TX_INPLACE | FF_TX_PRESHUFFLE, 0),
235  TX_DEF(fft4_fwd, FFT, 4, 4, 2, 0, 128, NULL, sse2, SSE2, AV_TX_INPLACE | FF_TX_FORWARD_ONLY, 0),
236  TX_DEF(fft4_fwd_asm, FFT, 4, 4, 2, 0, 192, b8_i0, sse2, SSE2,
238  TX_DEF(fft4_inv_asm, FFT, 4, 4, 2, 0, 128, NULL, sse2, SSE2,
240  TX_DEF(fft4_fwd, FFT, 4, 4, 2, 0, 192, b8_i0, sse2, SSE2, AV_TX_INPLACE | FF_TX_PRESHUFFLE, 0),
241  TX_DEF(fft4_inv, FFT, 4, 4, 2, 0, 128, NULL, sse2, SSE2, AV_TX_INPLACE | FF_TX_INVERSE_ONLY, 0),
242  TX_DEF(fft8, FFT, 8, 8, 2, 0, 128, b8_i0, sse3, SSE3, AV_TX_INPLACE, 0),
243  TX_DEF(fft8_asm, FFT, 8, 8, 2, 0, 192, b8_i0, sse3, SSE3,
245  TX_DEF(fft8_ns, FFT, 8, 8, 2, 0, 192, b8_i0, sse3, SSE3, AV_TX_INPLACE | FF_TX_PRESHUFFLE, 0),
246  TX_DEF(fft8, FFT, 8, 8, 2, 0, 256, b8_i0, avx, AVX, AV_TX_INPLACE, AV_CPU_FLAG_AVXSLOW),
247  TX_DEF(fft8_asm, FFT, 8, 8, 2, 0, 320, b8_i0, avx, AVX,
249  TX_DEF(fft8_ns, FFT, 8, 8, 2, 0, 320, b8_i0, avx, AVX, AV_TX_INPLACE | FF_TX_PRESHUFFLE,
251  TX_DEF(fft16, FFT, 16, 16, 2, 0, 256, b8_i2, avx, AVX, AV_TX_INPLACE, AV_CPU_FLAG_AVXSLOW),
252  TX_DEF(fft16_asm, FFT, 16, 16, 2, 0, 320, b8_i2, avx, AVX,
254  TX_DEF(fft16_ns, FFT, 16, 16, 2, 0, 320, b8_i2, avx, AVX, AV_TX_INPLACE | FF_TX_PRESHUFFLE,
256  TX_DEF(fft16, FFT, 16, 16, 2, 0, 288, b8_i2, fma3, FMA3, AV_TX_INPLACE, AV_CPU_FLAG_AVXSLOW),
257  TX_DEF(fft16_asm, FFT, 16, 16, 2, 0, 352, b8_i2, fma3, FMA3,
259  TX_DEF(fft16_ns, FFT, 16, 16, 2, 0, 352, b8_i2, fma3, FMA3, AV_TX_INPLACE | FF_TX_PRESHUFFLE,
261 
262 #if ARCH_X86_64
263  TX_DEF(fft32, FFT, 32, 32, 2, 0, 256, b8_i2, avx, AVX, AV_TX_INPLACE, AV_CPU_FLAG_AVXSLOW),
264  TX_DEF(fft32_asm, FFT, 32, 32, 2, 0, 320, b8_i2, avx, AVX,
266  TX_DEF(fft32_ns, FFT, 32, 32, 2, 0, 320, b8_i2, avx, AVX, AV_TX_INPLACE | FF_TX_PRESHUFFLE,
268  TX_DEF(fft32, FFT, 32, 32, 2, 0, 288, b8_i2, fma3, FMA3, AV_TX_INPLACE, AV_CPU_FLAG_AVXSLOW),
269  TX_DEF(fft32_asm, FFT, 32, 32, 2, 0, 352, b8_i2, fma3, FMA3,
271  TX_DEF(fft32_ns, FFT, 32, 32, 2, 0, 352, b8_i2, fma3, FMA3, AV_TX_INPLACE | FF_TX_PRESHUFFLE,
273  TX_DEF(fft_sr, FFT, 64, 2097152, 2, 0, 256, b8_i2, avx, AVX, 0, AV_CPU_FLAG_AVXSLOW),
274  TX_DEF(fft_sr_asm, FFT, 64, 2097152, 2, 0, 320, b8_i2, avx, AVX,
276  TX_DEF(fft_sr_ns, FFT, 64, 2097152, 2, 0, 320, b8_i2, avx, AVX, AV_TX_INPLACE | FF_TX_PRESHUFFLE,
278  TX_DEF(fft_sr, FFT, 64, 2097152, 2, 0, 288, b8_i2, fma3, FMA3, 0, AV_CPU_FLAG_AVXSLOW),
279  TX_DEF(fft_sr_asm, FFT, 64, 2097152, 2, 0, 352, b8_i2, fma3, FMA3,
281  TX_DEF(fft_sr_ns, FFT, 64, 2097152, 2, 0, 352, b8_i2, fma3, FMA3, AV_TX_INPLACE | FF_TX_PRESHUFFLE,
283 
284 #if HAVE_AVX2_EXTERNAL
285  TX_DEF(fft15, FFT, 15, 15, 15, 0, 320, factor_init, avx2, AVX2,
287  TX_DEF(fft15_ns, FFT, 15, 15, 15, 0, 384, factor_init, avx2, AVX2,
289 
290  TX_DEF(fft_sr, FFT, 64, 2097152, 2, 0, 320, b8_i2, avx2, AVX2, 0,
292  TX_DEF(fft_sr_asm, FFT, 64, 2097152, 2, 0, 384, b8_i2, avx2, AVX2,
294  TX_DEF(fft_sr_ns, FFT, 64, 2097152, 2, 0, 384, b8_i2, avx2, AVX2, AV_TX_INPLACE | FF_TX_PRESHUFFLE,
296 
297  TX_DEF(fft_pfa_15xM, FFT, 60, TX_LEN_UNLIMITED, 15, 2, 320, fft_pfa_init, avx2, AVX2,
299  TX_DEF(fft_pfa_15xM_asm, FFT, 60, TX_LEN_UNLIMITED, 15, 2, 384, fft_pfa_init, avx2, AVX2,
301  TX_DEF(fft_pfa_15xM_ns, FFT, 60, TX_LEN_UNLIMITED, 15, 2, 384, fft_pfa_init, avx2, AVX2,
303 
304  TX_DEF(mdct_inv, MDCT, 16, TX_LEN_UNLIMITED, 2, TX_FACTOR_ANY, 384, m_inv_init, avx2, AVX2,
306 #endif
307 #endif
308 
309  NULL,
310 };
cpu.h
AVERROR
Filter the word “frame” indicates either a video frame or a group of audio as stored in an AVFrame structure Format for each input and each output the list of supported formats For video that means pixel format For audio that means channel sample they are references to shared objects When the negotiation mechanism computes the intersection of the formats supported at each end of a all references to both lists are replaced with a reference to the intersection And when a single format is eventually chosen for a link amongst the remaining all references to the list are updated That means that if a filter requires that its input and output have the same format amongst a supported all it has to do is use a reference to the same list of formats query_formats can leave some formats unset and return AVERROR(EAGAIN) to cause the negotiation mechanism to try again later. That can be used by filters with complex requirements to use the format negotiated on one link to set the formats supported on another. Frame references ownership and permissions
AVTXContext
Definition: tx_priv.h:235
tmp
static uint8_t tmp[11]
Definition: aes_ctr.c:28
FFTXCodeletOptions
Definition: tx_priv.h:183
TX_DEF
#define TX_DEF(fn, tx_type, len_min, len_max, f1, f2, p, init_fn, suffix, cf, cd_flags, cf2)
Definition: tx_priv.h:71
fft15
static av_always_inline void fft15(TXComplex *out, TXComplex *in, ptrdiff_t stride)
Definition: tx_template.c:467
ff_tx_gen_compound_mapping
int ff_tx_gen_compound_mapping(AVTXContext *s, FFTXCodeletOptions *opts, int inv, int n, int m)
Definition: tx.c:74
av_malloc
#define av_malloc(s)
Definition: tableprint_vlc.h:30
FF_TX_MAP_GATHER
@ FF_TX_MAP_GATHER
Definition: tx_priv.h:176
FFTXCodeletOptions::map_dir
FFTXMapDirection map_dir
Definition: tx_priv.h:187
AV_CPU_FLAG_SLOW_GATHER
#define AV_CPU_FLAG_SLOW_GATHER
CPU has slow gathers.
Definition: cpu.h:58
FFTXCodelet::factors
int factors[TX_MAX_FACTORS]
Definition: tx_priv.h:208
av_cold
#define av_cold
Definition: attributes.h:90
s
#define s(width, name)
Definition: cbs_vp9.c:198
ff_tx_mdct_gen_exp_float
int ff_tx_mdct_gen_exp_float(AVTXContext *s, int *pre_tab)
FF_TX_FORWARD_ONLY
#define FF_TX_FORWARD_ONLY
Definition: tx_priv.h:158
factor_init
static av_cold int factor_init(AVTXContext *s, const FFTXCodelet *cd, uint64_t flags, FFTXCodeletOptions *opts, int len, int inv, const void *scale)
Definition: tx_float_init.c:89
opts
AVDictionary * opts
Definition: movenc.c:50
NULL
#define NULL
Definition: coverity.c:32
AV_TX_INPLACE
@ AV_TX_INPLACE
Allows for in-place transformations, where input == output.
Definition: tx.h:161
FF_TX_OUT_OF_PLACE
#define FF_TX_OUT_OF_PLACE
Definition: tx_priv.h:154
FF_TX_PRESHUFFLE
#define FF_TX_PRESHUFFLE
Definition: tx_priv.h:156
ff_tx_gen_default_map
int ff_tx_gen_default_map(AVTXContext *s, FFTXCodeletOptions *opts)
Definition: tx.c:524
scale
static void scale(int *out, const int *in, const int w, const int h, const int shift)
Definition: vvc_intra.c:291
AV_CPU_FLAG_AVXSLOW
#define AV_CPU_FLAG_AVXSLOW
AVX supported, but slow when using YMM registers (e.g. Bulldozer)
Definition: cpu.h:48
attributes.h
TX_EMBED_INPUT_PFA_MAP
#define TX_EMBED_INPUT_PFA_MAP(map, tot_len, d1, d2)
Definition: tx_priv.h:271
i
#define i(width, name, range_min, range_max)
Definition: cbs_h2645.c:255
len
int len
Definition: vorbis_enc_data.h:426
FF_TX_MAP_SCATTER
@ FF_TX_MAP_SCATTER
Definition: tx_priv.h:179
TX_LEN_UNLIMITED
#define TX_LEN_UNLIMITED
Definition: tx_priv.h:216
tx_priv.h
ret
ret
Definition: filter_design.txt:187
ff_tx_init_subtx
av_cold int ff_tx_init_subtx(AVTXContext *s, enum AVTXType type, uint64_t flags, FFTXCodeletOptions *opts, int len, int inv, const void *scale)
Definition: tx.c:711
TX_FACTOR_ANY
#define TX_FACTOR_ANY
Definition: tx_priv.h:209
FF_TX_INVERSE_ONLY
#define FF_TX_INVERSE_ONLY
Definition: tx_priv.h:157
fft_pfa_init
static av_cold int fft_pfa_init(AVTXContext *s, const FFTXCodelet *cd, uint64_t flags, FFTXCodeletOptions *opts, int len, int inv, const void *scale)
Definition: tx_float_init.c:170
FFTXCodelet
Definition: tx_priv.h:199
ff_tx_init_tabs
av_cold void TX_TAB() ff_tx_init_tabs(int len)
Definition: tx_template.c:146
m_inv_init
static av_cold int m_inv_init(AVTXContext *s, const FFTXCodelet *cd, uint64_t flags, FFTXCodeletOptions *opts, int len, int inv, const void *scale)
Definition: tx_float_init.c:136
FF_TX_ASM_CALL
#define FF_TX_ASM_CALL
Definition: tx_priv.h:159
ff_tx_codelet_list_float_x86
const FFTXCodelet *const ff_tx_codelet_list_float_x86[]
Definition: tx_float_init.c:230
DECL_INIT_FN
#define DECL_INIT_FN(basis, interleave)
Definition: tx_float_init.c:70
flags
#define flags(name, subs,...)
Definition: cbs_av1.c:482
ff_tx_gen_pfa_input_map
int ff_tx_gen_pfa_input_map(AVTXContext *s, FFTXCodeletOptions *opts, int d1, int d2)
Definition: tx.c:43
TX_DECL_FN
#define TX_DECL_FN(fn, suffix)
Definition: tx_priv.h:68
TX_TYPE
#define TX_TYPE
Definition: aacdec.c:36