FFmpeg
dsp_init.c
Go to the documentation of this file.
1 /*
2  * VVC DSP init for x86
3  *
4  * Copyright (C) 2022-2024 Nuo Mi
5  * Copyright (c) 2023-2024 Wu Jianhua
6  *
7  * This file is part of FFmpeg.
8  *
9  * FFmpeg is free software; you can redistribute it and/or
10  * modify it under the terms of the GNU Lesser General Public
11  * License as published by the Free Software Foundation; either
12  * version 2.1 of the License, or (at your option) any later version.
13  *
14  * FFmpeg is distributed in the hope that it will be useful,
15  * but WITHOUT ANY WARRANTY; without even the implied warranty of
16  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
17  * Lesser General Public License for more details.
18  *
19  * You should have received a copy of the GNU Lesser General Public
20  * License along with FFmpeg; if not, write to the Free Software
21  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
22  */
23 
#include "config.h"

#include "libavutil/cpu.h"
#include "libavutil/x86/cpu.h"
#include "libavcodec/vvc/dec.h"
#include "libavcodec/vvc/ctu.h"
#include "libavcodec/vvc/dsp.h"
#include "libavcodec/x86/h26x/h2656dsp.h"
32 
33 #if ARCH_X86_64
34 
/*
 * Symbol-name builders based on token pasting.
 *
 *   bf(fn, bd,  opt) -> fn_<bd>_<opt>       e.g. bf(vvc_avg, 10, avx2)    -> vvc_avg_10_avx2
 *   BF(fn, bpc, opt) -> fn_<bpc>bpc_<opt>   e.g. BF(ff_vvc_avg, 16, avx2) -> ff_vvc_avg_16bpc_avx2
 *
 * In this file bf() names the static per-bit-depth wrappers, while BF() names
 * the externally defined routines they forward to (instantiated for bpc 8 and 16).
 * The arguments are pasted as-is, i.e. they are not macro-expanded first.
 */
#define bf(fn, bd, opt)  fn##_##bd##_##opt
#define BF(fn, bpc, opt) fn##_##bpc##bpc_##opt
37 
38 #define AVG_BPC_PROTOTYPES(bpc, opt) \
39 void BF(ff_vvc_avg, bpc, opt)(uint8_t *dst, ptrdiff_t dst_stride, \
40  const int16_t *src0, const int16_t *src1, intptr_t width, intptr_t height, intptr_t pixel_max); \
41 void BF(ff_vvc_w_avg, bpc, opt)(uint8_t *dst, ptrdiff_t dst_stride, \
42  const int16_t *src0, const int16_t *src1, intptr_t width, intptr_t height, \
43  intptr_t denom, intptr_t w0, intptr_t w1, intptr_t o0, intptr_t o1, intptr_t pixel_max);
44 
45 AVG_BPC_PROTOTYPES( 8, avx2)
46 AVG_BPC_PROTOTYPES(16, avx2)
47 
/*
 * Declare the four DMVR routines for one bit depth; they are defined outside
 * this file:
 *
 *   ff_vvc_dmvr_<bd>_<opt>, ff_vvc_dmvr_h_<bd>_<opt>,
 *   ff_vvc_dmvr_v_<bd>_<opt>, ff_vvc_dmvr_hv_<bd>_<opt>
 *
 * All four share one signature so that DMVR_INIT() can store them in the
 * same c->inter.dmvr[][] table.
 */
#define DMVR_PROTOTYPES(bd, opt) \
void ff_vvc_dmvr_##bd##_##opt(int16_t *dst, const uint8_t *src, ptrdiff_t src_stride, \
    int height, intptr_t mx, intptr_t my, int width); \
void ff_vvc_dmvr_h_##bd##_##opt(int16_t *dst, const uint8_t *src, ptrdiff_t src_stride, \
    int height, intptr_t mx, intptr_t my, int width); \
void ff_vvc_dmvr_v_##bd##_##opt(int16_t *dst, const uint8_t *src, ptrdiff_t src_stride, \
    int height, intptr_t mx, intptr_t my, int width); \
void ff_vvc_dmvr_hv_##bd##_##opt(int16_t *dst, const uint8_t *src, ptrdiff_t src_stride, \
    int height, intptr_t mx, intptr_t my, int width);

DMVR_PROTOTYPES( 8, avx2)
DMVR_PROTOTYPES(10, avx2)
DMVR_PROTOTYPES(12, avx2)
61 
#if ARCH_X86_64 && HAVE_AVX2_EXTERNAL
/* Bit-depth-agnostic BDOF routine, defined outside this file; the valid
 * sample range is passed in as pixel_max. */
void ff_vvc_apply_bdof_avx2(uint8_t *dst, ptrdiff_t dst_stride,
                            const int16_t *src0, const int16_t *src1,
                            int w, int h, int pixel_max);

/*
 * Generate vvc_apply_bdof_<bd>_<opt>(): a wrapper without the pixel_max
 * argument that forwards to ff_vvc_apply_bdof_<opt>() with
 * pixel_max = (1 << bd) - 1.
 */
#define OF_FUNC(bd, opt) \
static void vvc_apply_bdof_##bd##_##opt(uint8_t *dst, ptrdiff_t dst_stride, \
    const int16_t *src0, const int16_t *src1, int w, int h) \
{ \
    ff_vvc_apply_bdof##_##opt(dst, dst_stride, src0, src1, w, h, (1 << bd) - 1); \
}

OF_FUNC( 8, avx2)
OF_FUNC(10, avx2)
OF_FUNC(12, avx2)

/* Install the wrapper for bit depth bd; expects a context pointer named `c`
 * in the expanding scope. */
#define OF_INIT(bd) c->inter.apply_bdof = vvc_apply_bdof_##bd##_avx2
#endif
80 
81 #define ALF_BPC_PROTOTYPES(bpc, opt) \
82 void BF(ff_vvc_alf_filter_luma, bpc, opt)(uint8_t *dst, ptrdiff_t dst_stride, \
83  const uint8_t *src, ptrdiff_t src_stride, ptrdiff_t width, ptrdiff_t height, \
84  const int16_t *filter, const int16_t *clip, ptrdiff_t stride, ptrdiff_t vb_pos, ptrdiff_t pixel_max); \
85 void BF(ff_vvc_alf_filter_chroma, bpc, opt)(uint8_t *dst, ptrdiff_t dst_stride, \
86  const uint8_t *src, ptrdiff_t src_stride, ptrdiff_t width, ptrdiff_t height, \
87  const int16_t *filter, const int16_t *clip, ptrdiff_t stride, ptrdiff_t vb_pos, ptrdiff_t pixel_max); \
88 void BF(ff_vvc_alf_classify_grad, bpc, opt)(int *gradient_sum, \
89  const uint8_t *src, ptrdiff_t src_stride, intptr_t width, intptr_t height, intptr_t vb_pos); \
90 void BF(ff_vvc_alf_classify, bpc, opt)(int *class_idx, int *transpose_idx, const int *gradient_sum, \
91  intptr_t width, intptr_t height, intptr_t vb_pos, intptr_t bit_depth); \
92 
93 ALF_BPC_PROTOTYPES(8, avx2)
94 ALF_BPC_PROTOTYPES(16, avx2)
95 
96 #if ARCH_X86_64
/*
 * Generate vvc_put_<name>_<depth>_<opt>(): a forwarding wrapper around
 * ff_h2656_put_<name>_<depth>_<opt>().
 *
 * The wrapper has no destination-stride parameter; it always passes
 * 2 * MAX_PB_SIZE as the second argument of the shared routine and forwards
 * every other argument unchanged.
 */
#define FW_PUT(name, depth, opt) \
static void vvc_put_ ## name ## _ ## depth ## _##opt(int16_t *dst, const uint8_t *src, ptrdiff_t srcstride, \
    int height, const int8_t *hf, const int8_t *vf, int width) \
{ \
    ff_h2656_put_## name ## _ ## depth ## _##opt(dst, 2 * MAX_PB_SIZE, src, srcstride, height, hf, vf, width); \
}
103 
#if HAVE_SSE4_EXTERNAL
/* Wrappers for one filter kind at widths 4, 8, 16, 32, 64 and 128. */
#define FW_PUT_TAP(fname, bitd, opt) \
    FW_PUT(fname##4,   bitd, opt) \
    FW_PUT(fname##8,   bitd, opt) \
    FW_PUT(fname##16,  bitd, opt) \
    FW_PUT(fname##32,  bitd, opt) \
    FW_PUT(fname##64,  bitd, opt) \
    FW_PUT(fname##128, bitd, opt)

/* Same as FW_PUT_TAP() plus the extra width-2 variant. */
#define FW_PUT_4TAP(fname, bitd, opt) \
    FW_PUT(fname ## 2, bitd, opt) \
    FW_PUT_TAP(fname, bitd, opt)

/* pixels / 4tap_h / 4tap_v / 4tap_hv, widths 2..128. */
#define FW_PUT_4TAP_SSE4(bitd) \
    FW_PUT_4TAP(pixels,  bitd, sse4) \
    FW_PUT_4TAP(4tap_h,  bitd, sse4) \
    FW_PUT_4TAP(4tap_v,  bitd, sse4) \
    FW_PUT_4TAP(4tap_hv, bitd, sse4)

/* 8tap_h / 8tap_v / 8tap_hv, widths 4..128. */
#define FW_PUT_8TAP_SSE4(bitd) \
    FW_PUT_TAP(8tap_h,  bitd, sse4) \
    FW_PUT_TAP(8tap_v,  bitd, sse4) \
    FW_PUT_TAP(8tap_hv, bitd, sse4)

/* Every SSE4 put wrapper for one bit depth. */
#define FW_PUT_SSE4(bitd) \
    FW_PUT_4TAP_SSE4(bitd) \
    FW_PUT_8TAP_SSE4(bitd)

FW_PUT_SSE4( 8)
FW_PUT_SSE4(10)
FW_PUT_SSE4(12)
#endif
136 
137 #if HAVE_AVX2_EXTERNAL
138 #define FW_PUT_TAP_AVX2(n, bitd) \
139  FW_PUT(n ## tap_h32, bitd, avx2) \
140  FW_PUT(n ## tap_h64, bitd, avx2) \
141  FW_PUT(n ## tap_h128, bitd, avx2) \
142  FW_PUT(n ## tap_v32, bitd, avx2) \
143  FW_PUT(n ## tap_v64, bitd, avx2) \
144  FW_PUT(n ## tap_v128, bitd, avx2)
145 
146 #define FW_PUT_AVX2(bitd) \
147  FW_PUT(pixels32, bitd, avx2) \
148  FW_PUT(pixels64, bitd, avx2) \
149  FW_PUT(pixels128, bitd, avx2) \
150  FW_PUT_TAP_AVX2(4, bitd) \
151  FW_PUT_TAP_AVX2(8, bitd) \
152 
153 FW_PUT_AVX2( 8)
154 FW_PUT_AVX2(10)
155 FW_PUT_AVX2(12)
156 
157 #define FW_PUT_TAP_16BPC_AVX2(n, bitd) \
158  FW_PUT(n ## tap_h16, bitd, avx2) \
159  FW_PUT(n ## tap_v16, bitd, avx2) \
160  FW_PUT(n ## tap_hv16, bitd, avx2) \
161  FW_PUT(n ## tap_hv32, bitd, avx2) \
162  FW_PUT(n ## tap_hv64, bitd, avx2) \
163  FW_PUT(n ## tap_hv128, bitd, avx2)
164 
165 #define FW_PUT_16BPC_AVX2(bitd) \
166  FW_PUT(pixels16, bitd, avx2) \
167  FW_PUT_TAP_16BPC_AVX2(4, bitd) \
168  FW_PUT_TAP_16BPC_AVX2(8, bitd)
169 
170 FW_PUT_16BPC_AVX2(10)
171 FW_PUT_16BPC_AVX2(12)
172 
173 #define AVG_FUNCS(bpc, bd, opt) \
174 static void bf(vvc_avg, bd, opt)(uint8_t *dst, ptrdiff_t dst_stride, \
175  const int16_t *src0, const int16_t *src1, int width, int height) \
176 { \
177  BF(ff_vvc_avg, bpc, opt)(dst, dst_stride, src0, src1, width, height, (1 << bd) - 1); \
178 } \
179 static void bf(vvc_w_avg, bd, opt)(uint8_t *dst, ptrdiff_t dst_stride, \
180  const int16_t *src0, const int16_t *src1, int width, int height, \
181  int denom, int w0, int w1, int o0, int o1) \
182 { \
183  BF(ff_vvc_w_avg, bpc, opt)(dst, dst_stride, src0, src1, width, height, \
184  denom, w0, w1, o0, o1, (1 << bd) - 1); \
185 }
186 
187 AVG_FUNCS(8, 8, avx2)
188 AVG_FUNCS(16, 10, avx2)
189 AVG_FUNCS(16, 12, avx2)
190 
191 #define ALF_FUNCS(bpc, bd, opt) \
192 static void bf(vvc_alf_filter_luma, bd, opt)(uint8_t *dst, ptrdiff_t dst_stride, const uint8_t *src, ptrdiff_t src_stride, \
193  int width, int height, const int16_t *filter, const int16_t *clip, const int vb_pos) \
194 { \
195  const int param_stride = (width >> 2) * ALF_NUM_COEFF_LUMA; \
196  BF(ff_vvc_alf_filter_luma, bpc, opt)(dst, dst_stride, src, src_stride, width, height, \
197  filter, clip, param_stride, vb_pos, (1 << bd) - 1); \
198 } \
199 static void bf(vvc_alf_filter_chroma, bd, opt)(uint8_t *dst, ptrdiff_t dst_stride, const uint8_t *src, ptrdiff_t src_stride, \
200  int width, int height, const int16_t *filter, const int16_t *clip, const int vb_pos) \
201 { \
202  BF(ff_vvc_alf_filter_chroma, bpc, opt)(dst, dst_stride, src, src_stride, width, height, \
203  filter, clip, 0, vb_pos,(1 << bd) - 1); \
204 } \
205 static void bf(vvc_alf_classify, bd, opt)(int *class_idx, int *transpose_idx, \
206  const uint8_t *src, ptrdiff_t src_stride, int width, int height, int vb_pos, int *gradient_tmp) \
207 { \
208  BF(ff_vvc_alf_classify_grad, bpc, opt)(gradient_tmp, src, src_stride, width, height, vb_pos); \
209  BF(ff_vvc_alf_classify, bpc, opt)(class_idx, transpose_idx, gradient_tmp, width, height, vb_pos, bd); \
210 } \
211 
212 ALF_FUNCS(8, 8, avx2)
213 ALF_FUNCS(16, 10, avx2)
214 ALF_FUNCS(16, 12, avx2)
215 
216 #endif
217 
/*
 * Fill one slot of a put table and the matching slot of its "_uni" sibling:
 *
 *   dst[C][W][idx1][idx2]     = vvc_put_<name>_<D>_<opt>
 *   dst_uni[C][W][idx1][idx2] = ff_h2656_put_uni_<name>_<D>_<opt>
 *
 * `dst` is pasted with `_uni`, so its last token must be the table name
 * (e.g. c->inter.put -> c->inter.put_uni).
 *
 * NOTE: this expands to two complete statements, each ending in ';', and is
 * deliberately not wrapped in do { } while (0): other macros in this file
 * place several PEL_LINK() invocations back to back without separators.
 * Only use it inside a braced block.
 */
#define PEL_LINK(dst, C, W, idx1, idx2, name, D, opt) \
    dst[C][W][idx1][idx2] = vvc_put_## name ## _ ## D ## _##opt; \
    dst ## _uni[C][W][idx1][idx2] = ff_h2656_put_uni_ ## name ## _ ## D ## _##opt;

/*
 * Link one filter kind for all widths 4..128.
 * Width-to-index mapping: 4 -> 1, 8 -> 2, 16 -> 3, 32 -> 4, 64 -> 5, 128 -> 6.
 */
#define MC_TAP_LINKS(pointer, C, my, mx, fname, bitd, opt ) \
    PEL_LINK(pointer, C, 1, my , mx , fname##4 ,  bitd, opt ); \
    PEL_LINK(pointer, C, 2, my , mx , fname##8 ,  bitd, opt ); \
    PEL_LINK(pointer, C, 3, my , mx , fname##16,  bitd, opt ); \
    PEL_LINK(pointer, C, 4, my , mx , fname##32,  bitd, opt ); \
    PEL_LINK(pointer, C, 5, my , mx , fname##64,  bitd, opt ); \
    PEL_LINK(pointer, C, 6, my , mx , fname##128, bitd, opt );
229 
/* Link one filter kind into the LUMA tables for widths 4..128. */
#define MC_8TAP_LINKS(pointer, my, mx, fname, bitd, opt) \
    MC_TAP_LINKS(pointer, LUMA, my, mx, fname, bitd, opt)

/*
 * Link the SSE4 luma functions. The two index arguments select the filter
 * kind: [0][0] pixels, [0][1] 8tap_h, [1][0] 8tap_v, [1][1] 8tap_hv.
 */
#define MC_8TAP_LINKS_SSE4(bd) \
    MC_8TAP_LINKS(c->inter.put, 0, 0, pixels,  bd, sse4); \
    MC_8TAP_LINKS(c->inter.put, 0, 1, 8tap_h,  bd, sse4); \
    MC_8TAP_LINKS(c->inter.put, 1, 0, 8tap_v,  bd, sse4); \
    MC_8TAP_LINKS(c->inter.put, 1, 1, 8tap_hv, bd, sse4)

/* Link one filter kind into the CHROMA tables: width 2 goes to index 0,
 * widths 4..128 follow via MC_TAP_LINKS(). */
#define MC_4TAP_LINKS(pointer, my, mx, fname, bitd, opt) \
    PEL_LINK(pointer, CHROMA, 0, my , mx , fname##2 , bitd, opt ); \
    MC_TAP_LINKS(pointer, CHROMA, my, mx, fname, bitd, opt)

/*
 * Link the SSE4 chroma functions; same index layout as the luma variant:
 * [0][0] pixels, [0][1] 4tap_h, [1][0] 4tap_v, [1][1] 4tap_hv.
 */
#define MC_4TAP_LINKS_SSE4(bd) \
    MC_4TAP_LINKS(c->inter.put, 0, 0, pixels,  bd, sse4); \
    MC_4TAP_LINKS(c->inter.put, 0, 1, 4tap_h,  bd, sse4); \
    MC_4TAP_LINKS(c->inter.put, 1, 0, 4tap_v,  bd, sse4); \
    MC_4TAP_LINKS(c->inter.put, 1, 1, 4tap_hv, bd, sse4)

/*
 * Install every SSE4 put / put_uni function for bit depth bd. Expects a
 * context pointer named `c` in scope. Wrapped in do { } while (0) so the
 * whole sequence behaves as a single statement: MC_LINK_SSE4(bd);
 */
#define MC_LINK_SSE4(bd) do { \
    MC_4TAP_LINKS_SSE4(bd) \
    MC_8TAP_LINKS_SSE4(bd); \
} while (0)
252 
/*
 * Link the AVX2 functions that exist for every bit depth: pixel copies and
 * the n-tap h / v filters at widths 32, 64 and 128 (table indices 4, 5, 6).
 * C selects the table (LUMA / CHROMA), n is the tap count (8 / 4).
 * PEL_LINK() already ends in ';', hence no separators between invocations.
 */
#define MC_TAP_LINKS_AVX2(C, n, bd) do { \
    PEL_LINK(c->inter.put, C, 4, 0, 0, pixels32,   bd, avx2) \
    PEL_LINK(c->inter.put, C, 5, 0, 0, pixels64,   bd, avx2) \
    PEL_LINK(c->inter.put, C, 6, 0, 0, pixels128,  bd, avx2) \
    PEL_LINK(c->inter.put, C, 4, 0, 1, n##tap_h32,  bd, avx2) \
    PEL_LINK(c->inter.put, C, 5, 0, 1, n##tap_h64,  bd, avx2) \
    PEL_LINK(c->inter.put, C, 6, 0, 1, n##tap_h128, bd, avx2) \
    PEL_LINK(c->inter.put, C, 4, 1, 0, n##tap_v32,  bd, avx2) \
    PEL_LINK(c->inter.put, C, 5, 1, 0, n##tap_v64,  bd, avx2) \
    PEL_LINK(c->inter.put, C, 6, 1, 0, n##tap_v128, bd, avx2) \
} while (0)

/* 8-tap links for LUMA and 4-tap links for CHROMA, as one statement. */
#define MC_LINKS_AVX2(bd) do { \
    MC_TAP_LINKS_AVX2(LUMA,   8, bd); \
    MC_TAP_LINKS_AVX2(CHROMA, 4, bd); \
} while (0)

/*
 * Link the additional AVX2 functions whose wrappers are only generated for
 * bit depths 10 and 12: width 16 (index 3) for pixels / h / v / hv, and the
 * hv filter at widths 32, 64 and 128 (indices 4, 5, 6).
 */
#define MC_TAP_LINKS_16BPC_AVX2(C, n, bd) do { \
    PEL_LINK(c->inter.put, C, 3, 0, 0, pixels16,     bd, avx2) \
    PEL_LINK(c->inter.put, C, 3, 0, 1, n##tap_h16,   bd, avx2) \
    PEL_LINK(c->inter.put, C, 3, 1, 0, n##tap_v16,   bd, avx2) \
    PEL_LINK(c->inter.put, C, 3, 1, 1, n##tap_hv16,  bd, avx2) \
    PEL_LINK(c->inter.put, C, 4, 1, 1, n##tap_hv32,  bd, avx2) \
    PEL_LINK(c->inter.put, C, 5, 1, 1, n##tap_hv64,  bd, avx2) \
    PEL_LINK(c->inter.put, C, 6, 1, 1, n##tap_hv128, bd, avx2) \
} while (0)

/* 8-tap links for LUMA and 4-tap links for CHROMA, as one statement. */
#define MC_LINKS_16BPC_AVX2(bd) do { \
    MC_TAP_LINKS_16BPC_AVX2(LUMA,   8, bd); \
    MC_TAP_LINKS_16BPC_AVX2(CHROMA, 4, bd); \
} while (0)
282 
/*
 * Installers for the remaining function pointers. Each one expects a
 * context pointer named `c` in the expanding scope.
 */

/* avg / w_avg wrappers generated by AVG_FUNCS() for bit depth bd. */
#define AVG_INIT(bd, opt) do { \
    c->inter.avg   = bf(vvc_avg, bd, opt); \
    c->inter.w_avg = bf(vvc_w_avg, bd, opt); \
} while (0)

/* DMVR table: [0][0] plain, [0][1] h, [1][0] v, [1][1] hv. */
#define DMVR_INIT(bd) do { \
    c->inter.dmvr[0][0] = ff_vvc_dmvr_##bd##_avx2; \
    c->inter.dmvr[0][1] = ff_vvc_dmvr_h_##bd##_avx2; \
    c->inter.dmvr[1][0] = ff_vvc_dmvr_v_##bd##_avx2; \
    c->inter.dmvr[1][1] = ff_vvc_dmvr_hv_##bd##_avx2; \
} while (0)

/* ALF filter (luma and chroma) and classify wrappers generated by ALF_FUNCS(). */
#define ALF_INIT(bd) do { \
    c->alf.filter[LUMA]   = vvc_alf_filter_luma_##bd##_avx2; \
    c->alf.filter[CHROMA] = vvc_alf_filter_chroma_##bd##_avx2; \
    c->alf.classify       = vvc_alf_classify_##bd##_avx2; \
} while (0)

/* SAD routine, defined outside this file; one implementation is installed
 * for every bit depth. */
int ff_vvc_sad_avx2(const int16_t *src0, const int16_t *src1, int dx, int dy, int block_w, int block_h);
#define SAD_INIT() c->inter.sad = ff_vvc_sad_avx2
303 #endif
304 
305 
306 #endif // ARCH_X86_64
307 
308 void ff_vvc_dsp_init_x86(VVCDSPContext *const c, const int bd)
309 {
310 #if ARCH_X86_64
311  const int cpu_flags = av_get_cpu_flags();
312 
313  switch (bd) {
314  case 8:
315 #if HAVE_SSE4_EXTERNAL
316  if (EXTERNAL_SSE4(cpu_flags)) {
317  MC_LINK_SSE4(8);
318  }
319 #endif
320 #if HAVE_AVX2_EXTERNAL
322  ALF_INIT(8);
323  AVG_INIT(8, avx2);
324  MC_LINKS_AVX2(8);
325  OF_INIT(8);
326  DMVR_INIT(8);
327  SAD_INIT();
328  }
329 #endif
330  break;
331  case 10:
332 #if HAVE_SSE4_EXTERNAL
333  if (EXTERNAL_SSE4(cpu_flags)) {
334  MC_LINK_SSE4(10);
335  }
336 #endif
337 #if HAVE_AVX2_EXTERNAL
339  ALF_INIT(10);
340  AVG_INIT(10, avx2);
341  MC_LINKS_AVX2(10);
342  MC_LINKS_16BPC_AVX2(10);
343  OF_INIT(10);
344  DMVR_INIT(10);
345  SAD_INIT();
346  }
347 #endif
348  break;
349  case 12:
350 #if HAVE_SSE4_EXTERNAL
351  if (EXTERNAL_SSE4(cpu_flags)) {
352  MC_LINK_SSE4(12);
353  }
354 #endif
355 #if HAVE_AVX2_EXTERNAL
357  ALF_INIT(12);
358  AVG_INIT(12, avx2);
359  MC_LINKS_AVX2(12);
360  MC_LINKS_16BPC_AVX2(12);
361  OF_INIT(12);
362  DMVR_INIT(12);
363  SAD_INIT();
364  }
365 #endif
366  break;
367  default:
368  break;
369  }
370 #endif
371 }
cpu.h
src1
const pixel * src1
Definition: h264pred_template.c:420
EXTERNAL_AVX2_FAST
#define EXTERNAL_AVX2_FAST(flags)
Definition: cpu.h:79
w
uint8_t w
Definition: llviddspenc.c:38
av_get_cpu_flags
int av_get_cpu_flags(void)
Return the flags which specify extensions supported by the CPU.
Definition: cpu.c:109
cpu_flags
static atomic_int cpu_flags
Definition: cpu.c:56
ff_vvc_dsp_init_x86
void ff_vvc_dsp_init_x86(VVCDSPContext *const c, const int bd)
Definition: dsp_init.c:308
DMVR_PROTOTYPES
#define DMVR_PROTOTYPES(bd, opt)
Definition: dsp_init.c:42
dsp.h
c
Undefined Behavior In the C some operations are like signed integer dereferencing freed accessing outside allocated Undefined Behavior must not occur in a C it is not safe even if the output of undefined operations is unused The unsafety may seem nit picking but Optimizing compilers have in fact optimized code on the assumption that no undefined Behavior occurs Optimizing code based on wrong assumptions can and has in some cases lead to effects beyond the output of computations The signed integer overflow problem in speed critical code Code which is highly optimized and works with signed integers sometimes has the problem that often the output of the computation does not c
Definition: undefined.txt:32
dst
uint8_t ptrdiff_t const uint8_t ptrdiff_t int intptr_t intptr_t int int16_t * dst
Definition: dsp.h:83
cpu.h
DMVR_INIT
#define DMVR_INIT(bd, opt)
Definition: dsp_init.c:55
h2656dsp.h
EXTERNAL_SSE4
#define EXTERNAL_SSE4(flags)
Definition: cpu.h:68
src0
const pixel *const src0
Definition: h264pred_template.c:419
h
h
Definition: vp9dsp_template.c:2070
ctu.h
dec.h
VVCDSPContext
Definition: dsp.h:169