FFmpeg
uops_tmpl.c
Go to the documentation of this file.
1 /**
2  * Copyright (C) 2026 Niklas Haas
3  *
4  * This file is part of FFmpeg.
5  *
6  * FFmpeg is free software; you can redistribute it and/or
7  * modify it under the terms of the GNU Lesser General Public
8  * License as published by the Free Software Foundation; either
9  * version 2.1 of the License, or (at your option) any later version.
10  *
11  * FFmpeg is distributed in the hope that it will be useful,
12  * but WITHOUT ANY WARRANTY; without even the implied warranty of
13  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
14  * Lesser General Public License for more details.
15  *
16  * You should have received a copy of the GNU Lesser General Public
17  * License along with FFmpeg; if not, write to the Free Software
18  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
19  */
20 
21 #include <libavutil/bswap.h>
22 
23 #include "uops_tmpl.h"
24 
/* Default to 8-bit samples when the including file did not pick a depth. */
#ifndef BIT_DEPTH
#  define BIT_DEPTH 8
#endif

/*
 * Per-instantiation type configuration:
 *   pixel_t    - element type of the blocks handled by this instantiation
 *   inter_t    - type used for intermediate sums
 *   PX / px    - upper/lower case type tag used for name and field pasting
 *   PIXEL_MAX  - all-ones value (integer types only)
 *   PIXEL_SWAP - byte swap helper (multi-byte integer types only)
 */
#if BIT_DEPTH == 32
#  if IS_FLOAT
#    define PIXEL_TYPE SWS_PIXEL_F32
#    define pixel_t float
#    define inter_t float
#    define PX F32
#    define px f32
#  else
#    define PIXEL_MAX 0xFFFFFFFFu
#    define PIXEL_SWAP av_bswap32
#    define pixel_t uint32_t
#    define inter_t int64_t
#    define PX U32
#    define px u32
#  endif
#elif BIT_DEPTH == 16
#  define PIXEL_MAX 0xFFFFu
#  define PIXEL_SWAP av_bswap16
#  define pixel_t uint16_t
#  define inter_t int64_t
#  define PX U16
#  define px u16
#elif BIT_DEPTH == 8
#  define PIXEL_MAX 0xFFu
#  define pixel_t uint8_t
#  define inter_t int32_t
#  define PX U8
#  define px u8
#else
#  error Invalid BIT_DEPTH
#endif
58 
59 /*********************************
60  * Generic read/write operations *
61  *********************************/
62 
63 DECL_READ(read_planar, const SwsCompMask mask)
64 {
65  SWS_LOOP
66  for (int i = 0; i < SWS_BLOCK_SIZE; i++) {
67  if (X) x[i] = in0[i];
68  if (Y) y[i] = in1[i];
69  if (Z) z[i] = in2[i];
70  if (W) w[i] = in3[i];
71  }
72 
73  if (X) iter->in[0] += SIZEOF_BLOCK;
74  if (Y) iter->in[1] += SIZEOF_BLOCK;
75  if (Z) iter->in[2] += SIZEOF_BLOCK;
76  if (W) iter->in[3] += SIZEOF_BLOCK;
77 
78  CONTINUE(x, y, z, w);
79 }
80 
81 DECL_READ(read_packed, const SwsCompMask mask)
82 {
83  const int elems = W ? 4 : Z ? 3 : Y ? 2 : 1;
84 
85  SWS_LOOP
86  for (int i = 0; i < SWS_BLOCK_SIZE; i++) {
87  if (X) x[i] = in0[elems * i + 0];
88  if (Y) y[i] = in0[elems * i + 1];
89  if (Z) z[i] = in0[elems * i + 2];
90  if (W) w[i] = in0[elems * i + 3];
91  }
92 
93  iter->in[0] += SIZEOF_BLOCK * elems;
94  CONTINUE(x, y, z, w);
95 }
96 
97 DECL_WRITE(write_planar, const SwsCompMask mask)
98 {
99  SWS_LOOP
100  for (int i = 0; i < SWS_BLOCK_SIZE; i++) {
101  if (X) out0[i] = x[i];
102  if (Y) out1[i] = y[i];
103  if (Z) out2[i] = z[i];
104  if (W) out3[i] = w[i];
105  }
106 
107  if (X) iter->out[0] += SIZEOF_BLOCK;
108  if (Y) iter->out[1] += SIZEOF_BLOCK;
109  if (Z) iter->out[2] += SIZEOF_BLOCK;
110  if (W) iter->out[3] += SIZEOF_BLOCK;
111 }
112 
113 DECL_WRITE(write_packed, const SwsCompMask mask)
114 {
115  const int elems = W ? 4 : Z ? 3 : Y ? 2 : 1;
116 
117  SWS_LOOP
118  for (int i = 0; i < SWS_BLOCK_SIZE; i++) {
119  if (X) out0[elems * i + 0] = x[i];
120  if (Y) out0[elems * i + 1] = y[i];
121  if (Z) out0[elems * i + 2] = z[i];
122  if (W) out0[elems * i + 3] = w[i];
123  }
124 
125  iter->out[0] += SIZEOF_BLOCK * elems;
126 }
127 
128 #if BIT_DEPTH == 8
129 
131 {
133 
134  SWS_LOOP
135  for (int i = 0; i < SWS_BLOCK_SIZE; i += 8) {
136  const pixel_t val = ((const pixel_t *) in0)[i >> 3];
137  x[i + 0] = (val >> 7) & 1;
138  x[i + 1] = (val >> 6) & 1;
139  x[i + 2] = (val >> 5) & 1;
140  x[i + 3] = (val >> 4) & 1;
141  x[i + 4] = (val >> 3) & 1;
142  x[i + 5] = (val >> 2) & 1;
143  x[i + 6] = (val >> 1) & 1;
144  x[i + 7] = (val >> 0) & 1;
145  }
146 
147  iter->in[0] += SIZEOF_BLOCK >> 3;
148  CONTINUE(x, y, z, w);
149 }
150 
151 DECL_READ(read_nibble, const SwsCompMask mask)
152 {
154 
155  SWS_LOOP
156  for (int i = 0; i < SWS_BLOCK_SIZE; i += 2) {
157  const pixel_t val = in0[i >> 1];
158  x[i + 0] = val >> 4; /* high nibble */
159  x[i + 1] = val & 0xF; /* low nibble */
160  }
161 
162  iter->in[0] += SIZEOF_BLOCK >> 1;
163  CONTINUE(x, y, z, w);
164 }
165 
166 DECL_WRITE(write_bit, const SwsCompMask mask)
167 {
169 
170  SWS_LOOP
171  for (int i = 0; i < SWS_BLOCK_SIZE; i += 8) {
172  out0[i >> 3] = x[i + 0] << 7 |
173  x[i + 1] << 6 |
174  x[i + 2] << 5 |
175  x[i + 3] << 4 |
176  x[i + 4] << 3 |
177  x[i + 5] << 2 |
178  x[i + 6] << 1 |
179  x[i + 7];
180  }
181 
182  iter->out[0] += SIZEOF_BLOCK >> 3;
183 }
184 
185 DECL_WRITE(write_nibble, const SwsCompMask mask)
186 {
188 
189  SWS_LOOP
190  for (int i = 0; i < SWS_BLOCK_SIZE; i += 2)
191  out0[i >> 1] = x[i] << 4 | x[i + 1];
192 
193  iter->out[0] += SIZEOF_BLOCK >> 1;
194 }
195 
196 #endif /* BIT_DEPTH == 8 */
197 
198 SWS_FOR(PX, READ_PLANAR, DECL_IMPL_READ, read_planar)
199 SWS_FOR(PX, READ_PACKED, DECL_IMPL_READ, read_packed)
200 SWS_FOR(PX, READ_NIBBLE, DECL_IMPL_READ, read_nibble)
201 SWS_FOR(PX, READ_BIT, DECL_IMPL_READ, read_bit)
202 SWS_FOR(PX, WRITE_PLANAR, DECL_IMPL_WRITE, write_planar)
203 SWS_FOR(PX, WRITE_PACKED, DECL_IMPL_WRITE, write_packed)
204 SWS_FOR(PX, WRITE_NIBBLE, DECL_IMPL_WRITE, write_nibble)
205 SWS_FOR(PX, WRITE_BIT, DECL_IMPL_WRITE, write_bit)
206 
207 SWS_FOR_STRUCT(PX, READ_PLANAR, DECL_ENTRY)
208 SWS_FOR_STRUCT(PX, READ_PACKED, DECL_ENTRY)
209 SWS_FOR_STRUCT(PX, READ_NIBBLE, DECL_ENTRY)
210 SWS_FOR_STRUCT(PX, READ_BIT, DECL_ENTRY)
211 SWS_FOR_STRUCT(PX, WRITE_PLANAR, DECL_ENTRY)
212 SWS_FOR_STRUCT(PX, WRITE_PACKED, DECL_ENTRY)
213 SWS_FOR_STRUCT(PX, WRITE_NIBBLE, DECL_ENTRY)
214 SWS_FOR_STRUCT(PX, WRITE_BIT, DECL_ENTRY)
215 
216 /*****************************
217  * Scaling / filtering reads *
218  *****************************/
219 
221 {
222  if (params->uop->par.filter.type != SWS_PIXEL_F32)
223  return AVERROR(ENOTSUP);
224 
225  const SwsFilterWeights *filter = params->uop->data.kernel;
226  static_assert(sizeof(out->priv.ptr) <= sizeof(int32_t[2]),
227  ">8 byte pointers not supported");
228 
229  /* Pre-convert weights to float */
230  float *weights = av_calloc(filter->num_weights, sizeof(float));
231  if (!weights)
232  return AVERROR(ENOMEM);
233 
234  for (int i = 0; i < filter->num_weights; i++)
235  weights[i] = (float) filter->weights[i] / SWS_FILTER_SCALE;
236 
237  out->priv.ptr = weights;
238  out->priv.i32[2] = filter->filter_size;
239  out->free = ff_op_priv_free;
240  return 0;
241 }
242 
243 /* Fully general vertical planar filter case */
244 DECL_READ(read_planar_fv, const SwsCompMask mask, const SwsPixelType type)
245 {
247  const SwsOpExec *exec = iter->exec;
248  const float *restrict weights = impl->priv.ptr;
249  const int filter_size = impl->priv.i32[2];
250  weights += filter_size * iter->y;
251 
252  block_t xs, ys, zs, ws;
253  if (X) memset(&xs.f32, 0, sizeof(xs.f32));
254  if (Y) memset(&ys.f32, 0, sizeof(ys.f32));
255  if (Z) memset(&zs.f32, 0, sizeof(zs.f32));
256  if (W) memset(&ws.f32, 0, sizeof(ws.f32));
257 
258  for (int j = 0; j < filter_size; j++) {
259  const float weight = weights[j];
260 
261  SWS_LOOP
262  for (int i = 0; i < SWS_BLOCK_SIZE; i++) {
263  if (X) xs.f32[i] += weight * in0[i];
264  if (Y) ys.f32[i] += weight * in1[i];
265  if (Z) zs.f32[i] += weight * in2[i];
266  if (W) ws.f32[i] += weight * in3[i];
267  }
268 
269  if (X) in0 = bump_ptr(in0, exec->in_stride[0]);
270  if (Y) in1 = bump_ptr(in1, exec->in_stride[1]);
271  if (Z) in2 = bump_ptr(in2, exec->in_stride[2]);
272  if (W) in3 = bump_ptr(in3, exec->in_stride[3]);
273  }
274 
275  if (X) iter->in[0] += SIZEOF_BLOCK;
276  if (Y) iter->in[1] += SIZEOF_BLOCK;
277  if (Z) iter->in[2] += SIZEOF_BLOCK;
278  if (W) iter->in[3] += SIZEOF_BLOCK;
279 
280  CONTINUE(&xs, &ys, &zs, &ws);
281 }
282 
284 {
285  if (params->uop->par.filter.type != SWS_PIXEL_F32)
286  return AVERROR(ENOTSUP);
287 
288  SwsFilterWeights *filter = params->uop->data.kernel;
289  out->priv.ptr = av_refstruct_ref(filter->weights);
290  out->priv.i32[2] = filter->filter_size;
291  out->free = ff_op_priv_unref;
292  return 0;
293 }
294 
295 /* Fully general horizontal planar filter case */
296 DECL_READ(read_planar_fh, const SwsCompMask mask, const SwsPixelType type)
297 {
299  const SwsOpExec *exec = iter->exec;
300  const int *restrict weights = impl->priv.ptr;
301  const int filter_size = impl->priv.i32[2];
302  const float scale = 1.0f / SWS_FILTER_SCALE;
303  const int xpos = iter->x;
304  weights += filter_size * iter->x;
305 
306  block_t xs, ys, zs, ws;
307  for (int i = 0; i < SWS_BLOCK_SIZE; i++) {
308  const int offset = exec->in_offset_x[xpos + i];
309  pixel_t *start0 = bump_ptr(in0, offset);
310  pixel_t *start1 = bump_ptr(in1, offset);
311  pixel_t *start2 = bump_ptr(in2, offset);
312  pixel_t *start3 = bump_ptr(in3, offset);
313 
314  inter_t sx = 0, sy = 0, sz = 0, sw = 0;
315  for (int j = 0; j < filter_size; j++) {
316  const int weight = weights[j];
317  if (X) sx += weight * start0[j];
318  if (Y) sy += weight * start1[j];
319  if (Z) sz += weight * start2[j];
320  if (W) sw += weight * start3[j];
321  }
322 
323  if (X) xs.f32[i] = (float) sx * scale;
324  if (Y) ys.f32[i] = (float) sy * scale;
325  if (Z) zs.f32[i] = (float) sz * scale;
326  if (W) ws.f32[i] = (float) sw * scale;
327 
328  weights += filter_size;
329  }
330 
331  CONTINUE(&xs, &ys, &zs, &ws);
332 }
333 
334 SWS_FOR(PX, READ_PLANAR_FV, DECL_IMPL_READ, read_planar_fv)
335 SWS_FOR(PX, READ_PLANAR_FH, DECL_IMPL_READ, read_planar_fh)
336 SWS_FOR_STRUCT(PX, READ_PLANAR_FV, DECL_ENTRY, .setup = fn(setup_filter_v) )
337 SWS_FOR_STRUCT(PX, READ_PLANAR_FH, DECL_ENTRY, .setup = fn(setup_filter_h) )
338 
339 /***************************
340  * Permutation and copying *
341  ***************************/
342 
/*
 * Permutation needs no data movement: the generated function simply forwards
 * its four input pointers to the continuation in the order SRC0..SRC3.
 */
#define DECL_PERMUTE(DUMMY, NAME, TYPE, UOP, MASK, SRC0, SRC1, SRC2, SRC3)     \
    static void NAME##_c(SwsOpIter *restrict iter,                             \
                         const SwsOpImpl *restrict impl,                       \
                         void *restrict in0, void *restrict in1,               \
                         void *restrict in2, void *restrict in3)               \
    {                                                                          \
        CONTINUE(in##SRC0, in##SRC1, in##SRC2, in##SRC3);                      \
    }
352 
/*
 * Copying variant: every component selected by COMPS receives a private
 * copy (SIZEOF_BLOCK bytes) of the input block named by SRC0..SRC3; the
 * remaining components are forwarded to the continuation untouched.
 */
#define DECL_COPY(DUMMY, NAME, TYPE, UOP, COMPS, SRC0, SRC1, SRC2, SRC3)       \
    static void NAME##_c(SwsOpIter *restrict iter,                             \
                         const SwsOpImpl *restrict impl,                       \
                         void *restrict in0, void *restrict in1,               \
                         void *restrict in2, void *restrict in3)               \
    {                                                                          \
        const SwsCompMask mask = (COMPS);                                      \
        block_t x, y, z, w;                                                    \
                                                                               \
        if (X)                                                                 \
            memcpy(&x.px, in##SRC0, SIZEOF_BLOCK);                             \
        if (Y)                                                                 \
            memcpy(&y.px, in##SRC1, SIZEOF_BLOCK);                             \
        if (Z)                                                                 \
            memcpy(&z.px, in##SRC2, SIZEOF_BLOCK);                             \
        if (W)                                                                 \
            memcpy(&w.px, in##SRC3, SIZEOF_BLOCK);                             \
                                                                               \
        CONTINUE(X ? &x : in0, Y ? &y : in1, Z ? &z : in2, W ? &w : in3);      \
    }
369 
370 SWS_FOR(PX, PERMUTE, DECL_PERMUTE)
372 SWS_FOR_STRUCT(PX, PERMUTE, DECL_ENTRY)
374 
375 /*********************
376  * Format conversion *
377  *********************/
378 
379 #define DECL_CAST(DST, dst) \
380  DECL_FUNC(to_##dst, const SwsCompMask mask) \
381  { \
382  block_t xx, yy, zz, ww; \
383  \
384  SWS_LOOP \
385  for (int i = 0; i < SWS_BLOCK_SIZE; i++) { \
386  if (X) xx.dst[i] = x[i]; \
387  if (Y) yy.dst[i] = y[i]; \
388  if (Z) zz.dst[i] = z[i]; \
389  if (W) ww.dst[i] = w[i]; \
390  } \
391  \
392  CONTINUE(&xx, &yy, &zz, &ww); \
393  } \
394  \
395  SWS_FOR(PX, TO_##DST, DECL_IMPL, to_##dst) \
396  SWS_FOR_STRUCT(PX, TO_##DST, DECL_ENTRY)
397 
398 DECL_CAST(U8, u8)
399 DECL_CAST(U16, u16)
400 DECL_CAST(U32, u32)
401 DECL_CAST(F32, f32)
402 
403 /********************
404  * Bit manipulation *
405  ********************/
406 
407 #if !IS_FLOAT
408 DECL_FUNC(lshift, const SwsCompMask mask, const uint8_t amount)
409 {
410  SWS_LOOP
411  for (int i = 0; i < SWS_BLOCK_SIZE; i++) {
412  if (X) x[i] <<= amount;
413  if (Y) y[i] <<= amount;
414  if (Z) z[i] <<= amount;
415  if (W) w[i] <<= amount;
416  }
417 
418  CONTINUE(x, y, z, w);
419 }
420 
421 DECL_FUNC(rshift, const SwsCompMask mask, const uint8_t amount)
422 {
423  SWS_LOOP
424  for (int i = 0; i < SWS_BLOCK_SIZE; i++) {
425  if (X) x[i] >>= amount;
426  if (Y) y[i] >>= amount;
427  if (Z) z[i] >>= amount;
428  if (W) w[i] >>= amount;
429  }
430 
431  CONTINUE(x, y, z, w);
432 }
433 #endif
434 
435 SWS_FOR(PX, LSHIFT, DECL_IMPL, lshift)
436 SWS_FOR(PX, RSHIFT, DECL_IMPL, rshift)
437 
438 SWS_FOR_STRUCT(PX, LSHIFT, DECL_ENTRY)
440 
441 #ifdef PIXEL_SWAP
442 DECL_FUNC(swap_bytes, const SwsCompMask mask)
443 {
444  SWS_LOOP
445  for (int i = 0; i < SWS_BLOCK_SIZE; i++) {
446  if (X) x[i] = PIXEL_SWAP(x[i]);
447  if (Y) y[i] = PIXEL_SWAP(y[i]);
448  if (Z) z[i] = PIXEL_SWAP(z[i]);
449  if (W) w[i] = PIXEL_SWAP(w[i]);
450  }
451 
452  CONTINUE(x, y, z, w);
453 }
454 #endif /* PIXEL_SWAP */
455 
456 SWS_FOR(PX, SWAP_BYTES, DECL_IMPL, swap_bytes)
457 SWS_FOR_STRUCT(PX, SWAP_BYTES, DECL_ENTRY)
458 
459 #ifdef PIXEL_MAX
460 DECL_FUNC(expand_bit, const SwsCompMask mask)
461 {
462  SWS_LOOP
463  for (int i = 0; i < SWS_BLOCK_SIZE; i++) {
464  if (X) x[i] = x[i] ? PIXEL_MAX : 0;
465  if (Y) y[i] = y[i] ? PIXEL_MAX : 0;
466  if (Z) z[i] = z[i] ? PIXEL_MAX : 0;
467  if (W) w[i] = w[i] ? PIXEL_MAX : 0;
468  }
469 
470  CONTINUE(x, y, z, w);
471 }
472 #endif
473 
474 #if BIT_DEPTH == 8
475 DECL_FUNC(expand_pair, const SwsCompMask mask)
476 {
477  block_t x16, y16, z16, w16;
478 
479  SWS_LOOP
480  for (int i = 0; i < SWS_BLOCK_SIZE; i++) {
481  if (X) x16.u16[i] = x[i] << 8 | x[i];
482  if (Y) y16.u16[i] = y[i] << 8 | y[i];
483  if (Z) z16.u16[i] = z[i] << 8 | z[i];
484  if (W) w16.u16[i] = w[i] << 8 | w[i];
485  }
486 
487  CONTINUE(&x16, &y16, &z16, &w16);
488 }
489 
490 DECL_FUNC(expand_quad, const SwsCompMask mask)
491 {
492  block_t x32, y32, z32, w32;
493 
494  SWS_LOOP
495  for (int i = 0; i < SWS_BLOCK_SIZE; i++) {
496  if (X) x32.u32[i] = (uint32_t) x[i] << 24 | x[i] << 16 | x[i] << 8 | x[i];
497  if (Y) y32.u32[i] = (uint32_t) y[i] << 24 | y[i] << 16 | y[i] << 8 | y[i];
498  if (Z) z32.u32[i] = (uint32_t) z[i] << 24 | z[i] << 16 | z[i] << 8 | z[i];
499  if (W) w32.u32[i] = (uint32_t) w[i] << 24 | w[i] << 16 | w[i] << 8 | w[i];
500  }
501 
502  CONTINUE(&x32, &y32, &z32, &w32);
503 }
504 #endif /* BIT_DEPTH == 8 */
505 
506 SWS_FOR(PX, EXPAND_BIT, DECL_IMPL, expand_bit)
507 SWS_FOR(PX, EXPAND_PAIR, DECL_IMPL, expand_pair)
508 SWS_FOR(PX, EXPAND_QUAD, DECL_IMPL, expand_quad)
509 SWS_FOR_STRUCT(PX, EXPAND_BIT, DECL_ENTRY)
510 SWS_FOR_STRUCT(PX, EXPAND_PAIR, DECL_ENTRY)
511 SWS_FOR_STRUCT(PX, EXPAND_QUAD, DECL_ENTRY)
512 
513 /*************************
514  * Packing and unpacking *
515  ************************/
516 
517 #if !IS_FLOAT
519  const uint8_t bx, const uint8_t by,
520  const uint8_t bz, const uint8_t bw)
521 {
522  const uint8_t sx = bw + bz + by;
523  const uint8_t sy = bw + bz;
524  const uint8_t sz = bw;
525  const uint8_t sw = 0;
526 
527  const pixel_t mx = (1 << bx) - 1;
528  const pixel_t my = (1 << by) - 1;
529  const pixel_t mz = (1 << bz) - 1;
530  const pixel_t mw = (1 << bw) - 1;
531 
532  SWS_LOOP
533  for (int i = 0; i < SWS_BLOCK_SIZE; i++) {
534  const pixel_t val = x[i];
535  if (X) x[i] = (val >> sx) & mx;
536  if (Y) y[i] = (val >> sy) & my;
537  if (Z) z[i] = (val >> sz) & mz;
538  if (W) w[i] = (val >> sw) & mw;
539  }
540 
541  CONTINUE(x, y, z, w);
542 }
543 
545  const uint8_t bx, const uint8_t by,
546  const uint8_t bz, const uint8_t bw)
547 {
548  const uint8_t sx = bw + bz + by;
549  const uint8_t sy = bw + bz;
550  const uint8_t sz = bw;
551  const uint8_t sw = 0;
552 
553  SWS_LOOP
554  for (int i = 0; i < SWS_BLOCK_SIZE; i++) {
555  pixel_t val = 0;
556  if (X) val |= x[i] << sx;
557  if (Y) val |= y[i] << sy;
558  if (Z) val |= z[i] << sz;
559  if (W) val |= w[i] << sw;
560  x[i] = val;
561  }
562 
563  CONTINUE(x, y, z, w);
564 }
565 #endif /* !IS_FLOAT */
566 
567 SWS_FOR(PX, UNPACK, DECL_IMPL, unpack)
568 SWS_FOR(PX, PACK, DECL_IMPL, pack)
569 SWS_FOR_STRUCT(PX, UNPACK, DECL_ENTRY)
571 
572 /***********************
573  * Pixel data clearing *
574  ***********************/
575 
576 #ifdef PIXEL_MAX
577 DECL_FUNC(clear, const SwsCompMask mask, const SwsCompMask one,
578  const SwsCompMask zero)
579 {
580  #define ONE(N) SWS_COMP_TEST(one, N)
581  #define ZERO(N) SWS_COMP_TEST(zero, N)
582  const pixel_t cx = ONE(0) ? PIXEL_MAX : ZERO(0) ? 0 : impl->priv.px[0];
583  const pixel_t cy = ONE(1) ? PIXEL_MAX : ZERO(1) ? 0 : impl->priv.px[1];
584  const pixel_t cz = ONE(2) ? PIXEL_MAX : ZERO(2) ? 0 : impl->priv.px[2];
585  const pixel_t cw = ONE(3) ? PIXEL_MAX : ZERO(3) ? 0 : impl->priv.px[3];
586 
587  SWS_LOOP
588  for (int i = 0; i < SWS_BLOCK_SIZE; i++) {
589  if (X) x[i] = cx;
590  if (Y) y[i] = cy;
591  if (Z) z[i] = cz;
592  if (W) w[i] = cw;
593  }
594 
595  CONTINUE(x, y, z, w);
596 }
597 #endif
598 
599 SWS_FOR(PX, CLEAR, DECL_IMPL, clear)
601 
602 /*************************
603  * Arithmetic operations *
604  *************************/
605 
607 {
608  const pixel_t scale = impl->priv.px[0];
609 
610  SWS_LOOP
611  for (int i = 0; i < SWS_BLOCK_SIZE; i++) {
612  if (X) x[i] *= scale;
613  if (Y) y[i] *= scale;
614  if (Z) z[i] *= scale;
615  if (W) w[i] *= scale;
616  }
617 
618  CONTINUE(x, y, z, w);
619 }
620 
622 {
623  SWS_LOOP
624  for (int i = 0; i < SWS_BLOCK_SIZE; i++) {
625  if (X) x[i] += impl->priv.px[0];
626  if (Y) y[i] += impl->priv.px[1];
627  if (Z) z[i] += impl->priv.px[2];
628  if (W) w[i] += impl->priv.px[3];
629  }
630 
631  CONTINUE(x, y, z, w);
632 }
633 
635 {
636  SWS_LOOP
637  for (int i = 0; i < SWS_BLOCK_SIZE; i++) {
638  if (X) x[i] = FFMIN(x[i], impl->priv.px[0]);
639  if (Y) y[i] = FFMIN(y[i], impl->priv.px[1]);
640  if (Z) z[i] = FFMIN(z[i], impl->priv.px[2]);
641  if (W) w[i] = FFMIN(w[i], impl->priv.px[3]);
642  }
643 
644  CONTINUE(x, y, z, w);
645 }
646 
648 {
649  SWS_LOOP
650  for (int i = 0; i < SWS_BLOCK_SIZE; i++) {
651  if (X) x[i] = FFMAX(x[i], impl->priv.px[0]);
652  if (Y) y[i] = FFMAX(y[i], impl->priv.px[1]);
653  if (Z) z[i] = FFMAX(z[i], impl->priv.px[2]);
654  if (W) w[i] = FFMAX(w[i], impl->priv.px[3]);
655  }
656 
657  CONTINUE(x, y, z, w);
658 }
659 
661 SWS_FOR(PX, ADD, DECL_IMPL, add)
668 
669 /*************
670  * Dithering *
671  *************/
672 
674 {
675  const SwsUOp *uop = params->uop;
676  const SwsDitherUOp *dither = &uop->par.dither;
677  const int size = 1 << dither->size_log2;
678  if (size >= SWS_BLOCK_SIZE) {
679  /* No extra padding needed */
680  out->priv.ptr = av_refstruct_ref(uop->data.ptr);
681  out->free = ff_op_priv_unref;
682  return 0;
683  }
684 
685  const int stride = FFMAX(size, SWS_BLOCK_SIZE);
686  const int height = ff_sws_dither_height(dither);
687  pixel_t *matrix = av_malloc(sizeof(pixel_t) * height * stride);
688  if (!matrix)
689  return AVERROR(ENOMEM);
690  out->priv.ptr = matrix;
691  out->free = ff_op_priv_free;
692 
693  /* Pad to multiple of block size. We don't need extra padding for the
694  * height because ff_sws_dither_height() already includes any padding
695  * necessary for the y_offset */
696  for (int y = 0; y < height; y++) {
697  pixel_t *row = &matrix[y * stride];
698  for (int x = 0; x < size; x++)
699  row[x] = uop->data.ptr[y * size + x].px;
700  for (int x = size; x < stride; x++)
701  row[x] = row[x % size];
702  }
703 
704  return 0;
705 }
706 
708  const uint8_t off0, const uint8_t off1,
709  const uint8_t off2, const uint8_t off3,
710  const uint8_t size_log2)
711 {
712  const int size = 1 << size_log2;
713  const int stride = FFMAX(size, SWS_BLOCK_SIZE);
714 
715  const pixel_t *matrix = impl->priv.ptr;
716  matrix += (iter->y & (size - 1)) * stride;
717  matrix += (iter->x & (size - 1)) & ~(SWS_BLOCK_SIZE - 1);
718 
719  const pixel_t *const row0 = &matrix[off0 * stride];
720  const pixel_t *const row1 = &matrix[off1 * stride];
721  const pixel_t *const row2 = &matrix[off2 * stride];
722  const pixel_t *const row3 = &matrix[off3 * stride];
723 
724  SWS_LOOP
725  for (int i = 0; i < SWS_BLOCK_SIZE; i++) {
726  if (X) x[i] += row0[i];
727  if (Y) y[i] += row1[i];
728  if (Z) z[i] += row2[i];
729  if (W) w[i] += row3[i];
730  }
731 
732  CONTINUE(x, y, z, w);
733 }
734 
735 SWS_FOR(PX, DITHER, DECL_IMPL, dither)
736 SWS_FOR_STRUCT(PX, DITHER, DECL_ENTRY, .setup = fn(setup_dither) )
737 
738 /*********************
739  * Linear operations *
740  *********************/
741 
742 typedef struct {
743  /* Stored in split form for convenience */
744  pixel_t m[4][4];
745  pixel_t k[4];
746 } fn(LinCoeffs);
747 
749 {
750  const SwsUOp *uop = params->uop;
751  fn(LinCoeffs) c;
752 
753  for (int i = 0; i < 4; i++) {
754  for (int j = 0; j < 4; j++)
755  c.m[i][j] = uop->data.mat4[i][j].px;
756  c.k[i] = uop->data.mat4[i][4].px;
757  }
758 
759  out->priv.ptr = av_memdup(&c, sizeof(c));
760  out->free = ff_op_priv_free;
761  return out->priv.ptr ? 0 : AVERROR(ENOMEM);
762 }
763 
764 /**
765  * Fully general case for a 5x5 linear affine transformation. Should never be
766  * called without constant `mask`. This function will compile down to the
767  * appropriately optimized version for the required subset of operations when
768  * called with a constant mask.
769  */
770 DECL_FUNC(linear, const SwsCompMask mask, const uint32_t one, const uint32_t zero)
771 {
772  const fn(LinCoeffs) c = *(const fn(LinCoeffs) *) impl->priv.ptr;
773 
774  SWS_LOOP
775  for (int i = 0; i < SWS_BLOCK_SIZE; i++) {
776  const pixel_t xx = x[i];
777  const pixel_t yy = y[i];
778  const pixel_t zz = z[i];
779  const pixel_t ww = w[i];
780 
781 #define LIN_VAL(I, J, val) \
782  ((one & SWS_MASK(I, J)) ? (val) : c.m[I][J] * (val))
783 
784 #define LIN_ROW(I, var) do { \
785  var[i] = (zero & SWS_MASK(I, 4)) ? 0 : c.k[I]; \
786  if (!(zero & SWS_MASK(I, 0))) var[i] += LIN_VAL(I, 0, xx); \
787  if (!(zero & SWS_MASK(I, 1))) var[i] += LIN_VAL(I, 1, yy); \
788  if (!(zero & SWS_MASK(I, 2))) var[i] += LIN_VAL(I, 2, zz); \
789  if (!(zero & SWS_MASK(I, 3))) var[i] += LIN_VAL(I, 3, ww); \
790 } while (0)
791 
792  if (X) LIN_ROW(0, x);
793  if (Y) LIN_ROW(1, y);
794  if (Z) LIN_ROW(2, z);
795  if (W) LIN_ROW(3, w);
796  }
797 
798  CONTINUE(x, y, z, w);
799 }
800 
803 
/*
 * Drop every per-instantiation macro so that the template can be included
 * again with a different BIT_DEPTH / IS_FLOAT. PIXEL_TYPE is only defined by
 * the float configuration, but must be removed as well so that it cannot leak
 * into a following instantiation; #undef of an undefined name is a no-op.
 */
#undef PIXEL_TYPE
#undef PIXEL_MAX
#undef PIXEL_SWAP
#undef pixel_t
#undef inter_t
#undef block_t
#undef PX
#undef px
DECL_IMPL_WRITE
#define DECL_IMPL_WRITE(...)
Definition: uops_tmpl.h:133
PIXEL_MAX
#define PIXEL_MAX
Definition: uops_tmpl.c:50
AVERROR
Filter the word “frame” indicates either a video frame or a group of audio as stored in an AVFrame structure Format for each input and each output the list of supported formats For video that means pixel format For audio that means channel sample they are references to shared objects When the negotiation mechanism computes the intersection of the formats supported at each end of a all references to both lists are replaced with a reference to the intersection And when a single format is eventually chosen for a link amongst the remaining all references to the list are updated That means that if a filter requires that its input and output have the same format amongst a supported all it has to do is use a reference to the same list of formats query_formats can leave some formats unset and return AVERROR(EAGAIN) to cause the negotiation mechanism toagain later. That can be used by filters with complex requirements to use the format negotiated on one link to set the formats supported on another. Frame references ownership and permissions
MAX
#define MAX
Definition: blend_modes.c:46
out
static FILE * out
Definition: movenc.c:55
SIZEOF_BLOCK
#define SIZEOF_BLOCK
Definition: uops_tmpl.h:50
Z
#define Z
Definition: uops_tmpl.h:83
X
@ X
Definition: vf_addroi.c:27
matrix
Definition: vc1dsp.c:43
U32
@ U32
Definition: sw_ops.c:43
block_t::f32
float f32[SWS_BLOCK_SIZE]
Definition: uops_tmpl.h:47
DECL_ENTRY
#define DECL_ENTRY(SETUP, NAME,...)
Definition: uops_tmpl.h:139
mask
int mask
Definition: mediacodecdec_common.c:154
SwsUOp::data
union SwsUOp::@585 data
SwsFilterWeights
Represents a computed filter kernel.
Definition: filters.h:64
DECL_COPY
#define DECL_COPY(DUMMY, NAME, TYPE, UOP, MASK, IDX0, IDX1, IDX2, IDX3)
Definition: uops_tmpl.c:353
linear
static int linear(InterplayACMContext *s, unsigned ind, unsigned col)
Definition: interplayacm.c:135
filter
void(* filter)(uint8_t *src, int stride, int qscale)
Definition: h263dsp.c:29
max
#define max(a, b)
Definition: cuda_runtime.h:33
FFMAX
#define FFMAX(a, b)
Definition: macros.h:47
SwsOpExec::in_stride
ptrdiff_t in_stride[4]
Definition: ops_dispatch.h:41
ff_op_priv_unref
static void ff_op_priv_unref(SwsOpPriv *priv)
Definition: ops_chain.h:149
setup_linear
static int setup_linear(const SwsImplParams *params, SwsImplResult *out)
Definition: ops.c:272
ONE
#define ONE(N)
av_memdup
void * av_memdup(const void *p, size_t size)
Duplicate a buffer with av_malloc().
Definition: mem.c:304
setup_dither
static int setup_dither(const SwsImplParams *params, SwsImplResult *out)
Definition: ops.c:265
mx
uint8_t ptrdiff_t const uint8_t ptrdiff_t int intptr_t mx
Definition: dsp.h:57
DECL_WRITE
DECL_WRITE(write_planar, const SwsCompMask mask)
Definition: uops_tmpl.c:97
PX
#define PX
Definition: uops_tmpl.c:53
weight
const h264_weight_func weight
Definition: h264dsp_init.c:33
val
static double val(void *priv, double ch)
Definition: aeval.c:77
type
it s the only field you need to keep assuming you have a context There is some magic you don t need to care about around this just let it vf type
Definition: writing_filters.txt:86
SWS_COMP_ELEMS
#define SWS_COMP_ELEMS(N)
Definition: uops.h:73
SWS_FOR
#define SWS_FOR(TYPE, UOP, MACRO,...)
Definition: uops_macros.h:17
SWS_FOR_STRUCT
#define SWS_FOR_STRUCT(TYPE, UOP, MACRO,...)
Definition: uops_macros.h:19
float
float
Definition: af_crystalizer.c:122
W
#define W(a, i, v)
Definition: jpegls.h:119
dither
static const uint16_t dither[8][8]
Definition: vf_gradfun.c:46
SwsUOp::uop
SwsUOpType uop
Definition: uops.h:204
SCALE
#define SCALE(c)
Definition: dcadata.c:7338
LINEAR
#define LINEAR
Definition: vf_perspective.c:36
SwsCompMask
uint8_t SwsCompMask
Bit-mask of components.
Definition: uops.h:61
COPY
#define COPY(src, name)
RSHIFT
#define RSHIFT(a, b)
Definition: common.h:56
my
uint8_t ptrdiff_t const uint8_t ptrdiff_t int intptr_t intptr_t my
Definition: dsp.h:57
read_bit
static unsigned int BS_FUNC() read_bit(BSCTX *bc)
Return one bit from the buffer.
Definition: bitstream_template.h:211
SwsOpExec
Copyright (C) 2026 Niklas Haas.
Definition: ops_dispatch.h:35
LIN_ROW
#define LIN_ROW(I, var)
xs
#define xs(width, name, var, subs,...)
Definition: cbs_vp9.c:305
SwsUOp::mat4
SwsPixel mat4[4][5]
Definition: uops.h:214
ADD
#define ADD(a, b)
Definition: dct32_template.c:123
block_t
Definition: uops_tmpl.h:43
DECL_IMPL
#define DECL_IMPL(FUNC, NAME, TYPE, UOP,...)
Definition: uops_tmpl.h:119
SWS_BLOCK_SIZE
#define SWS_BLOCK_SIZE
Copyright (C) 2026 Niklas Haas.
Definition: uops_tmpl.h:40
SwsPixelType
SwsPixelType
Definition: uops.h:38
c
Undefined Behavior In the C some operations are like signed integer dereferencing freed accessing outside allocated Undefined Behavior must not occur in a C it is not safe even if the output of undefined operations is unused The unsafety may seem nit picking but Optimizing compilers have in fact optimized code on the assumption that no undefined Behavior occurs Optimizing code based on wrong assumptions can and has in some cases lead to effects beyond the output of computations The signed integer overflow problem in speed critical code Code which is highly optimized and works with signed integers sometimes has the problem that often the output of the computation does not c
Definition: undefined.txt:32
SwsUOp::par
SwsUOpParams par
Definition: uops.h:206
ff_sws_setup_vec4
int ff_sws_setup_vec4(const SwsImplParams *params, SwsImplResult *out)
Definition: ops_chain.c:197
SwsUOp
Definition: uops.h:201
SWS_LOOP
#define SWS_LOOP
Definition: uops_tmpl.h:68
height
#define height
Definition: dsp.h:89
i
#define i(width, name, range_min, range_max)
Definition: cbs_h264.c:63
for
for(k=2;k<=8;++k)
Definition: h264pred_template.c:424
size
int size
Definition: twinvq_data.h:10344
mz
static double mz(int i, double w0, double r, double alpha)
Definition: af_atilt.c:55
inter_t
#define inter_t
Definition: uops_tmpl.c:52
fn
#define fn(a)
Definition: aap_template.c:37
av_refstruct_ref
void * av_refstruct_ref(void *obj)
Create a new reference to an object managed via this API, i.e.
Definition: refstruct.c:140
offset
it s the only field you need to keep assuming you have a context There is some magic you don t need to care about around this just let it vf offset
Definition: writing_filters.txt:86
zero
static int zero(InterplayACMContext *s, unsigned ind, unsigned col)
Definition: interplayacm.c:121
Y
#define Y
Definition: boxblur.h:37
U8
@ U8
Definition: sw_ops.c:41
CONTINUE
#define CONTINUE(...)
Definition: uops_tmpl.h:107
av_assert2
#define av_assert2(cond)
assert() equivalent, that does lie in speed critical code.
Definition: avassert.h:68
unpack
static int unpack(const uint8_t *src, const uint8_t *src_end, uint8_t *dst, int width, int height)
Unpack buffer.
Definition: eatgv.c:73
av_malloc
#define av_malloc(s)
Definition: ops_asmgen.c:44
SwsOpExec::in_offset_x
int32_t * in_offset_x
Pixel offset map; for horizontal scaling, in bytes.
Definition: ops_dispatch.h:80
weights
static const int weights[]
Definition: hevc_pel.c:32
FFMIN
#define FFMIN(a, b)
Definition: macros.h:49
MIN
#define MIN(a, b)
Definition: qt-faststart.c:45
av_calloc
void * av_calloc(size_t nmemb, size_t size)
Definition: mem.c:264
ff_op_priv_free
static void ff_op_priv_free(SwsOpPriv *priv)
Definition: ops_chain.h:144
bswap.h
F32
@ F32
Definition: sw_ops.c:44
bump_ptr
#define bump_ptr(ptr, bump)
Definition: uops_tmpl.h:78
DECL_FUNC
DECL_FUNC(lshift, const SwsCompMask mask, const uint8_t amount)
Definition: uops_tmpl.c:408
DECL_SETUP
DECL_SETUP(setup_filter_v, params, out)
Definition: uops_tmpl.c:220
DECL_CAST
#define DECL_CAST(DST, dst)
Definition: uops_tmpl.c:379
SwsUOp::ptr
SwsPixel * ptr
Definition: uops.h:211
pixel_t
#define pixel_t
Definition: uops_tmpl.c:51
block_t::u32
uint32_t u32[SWS_BLOCK_SIZE]
Definition: uops_tmpl.h:46
CLEAR
#define CLEAR(destin)
Definition: wavpackenc.c:50
ff_sws_setup_scalar
int ff_sws_setup_scalar(const SwsImplParams *params, SwsImplResult *out)
Definition: ops_chain.c:182
SWS_FILTER_SCALE
@ SWS_FILTER_SCALE
14-bit coefficients are picked to fit comfortably within int16_t for efficient SIMD processing (e....
Definition: filters.h:40
block_t::u16
uint16_t u16[SWS_BLOCK_SIZE]
Definition: uops_tmpl.h:45
SwsDitherUOp
Definition: uops.h:179
SwsUOpParams::dither
SwsDitherUOp dither
Definition: uops.h:198
ZERO
#define ZERO(N)
SWS_PIXEL_F32
@ SWS_PIXEL_F32
Definition: uops.h:43
w
uint8_t w
Definition: llvidencdsp.c:39
scale
static void scale(int *out, const int *in, const int w, const int h, const int shift)
Definition: intra.c:278
uops_tmpl.h
setup_filter_v
static int setup_filter_v(const SwsImplParams *params, SwsImplResult *out)
Definition: ops.c:46
DECL_PERMUTE
#define DECL_PERMUTE(DUMMY, NAME, TYPE, UOP, MASK, IDX0, IDX1, IDX2, IDX3)
Definition: uops_tmpl.c:344
DECL_READ
DECL_READ(read_planar, const SwsCompMask mask)
Definition: uops_tmpl.c:63
int32_t
int32_t
Definition: audioconvert.c:56
ff_sws_dither_height
int ff_sws_dither_height(const SwsDitherUOp *dither)
Computes (1 << size_log2) + MAX(y_offset).
Definition: uops.c:400
stride
#define stride
Definition: h264pred_template.c:536
U16
@ U16
Definition: sw_ops.c:42
DECL_IMPL_READ
#define DECL_IMPL_READ(...)
Definition: uops_tmpl.h:128
min
float min
Definition: vorbis_enc_data.h:429
setup_filter_h
static int setup_filter_h(const SwsImplParams *params, SwsImplResult *out)
Definition: ops.c:76