doxygen/trunk/uops__tmpl_8c_source.html

/**

 * Copyright (C) 2026 Niklas Haas

 *

 * This file is part of FFmpeg.

 *

 * FFmpeg is free software; you can redistribute it and/or

 * modify it under the terms of the GNU Lesser General Public

 * License as published by the Free Software Foundation; either

 * version 2.1 of the License, or (at your option) any later version.

 *

 * FFmpeg is distributed in the hope that it will be useful,

 * but WITHOUT ANY WARRANTY; without even the implied warranty of

 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU

 * Lesser General Public License for more details.

 *

 * You should have received a copy of the GNU Lesser General Public

 * License along with FFmpeg; if not, write to the Free Software

 * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA

 */


#include <libavutil/bswap.h>


#include "uops_tmpl.h"


/* Fall back to 8-bit samples when the includer did not select a depth. */
#ifndef BIT_DEPTH
#  define BIT_DEPTH 8
#endif

/*
 * Per-instantiation type selection, keyed on BIT_DEPTH and IS_FLOAT.
 * IS_FLOAT is not defined in this file; an undefined macro evaluates to 0
 * inside #if, which selects one of the integer branches.
 *
 *   pixel_t    - element type used by the kernels below
 *   inter_t    - accumulator type used by read_planar_fh
 *   PX         - upper-case type tag passed to SWS_FOR / SWS_FOR_STRUCT
 *   px         - lower-case type tag used as a member name
 *                (impl->priv.px, block.px, ...)
 *   PIXEL_MAX  - all-ones value; integer types only, gates expand_bit/clear
 *   PIXEL_SWAP - byte swap helper; 16/32-bit integers only, gates swap_bytes
 *
 * NOTE(review): PIXEL_TYPE is only defined in the float branch and is not
 * referenced anywhere else in this file - confirm whether it is still needed.
 */
#if IS_FLOAT && BIT_DEPTH == 32
#  define PIXEL_TYPE SWS_PIXEL_F32
#  define pixel_t    float
#  define inter_t    float
#  define PX         F32
#  define px         f32
#elif BIT_DEPTH == 32
#  define PIXEL_MAX  0xFFFFFFFFu
#  define PIXEL_SWAP av_bswap32
#  define pixel_t    uint32_t
#  define inter_t    int64_t
#  define PX         U32
#  define px         u32
#elif BIT_DEPTH == 16
#  define PIXEL_MAX  0xFFFFu
#  define PIXEL_SWAP av_bswap16
#  define pixel_t    uint16_t
#  define inter_t    int64_t
#  define PX         U16
#  define px         u16
#elif BIT_DEPTH == 8
#  define PIXEL_MAX  0xFFu
#  define pixel_t    uint8_t
#  define inter_t    int32_t
#  define PX         U8
#  define px         u8
#else
#  error Invalid BIT_DEPTH
#endif


/*********************************

 * Generic read/write operations *

 *********************************/


/**
 * Load one block of samples from up to four separate input planes.
 *
 * Only the components guarded by X/Y/Z/W (presumably derived from `mask`)
 * are copied, and only their iter->in[] cursors are advanced by
 * SIZEOF_BLOCK. The block is then handed to the continuation.
 */
DECL_READ(read_planar, const SwsCompMask mask)
{
    SWS_LOOP
    for (int n = 0; n < SWS_BLOCK_SIZE; n++) {
        if (X)
            x[n] = in0[n];
        if (Y)
            y[n] = in1[n];
        if (Z)
            z[n] = in2[n];
        if (W)
            w[n] = in3[n];
    }

    /* Step every active plane past the block that was just consumed */
    if (X)
        iter->in[0] += SIZEOF_BLOCK;
    if (Y)
        iter->in[1] += SIZEOF_BLOCK;
    if (Z)
        iter->in[2] += SIZEOF_BLOCK;
    if (W)
        iter->in[3] += SIZEOF_BLOCK;

    CONTINUE(x, y, z, w);
}


/**
 * Load one block of interleaved samples from the first input plane.
 *
 * The number of interleaved components per pixel is the index of the highest
 * enabled component plus one (W -> 4, Z -> 3, Y -> 2, otherwise 1). The
 * input cursor advances by that many blocks' worth of SIZEOF_BLOCK.
 */
DECL_READ(read_packed, const SwsCompMask mask)
{
    const int n_comp = W ? 4 : Z ? 3 : Y ? 2 : 1;

    SWS_LOOP
    for (int n = 0; n < SWS_BLOCK_SIZE; n++) {
        if (X)
            x[n] = in0[n_comp * n + 0];
        if (Y)
            y[n] = in0[n_comp * n + 1];
        if (Z)
            z[n] = in0[n_comp * n + 2];
        if (W)
            w[n] = in0[n_comp * n + 3];
    }

    iter->in[0] += SIZEOF_BLOCK * n_comp;
    CONTINUE(x, y, z, w);
}


/**
 * Store one block of samples to up to four separate output planes.
 *
 * Only the components guarded by X/Y/Z/W are written, and only their
 * iter->out[] cursors are advanced by SIZEOF_BLOCK. This kernel does not
 * invoke a continuation.
 */
DECL_WRITE(write_planar, const SwsCompMask mask)
{
    SWS_LOOP
    for (int n = 0; n < SWS_BLOCK_SIZE; n++) {
        if (X)
            out0[n] = x[n];
        if (Y)
            out1[n] = y[n];
        if (Z)
            out2[n] = z[n];
        if (W)
            out3[n] = w[n];
    }

    /* Step every active plane past the block that was just produced */
    if (X)
        iter->out[0] += SIZEOF_BLOCK;
    if (Y)
        iter->out[1] += SIZEOF_BLOCK;
    if (Z)
        iter->out[2] += SIZEOF_BLOCK;
    if (W)
        iter->out[3] += SIZEOF_BLOCK;
}


/**
 * Store one block of samples interleaved into the first output plane.
 *
 * The number of interleaved components per pixel is the index of the highest
 * enabled component plus one (W -> 4, Z -> 3, Y -> 2, otherwise 1). The
 * output cursor advances by that many blocks' worth of SIZEOF_BLOCK.
 */
DECL_WRITE(write_packed, const SwsCompMask mask)
{
    const int n_comp = W ? 4 : Z ? 3 : Y ? 2 : 1;

    SWS_LOOP
    for (int n = 0; n < SWS_BLOCK_SIZE; n++) {
        if (X)
            out0[n_comp * n + 0] = x[n];
        if (Y)
            out0[n_comp * n + 1] = y[n];
        if (Z)
            out0[n_comp * n + 2] = z[n];
        if (W)
            out0[n_comp * n + 3] = w[n];
    }

    iter->out[0] += SIZEOF_BLOCK * n_comp;
}


#if BIT_DEPTH == 8


/**
 * Unpack 1-bit samples: every input element yields eight output samples,
 * most significant bit first. Only the first component is produced.
 */
DECL_READ(read_bit, const SwsCompMask mask)
{
    av_assert2(mask == SWS_COMP_ELEMS(1));

    SWS_LOOP
    for (int base = 0; base < SWS_BLOCK_SIZE; base += 8) {
        const pixel_t packed = ((const pixel_t *) in0)[base >> 3];
        x[base + 0] = (packed >> 7) & 1;
        x[base + 1] = (packed >> 6) & 1;
        x[base + 2] = (packed >> 5) & 1;
        x[base + 3] = (packed >> 4) & 1;
        x[base + 4] = (packed >> 3) & 1;
        x[base + 5] = (packed >> 2) & 1;
        x[base + 6] = (packed >> 1) & 1;
        x[base + 7] = (packed >> 0) & 1;
    }

    /* Eight samples were consumed per input element */
    iter->in[0] += SIZEOF_BLOCK >> 3;
    CONTINUE(x, y, z, w);
}


/**
 * Unpack 4-bit samples: every input element yields two output samples, the
 * high nibble first. Only the first component is produced.
 */
DECL_READ(read_nibble, const SwsCompMask mask)
{
    av_assert2(mask == SWS_COMP_ELEMS(1));

    SWS_LOOP
    for (int base = 0; base < SWS_BLOCK_SIZE; base += 2) {
        const pixel_t packed = in0[base >> 1];
        x[base + 0] = packed >> 4;  /* high nibble */
        x[base + 1] = packed & 0xF; /* low nibble */
    }

    /* Two samples were consumed per input element */
    iter->in[0] += SIZEOF_BLOCK >> 1;
    CONTINUE(x, y, z, w);
}


/**
 * Pack 1-bit samples: eight consecutive samples of the first component are
 * merged into one output element, the first sample landing in the most
 * significant bit. Samples are not masked before shifting.
 */
DECL_WRITE(write_bit, const SwsCompMask mask)
{
    av_assert2(mask == SWS_COMP_ELEMS(1));

    SWS_LOOP
    for (int base = 0; base < SWS_BLOCK_SIZE; base += 8) {
        out0[base >> 3] = x[base + 0] << 7 |
                          x[base + 1] << 6 |
                          x[base + 2] << 5 |
                          x[base + 3] << 4 |
                          x[base + 4] << 3 |
                          x[base + 5] << 2 |
                          x[base + 6] << 1 |
                          x[base + 7];
    }

    /* Eight samples were produced per output element */
    iter->out[0] += SIZEOF_BLOCK >> 3;
}


/**
 * Pack 4-bit samples: two consecutive samples of the first component are
 * merged into one output element, the first sample in the high nibble.
 * Samples are not masked before shifting.
 */
DECL_WRITE(write_nibble, const SwsCompMask mask)
{
    av_assert2(mask == SWS_COMP_ELEMS(1));

    SWS_LOOP
    for (int base = 0; base < SWS_BLOCK_SIZE; base += 2) {
        out0[base >> 1] = x[base] << 4 | x[base + 1];
    }

    /* Two samples were produced per output element */
    iter->out[0] += SIZEOF_BLOCK >> 1;
}


#endif /* BIT_DEPTH == 8 */


/*
 * Instantiate the read/write kernels above for pixel type PX and declare
 * their table entries.
 *
 * NOTE(review): SWS_FOR / SWS_FOR_STRUCT are defined outside this file. The
 * nibble/bit kernels only exist when BIT_DEPTH == 8, so these macros
 * presumably expand to nothing for op/type pairs that are not enabled -
 * confirm against uops_tmpl.h.
 */
SWS_FOR(PX, READ_PLANAR,    DECL_IMPL_READ,     read_planar)
SWS_FOR(PX, READ_PACKED,    DECL_IMPL_READ,     read_packed)
SWS_FOR(PX, READ_NIBBLE,    DECL_IMPL_READ,     read_nibble)
SWS_FOR(PX, READ_BIT,       DECL_IMPL_READ,     read_bit)
SWS_FOR(PX, WRITE_PLANAR,   DECL_IMPL_WRITE,    write_planar)
SWS_FOR(PX, WRITE_PACKED,   DECL_IMPL_WRITE,    write_packed)
SWS_FOR(PX, WRITE_NIBBLE,   DECL_IMPL_WRITE,    write_nibble)
SWS_FOR(PX, WRITE_BIT,      DECL_IMPL_WRITE,    write_bit)

SWS_FOR_STRUCT(PX, READ_PLANAR,     DECL_ENTRY)
SWS_FOR_STRUCT(PX, READ_PACKED,     DECL_ENTRY)
SWS_FOR_STRUCT(PX, READ_NIBBLE,     DECL_ENTRY)
SWS_FOR_STRUCT(PX, READ_BIT,        DECL_ENTRY)
SWS_FOR_STRUCT(PX, WRITE_PLANAR,    DECL_ENTRY)
SWS_FOR_STRUCT(PX, WRITE_PACKED,    DECL_ENTRY)
SWS_FOR_STRUCT(PX, WRITE_NIBBLE,    DECL_ENTRY)
SWS_FOR_STRUCT(PX, WRITE_BIT,       DECL_ENTRY)


/*****************************

 * Scaling / filtering reads *

 *****************************/


/**
 * Setup for the vertical filter read: pre-converts the kernel weights to
 * float.
 *
 * Each weight is divided by SWS_FILTER_SCALE and stored in a newly allocated
 * array kept in out->priv.ptr (released through ff_op_priv_free). The filter
 * size is stored in out->priv.i32[2].
 *
 * @return 0 on success, AVERROR(ENOTSUP) if the filter type is not
 *         SWS_PIXEL_F32, AVERROR(ENOMEM) if the allocation fails
 */
DECL_SETUP(setup_filter_v, params, out)
{
    if (params->uop->par.filter.type != SWS_PIXEL_F32)
        return AVERROR(ENOTSUP);

    const SwsFilterWeights *kernel = params->uop->data.kernel;

    /* The size goes into priv.i32[2]; guard that the pointer stored in
     * priv.ptr does not extend that far */
    static_assert(sizeof(out->priv.ptr) <= sizeof(int32_t[2]),
                  ">8 byte pointers not supported");

    float *scaled = av_calloc(kernel->num_weights, sizeof(float));
    if (!scaled)
        return AVERROR(ENOMEM);

    for (int n = 0; n < kernel->num_weights; n++) {
        scaled[n] = (float) kernel->weights[n] / SWS_FILTER_SCALE;
    }

    out->priv.ptr = scaled;
    out->priv.i32[2] = kernel->filter_size;
    out->free = ff_op_priv_free;
    return 0;
}


/**
 * Fully general vertical planar filter case.
 *
 * For every enabled component, accumulates filter_size input rows (stepping
 * by exec->in_stride[] between rows) weighted by the float weights prepared
 * in impl->priv.ptr, starting at the weight row selected by iter->y. The
 * result is produced as float in local blocks, which are handed to the
 * continuation.
 */
DECL_READ(read_planar_fv, const SwsCompMask mask, const SwsPixelType type)
{
    av_assert2(type == SWS_PIXEL_F32);
    const SwsOpExec *exec = iter->exec;
    const float *restrict coeffs = impl->priv.ptr;
    const int taps = impl->priv.i32[2];
    coeffs += taps * iter->y;

    /* Accumulators start at zero; only active components are initialized */
    block_t xs, ys, zs, ws;
    if (X)
        memset(&xs.f32, 0, sizeof(xs.f32));
    if (Y)
        memset(&ys.f32, 0, sizeof(ys.f32));
    if (Z)
        memset(&zs.f32, 0, sizeof(zs.f32));
    if (W)
        memset(&ws.f32, 0, sizeof(ws.f32));

    for (int tap = 0; tap < taps; tap++) {
        const float coeff = coeffs[tap];

        SWS_LOOP
        for (int n = 0; n < SWS_BLOCK_SIZE; n++) {
            if (X)
                xs.f32[n] += coeff * in0[n];
            if (Y)
                ys.f32[n] += coeff * in1[n];
            if (Z)
                zs.f32[n] += coeff * in2[n];
            if (W)
                ws.f32[n] += coeff * in3[n];
        }

        /* Move on to the next input row of every active plane */
        if (X)
            in0 = bump_ptr(in0, exec->in_stride[0]);
        if (Y)
            in1 = bump_ptr(in1, exec->in_stride[1]);
        if (Z)
            in2 = bump_ptr(in2, exec->in_stride[2]);
        if (W)
            in3 = bump_ptr(in3, exec->in_stride[3]);
    }

    if (X)
        iter->in[0] += SIZEOF_BLOCK;
    if (Y)
        iter->in[1] += SIZEOF_BLOCK;
    if (Z)
        iter->in[2] += SIZEOF_BLOCK;
    if (W)
        iter->in[3] += SIZEOF_BLOCK;

    CONTINUE(&xs, &ys, &zs, &ws);
}


DECL_SETUP(setup_filter_h, params, out)

{

    if (params->uop->par.filter.type != SWS_PIXEL_F32)

        return AVERROR(ENOTSUP);


    SwsFilterWeights *filter = params->uop->data.kernel;

    out->priv.ptr = av_refstruct_ref(filter->weights);

    out->priv.i32[2] = filter->filter_size;

    out->free = ff_op_priv_unref;

    return 0;

}


/**
 * Fully general horizontal planar filter case.
 *
 * For every output sample of the block, the source position is looked up in
 * exec->in_offset_x[] (indexed by iter->x plus the sample index) and
 * filter_size input samples are accumulated with that sample's integer
 * weights. The sum is converted to float and scaled by
 * 1 / SWS_FILTER_SCALE. The input cursors in iter->in[] are not advanced
 * here; positioning comes from in_offset_x.
 *
 * The products are computed in inter_t. Without the explicit conversion the
 * multiplication would be done in the promoted operand type: for uint32_t
 * pixels that is unsigned 32-bit arithmetic, which mangles negative weights
 * and wraps large products before they reach the accumulator.
 */
DECL_READ(read_planar_fh, const SwsCompMask mask, const SwsPixelType type)
{
    av_assert2(type == SWS_PIXEL_F32);
    const SwsOpExec *exec = iter->exec;
    const int *restrict weights = impl->priv.ptr;
    const int filter_size = impl->priv.i32[2];
    const float scale = 1.0f / SWS_FILTER_SCALE;
    const int xpos = iter->x;
    weights += filter_size * iter->x;

    block_t xs, ys, zs, ws;
    for (int i = 0; i < SWS_BLOCK_SIZE; i++) {
        const int offset = exec->in_offset_x[xpos + i];
        pixel_t *start0 = bump_ptr(in0, offset);
        pixel_t *start1 = bump_ptr(in1, offset);
        pixel_t *start2 = bump_ptr(in2, offset);
        pixel_t *start3 = bump_ptr(in3, offset);

        inter_t sx = 0, sy = 0, sz = 0, sw = 0;
        for (int j = 0; j < filter_size; j++) {
            /* Widen before multiplying so the product is formed in inter_t */
            const inter_t weight = weights[j];
            if (X) sx += weight * (inter_t) start0[j];
            if (Y) sy += weight * (inter_t) start1[j];
            if (Z) sz += weight * (inter_t) start2[j];
            if (W) sw += weight * (inter_t) start3[j];
        }

        if (X) xs.f32[i] = (float) sx * scale;
        if (Y) ys.f32[i] = (float) sy * scale;
        if (Z) zs.f32[i] = (float) sz * scale;
        if (W) ws.f32[i] = (float) sw * scale;

        /* Each output sample has its own run of filter_size weights */
        weights += filter_size;
    }

    CONTINUE(&xs, &ys, &zs, &ws);
}


/*
 * Instantiate the filtering reads for pixel type PX. Their table entries
 * additionally name the setup function that prepares the weights
 * (setup_filter_v / setup_filter_h, wrapped in fn()).
 */
SWS_FOR(PX, READ_PLANAR_FV, DECL_IMPL_READ, read_planar_fv)
SWS_FOR(PX, READ_PLANAR_FH, DECL_IMPL_READ, read_planar_fh)
SWS_FOR_STRUCT(PX, READ_PLANAR_FV, DECL_ENTRY, .setup = fn(setup_filter_v) )
SWS_FOR_STRUCT(PX, READ_PLANAR_FH, DECL_ENTRY, .setup = fn(setup_filter_h) )


/***************************

 * Permutation and copying *

 ***************************/


/*
 * Permute by directly swapping the order of arguments to the continuation.
 *
 * Defines the function NAME_c, which forwards its four block pointers to
 * CONTINUE in the order given by IDX0..IDX3 (each index selects one of
 * in0..in3 by token pasting). No sample data is touched. DUMMY, TYPE, UOP
 * and MASK are accepted but not used by this expansion.
 */
#define DECL_PERMUTE(DUMMY, NAME, TYPE, UOP, MASK, IDX0, IDX1, IDX2, IDX3)      \
    static void NAME##_c(SwsOpIter *restrict iter,                              \
                         const SwsOpImpl *restrict impl,                        \
                         void *restrict in0, void *restrict in1,                \
                         void *restrict in2, void *restrict in3)                \
    {                                                                           \
        CONTINUE(in##IDX0, in##IDX1, in##IDX2, in##IDX3);                       \
    }


/*
 * Permute by copying.
 *
 * Defines the function NAME_c. For every component guarded by X/Y/Z/W (with
 * `mask` set to MASK), SIZEOF_BLOCK bytes are copied from the input block
 * selected by IDX0..IDX3 into a local block, and the local block is passed
 * on to the continuation. Components that are not enabled keep their
 * original pointer (in0..in3). DUMMY, TYPE and UOP are accepted but not used
 * by this expansion.
 */
#define DECL_COPY(DUMMY, NAME, TYPE, UOP, MASK, IDX0, IDX1, IDX2, IDX3)         \
    static void NAME##_c(SwsOpIter *restrict iter,                              \
                         const SwsOpImpl *restrict impl,                        \
                         void *restrict in0, void *restrict in1,                \
                         void *restrict in2, void *restrict in3)                \
    {                                                                           \
        const SwsCompMask mask = (MASK);                                        \
        block_t x, y, z, w;                                                     \
                                                                                \
        if (X) memcpy(&x.px, in##IDX0, SIZEOF_BLOCK);                           \
        if (Y) memcpy(&y.px, in##IDX1, SIZEOF_BLOCK);                           \
        if (Z) memcpy(&z.px, in##IDX2, SIZEOF_BLOCK);                           \
        if (W) memcpy(&w.px, in##IDX3, SIZEOF_BLOCK);                           \
                                                                                \
        CONTINUE(X ? &x : in0, Y ? &y : in1, Z ? &z : in2, W ? &w : in3);       \
    }


/*
 * Instantiate the permute/copy kernels for pixel type PX through the two
 * generator macros above, then declare their table entries.
 */
SWS_FOR(PX, PERMUTE, DECL_PERMUTE)
SWS_FOR(PX, COPY,    DECL_COPY)
SWS_FOR_STRUCT(PX, PERMUTE, DECL_ENTRY)
SWS_FOR_STRUCT(PX, COPY,    DECL_ENTRY)


/*********************

 * Format conversion *

 *********************/


/*
 * Generate the conversion kernel to_<dst> from the current pixel type to the
 * destination type tagged DST/dst, together with its SWS_FOR instantiation
 * and table entries.
 *
 * Each enabled component is converted element by element with a plain C
 * assignment into a local block (member `dst`), so the usual C conversion
 * rules apply. The local blocks are then handed to the continuation; blocks
 * of components that are not enabled are passed along uninitialized.
 */
#define DECL_CAST(DST, dst)                                                     \
    DECL_FUNC(to_##dst, const SwsCompMask mask)                                 \
    {                                                                           \
        block_t xx, yy, zz, ww;                                                 \
                                                                                \
        SWS_LOOP                                                                \
        for (int i = 0; i < SWS_BLOCK_SIZE; i++) {                              \
            if (X) xx.dst[i] = x[i];                                            \
            if (Y) yy.dst[i] = y[i];                                            \
            if (Z) zz.dst[i] = z[i];                                            \
            if (W) ww.dst[i] = w[i];                                            \
        }                                                                       \
                                                                                \
        CONTINUE(&xx, &yy, &zz, &ww);                                           \
    }                                                                           \
                                                                                \
    SWS_FOR(PX, TO_##DST, DECL_IMPL, to_##dst)                                  \
    SWS_FOR_STRUCT(PX, TO_##DST, DECL_ENTRY)

/* One conversion kernel per destination type */
DECL_CAST(U8,  u8)
DECL_CAST(U16, u16)
DECL_CAST(U32, u32)
DECL_CAST(F32, f32)


/********************

 * Bit manipulation *

 ********************/


#if !IS_FLOAT

/**
 * Shift every enabled component left by `amount` bits, in place.
 * Integer pixel types only.
 */
DECL_FUNC(lshift, const SwsCompMask mask, const uint8_t amount)
{
    SWS_LOOP
    for (int n = 0; n < SWS_BLOCK_SIZE; n++) {
        if (X)
            x[n] <<= amount;
        if (Y)
            y[n] <<= amount;
        if (Z)
            z[n] <<= amount;
        if (W)
            w[n] <<= amount;
    }

    CONTINUE(x, y, z, w);
}


/**
 * Shift every enabled component right by `amount` bits, in place.
 * Integer pixel types only.
 */
DECL_FUNC(rshift, const SwsCompMask mask, const uint8_t amount)
{
    SWS_LOOP
    for (int n = 0; n < SWS_BLOCK_SIZE; n++) {
        if (X)
            x[n] >>= amount;
        if (Y)
            y[n] >>= amount;
        if (Z)
            z[n] >>= amount;
        if (W)
            w[n] >>= amount;
    }

    CONTINUE(x, y, z, w);
}

#endif


/*
 * Instantiate the shift kernels for pixel type PX and declare their table
 * entries. NOTE(review): lshift/rshift are only defined when !IS_FLOAT, so
 * these macros presumably expand to nothing for F32 - confirm.
 */
SWS_FOR(PX, LSHIFT, DECL_IMPL, lshift)
SWS_FOR(PX, RSHIFT, DECL_IMPL, rshift)

SWS_FOR_STRUCT(PX, LSHIFT, DECL_ENTRY)
SWS_FOR_STRUCT(PX, RSHIFT, DECL_ENTRY)


#ifdef PIXEL_SWAP

/**
 * Reverse the byte order of every enabled component, in place, using
 * PIXEL_SWAP. Only built for pixel types that define PIXEL_SWAP.
 */
DECL_FUNC(swap_bytes, const SwsCompMask mask)
{
    SWS_LOOP
    for (int n = 0; n < SWS_BLOCK_SIZE; n++) {
        if (X)
            x[n] = PIXEL_SWAP(x[n]);
        if (Y)
            y[n] = PIXEL_SWAP(y[n]);
        if (Z)
            z[n] = PIXEL_SWAP(z[n]);
        if (W)
            w[n] = PIXEL_SWAP(w[n]);
    }

    CONTINUE(x, y, z, w);
}

#endif /* PIXEL_SWAP */


/* Instantiate the byte swap kernel for pixel type PX and declare its table
 * entries. */
SWS_FOR(PX, SWAP_BYTES, DECL_IMPL, swap_bytes)
SWS_FOR_STRUCT(PX, SWAP_BYTES, DECL_ENTRY)


#ifdef PIXEL_MAX

/**
 * Expand a boolean sample to full range, in place: every non-zero value of
 * an enabled component becomes PIXEL_MAX, zero stays zero.
 */
DECL_FUNC(expand_bit, const SwsCompMask mask)
{
    SWS_LOOP
    for (int n = 0; n < SWS_BLOCK_SIZE; n++) {
        if (X)
            x[n] = x[n] ? PIXEL_MAX : 0;
        if (Y)
            y[n] = y[n] ? PIXEL_MAX : 0;
        if (Z)
            z[n] = z[n] ? PIXEL_MAX : 0;
        if (W)
            w[n] = w[n] ? PIXEL_MAX : 0;
    }

    CONTINUE(x, y, z, w);
}

#endif


#if BIT_DEPTH == 8

/**
 * Widen 8-bit samples to 16 bits by replicating the byte into both halves
 * (v -> v << 8 | v). Results are written to local blocks (member u16) which
 * are handed to the continuation.
 */
DECL_FUNC(expand_pair, const SwsCompMask mask)
{
    block_t x16, y16, z16, w16;

    SWS_LOOP
    for (int n = 0; n < SWS_BLOCK_SIZE; n++) {
        if (X)
            x16.u16[n] = x[n] << 8 | x[n];
        if (Y)
            y16.u16[n] = y[n] << 8 | y[n];
        if (Z)
            z16.u16[n] = z[n] << 8 | z[n];
        if (W)
            w16.u16[n] = w[n] << 8 | w[n];
    }

    CONTINUE(&x16, &y16, &z16, &w16);
}


/**
 * Widen 8-bit samples to 32 bits by replicating the byte into all four
 * positions. Results are written to local blocks (member u32) which are
 * handed to the continuation.
 */
DECL_FUNC(expand_quad, const SwsCompMask mask)
{
    block_t x32, y32, z32, w32;

    SWS_LOOP
    for (int n = 0; n < SWS_BLOCK_SIZE; n++) {
        /* The top byte is shifted as uint32_t to stay clear of the int sign
         * bit */
        if (X)
            x32.u32[n] = (uint32_t) x[n] << 24 | x[n] << 16 | x[n] << 8 | x[n];
        if (Y)
            y32.u32[n] = (uint32_t) y[n] << 24 | y[n] << 16 | y[n] << 8 | y[n];
        if (Z)
            z32.u32[n] = (uint32_t) z[n] << 24 | z[n] << 16 | z[n] << 8 | z[n];
        if (W)
            w32.u32[n] = (uint32_t) w[n] << 24 | w[n] << 16 | w[n] << 8 | w[n];
    }

    CONTINUE(&x32, &y32, &z32, &w32);
}

#endif /* BIT_DEPTH == 8 */


/*
 * Instantiate the expansion kernels for pixel type PX and declare their
 * table entries. NOTE(review): expand_bit needs PIXEL_MAX and
 * expand_pair/expand_quad need BIT_DEPTH == 8, so these macros presumably
 * expand to nothing for the other types - confirm.
 */
SWS_FOR(PX, EXPAND_BIT,  DECL_IMPL, expand_bit)
SWS_FOR(PX, EXPAND_PAIR, DECL_IMPL, expand_pair)
SWS_FOR(PX, EXPAND_QUAD, DECL_IMPL, expand_quad)
SWS_FOR_STRUCT(PX, EXPAND_BIT,  DECL_ENTRY)
SWS_FOR_STRUCT(PX, EXPAND_PAIR, DECL_ENTRY)
SWS_FOR_STRUCT(PX, EXPAND_QUAD, DECL_ENTRY)


/*************************

 * Packing and unpacking *

 *************************/


#if !IS_FLOAT

/**
 * Split a packed sample (held in the first component) into up to four
 * bit fields.
 *
 * The fields are laid out from the most significant end as x, y, z, w with
 * widths bx, by, bz, bw; w occupies the lowest bits. Each enabled component
 * receives its field, shifted down and masked to its width.
 *
 * The field masks are built in 64-bit unsigned arithmetic: `1 << b` on a
 * plain int is undefined once b reaches 31, which a 32-bit pixel type can
 * legitimately require.
 */
DECL_FUNC(unpack, const SwsCompMask mask,
                  const uint8_t bx, const uint8_t by,
                  const uint8_t bz, const uint8_t bw)
{
    const uint8_t sx = bw + bz + by;
    const uint8_t sy = bw + bz;
    const uint8_t sz = bw;
    const uint8_t sw = 0;

    const pixel_t mx = (pixel_t) (((uint64_t) 1 << bx) - 1);
    const pixel_t my = (pixel_t) (((uint64_t) 1 << by) - 1);
    const pixel_t mz = (pixel_t) (((uint64_t) 1 << bz) - 1);
    const pixel_t mw = (pixel_t) (((uint64_t) 1 << bw) - 1);

    SWS_LOOP
    for (int i = 0; i < SWS_BLOCK_SIZE; i++) {
        /* Read the packed value first: x[i] is overwritten below */
        const pixel_t val = x[i];
        if (X) x[i] = (val >> sx) & mx;
        if (Y) y[i] = (val >> sy) & my;
        if (Z) z[i] = (val >> sz) & mz;
        if (W) w[i] = (val >> sw) & mw;
    }

    CONTINUE(x, y, z, w);
}


/**
 * Merge up to four components into one packed sample, stored in the first
 * component.
 *
 * The fields are laid out from the most significant end as x, y, z, w; the
 * shift of each field is the sum of the widths of the fields below it
 * (by, bz, bw). Components are not masked before shifting, and bx does not
 * influence the result.
 */
DECL_FUNC(pack, const SwsCompMask mask,
                const uint8_t bx, const uint8_t by,
                const uint8_t bz, const uint8_t bw)
{
    const uint8_t shift_w = 0;
    const uint8_t shift_z = bw;
    const uint8_t shift_y = bw + bz;
    const uint8_t shift_x = bw + bz + by;

    SWS_LOOP
    for (int n = 0; n < SWS_BLOCK_SIZE; n++) {
        pixel_t merged = 0;
        if (X)
            merged |= x[n] << shift_x;
        if (Y)
            merged |= y[n] << shift_y;
        if (Z)
            merged |= z[n] << shift_z;
        if (W)
            merged |= w[n] << shift_w;
        x[n] = merged;
    }

    CONTINUE(x, y, z, w);
}

#endif /* !IS_FLOAT */


/*
 * Instantiate the pack/unpack kernels for pixel type PX and declare their
 * table entries. NOTE(review): both kernels are only defined when
 * !IS_FLOAT, so these macros presumably expand to nothing for F32 - confirm.
 */
SWS_FOR(PX, UNPACK, DECL_IMPL, unpack)
SWS_FOR(PX, PACK,   DECL_IMPL, pack)
SWS_FOR_STRUCT(PX, UNPACK,  DECL_ENTRY)
SWS_FOR_STRUCT(PX, PACK,    DECL_ENTRY)


/***********************

 * Pixel data clearing *

 ***********************/


#ifdef PIXEL_MAX

/**
 * Overwrite every enabled component with a constant.
 *
 * Per component the constant is PIXEL_MAX if its bit is set in `one`, 0 if
 * its bit is set in `zero` (with `one` taking precedence), and otherwise
 * the value stored in impl->priv.px[] for that component.
 */
DECL_FUNC(clear, const SwsCompMask mask, const SwsCompMask one,
                 const SwsCompMask zero)
{
    #define ONE(N)  SWS_COMP_TEST(one, N)
    #define ZERO(N) SWS_COMP_TEST(zero, N)
    const pixel_t fill_x = ONE(0) ? PIXEL_MAX : ZERO(0) ? 0 : impl->priv.px[0];
    const pixel_t fill_y = ONE(1) ? PIXEL_MAX : ZERO(1) ? 0 : impl->priv.px[1];
    const pixel_t fill_z = ONE(2) ? PIXEL_MAX : ZERO(2) ? 0 : impl->priv.px[2];
    const pixel_t fill_w = ONE(3) ? PIXEL_MAX : ZERO(3) ? 0 : impl->priv.px[3];

    SWS_LOOP
    for (int n = 0; n < SWS_BLOCK_SIZE; n++) {
        if (X)
            x[n] = fill_x;
        if (Y)
            y[n] = fill_y;
        if (Z)
            z[n] = fill_z;
        if (W)
            w[n] = fill_w;
    }

    CONTINUE(x, y, z, w);
}

#endif


/* Instantiate the clear kernel for pixel type PX; its table entries use
 * ff_sws_setup_vec4 as the setup function. */
SWS_FOR(PX, CLEAR, DECL_IMPL, clear)
SWS_FOR_STRUCT(PX, CLEAR, DECL_ENTRY, .setup = ff_sws_setup_vec4)


/*************************

 * Arithmetic operations *

 *************************/


/**
 * Multiply every enabled component, in place, by the single factor stored
 * in impl->priv.px[0].
 */
DECL_FUNC(scale, const SwsCompMask mask)
{
    const pixel_t factor = impl->priv.px[0];

    SWS_LOOP
    for (int n = 0; n < SWS_BLOCK_SIZE; n++) {
        if (X)
            x[n] *= factor;
        if (Y)
            y[n] *= factor;
        if (Z)
            z[n] *= factor;
        if (W)
            w[n] *= factor;
    }

    CONTINUE(x, y, z, w);
}


/**
 * Add a per-component constant to every enabled component, in place.
 * The constants are taken from impl->priv.px[0..3].
 */
DECL_FUNC(add, const SwsCompMask mask)
{
    SWS_LOOP
    for (int n = 0; n < SWS_BLOCK_SIZE; n++) {
        if (X)
            x[n] += impl->priv.px[0];
        if (Y)
            y[n] += impl->priv.px[1];
        if (Z)
            z[n] += impl->priv.px[2];
        if (W)
            w[n] += impl->priv.px[3];
    }

    CONTINUE(x, y, z, w);
}


/**
 * Clamp every enabled component from above, in place: each sample becomes
 * FFMIN(sample, limit), with the per-component limits taken from
 * impl->priv.px[0..3].
 */
DECL_FUNC(min, const SwsCompMask mask)
{
    SWS_LOOP
    for (int n = 0; n < SWS_BLOCK_SIZE; n++) {
        if (X)
            x[n] = FFMIN(x[n], impl->priv.px[0]);
        if (Y)
            y[n] = FFMIN(y[n], impl->priv.px[1]);
        if (Z)
            z[n] = FFMIN(z[n], impl->priv.px[2]);
        if (W)
            w[n] = FFMIN(w[n], impl->priv.px[3]);
    }

    CONTINUE(x, y, z, w);
}


/**
 * Clamp every enabled component from below, in place: each sample becomes
 * FFMAX(sample, limit), with the per-component limits taken from
 * impl->priv.px[0..3].
 */
DECL_FUNC(max, const SwsCompMask mask)
{
    SWS_LOOP
    for (int n = 0; n < SWS_BLOCK_SIZE; n++) {
        if (X)
            x[n] = FFMAX(x[n], impl->priv.px[0]);
        if (Y)
            y[n] = FFMAX(y[n], impl->priv.px[1]);
        if (Z)
            z[n] = FFMAX(z[n], impl->priv.px[2]);
        if (W)
            w[n] = FFMAX(w[n], impl->priv.px[3]);
    }

    CONTINUE(x, y, z, w);
}


/*
 * Instantiate the arithmetic kernels for pixel type PX. The table entries
 * name the setup function that fills impl->priv: ff_sws_setup_scalar for
 * scale (which reads one constant), ff_sws_setup_vec4 for add/min/max
 * (which read one constant per component).
 */
SWS_FOR(PX, SCALE, DECL_IMPL, scale)
SWS_FOR(PX, ADD,   DECL_IMPL, add)
SWS_FOR(PX, MIN,   DECL_IMPL, min)
SWS_FOR(PX, MAX,   DECL_IMPL, max)
SWS_FOR_STRUCT(PX, SCALE, DECL_ENTRY, .setup = ff_sws_setup_scalar )
SWS_FOR_STRUCT(PX, ADD,   DECL_ENTRY, .setup = ff_sws_setup_vec4 )
SWS_FOR_STRUCT(PX, MIN,   DECL_ENTRY, .setup = ff_sws_setup_vec4 )
SWS_FOR_STRUCT(PX, MAX,   DECL_ENTRY, .setup = ff_sws_setup_vec4 )


/*************

 * Dithering *

 *************/


DECL_SETUP(setup_dither, params, out)

{

    const SwsUOp *uop = params->uop;

    const SwsDitherUOp *dither = &uop->par.dither;

    const int size = 1 << dither->size_log2;

    if (size >= SWS_BLOCK_SIZE) {

        /* No extra padding needed */

        out->priv.ptr = av_refstruct_ref(uop->data.ptr);

        out->free = ff_op_priv_unref;

        return 0;

    }


    const int stride = FFMAX(size, SWS_BLOCK_SIZE);

    const int height = ff_sws_dither_height(dither);

    pixel_t *matrix = av_malloc(sizeof(pixel_t) * height * stride);

    if (!matrix)

        return AVERROR(ENOMEM);

    out->priv.ptr = matrix;

    out->free = ff_op_priv_free;


    /* Pad to multiple of block size. We don't need extra padding for the

     * height because ff_sws_dither_height() already includes any padding

     * necessary for the y_offset */

    for (int y = 0; y < height; y++) {

        pixel_t *row = &matrix[y * stride];

        for (int x = 0; x < size; x++)

            row[x] = uop->data.ptr[y * size + x].px;

        for (int x = size; x < stride; x++)

            row[x] = row[x % size];

    }


    return 0;

}


/**
 * Add a dither matrix row to every enabled component, in place.
 *
 * The matrix in impl->priv.ptr is addressed with a row stride of
 * max(1 << size_log2, SWS_BLOCK_SIZE). The base position is selected from
 * iter->y and iter->x (both wrapped to the matrix size, x additionally
 * rounded down to a block boundary). off0..off3 are per-component row
 * offsets relative to that base.
 */
DECL_FUNC(dither, const SwsCompMask mask,
                  const uint8_t off0, const uint8_t off1,
                  const uint8_t off2, const uint8_t off3,
                  const uint8_t size_log2)
{
    const int size   = 1 << size_log2;
    const int stride = FFMAX(size, SWS_BLOCK_SIZE);

    const pixel_t *base = impl->priv.ptr;
    base += (iter->y & (size - 1)) * stride;
    base += (iter->x & (size - 1)) & ~(SWS_BLOCK_SIZE - 1);

    const pixel_t *const noise_x = &base[off0 * stride];
    const pixel_t *const noise_y = &base[off1 * stride];
    const pixel_t *const noise_z = &base[off2 * stride];
    const pixel_t *const noise_w = &base[off3 * stride];

    SWS_LOOP
    for (int n = 0; n < SWS_BLOCK_SIZE; n++) {
        if (X)
            x[n] += noise_x[n];
        if (Y)
            y[n] += noise_y[n];
        if (Z)
            z[n] += noise_z[n];
        if (W)
            w[n] += noise_w[n];
    }

    CONTINUE(x, y, z, w);
}


/* Instantiate the dither kernel for pixel type PX; its table entries use
 * setup_dither (wrapped in fn()) to prepare the matrix. */
SWS_FOR(PX, DITHER, DECL_IMPL, dither)
SWS_FOR_STRUCT(PX, DITHER, DECL_ENTRY, .setup = fn(setup_dither) )


/*********************

 * Linear operations *

 *********************/


/*
 * Coefficients of the linear kernel, converted to pixel_t by setup_linear:
 * output component I = k[I] + sum over J of m[I][J] * input component J.
 * The type name is wrapped in fn().
 */
typedef struct {
    /* Stored in split form for convenience */
    pixel_t m[4][4]; /* multiplicative part, indexed [output][input] */
    pixel_t k[4];    /* constant offset per output component */
} fn(LinCoeffs);


DECL_SETUP(setup_linear, params, out)

{

    const SwsUOp *uop = params->uop;

    fn(LinCoeffs) c;


    for (int i = 0; i < 4; i++) {

        for (int j = 0; j < 4; j++)

            c.m[i][j] = uop->data.mat4[i][j].px;

        c.k[i] = uop->data.mat4[i][4].px;

    }


    out->priv.ptr = av_memdup(&c, sizeof(c));

    out->free = ff_op_priv_free;

    return out->priv.ptr ? 0 : AVERROR(ENOMEM);

}


/**
 * Fully general case for a 5x5 linear affine transformation. Should never be
 * called without constant `mask`. This function will compile down to the
 * appropriately optimized version for the required subset of operations when
 * called with a constant mask.
 *
 * The coefficients come from the LinCoeffs copy in impl->priv.ptr: a 4x4
 * matrix `m` plus an offset vector `k`. `one` and `zero` are bit sets
 * addressed with SWS_MASK(row, column), column 4 being the offset:
 *  - a bit set in `zero` drops that term from the sum entirely;
 *  - a bit set in `one` adds the input unscaled instead of multiplying it
 *    by the stored coefficient.
 *
 * All four inputs are read before any output is written, so every row is
 * computed from the original input values. They are read regardless of
 * `mask`.
 */
DECL_FUNC(linear, const SwsCompMask mask, const uint32_t one, const uint32_t zero)
{
    const fn(LinCoeffs) c = *(const fn(LinCoeffs) *) impl->priv.ptr;

    SWS_LOOP
    for (int i = 0; i < SWS_BLOCK_SIZE; i++) {
        const pixel_t xx = x[i];
        const pixel_t yy = y[i];
        const pixel_t zz = z[i];
        const pixel_t ww = w[i];

/* Contribution of input `val` to row I: unscaled if flagged in `one` */
#define LIN_VAL(I, J, val) \
    ((one & SWS_MASK(I, J)) ? (val) : c.m[I][J] * (val))

/* Row I: start from the offset, then add every term not flagged in `zero` */
#define LIN_ROW(I, var) do {                                    \
    var[i] = (zero & SWS_MASK(I, 4)) ? 0 : c.k[I];              \
    if (!(zero & SWS_MASK(I, 0))) var[i] += LIN_VAL(I, 0, xx);  \
    if (!(zero & SWS_MASK(I, 1))) var[i] += LIN_VAL(I, 1, yy);  \
    if (!(zero & SWS_MASK(I, 2))) var[i] += LIN_VAL(I, 2, zz);  \
    if (!(zero & SWS_MASK(I, 3))) var[i] += LIN_VAL(I, 3, ww);  \
} while (0)

        if (X) LIN_ROW(0, x);
        if (Y) LIN_ROW(1, y);
        if (Z) LIN_ROW(2, z);
        if (W) LIN_ROW(3, w);
    }

    CONTINUE(x, y, z, w);
}


/* Instantiate the linear kernel for pixel type PX; its table entries use
 * setup_linear (wrapped in fn()) to prepare the coefficients. */
SWS_FOR(PX, LINEAR, DECL_IMPL, linear)
SWS_FOR_STRUCT(PX, LINEAR, DECL_ENTRY, .setup = fn(setup_linear) )


/*
 * Drop the per-instantiation type macros so this template can be included
 * again with a different BIT_DEPTH / IS_FLOAT. PIXEL_TYPE is only defined by
 * the float branch; it is removed here as well so it cannot leak into a
 * later integer instantiation. #undef of a macro that is not defined is
 * harmless.
 */
#undef PIXEL_TYPE
#undef PIXEL_MAX
#undef PIXEL_SWAP
#undef pixel_t
#undef inter_t
#undef block_t
#undef PX
#undef px