              unsigned A1, unsigned A2,
              const void *_r, const void *_g, const void *_b, int y,
        uint32_t *dest = (uint32_t *) _dest;
        const uint32_t *r = (const uint32_t *) _r;
        const uint32_t *g = (const uint32_t *) _g;
        const uint32_t *b = (const uint32_t *) _b;

        dest[i * 2 + 0] = r[Y1] + g[Y1] + b[Y1];
        dest[i * 2 + 1] = r[Y2] + g[Y2] + b[Y2];
 
#if defined(ASSERT_LEVEL) && ASSERT_LEVEL > 1

        dest[i * 2 + 0] = r[Y1] + g[Y1] + b[Y1];
        dest[i * 2 + 1] = r[Y2] + g[Y2] + b[Y2];
 
        uint8_t *dest = (uint8_t *) _dest;
        const uint8_t *r = (const uint8_t *) _r;
        const uint8_t *g = (const uint8_t *) _g;
        const uint8_t *b = (const uint8_t *) _b;

#define r_b ((target == AV_PIX_FMT_RGB24) ? r : b)
#define b_r ((target == AV_PIX_FMT_RGB24) ? b : r)
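        /*
         * r/g/b point at per-component lookup tables prepared by swscale's
         * yuv2rgb table setup (ff_yuv2rgb_c_init_tables()), so a pixel store
         * is just table reads plus adds. r_b/b_r swap the red and blue
         * tables, letting the stores below serve both the RGB24 and BGR24
         * byte orders from one code path.
         */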
        dest[i * 6 + 0] = r_b[Y1];
        dest[i * 6 + 1] =   g[Y1];
        dest[i * 6 + 2] = b_r[Y1];
        dest[i * 6 + 3] = r_b[Y2];
        dest[i * 6 + 4] =   g[Y2];
        dest[i * 6 + 5] = b_r[Y2];
 
        uint16_t *dest = (uint16_t *) _dest;
        const uint16_t *r = (const uint16_t *) _r;
        const uint16_t *g = (const uint16_t *) _g;
        const uint16_t *b = (const uint16_t *) _b;
        int dr1, dg1, db1, dr2, dg2, db2;

        dest[i * 2 + 0] = r[Y1 + dr1] + g[Y1 + dg1] + b[Y1 + db1];
        dest[i * 2 + 1] = r[Y2 + dr2] + g[Y2 + dg2] + b[Y2 + db2];
 
        uint8_t *dest = (uint8_t *) _dest;
        const uint8_t *r = (const uint8_t *) _r;
        const uint8_t *g = (const uint8_t *) _g;
        const uint8_t *b = (const uint8_t *) _b;
        int dr1, dg1, db1, dr2, dg2, db2;

            dr1 = dg1 = d32[(i * 2 + 0) & 7];
            db1 =       d64[(i * 2 + 0) & 7];
            dr2 = dg2 = d32[(i * 2 + 1) & 7];
            db2 =       d64[(i * 2 + 1) & 7];

            dr1 = db1 = d128[(i * 2 + 0) & 7];
            dg1 =        d64[(i * 2 + 0) & 7];
            dr2 = db2 = d128[(i * 2 + 1) & 7];
            dg2 =        d64[(i * 2 + 1) & 7];
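            /*
             * d32/d64/d128 are rows of 8x8 ordered-dither matrices
             * (presumably the ff_dither_8x8_* tables in swscale); the
             * "& 7" walks the 8-entry row so neighbouring pixels get
             * different offsets before the heavily quantized 4/8 bpp
             * table lookups.
             */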
 
            dest[i] = r[Y1 + dr1] + g[Y1 + dg1] + b[Y1 + db1] +
                    ((r[Y2 + dr2] + g[Y2 + dg2] + b[Y2 + db2]) << 4);

            dest[i * 2 + 0] = r[Y1 + dr1] + g[Y1 + dg1] + b[Y1 + db1];
            dest[i * 2 + 1] = r[Y2 + dr2] + g[Y2 + dg2] + b[Y2 + db2];
 
#define WRITE_YUV2RGB_LSX(vec_y1, vec_y2, vec_u, vec_v, t1, t2, t3, t4) \
{                                                                       \
    Y1 = __lsx_vpickve2gr_w(vec_y1, t1);                                \
    Y2 = __lsx_vpickve2gr_w(vec_y2, t2);                                \
    U  = __lsx_vpickve2gr_w(vec_u, t3);                                 \
    V  = __lsx_vpickve2gr_w(vec_v, t4);                                 \
    r  =  c->table_rV[V];                                               \
    g  = (c->table_gU[U] + c->table_gV[V]);                             \
    b  =  c->table_bU[U];                                               \
    yuv2rgb_write(dest, count, Y1, Y2, 0, 0,                            \
                  r, g, b, y, target, 0);                               \
    count++;                                                            \
}
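/*
 * WRITE_YUV2RGB_LSX moves one pair of pixels from vector registers back to
 * scalars: it extracts lanes t1..t4, indexes the per-component tables for
 * that U/V, and reuses the scalar yuv2rgb_write() above, so every packed
 * output format handled there also works for the LSX path.
 */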
                       const int16_t **lumSrc, int lumFilterSize,
                       const int16_t *chrFilter, const int16_t **chrUSrc,
                       const int16_t **chrVSrc, int chrFilterSize,
                       const int16_t **alpSrc, uint8_t *dest, int dstW,

    int len_count = (dstW + 1) >> 1;
    const void *r, *g, *b;

    __m128i headroom  = __lsx_vreplgr2vr_w(head);

    for (i = 0; i < len; i++) {
        int Y1, Y2, U, V, count_lum = count << 1;
        __m128i l_src1, l_src2, l_src3, l_src4, u_src1, u_src2, v_src1, v_src2;
        __m128i yl_ev, yl_ev1, yl_ev2, yl_od1, yl_od2, yh_ev1, yh_ev2, yh_od1, yh_od2;
        __m128i u_ev1, u_ev2, u_od1, u_od2, v_ev1, v_ev2, v_od1, v_od2, temp;

        yl_ev  = __lsx_vldrepl_w(&t, 0);
 
        for (j = 0; j < lumFilterSize; j++) {
            temp   = __lsx_vldrepl_h((lumFilter + j), 0);
            DUP2_ARG2(__lsx_vld, lumSrc[j] + count_lum, 0, lumSrc[j] + count_lum,
                      16, l_src1, l_src2);
            DUP2_ARG2(__lsx_vld, lumSrc[j] + count_lum, 32, lumSrc[j] + count_lum,
                      48, l_src3, l_src4);
            yl_ev1  = __lsx_vmaddwev_w_h(yl_ev1, temp, l_src1);
            yl_od1  = __lsx_vmaddwod_w_h(yl_od1, temp, l_src1);
            yh_ev1  = __lsx_vmaddwev_w_h(yh_ev1, temp, l_src3);
            yh_od1  = __lsx_vmaddwod_w_h(yh_od1, temp, l_src3);
            yl_ev2  = __lsx_vmaddwev_w_h(yl_ev2, temp, l_src2);
            yl_od2  = __lsx_vmaddwod_w_h(yl_od2, temp, l_src2);
            yh_ev2  = __lsx_vmaddwev_w_h(yh_ev2, temp, l_src4);
            yh_od2  = __lsx_vmaddwod_w_h(yh_od2, temp, l_src4);
        }
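        /*
         * __lsx_vmaddwev_w_h / __lsx_vmaddwod_w_h widen the even/odd 16-bit
         * lanes to 32 bits while multiply-accumulating, so each pass adds
         * lumFilter[j] * lumSrc[j][...] for eight pixels without
         * overflowing int16_t.
         */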
 
        for (j = 0; j < chrFilterSize; j++) {
            DUP2_ARG2(__lsx_vld, chrUSrc[j] + count, 0, chrVSrc[j] + count, 0,
                      u_src1, v_src1);
            DUP2_ARG2(__lsx_vld, chrUSrc[j] + count, 16, chrVSrc[j] + count, 16,
                      u_src2, v_src2);
            temp  = __lsx_vldrepl_h((chrFilter + j), 0);
            u_ev1 = __lsx_vmaddwev_w_h(u_ev1, temp, u_src1);
            u_od1 = __lsx_vmaddwod_w_h(u_od1, temp, u_src1);
            v_ev1 = __lsx_vmaddwev_w_h(v_ev1, temp, v_src1);
            v_od1 = __lsx_vmaddwod_w_h(v_od1, temp, v_src1);
            u_ev2 = __lsx_vmaddwev_w_h(u_ev2, temp, u_src2);
            u_od2 = __lsx_vmaddwod_w_h(u_od2, temp, u_src2);
            v_ev2 = __lsx_vmaddwev_w_h(v_ev2, temp, v_src2);
            v_od2 = __lsx_vmaddwod_w_h(v_od2, temp, v_src2);
        }
 
        yl_ev1 = __lsx_vsrai_w(yl_ev1, 19);
        yh_ev1 = __lsx_vsrai_w(yh_ev1, 19);
        yl_od1 = __lsx_vsrai_w(yl_od1, 19);
        yh_od1 = __lsx_vsrai_w(yh_od1, 19);
        u_ev1  = __lsx_vsrai_w(u_ev1, 19);
        v_ev1  = __lsx_vsrai_w(v_ev1, 19);
        u_od1  = __lsx_vsrai_w(u_od1, 19);
        v_od1  = __lsx_vsrai_w(v_od1, 19);
        yl_ev2 = __lsx_vsrai_w(yl_ev2, 19);
        yh_ev2 = __lsx_vsrai_w(yh_ev2, 19);
        yl_od2 = __lsx_vsrai_w(yl_od2, 19);
        yh_od2 = __lsx_vsrai_w(yh_od2, 19);
        u_ev2  = __lsx_vsrai_w(u_ev2, 19);
        v_ev2  = __lsx_vsrai_w(v_ev2, 19);
        u_od2  = __lsx_vsrai_w(u_od2, 19);
        v_od2  = __lsx_vsrai_w(v_od2, 19);
        u_ev1  = __lsx_vadd_w(u_ev1, headroom);
        v_ev1  = __lsx_vadd_w(v_ev1, headroom);
        u_od1  = __lsx_vadd_w(u_od1, headroom);
        v_od1  = __lsx_vadd_w(v_od1, headroom);
        u_ev2  = __lsx_vadd_w(u_ev2, headroom);
        v_ev2  = __lsx_vadd_w(v_ev2, headroom);
        u_od2  = __lsx_vadd_w(u_od2, headroom);
        v_od2  = __lsx_vadd_w(v_od2, headroom);
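        /*
         * Filter coefficients are Q12 and the intermediate samples Q7, so
         * the accumulated products sit in Q19: >> 19 recovers integer
         * Y/U/V, exactly like the scalar tail at the end of this function.
         * The headroom offset (presumably YUVRGB_TABLE_HEADROOM) then biases
         * U/V into the over-allocated, pre-clipped lookup tables.
         */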
 
        int Y1, Y2, U, V, count_lum = count << 1;
        __m128i l_src1, l_src2, u_src1, v_src1;
        __m128i yl_ev, yl_ev1, yl_ev2, yl_od1, yl_od2;
        __m128i u_ev1, u_od1, v_ev1, v_od1, temp;

        yl_ev  = __lsx_vldrepl_w(&t, 0);
 
        for (j = 0; j < lumFilterSize; j++) {
            temp   = __lsx_vldrepl_h((lumFilter + j), 0);
            DUP2_ARG2(__lsx_vld, lumSrc[j] + count_lum, 0, lumSrc[j] + count_lum,
                      16, l_src1, l_src2);
            yl_ev1  = __lsx_vmaddwev_w_h(yl_ev1, temp, l_src1);
            yl_od1  = __lsx_vmaddwod_w_h(yl_od1, temp, l_src1);
            yl_ev2  = __lsx_vmaddwev_w_h(yl_ev2, temp, l_src2);
            yl_od2  = __lsx_vmaddwod_w_h(yl_od2, temp, l_src2);
        }

        for (j = 0; j < chrFilterSize; j++) {
            DUP2_ARG2(__lsx_vld, chrUSrc[j] + count, 0, chrVSrc[j] + count, 0,
                      u_src1, v_src1);
            temp  = __lsx_vldrepl_h((chrFilter + j), 0);
            u_ev1 = __lsx_vmaddwev_w_h(u_ev1, temp, u_src1);
            u_od1 = __lsx_vmaddwod_w_h(u_od1, temp, u_src1);
            v_ev1 = __lsx_vmaddwev_w_h(v_ev1, temp, v_src1);
            v_od1 = __lsx_vmaddwod_w_h(v_od1, temp, v_src1);
        }
 
        yl_ev1 = __lsx_vsrai_w(yl_ev1, 19);
        yl_od1 = __lsx_vsrai_w(yl_od1, 19);
        u_ev1  = __lsx_vsrai_w(u_ev1, 19);
        v_ev1  = __lsx_vsrai_w(v_ev1, 19);
        u_od1  = __lsx_vsrai_w(u_od1, 19);
        v_od1  = __lsx_vsrai_w(v_od1, 19);
        yl_ev2 = __lsx_vsrai_w(yl_ev2, 19);
        yl_od2 = __lsx_vsrai_w(yl_od2, 19);
        u_ev1  = __lsx_vadd_w(u_ev1, headroom);
        v_ev1  = __lsx_vadd_w(v_ev1, headroom);
        u_od1  = __lsx_vadd_w(u_od1, headroom);
        v_od1  = __lsx_vadd_w(v_od1, headroom);
 
        int Y1, Y2, U, V, count_lum = count << 1;
        __m128i l_src1, u_src, v_src;
        __m128i yl_ev, yl_od;
        __m128i u_ev, u_od, v_ev, v_od, temp;

        yl_ev = __lsx_vldrepl_w(&t, 0);

        for (j = 0; j < lumFilterSize; j++) {
            temp   = __lsx_vldrepl_h((lumFilter + j), 0);
            l_src1 = __lsx_vld(lumSrc[j] + count_lum, 0);
            yl_ev  = __lsx_vmaddwev_w_h(yl_ev, temp, l_src1);
            yl_od  = __lsx_vmaddwod_w_h(yl_od, temp, l_src1);
        }

        for (j = 0; j < chrFilterSize; j++) {
            DUP2_ARG2(__lsx_vld, chrUSrc[j] + count, 0, chrVSrc[j] + count, 0,
                      u_src, v_src);
            temp  = __lsx_vldrepl_h((chrFilter + j), 0);
            u_ev  = __lsx_vmaddwev_w_h(u_ev, temp, u_src);
            u_od  = __lsx_vmaddwod_w_h(u_od, temp, u_src);
            v_ev  = __lsx_vmaddwev_w_h(v_ev, temp, v_src);
            v_od  = __lsx_vmaddwod_w_h(v_od, temp, v_src);
        }

        yl_ev = __lsx_vsrai_w(yl_ev, 19);
        yl_od = __lsx_vsrai_w(yl_od, 19);
        u_ev  = __lsx_vsrai_w(u_ev, 19);
        v_ev  = __lsx_vsrai_w(v_ev, 19);
        u_od  = __lsx_vsrai_w(u_od, 19);
        v_od  = __lsx_vsrai_w(v_od, 19);
        u_ev  = __lsx_vadd_w(u_ev, headroom);
        v_ev  = __lsx_vadd_w(v_ev, headroom);
        u_od  = __lsx_vadd_w(u_od, headroom);
        v_od  = __lsx_vadd_w(v_od, headroom);
 
        int Y1, Y2, U, V, count_lum = count << 1;
        __m128i l_src1, u_src, v_src;
        __m128i yl_ev, yl_od;
        __m128i u_ev, u_od, v_ev, v_od, temp;

        yl_ev = __lsx_vldrepl_w(&t, 0);

        for (j = 0; j < lumFilterSize; j++) {
            temp   = __lsx_vldrepl_h((lumFilter + j), 0);
            l_src1 = __lsx_vld(lumSrc[j] + count_lum, 0);
            yl_ev  = __lsx_vmaddwev_w_h(yl_ev, temp, l_src1);
            yl_od  = __lsx_vmaddwod_w_h(yl_od, temp, l_src1);
        }

        for (j = 0; j < chrFilterSize; j++) {
            DUP2_ARG2(__lsx_vld, chrUSrc[j] + count, 0, chrVSrc[j] + count, 0,
                      u_src, v_src);
            temp  = __lsx_vldrepl_h((chrFilter + j), 0);
            u_ev  = __lsx_vmaddwev_w_h(u_ev, temp, u_src);
            u_od  = __lsx_vmaddwod_w_h(u_od, temp, u_src);
            v_ev  = __lsx_vmaddwev_w_h(v_ev, temp, v_src);
            v_od  = __lsx_vmaddwod_w_h(v_od, temp, v_src);
        }

        yl_ev = __lsx_vsrai_w(yl_ev, 19);
        yl_od = __lsx_vsrai_w(yl_od, 19);
        u_ev  = __lsx_vsrai_w(u_ev, 19);
        v_ev  = __lsx_vsrai_w(v_ev, 19);
        u_od  = __lsx_vsrai_w(u_od, 19);
        v_od  = __lsx_vsrai_w(v_od, 19);
        u_ev  = __lsx_vadd_w(u_ev, headroom);
        v_ev  = __lsx_vadd_w(v_ev, headroom);
        u_od  = __lsx_vadd_w(u_od, headroom);
        v_od  = __lsx_vadd_w(v_od, headroom);
 
        int Y1, Y2, U, V, count_lum = count << 1;
        __m128i l_src1, u_src, v_src;
        __m128i yl_ev, yl_od;
        __m128i u_ev, u_od, v_ev, v_od, temp;

        yl_ev = __lsx_vldrepl_w(&t, 0);

        for (j = 0; j < lumFilterSize; j++) {
            temp   = __lsx_vldrepl_h((lumFilter + j), 0);
            l_src1 = __lsx_vld(lumSrc[j] + count_lum, 0);
            yl_ev  = __lsx_vmaddwev_w_h(yl_ev, temp, l_src1);
            yl_od  = __lsx_vmaddwod_w_h(yl_od, temp, l_src1);
        }

        for (j = 0; j < chrFilterSize; j++) {
            DUP2_ARG2(__lsx_vld, chrUSrc[j] + count, 0, chrVSrc[j] + count, 0,
                      u_src, v_src);
            temp  = __lsx_vldrepl_h((chrFilter + j), 0);
            u_ev  = __lsx_vmaddwev_w_h(u_ev, temp, u_src);
            u_od  = __lsx_vmaddwod_w_h(u_od, temp, u_src);
            v_ev  = __lsx_vmaddwev_w_h(v_ev, temp, v_src);
            v_od  = __lsx_vmaddwod_w_h(v_od, temp, v_src);
        }

        yl_ev = __lsx_vsrai_w(yl_ev, 19);
        yl_od = __lsx_vsrai_w(yl_od, 19);
        u_ev  = __lsx_vsrai_w(u_ev, 19);
        v_ev  = __lsx_vsrai_w(v_ev, 19);
        u_od  = __lsx_vsrai_w(u_od, 19);
        v_od  = __lsx_vsrai_w(v_od, 19);
        u_ev  = __lsx_vadd_w(u_ev, headroom);
        v_ev  = __lsx_vadd_w(v_ev, headroom);
        u_od  = __lsx_vadd_w(u_od, headroom);
        v_od  = __lsx_vadd_w(v_od, headroom);
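        /*
         * The nearly identical blocks above run the same vertical filter at
         * successively narrower vector widths, mopping up the pixels left
         * over from the widest pass before the scalar cleanup loop below
         * takes whatever remains.
         */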
 
    for (; count < len_count; count++) {

        for (j = 0; j < lumFilterSize; j++) {
            Y1 += lumSrc[j][count * 2]     * lumFilter[j];
            Y2 += lumSrc[j][count * 2 + 1] * lumFilter[j];
        }

        for (j = 0; j < chrFilterSize; j++) {
            U += chrUSrc[j][count] * chrFilter[j];
            V += chrVSrc[j][count] * chrFilter[j];
        }

                      r, g, b, y, target, 0);
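        /*
         * Scalar cleanup: the leftover odd pixels go through the same Q19
         * normalization and the same table-driven yuv2rgb_write() call as
         * the vector blocks above, so vector and scalar outputs match
         * bit-for-bit.
         */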
 
                       const int16_t *ubuf[2], const int16_t *vbuf[2],
                       const int16_t *abuf[2], uint8_t *dest, int dstW,
                       int yalpha, int uvalpha, int y,

    const int16_t *buf0  = buf[0],  *buf1  = buf[1],
                  *ubuf0 = ubuf[0], *ubuf1 = ubuf[1],
                  *vbuf0 = vbuf[0], *vbuf1 = vbuf[1];
    int yalpha1   = 4096 - yalpha;
    int uvalpha1  = 4096 - uvalpha;

    int len_count = (dstW + 1) >> 1;
    const void *r, *g, *b;

    __m128i v_yalpha1  = __lsx_vreplgr2vr_w(yalpha1);
    __m128i v_uvalpha1 = __lsx_vreplgr2vr_w(uvalpha1);
    __m128i v_yalpha   = __lsx_vreplgr2vr_w(yalpha);
    __m128i v_uvalpha  = __lsx_vreplgr2vr_w(uvalpha);
    __m128i headroom   = __lsx_vreplgr2vr_w(head);
    __m128i zero       = __lsx_vldi(0);
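    /*
     * Two-row bilinear blend: yalpha/uvalpha are Q12 weights, so
     * yalpha1 = 4096 - yalpha is the complementary weight; Q7 samples
     * times Q12 weights land in Q19, which the >> 19 shifts inside the
     * loop undo.
     */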
 
    for (i = 0; i < len; i += 8) {
 
        int c_dex = count << 1;
        __m128i y0_h, y0_l, y0, u0, v0;
        __m128i y1_h, y1_l, y1, u1, v1;
        __m128i y_l, y_h, u, v;

        DUP4_ARG2(__lsx_vldx, buf0, i_dex, ubuf0, c_dex, vbuf0, c_dex,
                  buf1, i_dex, y0, u0, v0, y1);
        DUP2_ARG2(__lsx_vldx, ubuf1, c_dex, vbuf1, c_dex, u1, v1);
        DUP2_ARG2(__lsx_vsllwil_w_h, y0, 0, y1, 0, y0_l, y1_l);
        DUP2_ARG1(__lsx_vexth_w_h, y0, y1, y0_h, y1_h);

        y0_l = __lsx_vmul_w(y0_l, v_yalpha1);
        y0_h = __lsx_vmul_w(y0_h, v_yalpha1);
        u0   = __lsx_vmul_w(u0, v_uvalpha1);
        v0   = __lsx_vmul_w(v0, v_uvalpha1);
        y_l  = __lsx_vmadd_w(y0_l, v_yalpha, y1_l);
        y_h  = __lsx_vmadd_w(y0_h, v_yalpha, y1_h);
        u    = __lsx_vmadd_w(u0, v_uvalpha, u1);
        v    = __lsx_vmadd_w(v0, v_uvalpha, v1);
        y_l  = __lsx_vsrai_w(y_l, 19);
        y_h  = __lsx_vsrai_w(y_h, 19);
        u    = __lsx_vsrai_w(u, 19);
        v    = __lsx_vsrai_w(v, 19);
 
        __m128i y0_l, y0, u0, v0;
        __m128i y1_l, y1, u1, v1;

        y0   = __lsx_vldx(buf0, i_dex);
        u0   = __lsx_vldrepl_d((ubuf0 + count), 0);
        v0   = __lsx_vldrepl_d((vbuf0 + count), 0);
        y1   = __lsx_vldx(buf1, i_dex);
        u1   = __lsx_vldrepl_d((ubuf1 + count), 0);
        v1   = __lsx_vldrepl_d((vbuf1 + count), 0);

        y0_l = __lsx_vmul_w(y0_l, v_yalpha1);
        u0   = __lsx_vmul_w(u0, v_uvalpha1);
        v0   = __lsx_vmul_w(v0, v_uvalpha1);
        y_l  = __lsx_vmadd_w(y0_l, v_yalpha, y1_l);
        u    = __lsx_vmadd_w(u0, v_uvalpha, u1);
        v    = __lsx_vmadd_w(v0, v_uvalpha, v1);
        y_l  = __lsx_vsrai_w(y_l, 19);
        u    = __lsx_vsrai_w(u, 19);
        v    = __lsx_vsrai_w(v, 19);
 
    for (; count < len_count; count++) {
        int Y1 = (buf0[count * 2]     * yalpha1 +
                  buf1[count * 2]     * yalpha) >> 19;
        int Y2 = (buf0[count * 2 + 1] * yalpha1 +
                  buf1[count * 2 + 1] * yalpha) >> 19;
        int U  = (ubuf0[count] * uvalpha1 + ubuf1[count] * uvalpha) >> 19;
        int V  = (vbuf0[count] * uvalpha1 + vbuf1[count] * uvalpha) >> 19;

                      r, g, b, y, target, 0);
 
                       const int16_t *ubuf[2], const int16_t *vbuf[2],
                       const int16_t *abuf0, uint8_t *dest, int dstW,

    const int16_t *ubuf0 = ubuf[0], *vbuf0 = vbuf[0];

    int len       = (dstW - 7);
    int len_count = (dstW + 1) >> 1;
    const void *r, *g, *b;

        __m128i headroom  = __lsx_vreplgr2vr_h(head);
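        /*
         * Unfiltered single-row path: the samples are still Q7 here, so
         * __lsx_vsrari_h(x, 7) is the rounding shift that matches the
         * scalar "(x + 64) >> 7" in the cleanup loop below.
         */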
 
        for (i = 0; i < len; i += 8) {
            int c_dex = count << 1;
            __m128i src_y, src_u, src_v;
            __m128i u, v, uv, y_l, y_h;

            src_y = __lsx_vldx(buf0, i_dex);
            DUP2_ARG2(__lsx_vldx, ubuf0, c_dex, vbuf0, c_dex, src_u, src_v);
            src_y = __lsx_vsrari_h(src_y, 7);
            src_u = __lsx_vsrari_h(src_u, 7);
            src_v = __lsx_vsrari_h(src_v, 7);
            y_l   = __lsx_vsllwil_w_h(src_y, 0);
            y_h   = __lsx_vexth_w_h(src_y);
            uv    = __lsx_vilvl_h(src_v, src_u);
            u     = __lsx_vaddwev_w_h(uv, headroom);
            v     = __lsx_vaddwod_w_h(uv, headroom);
 
            __m128i src_y, src_u, src_v;
            __m128i y_l, u, v, uv;

            src_y  = __lsx_vldx(buf0, i_dex);
            src_u  = __lsx_vldrepl_d((ubuf0 + count), 0);
            src_v  = __lsx_vldrepl_d((vbuf0 + count), 0);
            y_l    = __lsx_vsrari_h(src_y, 7);
            y_l    = __lsx_vsllwil_w_h(y_l, 0);
            uv     = __lsx_vilvl_h(src_v, src_u);
            uv     = __lsx_vsrari_h(uv, 7);
            u      = __lsx_vaddwev_w_h(uv, headroom);
            v      = __lsx_vaddwod_w_h(uv, headroom);
 
        for (; count < len_count; count++) {
            int Y1 = (buf0[count * 2    ] + 64) >> 7;
            int Y2 = (buf0[count * 2 + 1] + 64) >> 7;
            int U  = (ubuf0[count]        + 64) >> 7;
            int V  = (vbuf0[count]        + 64) >> 7;

                          r, g, b, y, target, 0);
 
        const int16_t *ubuf1 = ubuf[1], *vbuf1 = vbuf[1];

        int uvalpha1 = 4096 - uvalpha;
        __m128i headroom     = __lsx_vreplgr2vr_w(HEADROOM);
        __m128i uvalpha_tmp1 = __lsx_vreplgr2vr_h(uvalpha1);
        __m128i uvalpha_tmp  = __lsx_vreplgr2vr_h(uvalpha);

        for (i = 0; i < len; i += 8) {
 
            int c_dex = count << 1;
            __m128i src_y, src_u0, src_v0, src_u1, src_v1;
            __m128i y_l, y_h, u1, u2, v1, v2, u_ev, v_od;

            DUP4_ARG2(__lsx_vldx, buf0, i_dex, ubuf0, c_dex, vbuf0, c_dex,
                      ubuf1, c_dex, src_y, src_u0, src_v0, src_u1);
            src_v1 = __lsx_vldx(vbuf1, c_dex);
            src_y  = __lsx_vsrari_h(src_y, 7);

            u_ev    = __lsx_vmulwev_w_h(src_u0, uvalpha_tmp1);
            v_od    = __lsx_vmulwod_w_h(src_u0, uvalpha_tmp1);
            u1      = __lsx_vmaddwev_w_h(u_ev, src_u1, uvalpha_tmp);
            v1      = __lsx_vmaddwod_w_h(v_od, src_u1, uvalpha_tmp);
            u_ev    = __lsx_vmulwev_w_h(src_v0, uvalpha_tmp1);
            v_od    = __lsx_vmulwod_w_h(src_v0, uvalpha_tmp1);
            u2      = __lsx_vmaddwev_w_h(u_ev, src_v1, uvalpha_tmp);
            v2      = __lsx_vmaddwod_w_h(v_od, src_v1, uvalpha_tmp);

            y_l     = __lsx_vsllwil_w_h(src_y, 0);
            y_h     = __lsx_vexth_w_h(src_y);
            u1      = __lsx_vsrari_w(u1, 19);
            v1      = __lsx_vsrari_w(v1, 19);
            u2      = __lsx_vsrari_w(u2, 19);
            v2      = __lsx_vsrari_w(v2, 19);
 
        for (; count < len_count; count++) {
            int Y1 = (buf0[count * 2    ]         +  64) >> 7;
            int Y2 = (buf0[count * 2 + 1]         +  64) >> 7;
            int U  = (ubuf0[count] + ubuf1[count] + 128) >> 8;
            int V  = (vbuf0[count] + vbuf1[count] + 128) >> 8;

                          r, g, b, y, target, 0);
 
#define YUV2RGBWRAPPERX(name, base, ext, fmt, hasAlpha)                               \
static void name ## ext ## _X_lsx(SwsInternal *c, const int16_t *lumFilter,           \
                                  const int16_t **lumSrc, int lumFilterSize,          \
                                  const int16_t *chrFilter, const int16_t **chrUSrc,  \
                                  const int16_t **chrVSrc, int chrFilterSize,         \
                                  const int16_t **alpSrc, uint8_t *dest, int dstW,    \
                                  int y)                                              \
{                                                                                     \
    name ## base ## _X_template_lsx(c, lumFilter, lumSrc, lumFilterSize,              \
                                    chrFilter, chrUSrc, chrVSrc, chrFilterSize,       \
                                    alpSrc, dest, dstW, y, fmt, hasAlpha);            \
}

#define YUV2RGBWRAPPERX2(name, base, ext, fmt, hasAlpha)                              \
YUV2RGBWRAPPERX(name, base, ext, fmt, hasAlpha)                                       \
static void name ## ext ## _2_lsx(SwsInternal *c, const int16_t *buf[2],              \
                                  const int16_t *ubuf[2], const int16_t *vbuf[2],     \
                                  const int16_t *abuf[2], uint8_t *dest, int dstW,    \
                                  int yalpha, int uvalpha, int y)                     \
{                                                                                     \
    name ## base ## _2_template_lsx(c, buf, ubuf, vbuf, abuf, dest,                   \
                                    dstW, yalpha, uvalpha, y, fmt, hasAlpha);         \
}

#define YUV2RGBWRAPPER(name, base, ext, fmt, hasAlpha)                                \
YUV2RGBWRAPPERX2(name, base, ext, fmt, hasAlpha)                                      \
static void name ## ext ## _1_lsx(SwsInternal *c, const int16_t *buf0,                \
                                  const int16_t *ubuf[2], const int16_t *vbuf[2],     \
                                  const int16_t *abuf0, uint8_t *dest, int dstW,      \
                                  int uvalpha, int y)                                 \
{                                                                                     \
    name ## base ## _1_template_lsx(c, buf0, ubuf, vbuf, abuf0, dest,                 \
                                    dstW, uvalpha, y, fmt, hasAlpha);                 \
}
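/*
 * Each YUV2RGBWRAPPER(name, base, ext, fmt, hasAlpha) instantiation expands
 * to three entry points per pixel format: *_1_lsx (one input row),
 * *_2_lsx (two rows blended by yalpha/uvalpha) and *_X_lsx (full N-tap
 * vertical filter), all forwarding to the shared templates above.
 */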
#if CONFIG_SWSCALE_ALPHA
    uint8_t *dest, int i, int R, int A, int G, int B,

    if ((R | G | B) & 0xC0000000) {

        dest[0] = hasAlpha ? A : 255;

        dest[3] = hasAlpha ? A : 255;

        dest[0] = hasAlpha ? A : 255;

        dest[3] = hasAlpha ? A : 255;
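    /*
     * The (R | G | B) & 0xC0000000 test above catches any component outside
     * the 30-bit range with a single branch; only then does the writer
     * re-clip each component (av_clip_uintp2(x, 30) in the scalar
     * reference) before running the per-format stores.
     */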
 
        switch (c->opts.dither) {

            R += (7*err[0] + 1*c->dither_error[0][i] + 5*c->dither_error[0][i+1] + 3*c->dither_error[0][i+2])>>4;
            G += (7*err[1] + 1*c->dither_error[1][i] + 5*c->dither_error[1][i+1] + 3*c->dither_error[1][i+2])>>4;
            B += (7*err[2] + 1*c->dither_error[2][i] + 5*c->dither_error[2][i+1] + 3*c->dither_error[2][i+2])>>4;
            c->dither_error[0][i] = err[0];
            c->dither_error[1][i] = err[1];
            c->dither_error[2][i] = err[2];
            r = R >> (isrgb8 ? 5 : 7);
            g = G >> (isrgb8 ? 5 : 6);
            b = B >> (isrgb8 ? 6 : 7);

            err[0] = R - r*(isrgb8 ? 36 : 255);
            err[1] = G - g*(isrgb8 ? 36 : 85);
            err[2] = B - b*(isrgb8 ? 85 : 255);
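            /*
             * Floyd-Steinberg style error diffusion: the 7/16, 1/16, 5/16,
             * 3/16 weights appear as (7*a + 1*b + 5*c + 3*d) >> 4, and
             * err[] carries each pixel's quantization remainder
             * (R - r*255 etc.) to its right and lower neighbours.
             */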
 
#define A_DITHER(u,v)   (((((u)+((v)*236))*119)&0xff))

#define X_DITHER(u,v)   (((((u)^((v)*237))*181)&0x1ff)/2)
            dest[0] = r + 2*g + 8*b;

            dest[0] = b + 2*g + 8*r;

            dest[0] = r + 8*g + 64*b;

            dest[0] = b + 4*g + 32*r;
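            /*
             * Low-bpp packing: after dithering, r/g/b are tiny integers and
             * the multiplications by 2/8 or 4/8/32/64 are effectively
             * shifts that place each component at its bit offset within the
             * output byte (RGB4/BGR4 vs. RGB8/BGR8 layouts).
             */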
 
#define YUVTORGB_SETUP_LSX                                   \
    int y_offset   = c->yuv2rgb_y_offset;                    \
    int y_coeff    = c->yuv2rgb_y_coeff;                     \
    int v2r_coe    = c->yuv2rgb_v2r_coeff;                   \
    int v2g_coe    = c->yuv2rgb_v2g_coeff;                   \
    int u2g_coe    = c->yuv2rgb_u2g_coeff;                   \
    int u2b_coe    = c->yuv2rgb_u2b_coeff;                   \
    __m128i offset = __lsx_vreplgr2vr_w(y_offset);           \
    __m128i coeff  = __lsx_vreplgr2vr_w(y_coeff);            \
    __m128i v2r    = __lsx_vreplgr2vr_w(v2r_coe);            \
    __m128i v2g    = __lsx_vreplgr2vr_w(v2g_coe);            \
    __m128i u2g    = __lsx_vreplgr2vr_w(u2g_coe);            \
    __m128i u2b    = __lsx_vreplgr2vr_w(u2b_coe);
#define YUVTORGB_LSX(y, u, v, R, G, B, offset, coeff,        \
                     y_temp, v2r, v2g, u2g, u2b)             \
{                                                            \
     y = __lsx_vsub_w(y, offset);                            \
     y = __lsx_vmul_w(y, coeff);                             \
     y = __lsx_vadd_w(y, y_temp);                            \
     R = __lsx_vmadd_w(y, v, v2r);                           \
     v = __lsx_vmadd_w(y, v, v2g);                           \
     G = __lsx_vmadd_w(v, u, u2g);                           \
     B = __lsx_vmadd_w(y, u, u2b);                           \
}
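/*
 * Vector form of the scalar full-range conversion used in the cleanup loops
 * below:
 *     R = Y + V * v2r_coe;
 *     G = Y + V * v2g_coe + U * u2g_coe;
 *     B = Y + U * u2b_coe;
 * with y pre-scaled by yuv2rgb_y_coeff/offset and y_temp supplying the
 * rounding bias. Note the macro clobbers its v argument to stage the G term.
 */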
#define WRITE_FULL_A_LSX(r, g, b, a, t1, s)                                  \
{                                                                            \
    R = __lsx_vpickve2gr_w(r, t1);                                           \
    G = __lsx_vpickve2gr_w(g, t1);                                           \
    B = __lsx_vpickve2gr_w(b, t1);                                           \
    A = __lsx_vpickve2gr_w(a, t1);                                           \
    if (A & 0x100)                                                           \
        A = av_clip_uint8(A);                                                \
    yuv2rgb_write_full(c, dest, i + s, R, A, G, B, y, target, hasAlpha, err);\
    dest += step;                                                            \
}
#define WRITE_FULL_LSX(r, g, b, t1, s)                                        \
{                                                                             \
    R = __lsx_vpickve2gr_w(r, t1);                                            \
    G = __lsx_vpickve2gr_w(g, t1);                                            \
    B = __lsx_vpickve2gr_w(b, t1);                                            \
    yuv2rgb_write_full(c, dest, i + s, R, 0, G, B, y, target, hasAlpha, err); \
    dest += step;                                                             \
}
                            const int16_t **lumSrc, int lumFilterSize,
                            const int16_t *chrFilter, const int16_t **chrUSrc,
                            const int16_t **chrVSrc, int chrFilterSize,
                            const int16_t **alpSrc, uint8_t *dest,

    int i, j, B, G, R, A;

    int a_temp     = 1 << 18;

    int tempc      = templ - (128 << 19);

    __m128i y_temp = __lsx_vreplgr2vr_w(ytemp);
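    /*
     * templ/tempc seed the accumulators: tempc = templ - (128 << 19)
     * pre-subtracts the chroma centre in Q19, so the filtered U/V emerge
     * already signed and YUVTORGB_LSX can apply the coefficients directly.
     */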
 
    for (i = 0; i < len; i += 8) {
        __m128i l_src, u_src, v_src;
        __m128i y_ev, y_od, u_ev, u_od, v_ev, v_od, temp;
        __m128i R_ev, R_od, G_ev, G_od, B_ev, B_od;

        y_ev = y_od = __lsx_vreplgr2vr_w(templ);
        u_ev = u_od = v_ev = v_od = __lsx_vreplgr2vr_w(tempc);
 
        for (j = 0; j < lumFilterSize; j++) {
            temp  = __lsx_vldrepl_h((lumFilter + j), 0);
            l_src = __lsx_vldx(lumSrc[j], n);
            y_ev  = __lsx_vmaddwev_w_h(y_ev, l_src, temp);
            y_od  = __lsx_vmaddwod_w_h(y_od, l_src, temp);
        }

        for (j = 0; j < chrFilterSize; j++) {
            temp  = __lsx_vldrepl_h((chrFilter + j), 0);
            DUP2_ARG2(__lsx_vldx, chrUSrc[j], n, chrVSrc[j], n,
                      u_src, v_src);
            DUP2_ARG3(__lsx_vmaddwev_w_h, u_ev, u_src, temp, v_ev,
                      v_src, temp, u_ev, v_ev);
            DUP2_ARG3(__lsx_vmaddwod_w_h, u_od, u_src, temp, v_od,
                      v_src, temp, u_od, v_od);
        }
 
        y_ev = __lsx_vsrai_w(y_ev, 10);
        y_od = __lsx_vsrai_w(y_od, 10);
        u_ev = __lsx_vsrai_w(u_ev, 10);
        u_od = __lsx_vsrai_w(u_od, 10);
        v_ev = __lsx_vsrai_w(v_ev, 10);
        v_od = __lsx_vsrai_w(v_od, 10);
        YUVTORGB_LSX(y_ev, u_ev, v_ev, R_ev, G_ev, B_ev, offset, coeff,
                     y_temp, v2r, v2g, u2g, u2b);
        YUVTORGB_LSX(y_od, u_od, v_od, R_od, G_od, B_od, offset, coeff,
                     y_temp, v2r, v2g, u2g, u2b);
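        /*
         * Only >> 10 here (vs. >> 19 in the table-driven path): the
         * full-chroma-interpolation path keeps extra fraction bits because
         * YUVTORGB_LSX still multiplies by the Q-format coefficients, so the
         * products end up around Q22 and yuv2rgb_write_full() extracts the
         * top bits (hence its 0xC0000000 range check).
         */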
 
            __m128i a_src, a_ev, a_od;

            a_ev = a_od = __lsx_vreplgr2vr_w(a_temp);
            for (j = 0; j < lumFilterSize; j++) {
                temp  = __lsx_vldrepl_h(lumFilter + j, 0);
                a_src = __lsx_vldx(alpSrc[j], n);
                a_ev  = __lsx_vmaddwev_w_h(a_ev, a_src, temp);
                a_od  = __lsx_vmaddwod_w_h(a_od, a_src, temp);
            }

            a_ev = __lsx_vsrai_w(a_ev, 19);
            a_od = __lsx_vsrai_w(a_od, 19);
 
    if (dstW - i >= 4) {
        __m128i l_src, u_src, v_src;
        __m128i y_ev, u_ev, v_ev, uv, temp;
        __m128i R_ev, G_ev, B_ev;

        y_ev = __lsx_vreplgr2vr_w(templ);
        u_ev = v_ev = __lsx_vreplgr2vr_w(tempc);
        for (j = 0; j < lumFilterSize; j++) {
            temp  = __lsx_vldrepl_h((lumFilter + j), 0);
            l_src = __lsx_vldx(lumSrc[j], n);
            l_src = __lsx_vilvl_h(l_src, l_src);
            y_ev  = __lsx_vmaddwev_w_h(y_ev, l_src, temp);
        }

        for (j = 0; j < chrFilterSize; j++) {
            temp  = __lsx_vldrepl_h((chrFilter + j), 0);
            DUP2_ARG2(__lsx_vldx, chrUSrc[j], n, chrVSrc[j], n, u_src, v_src);
            uv    = __lsx_vilvl_h(v_src, u_src);
            u_ev  = __lsx_vmaddwev_w_h(u_ev, uv, temp);
            v_ev  = __lsx_vmaddwod_w_h(v_ev, uv, temp);
        }

        y_ev = __lsx_vsrai_w(y_ev, 10);
        u_ev = __lsx_vsrai_w(u_ev, 10);
        v_ev = __lsx_vsrai_w(v_ev, 10);
        YUVTORGB_LSX(y_ev, u_ev, v_ev, R_ev, G_ev, B_ev, offset, coeff,
                     y_temp, v2r, v2g, u2g, u2b);
 
            __m128i a_src, a_ev;

            a_ev = __lsx_vreplgr2vr_w(a_temp);
            for (j = 0; j < lumFilterSize; j++) {
                temp  = __lsx_vldrepl_h(lumFilter + j, 0);
                a_src = __lsx_vldx(alpSrc[j], n);
                a_src = __lsx_vilvl_h(a_src, a_src);
                a_ev  = __lsx_vmaddwev_w_h(a_ev, a_src, temp);
            }

            a_ev = __lsx_vsrai_w(a_ev, 19);
 
    for (; i < dstW; i++) {
        int V, U = V = tempc;

        for (j = 0; j < lumFilterSize; j++) {
            Y += lumSrc[j][i] * lumFilter[j];
        }

        for (j = 0; j < chrFilterSize; j++) {
            U += chrUSrc[j][i] * chrFilter[j];
            V += chrVSrc[j][i] * chrFilter[j];

            for (j = 0; j < lumFilterSize; j++) {
                A += alpSrc[j][i] * lumFilter[j];
 
        R  = (unsigned)Y + V * v2r_coe;
        G  = (unsigned)Y + V * v2g_coe + U * u2g_coe;
        B  = (unsigned)Y + U * u2b_coe;
        yuv2rgb_write_full(c, dest, i, R, A, G, B, y, target, hasAlpha, err);

    c->dither_error[0][i] = err[0];
    c->dither_error[1][i] = err[1];
    c->dither_error[2][i] = err[2];
 
                            const int16_t *ubuf[2], const int16_t *vbuf[2],
                            const int16_t *abuf[2], uint8_t *dest, int dstW,
                            int yalpha, int uvalpha, int y,

    const int16_t *buf0  = buf[0],  *buf1  = buf[1],
                  *ubuf0 = ubuf[0], *ubuf1 = ubuf[1],
                  *vbuf0 = vbuf[0], *vbuf1 = vbuf[1],
                  *abuf0 = hasAlpha ? abuf[0] : NULL,
                  *abuf1 = hasAlpha ? abuf[1] : NULL;
    int yalpha1  = 4096 - yalpha;
    int uvalpha1 = 4096 - uvalpha;
    int uvtemp   = 128 << 19;
    int atemp    = 1 << 18;

    int ytemp    = 1 << 21;

    __m128i v_uvalpha1 = __lsx_vreplgr2vr_w(uvalpha1);
    __m128i v_yalpha1  = __lsx_vreplgr2vr_w(yalpha1);
    __m128i v_uvalpha  = __lsx_vreplgr2vr_w(uvalpha);
    __m128i v_yalpha   = __lsx_vreplgr2vr_w(yalpha);
    __m128i uv         = __lsx_vreplgr2vr_w(uvtemp);
    __m128i a_bias     = __lsx_vreplgr2vr_w(atemp);
    __m128i y_temp     = __lsx_vreplgr2vr_w(ytemp);
 
    for (i = 0; i < len; i += 8) {
        __m128i b0, b1, ub0, ub1, vb0, vb1;
        __m128i y0_l, y0_h, y1_l, y1_h, u0_l, u0_h;
        __m128i v0_l, v0_h, u1_l, u1_h, v1_l, v1_h;
        __m128i y_l, y_h, v_l, v_h, u_l, u_h;
        __m128i R_l, R_h, G_l, G_h, B_l, B_h;

        DUP4_ARG2(__lsx_vldx, buf0, n, buf1, n, ubuf0,
                  n, ubuf1, n, b0, b1, ub0, ub1);
        DUP2_ARG2(__lsx_vldx, vbuf0, n, vbuf1, n, vb0, vb1);

        DUP4_ARG2(__lsx_vsllwil_w_h, ub0, 0, ub1, 0, vb0, 0, vb1, 0,
                  u0_l, u1_l, v0_l, v1_l);

        DUP4_ARG1(__lsx_vexth_w_h, ub0, ub1, vb0, vb1,
                  u0_h, u1_h, v0_h, v1_h);
 
        y0_l = __lsx_vmul_w(y0_l, v_yalpha1);
        y0_h = __lsx_vmul_w(y0_h, v_yalpha1);
        u0_l = __lsx_vmul_w(u0_l, v_uvalpha1);
        u0_h = __lsx_vmul_w(u0_h, v_uvalpha1);
        v0_l = __lsx_vmul_w(v0_l, v_uvalpha1);
        v0_h = __lsx_vmul_w(v0_h, v_uvalpha1);
        y_l  = __lsx_vmadd_w(y0_l, v_yalpha, y1_l);
        y_h  = __lsx_vmadd_w(y0_h, v_yalpha, y1_h);
        u_l  = __lsx_vmadd_w(u0_l, v_uvalpha, u1_l);
        u_h  = __lsx_vmadd_w(u0_h, v_uvalpha, u1_h);
        v_l  = __lsx_vmadd_w(v0_l, v_uvalpha, v1_l);
        v_h  = __lsx_vmadd_w(v0_h, v_uvalpha, v1_h);
        u_l  = __lsx_vsub_w(u_l, uv);
        u_h  = __lsx_vsub_w(u_h, uv);
        v_l  = __lsx_vsub_w(v_l, uv);
        v_h  = __lsx_vsub_w(v_h, uv);
        y_l  = __lsx_vsrai_w(y_l, 10);
        y_h  = __lsx_vsrai_w(y_h, 10);
        u_l  = __lsx_vsrai_w(u_l, 10);
        u_h  = __lsx_vsrai_w(u_h, 10);
        v_l  = __lsx_vsrai_w(v_l, 10);
        v_h  = __lsx_vsrai_w(v_h, 10);
        YUVTORGB_LSX(y_l, u_l, v_l, R_l, G_l, B_l, offset, coeff,
                     y_temp, v2r, v2g, u2g, u2b);
        YUVTORGB_LSX(y_h, u_h, v_h, R_h, G_h, B_h, offset, coeff,
                     y_temp, v2r, v2g, u2g, u2b);
 
            __m128i a0, a1, a0_l, a0_h;
            __m128i a_l, a_h, a1_l, a1_h;

            a_l = __lsx_vmadd_w(a_bias, a0_l, v_yalpha1);
            a_h = __lsx_vmadd_w(a_bias, a0_h, v_yalpha1);
            a_l = __lsx_vmadd_w(a_l, v_yalpha, a1_l);
            a_h = __lsx_vmadd_w(a_h, v_yalpha, a1_h);
            a_l = __lsx_vsrai_w(a_l, 19);
            a_h = __lsx_vsrai_w(a_h, 19);
 
    if (dstW - i >= 4) {
        __m128i b0, b1, ub0, ub1, vb0, vb1;
        __m128i y0_l, y1_l, u0_l;
        __m128i v0_l, u1_l, v1_l;
        __m128i y_l, u_l, v_l;
        __m128i R_l, G_l, B_l;

        DUP4_ARG2(__lsx_vldx, buf0, n, buf1, n, ubuf0, n,
                  ubuf1, n, b0, b1, ub0, ub1);
        DUP2_ARG2(__lsx_vldx, vbuf0, n, vbuf1, n, vb0, vb1);

        DUP4_ARG2(__lsx_vsllwil_w_h, ub0, 0, ub1, 0, vb0, 0, vb1, 0,
                  u0_l, u1_l, v0_l, v1_l);
        y0_l = __lsx_vmul_w(y0_l, v_yalpha1);
        u0_l = __lsx_vmul_w(u0_l, v_uvalpha1);
        v0_l = __lsx_vmul_w(v0_l, v_uvalpha1);
        y_l  = __lsx_vmadd_w(y0_l, v_yalpha, y1_l);
        u_l  = __lsx_vmadd_w(u0_l, v_uvalpha, u1_l);
        v_l  = __lsx_vmadd_w(v0_l, v_uvalpha, v1_l);
        u_l  = __lsx_vsub_w(u_l, uv);
        v_l  = __lsx_vsub_w(v_l, uv);
        y_l  = __lsx_vsrai_w(y_l, 10);
        u_l  = __lsx_vsrai_w(u_l, 10);
        v_l  = __lsx_vsrai_w(v_l, 10);
        YUVTORGB_LSX(y_l, u_l, v_l, R_l, G_l, B_l, offset, coeff,
                     y_temp, v2r, v2g, u2g, u2b);
 
            __m128i a0, a1, a0_l;

            a_l = __lsx_vmadd_w(a_bias, a0_l, v_yalpha1);
            a_l = __lsx_vmadd_w(a_l, v_yalpha, a1_l);
            a_l = __lsx_vsrai_w(a_l, 19);
 
    for (; i < dstW; i++) {
        int Y = (buf0[i] * yalpha1 + buf1[i] * yalpha) >> 10;
        int U = (ubuf0[i] * uvalpha1 + ubuf1[i] * uvalpha - uvtemp) >> 10;
        int V = (vbuf0[i] * uvalpha1 + vbuf1[i] * uvalpha - uvtemp) >> 10;

            A = (abuf0[i] * yalpha1 + abuf1[i] * yalpha + atemp) >> 19;
 
        R  = (unsigned)Y + V * v2r_coe;
        G  = (unsigned)Y + V * v2g_coe + U * u2g_coe;
        B  = (unsigned)Y + U * u2b_coe;
        yuv2rgb_write_full(c, dest, i, R, A, G, B, y, target, hasAlpha, err);

    c->dither_error[0][i] = err[0];
    c->dither_error[1][i] = err[1];
    c->dither_error[2][i] = err[2];
 
                            const int16_t *ubuf[2], const int16_t *vbuf[2],
                            const int16_t *abuf0, uint8_t *dest, int dstW,

    const int16_t *ubuf0 = ubuf[0], *vbuf0 = vbuf[0];

    int ytemp      = 1 << 21;

    __m128i y_temp = __lsx_vreplgr2vr_w(ytemp);

    if (uvalpha < 2048) {
        int uvtemp   = 128 << 7;
        __m128i uv   = __lsx_vreplgr2vr_w(uvtemp);
        __m128i bias = __lsx_vreplgr2vr_w(bias_int);
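        /*
         * uvalpha < 2048 selects the single-row chroma path: inputs are Q7,
         * chroma is centred with 128 << 7, and everything is promoted to
         * the Q10 domain with << 2 before YUVTORGB_LSX (compare the scalar
         * "buf0[i] << 2" cleanup below). The two-row branch further down
         * uses 128 << 8 and << 1 since it sums two Q7 rows first.
         */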
 
        for (i = 0; i < len; i += 8) {
            __m128i b, ub, vb, ub_l, ub_h, vb_l, vb_h;
            __m128i y_l, y_h, u_l, u_h, v_l, v_h;
            __m128i R_l, R_h, G_l, G_h, B_l, B_h;

            vb  = __lsx_vldx(vbuf0, n);
            y_l = __lsx_vsllwil_w_h(b, 2);
            y_h = __lsx_vexth_w_h(b);
            DUP2_ARG2(__lsx_vsllwil_w_h, ub, 0, vb, 0, ub_l, vb_l);
            DUP2_ARG1(__lsx_vexth_w_h, ub, vb, ub_h, vb_h);
            y_h = __lsx_vslli_w(y_h, 2);
            u_l = __lsx_vsub_w(ub_l, uv);
            u_h = __lsx_vsub_w(ub_h, uv);
            v_l = __lsx_vsub_w(vb_l, uv);
            v_h = __lsx_vsub_w(vb_h, uv);
            u_l = __lsx_vslli_w(u_l, 2);
            u_h = __lsx_vslli_w(u_h, 2);
            v_l = __lsx_vslli_w(v_l, 2);
            v_h = __lsx_vslli_w(v_h, 2);
            YUVTORGB_LSX(y_l, u_l, v_l, R_l, G_l, B_l, offset, coeff,
                         y_temp, v2r, v2g, u2g, u2b);
            YUVTORGB_LSX(y_h, u_h, v_h, R_h, G_h, B_h, offset, coeff,
                         y_temp, v2r, v2g, u2g, u2b);

                a_src = __lsx_vld(abuf0 + i, 0);
                a_l   = __lsx_vsllwil_w_h(a_src, 0);
                a_h   = __lsx_vexth_w_h(a_src);
                a_l   = __lsx_vadd_w(a_l, bias);
                a_h   = __lsx_vadd_w(a_h, bias);
                a_l   = __lsx_vsrai_w(a_l, 7);
                a_h   = __lsx_vsrai_w(a_h, 7);
 
        if (dstW - i >= 4) {
            __m128i b, ub, vb, ub_l, vb_l;
            __m128i y_l, u_l, v_l;
            __m128i R_l, G_l, B_l;

            vb  = __lsx_vldx(vbuf0, n);
            y_l = __lsx_vsllwil_w_h(b, 0);
            DUP2_ARG2(__lsx_vsllwil_w_h, ub, 0, vb, 0, ub_l, vb_l);
            y_l = __lsx_vslli_w(y_l, 2);
            u_l = __lsx_vsub_w(ub_l, uv);
            v_l = __lsx_vsub_w(vb_l, uv);
            u_l = __lsx_vslli_w(u_l, 2);
            v_l = __lsx_vslli_w(v_l, 2);
            YUVTORGB_LSX(y_l, u_l, v_l, R_l, G_l, B_l, offset, coeff,
                         y_temp, v2r, v2g, u2g, u2b);

                a_src = __lsx_vldx(abuf0, n);
                a_src = __lsx_vsllwil_w_h(a_src, 0);
                a_l   = __lsx_vadd_w(bias, a_src);
                a_l   = __lsx_vsrai_w(a_l, 7);
 
        for (; i < dstW; i++) {
            int Y = buf0[i] << 2;
            int U = (ubuf0[i] - uvtemp) << 2;
            int V = (vbuf0[i] - uvtemp) << 2;

                A = (abuf0[i] + 64) >> 7;

            R  = (unsigned)Y + V * v2r_coe;
            G  = (unsigned)Y + V * v2g_coe + U * u2g_coe;
            B  = (unsigned)Y + U * u2b_coe;
            yuv2rgb_write_full(c, dest, i, R, A, G, B, y, target, hasAlpha, err);
 
        const int16_t *ubuf1 = ubuf[1], *vbuf1 = vbuf[1];
        int uvtemp   = 128 << 8;
        __m128i uv   = __lsx_vreplgr2vr_w(uvtemp);
        __m128i zero = __lsx_vldi(0);
        __m128i bias = __lsx_vreplgr2vr_h(bias_int);
 
        for (i = 0; i < len; i += 8) {
            __m128i b, ub0, ub1, vb0, vb1;
            __m128i y_ev, y_od, u_ev, u_od, v_ev, v_od;
            __m128i R_ev, R_od, G_ev, G_od, B_ev, B_od;

            DUP4_ARG2(__lsx_vldx, buf0, n, ubuf0, n, vbuf0, n,
                      ubuf1, n, b, ub0, vb0, ub1);
            vb1  = __lsx_vldx(vbuf1, n);
            y_ev = __lsx_vaddwev_w_h(b, zero);
            y_od = __lsx_vaddwod_w_h(b, zero);
            DUP2_ARG2(__lsx_vaddwev_w_h, ub0, vb0, ub1, vb1, u_ev, v_ev);
            DUP2_ARG2(__lsx_vaddwod_w_h, ub0, vb0, ub1, vb1, u_od, v_od);
            DUP2_ARG2(__lsx_vslli_w, y_ev, 2, y_od, 2, y_ev, y_od);
            DUP4_ARG2(__lsx_vsub_w, u_ev, uv, u_od, uv, v_ev, uv, v_od, uv,
                      u_ev, u_od, v_ev, v_od);
            DUP4_ARG2(__lsx_vslli_w, u_ev, 1, u_od, 1, v_ev, 1, v_od, 1,
                      u_ev, u_od, v_ev, v_od);
            YUVTORGB_LSX(y_ev, u_ev, v_ev, R_ev, G_ev, B_ev, offset, coeff,
                         y_temp, v2r, v2g, u2g, u2b);
            YUVTORGB_LSX(y_od, u_od, v_od, R_od, G_od, B_od, offset, coeff,
                         y_temp, v2r, v2g, u2g, u2b);

                a_src = __lsx_vld(abuf0 + i, 0);
                a_ev  = __lsx_vaddwev_w_h(bias, a_src);
                a_od  = __lsx_vaddwod_w_h(bias, a_src);
                a_ev  = __lsx_vsrai_w(a_ev, 7);
                a_od  = __lsx_vsrai_w(a_od, 7);
 
        if (dstW - i >= 4) {
            __m128i b, ub0, ub1, vb0, vb1;
            __m128i y_l, u_l, v_l;
            __m128i R_l, G_l, B_l;

            DUP4_ARG2(__lsx_vldx, buf0, n, ubuf0, n, vbuf0, n,
                      ubuf1, n, b, ub0, vb0, ub1);
            vb1 = __lsx_vldx(vbuf1, n);
            y_l = __lsx_vsllwil_w_h(b, 0);
            y_l = __lsx_vslli_w(y_l, 2);
            DUP4_ARG2(__lsx_vsllwil_w_h, ub0, 0, vb0, 0, ub1, 0, vb1, 0,
                      ub0, vb0, ub1, vb1);
            DUP2_ARG2(__lsx_vadd_w, ub0, ub1, vb0, vb1, u_l, v_l);
            u_l = __lsx_vsub_w(u_l, uv);
            v_l = __lsx_vsub_w(v_l, uv);
            u_l = __lsx_vslli_w(u_l, 1);
            v_l = __lsx_vslli_w(v_l, 1);
            YUVTORGB_LSX(y_l, u_l, v_l, R_l, G_l, B_l, offset, coeff,
                         y_temp, v2r, v2g, u2g, u2b);

                a_src = __lsx_vld(abuf0 + i, 0);
                a_src = __lsx_vilvl_h(a_src, a_src);
                a_l   = __lsx_vaddwev_w_h(bias, a_src);
                a_l   = __lsx_vsrai_w(a_l, 7);
 
        for (; i < dstW; i++) {
            int Y = buf0[i] << 2;
            int U = (ubuf0[i] + ubuf1[i] - uvtemp) << 1;
            int V = (vbuf0[i] + vbuf1[i] - uvtemp) << 1;

                A = (abuf0[i] + 64) >> 7;

            R  = (unsigned)Y + V * v2r_coe;
            G  = (unsigned)Y + V * v2g_coe + U * u2g_coe;
            B  = (unsigned)Y + U * u2b_coe;
            yuv2rgb_write_full(c, dest, i, R, A, G, B, y, target, hasAlpha, err);
 
    c->dither_error[0][i] = err[0];
    c->dither_error[1][i] = err[1];
    c->dither_error[2][i] = err[2];
 
               CONFIG_SWSCALE_ALPHA && c->needAlpha)

               CONFIG_SWSCALE_ALPHA && c->needAlpha)

               CONFIG_SWSCALE_ALPHA && c->needAlpha)

               CONFIG_SWSCALE_ALPHA && c->needAlpha)

#if CONFIG_SWSCALE_ALPHA
    } else if (is16BPS(dstFormat)) {

    } else if (isNBPS(dstFormat)) {

        switch (c->opts.dst_format) {
 
            c->yuv2packedX = yuv2rgba32_full_X_lsx;
            c->yuv2packed2 = yuv2rgba32_full_2_lsx;
            c->yuv2packed1 = yuv2rgba32_full_1_lsx;

#if CONFIG_SWSCALE_ALPHA
                c->yuv2packedX = yuv2rgba32_full_X_lsx;
                c->yuv2packed2 = yuv2rgba32_full_2_lsx;
                c->yuv2packed1 = yuv2rgba32_full_1_lsx;

                c->yuv2packedX = yuv2rgbx32_full_X_lsx;
                c->yuv2packed2 = yuv2rgbx32_full_2_lsx;
                c->yuv2packed1 = yuv2rgbx32_full_1_lsx;

            c->yuv2packedX = yuv2argb32_full_X_lsx;
            c->yuv2packed2 = yuv2argb32_full_2_lsx;
            c->yuv2packed1 = yuv2argb32_full_1_lsx;

#if CONFIG_SWSCALE_ALPHA
                c->yuv2packedX = yuv2argb32_full_X_lsx;
                c->yuv2packed2 = yuv2argb32_full_2_lsx;
                c->yuv2packed1 = yuv2argb32_full_1_lsx;

                c->yuv2packedX = yuv2xrgb32_full_X_lsx;
                c->yuv2packed2 = yuv2xrgb32_full_2_lsx;
                c->yuv2packed1 = yuv2xrgb32_full_1_lsx;

            c->yuv2packedX = yuv2bgra32_full_X_lsx;
            c->yuv2packed2 = yuv2bgra32_full_2_lsx;
            c->yuv2packed1 = yuv2bgra32_full_1_lsx;

#if CONFIG_SWSCALE_ALPHA
                c->yuv2packedX = yuv2bgra32_full_X_lsx;
                c->yuv2packed2 = yuv2bgra32_full_2_lsx;
                c->yuv2packed1 = yuv2bgra32_full_1_lsx;

                c->yuv2packedX = yuv2bgrx32_full_X_lsx;
                c->yuv2packed2 = yuv2bgrx32_full_2_lsx;
                c->yuv2packed1 = yuv2bgrx32_full_1_lsx;

            c->yuv2packedX = yuv2abgr32_full_X_lsx;
            c->yuv2packed2 = yuv2abgr32_full_2_lsx;
            c->yuv2packed1 = yuv2abgr32_full_1_lsx;

#if CONFIG_SWSCALE_ALPHA
                c->yuv2packedX = yuv2abgr32_full_X_lsx;
                c->yuv2packed2 = yuv2abgr32_full_2_lsx;
                c->yuv2packed1 = yuv2abgr32_full_1_lsx;

                c->yuv2packedX = yuv2xbgr32_full_X_lsx;
                c->yuv2packed2 = yuv2xbgr32_full_2_lsx;
                c->yuv2packed1 = yuv2xbgr32_full_1_lsx;

            c->yuv2packedX = yuv2rgb24_full_X_lsx;
            c->yuv2packed2 = yuv2rgb24_full_2_lsx;
            c->yuv2packed1 = yuv2rgb24_full_1_lsx;

            c->yuv2packedX = yuv2bgr24_full_X_lsx;
            c->yuv2packed2 = yuv2bgr24_full_2_lsx;
            c->yuv2packed1 = yuv2bgr24_full_1_lsx;

            c->yuv2packedX = yuv2bgr4_byte_full_X_lsx;
            c->yuv2packed2 = yuv2bgr4_byte_full_2_lsx;
            c->yuv2packed1 = yuv2bgr4_byte_full_1_lsx;

            c->yuv2packedX = yuv2rgb4_byte_full_X_lsx;
            c->yuv2packed2 = yuv2rgb4_byte_full_2_lsx;
            c->yuv2packed1 = yuv2rgb4_byte_full_1_lsx;

            c->yuv2packedX = yuv2bgr8_full_X_lsx;
            c->yuv2packed2 = yuv2bgr8_full_2_lsx;
            c->yuv2packed1 = yuv2bgr8_full_1_lsx;

            c->yuv2packedX = yuv2rgb8_full_X_lsx;
            c->yuv2packed2 = yuv2rgb8_full_2_lsx;
            c->yuv2packed1 = yuv2rgb8_full_1_lsx;
 
        switch (c->opts.dst_format) {

#if CONFIG_SWSCALE_ALPHA

                c->yuv2packed1 = yuv2rgbx32_1_lsx;
                c->yuv2packed2 = yuv2rgbx32_2_lsx;
                c->yuv2packedX = yuv2rgbx32_X_lsx;

#if CONFIG_SWSCALE_ALPHA

                c->yuv2packed1 = yuv2rgbx32_1_1_lsx;
                c->yuv2packed2 = yuv2rgbx32_1_2_lsx;
                c->yuv2packedX = yuv2rgbx32_1_X_lsx;

            c->yuv2packed1 = yuv2rgb24_1_lsx;
            c->yuv2packed2 = yuv2rgb24_2_lsx;
            c->yuv2packedX = yuv2rgb24_X_lsx;

            c->yuv2packed1 = yuv2bgr24_1_lsx;
            c->yuv2packed2 = yuv2bgr24_2_lsx;
            c->yuv2packedX = yuv2bgr24_X_lsx;

            c->yuv2packed1 = yuv2rgb16_1_lsx;
            c->yuv2packed2 = yuv2rgb16_2_lsx;
            c->yuv2packedX = yuv2rgb16_X_lsx;

            c->yuv2packed1 = yuv2rgb15_1_lsx;
            c->yuv2packed2 = yuv2rgb15_2_lsx;
            c->yuv2packedX = yuv2rgb15_X_lsx;

            c->yuv2packed1 = yuv2rgb12_1_lsx;
            c->yuv2packed2 = yuv2rgb12_2_lsx;
            c->yuv2packedX = yuv2rgb12_X_lsx;

            c->yuv2packed1 = yuv2rgb8_1_lsx;
            c->yuv2packed2 = yuv2rgb8_2_lsx;
            c->yuv2packedX = yuv2rgb8_X_lsx;

            c->yuv2packed1 = yuv2rgb4_1_lsx;
            c->yuv2packed2 = yuv2rgb4_2_lsx;
            c->yuv2packedX = yuv2rgb4_X_lsx;

            c->yuv2packed1 = yuv2rgb4b_1_lsx;
            c->yuv2packed2 = yuv2rgb4b_2_lsx;
            c->yuv2packedX = yuv2rgb4b_X_lsx;
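        /*
         * This second dispatch mirrors the *_full_* switch above for the
         * table-driven converters; each case installs the _1/_2/_X entry
         * points generated by the YUV2RGBWRAPPER macros.
         */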