25     0, 1, 1, 2, 2, 3, 3, 4, 16, 17, 17, 18, 18, 19, 19, 20,
 
   26     0, 2, 2, 4, 4, 6, 6, 8, 16, 18, 18, 20, 20, 22, 22, 24,
 
   27     0, 1, 1, 2, 2, 3, 3, 4, 4, 5, 5, 6, 6, 7, 7, 8,
 
   28     0, 1, 1, 2, 16, 17, 17, 18, 4, 5, 5, 6, 6, 7, 7, 8,
 
   29     0, 1, 1, 2, 16, 17, 17, 18, 16, 17, 17, 18, 18, 19, 19, 20
 
   33                                   uint32_t coeff0, uint32_t coeff1)
 
   40     v16i8 coeff_vec0 = __msa_fill_b(coeff0);
 
   41     v16i8 coeff_vec1 = __msa_fill_b(coeff1);
 
   42     v16u8 coeff_vec = (v16u8) __msa_ilvr_b(coeff_vec0, coeff_vec1);
 
   49     res_r = __msa_dotp_u_h((v16u8) 
src0, coeff_vec);
 
   51     res_r = (v8u16) __msa_srari_h((v8i16) res_r, 6);
 
   52     res_r = __msa_sat_u_h(res_r, 7);
 
   53     res = (v8i16) __msa_pckev_b((v16i8) res_r, (v16i8) res_r);
 
   55     out0 = __msa_copy_u_h(res, 0);
 
   56     out1 = __msa_copy_u_h(res, 2);
 
   64                                   uint32_t coeff0, uint32_t coeff1)
 
   70     v16i8 coeff_vec0 = __msa_fill_b(coeff0);
 
   71     v16i8 coeff_vec1 = __msa_fill_b(coeff1);
 
   72     v16u8 coeff_vec = (v16u8) __msa_ilvr_b(coeff_vec0, coeff_vec1);
 
   82     res_r = __msa_dotp_u_h(
src0, coeff_vec);
 
   84     res_r = (v8u16) __msa_srari_h((v8i16) res_r, 6);
 
   85     res_r = __msa_sat_u_h(res_r, 7);
 
   86     res = (v8i16) __msa_pckev_b((v16i8) res_r, (v16i8) res_r);
 
   92                                  uint32_t coeff0, uint32_t coeff1,
 
  103                                   uint32_t coeff0, uint32_t coeff1)
 
  109     v16i8 coeff_vec0 = __msa_fill_b(coeff0);
 
  110     v16i8 coeff_vec1 = __msa_fill_b(coeff1);
 
  111     v16u8 coeff_vec = (v16u8) __msa_ilvr_b(coeff_vec0, coeff_vec1);
 
  118     res_r = __msa_dotp_u_h((v16u8) 
src0, coeff_vec);
 
  120     res_r = (v8u16) __msa_srari_h((v8i16) res_r, 6);
 
  121     res_r = __msa_sat_u_h(res_r, 7);
 
  122     res = (v4i32) __msa_pckev_b((v16i8) res_r, (v16i8) res_r);
 
  128                                   uint32_t coeff0, uint32_t coeff1)
 
  131     v8u16 res0_r, res1_r;
 
  133     v16i8 coeff_vec0 = __msa_fill_b(coeff0);
 
  134     v16i8 coeff_vec1 = __msa_fill_b(coeff1);
 
  135     v16u8 coeff_vec = (v16u8) __msa_ilvr_b(coeff_vec0, coeff_vec1);
 
  146     out = (v16u8) __msa_pckev_b((v16i8) res1_r, (v16i8) res0_r);
 
  151                                   uint32_t coeff0, uint32_t coeff1)
 
  153     v16u8 
src0, 
src1, 
src2, src3, src4, src5, src6, src7, out0, out1;
 
  155     v8u16 res0, res1, res2, res3;
 
  156     v16i8 coeff_vec0 = __msa_fill_b(coeff0);
 
  157     v16i8 coeff_vec1 = __msa_fill_b(coeff1);
 
  158     v16u8 coeff_vec = (v16u8) __msa_ilvr_b(coeff_vec0, coeff_vec1);
 
  166     DOTP_UB2_UH(src4, src6, coeff_vec, coeff_vec, res2, res3);
 
  167     SLLI_4V(res0, res1, res2, res3, 3);
 
  171     ST_W8(out0, out1, 0, 1, 2, 3, 0, 1, 2, 3, dst, 
stride);
 
  175                                  uint32_t coeff0, uint32_t coeff1,
 
  188                                   uint32_t coeff0, uint32_t coeff1)
 
  191     v8u16 res0, res1, res2, res3;
 
  193     v16i8 coeff_vec0 = __msa_fill_b(coeff0);
 
  194     v16i8 coeff_vec1 = __msa_fill_b(coeff1);
 
  195     v16u8 coeff_vec = (v16u8) __msa_ilvr_b(coeff_vec0, coeff_vec1);
 
  202                 coeff_vec, res0, res1, res2, res3);
 
  203     SLLI_4V(res0, res1, res2, res3, 3);
 
  211                                   uint32_t coeff0, uint32_t coeff1)
 
  214     v16u8 out0, out1, out2, out3;
 
  215     v8u16 res0, res1, res2, res3, res4, res5, res6, res7;
 
  217     v16i8 coeff_vec0 = __msa_fill_b(coeff0);
 
  218     v16i8 coeff_vec1 = __msa_fill_b(coeff1);
 
  219     v16u8 coeff_vec = (v16u8) __msa_ilvr_b(coeff_vec0, coeff_vec1);
 
  229                 coeff_vec, res0, res1, res2, res3);
 
  230     DOTP_UB4_UH(src4, src5, src6, src7, coeff_vec, coeff_vec, coeff_vec,
 
  231                 coeff_vec, res4, res5, res6, res7);
 
  232     SLLI_4V(res0, res1, res2, res3, 3);
 
  233     SLLI_4V(res4, res5, res6, res7, 3);
 
  240     ST_D8(out0, out1, out2, out3, 0, 1, 0, 1, 0, 1, 0, 1, dst, 
stride);
 
  249     v8u16 res0, res1, res2, res3;
 
  251     v16i8 coeff_vec0 = __msa_fill_b(coeff0);
 
  252     v16i8 coeff_vec1 = __msa_fill_b(coeff1);
 
  253     v16u8 coeff_vec = (v16u8) __msa_ilvr_b(coeff_vec0, coeff_vec1);
 
  257     for (row = 
height >> 2; row--;) {
 
  264                     coeff_vec, res0, res1, res2, res3);
 
  265         SLLI_4V(res0, res1, res2, res3, 3);
 
  274         for (row = (
height % 4); row--;) {
 
  280             res0 = __msa_dotp_u_h(
src0, coeff_vec);
 
  282             res0 = (v8u16) __msa_srari_h((v8i16) res0, 6);
 
  283             res0 = __msa_sat_u_h(res0, 7);
 
  284             res0 = (v8u16) __msa_pckev_b((v16i8) res0, (v16i8) res0);
 
  293                                  uint32_t coeff0, uint32_t coeff1,
 
  306                                   uint32_t coeff0, uint32_t coeff1)
 
  313     v16i8 coeff_vec0 = __msa_fill_b(coeff0);
 
  314     v16i8 coeff_vec1 = __msa_fill_b(coeff1);
 
  315     v16u8 coeff_vec = (v16u8) __msa_ilvr_b(coeff_vec0, coeff_vec1);
 
  321     tmp0 = (v16u8) __msa_ilvr_d((v2i64) tmp1, (v2i64) tmp0);
 
  323     res_r = __msa_dotp_u_h(tmp0, coeff_vec);
 
  325     res_r = (v8u16) __msa_srari_h((v8i16) res_r, 6);
 
  326     res_r = __msa_sat_u_h(res_r, 7);
 
  327     res = (v8i16) __msa_pckev_b((v16i8) res_r, (v16i8) res_r);
 
  329     out0 = __msa_copy_u_h(res, 0);
 
  330     out1 = __msa_copy_u_h(res, 2);
 
  338                                   uint32_t coeff0, uint32_t coeff1)
 
  341     v16u8 tmp0, tmp1, tmp2, tmp3;
 
  344     v16i8 coeff_vec0 = __msa_fill_b(coeff0);
 
  345     v16i8 coeff_vec1 = __msa_fill_b(coeff1);
 
  346     v16u8 coeff_vec = (v16u8) __msa_ilvr_b(coeff_vec0, coeff_vec1);
 
  350                tmp0, tmp1, tmp2, tmp3);
 
  351     ILVR_W2_UB(tmp1, tmp0, tmp3, tmp2, tmp0, tmp2);
 
  353     tmp0 = (v16u8) __msa_ilvr_d((v2i64) tmp2, (v2i64) tmp0);
 
  355     res_r = __msa_dotp_u_h(tmp0, coeff_vec);
 
  357     res_r = (v8u16) __msa_srari_h((v8i16) res_r, 6);
 
  358     res_r = __msa_sat_u_h(res_r, 7);
 
  360     res = (v8i16) __msa_pckev_b((v16i8) res_r, (v16i8) res_r);
 
  366                                  uint32_t coeff0, uint32_t coeff1,
 
  377                                   uint32_t coeff0, uint32_t coeff1)
 
  383     v16i8 coeff_vec0 = __msa_fill_b(coeff0);
 
  384     v16i8 coeff_vec1 = __msa_fill_b(coeff1);
 
  385     v16u8 coeff_vec = (v16u8) __msa_ilvr_b(coeff_vec0, coeff_vec1);
 
  390     tmp0 = (v16u8) __msa_ilvr_d((v2i64) tmp1, (v2i64) tmp0);
 
  391     res_r = __msa_dotp_u_h(tmp0, coeff_vec);
 
  393     res_r = (v8u16) __msa_srari_h((v8i16) res_r, 6);
 
  394     res_r = __msa_sat_u_h(res_r, 7);
 
  395     res = (v4i32) __msa_pckev_b((v16i8) res_r, (v16i8) res_r);
 
  401                                   uint32_t coeff0, uint32_t coeff1)
 
  404     v16u8 tmp0, tmp1, tmp2, tmp3;
 
  406     v8u16 res0_r, res1_r;
 
  407     v16i8 coeff_vec0 = __msa_fill_b(coeff0);
 
  408     v16i8 coeff_vec1 = __msa_fill_b(coeff1);
 
  409     v16u8 coeff_vec = (v16u8) __msa_ilvr_b(coeff_vec0, coeff_vec1);
 
  412     ILVR_B4_UB(
src1, 
src0, 
src2, 
src1, src3, 
src2, src4, src3, tmp0, tmp1, tmp2,
 
  414     ILVR_D2_UB(tmp1, tmp0, tmp3, tmp2, tmp0, tmp2);
 
  415     DOTP_UB2_UH(tmp0, tmp2, coeff_vec, coeff_vec, res0_r, res1_r);
 
  420     out = (v16u8) __msa_pckev_b((v16i8) res1_r, (v16i8) res0_r);
 
  425                                   uint32_t coeff0, uint32_t coeff1)
 
  427     v16u8 
src0, 
src1, 
src2, src3, src4, src5, src6, src7, src8;
 
  428     v16u8 tmp0, tmp1, tmp2, tmp3, tmp4, tmp5, tmp6, tmp7, out0, out1;
 
  429     v8u16 res0, res1, res2, res3;
 
  430     v16i8 coeff_vec0 = __msa_fill_b(coeff0);
 
  431     v16i8 coeff_vec1 = __msa_fill_b(coeff1);
 
  432     v16u8 coeff_vec = (v16u8) __msa_ilvr_b(coeff_vec0, coeff_vec1);
 
  437     ILVR_B4_UB(
src1, 
src0, 
src2, 
src1, src3, 
src2, src4, src3, tmp0, tmp1, tmp2,
 
  439     ILVR_B4_UB(src5, src4, src6, src5, src7, src6, src8, src7, tmp4, tmp5, tmp6,
 
  441     ILVR_D2_UB(tmp1, tmp0, tmp3, tmp2, tmp0, tmp2);
 
  442     ILVR_D2_UB(tmp5, tmp4, tmp7, tmp6, tmp4, tmp6);
 
  443     DOTP_UB2_UH(tmp0, tmp2, coeff_vec, coeff_vec, res0, res1);
 
  444     DOTP_UB2_UH(tmp4, tmp6, coeff_vec, coeff_vec, res2, res3);
 
  445     SLLI_4V(res0, res1, res2, res3, 3);
 
  449     ST_W8(out0, out1, 0, 1, 2, 3, 0, 1, 2, 3, dst, 
stride);
 
  453                                  uint32_t coeff0, uint32_t coeff1,
 
  466                                   uint32_t coeff0, uint32_t coeff1)
 
  469     v8u16 res0, res1, res2, res3;
 
  470     v16i8 coeff_vec0 = __msa_fill_b(coeff0);
 
  471     v16i8 coeff_vec1 = __msa_fill_b(coeff1);
 
  472     v16u8 coeff_vec = (v16u8) __msa_ilvr_b(coeff_vec0, coeff_vec1);
 
  475     ILVR_B4_UB(
src1, 
src0, 
src2, 
src1, src3, 
src2, src4, src3, 
src0, 
src1, 
src2,
 
  478                 coeff_vec, res0, res1, res2, res3);
 
  479     SLLI_4V(res0, res1, res2, res3, 3);
 
  487                                   uint32_t coeff0, uint32_t coeff1)
 
  489     v16u8 
src0, 
src1, 
src2, src3, src4, src5, src6, src7, src8;
 
  490     v16u8 out0, out1, out2, out3;
 
  491     v8u16 res0, res1, res2, res3, res4, res5, res6, res7;
 
  492     v16i8 coeff_vec0 = __msa_fill_b(coeff0);
 
  493     v16i8 coeff_vec1 = __msa_fill_b(coeff1);
 
  494     v16u8 coeff_vec = (v16u8) __msa_ilvr_b(coeff_vec0, coeff_vec1);
 
  499     ILVR_B4_UB(
src1, 
src0, 
src2, 
src1, src3, 
src2, src4, src3, 
src0, 
src1, 
src2,
 
  501     ILVR_B4_UB(src5, src4, src6, src5, src7, src6, src8, src7, src4, src5, src6,
 
  504                 coeff_vec, res0, res1, res2, res3);
 
  505     DOTP_UB4_UH(src4, src5, src6, src7, coeff_vec, coeff_vec, coeff_vec,
 
  506                 coeff_vec, res4, res5, res6, res7);
 
  507     SLLI_4V(res0, res1, res2, res3, 3);
 
  508     SLLI_4V(res4, res5, res6, res7, 3);
 
  515     ST_D8(out0, out1, out2, out3, 0, 1, 0, 1, 0, 1, 0, 1, dst, 
stride);
 
  519                                  uint32_t coeff0, uint32_t coeff1,
 
  530                                   uint32_t coef_hor0, uint32_t coef_hor1,
 
  531                                   uint32_t coef_ver0, uint32_t coef_ver1)
 
  535     v8u16 res_hz0, res_hz1, res_vt0, res_vt1;
 
  538     v16i8 coeff_hz_vec0 = __msa_fill_b(coef_hor0);
 
  539     v16i8 coeff_hz_vec1 = __msa_fill_b(coef_hor1);
 
  540     v16u8 coeff_hz_vec = (v16u8) __msa_ilvr_b(coeff_hz_vec0, coeff_hz_vec1);
 
  541     v8u16 coeff_vt_vec0 = (v8u16) __msa_fill_h(coef_ver0);
 
  542     v8u16 coeff_vt_vec1 = (v8u16) __msa_fill_h(coef_ver1);
 
  549     MUL2(res_hz0, coeff_vt_vec1, res_hz1, coeff_vt_vec0, res_vt0, res_vt1);
 
  552     res_vt0 = (v8u16) __msa_srari_h((v8i16) res_vt0, 6);
 
  553     res_vt0 = __msa_sat_u_h(res_vt0, 7);
 
  554     res_vert = (v8i16) __msa_pckev_b((v16i8) res_vt0, (v16i8) res_vt0);
 
  556     out0 = __msa_copy_u_h(res_vert, 0);
 
  557     out1 = __msa_copy_u_h(res_vert, 1);
 
  565                                   uint32_t coef_hor0, uint32_t coef_hor1,
 
  566                                   uint32_t coef_ver0, uint32_t coef_ver1)
 
  569     v16u8 tmp0, tmp1, tmp2, tmp3;
 
  570     v8u16 res_hz0, res_hz1, res_vt0, res_vt1;
 
  573     v16i8 coeff_hz_vec0 = __msa_fill_b(coef_hor0);
 
  574     v16i8 coeff_hz_vec1 = __msa_fill_b(coef_hor1);
 
  575     v16u8 coeff_hz_vec = (v16u8) __msa_ilvr_b(coeff_hz_vec0, coeff_hz_vec1);
 
  576     v8u16 coeff_vt_vec0 = (v8u16) __msa_fill_h(coef_ver0);
 
  577     v8u16 coeff_vt_vec1 = (v8u16) __msa_fill_h(coef_ver1);
 
  587     MUL2(res_hz0, coeff_vt_vec1, res_hz1, coeff_vt_vec0, res_vt0, res_vt1);
 
  590     res_vt0 = (v8u16) __msa_srari_h((v8i16) res_vt0, 6);
 
  591     res_vt0 = __msa_sat_u_h(res_vt0, 7);
 
  593     res = (v8i16) __msa_pckev_b((v16i8) res_vt0, (v16i8) res_vt0);
 
  599                                  uint32_t coef_hor0, uint32_t coef_hor1,
 
  600                                  uint32_t coef_ver0, uint32_t coef_ver1,
 
  613                                   uint32_t coef_hor0, uint32_t coef_hor1,
 
  614                                   uint32_t coef_ver0, uint32_t coef_ver1)
 
  617     v8u16 res_hz0, res_hz1, res_vt0, res_vt1;
 
  620     v16i8 coeff_hz_vec0 = __msa_fill_b(coef_hor0);
 
  621     v16i8 coeff_hz_vec1 = __msa_fill_b(coef_hor1);
 
  622     v16u8 coeff_hz_vec = (v16u8) __msa_ilvr_b(coeff_hz_vec0, coeff_hz_vec1);
 
  623     v8u16 coeff_vt_vec0 = (v8u16) __msa_fill_h(coef_ver0);
 
  624     v8u16 coeff_vt_vec1 = (v8u16) __msa_fill_h(coef_ver1);
 
  630     MUL2(res_hz0, coeff_vt_vec1, res_hz1, coeff_vt_vec0, res_vt0, res_vt1);
 
  633     res_vt0 = (v8u16) __msa_srari_h((v8i16) res_vt0, 6);
 
  634     res_vt0 = __msa_sat_u_h(res_vt0, 7);
 
  635     res = (v4i32) __msa_pckev_b((v16i8) res_vt0, (v16i8) res_vt0);
 
  641                                   uint32_t coef_hor0, uint32_t coef_hor1,
 
  642                                   uint32_t coef_ver0, uint32_t coef_ver1)
 
  645     v8u16 res_hz0, res_hz1, res_hz2, res_hz3;
 
  646     v8u16 res_vt0, res_vt1, res_vt2, res_vt3;
 
  648     v16i8 coeff_hz_vec0 = __msa_fill_b(coef_hor0);
 
  649     v16i8 coeff_hz_vec1 = __msa_fill_b(coef_hor1);
 
  650     v16u8 coeff_hz_vec = (v16u8) __msa_ilvr_b(coeff_hz_vec0, coeff_hz_vec1);
 
  651     v8u16 coeff_vt_vec0 = (v8u16) __msa_fill_h(coef_ver0);
 
  652     v8u16 coeff_vt_vec1 = (v8u16) __msa_fill_h(coef_ver1);
 
  661                 coeff_hz_vec, coeff_hz_vec, res_hz0, res_hz1, res_hz2,
 
  663     MUL4(res_hz0, coeff_vt_vec1, res_hz1, coeff_vt_vec0, res_hz2, coeff_vt_vec1,
 
  664          res_hz3, coeff_vt_vec0, res_vt0, res_vt1, res_vt2, res_vt3);
 
  665     ADD2(res_vt0, res_vt1, res_vt2, res_vt3, res_vt0, res_vt1);
 
  668     PCKEV_B2_SW(res_vt0, res_vt0, res_vt1, res_vt1, res0, res1);
 
  674                                   uint32_t coef_hor0, uint32_t coef_hor1,
 
  675                                   uint32_t coef_ver0, uint32_t coef_ver1)
 
  677     v16u8 
src0, 
src1, 
src2, src3, src4, src5, src6, src7, src8, res0, res1;
 
  678     v8u16 res_hz0, res_hz1, res_hz2, res_hz3, res_hz4, res_hz5, res_hz6, res_hz7;
 
  679     v8u16 res_vt0, res_vt1, res_vt2, res_vt3, res_vt4, res_vt5, res_vt6, res_vt7;
 
  681     v16i8 coeff_hz_vec0 = __msa_fill_b(coef_hor0);
 
  682     v16i8 coeff_hz_vec1 = __msa_fill_b(coef_hor1);
 
  683     v16u8 coeff_hz_vec = (v16u8) __msa_ilvr_b(coeff_hz_vec0, coeff_hz_vec1);
 
  684     v8u16 coeff_vt_vec0 = (v8u16) __msa_fill_h(coef_ver0);
 
  685     v8u16 coeff_vt_vec1 = (v8u16) __msa_fill_h(coef_ver1);
 
  698                 coeff_hz_vec, coeff_hz_vec, res_hz0, res_hz1, res_hz2, res_hz3);
 
  699     DOTP_UB4_UH(src4, src5, src6, src7, coeff_hz_vec, coeff_hz_vec,
 
  700                 coeff_hz_vec, coeff_hz_vec, res_hz4, res_hz5, res_hz6, res_hz7);
 
  701     MUL4(res_hz0, coeff_vt_vec1, res_hz1, coeff_vt_vec0, res_hz2, coeff_vt_vec1,
 
  702          res_hz3, coeff_vt_vec0, res_vt0, res_vt1, res_vt2, res_vt3);
 
  703     MUL4(res_hz4, coeff_vt_vec1, res_hz5, coeff_vt_vec0, res_hz6, coeff_vt_vec1,
 
  704          res_hz7, coeff_vt_vec0, res_vt4, res_vt5, res_vt6, res_vt7);
 
  705     ADD2(res_vt0, res_vt1, res_vt2, res_vt3, res_vt0, res_vt1);
 
  706     ADD2(res_vt4, res_vt5, res_vt6, res_vt7, res_vt2, res_vt3);
 
  707     SRARI_H4_UH(res_vt0, res_vt1, res_vt2, res_vt3, 6);
 
  708     SAT_UH4_UH(res_vt0, res_vt1, res_vt2, res_vt3, 7);
 
  709     PCKEV_B2_UB(res_vt1, res_vt0, res_vt3, res_vt2, res0, res1);
 
  710     ST_W8(res0, res1, 0, 1, 2, 3, 0, 1, 2, 3, dst, 
stride);
 
  714                                  uint32_t coef_hor0, uint32_t coef_hor1,
 
  715                                  uint32_t coef_ver0, uint32_t coef_ver1,
 
  731                                   uint32_t coef_hor0, uint32_t coef_hor1,
 
  732                                   uint32_t coef_ver0, uint32_t coef_ver1)
 
  735     v8u16 res_hz0, res_hz1, res_hz2, res_hz3, res_hz4;
 
  736     v8u16 res_vt0, res_vt1, res_vt2, res_vt3;
 
  738     v16i8 coeff_hz_vec0 = __msa_fill_b(coef_hor0);
 
  739     v16i8 coeff_hz_vec1 = __msa_fill_b(coef_hor1);
 
  740     v16u8 coeff_hz_vec = (v16u8) __msa_ilvr_b(coeff_hz_vec0, coeff_hz_vec1);
 
  741     v8u16 coeff_vt_vec0 = (v8u16) __msa_fill_h(coef_ver0);
 
  742     v8u16 coeff_vt_vec1 = (v8u16) __msa_fill_h(coef_ver1);
 
  750     res_hz0 = __msa_dotp_u_h(
src0, coeff_hz_vec);
 
  758                 coeff_hz_vec, coeff_hz_vec, res_hz1, res_hz2, res_hz3, res_hz4);
 
  759     MUL4(res_hz1, coeff_vt_vec0, res_hz2, coeff_vt_vec0, res_hz3, coeff_vt_vec0,
 
  760          res_hz4, coeff_vt_vec0, res_vt0, res_vt1, res_vt2, res_vt3);
 
  762     res_vt0 += (res_hz0 * coeff_vt_vec1);
 
  763     res_vt1 += (res_hz1 * coeff_vt_vec1);
 
  764     res_vt2 += (res_hz2 * coeff_vt_vec1);
 
  765     res_vt3 += (res_hz3 * coeff_vt_vec1);
 
  767     SRARI_H4_UH(res_vt0, res_vt1, res_vt2, res_vt3, 6);
 
  768     SAT_UH4_UH(res_vt0, res_vt1, res_vt2, res_vt3, 7);
 
  769     PCKEV_B2_UB(res_vt1, res_vt0, res_vt3, res_vt2, out0, out1);
 
  774                                   uint32_t coef_hor0, uint32_t coef_hor1,
 
  775                                   uint32_t coef_ver0, uint32_t coef_ver1)
 
  777     v16u8 
src0, 
src1, 
src2, src3, src4, src5, src6, src7, src8;
 
  778     v16u8 out0, out1, out2, out3;
 
  779     v8u16 res_hz0, res_hz1, res_hz2, res_hz3, res_hz4;
 
  780     v8u16 res_hz5, res_hz6, res_hz7, res_hz8;
 
  781     v8u16 res_vt0, res_vt1, res_vt2, res_vt3;
 
  782     v8u16 res_vt4, res_vt5, res_vt6, res_vt7;
 
  784     v16i8 coeff_hz_vec0 = __msa_fill_b(coef_hor0);
 
  785     v16i8 coeff_hz_vec1 = __msa_fill_b(coef_hor1);
 
  786     v16u8 coeff_hz_vec = (v16u8) __msa_ilvr_b(coeff_hz_vec0, coeff_hz_vec1);
 
  787     v8u16 coeff_vt_vec0 = (v8u16) __msa_fill_h(coef_ver0);
 
  788     v8u16 coeff_vt_vec1 = (v8u16) __msa_fill_h(coef_ver1);
 
  800     res_hz0 = __msa_dotp_u_h(
src0, coeff_hz_vec);
 
  802                 coeff_hz_vec, coeff_hz_vec, res_hz1, res_hz2, res_hz3,
 
  804     DOTP_UB4_UH(src5, src6, src7, src8, coeff_hz_vec, coeff_hz_vec,
 
  805                 coeff_hz_vec, coeff_hz_vec, res_hz5, res_hz6, res_hz7, res_hz8);
 
  806     MUL4(res_hz1, coeff_vt_vec0, res_hz2, coeff_vt_vec0, res_hz3,
 
  807          coeff_vt_vec0, res_hz4, coeff_vt_vec0, res_vt0, res_vt1, res_vt2,
 
  809     MUL4(res_hz5, coeff_vt_vec0, res_hz6, coeff_vt_vec0, res_hz7,
 
  810          coeff_vt_vec0, res_hz8, coeff_vt_vec0, res_vt4, res_vt5, res_vt6,
 
  812     res_vt0 += (res_hz0 * coeff_vt_vec1);
 
  813     res_vt1 += (res_hz1 * coeff_vt_vec1);
 
  814     res_vt2 += (res_hz2 * coeff_vt_vec1);
 
  815     res_vt3 += (res_hz3 * coeff_vt_vec1);
 
  816     res_vt4 += (res_hz4 * coeff_vt_vec1);
 
  817     res_vt5 += (res_hz5 * coeff_vt_vec1);
 
  818     res_vt6 += (res_hz6 * coeff_vt_vec1);
 
  819     res_vt7 += (res_hz7 * coeff_vt_vec1);
 
  820     SRARI_H4_UH(res_vt0, res_vt1, res_vt2, res_vt3, 6);
 
  821     SRARI_H4_UH(res_vt4, res_vt5, res_vt6, res_vt7, 6);
 
  822     SAT_UH4_UH(res_vt0, res_vt1, res_vt2, res_vt3, 7);
 
  823     SAT_UH4_UH(res_vt4, res_vt5, res_vt6, res_vt7, 7);
 
  824     PCKEV_B2_UB(res_vt1, res_vt0, res_vt3, res_vt2, out0, out1);
 
  825     PCKEV_B2_UB(res_vt5, res_vt4, res_vt7, res_vt6, out2, out3);
 
  826     ST_D8(out0, out1, out2, out3, 0, 1, 0, 1, 0, 1, 0, 1, dst, 
stride);
 
  830                                  uint32_t coef_hor0, uint32_t coef_hor1,
 
  831                                  uint32_t coef_ver0, uint32_t coef_ver1,
 
  849     v16u8 dst_data = { 0 };
 
  853     v16i8 coeff_vec0 = __msa_fill_b(coeff0);
 
  854     v16i8 coeff_vec1 = __msa_fill_b(coeff1);
 
  855     v16u8 coeff_vec = (v16u8) __msa_ilvr_b(coeff_vec0, coeff_vec1);
 
  864     dst_data = (v16u8) __msa_insert_h((v8i16) dst_data, 0, out0);
 
  865     dst_data = (v16u8) __msa_insert_h((v8i16) dst_data, 2, out1);
 
  869     res_r = __msa_dotp_u_h((v16u8) 
src0, coeff_vec);
 
  871     res_r = (v8u16) __msa_srari_h((v8i16) res_r, 6);
 
  872     res_r = __msa_sat_u_h(res_r, 7);
 
  874     res = (v16u8) __msa_pckev_b((v16i8) res_r, (v16i8) res_r);
 
  875     dst_data = __msa_aver_u_b(res, dst_data);
 
  877     out0 = __msa_copy_u_h((v8i16) dst_data, 0);
 
  878     out1 = __msa_copy_u_h((v8i16) dst_data, 2);
 
  889     uint16_t tp0, tp1, tp2, tp3;
 
  891     v16u8 dst0, dst_data = { 0 };
 
  894     v16i8 coeff_vec0 = __msa_fill_b(coeff0);
 
  895     v16i8 coeff_vec1 = __msa_fill_b(coeff1);
 
  896     v16u8 coeff_vec = (v16u8) __msa_ilvr_b(coeff_vec0, coeff_vec1);
 
  905     dst_data = (v16u8) __msa_insert_h((v8i16) dst_data, 0, tp0);
 
  906     dst_data = (v16u8) __msa_insert_h((v8i16) dst_data, 1, tp1);
 
  907     dst_data = (v16u8) __msa_insert_h((v8i16) dst_data, 2, tp2);
 
  908     dst_data = (v16u8) __msa_insert_h((v8i16) dst_data, 3, tp3);
 
  912     src0 = (v16u8) __msa_ilvr_d((v2i64) 
src2, (v2i64) 
src0);
 
  914     res_r = __msa_dotp_u_h(
src0, coeff_vec);
 
  916     res_r = (v8u16) __msa_srari_h((v8i16) res_r, 6);
 
  917     res_r = __msa_sat_u_h(res_r, 7);
 
  919     dst0 = (v16u8) __msa_pckev_b((v16i8) res_r, (v16i8) res_r);
 
  920     dst0 = __msa_aver_u_b(dst0, dst_data);
 
  940     uint32_t load0, load1;
 
  942     v16u8 dst_data = { 0 };
 
  945     v16i8 coeff_vec0 = __msa_fill_b(coeff0);
 
  946     v16i8 coeff_vec1 = __msa_fill_b(coeff1);
 
  947     v16u8 coeff_vec = (v16u8) __msa_ilvr_b(coeff_vec0, coeff_vec1);
 
  959     res_r = __msa_dotp_u_h((v16u8) 
src0, coeff_vec);
 
  961     res_r = (v8u16) __msa_srari_h((v8i16) res_r, 6);
 
  962     res_r = __msa_sat_u_h(res_r, 7);
 
  963     res = __msa_pckev_b((v16i8) res_r, (v16i8) res_r);
 
  964     dst_data = __msa_aver_u_b((v16u8) res, dst_data);
 
  973     uint32_t tp0, tp1, tp2, tp3;
 
  975     v16u8 
out, dst_data = { 0 };
 
  977     v8u16 res0_r, res1_r;
 
  978     v16i8 coeff_vec0 = __msa_fill_b(coeff0);
 
  979     v16i8 coeff_vec1 = __msa_fill_b(coeff1);
 
  980     v16u8 coeff_vec = (v16u8) __msa_ilvr_b(coeff_vec0, coeff_vec1);
 
  993     out = (v16u8) __msa_pckev_b((v16i8) res1_r, (v16i8) res0_r);
 
  994     out = __msa_aver_u_b(
out, dst_data);
 
 1002     uint32_t tp0, tp1, tp2, tp3;
 
 1003     v16u8 
src0, 
src1, 
src2, src3, src4, src5, src6, src7, out0, out1;
 
 1004     v16u8 dst0 = { 0 }, dst1 = { 0 };
 
 1006     v8u16 res0, res1, res2, res3;
 
 1007     v16i8 coeff_vec0 = __msa_fill_b(coeff0);
 
 1008     v16i8 coeff_vec1 = __msa_fill_b(coeff1);
 
 1009     v16u8 coeff_vec = (v16u8) __msa_ilvr_b(coeff_vec0, coeff_vec1);
 
 1021     DOTP_UB2_UH(src4, src6, coeff_vec, coeff_vec, res2, res3);
 
 1022     SLLI_4V(res0, res1, res2, res3, 3);
 
 1027     ST_W8(out0, out1, 0, 1, 2, 3, 0, 1, 2, 3, dst, 
stride);
 
 1036     } 
else if (4 == 
height) {
 
 1038     } 
else if (8 == 
height) {
 
 1047     uint64_t tp0, tp1, tp2, tp3;
 
 1049     v16u8 dst0 = { 0 }, dst1 = { 0 };
 
 1050     v8u16 res0, res1, res2, res3;
 
 1052     v16i8 coeff_vec0 = __msa_fill_b(coeff0);
 
 1053     v16i8 coeff_vec1 = __msa_fill_b(coeff1);
 
 1054     v16u8 coeff_vec = (v16u8) __msa_ilvr_b(coeff_vec0, coeff_vec1);
 
 1064                 coeff_vec, res0, res1, res2, res3);
 
 1065     SLLI_4V(res0, res1, res2, res3, 3);
 
 1077     uint64_t tp0, tp1, tp2, tp3;
 
 1079     v16u8 out0, out1, out2, out3;
 
 1080     v16u8 dst0 = { 0 }, dst1 = { 0 }, dst2 = { 0 }, dst3 = { 0 };
 
 1081     v8u16 res0, res1, res2, res3, res4, res5, res6, res7;
 
 1083     v16i8 coeff_vec0 = __msa_fill_b(coeff0);
 
 1084     v16i8 coeff_vec1 = __msa_fill_b(coeff1);
 
 1085     v16u8 coeff_vec = (v16u8) __msa_ilvr_b(coeff_vec0, coeff_vec1);
 
 1101                 coeff_vec, res0, res1, res2, res3);
 
 1102     DOTP_UB4_UH(src4, src5, src6, src7, coeff_vec, coeff_vec, coeff_vec,
 
 1103                 coeff_vec, res4, res5, res6, res7);
 
 1104     SLLI_4V(res0, res1, res2, res3, 3);
 
 1105     SLLI_4V(res4, res5, res6, res7, 3);
 
 1114     ST_D8(out0, out1, out2, out3, 0, 1, 0, 1, 0, 1, 0, 1, dst, 
stride);
 
 1123     } 
else if (8 == 
height) {
 
 1132     uint16_t out0, out1;
 
 1134     v16u8 dst_data = { 0 };
 
 1137     v16i8 coeff_vec0 = __msa_fill_b(coeff0);
 
 1138     v16i8 coeff_vec1 = __msa_fill_b(coeff1);
 
 1139     v16u8 coeff_vec = (v16u8) __msa_ilvr_b(coeff_vec0, coeff_vec1);
 
 1145     dst_data = (v16u8) __msa_insert_h((v8i16) dst_data, 0, out0);
 
 1146     dst_data = (v16u8) __msa_insert_h((v8i16) dst_data, 2, out1);
 
 1150     tmp0 = (v16i8) __msa_ilvr_d((v2i64) tmp1, (v2i64) tmp0);
 
 1151     res_r = __msa_dotp_u_h((v16u8) tmp0, coeff_vec);
 
 1153     res_r = (v8u16) __msa_srari_h((v8i16) res_r, 6);
 
 1154     res_r = __msa_sat_u_h(res_r, 7);
 
 1155     res = __msa_pckev_b((v16i8) res_r, (v16i8) res_r);
 
 1156     out = (v8i16) __msa_aver_u_b((v16u8) res, dst_data);
 
 1157     out0 = __msa_copy_u_h(
out, 0);
 
 1158     out1 = __msa_copy_u_h(
out, 2);
 
 1169     uint16_t tp0, tp1, tp2, tp3;
 
 1171     v16u8 tmp0, tmp1, tmp2, tmp3;
 
 1174     v16i8 coeff_vec0 = __msa_fill_b(coeff0);
 
 1175     v16i8 coeff_vec1 = __msa_fill_b(coeff1);
 
 1176     v16u8 coeff_vec = (v16u8) __msa_ilvr_b(coeff_vec0, coeff_vec1);
 
 1177     v16u8 dst_data = { 0 };
 
 1185     dst_data = (v16u8) __msa_insert_h((v8i16) dst_data, 0, tp0);
 
 1186     dst_data = (v16u8) __msa_insert_h((v8i16) dst_data, 1, tp1);
 
 1187     dst_data = (v16u8) __msa_insert_h((v8i16) dst_data, 2, tp2);
 
 1188     dst_data = (v16u8) __msa_insert_h((v8i16) dst_data, 3, tp3);
 
 1191                tmp0, tmp1, tmp2, tmp3);
 
 1192     ILVR_W2_UB(tmp1, tmp0, tmp3, tmp2, tmp0, tmp2);
 
 1194     tmp0 = (v16u8) __msa_ilvr_d((v2i64) tmp2, (v2i64) tmp0);
 
 1196     res_r = __msa_dotp_u_h(tmp0, coeff_vec);
 
 1198     res_r = (v8u16) __msa_srari_h((v8i16) res_r, 6);
 
 1199     res_r = __msa_sat_u_h(res_r, 7);
 
 1201     res = (v8i16) __msa_pckev_b((v16i8) res_r, (v16i8) res_r);
 
 1202     res = (v8i16) __msa_aver_u_b((v16u8) res, dst_data);
 
 1213     } 
else if (4 == 
height) {
 
 1222     uint32_t load0, load1;
 
 1224     v16u8 dst_data = { 0 };
 
 1227     v16i8 coeff_vec0 = __msa_fill_b(coeff0);
 
 1228     v16i8 coeff_vec1 = __msa_fill_b(coeff1);
 
 1229     v16u8 coeff_vec = (v16u8) __msa_ilvr_b(coeff_vec0, coeff_vec1);
 
 1238     tmp0 = (v16u8) __msa_ilvr_d((v2i64) tmp1, (v2i64) tmp0);
 
 1240     res_r = __msa_dotp_u_h(tmp0, coeff_vec);
 
 1242     res_r = (v8u16) __msa_srari_h((v8i16) res_r, 6);
 
 1243     res_r = __msa_sat_u_h(res_r, 7);
 
 1244     res = (v16u8) __msa_pckev_b((v16i8) res_r, (v16i8) res_r);
 
 1245     res = __msa_aver_u_b(res, dst_data);
 
 1254     uint32_t tp0, tp1, tp2, tp3;
 
 1256     v16u8 tmp0, tmp1, tmp2, tmp3;
 
 1258     v8u16 res0_r, res1_r;
 
 1260     v16i8 coeff_vec0 = __msa_fill_b(coeff0);
 
 1261     v16i8 coeff_vec1 = __msa_fill_b(coeff1);
 
 1262     v16u8 coeff_vec = (v16u8) __msa_ilvr_b(coeff_vec0, coeff_vec1);
 
 1267     ILVR_B4_UB(
src1, 
src0, 
src2, 
src1, src3, 
src2, src4, src3, tmp0, tmp1, tmp2,
 
 1269     ILVR_D2_UB(tmp1, tmp0, tmp3, tmp2, tmp0, tmp2);
 
 1270     DOTP_UB2_UH(tmp0, tmp2, coeff_vec, coeff_vec, res0_r, res1_r);
 
 1275     out = (v16u8) __msa_pckev_b((v16i8) res1_r, (v16i8) res0_r);
 
 1276     out = __msa_aver_u_b(
out, dst0);
 
 1284     uint32_t tp0, tp1, tp2, tp3;
 
 1285     v16u8 
src0, 
src1, 
src2, src3, src4, src5, src6, src7, src8;
 
 1286     v16u8 tmp0, tmp1, tmp2, tmp3, tmp4, tmp5, tmp6, tmp7, out0, out1;
 
 1287     v16u8 dst0 = { 0 }, dst1 = { 0 };
 
 1288     v8u16 res0, res1, res2, res3;
 
 1289     v16i8 coeff_vec0 = __msa_fill_b(coeff0);
 
 1290     v16i8 coeff_vec1 = __msa_fill_b(coeff1);
 
 1291     v16u8 coeff_vec = (v16u8) __msa_ilvr_b(coeff_vec0, coeff_vec1);
 
 1300     ILVR_B4_UB(
src1, 
src0, 
src2, 
src1, src3, 
src2, src4, src3, tmp0, tmp1, tmp2,
 
 1302     ILVR_B4_UB(src5, src4, src6, src5, src7, src6, src8, src7, tmp4, tmp5, tmp6,
 
 1304     ILVR_D2_UB(tmp1, tmp0, tmp3, tmp2, tmp0, tmp2);
 
 1305     ILVR_D2_UB(tmp5, tmp4, tmp7, tmp6, tmp4, tmp6);
 
 1306     DOTP_UB2_UH(tmp0, tmp2, coeff_vec, coeff_vec, res0, res1);
 
 1307     DOTP_UB2_UH(tmp4, tmp6, coeff_vec, coeff_vec, res2, res3);
 
 1308     SLLI_4V(res0, res1, res2, res3, 3);
 
 1313     ST_W8(out0, out1, 0, 1, 2, 3, 0, 1, 2, 3, dst, 
stride);
 
 1322     } 
else if (4 == 
height) {
 
 1324     } 
else if (8 == 
height) {
 
 1333     uint64_t tp0, tp1, tp2, tp3;
 
 1336     v8u16 res0, res1, res2, res3;
 
 1337     v16u8 dst0 = { 0 }, dst1 = { 0 };
 
 1338     v16i8 coeff_vec0 = __msa_fill_b(coeff0);
 
 1339     v16i8 coeff_vec1 = __msa_fill_b(coeff1);
 
 1340     v16u8 coeff_vec = (v16u8) __msa_ilvr_b(coeff_vec0, coeff_vec1);
 
 1349                 coeff_vec, res0, res1, res2, res3);
 
 1350     SLLI_4V(res0, res1, res2, res3, 3);
 
 1362     uint64_t tp0, tp1, tp2, tp3;
 
 1363     v16u8 
src0, 
src1, 
src2, src3, src4, src5, src6, src7, src8;
 
 1364     v16u8 out0, out1, out2, out3;
 
 1365     v16u8 dst0 = { 0 }, dst1 = { 0 }, dst2 = { 0 }, dst3 = { 0 };
 
 1366     v8u16 res0, res1, res2, res3, res4, res5, res6, res7;
 
 1367     v16i8 coeff_vec0 = __msa_fill_b(coeff0);
 
 1368     v16i8 coeff_vec1 = __msa_fill_b(coeff1);
 
 1369     v16u8 coeff_vec = (v16u8) __msa_ilvr_b(coeff_vec0, coeff_vec1);
 
 1382     ILVR_B4_UB(src5, src4, src6, src5, src7, src6, src8, src7,
 
 1383                src4, src5, src6, src7);
 
 1385                 coeff_vec, res0, res1, res2, res3);
 
 1386     DOTP_UB4_UH(src4, src5, src6, src7, coeff_vec, coeff_vec, coeff_vec,
 
 1387                 coeff_vec, res4, res5, res6, res7);
 
 1388     SLLI_4V(res0, res1, res2, res3, 3);
 
 1389     SLLI_4V(res4, res5, res6, res7, 3);
 
 1398     ST_D8(out0, out1, out2, out3, 0, 1, 0, 1, 0, 1, 0, 1, dst, 
stride);
 
 1407     } 
else if (8 == 
height) {
 
 1419     uint16_t out0, out1;
 
 1422     v8u16 res_hz0, res_hz1, res_vt0, res_vt1;
 
 1424     v16i8 coeff_hz_vec0 = __msa_fill_b(coef_hor0);
 
 1425     v16i8 coeff_hz_vec1 = __msa_fill_b(coef_hor1);
 
 1426     v16u8 coeff_hz_vec = (v16u8) __msa_ilvr_b(coeff_hz_vec0, coeff_hz_vec1);
 
 1427     v8u16 coeff_vt_vec0 = (v8u16) __msa_fill_h(coef_ver0);
 
 1428     v8u16 coeff_vt_vec1 = (v8u16) __msa_fill_h(coef_ver1);
 
 1435     dst0 = (v16u8) __msa_insert_h((v8i16) dst0, 0, out0);
 
 1436     dst0 = (v16u8) __msa_insert_h((v8i16) dst0, 1, out1);
 
 1439     MUL2(res_hz0, coeff_vt_vec1, res_hz1, coeff_vt_vec0, res_vt0, res_vt1);
 
 1442     res_vt0 = (v8u16) __msa_srari_h((v8i16) res_vt0, 6);
 
 1443     res_vt0 = __msa_sat_u_h(res_vt0, 7);
 
 1444     res = __msa_pckev_b((v16i8) res_vt0, (v16i8) res_vt0);
 
 1445     dst0 = __msa_aver_u_b((v16u8) res, dst0);
 
 1446     out0 = __msa_copy_u_h((v8i16) dst0, 0);
 
 1447     out1 = __msa_copy_u_h((v8i16) dst0, 1);
 
 1461     uint16_t tp0, tp1, tp2, tp3;
 
 1463     v16u8 tmp0, tmp1, tmp2, tmp3;
 
 1465     v8u16 res_hz0, res_hz1, res_vt0, res_vt1;
 
 1467     v16i8 coeff_hz_vec0 = __msa_fill_b(coef_hor0);
 
 1468     v16i8 coeff_hz_vec1 = __msa_fill_b(coef_hor1);
 
 1469     v16u8 coeff_hz_vec = (v16u8) __msa_ilvr_b(coeff_hz_vec0, coeff_hz_vec1);
 
 1470     v8u16 coeff_vt_vec0 = (v8u16) __msa_fill_h(coef_ver0);
 
 1471     v8u16 coeff_vt_vec1 = (v8u16) __msa_fill_h(coef_ver1);
 
 1480     dst0 = (v16u8) __msa_insert_h((v8i16) dst0, 0, tp0);
 
 1481     dst0 = (v16u8) __msa_insert_h((v8i16) dst0, 1, tp1);
 
 1482     dst0 = (v16u8) __msa_insert_h((v8i16) dst0, 2, tp2);
 
 1483     dst0 = (v16u8) __msa_insert_h((v8i16) dst0, 3, tp3);
 
 1488     MUL2(res_hz0, coeff_vt_vec1, res_hz1, coeff_vt_vec0, res_vt0, res_vt1);
 
 1491     res_vt0 = (v8u16) __msa_srari_h((v8i16) res_vt0, 6);
 
 1492     res_vt0 = __msa_sat_u_h(res_vt0, 7);
 
 1493     res = __msa_pckev_b((v16i8) res_vt0, (v16i8) res_vt0);
 
 1494     dst0 = __msa_aver_u_b((v16u8) res, dst0);
 
 1509                                            coef_hor1, coef_ver0, coef_ver1);
 
 1510     } 
else if (4 == 
height) {
 
 1512                                            coef_hor1, coef_ver0, coef_ver1);
 
 1525     v16u8 dst0, dst_data = { 0 };
 
 1526     v8u16 res_hz0, res_hz1, res_vt0, res_vt1;
 
 1528     v16i8 coeff_hz_vec0 = __msa_fill_b(coef_hor0);
 
 1529     v16i8 coeff_hz_vec1 = __msa_fill_b(coef_hor1);
 
 1530     v16u8 coeff_hz_vec = (v16u8) __msa_ilvr_b(coeff_hz_vec0, coeff_hz_vec1);
 
 1531     v8u16 coeff_vt_vec0 = (v8u16) __msa_fill_h(coef_ver0);
 
 1532     v8u16 coeff_vt_vec1 = (v8u16) __msa_fill_h(coef_ver1);
 
 1541     MUL2(res_hz0, coeff_vt_vec1, res_hz1, coeff_vt_vec0, res_vt0, res_vt1);
 
 1544     res_vt0 = (v8u16) __msa_srari_h((v8i16) res_vt0, 6);
 
 1545     res_vt0 = __msa_sat_u_h(res_vt0, 7);
 
 1546     dst0 = (v16u8) __msa_pckev_b((v16i8) res_vt0, (v16i8) res_vt0);
 
 1547     dst0 = __msa_aver_u_b(dst0, dst_data);
 
 1559     uint32_t tp0, tp1, tp2, tp3;
 
 1561     v16u8 
out, dst_data = { 0 };
 
 1562     v8u16 res_hz0, res_hz1, res_hz2, res_hz3;
 
 1563     v8u16 res_vt0, res_vt1, res_vt2, res_vt3;
 
 1565     v16i8 coeff_hz_vec0 = __msa_fill_b(coef_hor0);
 
 1566     v16i8 coeff_hz_vec1 = __msa_fill_b(coef_hor1);
 
 1567     v16u8 coeff_hz_vec = (v16u8) __msa_ilvr_b(coeff_hz_vec0, coeff_hz_vec1);
 
 1568     v8u16 coeff_vt_vec0 = (v8u16) __msa_fill_h(coef_ver0);
 
 1569     v8u16 coeff_vt_vec1 = (v8u16) __msa_fill_h(coef_ver1);
 
 1579                 coeff_hz_vec, coeff_hz_vec, res_hz0, res_hz1, res_hz2,
 
 1581     MUL4(res_hz0, coeff_vt_vec1, res_hz1, coeff_vt_vec0, res_hz2, coeff_vt_vec1,
 
 1582          res_hz3, coeff_vt_vec0, res_vt0, res_vt1, res_vt2, res_vt3);
 
 1583     ADD2(res_vt0, res_vt1, res_vt2, res_vt3, res_vt0, res_vt1);
 
 1586     out = (v16u8) __msa_pckev_b((v16i8) res_vt1, (v16i8) res_vt0);
 
 1587     out = __msa_aver_u_b(
out, dst_data);
 
 1598     uint32_t tp0, tp1, tp2, tp3;
 
 1599     v16u8 
src0, 
src1, 
src2, src3, src4, src5, src6, src7, src8, res0, res1;
 
 1600     v16u8 dst0 = { 0 }, dst1 = { 0 };
 
 1601     v8u16 res_hz0, res_hz1, res_hz2, res_hz3, res_hz4, res_hz5, res_hz6, res_hz7;
 
 1602     v8u16 res_vt0, res_vt1, res_vt2, res_vt3, res_vt4, res_vt5, res_vt6, res_vt7;
 
 1604     v16i8 coeff_hz_vec0 = __msa_fill_b(coef_hor0);
 
 1605     v16i8 coeff_hz_vec1 = __msa_fill_b(coef_hor1);
 
 1606     v16u8 coeff_hz_vec = (v16u8) __msa_ilvr_b(coeff_hz_vec0, coeff_hz_vec1);
 
 1607     v8u16 coeff_vt_vec0 = (v8u16) __msa_fill_h(coef_ver0);
 
 1608     v8u16 coeff_vt_vec1 = (v8u16) __msa_fill_h(coef_ver1);
 
 1624                 coeff_hz_vec, coeff_hz_vec, res_hz0, res_hz1, res_hz2, res_hz3);
 
 1625     DOTP_UB4_UH(src4, src5, src6, src7, coeff_hz_vec, coeff_hz_vec,
 
 1626                 coeff_hz_vec, coeff_hz_vec, res_hz4, res_hz5, res_hz6, res_hz7);
 
 1627     MUL4(res_hz0, coeff_vt_vec1, res_hz1, coeff_vt_vec0, res_hz2, coeff_vt_vec1,
 
 1628          res_hz3, coeff_vt_vec0, res_vt0, res_vt1, res_vt2, res_vt3);
 
 1629     MUL4(res_hz4, coeff_vt_vec1, res_hz5, coeff_vt_vec0, res_hz6, coeff_vt_vec1,
 
 1630          res_hz7, coeff_vt_vec0, res_vt4, res_vt5, res_vt6, res_vt7);
 
 1631     ADD2(res_vt0, res_vt1, res_vt2, res_vt3, res_vt0, res_vt1);
 
 1632     ADD2(res_vt4, res_vt5, res_vt6, res_vt7, res_vt2, res_vt3);
 
 1633     SRARI_H4_UH(res_vt0, res_vt1, res_vt2, res_vt3, 6);
 
 1634     SAT_UH4_UH(res_vt0, res_vt1, res_vt2, res_vt3, 7);
 
 1635     PCKEV_B2_UB(res_vt1, res_vt0, res_vt3, res_vt2, res0, res1);
 
 1637     ST_W8(res0, res1, 0, 1, 2, 3, 0, 1, 2, 3, dst, 
stride);
 
 1650                                            coef_hor1, coef_ver0, coef_ver1);
 
 1651     } 
else if (4 == 
height) {
 
 1653                                            coef_hor1, coef_ver0, coef_ver1);
 
 1654     } 
else if (8 == 
height) {
 
 1656                                            coef_hor1, coef_ver0, coef_ver1);
 
 1667     uint64_t tp0, tp1, tp2, tp3;
 
 1669     v8u16 res_hz0, res_hz1, res_hz2;
 
 1670     v8u16 res_hz3, res_hz4;
 
 1671     v8u16 res_vt0, res_vt1, res_vt2, res_vt3;
 
 1672     v16u8 dst0 = { 0 }, dst1 = { 0 };
 
 1674     v16i8 coeff_hz_vec0 = __msa_fill_b(coef_hor0);
 
 1675     v16i8 coeff_hz_vec1 = __msa_fill_b(coef_hor1);
 
 1676     v16u8 coeff_hz_vec = (v16u8) __msa_ilvr_b(coeff_hz_vec0, coeff_hz_vec1);
 
 1677     v8u16 coeff_vt_vec0 = (v8u16) __msa_fill_h(coef_ver0);
 
 1678     v8u16 coeff_vt_vec1 = (v8u16) __msa_fill_h(coef_ver1);
 
 1685     res_hz0 = __msa_dotp_u_h(
src0, coeff_hz_vec);
 
 1694                 coeff_hz_vec, coeff_hz_vec, res_hz1, res_hz2, res_hz3, res_hz4);
 
 1695     MUL4(res_hz1, coeff_vt_vec0, res_hz2, coeff_vt_vec0, res_hz3, coeff_vt_vec0,
 
 1696          res_hz4, coeff_vt_vec0, res_vt0, res_vt1, res_vt2, res_vt3);
 
 1697     res_vt0 += (res_hz0 * coeff_vt_vec1);
 
 1698     res_vt1 += (res_hz1 * coeff_vt_vec1);
 
 1699     res_vt2 += (res_hz2 * coeff_vt_vec1);
 
 1700     res_vt3 += (res_hz3 * coeff_vt_vec1);
 
 1701     SRARI_H4_UH(res_vt0, res_vt1, res_vt2, res_vt3, 6);
 
 1702     SAT_UH4_UH(res_vt0, res_vt1, res_vt2, res_vt3, 7);
 
 1703     PCKEV_B2_UB(res_vt1, res_vt0, res_vt3, res_vt2, out0, out1);
 
 1715     uint64_t tp0, tp1, tp2, tp3;
 
 1716     v16u8 
src0, 
src1, 
src2, src3, src4, src5, src6, src7, src8;
 
 1717     v16u8 out0, out1, out2, out3;
 
 1718     v16u8 dst0 = { 0 }, dst1 = { 0 }, dst2 = { 0 }, dst3 = { 0 };
 
 1719     v8u16 res_hz0, res_hz1, res_hz2, res_hz3, res_hz4;
 
 1720     v8u16 res_hz5, res_hz6, res_hz7, res_hz8;
 
 1721     v8u16 res_vt0, res_vt1, res_vt2, res_vt3;
 
 1722     v8u16 res_vt4, res_vt5, res_vt6, res_vt7;
 
 1724     v16i8 coeff_hz_vec0 = __msa_fill_b(coef_hor0);
 
 1725     v16i8 coeff_hz_vec1 = __msa_fill_b(coef_hor1);
 
 1726     v16u8 coeff_hz_vec = (v16u8) __msa_ilvr_b(coeff_hz_vec0, coeff_hz_vec1);
 
 1727     v8u16 coeff_vt_vec0 = (v8u16) __msa_fill_h(coef_ver0);
 
 1728     v8u16 coeff_vt_vec1 = (v8u16) __msa_fill_h(coef_ver1);
 
 1740     res_hz0 = __msa_dotp_u_h(
src0, coeff_hz_vec);
 
 1742                 coeff_hz_vec, coeff_hz_vec, res_hz1, res_hz2, res_hz3,
 
 1744     DOTP_UB4_UH(src5, src6, src7, src8, coeff_hz_vec, coeff_hz_vec,
 
 1745                 coeff_hz_vec, coeff_hz_vec, res_hz5, res_hz6, res_hz7, res_hz8);
 
 1746     MUL4(res_hz1, coeff_vt_vec0, res_hz2, coeff_vt_vec0, res_hz3,
 
 1747          coeff_vt_vec0, res_hz4, coeff_vt_vec0, res_vt0, res_vt1, res_vt2,
 
 1749     MUL4(res_hz5, coeff_vt_vec0, res_hz6, coeff_vt_vec0, res_hz7,
 
 1750          coeff_vt_vec0, res_hz8, coeff_vt_vec0, res_vt4, res_vt5, res_vt6,
 
 1758     res_vt0 += (res_hz0 * coeff_vt_vec1);
 
 1759     res_vt1 += (res_hz1 * coeff_vt_vec1);
 
 1760     res_vt2 += (res_hz2 * coeff_vt_vec1);
 
 1761     res_vt3 += (res_hz3 * coeff_vt_vec1);
 
 1762     res_vt4 += (res_hz4 * coeff_vt_vec1);
 
 1763     res_vt5 += (res_hz5 * coeff_vt_vec1);
 
 1764     res_vt6 += (res_hz6 * coeff_vt_vec1);
 
 1765     res_vt7 += (res_hz7 * coeff_vt_vec1);
 
 1766     SRARI_H4_UH(res_vt0, res_vt1, res_vt2, res_vt3, 6);
 
 1767     SRARI_H4_UH(res_vt4, res_vt5, res_vt6, res_vt7, 6);
 
 1768     SAT_UH4_UH(res_vt0, res_vt1, res_vt2, res_vt3, 7);
 
 1769     SAT_UH4_UH(res_vt4, res_vt5, res_vt6, res_vt7, 7);
 
 1770     PCKEV_B2_UB(res_vt1, res_vt0, res_vt3, res_vt2, out0, out1);
 
 1771     PCKEV_B2_UB(res_vt5, res_vt4, res_vt7, res_vt6, out2, out3);
 
 1774     ST_D8(out0, out1, out2, out3, 0, 1, 0, 1, 0, 1, 0, 1, dst, 
stride);
 
 1787                                            coef_hor1, coef_ver0, coef_ver1);
 
 1788     } 
else if (8 == 
height) {
 
 1790                                            coef_hor1, coef_ver0, coef_ver1);
 
 1797     uint32_t tp0, tp1, tp2, tp3, tp4, tp5, tp6, tp7;
 
 1806     } 
else if (4 == 
height) {
 
 1809     } 
else if (2 == 
height) {
 
 1820     uint64_t 
src0, 
src1, 
src2, src3, src4, src5, src6, src7;
 
 1828         SD4(src4, src5, src6, src7, dst, 
stride);
 
 1829     } 
else if (4 == 
height) {
 
 1838     uint32_t tp0, tp1, tp2, tp3;
 
 1839     v16u8 
src0 = { 0 }, 
src1 = { 0 }, dst0 = { 0 }, dst1 = { 0 };
 
 1852         ST_W8(dst0, dst1, 0, 1, 2, 3, 0, 1, 2, 3, dst, 
stride);
 
 1853     } 
else if (4 == 
height) {
 
 1858         dst0 = __msa_aver_u_b(
src0, dst0);
 
 1860     } 
else if (2 == 
height) {
 
 1865         dst0 = __msa_aver_u_b(
src0, dst0);
 
 1873     uint64_t tp0, tp1, tp2, tp3, tp4, tp5, tp6, tp7;
 
 1874     v16u8 
src0 = { 0 }, 
src1 = { 0 }, 
src2 = { 0 }, src3 = { 0 };
 
 1875     v16u8 dst0 = { 0 }, dst1 = { 0 }, dst2 = { 0 }, dst3 = { 0 };
 
 1891         AVER_UB4_UB(
src0, dst0, 
src1, dst1, 
src2, dst2, src3, dst3, dst0, dst1,
 
 1893         ST_D8(dst0, dst1, dst2, dst3, 0, 1, 0, 1, 0, 1, 0, 1, dst, 
stride);
 
 1894     } 
else if (4 == 
height) {
 
 1909     av_assert2(x < 8 && y < 8 && x >= 0 && y >= 0);
 
 1925     av_assert2(x < 8 && y < 8 && x >= 0 && y >= 0);
 
 1943     av_assert2(x < 8 && y < 8 && x >= 0 && y >= 0);
 
 1952         for (cnt = 
height; cnt--;) {
 
 1953             *((uint16_t *) dst) = *((uint16_t *) 
src);
 
 1964     av_assert2(x < 8 && y < 8 && x >= 0 && y >= 0);
 
 1982     av_assert2(x < 8 && y < 8 && x >= 0 && y >= 0);
 
 2001     av_assert2(x < 8 && y < 8 && x >= 0 && y >= 0);
 
 2011         for (cnt = 
height; cnt--;) {
 
 2012             dst[0] = (dst[0] + 
src[0] + 1) >> 1;
 
 2013             dst[1] = (dst[1] + 
src[1] + 1) >> 1;