28     uint32_t tp0, tp1, offset_val;
 
   31     v8i16 src0_r, tmp0, wgt, denom, offset;
 
   33     offset_val = (unsigned) offset_in << log2_denom;
 
   35     wgt = __msa_fill_h(src_weight);
 
   36     offset = __msa_fill_h(offset_val);
 
   37     denom = __msa_fill_h(log2_denom);
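    /* Explicit weighted prediction: offset is pre-shifted left by log2_denom
     * so that src * wgt + offset can be rounded-shifted right by denom in a
     * single __msa_srlr_h, then clamped to [0, 255] with maxi 0 and an 8-bit
     * unsigned saturate; this is equivalent to
     * clip_uint8(((src * weight + 2^(log2_denom-1)) >> log2_denom) + offset). */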
 
   41     src0_r = (v8i16) __msa_ilvr_b((v16i8) zero, (v16i8) src0);

   43     tmp0 = __msa_adds_s_h(tmp0, offset);
 
   44     tmp0 = __msa_maxi_s_h(tmp0, 0);
 
   45     tmp0 = __msa_srlr_h(tmp0, denom);
 
   46     tmp0 = (v8i16) __msa_sat_u_h((v8u16) tmp0, 7);
 
   47     src0 = (v16u8) __msa_pckev_b((v16i8) tmp0, (v16i8) tmp0);
 
   55     uint32_t tp0, tp1, tp2, tp3, offset_val;
 
   57     v8i16 src0_r, src1_r, tmp0, tmp1, wgt, denom, offset;
 
   59     offset_val = (unsigned) offset_in << log2_denom;
 
   61     wgt = __msa_fill_h(src_weight);
 
   62     offset = __msa_fill_h(offset_val);
 
   63     denom = __msa_fill_h(log2_denom);
 
   68     MUL2(wgt, src0_r, wgt, src1_r, tmp0, tmp1);
 
   71     tmp0 = __msa_srlr_h(tmp0, denom);
 
   72     tmp1 = __msa_srlr_h(tmp1, denom);
 
   74     src0 = (v16u8) __msa_pckev_b((v16i8) tmp1, (v16i8) tmp0);
 
   82     uint32_t tp0, tp1, tp2, tp3, offset_val;
 
   84     v8i16 src0_r, src1_r, src2_r, src3_r, tmp0, tmp1, tmp2, tmp3;
 
   87     offset_val = (unsigned) offset_in << log2_denom;
 
   89     wgt = __msa_fill_h(src_weight);
 
   90     offset = __msa_fill_h(offset_val);
 
   91     denom = __msa_fill_h(log2_denom);
 
   99     MUL4(wgt, src0_r, wgt, src1_r, wgt, src2_r, wgt, src3_r, tmp0, tmp1, tmp2,
 
  107     ST_W8(src0, src1, 0, 1, 2, 3, 0, 1, 2, 3, data, stride);
 
  115     uint64_t tp0, tp1, tp2, tp3;
 
  117     v8i16 src0_r, src1_r, src2_r, src3_r, tmp0, tmp1, tmp2, tmp3;
 
  120     offset_val = (unsigned) offset_in << log2_denom;
 
  122     wgt = __msa_fill_h(src_weight);
 
  123     offset = __msa_fill_h(offset_val);
 
  124     denom = __msa_fill_h(log2_denom);
 
  131     MUL4(wgt, src0_r, wgt, src1_r, wgt, src2_r, wgt, src3_r, tmp0, tmp1, tmp2,
 
  146     uint64_t tp0, tp1, tp2, tp3;
 
  147     v16u8 src0 = { 0 }, src1 = { 0 }, src2 = { 0 }, src3 = { 0 };
 
  148     v8i16 src0_r, src1_r, src2_r, src3_r, src4_r, src5_r, src6_r, src7_r;
 
  149     v8i16 tmp0, tmp1, tmp2, tmp3, tmp4, tmp5, tmp6, tmp7;
 
  152     offset_val = (unsigned) offset_in << log2_denom;
 
  154     wgt = __msa_fill_h(src_weight);
 
  155     offset = __msa_fill_h(offset_val);
 
  156     denom = __msa_fill_h(log2_denom);
 
  168     MUL4(wgt, src0_r, wgt, src1_r, wgt, src2_r, wgt, src3_r, tmp0, tmp1, tmp2,
 
  170     MUL4(wgt, src4_r, wgt, src5_r, wgt, src6_r, wgt, src7_r, tmp4, tmp5, tmp6,
 
  176     MAXI_SH8_SH(tmp0, tmp1, tmp2, tmp3, tmp4, tmp5, tmp6, tmp7, 0);
 
  177     SRLR_H8_SH(tmp0, tmp1, tmp2, tmp3, tmp4, tmp5, tmp6, tmp7, denom);
 
  178     SAT_UH8_SH(tmp0, tmp1, tmp2, tmp3, tmp4, tmp5, tmp6, tmp7, 7);
 
  179     PCKEV_B4_UB(tmp1, tmp0, tmp3, tmp2, tmp5, tmp4, tmp7, tmp6, src0, src1,
 
  181     ST_D8(src0, src1, src2, src3, 0, 1, 0, 1, 0, 1, 0, 1, data, stride);
 
  188     uint32_t offset_val, cnt;
 
  189     uint64_t tp0, tp1, tp2, tp3;
 
  190     v16u8 src0 = { 0 }, src1 = { 0 }, src2 = { 0 }, src3 = { 0 };
 
  191     v8i16 src0_r, src1_r, src2_r, src3_r, src4_r, src5_r, src6_r, src7_r;
 
  192     v8i16 tmp0, tmp1, tmp2, tmp3, tmp4, tmp5, tmp6, tmp7;
 
  195     offset_val = (unsigned) offset_in << log2_denom;
 
  197     wgt = __msa_fill_h(src_weight);
 
  198     offset = __msa_fill_h(offset_val);
 
  199     denom = __msa_fill_h(log2_denom);
 
  201     for (cnt = 2; cnt--;) {
 
  212         MUL4(wgt, src0_r, wgt, src1_r, wgt, src2_r, wgt, src3_r, tmp0, tmp1,
 
  214         MUL4(wgt, src4_r, wgt, src5_r, wgt, src6_r, wgt, src7_r, tmp4, tmp5,
 
  217                     tmp0, tmp1, tmp2, tmp3);
 
  219                     tmp4, tmp5, tmp6, tmp7);
 
  220         MAXI_SH8_SH(tmp0, tmp1, tmp2, tmp3, tmp4, tmp5, tmp6, tmp7, 0);
 
  221         SRLR_H8_SH(tmp0, tmp1, tmp2, tmp3, tmp4, tmp5, tmp6, tmp7, denom);
 
  222         SAT_UH8_SH(tmp0, tmp1, tmp2, tmp3, tmp4, tmp5, tmp6, tmp7, 7);
 
  223         PCKEV_B4_UB(tmp1, tmp0, tmp3, tmp2, tmp5, tmp4, tmp7, tmp6, src0, src1,
 
  225         ST_D8(src0, src1, src2, src3, 0, 1, 0, 1, 0, 1, 0, 1, data, stride);
 
  235     v16i8 src_wgt, dst_wgt, wgt, vec0;
 
  236     v16u8 src0 = { 0 }, dst0 = { 0 };

  237     v8i16 tmp0, denom, offset, max255 = __msa_ldi_h(255);
 
  239     offset_in = (unsigned) ((offset_in + 1) | 1) << log2_denom;
 
  240     offset_in += (128 * (src_weight + dst_weight));
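    /* Bi-weighted prediction: offset_in becomes the combined rounding term
     * ((offset + 1) | 1) << log2_denom, and the extra
     * 128 * (src_weight + dst_weight) presumably re-biases pixels that the
     * signed dot product __msa_dpadd_s_h consumes in (value - 128) form; each
     * 16-bit lane then accumulates src * src_weight + dst * dst_weight on top
     * of offset before the shift by denom (log2_denom + 1) and the clamp to
     * [0, 255]. */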
 
  242     src_wgt = __msa_fill_b(src_weight);
 
  243     dst_wgt = __msa_fill_b(dst_weight);
 
  244     offset = __msa_fill_h(offset_in);
 
  245     denom = __msa_fill_h(log2_denom + 1);
 
  247     wgt = __msa_ilvev_b(dst_wgt, src_wgt);
 
  254     vec0 = (v16i8) __msa_ilvr_b((v16i8) dst0, (v16i8) src0);

  255     tmp0 = __msa_dpadd_s_h(offset, wgt, vec0);
 
  257     tmp0 = __msa_maxi_s_h(tmp0, 0);
 
  258     tmp0 = __msa_min_s_h(max255, tmp0);
 
  259     dst0 = (v16u8) __msa_pckev_b((v16i8) tmp0, (v16i8) tmp0);
 
  267     uint32_t tp0, tp1, tp2, tp3;
 
  268     v16i8 src_wgt, dst_wgt, wgt, vec0, vec1;
 
  270     v8i16 tmp0, tmp1, denom, offset;
 
  272     offset_in = (unsigned) ((offset_in + 1) | 1) << log2_denom;
 
  273     offset_in += (128 * (src_weight + dst_weight));
 
  275     src_wgt = __msa_fill_b(src_weight);
 
  276     dst_wgt = __msa_fill_b(dst_weight);
 
  277     offset = __msa_fill_h(offset_in);
 
  278     denom = __msa_fill_h(log2_denom + 1);
 
  280     wgt = __msa_ilvev_b(dst_wgt, src_wgt);
 
  288     tmp0 = __msa_dpadd_s_h(offset, wgt, vec0);

  289     tmp1 = __msa_dpadd_s_h(offset, wgt, vec1);
 
  293     dst0 = (v16u8) __msa_pckev_b((v16i8) tmp1, (v16i8) tmp0);
 
  301     uint32_t tp0, tp1, tp2, tp3;
 
  302     v16i8 src_wgt, dst_wgt, wgt, vec0, vec1, vec2, vec3;
 
  304     v8i16 tmp0, tmp1, tmp2, tmp3, denom, offset;
 
  306     offset_in = (unsigned) ((offset_in + 1) | 1) << log2_denom;
 
  307     offset_in += (128 * (src_weight + dst_weight));
 
  309     src_wgt = __msa_fill_b(src_weight);
 
  310     dst_wgt = __msa_fill_b(dst_weight);
 
  311     offset = __msa_fill_h(offset_in);
 
  312     denom = __msa_fill_h(log2_denom + 1);
 
  313     wgt = __msa_ilvev_b(dst_wgt, src_wgt);
 
  327     tmp0 = __msa_dpadd_s_h(offset, wgt, vec0);

  328     tmp1 = __msa_dpadd_s_h(offset, wgt, vec1);

  329     tmp2 = __msa_dpadd_s_h(offset, wgt, vec2);

  330     tmp3 = __msa_dpadd_s_h(offset, wgt, vec3);
 
  331     SRA_4V(tmp0, tmp1, tmp2, tmp3, denom);
 
  334     ST_W8(dst0, dst1, 0, 1, 2, 3, 0, 1, 2, 3, dst, stride);
 
  341     uint64_t tp0, tp1, tp2, tp3;
 
  342     v16i8 src_wgt, dst_wgt, wgt, vec0, vec1, vec2, vec3;
 
  344     v8i16 tmp0, tmp1, tmp2, tmp3, denom, offset;
 
  346     offset_in = (unsigned) ((offset_in + 1) | 1) << log2_denom;
 
  347     offset_in += (128 * (src_weight + dst_weight));
 
  349     src_wgt = __msa_fill_b(src_weight);
 
  350     dst_wgt = __msa_fill_b(dst_weight);
 
  351     offset = __msa_fill_h(offset_in);
 
  352     denom = __msa_fill_h(log2_denom + 1);
 
  354     wgt = __msa_ilvev_b(dst_wgt, src_wgt);
 
  365     tmp0 = __msa_dpadd_s_h(offset, wgt, vec0);

  366     tmp1 = __msa_dpadd_s_h(offset, wgt, vec1);

  367     tmp2 = __msa_dpadd_s_h(offset, wgt, vec2);

  368     tmp3 = __msa_dpadd_s_h(offset, wgt, vec3);
 
  369     SRA_4V(tmp0, tmp1, tmp2, tmp3, denom);
 
  379     uint64_t tp0, tp1, tp2, tp3;
 
  380     v16i8 src_wgt, dst_wgt, wgt, vec0, vec1, vec2, vec3, vec4, vec5, vec6, vec7;
 
  382     v8i16 tmp0, tmp1, tmp2, tmp3, tmp4, tmp5, tmp6, tmp7, denom, offset;
 
  384     offset_in = (unsigned) ((offset_in + 1) | 1) << log2_denom;
 
  385     offset_in += (128 * (src_weight + dst_weight));
 
  387     src_wgt = __msa_fill_b(src_weight);
 
  388     dst_wgt = __msa_fill_b(dst_weight);
 
  389     offset = __msa_fill_h(offset_in);
 
  390     denom = __msa_fill_h(log2_denom + 1);
 
  391     wgt = __msa_ilvev_b(dst_wgt, src_wgt);
 
  410     tmp0 = __msa_dpadd_s_h(offset, wgt, vec0);

  411     tmp1 = __msa_dpadd_s_h(offset, wgt, vec1);

  412     tmp2 = __msa_dpadd_s_h(offset, wgt, vec2);

  413     tmp3 = __msa_dpadd_s_h(offset, wgt, vec3);

  414     tmp4 = __msa_dpadd_s_h(offset, wgt, vec4);

  415     tmp5 = __msa_dpadd_s_h(offset, wgt, vec5);

  416     tmp6 = __msa_dpadd_s_h(offset, wgt, vec6);

  417     tmp7 = __msa_dpadd_s_h(offset, wgt, vec7);
 
  418     SRA_4V(tmp0, tmp1, tmp2, tmp3, denom);
 
  419     SRA_4V(tmp4, tmp5, tmp6, tmp7, denom);
 
  423     ST_D8(dst0, dst1, dst2, dst3, 0, 1, 0, 1, 0, 1, 0, 1, dst, stride);
 
  431     uint64_t tp0, tp1, tp2, tp3;
 
  432     v16i8 src_wgt, dst_wgt, wgt;
 
  434     v16u8 dst0, dst1, dst2, dst3;
 
  435     v16i8 vec0, vec1, vec2, vec3, vec4, vec5, vec6, vec7;
 
  436     v8i16 temp0, temp1, temp2, temp3, temp4, temp5, temp6, temp7;
 
  439     offset_in = (unsigned) ((offset_in + 1) | 1) << log2_denom;
 
  440     offset_in += (128 * (src_weight + dst_weight));
 
  442     src_wgt = __msa_fill_b(src_weight);
 
  443     dst_wgt = __msa_fill_b(dst_weight);
 
  444     offset = __msa_fill_h(offset_in);
 
  445     denom = __msa_fill_h(log2_denom + 1);
 
  446     wgt = __msa_ilvev_b(dst_wgt, src_wgt);
 
  448     for (cnt = 2; cnt--;) {
 
  466                    vec0, vec2, vec4, vec6);
 
  468                    vec1, vec3, vec5, vec7);
 
  470         temp0 = __msa_dpadd_s_h(offset, wgt, vec0);

  471         temp1 = __msa_dpadd_s_h(offset, wgt, vec1);

  472         temp2 = __msa_dpadd_s_h(offset, wgt, vec2);

  473         temp3 = __msa_dpadd_s_h(offset, wgt, vec3);

  474         temp4 = __msa_dpadd_s_h(offset, wgt, vec4);

  475         temp5 = __msa_dpadd_s_h(offset, wgt, vec5);

  476         temp6 = __msa_dpadd_s_h(offset, wgt, vec6);

  477         temp7 = __msa_dpadd_s_h(offset, wgt, vec7);
 
  479         SRA_4V(temp0, temp1, temp2, temp3, denom);
 
  480         SRA_4V(temp4, temp5, temp6, temp7, denom);
 
  481         CLIP_SH8_0_255(temp0, temp1, temp2, temp3, temp4, temp5, temp6, temp7);
 
  482         PCKEV_B4_UB(temp1, temp0, temp3, temp2, temp5, temp4, temp7, temp6,
 
  483                     dst0, dst1, dst2, dst3);
 
  484         ST_D8(dst0, dst1, dst2, dst3, 0, 1, 0, 1, 0, 1, 0, 1, dst, stride);
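/* The macro below evaluates the H.264 strong (bS = 4) filter for one side of
 * the edge (the q side reuses it with mirrored arguments):
 *   p0' = (p2 + 2*p1 + 2*p0 + 2*q0 + q1 + 4) >> 3
 *   p1' = (p2 + p1 + p0 + q0 + 2) >> 2
 *   p2' = (2*p3 + 3*p2 + p1 + p0 + q0 + 4) >> 3 */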
 
  489 #define AVC_LPF_P0P1P2_OR_Q0Q1Q2(p3_or_q3_org_in, p0_or_q0_org_in,          \ 
  490                                  q3_or_p3_org_in, p1_or_q1_org_in,          \ 
  491                                  p2_or_q2_org_in, q1_or_p1_org_in,          \ 
  492                                  p0_or_q0_out, p1_or_q1_out, p2_or_q2_out)  \ 
  495     v8i16 const3 = __msa_ldi_h(3);                                          \ 
  497     threshold = (p0_or_q0_org_in) + (q3_or_p3_org_in);                      \ 
  498     threshold += (p1_or_q1_org_in);                                         \ 
  500     (p0_or_q0_out) = threshold << 1;                                        \ 
  501     (p0_or_q0_out) += (p2_or_q2_org_in);                                    \ 
  502     (p0_or_q0_out) += (q1_or_p1_org_in);                                    \ 
  503     (p0_or_q0_out) = __msa_srari_h((p0_or_q0_out), 3);                      \ 
  505     (p1_or_q1_out) = (p2_or_q2_org_in) + threshold;                         \ 
  506     (p1_or_q1_out) = __msa_srari_h((p1_or_q1_out), 2);                      \ 
  508     (p2_or_q2_out) = (p2_or_q2_org_in) * const3;                            \ 
  509     (p2_or_q2_out) += (p3_or_q3_org_in);                                    \ 
  510     (p2_or_q2_out) += (p3_or_q3_org_in);                                    \ 
  511     (p2_or_q2_out) += threshold;                                            \ 
  512     (p2_or_q2_out) = __msa_srari_h((p2_or_q2_out), 3);                      \ 
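/* Fallback p0 (or q0) value when the p2/q2 threshold test of the strong
 * filter fails: p0' = (2*p1 + p0 + q1 + 2) >> 2. */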
  516 #define AVC_LPF_P0_OR_Q0(p0_or_q0_org_in, q1_or_p1_org_in,   \ 
  517                          p1_or_q1_org_in, p0_or_q0_out)      \ 
  519     (p0_or_q0_out) = (p0_or_q0_org_in) + (q1_or_p1_org_in);  \ 
  520     (p0_or_q0_out) += (p1_or_q1_org_in);                     \ 
  521     (p0_or_q0_out) += (p1_or_q1_org_in);                     \ 
  522     (p0_or_q0_out) = __msa_srari_h((p0_or_q0_out), 2);       \ 
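/* p1/q1 update of the normal (bS < 4) filter:
 * p1' = p1 + clip3(-tc, tc, (p2 + ((p0 + q0 + 1) >> 1) - 2*p1) >> 1). */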
  525 #define AVC_LPF_P1_OR_Q1(p0_or_q0_org_in, q0_or_p0_org_in,    \ 
  526                          p1_or_q1_org_in, p2_or_q2_org_in,    \ 
  527                          negate_tc_in, tc_in, p1_or_q1_out)   \ 
  531     clip3 = (v8i16) __msa_aver_u_h((v8u16) p0_or_q0_org_in,   \ 
  532                                    (v8u16) q0_or_p0_org_in);  \ 
  533     temp = p1_or_q1_org_in << 1;                              \ 
  534     clip3 = clip3 - temp;                                     \ 
  535     clip3 = __msa_ave_s_h(p2_or_q2_org_in, clip3);            \ 
  536     CLIP_SH(clip3, negate_tc_in, tc_in);                      \ 
  537     p1_or_q1_out = p1_or_q1_org_in + clip3;                   \ 
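/* p0/q0 update of the normal (bS < 4) filter, using the standard H.264 delta
 * clip3(-tc, tc, (4*(q0 - p0) + (p1 - q1) + 4) >> 3):
 * p0' = clip_uint8(p0 + delta), q0' = clip_uint8(q0 - delta). */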
  540 #define AVC_LPF_P0Q0(q0_or_p0_org_in, p0_or_q0_org_in,          \ 
  541                      p1_or_q1_org_in, q1_or_p1_org_in,          \ 
  542                      negate_threshold_in, threshold_in,         \ 
  543                      p0_or_q0_out, q0_or_p0_out)                \ 
  545     v8i16 q0_sub_p0, p1_sub_q1, delta;                          \ 
  547     q0_sub_p0 = q0_or_p0_org_in - p0_or_q0_org_in;              \ 
  548     p1_sub_q1 = p1_or_q1_org_in - q1_or_p1_org_in;              \ 
  551     delta = q0_sub_p0 + p1_sub_q1;                              \ 
  554     CLIP_SH(delta, negate_threshold_in, threshold_in);          \ 
  556     p0_or_q0_out = p0_or_q0_org_in + delta;                     \ 
  557     q0_or_p0_out = q0_or_p0_org_in - delta;                     \ 
  559     CLIP_SH2_0_255(p0_or_q0_out, q0_or_p0_out);                 \ 
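/* Horizontal chroma 4:2:2 edge: four 4-byte rows are loaded from src - 2 and
 * transposed so that src0..src3 hold the p1/p0/q0/q1 columns, the alpha/beta
 * enable mask is built, a tc-clipped weak-filter delta is applied to p0 and
 * q0, and the filtered pair is interleaved into res. */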
  562 #define AVC_LPF_H_CHROMA_422(src, stride, tc_val, alpha, beta, res)      \ 
  564     uint32_t load0, load1, load2, load3;                                 \ 
  565     v16u8 src0 = { 0 };                                                  \ 
  566     v16u8 src1 = { 0 };                                                  \ 
  567     v16u8 src2 = { 0 };                                                  \ 
  568     v16u8 src3 = { 0 };                                                  \ 
  569     v16u8 p0_asub_q0, p1_asub_p0, q1_asub_q0;                            \ 
  570     v16u8 is_less_than, is_less_than_alpha, is_less_than_beta;           \ 
  571     v8i16 tc, q0_sub_p0, p1_sub_q1, delta;                               \ 
  572     v8i16 res0_r, res1_r;                                                \ 
  573     v16i8 zeros = { 0 };                                                 \ 
  576     LW4((src - 2), stride, load0, load1, load2, load3);                  \ 
  577     src0 = (v16u8) __msa_insert_w((v4i32) src0, 0, load0);               \ 
  578     src1 = (v16u8) __msa_insert_w((v4i32) src1, 0, load1);               \ 
  579     src2 = (v16u8) __msa_insert_w((v4i32) src2, 0, load2);               \ 
  580     src3 = (v16u8) __msa_insert_w((v4i32) src3, 0, load3);               \ 
  582     TRANSPOSE4x4_UB_UB(src0, src1, src2, src3, src0, src1, src2, src3);  \ 
  584     p0_asub_q0 = __msa_asub_u_b(src2, src1);                             \ 
  585     p1_asub_p0 = __msa_asub_u_b(src1, src0);                             \ 
  586     q1_asub_q0 = __msa_asub_u_b(src2, src3);                             \ 
  588     tc = __msa_fill_h(tc_val);                                           \ 
  590     is_less_than_alpha = (p0_asub_q0 < alpha);                           \ 
  591     is_less_than_beta = (p1_asub_p0 < beta);                             \ 
  592     is_less_than = is_less_than_alpha & is_less_than_beta;               \ 
  593     is_less_than_beta = (q1_asub_q0 < beta);                             \ 
  594     is_less_than = is_less_than_beta & is_less_than;                     \ 
  596     ILVR_B2_SH(src2, src1, src0, src3, q0_sub_p0, p1_sub_q1);            \ 
  597     HSUB_UB2_SH(q0_sub_p0, p1_sub_q1, q0_sub_p0, p1_sub_q1);             \ 
  600     delta = q0_sub_p0 + p1_sub_q1;                                       \ 
  601     delta = __msa_srari_h(delta, 3);                                     \ 
  603     CLIP_SH(delta, -tc, tc);                                             \ 
  605     ILVR_B2_SH(zeros, src1, zeros, src2, res0_r, res1_r);                \ 
  610     CLIP_SH2_0_255(res0_r, res1_r);                                      \ 
  611     PCKEV_B2_UB(res0_r, res0_r, res1_r, res1_r, res0, res1);             \ 
  613     res0 = __msa_bmnz_v(src1, res0, is_less_than);                       \ 
  614     res1 = __msa_bmnz_v(src2, res1, is_less_than);                       \ 
  616     res = (v16u8) __msa_ilvr_b((v16i8) res1, (v16i8) res0);              \ 
  619 #define TRANSPOSE2x4_B_UB(in0, in1, out0, out1, out2, out3)  \ 
  621     v16i8 zero_m = { 0 };                                    \ 
  623     out0 = (v16u8) __msa_ilvr_b((v16i8) in1, (v16i8) in0);   \ 
  624     out1 = (v16u8) __msa_sldi_b(zero_m, (v16i8) out0, 2);    \ 
  625     SLDI_B2_UB(zero_m, out1, zero_m, out2, 2, out2, out3);   \ 
  628 #define AVC_LPF_H_2BYTE_CHROMA_422(src, stride, tc_val, alpha, beta, res)  \ 
  630     uint32_t load0, load1;                                                 \ 
  631     v16u8 src0 = { 0 };                                                    \ 
  632     v16u8 src1 = { 0 };                                                    \ 
  633     v16u8 src2 = { 0 };                                                    \ 
  634     v16u8 src3 = { 0 };                                                    \ 
  635     v16u8 p0_asub_q0, p1_asub_p0, q1_asub_q0;                              \ 
  636     v16u8 is_less_than, is_less_than_alpha, is_less_than_beta;             \ 
  637     v8i16 tc, q0_sub_p0, p1_sub_q1, delta, res0_r, res1_r;                 \ 
  638     v16i8 zeros = { 0 };                                                   \ 
  641     load0 = LW(src - 2);                                                   \ 
  642     load1 = LW(src - 2 + stride);                                          \ 
  644     src0 = (v16u8) __msa_insert_w((v4i32) src0, 0, load0);                 \ 
  645     src1 = (v16u8) __msa_insert_w((v4i32) src1, 0, load1);                 \ 
  647     TRANSPOSE2x4_B_UB(src0, src1, src0, src1, src2, src3);                 \ 
  649     p0_asub_q0 = __msa_asub_u_b(src2, src1);                               \ 
  650     p1_asub_p0 = __msa_asub_u_b(src1, src0);                               \ 
  651     q1_asub_q0 = __msa_asub_u_b(src2, src3);                               \ 
  653     tc = __msa_fill_h(tc_val);                                             \ 
  655     is_less_than_alpha = (p0_asub_q0 < alpha);                             \ 
  656     is_less_than_beta = (p1_asub_p0 < beta);                               \ 
  657     is_less_than = is_less_than_alpha & is_less_than_beta;                 \ 
  658     is_less_than_beta = (q1_asub_q0 < beta);                               \ 
  659     is_less_than = is_less_than_beta & is_less_than;                       \ 
  661     ILVR_B2_SH(src2, src1, src0, src3, q0_sub_p0, p1_sub_q1);              \ 
  662     HSUB_UB2_SH(q0_sub_p0, p1_sub_q1, q0_sub_p0, p1_sub_q1);               \ 
  665     delta = q0_sub_p0 + p1_sub_q1;                                         \ 
  666     delta = __msa_srari_h(delta, 3);                                       \ 
  667     CLIP_SH(delta, -tc, tc);                                               \ 
  669     ILVR_B2_SH(zeros, src1, zeros, src2, res0_r, res1_r);                  \ 
  674     CLIP_SH2_0_255(res0_r, res1_r);                                        \ 
  675     PCKEV_B2_UB(res0_r, res0_r, res1_r, res1_r, res0, res1);               \ 
  677     res0 = __msa_bmnz_v(src1, res0, is_less_than);                         \ 
  678     res1 = __msa_bmnz_v(src2, res1, is_less_than);                         \ 
  680     res = (v16u8) __msa_ilvr_b((v16i8) res1, (v16i8) res0);                \ 
  688     v16u8 p0_asub_q0, p1_asub_p0, q1_asub_q0;
 
  689     v16u8 is_less_than, is_less_than_beta, is_less_than_alpha;
 
  690     v16u8 p1_org, p0_org, q0_org, q1_org;
 
  692     LD_UB4(data - (img_width << 1), img_width, p1_org, p0_org, q0_org, q1_org);
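    /* Filtering is enabled per pixel only where |p0 - q0| < alpha,
     * |p1 - p0| < beta and |q1 - q0| < beta; the byte-wise comparisons below
     * build the is_less_than mask that __msa_bmnz_v later uses to merge
     * filtered and original pixels. */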
 
  694     p0_asub_q0 = __msa_asub_u_b(p0_org, q0_org);
 
  695     p1_asub_p0 = __msa_asub_u_b(p1_org, p0_org);
 
  696     q1_asub_q0 = __msa_asub_u_b(q1_org, q0_org);
 
  698     is_less_than_alpha = (p0_asub_q0 < alpha_in);
 
  699     is_less_than_beta = (p1_asub_p0 < beta_in);
 
  700     is_less_than = is_less_than_beta & is_less_than_alpha;
 
  701     is_less_than_beta = (q1_asub_q0 < beta_in);
 
  702     is_less_than = is_less_than_beta & is_less_than;
 
  704     if (!__msa_test_bz_v(is_less_than)) {
 
  705         v16u8 p2_asub_p0, q2_asub_q0, p0, q0, negate_is_less_than_beta;
 
  711         v8i16 p1_org_r, p0_org_r, q0_org_r, q1_org_r;
 
  712         v8i16 p1_org_l, p0_org_l, q0_org_l, q1_org_l;
 
  713         v16u8 q2_org = LD_UB(data + (2 * img_width));

  714         v16u8 p2_org = LD_UB(data - (3 * img_width));
 
  715         v16u8 tmp_flag = (v16u8)__msa_fill_b((alpha_in >> 2) + 2);
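        /* tmp_flag holds the extra strong-filter condition
         * |p0 - q0| < (alpha >> 2) + 2 of the bS = 4 luma filter. */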
 
  721         tmp_flag = (p0_asub_q0 < tmp_flag);
 
  723         p2_asub_p0 = __msa_asub_u_b(p2_org, p0_org);
 
  724         is_less_than_beta = (p2_asub_p0 < beta_in);
 
  725         is_less_than_beta = is_less_than_beta & tmp_flag;
 
  726         negate_is_less_than_beta = __msa_xori_b(is_less_than_beta, 0xff);
 
  727         is_less_than_beta = is_less_than_beta & is_less_than;
 
  728         negate_is_less_than_beta = negate_is_less_than_beta & is_less_than;
 
  730         q1_org_r = (v8i16) __msa_ilvr_b(zero, (v16i8) q1_org);

  731         q1_org_l = (v8i16) __msa_ilvl_b(zero, (v16i8) q1_org);
 
  734         if (!__msa_test_bz_v(is_less_than_beta)) {
 
  735             v8i16 p3_org_l, p3_org_r;
 
  736             v16u8 p3_org = LD_UB(data - (img_width << 2));
 
  745                                      p2_r, q1_org_r, p0_r, p1_r, p2_r);
 
  749                                      p2_l, q1_org_l, p0_l, p1_l, p2_l);
 
  751             PCKEV_B3_UB(p0_l, p0_r, p1_l, p1_r, p2_l, p2_r, p0, p1, p2);
 
  753             p0_org = __msa_bmnz_v(p0_org, p0, is_less_than_beta);
 
  754             p1_org = __msa_bmnz_v(p1_org, p1, is_less_than_beta);
 
  755             p2_org = __msa_bmnz_v(p2_org, p2, is_less_than_beta);
 
  765         p0 = (v16u8) __msa_pckev_b((v16i8) p0_l, (v16i8) p0_r);
 
  766         p0_org = __msa_bmnz_v(p0_org, p0, negate_is_less_than_beta);
 
  771         q2_asub_q0 = __msa_asub_u_b(q2_org, q0_org);
 
  772         is_less_than_beta = (q2_asub_q0 < beta_in);
 
  773         is_less_than_beta = is_less_than_beta & tmp_flag;
 
  774         negate_is_less_than_beta = __msa_xori_b(is_less_than_beta, 0xff);
 
  775         is_less_than_beta = is_less_than_beta & is_less_than;
 
  776         negate_is_less_than_beta = negate_is_less_than_beta & is_less_than;
 
  779         if (!__msa_test_bz_v(is_less_than_beta)) {
 
  780             v8i16 q3_org_r, q3_org_l;
 
  781             v16u8 q3_org = LD_UB(data + (3 * img_width));
 
  790                                      q2_r, p1_org_r, q0_r, q1_r, q2_r);
 
  794                                      q2_l, p1_org_l, q0_l, q1_l, q2_l);
 
  797             q0_org = __msa_bmnz_v(q0_org, q0, is_less_than_beta);

  798             q1_org = __msa_bmnz_v(q1_org, q1, is_less_than_beta);
 
  799             q2_org = __msa_bmnz_v(q2_org, q2, is_less_than_beta);
 
  809         q0 = (v16u8) __msa_pckev_b((v16i8) q0_l, (v16i8) q0_r);
 
  810         q0_org = __msa_bmnz_v(q0_org, q0, negate_is_less_than_beta);
 
  822     v16u8 alpha, beta, p0_asub_q0;
 
  823     v16u8 is_less_than_alpha, is_less_than, is_less_than_beta;
 
  824     v16u8 p3_org, p2_org, p1_org, p0_org, q0_org, q1_org, q2_org, q3_org;
 
  825     v16u8 p1_asub_p0, q1_asub_q0;
 
  829         v16u8 row0, row1, row2, row3, row4, row5, row6, row7;
 
  830         v16u8 row8, row9, row10, row11, row12, row13, row14, row15;
 
  832         LD_UB8(src, img_width, row0, row1, row2, row3, row4, row5, row6, row7);
 
  834                row8, row9, row10, row11, row12, row13, row14, row15);
 
  837                             row4, row5, row6, row7,
 
  838                             row8, row9, row10, row11,
 
  839                             row12, row13, row14, row15,
 
  840                             p3_org, p2_org, p1_org, p0_org,
 
  841                             q0_org, q1_org, q2_org, q3_org);
 
  844     p0_asub_q0 = __msa_asub_u_b(p0_org, q0_org);
 
  845     p1_asub_p0 = __msa_asub_u_b(p1_org, p0_org);
 
  846     q1_asub_q0 = __msa_asub_u_b(q1_org, q0_org);
 
  848     alpha = (v16u8) __msa_fill_b(alpha_in);
 
  849     beta = (v16u8) __msa_fill_b(beta_in);
 
  851     is_less_than_alpha = (p0_asub_q0 < alpha);
 
  852     is_less_than_beta = (p1_asub_p0 < beta);
 
  853     is_less_than = is_less_than_beta & is_less_than_alpha;
 
  854     is_less_than_beta = (q1_asub_q0 < beta);
 
  855     is_less_than = is_less_than_beta & is_less_than;
 
  857     if (!__msa_test_bz_v(is_less_than)) {
 
  863         v16u8 tmp_flag, p0, q0, p2_asub_p0, q2_asub_q0;
 
  864         v16u8 negate_is_less_than_beta;
 
  865         v8i16 p1_org_r, p0_org_r, q0_org_r, q1_org_r;
 
  866         v8i16 p1_org_l, p0_org_l, q0_org_l, q1_org_l;
 
  873         tmp_flag = alpha >> 2;
 
  874         tmp_flag = tmp_flag + 2;
 
  875         tmp_flag = (p0_asub_q0 < tmp_flag);
 
  877         p2_asub_p0 = __msa_asub_u_b(p2_org, p0_org);
 
  878         is_less_than_beta = (p2_asub_p0 < beta);
 
  879         is_less_than_beta = tmp_flag & is_less_than_beta;
 
  880         negate_is_less_than_beta = __msa_xori_b(is_less_than_beta, 0xff);
 
  881         is_less_than_beta = is_less_than_beta & is_less_than;
 
  882         negate_is_less_than_beta = negate_is_less_than_beta & is_less_than;
 
  884         if (!__msa_test_bz_v(is_less_than_beta)) {
 
  886             v8i16 p3_org_r, p3_org_l;
 
  894                                      p2_r, q1_org_r, p0_r, p1_r, p2_r);
 
  898                                          p2_l, q1_org_l, p0_l, p1_l, p2_l);
 
  900             PCKEV_B3_UB(p0_l, p0_r, p1_l, p1_r, p2_l, p2_r, p0, p1, p2);
 
  901             p0_org = __msa_bmnz_v(p0_org, p0, is_less_than_beta);
 
  902             p1_org = __msa_bmnz_v(p1_org, p1, is_less_than_beta);
 
  903             p2_org = __msa_bmnz_v(p2_org, p2, is_less_than_beta);
 
  909         p0 = (v16u8) __msa_pckev_b((v16i8) p0_l, (v16i8) p0_r);
 
  910         p0_org = __msa_bmnz_v(p0_org, p0, negate_is_less_than_beta);
 
  912         q2_asub_q0 = __msa_asub_u_b(q2_org, q0_org);
 
  913         is_less_than_beta = (q2_asub_q0 < beta);
 
  915         is_less_than_beta = is_less_than_beta & tmp_flag;
 
  916         negate_is_less_than_beta = __msa_xori_b(is_less_than_beta, 0xff);
 
  918         is_less_than_beta = is_less_than_beta & is_less_than;
 
  919         negate_is_less_than_beta = negate_is_less_than_beta & is_less_than;
 
  921         if (!__msa_test_bz_v(is_less_than_beta)) {
 
  923             v8i16 q3_org_r, q3_org_l;
 
  931                                      q2_r, p1_org_r, q0_r, q1_r, q2_r);
 
  935                                      q2_l, p1_org_l, q0_l, q1_l, q2_l);
 
  938             q0_org = __msa_bmnz_v(q0_org, q0, is_less_than_beta);

  939             q1_org = __msa_bmnz_v(q1_org, q1, is_less_than_beta);
 
  940             q2_org = __msa_bmnz_v(q2_org, q2, is_less_than_beta);
 
  946         q0 = (v16u8) __msa_pckev_b((v16i8) q0_l, (v16i8) q0_r);
 
  947         q0_org = __msa_bmnz_v(q0_org, q0, negate_is_less_than_beta);
 
  950         v8i16 tp0, tp1, tp2, tp3, tmp2, tmp3, tmp4, tmp5, tmp6, tmp7;
 
  960         ST_W4(tmp3, 0, 1, 2, 3, src, img_width);

  961         ST_H4(tmp2, 0, 1, 2, 3, src + 4, img_width);
 
  962         src += 4 * img_width;
 
  963         ST_W4(tmp4, 0, 1, 2, 3, src, img_width);

  964         ST_H4(tmp2, 4, 5, 6, 7, src + 4, img_width);
 
  965         src += 4 * img_width;
 
  967         ST_W4(tmp6, 0, 1, 2, 3, src, img_width);

  968         ST_H4(tmp5, 0, 1, 2, 3, src + 4, img_width);
 
  969         src += 4 * img_width;
 
  970         ST_W4(tmp7, 0, 1, 2, 3, src, img_width);

  971         ST_H4(tmp5, 4, 5, 6, 7, src + 4, img_width);
 
  981     uint64_t load0, load1;
 
  984     v8u16 src0_r, src1_r, src2_r, src3_r, src4_r, src5_r, src6_r, src7_r;
 
  985     v8u16 dst0_r, dst1_r, dst4_r, dst5_r;
 
  986     v8u16 dst2_x_r, dst2_y_r, dst3_x_r, dst3_y_r;
 
  987     v16u8 dst0, dst1, dst4, dst5, dst2_x, dst2_y, dst3_x, dst3_y;
 
  988     v8i16 tmp0, tmp1, tmp2, tmp3;
 
  990     v16u8 p0_asub_q0, p1_asub_p0, q1_asub_q0, p2_asub_p0, q2_asub_q0;
 
  991     v16u8 is_less_than, is_less_than_alpha, is_less_than_beta;
 
  992     v16u8 is_less_than_beta1, is_less_than_beta2;
 
 1001     v16i8 zeros = { 0 };
 
 1003     load0 = LD(src - 4);
 
 1005     src0 = (v16i8) __msa_insert_d((v2i64) src0, 0, load0);

 1006     src1 = (v16i8) __msa_insert_d((v2i64) src1, 0, load1);

 1010     src2 = (v16i8) __msa_insert_d((v2i64) src2, 0, load0);
 
 1011     src3 = (v16i8) __msa_insert_d((v2i64) src3, 0, load1);
 
 1015     src4 = (v16i8) __msa_insert_d((v2i64) src4, 0, load0);
 
 1016     src5 = (v16i8) __msa_insert_d((v2i64) src5, 0, load1);
 
 1020     src6 = (v16i8) __msa_insert_d((v2i64) src6, 0, load0);
 
 1021     src7 = (v16i8) __msa_insert_d((v2i64) src7, 0, load1);
 
 1029     ILVR_W2_SB(tmp2, tmp0, tmp3, tmp1, src6, src3);
 
 1034     p0_asub_q0 = __msa_asub_u_b((v16u8) src2, (v16u8) src3);

 1035     p1_asub_p0 = __msa_asub_u_b((v16u8) src1, (v16u8) src2);
 
 1036     q1_asub_q0 = __msa_asub_u_b((v16u8) src4, (v16u8) src3);
 
 1038     alpha = (v16u8) __msa_fill_b(alpha_in);
 
 1039     beta = (v16u8) __msa_fill_b(beta_in);
 
 1041     is_less_than_alpha = (p0_asub_q0 < alpha);
 
 1042     is_less_than_beta = (p1_asub_p0 < beta);
 
 1043     is_less_than = is_less_than_alpha & is_less_than_beta;
 
 1044     is_less_than_beta = (q1_asub_q0 < beta);
 
 1045     is_less_than = is_less_than & is_less_than_beta;
 
 1050     is_less_than_alpha = (p0_asub_q0 < alpha);

 1052     p2_asub_p0 = __msa_asub_u_b((v16u8) src0, (v16u8) src2);
 
 1053     is_less_than_beta1 = (p2_asub_p0 < beta);
 
 1054     q2_asub_q0 = __msa_asub_u_b((v16u8) src5, (v16u8) src3);
 
 1055     is_less_than_beta2 = (q2_asub_q0 < beta);
 
 1058                src0_r, src1_r, src2_r, src3_r);
 
 1059     ILVR_B4_UH(zeros, src4, zeros, src5, zeros, src6, zeros, src7,
 
 1060                src4_r, src5_r, src6_r, src7_r);
 
 1062     dst2_x_r = src1_r + src2_r + src3_r;
 
 1063     dst2_x_r = src0_r + (2 * (dst2_x_r)) + src4_r;
 
 1064     dst2_x_r = (v8u16) __msa_srari_h((v8i16) dst2_x_r, 3);
 
 1065     dst1_r = src0_r + src1_r + src2_r + src3_r;
 
 1066     dst1_r = (v8u16) __msa_srari_h((v8i16) dst1_r, 2);
 
 1068     dst0_r = (2 * src6_r) + (3 * src0_r);
 
 1069     dst0_r += src1_r + src2_r + src3_r;
 
 1070     dst0_r = (v8u16) __msa_srari_h((v8i16) dst0_r, 3);
 
 1071     dst2_y_r = (2 * src1_r) + src2_r + src4_r;
 
 1072     dst2_y_r = (v8u16) __msa_srari_h((v8i16) dst2_y_r, 2);
 
 1074     PCKEV_B2_UB(dst2_x_r, dst2_x_r, dst2_y_r, dst2_y_r, dst2_x, dst2_y);
 
 1075     dst2_x = __msa_bmnz_v(dst2_y, dst2_x, is_less_than_beta1);
 
 1077     dst3_x_r = src2_r + src3_r + src4_r;
 
 1078     dst3_x_r = src1_r + (2 * dst3_x_r) + src5_r;
 
 1079     dst3_x_r = (v8u16) __msa_srari_h((v8i16) dst3_x_r, 3);
 
 1080     dst4_r = src2_r + src3_r + src4_r + src5_r;
 
 1081     dst4_r = (v8u16) __msa_srari_h((v8i16) dst4_r, 2);
 
 1083     dst5_r = (2 * src7_r) + (3 * src5_r);
 
 1084     dst5_r += src4_r + src3_r + src2_r;
 
 1085     dst5_r = (v8u16) __msa_srari_h((v8i16) dst5_r, 3);
 
 1086     dst3_y_r = (2 * src4_r) + src3_r + src1_r;
 
 1087     dst3_y_r = (v8u16) __msa_srari_h((v8i16) dst3_y_r, 2);
 
 1089     PCKEV_B2_UB(dst3_x_r, dst3_x_r, dst3_y_r, dst3_y_r, dst3_x, dst3_y);
 
 1090     dst3_x = __msa_bmnz_v(dst3_y, dst3_x, is_less_than_beta2);
 
 1092     dst2_y_r = (2 * src1_r) + src2_r + src4_r;
 
 1093     dst2_y_r = (v8u16) __msa_srari_h((v8i16) dst2_y_r, 2);
 
 1094     dst3_y_r = (2 * src4_r) + src3_r + src1_r;
 
 1095     dst3_y_r = (v8u16) __msa_srari_h((v8i16) dst3_y_r, 2);
 
 1097     PCKEV_B2_UB(dst2_y_r, dst2_y_r, dst3_y_r, dst3_y_r, dst2_y, dst3_y);
 
 1099     dst2_x = __msa_bmnz_v(dst2_y, dst2_x, is_less_than_alpha);
 
 1100     dst3_x = __msa_bmnz_v(dst3_y, dst3_x, is_less_than_alpha);
 
 1101     dst2_x = __msa_bmnz_v((v16u8) src2, dst2_x, is_less_than);
 
 1102     dst3_x = __msa_bmnz_v((v16u8) src3, dst3_x, is_less_than);
 
 1104     is_less_than = is_less_than_alpha & is_less_than;
 
 1105     dst1 = (v16u8) __msa_pckev_b((v16i8) dst1_r, (v16i8) dst1_r);
 
 1106     is_less_than_beta1 = is_less_than_beta1 & is_less_than;
 
 1107     dst1 = __msa_bmnz_v((v16u8) src1, dst1, is_less_than_beta1);
 
 1109     dst0 = (v16u8) __msa_pckev_b((v16i8) dst0_r, (v16i8) dst0_r);
 
 1110     dst0 = __msa_bmnz_v((v16u8) src0, dst0, is_less_than_beta1);
 
 1111     dst4 = (v16u8) __msa_pckev_b((v16i8) dst4_r, (v16i8) dst4_r);
 
 1112     is_less_than_beta2 = is_less_than_beta2 & is_less_than;
 
 1113     dst4 = __msa_bmnz_v((v16u8) src4, dst4, is_less_than_beta2);
 
 1114     dst5 = (v16u8) __msa_pckev_b((v16i8) dst5_r, (v16i8) dst5_r);
 
 1115     dst5 = __msa_bmnz_v((v16u8) src5, dst5, is_less_than_beta2);
 
 1117     ILVR_B2_UB(dst1, dst0, dst3_x, dst2_x, dst0, dst1);
 
 1118     dst2_x = (v16u8) __msa_ilvr_b((v16i8) dst5, (v16i8) dst4);
 
 1122     ILVR_W2_UB(tmp2, tmp0, tmp3, tmp1, dst0, dst4);
 
 1123     SLDI_B2_UB(zeros, dst0, zeros, dst4, 8, dst1, dst5);
 
 1124     dst2_x = (v16u8) __msa_ilvl_w((v4i32) tmp2, (v4i32) tmp0);
 
 1125     dst2_y = (v16u8) __msa_ilvl_w((v4i32) tmp3, (v4i32) tmp1);
 
 1126     SLDI_B2_UB(zeros, dst2_x, zeros, dst2_y, 8, dst3_x, dst3_y);
 
 1128     out0 = __msa_copy_u_w((v4i32) dst0, 0);
 
 1129     out1 = __msa_copy_u_h((v8i16) dst0, 2);
 
 1130     out2 = __msa_copy_u_w((v4i32) dst1, 0);
 
 1131     out3 = __msa_copy_u_h((v8i16) dst1, 2);
 
 1133     SW(out0, (src - 3));

 1134     SH(out1, (src + 1));

 1136     SW(out2, (src - 3));

 1137     SH(out3, (src + 1));
 
 1140     out0 = __msa_copy_u_w((v4i32) dst2_x, 0);
 
 1141     out1 = __msa_copy_u_h((v8i16) dst2_x, 2);
 
 1142     out2 = __msa_copy_u_w((v4i32) dst3_x, 0);
 
 1143     out3 = __msa_copy_u_h((v8i16) dst3_x, 2);
 
 1145     SW(out0, (src - 3));

 1146     SH(out1, (src + 1));

 1148     SW(out2, (src - 3));

 1149     SH(out3, (src + 1));
 
 1152     out0 = __msa_copy_u_w((v4i32) dst4, 0);
 
 1153     out1 = __msa_copy_u_h((v8i16) dst4, 2);
 
 1154     out2 = __msa_copy_u_w((v4i32) dst5, 0);
 
 1155     out3 = __msa_copy_u_h((v8i16) dst5, 2);
 
 1157     SW(out0, (src - 3));

 1158     SH(out1, (src + 1));

 1160     SW(out2, (src - 3));

 1161     SH(out3, (src + 1));
 
 1164     out0 = __msa_copy_u_w((v4i32) dst2_y, 0);
 
 1165     out1 = __msa_copy_u_h((v8i16) dst2_y, 2);
 
 1166     out2 = __msa_copy_u_w((v4i32) dst3_y, 0);
 
 1167     out3 = __msa_copy_u_h((v8i16) dst3_y, 2);
 
 1169     SW(out0, (src - 3));

 1170     SH(out1, (src + 1));

 1172     SW(out2, (src - 3));

 1173     SH(out3, (src + 1));
 
 1179                                                        ptrdiff_t img_width)
 
 1183     v8i16 p0_or_q0, q0_or_p0;
 
 1184     v16u8 p1_or_q1_org, p0_or_q0_org, q0_or_p0_org, q1_or_p1_org;
 
 1186     v16u8 p0_asub_q0, p1_asub_p0, q1_asub_q0;
 
 1187     v16u8 is_less_than_alpha, is_less_than_beta;
 
 1188     v8i16 p1_org_r, p0_org_r, q0_org_r, q1_org_r;
 
 1190     alpha = (v16u8) __msa_fill_b(alpha_in);
 
 1191     beta = (v16u8) __msa_fill_b(beta_in);
 
 1193     LD_UB4(data_cb_or_cr - (img_width << 1), img_width,
 
 1194            p1_or_q1_org, p0_or_q0_org, q0_or_p0_org, q1_or_p1_org);
 
 1196     p0_asub_q0 = __msa_asub_u_b(p0_or_q0_org, q0_or_p0_org);
 
 1197     p1_asub_p0 = __msa_asub_u_b(p1_or_q1_org, p0_or_q0_org);
 
 1198     q1_asub_q0 = __msa_asub_u_b(q1_or_p1_org, q0_or_p0_org);
 
 1200     is_less_than_alpha = (p0_asub_q0 < alpha);
 
 1201     is_less_than_beta = (p1_asub_p0 < beta);
 
 1202     is_less_than = is_less_than_beta & is_less_than_alpha;
 
 1203     is_less_than_beta = (q1_asub_q0 < beta);
 
 1204     is_less_than = is_less_than_beta & is_less_than;
 
 1206     is_less_than = (v16u8) __msa_ilvr_d((v2i64) zero, (v2i64) is_less_than);
 
 1208     if (!__msa_test_bz_v(is_less_than)) {
 
 1210                    zero, q1_or_p1_org, p1_org_r, p0_org_r, q0_org_r, q1_org_r);
 
 1216             __msa_bmnz_v(p0_or_q0_org, (v16u8) p0_or_q0, is_less_than);
 
 1218             __msa_bmnz_v(q0_or_p0_org, (v16u8) q0_or_p0, is_less_than);
 
 1220         ST_UB(q0_or_p0_org, data_cb_or_cr);
 
 1221         ST_UB(p0_or_q0_org, data_cb_or_cr - img_width);
 
 1228                                                        ptrdiff_t img_width)
 
 1231     v16u8 alpha, beta, is_less_than;
 
 1232     v8i16 p0_or_q0, q0_or_p0;
 
 1233     v16u8 p1_or_q1_org, p0_or_q0_org, q0_or_p0_org, q1_or_p1_org;
 
 1235     v16u8 p0_asub_q0, p1_asub_p0, q1_asub_q0;
 
 1236     v16u8 is_less_than_alpha, is_less_than_beta;
 
 1237     v8i16 p1_org_r, p0_org_r, q0_org_r, q1_org_r;
 
 1240         v16u8 row0, row1, row2, row3, row4, row5, row6, row7;
 
 1242         LD_UB8((data_cb_or_cr - 2), img_width,
 
 1243                row0, row1, row2, row3, row4, row5, row6, row7);
 
 1246                            p1_or_q1_org, p0_or_q0_org,
 
 1247                            q0_or_p0_org, q1_or_p1_org);
 
 1250     alpha = (v16u8) __msa_fill_b(alpha_in);
 
 1251     beta = (v16u8) __msa_fill_b(beta_in);
 
 1253     p0_asub_q0 = __msa_asub_u_b(p0_or_q0_org, q0_or_p0_org);
 
 1254     p1_asub_p0 = __msa_asub_u_b(p1_or_q1_org, p0_or_q0_org);
 
 1255     q1_asub_q0 = __msa_asub_u_b(q1_or_p1_org, q0_or_p0_org);
 
 1257     is_less_than_alpha = (p0_asub_q0 < alpha);
 
 1258     is_less_than_beta = (p1_asub_p0 < beta);
 
 1259     is_less_than = is_less_than_beta & is_less_than_alpha;
 
 1260     is_less_than_beta = (q1_asub_q0 < beta);
 
 1261     is_less_than = is_less_than_beta & is_less_than;
 
 1262     is_less_than = (v16u8) __msa_ilvr_d((v2i64) zero, (v2i64) is_less_than);
 
 1264     if (!__msa_test_bz_v(is_less_than)) {
 
 1266                    zero, q1_or_p1_org, p1_org_r, p0_org_r, q0_org_r, q1_org_r);
 
 1275             __msa_bmnz_v(p0_or_q0_org, (v16u8) p0_or_q0, is_less_than);
 
 1277             __msa_bmnz_v(q0_or_p0_org, (v16u8) q0_or_p0, is_less_than);
 
 1278         tmp1 = (v8i16) __msa_ilvr_b((v16i8) q0_or_p0_org, (v16i8) p0_or_q0_org);
 
 1281         ST_H4(tmp1, 0, 1, 2, 3, data_cb_or_cr, img_width);
 
 1282         data_cb_or_cr += 4 * img_width;
 
 1283         ST_H4(tmp1, 4, 5, 6, 7, data_cb_or_cr, img_width);
 
 1288                                                    uint8_t iAlpha, uint8_t iBeta,
 
 1291     v16u8 p0, p1, p2, q0, q1, q2;

 1292     v16i8 iTc, negiTc, negTc, flags, f;
 
 1293     v8i16 p0_l, p0_r, p1_l, p1_r, p2_l, p2_r, q0_l, q0_r, q1_l, q1_r, q2_l, q2_r;
 
 1294     v8i16 tc_l, tc_r, negTc_l, negTc_r;
 
 1295     v8i16 iTc_l, iTc_r, negiTc_l, negiTc_r;
 
 1299     v16u8 bDetaP0Q0, bDetaP1P0, bDetaQ1Q0, bDetaP2P0, bDetaQ2Q0;
 
 1300     v16i8 const_1_b = __msa_ldi_b(1);
 
 1301     v8i16 const_1_h = __msa_ldi_h(1);
 
 1302     v8i16 const_4_h = __msa_ldi_h(4);
 
 1303     v8i16 const_not_255_h = __msa_ldi_h(~255);
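    /* Each of the four per-edge tc values in pTc[] is replicated to the four
     * pixels it governs: element i of the vector reads pTc[i >> 2]. */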
 
 1305     v16i8 tc = { pTc[0  >> 2], pTc[1  >> 2], pTc[2  >> 2], pTc[3  >> 2],
 
 1306                  pTc[4  >> 2], pTc[5  >> 2], pTc[6  >> 2], pTc[7  >> 2],
 
 1307                  pTc[8  >> 2], pTc[9  >> 2], pTc[10 >> 2], pTc[11 >> 2],
 
 1308                  pTc[12 >> 2], pTc[13 >> 2], pTc[14 >> 2], pTc[15 >> 2] };
 
 1313     LD_SH8(pPix - 3, iStride, t0, t1, t2, t3, q1_l, q1_r, q2_l, q2_r);
 
 1314     LD_SH8(pPix + 8 * iStride - 3, iStride, p0_l, p0_r, p1_l, p1_r,
 
 1315            p2_l, p2_r, q0_l, q0_r);
 
 1317                         p0_l, p0_r, p1_l, p1_r, p2_l, p2_r, q0_l, q0_r,
 
 1320     alpha = (v16u8)__msa_fill_b(iAlpha);
 
 1321     beta  = (v16u8)__msa_fill_b(iBeta);
 
 1323     bDetaP0Q0 = __msa_asub_u_b(p0, q0);

 1324     bDetaP1P0 = __msa_asub_u_b(p1, p0);

 1325     bDetaQ1Q0 = __msa_asub_u_b(q1, q0);

 1326     bDetaP2P0 = __msa_asub_u_b(p2, p0);

 1327     bDetaQ2Q0 = __msa_asub_u_b(q2, q0);

 1328     bDetaP0Q0 = (v16u8)__msa_clt_u_b(bDetaP0Q0, alpha);
 
 1329     bDetaP1P0 = (v16u8)__msa_clt_u_b(bDetaP1P0, beta);
 
 1330     bDetaQ1Q0 = (v16u8)__msa_clt_u_b(bDetaQ1Q0, beta);
 
 1331     bDetaP2P0 = (v16u8)__msa_clt_u_b(bDetaP2P0, beta);
 
 1332     bDetaQ2Q0 = (v16u8)__msa_clt_u_b(bDetaQ2Q0, beta);
 
 1347     f = (v16i8)bDetaP0Q0 & (v16i8)bDetaP1P0 & (v16i8)bDetaQ1Q0;
 
 1348     flags = f & (v16i8)bDetaP2P0;

 1350     iTc += ((~flags) & const_1_b);

 1351     flags = f & (v16i8)bDetaQ2Q0;

 1353     iTc += ((~flags) & const_1_b);

 1354     negiTc = zero - iTc;
 
 1363     t0 = (p2_l + ((p0_l + q0_l + const_1_h) >> 1) - (p1_l << 1)) >> 1;
 
 1364     t0 = __msa_max_s_h(negTc_l, t0);

 1365     t0 = __msa_min_s_h(tc_l, t0);
 
 1368     t0 = (q2_l + ((p0_l + q0_l + const_1_h) >> 1) - (q1_l << 1)) >> 1;
 
 1369     t0 = __msa_max_s_h(negTc_l, t0);

 1370     t0 = __msa_min_s_h(tc_l, t0);
 
 1373     t0 = (((q0_l - p0_l) << 2) + (p1_l - q1_l) + const_4_h) >> 3;
 
 1374     t0 = __msa_max_s_h(negiTc_l, t0);

 1375     t0 = __msa_min_s_h(iTc_l, t0);
 
 1380     t2 = t1 & const_not_255_h;

 1381     t3 = __msa_cle_s_h((v8i16)zero, t1);
 
 1386     t2 = t1 & const_not_255_h;

 1387     t3 = __msa_cle_s_h((v8i16)zero, t1);
 
 1393     t0 = (p2_r + ((p0_r + q0_r + const_1_h) >> 1) - (p1_r << 1)) >> 1;
 
 1394     t0 = __msa_max_s_h(negTc_r, t0);

 1395     t0 = __msa_min_s_h(tc_r, t0);
 
 1398     t0 = (q2_r + ((p0_r + q0_r + const_1_h) >> 1) - (q1_r << 1)) >> 1;
 
 1399     t0 = __msa_max_s_h(negTc_r, t0);

 1400     t0 = __msa_min_s_h(tc_r, t0);
 
 1403     t0 = (((q0_r - p0_r) << 2) + (p1_r - q1_r) + const_4_h) >> 3;
 
 1404     t0 = __msa_max_s_h(negiTc_r, t0);

 1405     t0 = __msa_min_s_h(iTc_r, t0);
 
 1410     t2 = t1 & const_not_255_h;

 1411     t3 = __msa_cle_s_h((v8i16)zero, t1);
 
 1416     t2 = t1 & const_not_255_h;

 1417     t3 = __msa_cle_s_h((v8i16)zero, t1);
 
 1422     PCKEV_B4(v8i16, p1_l, p1_r, p0_l, p0_r, q0_l, q0_r, q1_l, q1_r,
 
 1429     t1 = (v8i16)(flags & (~(__msa_ceq_b((v16i8)bDetaP2P0, zero))));

 1430     p1 = (v16u8)(t0 & t1) + (p1 & (v16u8)(~t1));

 1431     t2 = (v8i16)(flags & (~(__msa_ceq_b((v16i8)bDetaQ2Q0, zero))));
 
 1439     ST_W8(p1, p0, 0, 1, 2, 3, 0, 1, 2, 3, pPix - 2, iStride);
 
 1440     ST_W8(q0, q1, 0, 1, 2, 3, 0, 1, 2, 3, pPix + 8 * iStride - 2, iStride);
 
 1444                                                    uint8_t bs0, uint8_t bs1,
 
 1445                                                    uint8_t bs2, uint8_t bs3,
 
 1446                                                    uint8_t tc0, uint8_t tc1,
 
 1447                                                    uint8_t tc2, uint8_t tc3,
 
 1450                                                    ptrdiff_t image_width)
 
 1455     tmp_vec = (v16u8) __msa_fill_b(bs0);
 
 1456     bs = (v16u8) __msa_insve_w((v4i32) bs, 0, (v4i32) tmp_vec);
 
 1457     tmp_vec = (v16u8) __msa_fill_b(bs1);
 
 1458     bs = (v16u8) __msa_insve_w((v4i32) bs, 1, (v4i32) tmp_vec);
 
 1459     tmp_vec = (v16u8) __msa_fill_b(bs2);
 
 1460     bs = (v16u8) __msa_insve_w((v4i32) bs, 2, (v4i32) tmp_vec);
 
 1461     tmp_vec = (v16u8) __msa_fill_b(bs3);
 
 1462     bs = (v16u8) __msa_insve_w((v4i32) bs, 3, (v4i32) tmp_vec);
 
 1464     if (!__msa_test_bz_v(bs)) {
 
 1465         v16u8 alpha, beta, is_less_than, is_less_than_beta;

 1466         v16u8 p0, q0, p2_org, p1_org, p0_org, q0_org, q1_org, q2_org;
 
 1467         v16u8 p0_asub_q0, p1_asub_p0, q1_asub_q0;
 
 1468         v16u8 is_less_than_alpha, is_bs_greater_than0;
 
 1469         v8i16 p0_r, q0_r, p0_l, q0_l;
 
 1470         v8i16 p1_org_r, p0_org_r, q0_org_r, q1_org_r;
 
 1471         v8i16 p1_org_l, p0_org_l, q0_org_l, q1_org_l;
 
 1475         tmp_vec = (v16u8) __msa_fill_b(tc0);
 
 1476         tc = (v16i8) __msa_insve_w((v4i32) tc, 0, (v4i32) tmp_vec);

 1477         tmp_vec = (v16u8) __msa_fill_b(tc1);

 1478         tc = (v16i8) __msa_insve_w((v4i32) tc, 1, (v4i32) tmp_vec);

 1479         tmp_vec = (v16u8) __msa_fill_b(tc2);

 1480         tc = (v16i8) __msa_insve_w((v4i32) tc, 2, (v4i32) tmp_vec);

 1481         tmp_vec = (v16u8) __msa_fill_b(tc3);

 1482         tc = (v16i8) __msa_insve_w((v4i32) tc, 3, (v4i32) tmp_vec);
 
 1484         alpha = (v16u8) __msa_fill_b(alpha_in);
 
 1485         beta = (v16u8) __msa_fill_b(beta_in);
 
 1487         LD_UB5(data - (3 * image_width), image_width,
 
 1488                p2_org, p1_org, p0_org, q0_org, q1_org);
 
 1490         is_bs_greater_than0 = ((v16u8) zero < bs);
 
 1491         p0_asub_q0 = __msa_asub_u_b(p0_org, q0_org);
 
 1492         p1_asub_p0 = __msa_asub_u_b(p1_org, p0_org);
 
 1493         q1_asub_q0 = __msa_asub_u_b(q1_org, q0_org);
 
 1495         is_less_than_alpha = (p0_asub_q0 < alpha);
 
 1496         is_less_than_beta = (p1_asub_p0 < beta);
 
 1497         is_less_than = is_less_than_beta & is_less_than_alpha;
 
 1498         is_less_than_beta = (q1_asub_q0 < beta);
 
 1499         is_less_than = is_less_than_beta & is_less_than;
 
 1500         is_less_than = is_less_than & is_bs_greater_than0;
 
 1502         if (!__msa_test_bz_v(is_less_than)) {
 
 1503             v16i8 sign_negate_tc, negate_tc;
 
 1504             v8i16 negate_tc_r, i16_negatetc_l, tc_l, tc_r;
 
 1505             v16u8 p2_asub_p0, q2_asub_q0;
 
 1507             q2_org = LD_UB(data + (2 * image_width));
 
 1509             sign_negate_tc = __msa_clti_s_b(negate_tc, 0);
 
 1511             ILVRL_B2_SH(sign_negate_tc, negate_tc, negate_tc_r, i16_negatetc_l);
 
 1518             p2_asub_p0 = __msa_asub_u_b(p2_org, p0_org);
 
 1519             is_less_than_beta = (p2_asub_p0 < beta);
 
 1520             is_less_than_beta = is_less_than_beta & is_less_than;
 
 1522             if (!__msa_test_bz_v(is_less_than_beta)) {
 
 1526                 v8i16 p2_org_r = (v8i16) __msa_ilvr_b(zero, (v16i8) p2_org);

 1527                 v8i16 p2_org_l = (v8i16) __msa_ilvl_b(zero, (v16i8) p2_org);
 
 1530                                  negate_tc_r, tc_r, p1_r);
 
 1532                                  i16_negatetc_l, tc_l, p1_l);
 
 1534                 p1 = (v16u8) __msa_pckev_b((v16i8) p1_l, (v16i8) p1_r);
 
 1535                 p1_org = __msa_bmnz_v(p1_org, p1, is_less_than_beta);
 
 1536                 ST_UB(p1_org, data - (2 * image_width));
 
 1538                 is_less_than_beta = __msa_andi_b(is_less_than_beta, 1);
 
 1539                 tc = tc + (v16i8) is_less_than_beta;
 
 1542             q2_asub_q0 = __msa_asub_u_b(q2_org, q0_org);
 
 1543             is_less_than_beta = (q2_asub_q0 < beta);
 
 1544             is_less_than_beta = is_less_than_beta & is_less_than;
 
 1546             q1_org_r = (v8i16) __msa_ilvr_b(zero, (v16i8) q1_org);

 1547             q1_org_l = (v8i16) __msa_ilvl_b(zero, (v16i8) q1_org);
 
 1549             if (!__msa_test_bz_v(is_less_than_beta)) {
 
 1553                 v8i16 q2_org_r = (v8i16) __msa_ilvr_b(zero, (v16i8) q2_org);

 1554                 v8i16 q2_org_l = (v8i16) __msa_ilvl_b(zero, (v16i8) q2_org);
 
 1557                                  negate_tc_r, tc_r, q1_r);
 
 1559                                  i16_negatetc_l, tc_l, q1_l);
 
 1561                 q1 = (v16u8) __msa_pckev_b((v16i8) q1_l, (v16i8) q1_r);
 
 1562                 q1_org = __msa_bmnz_v(q1_org, q1, is_less_than_beta);
 
 1565                 is_less_than_beta = __msa_andi_b(is_less_than_beta, 1);
 
 1566                 tc = tc + (v16i8) is_less_than_beta;
 
 1569                 v16i8 negate_thresh, sign_negate_thresh;
 
 1570                 v8i16 threshold_r, threshold_l;
 
 1571                 v8i16 negate_thresh_l, negate_thresh_r;
 
 1573                 negate_thresh = zero - tc;
 
 1574                 sign_negate_thresh = __msa_clti_s_b(negate_thresh, 0);
 
 1577                            threshold_r, negate_thresh_r);
 
 1579                              negate_thresh_r, threshold_r, p0_r, q0_r);
 
 1581                 threshold_l = (v8i16) __msa_ilvl_b(zero, tc);
 
 1582                 negate_thresh_l = (v8i16) __msa_ilvl_b(sign_negate_thresh,
 
 1585                              negate_thresh_l, threshold_l, p0_l, q0_l);
 
 1590             p0_org = __msa_bmnz_v(p0_org, p0, is_less_than);
 
 1591             q0_org = __msa_bmnz_v(q0_org, q0, is_less_than);
 
 1604     uint32_t out0, out1, out2, out3;
 
 1617     v8i16 src4, src5, src6, src7;
 
 1618     v16u8 p0_asub_q0, p1_asub_p0, q1_asub_q0, p2_asub_p0, q2_asub_q0;
 
 1619     v16u8 is_less_than, is_less_than_alpha, is_less_than_beta;
 
 1620     v16u8 is_less_than_beta1, is_less_than_beta2;
 
 1621     v8i16 tc, tc_orig_r, tc_plus1;
 
 1622     v16u8 is_tc_orig1, is_tc_orig2, tc_orig = { 0 };
 
 1623     v8i16 p0_ilvr_q0, p0_add_q0, q0_sub_p0, p1_sub_q1;
 
 1624     v8i16 src2_r, src3_r;
 
 1625     v8i16 p2_r, p1_r, q2_r, q1_r;
 
 1626     v16u8 p2, q2, p0, q0;
 
 1628     v16i8 zeros = { 0 };
 
 1630     alpha = (v16u8) __msa_fill_b(alpha_in);
 
 1631     beta = (v16u8) __msa_fill_b(beta_in);
 
 1637         inp0 = (v16i8) __msa_insert_d((v2i64) inp0, 0, load);
 
 1639         inp1 = (v16i8) __msa_insert_d((v2i64) inp1, 0, load);
 
 1647         inp2 = (v16i8) __msa_insert_d((v2i64) inp2, 0, load);
 
 1649         inp3 = (v16i8) __msa_insert_d((v2i64) inp3, 0, load);
 
 1657         inp4 = (v16i8) __msa_insert_d((v2i64) inp4, 0, load);
 
 1659         inp5 = (v16i8) __msa_insert_d((v2i64) inp5, 0, load);
 
 1667         inp6 = (v16i8) __msa_insert_d((v2i64) inp6, 0, load);
 
 1669         inp7 = (v16i8) __msa_insert_d((v2i64) inp7, 0, load);
 
 1673     ILVR_B4_SB(inp1, inp0, inp3, inp2, inp5, inp4, inp7, inp6,
 
 1679     src0 = (v16i8) __msa_ilvr_w((v4i32) src6, (v4i32) src4);
 
 1680     src1 = __msa_sldi_b(zeros, (v16i8) src0, 8);
 
 1681     src2 = (v16i8) __msa_ilvl_w((v4i32) src6, (v4i32) src4);
 
 1682     src3 = __msa_sldi_b(zeros, (v16i8) src2, 8);
 
 1683     src4 = (v8i16) __msa_ilvr_w((v4i32) src7, (v4i32) src5);
 
 1684     src5 = (v8i16) __msa_sldi_b(zeros, (v16i8) src4, 8);
 
 1686     p0_asub_q0 = __msa_asub_u_b((v16u8) src2, (v16u8) src3);

 1687     p1_asub_p0 = __msa_asub_u_b((v16u8) src1, (v16u8) src2);
 
 1688     q1_asub_q0 = __msa_asub_u_b((v16u8) src4, (v16u8) src3);
 
 1689     p2_asub_p0 = __msa_asub_u_b((v16u8) src0, (v16u8) src2);
 
 1690     q2_asub_q0 = __msa_asub_u_b((v16u8) src5, (v16u8) src3);
 
 1692     is_less_than_alpha = (p0_asub_q0 < alpha);
 
 1693     is_less_than_beta = (p1_asub_p0 < beta);
 
 1694     is_less_than = is_less_than_alpha & is_less_than_beta;
 
 1695     is_less_than_beta = (q1_asub_q0 < beta);
 
 1696     is_less_than = is_less_than_beta & is_less_than;
 
 1698     is_less_than_beta1 = (p2_asub_p0 < beta);
 
 1699     is_less_than_beta2 = (q2_asub_q0 < beta);
 
 1701     p0_ilvr_q0 = (v8i16) __msa_ilvr_b((v16i8) src3, (v16i8) src2);
 
 1702     p0_add_q0 = (v8i16) __msa_hadd_u_h((v16u8) p0_ilvr_q0, (v16u8) p0_ilvr_q0);
 
 1703     p0_add_q0 = __msa_srari_h(p0_add_q0, 1);
 
 1709     ILVR_B2_SH(zeros, src5, zeros, src4, q2_r, q1_r);
 
 1715     tc_orig = (v16u8) __msa_insert_w((v4i32) tc_orig, 0, tc_val);
 
 1716     tc_orig = (v16u8) __msa_ilvr_b((v16i8) tc_orig, (v16i8) tc_orig);
 
 1717     is_tc_orig1 = tc_orig;
 
 1718     is_tc_orig2 = tc_orig;
 
 1719     tc_orig_r = (v8i16) __msa_ilvr_b(zeros, (v16i8) tc_orig);
 
 1722     CLIP_SH(p2_r, -tc_orig_r, tc_orig_r);
 
 1723     CLIP_SH(q2_r, -tc_orig_r, tc_orig_r);
 
 1730     is_tc_orig1 = (zeros < is_tc_orig1);
 
 1731     is_tc_orig2 = is_tc_orig1;
 
 1732     is_tc_orig1 = is_less_than_beta1 & is_tc_orig1;
 
 1733     is_tc_orig2 = is_less_than_beta2 & is_tc_orig2;
 
 1734     is_tc_orig1 = is_less_than & is_tc_orig1;
 
 1735     is_tc_orig2 = is_less_than & is_tc_orig2;
 
 1737     p2 = __msa_bmnz_v((v16u8) src1, p2, is_tc_orig1);
 
 1738     q2 = __msa_bmnz_v((v16u8) src4, q2, is_tc_orig2);
 
 1740     q0_sub_p0 = __msa_hsub_u_h((v16u8) p0_ilvr_q0, (v16u8) p0_ilvr_q0);
 
 1742     p1_sub_q1 = p1_r - q1_r;
 
 1743     q0_sub_p0 += p1_sub_q1;
 
 1744     q0_sub_p0 = __msa_srari_h(q0_sub_p0, 3);
 
 1747     is_less_than_beta1 = (v16u8) __msa_ilvr_b((v16i8) is_less_than_beta1,
 
 1748                                               (v16i8) is_less_than_beta1);
 
 1749     tc = (v8i16) __msa_bmnz_v((v16u8) tc, (v16u8) tc_plus1, is_less_than_beta1);
 
 1751     is_less_than_beta2 = (v16u8) __msa_ilvr_b((v16i8) is_less_than_beta2,
 
 1752                                               (v16i8) is_less_than_beta2);
 
 1753     tc = (v8i16) __msa_bmnz_v((v16u8) tc, (v16u8) tc_plus1, is_less_than_beta2);
 
 1758     src2_r += q0_sub_p0;
 
 1759     src3_r -= q0_sub_p0;
 
 1765     p0 = __msa_bmnz_v((v16u8) src2, p0, is_less_than);

 1766     q0 = __msa_bmnz_v((v16u8) src3, q0, is_less_than);
 
 1774     out0 = __msa_copy_u_w(dst0, 0);
 
 1775     out1 = __msa_copy_u_w(dst0, 1);
 
 1776     out2 = __msa_copy_u_w(dst0, 2);
 
 1777     out3 = __msa_copy_u_w(dst0, 3);
 
 1797     out0 = __msa_copy_u_w(dst1, 0);
 
 1798     out1 = __msa_copy_u_w(dst1, 1);
 
 1799     out2 = __msa_copy_u_w(dst1, 2);
 
 1800     out3 = __msa_copy_u_w(dst1, 3);
 
 1819                                                        uint8_t bs0, uint8_t bs1,
 
 1820                                                        uint8_t bs2, uint8_t bs3,
 
 1821                                                        uint8_t tc0, uint8_t tc1,
 
 1822                                                        uint8_t tc2, uint8_t tc3,
 
 1825                                                        ptrdiff_t img_width)
 
 1831     v16u8 p0, q0, p0_asub_q0, p1_asub_p0, q1_asub_q0;
 
 1833     v16u8 is_less_than_beta, is_less_than_alpha, is_bs_greater_than0;
 
 1835     v16u8 p1_org, p0_org, q0_org, q1_org;
 
 1836     v8i16 p1_org_r, p0_org_r, q0_org_r, q1_org_r;
 
 1837     v16i8 negate_tc, sign_negate_tc;
 
 1838     v8i16 tc_r, negate_tc_r;
 
 1841     tmp_vec = (v8i16) __msa_fill_b(bs0);
 
 1842     bs = __msa_insve_h(bs, 0, tmp_vec);
 
 1843     tmp_vec = (v8i16) __msa_fill_b(bs1);
 
 1844     bs = __msa_insve_h(bs, 1, tmp_vec);
 
 1845     tmp_vec = (v8i16) __msa_fill_b(bs2);
 
 1846     bs = __msa_insve_h(bs, 2, tmp_vec);
 
 1847     tmp_vec = (v8i16) __msa_fill_b(bs3);
 
 1848     bs = __msa_insve_h(bs, 3, tmp_vec);
 
 1850     if (!__msa_test_bz_v((v16u8) bs)) {
 
 1851         tmp_vec = (v8i16) __msa_fill_b(tc0);
 
 1852         tc = __msa_insve_h(tc, 0, tmp_vec);

 1853         tmp_vec = (v8i16) __msa_fill_b(tc1);

 1854         tc = __msa_insve_h(tc, 1, tmp_vec);

 1855         tmp_vec = (v8i16) __msa_fill_b(tc2);

 1856         tc = __msa_insve_h(tc, 2, tmp_vec);

 1857         tmp_vec = (v8i16) __msa_fill_b(tc3);

 1858         tc = __msa_insve_h(tc, 3, tmp_vec);

 1860         is_bs_greater_than0 = (v16u8) (zero < (v16i8) bs);
 
 1862         alpha = (v16u8) __msa_fill_b(alpha_in);
 
 1863         beta = (v16u8) __msa_fill_b(beta_in);
 
 1866                p1_org, p0_org, q0_org, q1_org);
 
 1868         p0_asub_q0 = __msa_asub_u_b(p0_org, q0_org);
 
 1869         p1_asub_p0 = __msa_asub_u_b(p1_org, p0_org);
 
 1870         q1_asub_q0 = __msa_asub_u_b(q1_org, q0_org);
 
 1872         is_less_than_alpha = (p0_asub_q0 < alpha);
 
 1873         is_less_than_beta = (p1_asub_p0 < beta);
 
 1874         is_less_than = is_less_than_beta & is_less_than_alpha;
 
 1875         is_less_than_beta = (q1_asub_q0 < beta);
 
 1876         is_less_than = is_less_than_beta & is_less_than;
 
 1877         is_less_than = is_less_than & is_bs_greater_than0;
 
 1879         is_less_than = (v16u8) __msa_ilvr_d((v2i64) zero, (v2i64) is_less_than);
 
 1881         if (!__msa_test_bz_v(is_less_than)) {
 
 1882             negate_tc = zero - (v16i8) tc;
 
 1883             sign_negate_tc = __msa_clti_s_b(negate_tc, 0);
 
 1888                        p1_org_r, p0_org_r, q0_org_r, q1_org_r);
 
 1890             AVC_LPF_P0Q0(q0_org_r, p0_org_r, p1_org_r, q1_org_r, negate_tc_r,
 
 1895             p0_org = __msa_bmnz_v(p0_org, p0, is_less_than);
 
 1896             q0_org = __msa_bmnz_v(q0_org, q0, is_less_than);
 
 1905                                                        uint8_t bs0, uint8_t bs1,
 
 1906                                                        uint8_t bs2, uint8_t bs3,
 
 1907                                                        uint8_t tc0, uint8_t tc1,
 
 1908                                                        uint8_t tc2, uint8_t tc3,
 
 1911                                                        ptrdiff_t img_width)
 
 1915     v16u8 p0_asub_q0, p1_asub_p0, q1_asub_q0;
 
 1916     v16u8 is_less_than, is_less_than_beta, is_less_than_alpha;
 
 1920     v16u8 p1_org, p0_org, q0_org, q1_org;
 
 1921     v8i16 p1_org_r, p0_org_r, q0_org_r, q1_org_r;
 
 1922     v16u8 is_bs_greater_than0;
 
 1923     v8i16 tc_r, negate_tc_r;
 
 1924     v16i8 negate_tc, sign_negate_tc;
 
 1926     v16u8 row0, row1, row2, row3, row4, row5, row6, row7;
 
 1927     v8i16 tmp1, tmp_vec, bs = { 0 };
 
 1930     tmp_vec = (v8i16) __msa_fill_b(bs0);
 
 1931     bs = __msa_insve_h(bs, 0, tmp_vec);
 
 1932     tmp_vec = (v8i16) __msa_fill_b(bs1);
 
 1933     bs = __msa_insve_h(bs, 1, tmp_vec);
 
 1934     tmp_vec = (v8i16) __msa_fill_b(bs2);
 
 1935     bs = __msa_insve_h(bs, 2, tmp_vec);
 
 1936     tmp_vec = (v8i16) __msa_fill_b(bs3);
 
 1937     bs = __msa_insve_h(bs, 3, tmp_vec);
 
 1939     if (!__msa_test_bz_v((v16u8) bs)) {
 
 1940         tmp_vec = (v8i16) __msa_fill_b(tc0);
 
 1941         tc = __msa_insve_h(tc, 0, tmp_vec);
 
 1942         tmp_vec = (v8i16) __msa_fill_b(tc1);
 
 1943         tc = __msa_insve_h(tc, 1, tmp_vec);
 
 1944         tmp_vec = (v8i16) __msa_fill_b(tc2);
 
 1945         tc = __msa_insve_h(tc, 2, tmp_vec);
 
 1946         tmp_vec = (v8i16) __msa_fill_b(tc3);
 
 1947         tc = __msa_insve_h(tc, 3, tmp_vec);
 
 1949         is_bs_greater_than0 = (v16u8) (zero < (v16i8) bs);
 
 1952                row0, row1, row2, row3, row4, row5, row6, row7);
 
 1955                            row4, row5, row6, row7,
 
 1956                            p1_org, p0_org, q0_org, q1_org);
 
 1958         p0_asub_q0 = __msa_asub_u_b(p0_org, q0_org);
 
 1959         p1_asub_p0 = __msa_asub_u_b(p1_org, p0_org);
 
 1960         q1_asub_q0 = __msa_asub_u_b(q1_org, q0_org);
 
 1962         alpha = (v16u8) __msa_fill_b(alpha_in);
 
 1963         beta = (v16u8) __msa_fill_b(beta_in);
 
 1965         is_less_than_alpha = (p0_asub_q0 < alpha);
 
 1966         is_less_than_beta = (p1_asub_p0 < beta);
 
 1967         is_less_than = is_less_than_beta & is_less_than_alpha;
 
 1968         is_less_than_beta = (q1_asub_q0 < beta);
 
 1969         is_less_than = is_less_than_beta & is_less_than;
 
 1970         is_less_than = is_bs_greater_than0 & is_less_than;
 
 1972         is_less_than = (v16u8) __msa_ilvr_d((v2i64) zero, (v2i64) is_less_than);
 
 1974         if (!__msa_test_bz_v(is_less_than)) {
 
 1976                        p1_org_r, p0_org_r, q0_org_r, q1_org_r);
 
 1978             negate_tc = zero - (v16i8) tc;
 
 1979             sign_negate_tc = __msa_clti_s_b(negate_tc, 0);
 
 1983             AVC_LPF_P0Q0(q0_org_r, p0_org_r, p1_org_r, q1_org_r, negate_tc_r,
 
 1988             p0_org = __msa_bmnz_v(p0_org, p0, is_less_than);
 
 1989             q0_org = __msa_bmnz_v(q0_org, q0, is_less_than);
 
 1990             tmp1 = (v8i16) __msa_ilvr_b((v16i8) q0_org, (v16i8) p0_org);
 
 1992             ST_H4(tmp1, 0, 1, 2, 3, src, img_width);
 
 1993             src += 4 * img_width;
 
 1994             ST_H4(tmp1, 4, 5, 6, 7, src, img_width);
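      /*
       * Vertical-edge (left) variant: eight rows are loaded, transposed so
       * that p1/p0/q0/q1 line up as vectors, filtered with the same
       * AVC_LPF_P0Q0 math, and the new p0/q0 bytes are re-interleaved with
       * __msa_ilvr_b and written back as two-byte (p0, q0) pairs, one per
       * row, straddling the edge via ST_H4.
       */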
 
 2004     v16u8 alpha, beta, res;
 
 2006     alpha = (v16u8) __msa_fill_b(alpha_in);
 
 2007     beta = (v16u8) __msa_fill_b(beta_in);
 
 2009     for (col = 0; col < 4; col++) {
 
 2010         tc_val = (tc0[col] - 1) + 1;
 
 2031     v16u8 alpha, beta, res;
 
 2033     alpha = (v16u8) __msa_fill_b(alpha_in);
 
 2034     beta = (v16u8) __msa_fill_b(beta_in);
 
 2036     for (col = 0; col < 4; col++) {
 
 2037         tc_val = (tc0[col] - 1) + 1;
 
 2046         out0 = __msa_copy_s_h((v8i16) res, 0);
 
 2047         out1 = __msa_copy_s_h((v8i16) res, 1);
 
 2049         SH(out0, (src - 1));
 
 2051         SH(out1, (src - 1));
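      /*
       * Each iteration of the per-column loop filters with its own tc0[col]
       * threshold; the filtered result is extracted as two 16-bit (p0, q0)
       * pairs (__msa_copy_s_h) and stored with SH at src - 1, i.e. one byte
       * on either side of the vertical edge, apparently advancing src by one
       * row between the two stores.
       */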
 
 2057                                   int alpha, int beta, int8_t *tc)
 
 2080                                   int alpha, int beta, int8_t *tc)
 
 2099                                            alpha, beta, img_width);
 
 2103                                     int alpha, int beta, int8_t *tc)
 
 2121                                                alpha, beta, img_width);
 
 2125                                     int alpha, int beta, int8_t *tc)
 
 2143                                                alpha, beta, img_width);
 
 2147                                   int alpha, int beta)
 
 2155                                   int alpha, int beta)
 
 2163                                     int alpha, int beta)
 
 2171                                     int alpha, int beta)
 
 2213                                    int height, int log2_denom,

 2214                                    int weight_src, int offset_in)
 
 2216     uint32_t offset_val;
 
 2219     v16u8 dst0, dst1, dst2, dst3, dst4, dst5, dst6, dst7;
 
 2220     v8i16 src0_l, src1_l, src2_l, src3_l, src0_r, src1_r, src2_r, src3_r;
 
 2221     v8i16 src4_l, src5_l, src6_l, src7_l, src4_r, src5_r, src6_r, src7_r;
 
 2222     v8i16 tmp0, tmp1, tmp2, tmp3, tmp4, tmp5, tmp6, tmp7;
 
 2223     v8i16 tmp8, tmp9, tmp10, tmp11, tmp12, tmp13, tmp14, tmp15;
 
 2224     v8i16 wgt, denom, offset;
 
 2226     offset_val = (unsigned) offset_in << log2_denom;
 
 2228     wgt = __msa_fill_h(weight_src);
 
 2229     offset = __msa_fill_h(offset_val);
 
 2230     denom = __msa_fill_h(log2_denom);
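      /*
       * Unidirectional weighted prediction, 16 columns per row: each byte is
       * widened to 16 bits, multiplied by the splatted weight, the
       * pre-shifted offset (offset_in << log2_denom) is added with
       * saturation, negatives are clamped to zero, then a rounding logical
       * shift by log2_denom and an unsigned saturation to 8 bits.  In scalar
       * terms this is roughly (sketch only, not valid for log2_denom == 0,
       * where SRLR adds no rounding bit):
       *
       *     out = av_clip_uint8((src * weight_src +
       *                          (offset_in << log2_denom) +
       *                          (1 << (log2_denom - 1))) >> log2_denom);
       */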
 
 2233     ILVR_B4_SH(zero, src0, zero, src1, zero, src2, zero, src3, src0_r, src1_r,
 
 2235     ILVL_B4_SH(zero, src0, zero, src1, zero, src2, zero, src3, src0_l, src1_l,
 
 2237     ILVR_B4_SH(zero, src4, zero, src5, zero, src6, zero, src7, src4_r, src5_r,
 
 2239     ILVL_B4_SH(zero, src4, zero, src5, zero, src6, zero, src7, src4_l, src5_l,
 
 2241     MUL4(wgt, src0_r, wgt, src0_l, wgt, src1_r, wgt, src1_l, tmp0, tmp1, tmp2,
 
 2243     MUL4(wgt, src2_r, wgt, src2_l, wgt, src3_r, wgt, src3_l, tmp4, tmp5, tmp6,
 
 2245     MUL4(wgt, src4_r, wgt, src4_l, wgt, src5_r, wgt, src5_l, tmp8, tmp9, tmp10,
 
 2247     MUL4(wgt, src6_r, wgt, src6_l, wgt, src7_r, wgt, src7_l, tmp12, tmp13,
 
 2254                 tmp9, tmp10, tmp11);
 
 2256                 tmp12, tmp13, tmp14, tmp15);
 
 2257     MAXI_SH8_SH(tmp0, tmp1, tmp2, tmp3, tmp4, tmp5, tmp6, tmp7, 0);
 
 2258     MAXI_SH8_SH(tmp8, tmp9, tmp10, tmp11, tmp12, tmp13, tmp14, tmp15, 0);
 
 2259     SRLR_H8_SH(tmp0, tmp1, tmp2, tmp3, tmp4, tmp5, tmp6, tmp7, denom);
 
 2260     SRLR_H8_SH(tmp8, tmp9, tmp10, tmp11, tmp12, tmp13, tmp14, tmp15, denom);
 
 2261     SAT_UH8_SH(tmp0, tmp1, tmp2, tmp3, tmp4, tmp5, tmp6, tmp7, 7);
 
 2262     SAT_UH8_SH(tmp8, tmp9, tmp10, tmp11, tmp12, tmp13, tmp14, tmp15, 7);
 
 2263     PCKEV_B4_UB(tmp1, tmp0, tmp3, tmp2, tmp5, tmp4, tmp7, tmp6, dst0, dst1,
 
 2265     PCKEV_B4_UB(tmp9, tmp8, tmp11, tmp10, tmp13, tmp12, tmp15, tmp14, dst4,
 
 2273                    src1_r, src2_r, src3_r);
 
 2275                    src1_l, src2_l, src3_l);
 
 2277                    src5_r, src6_r, src7_r);
 
 2279                    src5_l, src6_l, src7_l);
 
 2280         MUL4(wgt, src0_r, wgt, src0_l, wgt, src1_r, wgt, src1_l, tmp0, tmp1,
 
 2282         MUL4(wgt, src2_r, wgt, src2_l, wgt, src3_r, wgt, src3_l, tmp4, tmp5,
 
 2284         MUL4(wgt, src4_r, wgt, src4_l, wgt, src5_r, wgt, src5_l, tmp8, tmp9,
 
 2286         MUL4(wgt, src6_r, wgt, src6_l, wgt, src7_r, wgt, src7_l, tmp12, tmp13,
 
 2289                     tmp0, tmp1, tmp2, tmp3);
 
 2291                     tmp4, tmp5, tmp6, tmp7);
 
 2293                     tmp8, tmp9, tmp10, tmp11);
 
 2295                     tmp12, tmp13, tmp14, tmp15);
 
 2296         MAXI_SH8_SH(tmp0, tmp1, tmp2, tmp3, tmp4, tmp5, tmp6, tmp7, 0);
 
 2297         MAXI_SH8_SH(tmp8, tmp9, tmp10, tmp11, tmp12, tmp13, tmp14, tmp15, 0);
 
 2298         SRLR_H8_SH(tmp0, tmp1, tmp2, tmp3, tmp4, tmp5, tmp6, tmp7, denom);
 
 2299         SRLR_H8_SH(tmp8, tmp9, tmp10, tmp11, tmp12, tmp13, tmp14, tmp15, denom);
 
 2300         SAT_UH8_SH(tmp0, tmp1, tmp2, tmp3, tmp4, tmp5, tmp6, tmp7, 7);
 
 2301         SAT_UH8_SH(tmp8, tmp9, tmp10, tmp11, tmp12, tmp13, tmp14, tmp15, 7);
 
 2302         PCKEV_B4_UB(tmp1, tmp0, tmp3, tmp2, tmp5, tmp4, tmp7, tmp6, dst0, dst1,
 
 2304         PCKEV_B4_UB(tmp9, tmp8, tmp11, tmp10, tmp13, tmp12, tmp15, tmp14, dst4,
 
 2311                                   int height, int log2_denom,

 2312                                   int weight_src, int offset)
 
 2316     } else if (8 == height) {
 
 2324                                   int height, int log2_denom,

 2325                                   int weight_src, int offset)
 
 2329     } else if (4 == height) {
 
 2338                                      int log2_denom, int weight_dst,

 2339                                      int weight_src, int offset_in)
 
 2341     v16i8 src_wgt, dst_wgt, wgt;
 
 2343     v16u8 dst0, dst1, dst2, dst3, dst4, dst5, dst6, dst7;
 
 2344     v16i8 vec0, vec1, vec2, vec3, vec4, vec5, vec6, vec7;
 
 2345     v16i8 vec8, vec9, vec10, vec11, vec12, vec13, vec14, vec15;
 
 2346     v8i16 tmp0, tmp1, tmp2, tmp3, tmp4, tmp5, tmp6, tmp7;
 
 2347     v8i16 tmp8, tmp9, tmp10, tmp11, tmp12, tmp13, tmp14, tmp15;
 
 2350     offset_in = (unsigned) ((offset_in + 1) | 1) << log2_denom;
 
 2351     offset_in += (128 * (weight_src + weight_dst));
 
 2353     src_wgt = __msa_fill_b(weight_src);
 
 2354     dst_wgt = __msa_fill_b(weight_dst);
 
 2355     offset = __msa_fill_h(offset_in);
 
 2356     denom = __msa_fill_h(log2_denom + 1);
 
 2358     wgt = __msa_ilvev_b(dst_wgt, src_wgt);
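      /*
       * Bidirectional (bi-weighted) prediction: the source and destination
       * weights are interleaved byte-wise by __msa_ilvev_b, so each
       * __msa_dpadd_s_h below accumulates src * weight_src + dst * weight_dst
       * on top of the splatted offset.  offset_in is pre-biased above with
       * ((offset_in + 1) | 1) << log2_denom, the usual H.264 biweight
       * rounding term, plus 128 * (weight_src + weight_dst), which appears to
       * undo a signed bias applied to the pixel bytes before the dot product.
       * The sums are then arithmetically shifted by log2_denom + 1 and
       * clipped to [0, 255] (SRA_4V / CLIP_SH8_0_255).  A rough scalar
       * equivalent, ignoring the SIMD-only bias:
       *
       *     out = av_clip_uint8((src * weight_src + dst * weight_dst +
       *                          (((offset_in + 1) | 1) << log2_denom))
       *                         >> (log2_denom + 1));
       */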
 
 2362     LD_UB8(dst, stride, dst0, dst1, dst2, dst3, dst4, dst5, dst6, dst7);
 
 2365     ILVR_B4_SB(dst0, src0, dst1, src1, dst2, src2, dst3, src3, vec0, vec2, vec4,
 
 2367     ILVL_B4_SB(dst0, src0, dst1, src1, dst2, src2, dst3, src3, vec1, vec3, vec5,
 
 2369     ILVR_B4_SB(dst4, src4, dst5, src5, dst6, src6, dst7, src7, vec8, vec10,
 
 2371     ILVL_B4_SB(dst4, src4, dst5, src5, dst6, src6, dst7, src7, vec9, vec11,
 
 2373     tmp0 = __msa_dpadd_s_h(offset, wgt, vec0);

 2374     tmp1 = __msa_dpadd_s_h(offset, wgt, vec1);

 2375     tmp2 = __msa_dpadd_s_h(offset, wgt, vec2);

 2376     tmp3 = __msa_dpadd_s_h(offset, wgt, vec3);

 2377     tmp4 = __msa_dpadd_s_h(offset, wgt, vec4);

 2378     tmp5 = __msa_dpadd_s_h(offset, wgt, vec5);

 2379     tmp6 = __msa_dpadd_s_h(offset, wgt, vec6);

 2380     tmp7 = __msa_dpadd_s_h(offset, wgt, vec7);

 2381     tmp8 = __msa_dpadd_s_h(offset, wgt, vec8);

 2382     tmp9 = __msa_dpadd_s_h(offset, wgt, vec9);

 2383     tmp10 = __msa_dpadd_s_h(offset, wgt, vec10);

 2384     tmp11 = __msa_dpadd_s_h(offset, wgt, vec11);

 2385     tmp12 = __msa_dpadd_s_h(offset, wgt, vec12);

 2386     tmp13 = __msa_dpadd_s_h(offset, wgt, vec13);

 2387     tmp14 = __msa_dpadd_s_h(offset, wgt, vec14);

 2388     tmp15 = __msa_dpadd_s_h(offset, wgt, vec15);
 
 2389     SRA_4V(tmp0, tmp1, tmp2, tmp3, denom);
 
 2390     SRA_4V(tmp4, tmp5, tmp6, tmp7, denom);
 
 2391     SRA_4V(tmp8, tmp9, tmp10, tmp11, denom);
 
 2392     SRA_4V(tmp12, tmp13, tmp14, tmp15, denom);
 
 2394     CLIP_SH8_0_255(tmp8, tmp9, tmp10, tmp11, tmp12, tmp13, tmp14, tmp15);
 
 2395     PCKEV_B4_UB(tmp1, tmp0, tmp3, tmp2, tmp5, tmp4, tmp7, tmp6, dst0, dst1,
 
 2397     PCKEV_B4_UB(tmp9, tmp8, tmp11, tmp10, tmp13, tmp12, tmp15, tmp14, dst4,
 
 2399     ST_UB8(dst0, dst1, dst2, dst3, dst4, dst5, dst6, dst7, dst, stride);
 
 2404         LD_UB8(dst, stride, dst0, dst1, dst2, dst3, dst4, dst5, dst6, dst7);
 
 2407         ILVR_B4_SB(dst0, src0, dst1, src1, dst2, src2, dst3, src3, vec0, vec2,
 
 2409         ILVL_B4_SB(dst0, src0, dst1, src1, dst2, src2, dst3, src3, vec1, vec3,
 
 2411         ILVR_B4_SB(dst4, src4, dst5, src5, dst6, src6, dst7, src7, vec8, vec10,
 
 2413         ILVL_B4_SB(dst4, src4, dst5, src5, dst6, src6, dst7, src7, vec9, vec11,
 
 2415         tmp0 = __msa_dpadd_s_h(offset, wgt, vec0);

 2416         tmp1 = __msa_dpadd_s_h(offset, wgt, vec1);

 2417         tmp2 = __msa_dpadd_s_h(offset, wgt, vec2);

 2418         tmp3 = __msa_dpadd_s_h(offset, wgt, vec3);

 2419         tmp4 = __msa_dpadd_s_h(offset, wgt, vec4);

 2420         tmp5 = __msa_dpadd_s_h(offset, wgt, vec5);

 2421         tmp6 = __msa_dpadd_s_h(offset, wgt, vec6);

 2422         tmp7 = __msa_dpadd_s_h(offset, wgt, vec7);

 2423         tmp8 = __msa_dpadd_s_h(offset, wgt, vec8);

 2424         tmp9 = __msa_dpadd_s_h(offset, wgt, vec9);

 2425         tmp10 = __msa_dpadd_s_h(offset, wgt, vec10);

 2426         tmp11 = __msa_dpadd_s_h(offset, wgt, vec11);

 2427         tmp12 = __msa_dpadd_s_h(offset, wgt, vec12);

 2428         tmp13 = __msa_dpadd_s_h(offset, wgt, vec13);

 2429         tmp14 = __msa_dpadd_s_h(offset, wgt, vec14);

 2430         tmp15 = __msa_dpadd_s_h(offset, wgt, vec15);
 
 2431         SRA_4V(tmp0, tmp1, tmp2, tmp3, denom);
 
 2432         SRA_4V(tmp4, tmp5, tmp6, tmp7, denom);
 
 2433         SRA_4V(tmp8, tmp9, tmp10, tmp11, denom);
 
 2434         SRA_4V(tmp12, tmp13, tmp14, tmp15, denom);
 
 2436         CLIP_SH8_0_255(tmp8, tmp9, tmp10, tmp11, tmp12, tmp13, tmp14, tmp15);
 
 2437         PCKEV_B4_UB(tmp1, tmp0, tmp3, tmp2, tmp5, tmp4, tmp7, tmp6, dst0, dst1,
 
 2439         PCKEV_B4_UB(tmp9, tmp8, tmp11, tmp10, tmp13, tmp12, tmp15, tmp14, dst4,
 
 2441         ST_UB8(dst0, dst1, dst2, dst3, dst4, dst5, dst6, dst7, dst, stride);
 
 2447                                     int log2_denom, int weight_dst,

 2448                                     int weight_src, int offset)
 
 2453     } else if (8 == height) {
 
 2464                                     int log2_denom, int weight_dst,

 2465                                     int weight_src, int offset)
 
 2470     } else if (4 == height) {