doxygen/3.3/me__cmp__msa_8c_source.html

 /*

  * Copyright (c) 2015 Parag Salasakar (Parag.Salasakar@imgtec.com)

  *

  * This file is part of FFmpeg.

  *

  * FFmpeg is free software; you can redistribute it and/or

  * modify it under the terms of the GNU Lesser General Public

  * License as published by the Free Software Foundation; either

  * version 2.1 of the License, or (at your option) any later version.

  *

  * FFmpeg is distributed in the hope that it will be useful,

  * but WITHOUT ANY WARRANTY; without even the implied warranty of

  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU

  * Lesser General Public License for more details.

  *

  * You should have received a copy of the GNU Lesser General Public

  * License along with FFmpeg; if not, write to the Free Software

  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA

  */


 #include "libavutil/mips/generic_macros_msa.h"

 #include "me_cmp_mips.h"


 /*
  * Sum of absolute differences between an 8-pixel-wide src block and ref block.
  *
  * Rows are consumed four at a time: both pointers advance by 4 strides per
  * pass and the loop runs (height >> 2) times, so rows beyond the last
  * multiple of 4 are not included in the result.
  *
  * NOTE(review): the LD_UB4 / PCKEV_D4_UB / SAD_UB2_UH / HADD_UH_U32 helpers
  * live in generic_macros_msa.h; PCKEV_D4_UB presumably packs the low
  * doubleword of two row vectors into one vector -- confirm there.
  */
 static uint32_t sad_8width_msa(uint8_t *src, int32_t src_stride,
                                uint8_t *ref, int32_t ref_stride,
                                int32_t height)
 {
     int32_t quads_left = height >> 2;
     v16u8 s0, s1, s2, s3, r0, r1, r2, r3;
     v16u8 s01, s23, r01, r23;
     v8u16 acc = { 0 };

     while (quads_left--) {
         LD_UB4(src, src_stride, s0, s1, s2, s3);
         LD_UB4(ref, ref_stride, r0, r1, r2, r3);
         src += (4 * src_stride);
         ref += (4 * ref_stride);

         /* Combine rows (0,1) and (2,3) so two vectors cover four rows. */
         PCKEV_D4_UB(s1, s0, s3, s2, r1, r0, r3, r2, s01, s23, r01, r23);
         acc += SAD_UB2_UH(s01, s23, r01, r23);
     }

     /* Reduce the per-lane partial sums to a single scalar. */
     return (HADD_UH_U32(acc));
 }


 /*
  * Sum of absolute differences between a 16-pixel-wide src block and ref block.
  *
  * Each outer pass covers four rows, processed as two identical two-row
  * steps.  The pass count is (height >> 2), so rows beyond the last multiple
  * of 4 are not included in the result.
  */
 static uint32_t sad_16width_msa(uint8_t *src, int32_t src_stride,
                                 uint8_t *ref, int32_t ref_stride,
                                 int32_t height)
 {
     int32_t quads_left = height >> 2;
     int32_t step;
     v16u8 s0, s1, r0, r1;
     v8u16 acc = { 0 };

     while (quads_left--) {
         for (step = 0; step < 2; step++) {
             LD_UB2(src, src_stride, s0, s1);
             LD_UB2(ref, ref_stride, r0, r1);
             src += (2 * src_stride);
             ref += (2 * ref_stride);

             acc += SAD_UB2_UH(s0, s1, r0, r1);
         }
     }

     /* Reduce the per-lane partial sums to a single scalar. */
     return (HADD_UH_U32(acc));
 }


 /*
  * SAD between an 8-pixel-wide src block and the horizontally interpolated
  * ref block: every reference vector is averaged with a copy of itself slid
  * by one byte before being compared with src.
  *
  * Each outer pass covers eight rows as two identical four-row steps; the
  * pass count is (height >> 3), so rows beyond the last multiple of 8 are
  * not included in the result.
  */
 static uint32_t sad_horiz_bilinear_filter_8width_msa(uint8_t *src,
                                                      int32_t src_stride,
                                                      uint8_t *ref,
                                                      int32_t ref_stride,
                                                      int32_t height)
 {
     int32_t octets_left = height >> 3;
     int32_t half;
     v16u8 s0, s1, s2, s3, s01, s23;
     v16u8 r0, r1, r2, r3, r01, r23, r01_next, r23_next;
     v16u8 avg01, avg23;
     v8u16 acc = { 0 };

     while (octets_left--) {
         for (half = 0; half < 2; half++) {
             LD_UB4(src, src_stride, s0, s1, s2, s3);
             LD_UB4(ref, ref_stride, r0, r1, r2, r3);
             src += (4 * src_stride);
             ref += (4 * ref_stride);

             /* Two rows per vector for src and for the unshifted ref. */
             PCKEV_D2_UB(s1, s0, s3, s2, s01, s23);
             PCKEV_D2_UB(r1, r0, r3, r2, r01, r23);

             /* Slide every ref row by one byte, then pack the same way. */
             SLDI_B2_UB(r0, r1, r0, r1, r0, r1, 1);
             SLDI_B2_UB(r2, r3, r2, r3, r2, r3, 1);
             PCKEV_D2_UB(r1, r0, r3, r2, r01_next, r23_next);

             AVER_UB2_UB(r01, r01_next, r23, r23_next, avg01, avg23);
             acc += SAD_UB2_UH(s01, s23, avg01, avg23);
         }
     }

     /* Reduce the per-lane partial sums to a single scalar. */
     return (HADD_UH_U32(acc));
 }


 /*
  * SAD between a 16-pixel-wide src block and the horizontally interpolated
  * ref block: each reference row loaded at ref is averaged with the row
  * loaded at ref + 1 before being compared with src.
  *
  * Each outer pass covers eight rows as two identical four-row steps; the
  * pass count is (height >> 3), so rows beyond the last multiple of 8 are
  * not included in the result.
  */
 static uint32_t sad_horiz_bilinear_filter_16width_msa(uint8_t *src,
                                                       int32_t src_stride,
                                                       uint8_t *ref,
                                                       int32_t ref_stride,
                                                       int32_t height)
 {
     int32_t octets_left = height >> 3;
     int32_t half;
     v16u8 s0, s1, s2, s3;
     v16u8 cur0, cur1, cur2, cur3;   /* rows loaded at ref     */
     v16u8 nxt0, nxt1, nxt2, nxt3;   /* rows loaded at ref + 1 */
     v16u8 avg0, avg1, avg2, avg3;
     v8u16 acc = { 0 };

     while (octets_left--) {
         for (half = 0; half < 2; half++) {
             LD_UB4(src, src_stride, s0, s1, s2, s3);
             LD_UB4(ref, ref_stride, cur0, cur1, cur2, cur3);
             LD_UB4(ref + 1, ref_stride, nxt0, nxt1, nxt2, nxt3);
             src += (4 * src_stride);
             ref += (4 * ref_stride);

             AVER_UB2_UB(nxt0, cur0, nxt1, cur1, avg0, avg1);
             AVER_UB2_UB(nxt2, cur2, nxt3, cur3, avg2, avg3);

             acc += SAD_UB2_UH(s0, s1, avg0, avg1);
             acc += SAD_UB2_UH(s2, s3, avg2, avg3);
         }
     }

     /* Reduce the per-lane partial sums to a single scalar. */
     return (HADD_UH_U32(acc));
 }


 /*
  * SAD between an 8-pixel-wide src block and the vertically interpolated ref
  * block: each reference row is averaged with the row one stride below it
  * before being compared with src.
  *
  * Every four-row step loads five reference rows but advances ref by only
  * four strides, so the fifth row is loaded again as the first row of the
  * next step.  Each outer pass runs two such steps; the pass count is
  * (height >> 3), so rows beyond the last multiple of 8 are not included.
  */
 static uint32_t sad_vert_bilinear_filter_8width_msa(uint8_t *src,
                                                     int32_t src_stride,
                                                     uint8_t *ref,
                                                     int32_t ref_stride,
                                                     int32_t height)
 {
     int32_t octets_left = height >> 3;
     int32_t half;
     v16u8 s0, s1, s2, s3, s01, s23;
     v16u8 r0, r1, r2, r3, r4;
     v16u8 top01, bot12, top23, bot34;
     v16u8 avg01, avg23;
     v8u16 acc = { 0 };

     while (octets_left--) {
         for (half = 0; half < 2; half++) {
             LD_UB4(src, src_stride, s0, s1, s2, s3);
             LD_UB5(ref, ref_stride, r0, r1, r2, r3, r4);
             src += (4 * src_stride);
             ref += (4 * ref_stride);

             PCKEV_D2_UB(s1, s0, s3, s2, s01, s23);

             /* Rows (0,1) against rows (1,2), and rows (2,3) against (3,4). */
             PCKEV_D2_UB(r1, r0, r2, r1, top01, bot12);
             PCKEV_D2_UB(r3, r2, r4, r3, top23, bot34);

             AVER_UB2_UB(bot12, top01, bot34, top23, avg01, avg23);
             acc += SAD_UB2_UH(s01, s23, avg01, avg23);
         }
     }

     /* Reduce the per-lane partial sums to a single scalar. */
     return (HADD_UH_U32(acc));
 }


 /*
  * SAD between a 16-pixel-wide src block and the vertically interpolated ref
  * block: each reference row is averaged with the row preceding it before
  * being compared with src.
  *
  * One pass covers eight src rows.  The first half loads five reference rows
  * (ref advances by 5 strides); the second half reuses the last of those as
  * its leading row and loads four more (ref advances by 3 strides), for a
  * net advance of 8 strides per pass.  The pass count is (height >> 3), so
  * rows beyond the last multiple of 8 are not included in the result.
  */
 static uint32_t sad_vert_bilinear_filter_16width_msa(uint8_t *src,
                                                      int32_t src_stride,
                                                      uint8_t *ref,
                                                      int32_t ref_stride,
                                                      int32_t height)
 {
     int32_t octets_left = height >> 3;
     v16u8 s0, s1, s2, s3, avg_a, avg_b;
     v16u8 above, r0, r1, r2, r3;
     v8u16 acc = { 0 };

     while (octets_left--) {
         /* First half: "above" is the row loaded immediately before r0. */
         LD_UB5(ref, ref_stride, above, r0, r1, r2, r3);
         LD_UB4(src, src_stride, s0, s1, s2, s3);
         ref += (5 * ref_stride);
         src += (4 * src_stride);

         AVER_UB2_UB(r0, above, r1, r0, avg_a, avg_b);
         acc += SAD_UB2_UH(s0, s1, avg_a, avg_b);
         AVER_UB2_UB(r2, r1, r3, r2, avg_a, avg_b);
         acc += SAD_UB2_UH(s2, s3, avg_a, avg_b);

         /* Second half: the last row above becomes the new leading row. */
         above = r3;

         LD_UB4(ref, ref_stride, r0, r1, r2, r3);
         LD_UB4(src, src_stride, s0, s1, s2, s3);
         ref += (3 * ref_stride);
         src += (4 * src_stride);

         AVER_UB2_UB(r0, above, r1, r0, avg_a, avg_b);
         acc += SAD_UB2_UH(s0, s1, avg_a, avg_b);
         AVER_UB2_UB(r2, r1, r3, r2, avg_a, avg_b);
         acc += SAD_UB2_UH(s2, s3, avg_a, avg_b);
     }

     /* Reduce the per-lane partial sums to a single scalar. */
     return (HADD_UH_U32(acc));
 }


 /*
  * SAD between an 8-pixel-wide src block and a ref block interpolated both
  * horizontally and vertically.
  *
  * For each reference row the shuffle mask pairs byte i with byte i + 1
  * (i = 0..7) and __msa_hadd_u_h sums each pair into a halfword.  The sums of
  * two consecutive rows are added, shifted right by 2 with __msa_srari_h and
  * packed back to bytes before the absolute difference against src is taken.
  * NOTE(review): this relies on the MSA semantics of hadd_u (pairwise add),
  * srari (rounding shift) and pckev (pack even elements) -- confirm against
  * the MSA reference.
  *
  * Each pass covers four rows: five reference rows are loaded but ref only
  * advances by 4 strides, so the last row is loaded again by the next pass.
  * The pass count is (height >> 2); rows beyond the last multiple of 4 are
  * not included in the result.
  */
 static uint32_t sad_hv_bilinear_filter_8width_msa(uint8_t *src,
                                                   int32_t src_stride,
                                                   uint8_t *ref,
                                                   int32_t ref_stride,
                                                   int32_t height)
 {
     int32_t ht_cnt;
     v16u8 src0, src1, src2, src3, temp0, temp1, diff;
     v16u8 ref0, ref1, ref2, ref3, ref4;
     /* Shuffle indices: (0,1), (1,2), ... (7,8) -- each byte with its right
      * neighbour. */
     v16i8 mask = { 0, 1, 1, 2, 2, 3, 3, 4, 4, 5, 5, 6, 6, 7, 7, 8 };
     v8u16 comp0, comp1, comp2, comp3;
     v8u16 sad = { 0 };

     for (ht_cnt = (height >> 2); ht_cnt--;) {
         /* ref4 receives the first loaded row, ref0..ref3 the following four. */
         LD_UB5(ref, ref_stride, ref4, ref0, ref1, ref2, ref3);
         ref += (4 * ref_stride);
         LD_UB4(src, src_stride, src0, src1, src2, src3);
         src += (4 * src_stride);

         /* src0 now carries src rows 0 and 1, src1 carries rows 2 and 3. */
         PCKEV_D2_UB(src1, src0, src3, src2, src0, src1);

         /* Output row 0: neighbour sums of ref4 (comp0) plus ref0 (comp1). */
         VSHF_B2_UB(ref4, ref4, ref0, ref0, mask, mask, temp0, temp1);
         comp0 = __msa_hadd_u_h(temp0, temp0);
         comp1 = __msa_hadd_u_h(temp1, temp1);
         comp0 += comp1;
         comp0 = (v8u16) __msa_srari_h((v8i16) comp0, 2);
         comp0 = (v8u16) __msa_pckev_b((v16i8) comp0, (v16i8) comp0);

         /* Output row 1: comp1 still holds the ref0 sums; add the ref1 sums. */
         temp0 = (v16u8) __msa_vshf_b(mask, (v16i8) ref1, (v16i8) ref1);
         comp2 = __msa_hadd_u_h(temp0, temp0);
         comp1 += comp2;
         comp1 = (v8u16) __msa_srari_h((v8i16) comp1, 2);
         comp1 = (v8u16) __msa_pckev_b((v16i8) comp1, (v16i8) comp1);
         /* Join output rows 0 and 1 into one vector to match src0. */
         comp1 = (v8u16) __msa_pckev_d((v2i64) comp1, (v2i64) comp0);
         diff = (v16u8) __msa_asub_u_b(src0, (v16u8) comp1);
         sad += __msa_hadd_u_h(diff, diff);

         /* Output row 2: comp2 holds the ref1 sums; add the ref2 sums. */
         temp1 = (v16u8) __msa_vshf_b(mask, (v16i8) ref2, (v16i8) ref2);
         comp3 = __msa_hadd_u_h(temp1, temp1);
         comp2 += comp3;
         comp2 = (v8u16) __msa_srari_h((v8i16) comp2, 2);
         comp2 = (v8u16) __msa_pckev_b((v16i8) comp2, (v16i8) comp2);

         /* Output row 3: comp3 holds the ref2 sums; add the ref3 sums. */
         temp0 = (v16u8) __msa_vshf_b(mask, (v16i8) ref3, (v16i8) ref3);
         comp0 = __msa_hadd_u_h(temp0, temp0);
         comp3 += comp0;
         comp3 = (v8u16) __msa_srari_h((v8i16) comp3, 2);
         comp3 = (v8u16) __msa_pckev_b((v16i8) comp3, (v16i8) comp3);
         /* Join output rows 2 and 3 into one vector to match src1. */
         comp3 = (v8u16) __msa_pckev_d((v2i64) comp3, (v2i64) comp2);
         diff = (v16u8) __msa_asub_u_b(src1, (v16u8) comp3);
         sad += __msa_hadd_u_h(diff, diff);
     }

     /* Reduce the per-lane partial sums to a single scalar. */
     return (HADD_UH_U32(sad));
 }


 /*
  * SAD between a 16-pixel-wide src block and a ref block interpolated both
  * horizontally and vertically.
  *
  * Each reference row is loaded twice, at ref (ref0x) and at ref + 1 (ref1x).
  * The two copies are byte-interleaved and summed pairwise with
  * __msa_hadd_u_h, giving per-pixel "pixel + right neighbour" sums in two
  * halfword vectors per row.  Sums of two consecutive rows are added, shifted
  * right by 2 with SRARI_H2_UH, packed to bytes and compared with src.
  * NOTE(review): this relies on the MSA semantics of hadd_u / srari / pckev
  * and on ILVRL_B2_UB interleaving the low and high halves -- confirm against
  * generic_macros_msa.h and the MSA reference.
  *
  * The (comp0, comp1) and (comp2, comp3) pairs alternate between "row sums
  * just computed" and "accumulated, shifted result", so the statement order
  * must be preserved.  In particular (comp0, comp1) still hold the row sums
  * of ref03/ref13 when the second half of the pass starts.
  *
  * One pass covers eight src rows: ref advances by 5 strides in the first
  * half and 3 in the second.  The pass count is (height >> 3); rows beyond
  * the last multiple of 8 are not included in the result.
  */
 static uint32_t sad_hv_bilinear_filter_16width_msa(uint8_t *src,
                                                    int32_t src_stride,
                                                    uint8_t *ref,
                                                    int32_t ref_stride,
                                                    int32_t height)
 {
     int32_t ht_cnt;
     v16u8 src0, src1, src2, src3, comp, diff;
     v16u8 temp0, temp1, temp2, temp3;
     v16u8 ref00, ref01, ref02, ref03, ref04, ref10, ref11, ref12, ref13, ref14;
     v8u16 comp0, comp1, comp2, comp3;
     v8u16 sad = { 0 };

     for (ht_cnt = (height >> 3); ht_cnt--;) {
         /* ---- first half: src rows 0..3 of this pass ---- */
         LD_UB4(src, src_stride, src0, src1, src2, src3);
         src += (4 * src_stride);
         /* ref04/ref14 receive the first loaded row, ref0[0-3]/ref1[0-3] the
          * following four. */
         LD_UB5(ref, ref_stride, ref04, ref00, ref01, ref02, ref03);
         LD_UB5(ref + 1, ref_stride, ref14, ref10, ref11, ref12, ref13);
         ref += (5 * ref_stride);

         /* src0: sums of the leading row (comp0/1) plus the next row
          * (comp2/3); comp2/3 keep the plain row sums for the next step. */
         ILVRL_B2_UB(ref14, ref04, temp0, temp1);
         comp0 = __msa_hadd_u_h(temp0, temp0);
         comp1 = __msa_hadd_u_h(temp1, temp1);
         ILVRL_B2_UB(ref10, ref00, temp2, temp3);
         comp2 = __msa_hadd_u_h(temp2, temp2);
         comp3 = __msa_hadd_u_h(temp3, temp3);
         comp0 += comp2;
         comp1 += comp3;
         SRARI_H2_UH(comp0, comp1, 2);
         comp = (v16u8) __msa_pckev_b((v16i8) comp1, (v16i8) comp0);
         diff = __msa_asub_u_b(src0, comp);
         sad += __msa_hadd_u_h(diff, diff);

         /* src1: previous row sums (comp2/3) plus ref01/ref11 sums (comp0/1). */
         ILVRL_B2_UB(ref11, ref01, temp0, temp1);
         comp0 = __msa_hadd_u_h(temp0, temp0);
         comp1 = __msa_hadd_u_h(temp1, temp1);
         comp2 += comp0;
         comp3 += comp1;
         SRARI_H2_UH(comp2, comp3, 2);
         comp = (v16u8) __msa_pckev_b((v16i8) comp3, (v16i8) comp2);
         diff = __msa_asub_u_b(src1, comp);
         sad += __msa_hadd_u_h(diff, diff);

         /* src2: previous row sums (comp0/1) plus ref02/ref12 sums (comp2/3). */
         ILVRL_B2_UB(ref12, ref02, temp2, temp3);
         comp2 = __msa_hadd_u_h(temp2, temp2);
         comp3 = __msa_hadd_u_h(temp3, temp3);
         comp0 += comp2;
         comp1 += comp3;
         SRARI_H2_UH(comp0, comp1, 2);
         comp = (v16u8) __msa_pckev_b((v16i8) comp1, (v16i8) comp0);
         diff = __msa_asub_u_b(src2, comp);
         sad += __msa_hadd_u_h(diff, diff);

         /* src3: previous row sums (comp2/3) plus ref03/ref13 sums (comp0/1).
          * comp0/1 are left unshifted and carried into the second half. */
         ILVRL_B2_UB(ref13, ref03, temp0, temp1);
         comp0 = __msa_hadd_u_h(temp0, temp0);
         comp1 = __msa_hadd_u_h(temp1, temp1);
         comp2 += comp0;
         comp3 += comp1;
         SRARI_H2_UH(comp2, comp3, 2);
         comp = (v16u8) __msa_pckev_b((v16i8) comp3, (v16i8) comp2);
         diff = __msa_asub_u_b(src3, comp);
         sad += __msa_hadd_u_h(diff, diff);

         /* ---- second half: src rows 4..7 of this pass ---- */
         LD_UB4(src, src_stride, src0, src1, src2, src3);
         src += (4 * src_stride);
         LD_UB4(ref, ref_stride, ref00, ref01, ref02, ref03);
         LD_UB4(ref + 1, ref_stride, ref10, ref11, ref12, ref13);
         /* 5 + 3 = 8 strides per pass; the last row loaded here is loaded
          * again as the leading row of the next pass. */
         ref += (3 * ref_stride);

         /* src0: carried-over row sums (comp0/1) plus the new first row. */
         ILVRL_B2_UB(ref10, ref00, temp2, temp3);
         comp2 = __msa_hadd_u_h(temp2, temp2);
         comp3 = __msa_hadd_u_h(temp3, temp3);
         comp0 += comp2;
         comp1 += comp3;
         SRARI_H2_UH(comp0, comp1, 2);
         comp = (v16u8) __msa_pckev_b((v16i8) comp1, (v16i8) comp0);
         diff = __msa_asub_u_b(src0, comp);
         sad += __msa_hadd_u_h(diff, diff);

         /* src1 */
         ILVRL_B2_UB(ref11, ref01, temp0, temp1);
         comp0 = __msa_hadd_u_h(temp0, temp0);
         comp1 = __msa_hadd_u_h(temp1, temp1);
         comp2 += comp0;
         comp3 += comp1;
         SRARI_H2_UH(comp2, comp3, 2);
         comp = (v16u8) __msa_pckev_b((v16i8) comp3, (v16i8) comp2);
         diff = __msa_asub_u_b(src1, comp);
         sad += __msa_hadd_u_h(diff, diff);

         /* src2 */
         ILVRL_B2_UB(ref12, ref02, temp2, temp3);
         comp2 = __msa_hadd_u_h(temp2, temp2);
         comp3 = __msa_hadd_u_h(temp3, temp3);
         comp0 += comp2;
         comp1 += comp3;
         SRARI_H2_UH(comp0, comp1, 2);
         comp = (v16u8) __msa_pckev_b((v16i8) comp1, (v16i8) comp0);
         diff = __msa_asub_u_b(src2, comp);
         sad += __msa_hadd_u_h(diff, diff);

         /* src3 */
         ILVRL_B2_UB(ref13, ref03, temp0, temp1);
         comp0 = __msa_hadd_u_h(temp0, temp0);
         comp1 = __msa_hadd_u_h(temp1, temp1);
         comp2 += comp0;
         comp3 += comp1;
         SRARI_H2_UH(comp2, comp3, 2);
         comp = (v16u8) __msa_pckev_b((v16i8) comp3, (v16i8) comp2);
         diff = __msa_asub_u_b(src3, comp);
         sad += __msa_hadd_u_h(diff, diff);
     }

     /* Reduce the per-lane partial sums to a single scalar. */
     return (HADD_UH_U32(sad));
 }


 /*
  * Accumulate into var the squared differences between the bytes of src and
  * the bytes of ref.
  *
  * src and ref are byte-interleaved (ILVRL_B2_UB), adjacent bytes are
  * subtracted into signed halfwords (HSUB_UB2_SH), and each difference vector
  * is dot-multiplied with itself and added to var (DPADD_SH2_SW).
  * NOTE(review): the per-helper behaviour is taken from the helper names in
  * generic_macros_msa.h -- confirm there.
  *
  * var must be a v4i32 lvalue; it is updated in place.  Every continuation
  * line must directly follow the previous one: an empty line after a
  * trailing backslash would end the definition early.  The body is wrapped
  * in do { } while (0) so that "CALC_MSE_B(a, b, c);" is a single statement.
  */
 #define CALC_MSE_B(src, ref, var)                                    \
 do {                                                                 \
     v16u8 src_l0_m, src_l1_m;                                        \
     v8i16 res_l0_m, res_l1_m;                                        \
                                                                      \
     ILVRL_B2_UB(src, ref, src_l0_m, src_l1_m);                       \
     HSUB_UB2_SH(src_l0_m, src_l1_m, res_l0_m, res_l1_m);             \
     DPADD_SH2_SW(res_l0_m, res_l1_m, res_l0_m, res_l1_m, var, var);  \
 } while (0)


 /*
  * Sum of squared differences between a 4-pixel-wide src block and ref block.
  *
  * Four rows are read per pass as one 32-bit word each (LW4), the four words
  * are inserted into a single vector, and CALC_MSE_B accumulates the squared
  * differences.  The pass count is (height >> 2), so rows beyond the last
  * multiple of 4 are not included in the result.
  */
 static uint32_t sse_4width_msa(uint8_t *src_ptr, int32_t src_stride,
                                uint8_t *ref_ptr, int32_t ref_stride,
                                int32_t height)
 {
     int32_t quads_left = height >> 2;
     uint32_t s_w0, s_w1, s_w2, s_w3;
     uint32_t r_w0, r_w1, r_w2, r_w3;
     v16u8 src_vec = { 0 };
     v16u8 ref_vec = { 0 };
     v4i32 sq_acc = { 0 };

     while (quads_left--) {
         LW4(src_ptr, src_stride, s_w0, s_w1, s_w2, s_w3);
         LW4(ref_ptr, ref_stride, r_w0, r_w1, r_w2, r_w3);
         src_ptr += (4 * src_stride);
         ref_ptr += (4 * ref_stride);

         /* One word per row: four rows fill one 16-byte vector. */
         INSERT_W4_UB(s_w0, s_w1, s_w2, s_w3, src_vec);
         INSERT_W4_UB(r_w0, r_w1, r_w2, r_w3, ref_vec);
         CALC_MSE_B(src_vec, ref_vec, sq_acc);
     }

     /* Horizontal add of the four word lanes; converted to uint32_t. */
     return HADD_SW_S32(sq_acc);
 }


 /*
  * Sum of squared differences between an 8-pixel-wide src block and ref block.
  *
  * Four rows are loaded per pass and packed pairwise into two vectors before
  * CALC_MSE_B accumulates the squared differences.  The pass count is
  * (height >> 2), so rows beyond the last multiple of 4 are not included in
  * the result.
  */
 static uint32_t sse_8width_msa(uint8_t *src_ptr, int32_t src_stride,
                                uint8_t *ref_ptr, int32_t ref_stride,
                                int32_t height)
 {
     int32_t quads_left = height >> 2;
     v16u8 s0, s1, s2, s3, s01, s23;
     v16u8 r0, r1, r2, r3, r01, r23;
     v4i32 sq_acc = { 0 };

     while (quads_left--) {
         LD_UB4(src_ptr, src_stride, s0, s1, s2, s3);
         LD_UB4(ref_ptr, ref_stride, r0, r1, r2, r3);
         src_ptr += (4 * src_stride);
         ref_ptr += (4 * ref_stride);

         /* Combine rows (0,1) and (2,3) so two vectors cover four rows. */
         PCKEV_D4_UB(s1, s0, s3, s2, r1, r0, r3, r2, s01, s23, r01, r23);
         CALC_MSE_B(s01, r01, sq_acc);
         CALC_MSE_B(s23, r23, sq_acc);
     }

     /* Horizontal add of the four word lanes; converted to uint32_t. */
     return HADD_SW_S32(sq_acc);
 }


 /*
  * Sum of squared differences between a 16-pixel-wide src block and ref block.
  *
  * Rows are handled one vector at a time, four rows per outer pass.  The
  * pass count is (height >> 2), so rows beyond the last multiple of 4 are
  * not included in the result.
  */
 static uint32_t sse_16width_msa(uint8_t *src_ptr, int32_t src_stride,
                                 uint8_t *ref_ptr, int32_t ref_stride,
                                 int32_t height)
 {
     int32_t quads_left = height >> 2;
     int32_t row;
     v16u8 src_row, ref_row;
     v4i32 sq_acc = { 0 };

     while (quads_left--) {
         for (row = 0; row < 4; row++) {
             src_row = LD_UB(src_ptr);
             ref_row = LD_UB(ref_ptr);
             src_ptr += src_stride;
             ref_ptr += ref_stride;

             CALC_MSE_B(src_row, ref_row, sq_acc);
         }
     }

     /* Horizontal add of the four word lanes; converted to uint32_t. */
     return HADD_SW_S32(sq_acc);
 }


 /*
  * Hadamard-transformed difference score of one 8x8 block.
  *
  * Eight rows of src and ref are loaded, byte-interleaved and subtracted into
  * halfword differences.  The difference block is transposed, run through
  * three BUTTERFLY_8 stages, transposed again and run through two more
  * stages.  The remaining stage is folded into the final accumulation: ADD4
  * forms the temp[n] + temp[n + 4] terms and __msa_asub_s_h the
  * |temp[n] - temp[n + 4]| terms, and all absolute values are summed.
  * NOTE(review): exact semantics of HSUB_UB4_UH, BUTTERFLY_8 and
  * TRANSPOSE8x8_UH_UH are defined in generic_macros_msa.h -- confirm there.
  *
  * Returns the horizontal sum of the accumulated halfword lanes.
  */
 static int32_t hadamard_diff_8x8_msa(uint8_t *src, int32_t src_stride,
                                      uint8_t *ref, int32_t ref_stride)
 {
     v16u8 src0, src1, src2, src3, src4, src5, src6, src7;
     v16u8 ref0, ref1, ref2, ref3, ref4, ref5, ref6, ref7;
     v8u16 diff0, diff1, diff2, diff3, diff4, diff5, diff6, diff7;
     v8u16 temp0, temp1, temp2, temp3, temp4, temp5, temp6, temp7;
     /* The initial value of sum is overwritten by the first assignment below. */
     v8i16 sum = { 0 };
     v8i16 zero = { 0 };

     LD_UB8(src, src_stride, src0, src1, src2, src3, src4, src5, src6, src7);
     LD_UB8(ref, ref_stride, ref0, ref1, ref2, ref3, ref4, ref5, ref6, ref7);
     /* Interleave src/ref bytes so HSUB can form per-pixel differences. */
     ILVR_B8_UH(src0, ref0, src1, ref1, src2, ref2, src3, ref3,
                src4, ref4, src5, ref5, src6, ref6, src7, ref7,
                diff0, diff1, diff2, diff3, diff4, diff5, diff6, diff7);
     HSUB_UB4_UH(diff0, diff1, diff2, diff3, diff0, diff1, diff2, diff3);
     HSUB_UB4_UH(diff4, diff5, diff6, diff7, diff4, diff5, diff6, diff7);
     /* First pass: transpose, then three butterfly stages. */
     TRANSPOSE8x8_UH_UH(diff0, diff1, diff2, diff3, diff4, diff5, diff6, diff7,
                        diff0, diff1, diff2, diff3, diff4, diff5, diff6, diff7);
     BUTTERFLY_8(diff0, diff2, diff4, diff6, diff7, diff5, diff3, diff1,
                 temp0, temp2, temp4, temp6, temp7, temp5, temp3, temp1);
     BUTTERFLY_8(temp0, temp1, temp4, temp5, temp7, temp6, temp3, temp2,
                 diff0, diff1, diff4, diff5, diff7, diff6, diff3, diff2);
     BUTTERFLY_8(diff0, diff1, diff2, diff3, diff7, diff6, diff5, diff4,
                 temp0, temp1, temp2, temp3, temp7, temp6, temp5, temp4);
     /* Second pass: transpose, then two butterfly stages. */
     TRANSPOSE8x8_UH_UH(temp0, temp1, temp2, temp3, temp4, temp5, temp6, temp7,
                        temp0, temp1, temp2, temp3, temp4, temp5, temp6, temp7);
     BUTTERFLY_8(temp0, temp2, temp4, temp6, temp7, temp5, temp3, temp1,
                 diff0, diff2, diff4, diff6, diff7, diff5, diff3, diff1);
     BUTTERFLY_8(diff0, diff1, diff4, diff5, diff7, diff6, diff3, diff2,
                 temp0, temp1, temp4, temp5, temp7, temp6, temp3, temp2);
     /* Last stage folded into the sum: the "+" halves via ADD4 ... */
     ADD4(temp0, temp4, temp1, temp5, temp2, temp6, temp3, temp7,
          diff0, diff1, diff2, diff3);
     /* ... and the "-" halves as absolute differences. */
     sum = __msa_asub_s_h((v8i16) temp3, (v8i16) temp7);
     sum += __msa_asub_s_h((v8i16) temp2, (v8i16) temp6);
     sum += __msa_asub_s_h((v8i16) temp1, (v8i16) temp5);
     sum += __msa_asub_s_h((v8i16) temp0, (v8i16) temp4);
     /* add_a with a zero operand: presumably |diffN| + |0| -- TODO confirm. */
     sum += __msa_add_a_h((v8i16) diff0, zero);
     sum += __msa_add_a_h((v8i16) diff1, zero);
     sum += __msa_add_a_h((v8i16) diff2, zero);
     sum += __msa_add_a_h((v8i16) diff3, zero);

     return (HADD_UH_U32(sum));
 }


 /*
  * Hadamard-transformed score of one 8x8 src block on its own.
  *
  * ref and ref_stride are accepted but not referenced in the body.
  *
  * The eight src rows are transposed as bytes, widened to halfwords by
  * interleaving with zero, run through three BUTTERFLY_8 stages, transposed
  * and run through two more stages.  The remaining stage is folded into the
  * accumulation of absolute values (ADD4 for the sums, __msa_asub_s_h for the
  * differences).  Finally abs(temp0[0] + temp4[0]) is subtracted from the
  * horizontal sum; per the ADD4 argument order that is the same quantity as
  * lane 0 of diff0.
  * NOTE(review): exact semantics of BUTTERFLY_8, ADD4 and the TRANSPOSE
  * macros are defined in generic_macros_msa.h -- confirm there.
  */
 static int32_t hadamard_intra_8x8_msa(uint8_t *src, int32_t src_stride,
                                       uint8_t *ref, int32_t ref_stride)
 {
     int32_t sum_res = 0;
     v16u8 src0, src1, src2, src3, src4, src5, src6, src7;
     v8u16 diff0, diff1, diff2, diff3, diff4, diff5, diff6, diff7;
     v8u16 temp0, temp1, temp2, temp3, temp4, temp5, temp6, temp7;
     /* The initial value of sum is overwritten by the first assignment below. */
     v8i16 sum = { 0 };
     v16i8 zero = { 0 };

     LD_UB8(src, src_stride, src0, src1, src2, src3, src4, src5, src6, src7);
     TRANSPOSE8x8_UB_UB(src0, src1, src2, src3, src4, src5, src6, src7,
                        src0, src1, src2, src3, src4, src5, src6, src7);
     /* Interleave with zero to widen the bytes to halfwords. */
     ILVR_B8_UH(zero, src0, zero, src1, zero, src2, zero, src3,
                zero, src4, zero, src5, zero, src6, zero, src7,
                diff0, diff1, diff2, diff3, diff4, diff5, diff6, diff7);
     /* First pass: three butterfly stages. */
     BUTTERFLY_8(diff0, diff2, diff4, diff6, diff7, diff5, diff3, diff1,
                 temp0, temp2, temp4, temp6, temp7, temp5, temp3, temp1);
     BUTTERFLY_8(temp0, temp1, temp4, temp5, temp7, temp6, temp3, temp2,
                 diff0, diff1, diff4, diff5, diff7, diff6, diff3, diff2);
     BUTTERFLY_8(diff0, diff1, diff2, diff3, diff7, diff6, diff5, diff4,
                 temp0, temp1, temp2, temp3, temp7, temp6, temp5, temp4);
     /* Second pass: transpose, then two butterfly stages. */
     TRANSPOSE8x8_UH_UH(temp0, temp1, temp2, temp3, temp4, temp5, temp6, temp7,
                        temp0, temp1, temp2, temp3, temp4, temp5, temp6, temp7);
     BUTTERFLY_8(temp0, temp2, temp4, temp6, temp7, temp5, temp3, temp1,
                 diff0, diff2, diff4, diff6, diff7, diff5, diff3, diff1);
     BUTTERFLY_8(diff0, diff1, diff4, diff5, diff7, diff6, diff3, diff2,
                 temp0, temp1, temp4, temp5, temp7, temp6, temp3, temp2);
     /* Last stage folded into the sum: the "+" halves via ADD4 ... */
     ADD4(temp0, temp4, temp1, temp5, temp2, temp6, temp3, temp7,
          diff0, diff1, diff2, diff3);
     /* ... and the "-" halves as absolute differences. */
     sum = __msa_asub_s_h((v8i16) temp3, (v8i16) temp7);
     sum += __msa_asub_s_h((v8i16) temp2, (v8i16) temp6);
     sum += __msa_asub_s_h((v8i16) temp1, (v8i16) temp5);
     sum += __msa_asub_s_h((v8i16) temp0, (v8i16) temp4);
     /* add_a with a zero operand: presumably |diffN| + |0| -- TODO confirm. */
     sum += __msa_add_a_h((v8i16) diff0, (v8i16) zero);
     sum += __msa_add_a_h((v8i16) diff1, (v8i16) zero);
     sum += __msa_add_a_h((v8i16) diff2, (v8i16) zero);
     sum += __msa_add_a_h((v8i16) diff3, (v8i16) zero);
     sum_res = (HADD_UH_U32(sum));
     /* Remove the lane-0 "temp0 + temp4" term from the total. */
     sum_res -= abs(temp0[0] + temp4[0]);

     return sum_res;
 }


 /*
  * Exported 16-pixel-wide SAD entry point.
  *
  * The context argument v is not used.  src and ref are compared with the
  * same stride; the work is done by sad_16width_msa().
  */
 int ff_pix_abs16_msa(MpegEncContext *v, uint8_t *src, uint8_t *ref,
                      ptrdiff_t stride, int height)
 {
     const uint32_t sad = sad_16width_msa(src, stride, ref, stride, height);

     return sad;
 }


 /*
  * Exported 8-pixel-wide SAD entry point.
  *
  * The context argument v is not used.  src and ref are compared with the
  * same stride; the work is done by sad_8width_msa().
  */
 int ff_pix_abs8_msa(MpegEncContext *v, uint8_t *src, uint8_t *ref,
                     ptrdiff_t stride, int height)
 {
     const uint32_t sad = sad_8width_msa(src, stride, ref, stride, height);

     return sad;
 }


 /*
  * Exported 16-pixel-wide SAD entry point with horizontal interpolation of
  * pix2.
  *
  * The context argument v is not used.  pix1 is passed as the source and pix2
  * as the reference of sad_horiz_bilinear_filter_16width_msa(), both with the
  * same stride.
  */
 int ff_pix_abs16_x2_msa(MpegEncContext *v, uint8_t *pix1, uint8_t *pix2,
                         ptrdiff_t stride, int h)
 {
     const uint32_t sad =
         sad_horiz_bilinear_filter_16width_msa(pix1, stride, pix2, stride, h);

     return sad;
 }


 /*
  * Exported 16-pixel-wide SAD entry point with vertical interpolation of
  * pix2.
  *
  * The context argument v is not used.  pix1 is passed as the source and pix2
  * as the reference of sad_vert_bilinear_filter_16width_msa(), both with the
  * same stride.
  */
 int ff_pix_abs16_y2_msa(MpegEncContext *v, uint8_t *pix1, uint8_t *pix2,
                         ptrdiff_t stride, int h)
 {
     const uint32_t sad =
         sad_vert_bilinear_filter_16width_msa(pix1, stride, pix2, stride, h);

     return sad;
 }


 /*
  * Exported 16-pixel-wide SAD entry point with horizontal and vertical
  * interpolation of pix2.
  *
  * The context argument v is not used.  pix1 is passed as the source and pix2
  * as the reference of sad_hv_bilinear_filter_16width_msa(), both with the
  * same stride.
  */
 int ff_pix_abs16_xy2_msa(MpegEncContext *v, uint8_t *pix1, uint8_t *pix2,
                          ptrdiff_t stride, int h)
 {
     const uint32_t sad =
         sad_hv_bilinear_filter_16width_msa(pix1, stride, pix2, stride, h);

     return sad;
 }


 /*
  * Exported 8-pixel-wide SAD entry point with horizontal interpolation of
  * pix2.
  *
  * The context argument v is not used.  pix1 is passed as the source and pix2
  * as the reference of sad_horiz_bilinear_filter_8width_msa(), both with the
  * same stride.
  */
 int ff_pix_abs8_x2_msa(MpegEncContext *v, uint8_t *pix1, uint8_t *pix2,
                        ptrdiff_t stride, int h)
 {
     const uint32_t sad =
         sad_horiz_bilinear_filter_8width_msa(pix1, stride, pix2, stride, h);

     return sad;
 }


 /*
  * Exported 8-pixel-wide SAD entry point with vertical interpolation of pix2.
  *
  * The context argument v is not used.  pix1 is passed as the source and pix2
  * as the reference of sad_vert_bilinear_filter_8width_msa(), both with the
  * same stride.
  */
 int ff_pix_abs8_y2_msa(MpegEncContext *v, uint8_t *pix1, uint8_t *pix2,
                        ptrdiff_t stride, int h)
 {
     const uint32_t sad =
         sad_vert_bilinear_filter_8width_msa(pix1, stride, pix2, stride, h);

     return sad;
 }


 /*
  * Exported 8-pixel-wide SAD entry point with horizontal and vertical
  * interpolation of pix2.
  *
  * The context argument v is not used.  pix1 is passed as the source and pix2
  * as the reference of sad_hv_bilinear_filter_8width_msa(), both with the
  * same stride.
  */
 int ff_pix_abs8_xy2_msa(MpegEncContext *v, uint8_t *pix1, uint8_t *pix2,
                         ptrdiff_t stride, int h)
 {
     const uint32_t sad =
         sad_hv_bilinear_filter_8width_msa(pix1, stride, pix2, stride, h);

     return sad;
 }


 /*
  * Exported 16-pixel-wide sum-of-squared-errors entry point.
  *
  * The context argument v is not used.  src and ref are compared with the
  * same stride; the work is done by sse_16width_msa().
  */
 int ff_sse16_msa(MpegEncContext *v, uint8_t *src, uint8_t *ref,
                  ptrdiff_t stride, int height)
 {
     const uint32_t sse = sse_16width_msa(src, stride, ref, stride, height);

     return sse;
 }


 /*
  * Exported 8-pixel-wide sum-of-squared-errors entry point.
  *
  * The context argument v is not used.  src and ref are compared with the
  * same stride; the work is done by sse_8width_msa().
  */
 int ff_sse8_msa(MpegEncContext *v, uint8_t *src, uint8_t *ref,
                 ptrdiff_t stride, int height)
 {
     const uint32_t sse = sse_8width_msa(src, stride, ref, stride, height);

     return sse;
 }


 /*
  * Exported 4-pixel-wide sum-of-squared-errors entry point.
  *
  * The context argument v is not used.  src and ref are compared with the
  * same stride; the work is done by sse_4width_msa().
  */
 int ff_sse4_msa(MpegEncContext *v, uint8_t *src, uint8_t *ref,
                 ptrdiff_t stride, int height)
 {
     const uint32_t sse = sse_4width_msa(src, stride, ref, stride, height);

     return sse;
 }


 /*
  * Exported 8x8 Hadamard difference entry point.
  *
  * The context argument s and the height h are not used.  Note the argument
  * order: this function's src is handed to hadamard_diff_8x8_msa() as its
  * source block and dst as its reference block, both with the same stride.
  */
 int ff_hadamard8_diff8x8_msa(MpegEncContext *s, uint8_t *dst, uint8_t *src,
                              ptrdiff_t stride, int h)
 {
     const int32_t score = hadamard_diff_8x8_msa(src, stride, dst, stride);

     return score;
 }


 /*
  * Exported 8x8 Hadamard intra entry point.
  *
  * The context argument s and the height h are not used.  Note the argument
  * order: this function's src is handed to hadamard_intra_8x8_msa() as its
  * source block and dst as its reference block, both with the same stride.
  */
 int ff_hadamard8_intra8x8_msa(MpegEncContext *s, uint8_t *dst, uint8_t *src,
                               ptrdiff_t stride, int h)
 {
     const int32_t score = hadamard_intra_8x8_msa(src, stride, dst, stride);

     return score;
 }


 /* Hadamard Transform functions */

 /*
  * Define name16(), a 16-pixel-wide scorer assembled from the 8x8 scorer
  * name8().
  *
  * The generated function always scores the two 8x8 blocks of the top half
  * (at offsets 0 and 8).  Only when h == 16 does it also score the two blocks
  * 8 rows further down; any other h yields the top-half score alone.  name8()
  * is always called with a height argument of 8, and the individual scores
  * are summed.
  *
  * Every continuation line must directly follow the previous one: an empty
  * line after a trailing backslash would end the definition early.
  */
 #define WRAPPER8_16_SQ(name8, name16)                      \
 int name16(MpegEncContext *s, uint8_t *dst, uint8_t *src,  \
            ptrdiff_t stride, int h)                        \
 {                                                          \
     int score = 0;                                         \
     score += name8(s, dst, src, stride, 8);                \
     score += name8(s, dst + 8, src + 8, stride, 8);        \
     if (h == 16) {                                         \
         dst += 8 * stride;                                 \
         src += 8 * stride;                                 \
         score += name8(s, dst, src, stride, 8);            \
         score += name8(s, dst + 8, src + 8, stride, 8);    \
     }                                                      \
     return score;                                          \
 }


 /*
  * Instantiate the 16-wide Hadamard entry points: each expansion defines the
  * function named by the second argument in terms of the 8x8 function named
  * by the first.
  */
 WRAPPER8_16_SQ(ff_hadamard8_diff8x8_msa, ff_hadamard8_diff16_msa);
 WRAPPER8_16_SQ(ff_hadamard8_intra8x8_msa, ff_hadamard8_intra16_msa);

s
const char * s
Definition: avisynth_c.h:768

ff_pix_abs8_y2_msa
int ff_pix_abs8_y2_msa(MpegEncContext *v, uint8_t *pix1, uint8_t *pix2, ptrdiff_t stride, int h)
Definition: me_cmp_msa.c:626

h
h
Definition: vp9dsp_template.c:2038

ff_pix_abs16_y2_msa
int ff_pix_abs16_y2_msa(MpegEncContext *v, uint8_t *pix1, uint8_t *pix2, ptrdiff_t stride, int h)
Definition: me_cmp_msa.c:608

ff_sse8_msa
int ff_sse8_msa(MpegEncContext *v, uint8_t *src, uint8_t *ref, ptrdiff_t stride, int height)
Definition: me_cmp_msa.c:644

ff_pix_abs16_xy2_msa
int ff_pix_abs16_xy2_msa(MpegEncContext *v, uint8_t *pix1, uint8_t *pix2, ptrdiff_t stride, int h)
Definition: me_cmp_msa.c:614

SAD_UB2_UH
#define SAD_UB2_UH(in0, in1, ref0, ref1)
Definition: generic_macros_msa.h:1318

ff_hadamard8_diff16_msa
int ff_hadamard8_diff16_msa(MpegEncContext *s, uint8_t *dst, uint8_t *src, ptrdiff_t stride, int h)

sse
static int sse(MpegEncContext *s, uint8_t *src1, uint8_t *src2, int w, int h, int stride)
Definition: mpegvideo_enc.c:2793

ff_hadamard8_diff8x8_msa
int ff_hadamard8_diff8x8_msa(MpegEncContext *s, uint8_t *dst, uint8_t *src, ptrdiff_t stride, int h)
Definition: me_cmp_msa.c:656

ff_pix_abs8_xy2_msa
int ff_pix_abs8_xy2_msa(MpegEncContext *v, uint8_t *pix1, uint8_t *pix2, ptrdiff_t stride, int h)
Definition: me_cmp_msa.c:632

LD_UB4
#define LD_UB4(...)
Definition: generic_macros_msa.h:323

ff_pix_abs8_msa
int ff_pix_abs8_msa(MpegEncContext *v, uint8_t *src, uint8_t *ref, ptrdiff_t stride, int height)
Definition: me_cmp_msa.c:596

src
#define src
Definition: vp8dsp.c:254

ff_pix_abs16_x2_msa
int ff_pix_abs16_x2_msa(MpegEncContext *v, uint8_t *pix1, uint8_t *pix2, ptrdiff_t stride, int h)
Definition: me_cmp_msa.c:602

ff_hadamard8_intra8x8_msa
int ff_hadamard8_intra8x8_msa(MpegEncContext *s, uint8_t *dst, uint8_t *src, ptrdiff_t stride, int h)
Definition: me_cmp_msa.c:662

hadamard_diff_8x8_msa
static int32_t hadamard_diff_8x8_msa(uint8_t *src, int32_t src_stride, uint8_t *ref, int32_t ref_stride)
Definition: me_cmp_msa.c:501

uint8_t
uint8_t
Definition: audio_convert.c:194

LD_UB2
#define LD_UB2(...)
Definition: generic_macros_msa.h:307

HSUB_UB4_UH
#define HSUB_UB4_UH(...)
Definition: generic_macros_msa.h:1306

ILVRL_B2_UB
#define ILVRL_B2_UB(...)
Definition: generic_macros_msa.h:1673

height
#define height

CALC_MSE_B
#define CALC_MSE_B(src, ref, var)
Definition: me_cmp_msa.c:396

LD_UB5
#define LD_UB5(...)
Definition: generic_macros_msa.h:331

ADD4
#define ADD4(in0, in1, in2, in3, in4, in5, in6, in7, out0, out1, out2, out3)
Definition: generic_macros_msa.h:2258

ff_pix_abs8_x2_msa
int ff_pix_abs8_x2_msa(MpegEncContext *v, uint8_t *pix1, uint8_t *pix2, ptrdiff_t stride, int h)
Definition: me_cmp_msa.c:620

sad_hv_bilinear_filter_16width_msa
static uint32_t sad_hv_bilinear_filter_16width_msa(uint8_t *src, int32_t src_stride, uint8_t *ref, int32_t ref_stride, int32_t height)
Definition: me_cmp_msa.c:283

mask
static const uint16_t mask[17]
Definition: lzw.c:38

generic_macros_msa.h

zero
#define zero
Definition: regdef.h:64

PCKEV_D4_UB
#define PCKEV_D4_UB(...)
Definition: generic_macros_msa.h:1956

TRANSPOSE8x8_UH_UH
#define TRANSPOSE8x8_UH_UH(...)
Definition: generic_macros_msa.h:2624

sad_vert_bilinear_filter_16width_msa
static uint32_t sad_vert_bilinear_filter_16width_msa(uint8_t *src, int32_t src_stride, uint8_t *ref, int32_t ref_stride, int32_t height)
Definition: me_cmp_msa.c:189

sad_16width_msa
static uint32_t sad_16width_msa(uint8_t *src, int32_t src_stride, uint8_t *ref, int32_t ref_stride, int32_t height)
Definition: me_cmp_msa.c:46

sad_8width_msa
static uint32_t sad_8width_msa(uint8_t *src, int32_t src_stride, uint8_t *ref, int32_t ref_stride, int32_t height)
Definition: me_cmp_msa.c:24

ILVR_B8_UH
#define ILVR_B8_UH(...)
Definition: generic_macros_msa.h:1571

SLDI_B2_UB
#define SLDI_B2_UB(...)
Definition: generic_macros_msa.h:851

hadamard_intra_8x8_msa
static int32_t hadamard_intra_8x8_msa(uint8_t *src, int32_t src_stride, uint8_t *ref, int32_t ref_stride)
Definition: me_cmp_msa.c:546

ff_sse16_msa
int ff_sse16_msa(MpegEncContext *v, uint8_t *src, uint8_t *ref, ptrdiff_t stride, int height)
Definition: me_cmp_msa.c:638

LD_UB8
#define LD_UB8(...)
Definition: generic_macros_msa.h:357

ff_sse4_msa
int ff_sse4_msa(MpegEncContext *v, uint8_t *src, uint8_t *ref, ptrdiff_t stride, int height)
Definition: me_cmp_msa.c:650

SRARI_H2_UH
#define SRARI_H2_UH(...)
Definition: generic_macros_msa.h:2193

int32_t
int32_t
Definition: audio_convert.c:194

AVER_UB2_UB
#define AVER_UB2_UB(...)
Definition: generic_macros_msa.h:793

BUTTERFLY_8
#define BUTTERFLY_8(in0, in1, in2, in3, in4, in5, in6, in7,out0, out1, out2, out3, out4, out5, out6, out7)
Definition: generic_macros_msa.h:2382

sad_horiz_bilinear_filter_8width_msa
static uint32_t sad_horiz_bilinear_filter_8width_msa(uint8_t *src, int32_t src_stride, uint8_t *ref, int32_t ref_stride, int32_t height)
Definition: me_cmp_msa.c:71

sse_8width_msa
static uint32_t sse_8width_msa(uint8_t *src_ptr, int32_t src_stride, uint8_t *ref_ptr, int32_t ref_stride, int32_t height)
Definition: me_cmp_msa.c:434

INSERT_W4_UB
#define INSERT_W4_UB(...)
Definition: generic_macros_msa.h:1353

sse_16width_msa
static uint32_t sse_16width_msa(uint8_t *src_ptr, int32_t src_stride, uint8_t *ref_ptr, int32_t ref_stride, int32_t height)
Definition: me_cmp_msa.c:461

comp
static void comp(unsigned char *dst, ptrdiff_t dst_stride, unsigned char *src, ptrdiff_t src_stride, int add)
Definition: eamad.c:83

src1
#define src1
Definition: h264pred.c:139

TRANSPOSE8x8_UB_UB
#define TRANSPOSE8x8_UB_UB(...)
Definition: generic_macros_msa.h:2493

HADD_UH_U32
#define HADD_UH_U32(in)
Definition: generic_macros_msa.h:1217

src0
#define src0
Definition: h264pred.c:138

WRAPPER8_16_SQ
#define WRAPPER8_16_SQ(name8, name16)
Definition: me_cmp_msa.c:669

MpegEncContext
MpegEncContext.
Definition: mpegvideo.h:78

sse_4width_msa
static uint32_t sse_4width_msa(uint8_t *src_ptr, int32_t src_stride, uint8_t *ref_ptr, int32_t ref_stride, int32_t height)
Definition: me_cmp_msa.c:406

stride
GLint GLenum GLboolean GLsizei stride
Definition: opengl_enc.c:105

LW4
#define LW4(psrc, stride, out0, out1, out2, out3)
Definition: generic_macros_msa.h:238

ref
static int ref[MAX_W *MAX_W]
Definition: jpeg2000dwt.c:107

ff_pix_abs16_msa
int ff_pix_abs16_msa(MpegEncContext *v, uint8_t *src, uint8_t *ref, ptrdiff_t stride, int height)
Definition: me_cmp_msa.c:590

sad_hv_bilinear_filter_8width_msa
static uint32_t sad_hv_bilinear_filter_8width_msa(uint8_t *src, int32_t src_stride, uint8_t *ref, int32_t ref_stride, int32_t height)
Definition: me_cmp_msa.c:227

diff
static av_always_inline int diff(const uint32_t a, const uint32_t b)
Definition: vf_palettegen.c:133

LD_UB
#define LD_UB(...)
Definition: generic_macros_msa.h:31

me_cmp_mips.h

VSHF_B2_UB
#define VSHF_B2_UB(...)
Definition: generic_macros_msa.h:878

sad_vert_bilinear_filter_8width_msa
static uint32_t sad_vert_bilinear_filter_8width_msa(uint8_t *src, int32_t src_stride, uint8_t *ref, int32_t ref_stride, int32_t height)
Definition: me_cmp_msa.c:151

PCKEV_D2_UB
#define PCKEV_D2_UB(...)
Definition: generic_macros_msa.h:1946

ff_hadamard8_intra16_msa
int ff_hadamard8_intra16_msa(MpegEncContext *s, uint8_t *dst, uint8_t *src, ptrdiff_t stride, int h)

sad_horiz_bilinear_filter_16width_msa
static uint32_t sad_horiz_bilinear_filter_16width_msa(uint8_t *src, int32_t src_stride, uint8_t *ref, int32_t ref_stride, int32_t height)
Definition: me_cmp_msa.c:113

HADD_SW_S32
#define HADD_SW_S32(in)
Definition: generic_macros_msa.h:1198