32 for (ht_cnt = (
height >> 2); ht_cnt--;) {
34 src += (4 * src_stride);
35 LD_UB4(
ref, ref_stride, ref0, ref1, ref2, ref3);
36 ref += (4 * ref_stride);
54 for (ht_cnt = (
height >> 2); ht_cnt--;) {
56 src += (2 * src_stride);
58 ref += (2 * ref_stride);
62 src += (2 * src_stride);
64 ref += (2 * ref_stride);
79 v16u8 ref0, ref1, ref2, ref3, ref4, ref5;
82 for (ht_cnt = (
height >> 3); ht_cnt--;) {
84 src += (4 * src_stride);
85 LD_UB4(
ref, ref_stride, ref0, ref1, ref2, ref3);
86 ref += (4 * ref_stride);
90 SLDI_B4_UB(ref0, ref0, ref1, ref1, ref2, ref2, ref3, ref3, 1,
91 ref0, ref1, ref2, ref3);
97 src += (4 * src_stride);
98 LD_UB4(
ref, ref_stride, ref0, ref1, ref2, ref3);
99 ref += (4 * ref_stride);
103 SLDI_B4_UB(ref0, ref0, ref1, ref1, ref2, ref2, ref3, ref3, 1,
104 ref0, ref1, ref2, ref3);
121 v16u8 ref00, ref10, ref20, ref30, ref01, ref11, ref21, ref31;
124 for (ht_cnt = (
height >> 3); ht_cnt--;) {
126 src += (4 * src_stride);
127 LD_UB4(
ref, ref_stride, ref00, ref10, ref20, ref30);
128 LD_UB4(
ref + 1, ref_stride, ref01, ref11, ref21, ref31);
129 ref += (4 * ref_stride);
131 AVER_UB2_UB(ref01, ref00, ref11, ref10, comp0, comp1);
133 AVER_UB2_UB(ref21, ref20, ref31, ref30, comp0, comp1);
137 src += (4 * src_stride);
138 LD_UB4(
ref, ref_stride, ref00, ref10, ref20, ref30);
139 LD_UB4(
ref + 1, ref_stride, ref01, ref11, ref21, ref31);
140 ref += (4 * ref_stride);
142 AVER_UB2_UB(ref01, ref00, ref11, ref10, comp0, comp1);
144 AVER_UB2_UB(ref21, ref20, ref31, ref30, comp0, comp1);
159 v16u8 ref0, ref1, ref2, ref3, ref4;
162 for (ht_cnt = (
height >> 3); ht_cnt--;) {
164 src += (4 * src_stride);
165 LD_UB5(
ref, ref_stride, ref0, ref1, ref2, ref3, ref4);
166 ref += (4 * ref_stride);
175 src += (4 * src_stride);
176 LD_UB5(
ref, ref_stride, ref0, ref1, ref2, ref3, ref4);
177 ref += (4 * ref_stride);
197 v16u8 ref0, ref1, ref2, ref3, ref4;
200 for (ht_cnt = (
height >> 3); ht_cnt--;) {
201 LD_UB5(
ref, ref_stride, ref4, ref0, ref1, ref2, ref3);
202 ref += (5 * ref_stride);
204 src += (4 * src_stride);
213 LD_UB4(
ref, ref_stride, ref0, ref1, ref2, ref3);
214 ref += (3 * ref_stride);
216 src += (4 * src_stride);
235 v16u8 ref0, ref1, ref2, ref3, ref4;
236 v16i8
mask = { 0, 1, 1, 2, 2, 3, 3, 4, 4, 5, 5, 6, 6, 7, 7, 8 };
237 v8u16 comp0, comp1, comp2, comp3;
240 for (ht_cnt = (
height >> 2); ht_cnt--;) {
241 LD_UB5(
ref, ref_stride, ref4, ref0, ref1, ref2, ref3);
242 ref += (4 * ref_stride);
244 src += (4 * src_stride);
249 comp0 = __msa_hadd_u_h(temp0, temp0);
250 comp1 = __msa_hadd_u_h(temp1, temp1);
252 comp0 = (v8u16) __msa_srari_h((v8i16) comp0, 2);
253 comp0 = (v8u16) __msa_pckev_b((v16i8) comp0, (v16i8) comp0);
255 temp0 = (v16u8) __msa_vshf_b(
mask, (v16i8) ref1, (v16i8) ref1);
256 comp2 = __msa_hadd_u_h(temp0, temp0);
258 comp1 = (v8u16) __msa_srari_h((v8i16) comp1, 2);
259 comp1 = (v8u16) __msa_pckev_b((v16i8) comp1, (v16i8) comp1);
260 comp1 = (v8u16) __msa_pckev_d((v2i64) comp1, (v2i64) comp0);
261 diff = (v16u8) __msa_asub_u_b(
src0, (v16u8) comp1);
264 temp1 = (v16u8) __msa_vshf_b(
mask, (v16i8) ref2, (v16i8) ref2);
265 comp3 = __msa_hadd_u_h(temp1, temp1);
267 comp2 = (v8u16) __msa_srari_h((v8i16) comp2, 2);
268 comp2 = (v8u16) __msa_pckev_b((v16i8) comp2, (v16i8) comp2);
270 temp0 = (v16u8) __msa_vshf_b(
mask, (v16i8) ref3, (v16i8) ref3);
271 comp0 = __msa_hadd_u_h(temp0, temp0);
273 comp3 = (v8u16) __msa_srari_h((v8i16) comp3, 2);
274 comp3 = (v8u16) __msa_pckev_b((v16i8) comp3, (v16i8) comp3);
275 comp3 = (v8u16) __msa_pckev_d((v2i64) comp3, (v2i64) comp2);
276 diff = (v16u8) __msa_asub_u_b(
src1, (v16u8) comp3);
291 v16u8 temp0, temp1, temp2, temp3;
292 v16u8 ref00, ref01, ref02, ref03, ref04, ref10, ref11, ref12, ref13, ref14;
293 v8u16 comp0, comp1, comp2, comp3;
296 for (ht_cnt = (
height >> 3); ht_cnt--;) {
298 src += (4 * src_stride);
299 LD_UB5(
ref, ref_stride, ref04, ref00, ref01, ref02, ref03);
300 LD_UB5(
ref + 1, ref_stride, ref14, ref10, ref11, ref12, ref13);
301 ref += (5 * ref_stride);
304 comp0 = __msa_hadd_u_h(temp0, temp0);
305 comp1 = __msa_hadd_u_h(temp1, temp1);
307 comp2 = __msa_hadd_u_h(temp2, temp2);
308 comp3 = __msa_hadd_u_h(temp3, temp3);
312 comp = (v16u8) __msa_pckev_b((v16i8) comp1, (v16i8) comp0);
317 comp0 = __msa_hadd_u_h(temp0, temp0);
318 comp1 = __msa_hadd_u_h(temp1, temp1);
322 comp = (v16u8) __msa_pckev_b((v16i8) comp3, (v16i8) comp2);
327 comp2 = __msa_hadd_u_h(temp2, temp2);
328 comp3 = __msa_hadd_u_h(temp3, temp3);
332 comp = (v16u8) __msa_pckev_b((v16i8) comp1, (v16i8) comp0);
337 comp0 = __msa_hadd_u_h(temp0, temp0);
338 comp1 = __msa_hadd_u_h(temp1, temp1);
342 comp = (v16u8) __msa_pckev_b((v16i8) comp3, (v16i8) comp2);
347 src += (4 * src_stride);
348 LD_UB4(
ref, ref_stride, ref00, ref01, ref02, ref03);
349 LD_UB4(
ref + 1, ref_stride, ref10, ref11, ref12, ref13);
350 ref += (3 * ref_stride);
353 comp2 = __msa_hadd_u_h(temp2, temp2);
354 comp3 = __msa_hadd_u_h(temp3, temp3);
358 comp = (v16u8) __msa_pckev_b((v16i8) comp1, (v16i8) comp0);
363 comp0 = __msa_hadd_u_h(temp0, temp0);
364 comp1 = __msa_hadd_u_h(temp1, temp1);
368 comp = (v16u8) __msa_pckev_b((v16i8) comp3, (v16i8) comp2);
373 comp2 = __msa_hadd_u_h(temp2, temp2);
374 comp3 = __msa_hadd_u_h(temp3, temp3);
378 comp = (v16u8) __msa_pckev_b((v16i8) comp1, (v16i8) comp0);
383 comp0 = __msa_hadd_u_h(temp0, temp0);
384 comp1 = __msa_hadd_u_h(temp1, temp1);
388 comp = (v16u8) __msa_pckev_b((v16i8) comp3, (v16i8) comp2);
396 #define CALC_MSE_B(src, ref, var) \
398 v16u8 src_l0_m, src_l1_m; \
399 v8i16 res_l0_m, res_l1_m; \
401 ILVRL_B2_UB(src, ref, src_l0_m, src_l1_m); \
402 HSUB_UB2_SH(src_l0_m, src_l1_m, res_l0_m, res_l1_m); \
403 DPADD_SH2_SW(res_l0_m, res_l1_m, res_l0_m, res_l1_m, var, var); \
407 uint8_t *ref_ptr,
int32_t ref_stride,
413 uint32_t ref0, ref1, ref2, ref3;
418 for (ht_cnt = (
height >> 2); ht_cnt--;) {
420 src_ptr += (4 * src_stride);
421 LW4(ref_ptr, ref_stride, ref0, ref1, ref2, ref3);
422 ref_ptr += (4 * ref_stride);
435 uint8_t *ref_ptr,
int32_t ref_stride,
441 v16u8 ref0, ref1, ref2, ref3;
444 for (ht_cnt = (
height >> 2); ht_cnt--;) {
446 src_ptr += (4 * src_stride);
447 LD_UB4(ref_ptr, ref_stride, ref0, ref1, ref2, ref3);
448 ref_ptr += (4 * ref_stride);
462 uint8_t *ref_ptr,
int32_t ref_stride,
470 for (ht_cnt = (
height >> 2); ht_cnt--;) {
472 src_ptr += src_stride;
474 ref_ptr += ref_stride;
478 src_ptr += src_stride;
480 ref_ptr += ref_stride;
484 src_ptr += src_stride;
486 ref_ptr += ref_stride;
490 src_ptr += src_stride;
492 ref_ptr += ref_stride;
505 v16u8 ref0, ref1, ref2, ref3, ref4, ref5, ref6, ref7;
506 v8u16 diff0, diff1, diff2, diff3, diff4, diff5, diff6, diff7;
507 v8u16 temp0, temp1, temp2, temp3, temp4, temp5, temp6, temp7;
512 LD_UB8(
ref, ref_stride, ref0, ref1, ref2, ref3, ref4, ref5, ref6, ref7);
514 src4, ref4, src5, ref5, src6, ref6, src7, ref7,
515 diff0, diff1, diff2, diff3, diff4, diff5, diff6, diff7);
516 HSUB_UB4_UH(diff0, diff1, diff2, diff3, diff0, diff1, diff2, diff3);
517 HSUB_UB4_UH(diff4, diff5, diff6, diff7, diff4, diff5, diff6, diff7);
519 diff0, diff1, diff2, diff3, diff4, diff5, diff6, diff7);
520 BUTTERFLY_8(diff0, diff2, diff4, diff6, diff7, diff5, diff3, diff1,
521 temp0, temp2, temp4, temp6, temp7, temp5, temp3, temp1);
522 BUTTERFLY_8(temp0, temp1, temp4, temp5, temp7, temp6, temp3, temp2,
523 diff0, diff1, diff4, diff5, diff7, diff6, diff3, diff2);
524 BUTTERFLY_8(diff0, diff1, diff2, diff3, diff7, diff6, diff5, diff4,
525 temp0, temp1, temp2, temp3, temp7, temp6, temp5, temp4);
527 temp0, temp1, temp2, temp3, temp4, temp5, temp6, temp7);
528 BUTTERFLY_8(temp0, temp2, temp4, temp6, temp7, temp5, temp3, temp1,
529 diff0, diff2, diff4, diff6, diff7, diff5, diff3, diff1);
530 BUTTERFLY_8(diff0, diff1, diff4, diff5, diff7, diff6, diff3, diff2,
531 temp0, temp1, temp4, temp5, temp7, temp6, temp3, temp2);
532 ADD4(temp0, temp4, temp1, temp5, temp2, temp6, temp3, temp7,
533 diff0, diff1, diff2, diff3);
534 sum = __msa_asub_s_h((v8i16) temp3, (v8i16) temp7);
535 sum += __msa_asub_s_h((v8i16) temp2, (v8i16) temp6);
536 sum += __msa_asub_s_h((v8i16) temp1, (v8i16) temp5);
537 sum += __msa_asub_s_h((v8i16) temp0, (v8i16) temp4);
538 sum += __msa_add_a_h((v8i16) diff0,
zero);
539 sum += __msa_add_a_h((v8i16) diff1,
zero);
540 sum += __msa_add_a_h((v8i16) diff2,
zero);
541 sum += __msa_add_a_h((v8i16) diff3,
zero);
551 v8u16 diff0, diff1, diff2, diff3, diff4, diff5, diff6, diff7;
552 v8u16 temp0, temp1, temp2, temp3, temp4, temp5, temp6, temp7;
561 diff0, diff1, diff2, diff3, diff4, diff5, diff6, diff7);
562 BUTTERFLY_8(diff0, diff2, diff4, diff6, diff7, diff5, diff3, diff1,
563 temp0, temp2, temp4, temp6, temp7, temp5, temp3, temp1);
564 BUTTERFLY_8(temp0, temp1, temp4, temp5, temp7, temp6, temp3, temp2,
565 diff0, diff1, diff4, diff5, diff7, diff6, diff3, diff2);
566 BUTTERFLY_8(diff0, diff1, diff2, diff3, diff7, diff6, diff5, diff4,
567 temp0, temp1, temp2, temp3, temp7, temp6, temp5, temp4);
569 temp0, temp1, temp2, temp3, temp4, temp5, temp6, temp7);
570 BUTTERFLY_8(temp0, temp2, temp4, temp6, temp7, temp5, temp3, temp1,
571 diff0, diff2, diff4, diff6, diff7, diff5, diff3, diff1);
572 BUTTERFLY_8(diff0, diff1, diff4, diff5, diff7, diff6, diff3, diff2,
573 temp0, temp1, temp4, temp5, temp7, temp6, temp3, temp2);
574 ADD4(temp0, temp4, temp1, temp5, temp2, temp6, temp3, temp7,
575 diff0, diff1, diff2, diff3);
576 sum = __msa_asub_s_h((v8i16) temp3, (v8i16) temp7);
577 sum += __msa_asub_s_h((v8i16) temp2, (v8i16) temp6);
578 sum += __msa_asub_s_h((v8i16) temp1, (v8i16) temp5);
579 sum += __msa_asub_s_h((v8i16) temp0, (v8i16) temp4);
580 sum += __msa_add_a_h((v8i16) diff0, (v8i16)
zero);
581 sum += __msa_add_a_h((v8i16) diff1, (v8i16)
zero);
582 sum += __msa_add_a_h((v8i16) diff2, (v8i16)
zero);
583 sum += __msa_add_a_h((v8i16) diff3, (v8i16)
zero);
585 sum_res -=
abs(temp0[0] + temp4[0]);
669 #define WRAPPER8_16_SQ(name8, name16) \
670 int name16(MpegEncContext *s, uint8_t *dst, uint8_t *src, \
671 ptrdiff_t stride, int h) \
674 score += name8(s, dst, src, stride, 8); \
675 score += name8(s, dst + 8, src + 8, stride, 8); \
679 score +=name8(s, dst, src, stride, 8); \
680 score +=name8(s, dst + 8, src + 8, stride, 8); \