    0, 1, 1, 2, 2, 3, 3, 4, 4, 5, 5, 6, 6, 7, 7, 8,
    0, 1, 1, 2, 2, 3, 3, 4, 16, 17, 17, 18, 18, 19, 19, 20,
    8, 9, 9, 10, 10, 11, 11, 12, 24, 25, 25, 26, 26, 27, 27, 28
    __m128i mask0, mask1, mask2, mask3, out1, out2;
    __m128i vec0, vec1, vec2, vec3, vec4, vec5, vec6, vec7;
    __m128i filt0, filt1, filt2, filt3;
    __m128i res0, res1, res2, res3;
              filt0, filt1, filt2, filt3);
    DUP2_ARG2(__lsx_vaddi_bu, mask0, 2, mask0, 4, mask1, mask2);
    mask3 = __lsx_vaddi_bu(mask0, 6);
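    /* Horizontal 8-tap filter, one row per iteration: source bytes are
     * gathered with mask0..mask3, accumulated as byte dot products into
     * 16-bit lanes, then rounded and saturated (>> 6) before the 8-bit
     * results are stored. */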
    for (loop_cnt = height; loop_cnt--;) {
                  src4, src5, src6, src7);
        DUP4_ARG2(__lsx_vdp2_h_bu_b, vec0, filt0, vec1, filt0, vec2, filt0,
                  vec3, filt0, res0, res1, res2, res3);
        DUP4_ARG3(__lsx_vdp2add_h_bu_b, res0, vec0, filt2, res1, vec1, filt2,
                  res2, vec2, filt2, res3, vec3, filt2, res0, res1, res2, res3);
        DUP4_ARG3(__lsx_vdp2add_h_bu_b, res0, vec4, filt1, res1, vec5, filt1,
                  res2, vec6, filt1, res3, vec7, filt1, res0, res1, res2, res3);
        DUP4_ARG3(__lsx_vdp2add_h_bu_b, res0, vec4, filt3, res1, vec5, filt3,
                  res2, vec6, filt3, res3, vec7, filt3, res0, res1, res2, res3);
        DUP2_ARG3(__lsx_vssrarni_bu_h, res1, res0, 6, res3, res2, 6,
        __lsx_vst(out1, dst, 0);
        __lsx_vst(out2, dst, 16);
        DUP2_ARG3(__lsx_vshuf_b, src4, src4, mask0, src5, src5, mask0,
        DUP2_ARG3(__lsx_vshuf_b, src6, src6, mask0, src7, src7, mask0,
        DUP4_ARG2(__lsx_vdp2_h_bu_b, vec0, filt0, vec1, filt0, vec2, filt0,
                  vec3, filt0, res0, res1, res2, res3);
        DUP2_ARG3(__lsx_vshuf_b, src4, src4, mask2, src5, src5, mask2,
        DUP2_ARG3(__lsx_vshuf_b, src6, src6, mask2, src7, src7, mask2,
        DUP4_ARG3(__lsx_vdp2add_h_bu_b, res0, vec0, filt2, res1, vec1, filt2,
                  res2, vec2, filt2, res3, vec3, filt2, res0, res1, res2, res3);
        DUP2_ARG3(__lsx_vshuf_b, src4, src4, mask1, src5, src5, mask1,
        DUP2_ARG3(__lsx_vshuf_b, src6, src6, mask1, src7, src7, mask1,
        DUP4_ARG3(__lsx_vdp2add_h_bu_b, res0, vec4, filt1, res1, vec5, filt1,
                  res2, vec6, filt1, res3, vec7, filt1, res0, res1, res2, res3);
        DUP2_ARG3(__lsx_vshuf_b, src4, src4, mask3, src5, src5, mask3,
        DUP2_ARG3(__lsx_vshuf_b, src6, src6, mask3, src7, src7, mask3,
        DUP4_ARG3(__lsx_vdp2add_h_bu_b, res0, vec4, filt3, res1, vec5, filt3,
                  res2, vec6, filt3, res3, vec7, filt3, res0, res1, res2, res3);
        DUP2_ARG3(__lsx_vssrarni_bu_h, res1, res0, 6, res3, res2, 6,
        __lsx_vst(out1, dst, 32);
        __lsx_vst(out2, dst, 48);
    int32_t src_stride_2x = (src_stride << 1);
    int32_t dst_stride_2x = (dst_stride << 1);
    int32_t src_stride_4x = (src_stride << 2);
    int32_t dst_stride_4x = (dst_stride << 2);
    int32_t src_stride_3x = src_stride_2x + src_stride;
    int32_t dst_stride_3x = dst_stride_2x + dst_stride;
    __m128i src0, src1, src2, src3, src4, src5, src6, src7, src8, src9, src10;
    __m128i src10_r, src32_r, src54_r, src76_r, src98_r, src21_r, src43_r;
    __m128i src65_r, src87_r, src109_r, filt0, filt1, filt2, filt3;
    __m128i out0_r, out1_r, out2_r, out3_r;
    src -= src_stride_3x;
              filt0, filt1, filt2, filt3);
    src3 = __lsx_vldx(src, src_stride_3x);
    src += src_stride_4x;
    src4 = __lsx_vld(src, 0);
    DUP2_ARG2(__lsx_vldx, src, src_stride, src, src_stride_2x, src5, src6);
    src += src_stride_3x;
              src10_r, src32_r, src54_r, src21_r);
    DUP2_ARG2(__lsx_vilvl_b, src4, src3, src6, src5, src43_r, src65_r);
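    /* Vertical 8-tap filter: four new rows are loaded per iteration,
     * interleaved with the running row history and folded in with one
     * dot-product accumulate per tap pair, then the rounded, saturated
     * 8-bit rows are stored. */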
    for (loop_cnt = (height >> 2); loop_cnt--;) {
        src7 = __lsx_vld(src, 0);
        DUP2_ARG2(__lsx_vldx, src, src_stride, src, src_stride_2x, src8, src9);
        src10 = __lsx_vldx(src, src_stride_3x);
        src += src_stride_4x;
        DUP4_ARG2(__lsx_vilvl_b, src7, src6, src8, src7, src9, src8, src10,
                  src9, src76_r, src87_r, src98_r, src109_r);
        DUP4_ARG2(__lsx_vdp2_h_bu_b, src10_r, filt0, src21_r, filt0, src32_r,
                  filt0, src43_r, filt0, out0_r, out1_r, out2_r, out3_r);
        DUP4_ARG3(__lsx_vdp2add_h_bu_b, out0_r, src32_r, filt1, out1_r,
                  src43_r, filt1, out2_r, src54_r, filt1, out3_r, src65_r,
                  filt1, out0_r, out1_r, out2_r, out3_r);
        DUP4_ARG3(__lsx_vdp2add_h_bu_b, out0_r, src54_r, filt2, out1_r,
                  src65_r, filt2, out2_r, src76_r, filt2, out3_r, src87_r,
                  filt2, out0_r, out1_r, out2_r, out3_r);
        DUP4_ARG3(__lsx_vdp2add_h_bu_b, out0_r, src76_r, filt3, out1_r,
                  src87_r, filt3, out2_r, src98_r, filt3, out3_r, src109_r,
                  filt3, out0_r, out1_r, out2_r, out3_r);
        DUP2_ARG3(__lsx_vssrarni_bu_h, out1_r, out0_r, 6, out3_r, out2_r, 6,
        __lsx_vstelm_d(tmp0, dst, 0, 0);
        __lsx_vstelm_d(tmp0, dst + dst_stride, 0, 1);
        __lsx_vstelm_d(tmp1, dst + dst_stride_2x, 0, 0);
        __lsx_vstelm_d(tmp1, dst + dst_stride_3x, 0, 1);
        dst += dst_stride_4x;
    const uint8_t *src_tmp;
    uint32_t loop_cnt, cnt;
    const int32_t src_stride_2x = (src_stride << 1);
    const int32_t dst_stride_2x = (dst_stride << 1);
    const int32_t src_stride_4x = (src_stride << 2);
    const int32_t dst_stride_4x = (dst_stride << 2);
    const int32_t src_stride_3x = src_stride_2x + src_stride;
    const int32_t dst_stride_3x = dst_stride_2x + dst_stride;
    __m128i src0, src1, src2, src3, src4, src5, src6, src7, src8, src9, src10;
    __m128i filt0, filt1, filt2, filt3;
    __m128i src10_r, src32_r, src54_r, src76_r, src98_r, src21_r, src43_r;
    __m128i src65_r, src87_r, src109_r, src10_l, src32_l, src54_l, src76_l;
    __m128i src98_l, src21_l, src43_l, src65_l, src87_l, src109_l;
    __m128i tmp0, tmp1, tmp2, tmp3;
    __m128i out0_r, out1_r, out2_r, out3_r, out0_l, out1_l, out2_l, out3_l;
    src -= src_stride_3x;
    DUP4_ARG2(__lsx_vldrepl_h, filter, 0, filter, 2, filter, 4, filter, 6, filt0,
              filt1, filt2, filt3);
    for (cnt = (width >> 4); cnt--;) {
        src0 = __lsx_vld(src_tmp, 0);
        DUP2_ARG2(__lsx_vldx, src_tmp, src_stride, src_tmp, src_stride_2x,
        src3 = __lsx_vldx(src_tmp, src_stride_3x);
        src_tmp += src_stride_4x;
        src4 = __lsx_vld(src_tmp, 0);
        DUP2_ARG2(__lsx_vldx, src_tmp, src_stride, src_tmp, src_stride_2x,
        src_tmp += src_stride_3x;
                  src10_r, src32_r, src54_r, src21_r);
        DUP2_ARG2(__lsx_vilvl_b, src4, src3, src6, src5, src43_r, src65_r);
                  src10_l, src32_l, src54_l, src21_l);
        DUP2_ARG2(__lsx_vilvh_b, src4, src3, src6, src5, src43_l, src65_l);
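        /* 16-wide vertical filtering: the low (_r) and high (_l) byte
         * interleaves are filtered separately and recombined by the
         * saturating narrow at the end of each iteration. */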
        for (loop_cnt = (height >> 2); loop_cnt--;) {
            src7 = __lsx_vld(src_tmp, 0);
            DUP2_ARG2(__lsx_vldx, src_tmp, src_stride, src_tmp, src_stride_2x,
            src10 = __lsx_vldx(src_tmp, src_stride_3x);
            src_tmp += src_stride_4x;
            DUP4_ARG2(__lsx_vilvl_b, src7, src6, src8, src7, src9, src8, src10,
                      src9, src76_r, src87_r, src98_r, src109_r);
            DUP4_ARG2(__lsx_vilvh_b, src7, src6, src8, src7, src9, src8, src10,
                      src9, src76_l, src87_l, src98_l, src109_l);
            DUP4_ARG2(__lsx_vdp2_h_bu_b, src10_r, filt0, src21_r, filt0, src32_r,
                      filt0, src43_r, filt0, out0_r, out1_r, out2_r, out3_r);
            DUP4_ARG3(__lsx_vdp2add_h_bu_b, out0_r, src32_r, filt1, out1_r,
                      src43_r, filt1, out2_r, src54_r, filt1, out3_r, src65_r,
                      filt1, out0_r, out1_r, out2_r, out3_r);
            DUP4_ARG3(__lsx_vdp2add_h_bu_b, out0_r, src54_r, filt2, out1_r,
                      src65_r, filt2, out2_r, src76_r, filt2, out3_r, src87_r,
                      filt2, out0_r, out1_r, out2_r, out3_r);
            DUP4_ARG3(__lsx_vdp2add_h_bu_b, out0_r, src76_r, filt3, out1_r,
                      src87_r, filt3, out2_r, src98_r, filt3, out3_r, src109_r,
                      filt3, out0_r, out1_r, out2_r, out3_r);
            DUP4_ARG2(__lsx_vdp2_h_bu_b, src10_l, filt0, src21_l, filt0, src32_l,
                      filt0, src43_l, filt0, out0_l, out1_l, out2_l, out3_l);
            DUP4_ARG3(__lsx_vdp2add_h_bu_b, out0_l, src32_l, filt1, out1_l,
                      src43_l, filt1, out2_l, src54_l, filt1, out3_l, src65_l,
                      filt1, out0_l, out1_l, out2_l, out3_l);
            DUP4_ARG3(__lsx_vdp2add_h_bu_b, out0_l, src54_l, filt2, out1_l,
                      src65_l, filt2, out2_l, src76_l, filt2, out3_l, src87_l,
                      filt2, out0_l, out1_l, out2_l, out3_l);
            DUP4_ARG3(__lsx_vdp2add_h_bu_b, out0_l, src76_l, filt3, out1_l,
                      src87_l, filt3, out2_l, src98_l, filt3, out3_l, src109_l,
                      filt3, out0_l, out1_l, out2_l, out3_l);
            DUP4_ARG3(__lsx_vssrarni_bu_h, out0_l, out0_r, 6, out1_l, out1_r,
                      6, out2_l, out2_r, 6, out3_l, out3_r, 6,
                      tmp0, tmp1, tmp2, tmp3);
            __lsx_vst(tmp0, dst_tmp, 0);
            __lsx_vstx(tmp1, dst_tmp, dst_stride);
            __lsx_vstx(tmp2, dst_tmp, dst_stride_2x);
            __lsx_vstx(tmp3, dst_tmp, dst_stride_3x);
            dst_tmp += dst_stride_4x;
        int32_t dst_stride, const int8_t *filter_x,
    uint32_t loop_cnt, cnt;
    const uint8_t *src_tmp;
    const int32_t src_stride_2x = (src_stride << 1);
    const int32_t dst_stride_2x = (dst_stride << 1);
    const int32_t src_stride_4x = (src_stride << 2);
    const int32_t src_stride_3x = src_stride_2x + src_stride;
    __m128i src0, src1, src2, src3, src4, src5, src6, src7, src8;
    __m128i filt0, filt1, filt2, filt3;
    __m128i filt_h0, filt_h1, filt_h2, filt_h3;
    __m128i mask1, mask2, mask3;
    __m128i vec0, vec1, vec2, vec3, vec4, vec5, vec6, vec7;
    __m128i vec8, vec9, vec10, vec11, vec12, vec13, vec14, vec15;
    __m128i dst0, dst1, dst2, dst3, dst4, dst5, dst6, dst7, dst8;
    __m128i dst0_r, dst0_l, dst1_r, dst1_l;
    __m128i dst10_r, dst32_r, dst54_r, dst76_r;
    __m128i dst10_l, dst32_l, dst54_l, dst76_l;
    __m128i dst21_r, dst43_r, dst65_r, dst87_r;
    __m128i dst21_l, dst43_l, dst65_l, dst87_l;
    src -= (src_stride_3x + 3);
    DUP4_ARG2(__lsx_vldrepl_h, filter_x, 0, filter_x, 2, filter_x, 4,
              filter_x, 6, filt0, filt1, filt2, filt3);
    filter_vec = __lsx_vld(filter_y, 0);
    filter_vec = __lsx_vsllwil_h_b(filter_vec, 0);
    DUP4_ARG2(__lsx_vreplvei_w, filter_vec, 0, filter_vec, 1, filter_vec, 2,
              filter_vec, 3, filt_h0, filt_h1, filt_h2, filt_h3);
    DUP2_ARG2(__lsx_vaddi_bu, mask0, 2, mask0, 4, mask1, mask2);
    mask3 = __lsx_vaddi_bu(mask0, 6);
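    /* Horizontal/vertical 8-tap filter, processed in 8-pixel columns:
     * within each column the first seven rows are filtered horizontally
     * into 16-bit values before the row-by-row vertical pass below. */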
    for (cnt = width >> 3; cnt--;) {
        src0 = __lsx_vld(src_tmp, 0);
        DUP2_ARG2(__lsx_vldx, src_tmp, src_stride, src_tmp, src_stride_2x,
        src3 = __lsx_vldx(src_tmp, src_stride_3x);
        src_tmp += src_stride_4x;
        src4 = __lsx_vld(src_tmp, 0);
        DUP2_ARG2(__lsx_vldx, src_tmp, src_stride, src_tmp, src_stride_2x,
        src_tmp += src_stride_3x;
                  src0, mask2, src0, src0, mask3, vec0, vec1, vec2, vec3);
                  src1, mask2, src1, src1, mask3, vec4, vec5, vec6, vec7);
                  src2, mask2, src2, src2, mask3, vec8, vec9, vec10, vec11);
        DUP4_ARG3(__lsx_vshuf_b, src3, src3, mask0, src3, src3, mask1, src3,
                  src3, mask2, src3, src3, mask3, vec12, vec13, vec14, vec15);
        DUP4_ARG2(__lsx_vdp2_h_bu_b, vec0, filt0, vec4, filt0, vec8, filt0,
                  vec12, filt0, dst0, dst1, dst2, dst3);
        DUP4_ARG3(__lsx_vdp2add_h_bu_b, dst0, vec1, filt1, dst1, vec5, filt1,
                  dst2, vec9, filt1, dst3, vec13, filt1, dst0, dst1, dst2, dst3);
        DUP4_ARG3(__lsx_vdp2add_h_bu_b, dst0, vec2, filt2, dst1, vec6, filt2,
                  dst2, vec10, filt2, dst3, vec14, filt2, dst0, dst1, dst2, dst3);
        DUP4_ARG3(__lsx_vdp2add_h_bu_b, dst0, vec3, filt3, dst1, vec7, filt3,
                  dst2, vec11, filt3, dst3, vec15, filt3, dst0, dst1, dst2, dst3);
        DUP4_ARG3(__lsx_vshuf_b, src4, src4, mask0, src4, src4, mask1, src4,
                  src4, mask2, src4, src4, mask3, vec0, vec1, vec2, vec3);
        DUP4_ARG3(__lsx_vshuf_b, src5, src5, mask0, src5, src5, mask1, src5,
                  src5, mask2, src5, src5, mask3, vec4, vec5, vec6, vec7);
        DUP4_ARG3(__lsx_vshuf_b, src6, src6, mask0, src6, src6, mask1, src6,
                  src6, mask2, src6, src6, mask3, vec8, vec9, vec10, vec11);
        DUP2_ARG2(__lsx_vdp2_h_bu_b, vec0, filt0, vec4, filt0, dst4, dst5);
        dst6 = __lsx_vdp2_h_bu_b(vec8, filt0);
        DUP4_ARG3(__lsx_vdp2add_h_bu_b, dst4, vec1, filt1, dst5, vec5, filt1,
                  dst6, vec9, filt1, dst4, vec2, filt2, dst4, dst5, dst6, dst4);
        DUP4_ARG3(__lsx_vdp2add_h_bu_b, dst5, vec6, filt2, dst6, vec10, filt2,
                  dst4, vec3, filt3, dst5, vec7, filt3, dst5, dst6, dst4, dst5);
        dst6 = __lsx_vdp2add_h_bu_b(dst6, vec11, filt3);
        DUP4_ARG2(__lsx_vilvl_h, dst1, dst0, dst3, dst2, dst5, dst4, dst2,
                  dst1, dst10_r, dst32_r, dst54_r, dst21_r);
        DUP4_ARG2(__lsx_vilvh_h, dst1, dst0, dst3, dst2, dst5, dst4, dst2,
                  dst1, dst10_l, dst32_l, dst54_l, dst21_l);
        DUP2_ARG2(__lsx_vilvl_h, dst4, dst3, dst6, dst5, dst43_r, dst65_r);
        DUP2_ARG2(__lsx_vilvh_h, dst4, dst3, dst6, dst5, dst43_l, dst65_l);
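        /* Two output rows per iteration: each new row is filtered
         * horizontally, interleaved with the history, accumulated
         * vertically in 32 bits with filt_h0..filt_h3, then shifted,
         * clipped to [0, 255] and packed before the 8-byte stores. */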
        for (loop_cnt = height >> 1; loop_cnt--;) {
            src7 = __lsx_vld(src_tmp, 0);
            src8 = __lsx_vldx(src_tmp, src_stride);
            src_tmp += src_stride_2x;
            DUP4_ARG3(__lsx_vshuf_b, src7, src7, mask0, src7, src7, mask1, src7,
                      src7, mask2, src7, src7, mask3, vec0, vec1, vec2, vec3);
            dst7 = __lsx_vdp2_h_bu_b(vec0, filt0);
            DUP2_ARG3(__lsx_vdp2add_h_bu_b, dst7, vec1, filt1, dst7, vec2,
            dst7 = __lsx_vdp2add_h_bu_b(dst7, vec3, filt3);
            dst76_r = __lsx_vilvl_h(dst7, dst6);
            dst76_l = __lsx_vilvh_h(dst7, dst6);
            DUP2_ARG2(__lsx_vdp2_w_h, dst10_r, filt_h0, dst10_l, filt_h0,
            DUP4_ARG3(__lsx_vdp2add_w_h, dst0_r, dst32_r, filt_h1, dst0_l,
                      dst32_l, filt_h1, dst0_r, dst54_r, filt_h2, dst0_l,
                      dst54_l, filt_h2, dst0_r, dst0_l, dst0_r, dst0_l);
            DUP2_ARG3(__lsx_vdp2add_w_h, dst0_r, dst76_r, filt_h3, dst0_l,
                      dst76_l, filt_h3, dst0_r, dst0_l);
            DUP2_ARG2(__lsx_vsrai_w, dst0_r, 6, dst0_l, 6, dst0_r, dst0_l);
            DUP4_ARG3(__lsx_vshuf_b, src8, src8, mask0, src8, src8, mask1, src8,
                      src8, mask2, src8, src8, mask3, vec0, vec1, vec2, vec3);
            dst8 = __lsx_vdp2_h_bu_b(vec0, filt0);
            DUP2_ARG3(__lsx_vdp2add_h_bu_b, dst8, vec1, filt1, dst8, vec2,
            dst8 = __lsx_vdp2add_h_bu_b(dst8, vec3, filt3);
            dst87_r = __lsx_vilvl_h(dst8, dst7);
            dst87_l = __lsx_vilvh_h(dst8, dst7);
            DUP2_ARG2(__lsx_vdp2_w_h, dst21_r, filt_h0, dst21_l, filt_h0,
            DUP4_ARG3(__lsx_vdp2add_w_h, dst1_r, dst43_r, filt_h1, dst1_l,
                      dst43_l, filt_h1, dst1_r, dst65_r, filt_h2, dst1_l,
                      dst65_l, filt_h2, dst1_r, dst1_l, dst1_r, dst1_l);
            DUP2_ARG3(__lsx_vdp2add_w_h, dst1_r, dst87_r, filt_h3, dst1_l,
                      dst87_l, filt_h3, dst1_r, dst1_l);
            DUP2_ARG2(__lsx_vsrai_w, dst1_r, 6, dst1_l, 6, dst1_r, dst1_l);
            DUP4_ARG2(__lsx_vsrari_w, dst0_r, 6, dst0_l, 6, dst1_r, 6, dst1_l,
                      6, dst0_r, dst0_l, dst1_r, dst1_l);
            DUP4_ARG1(__lsx_vclip255_w, dst0_l, dst0_r, dst1_l, dst1_r,
                      dst0_l, dst0_r, dst1_l, dst1_r);
            DUP2_ARG2(__lsx_vpickev_h, dst0_l, dst0_r, dst1_l, dst1_r,
            out = __lsx_vpickev_b(dst1, dst0);
            __lsx_vstelm_d(out, dst_tmp, 0, 0);
            __lsx_vstelm_d(out, dst_tmp + dst_stride, 0, 1);
            dst_tmp += dst_stride_2x;
        int32_t dst_stride, const int8_t *filter_x,
        filter_x, filter_y, height, 8);
        int32_t dst_stride, const int8_t *filter_x,
        filter_x, filter_y, height, 16);
        int32_t dst_stride, const int8_t *filter_x,
        filter_x, filter_y, height, 24);
        int32_t dst_stride, const int8_t *filter_x,
        filter_x, filter_y, height, 32);
        int32_t dst_stride, const int8_t *filter_x,
        filter_x, filter_y, height, 48);
        int32_t dst_stride, const int8_t *filter_x,
        filter_x, filter_y, height, 64);
    int32_t src_stride_2x = (src_stride << 1);
    int32_t src_stride_3x = src_stride_2x + src_stride;
    __m128i src0, src1, src2, src3, src4, src6, src7, src8, src9, src10;
    __m128i filt0, filt1;
    __m128i src10_r, src32_r, src76_r, src98_r, src21_r, src43_r, src87_r;
    __m128i src109_r, src10_l, src32_l, src21_l, src43_l;
    __m128i out0_r, out1_r, out2_r, out3_r, out0_l, out1_l;
    __m128i out1, out2, out3, out4;
    src6 = __lsx_vld(_src, 0);
    src += src_stride_3x;
    _src += src_stride_3x;
    DUP2_ARG2(__lsx_vilvl_b, src7, src6, src8, src7, src76_r, src87_r);
    for (loop_cnt = height >> 1; loop_cnt--;) {
        DUP2_ARG2(__lsx_vilvl_b, src3, src2, src4, src3, src32_r, src43_r);
        DUP2_ARG2(__lsx_vilvh_b, src3, src2, src4, src3, src32_l, src43_l);
        src += src_stride_2x;
        _src += src_stride_2x;
        DUP2_ARG2(__lsx_vilvl_b, src9, src8, src10, src9, src98_r, src109_r);
        DUP4_ARG2(__lsx_vdp2_h_bu_b, src10_r, filt0, src10_l, filt0, src21_r,
                  filt0, src21_l, filt0, out0_r, out0_l, out1_r, out1_l);
        DUP4_ARG3(__lsx_vdp2add_h_bu_b, out0_r, src32_r, filt1, out0_l, src32_l,
                  filt1, out1_r, src43_r, filt1, out1_l, src43_l, filt1,
                  out0_r, out0_l, out1_r, out1_l);
        DUP2_ARG2(__lsx_vdp2_h_bu_b, src76_r, filt0, src87_r, filt0,
        DUP2_ARG3(__lsx_vdp2add_h_bu_b, out2_r, src98_r, filt1, out3_r,
                  src109_r, filt1, out2_r, out3_r);
        DUP4_ARG3(__lsx_vssrarni_bu_h, out0_l, out0_r, 6, out2_r, out2_r, 6,
                  out1_l, out1_r, 6, out3_r, out3_r, 6, out1, out2, out3, out4);
        __lsx_vst(out1, dst, 0);
        __lsx_vstelm_d(out2, dst, 16, 0);
        __lsx_vst(out3, dst, 0);
        __lsx_vstelm_d(out4, dst, 16, 0);
    int32_t src_stride_2x = (src_stride << 1);
    int32_t dst_stride_2x = (dst_stride << 1);
    int32_t src_stride_3x = src_stride_2x + src_stride;
    __m128i src0, src1, src2, src3, src4, src6, src7, src8, src9, src10;
    __m128i src10_r, src32_r, src76_r, src98_r;
    __m128i src21_r, src43_r, src87_r, src109_r;
    __m128i out0_r, out1_r, out2_r, out3_r, out0_l, out1_l, out2_l, out3_l;
    __m128i src10_l, src32_l, src76_l, src98_l;
    __m128i src21_l, src43_l, src87_l, src109_l;
    __m128i filt0, filt1;
    src6 = __lsx_vld(_src, 0);
    src += src_stride_3x;
    _src += src_stride_3x;
    DUP2_ARG2(__lsx_vilvl_b, src7, src6, src8, src7, src76_r, src87_r);
    DUP2_ARG2(__lsx_vilvh_b, src7, src6, src8, src7, src76_l, src87_l);
    for (loop_cnt = (height >> 1); loop_cnt--;) {
        DUP2_ARG2(__lsx_vilvl_b, src3, src2, src4, src3, src32_r, src43_r);
        DUP2_ARG2(__lsx_vilvh_b, src3, src2, src4, src3, src32_l, src43_l);
        DUP4_ARG2(__lsx_vdp2_h_bu_b, src10_r, filt0, src10_l, filt0, src21_r,
                  filt0, src21_l, filt0, out0_r, out0_l, out1_r, out1_l);
        DUP4_ARG3(__lsx_vdp2add_h_bu_b, out0_r, src32_r, filt1, out0_l, src32_l,
                  filt1, out1_r, src43_r, filt1, out1_l, src43_l, filt1,
                  out0_r, out0_l, out1_r, out1_l);
        DUP2_ARG3(__lsx_vssrarni_bu_h, out0_l, out0_r, 6, out1_l, out1_r, 6,
        __lsx_vst(out1, dst, 0);
        __lsx_vstx(out2, dst, dst_stride);
        src += src_stride_2x;
        _src += src_stride_2x;
        DUP2_ARG2(__lsx_vilvl_b, src9, src8, src10, src9, src98_r, src109_r);
        DUP2_ARG2(__lsx_vilvh_b, src9, src8, src10, src9, src98_l, src109_l);
        DUP4_ARG2(__lsx_vdp2_h_bu_b, src76_r, filt0, src76_l, filt0, src87_r,
                  filt0, src87_l, filt0, out2_r, out2_l, out3_r, out3_l);
        DUP4_ARG3(__lsx_vdp2add_h_bu_b, out2_r, src98_r, filt1, out2_l, src98_l,
                  filt1, out3_r, src109_r, filt1, out3_l, src109_l, filt1,
                  out2_r, out2_l, out3_r, out3_l);
        DUP2_ARG3(__lsx_vssrarni_bu_h, out2_l, out2_r, 6, out3_l, out3_r, 6,
        __lsx_vst(out1, dst, 16);
        __lsx_vst(out2, dst + dst_stride, 16);
        dst += dst_stride_2x;
        int32_t dst_stride, const int8_t *filter_x,
        const int8_t *filter_y)
    const int32_t src_stride_2x = (src_stride << 1);
    const int32_t src_stride_4x = (src_stride << 2);
    const int32_t src_stride_3x = src_stride_2x + src_stride;
    __m128i filt0, filt1;
    __m128i filt_h0, filt_h1, filter_vec;
    __m128i vec0, vec1, vec2, vec3, vec4, vec5, vec6, vec7, vec8, vec9;
    __m128i dst0, dst1, dst2, dst3, dst4;
    __m128i dst0_r, dst0_l, dst1_r, dst1_l;
    __m128i dst10_r, dst32_r, dst21_r, dst43_r;
    __m128i dst10_l, dst32_l, dst21_l, dst43_l;
    __m128i out0_r, out1_r;
    src -= (src_stride + 1);
    DUP2_ARG2(__lsx_vldrepl_h, filter_x, 0, filter_x, 2, filt0, filt1);
    filter_vec = __lsx_vld(filter_y, 0);
    filter_vec = __lsx_vsllwil_h_b(filter_vec, 0);
    DUP2_ARG2(__lsx_vreplvei_w, filter_vec, 0, filter_vec, 1, filt_h0, filt_h1);
    mask1 = __lsx_vaddi_bu(mask0, 2);
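    /* 8x2 block: five rows are filtered horizontally with the 4-tap filter,
     * interleaved, then two output rows are formed with the vertical tap
     * pairs filt_h0/filt_h1 and stored as two 8-byte rows. */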
              src_stride_3x, src, src_stride_4x, src1, src2, src3, src4);
              mask0, src1, src1, mask1, vec0, vec1, vec2, vec3);
              mask0, src3, src3, mask1, vec4, vec5, vec6, vec7);
    DUP2_ARG3(__lsx_vshuf_b, src4, src4, mask0, src4, src4, mask1, vec8, vec9);
    DUP4_ARG2(__lsx_vdp2_h_bu_b, vec0, filt0, vec2, filt0, vec4, filt0, vec6,
              filt0, dst0, dst1, dst2, dst3);
    dst4 = __lsx_vdp2_h_bu_b(vec8, filt0);
    DUP4_ARG3(__lsx_vdp2add_h_bu_b, dst0, vec1, filt1, dst1, vec3, filt1, dst2,
              vec5, filt1, dst3, vec7, filt1, dst0, dst1, dst2, dst3);
    dst4 = __lsx_vdp2add_h_bu_b(dst4, vec9, filt1);
    DUP4_ARG2(__lsx_vilvl_h, dst1, dst0, dst2, dst1, dst3, dst2, dst4, dst3,
              dst10_r, dst21_r, dst32_r, dst43_r);
    DUP4_ARG2(__lsx_vilvh_h, dst1, dst0, dst2, dst1, dst3, dst2, dst4, dst3,
              dst10_l, dst21_l, dst32_l, dst43_l);
    DUP4_ARG2(__lsx_vdp2_w_h, dst10_r, filt_h0, dst10_l, filt_h0, dst21_r,
              filt_h0, dst21_l, filt_h0, dst0_r, dst0_l, dst1_r, dst1_l);
    DUP4_ARG3(__lsx_vdp2add_w_h, dst0_r, dst32_r, filt_h1, dst0_l, dst32_l,
              filt_h1, dst1_r, dst43_r, filt_h1, dst1_l, dst43_l, filt_h1,
              dst0_r, dst0_l, dst1_r, dst1_l);
    DUP2_ARG3(__lsx_vsrani_h_w, dst0_l, dst0_r, 6, dst1_l, dst1_r, 6,
    out = __lsx_vssrarni_bu_h(out1_r, out0_r, 6);
    __lsx_vstelm_d(out, dst, 0, 0);
    __lsx_vstelm_d(out, dst + dst_stride, 0, 1);
        int32_t dst_stride, const int8_t *filter_x,
        const int8_t *filter_y)
    const int32_t src_stride_2x = (src_stride << 1);
    const int32_t dst_stride_2x = (dst_stride << 1);
    const int32_t src_stride_4x = (src_stride << 2);
    const int32_t dst_stride_4x = (dst_stride << 2);
    const int32_t src_stride_3x = src_stride_2x + src_stride;
    const int32_t dst_stride_3x = dst_stride_2x + dst_stride;
    __m128i out0, out1, out2;
    __m128i src0, src1, src2, src3, src4, src5, src6, src7, src8;
    __m128i filt0, filt1;
    __m128i filt_h0, filt_h1, filter_vec;
    __m128i vec0, vec1, vec2, vec3, vec4, vec5, vec6, vec7, vec8, vec9;
    __m128i vec10, vec11, vec12, vec13, vec14, vec15, vec16, vec17;
    __m128i dst0, dst1, dst2, dst3, dst4, dst5, dst6, dst7, dst8;
    __m128i dst0_r, dst0_l, dst1_r, dst1_l, dst2_r, dst2_l, dst3_r, dst3_l;
    __m128i dst4_r, dst4_l, dst5_r, dst5_l;
    __m128i dst10_r, dst32_r, dst10_l, dst32_l;
    __m128i dst21_r, dst43_r, dst21_l, dst43_l;
    __m128i dst54_r, dst54_l, dst65_r, dst65_l;
    __m128i dst76_r, dst76_l, dst87_r, dst87_l;
    __m128i out0_r, out1_r, out2_r, out3_r, out4_r, out5_r;
    src -= (src_stride + 1);
    DUP2_ARG2(__lsx_vldrepl_h, filter_x, 0, filter_x, 2, filt0, filt1);
    filter_vec = __lsx_vld(filter_y, 0);
    filter_vec = __lsx_vsllwil_h_b(filter_vec, 0);
    DUP2_ARG2(__lsx_vreplvei_w, filter_vec, 0, filter_vec, 1, filt_h0, filt_h1);
    mask1 = __lsx_vaddi_bu(mask0, 2);
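    /* 8x6 block: nine rows are filtered horizontally up front and six
     * output rows are produced in one pass, without a row loop. */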
              src_stride_3x, src, src_stride_4x, src1, src2, src3, src4);
    src += src_stride_4x;
              src_stride_3x, src, src_stride_4x, src5, src6, src7, src8);
              mask0, src1, src1, mask1, vec0, vec1, vec2, vec3);
              mask0, src3, src3, mask1, vec4, vec5, vec6, vec7);
    DUP4_ARG3(__lsx_vshuf_b, src4, src4, mask0, src4, src4, mask1, src5, src5,
              mask0, src5, src5, mask1, vec8, vec9, vec10, vec11);
    DUP4_ARG3(__lsx_vshuf_b, src6, src6, mask0, src6, src6, mask1, src7, src7,
              mask0, src7, src7, mask1, vec12, vec13, vec14, vec15);
    DUP2_ARG3(__lsx_vshuf_b, src8, src8, mask0, src8, src8, mask1, vec16, vec17);
    DUP4_ARG2(__lsx_vdp2_h_bu_b, vec0, filt0, vec2, filt0, vec4, filt0, vec6,
              filt0, dst0, dst1, dst2, dst3);
    DUP4_ARG2(__lsx_vdp2_h_bu_b, vec8, filt0, vec10, filt0, vec12, filt0, vec14,
              filt0, dst4, dst5, dst6, dst7);
    dst8 = __lsx_vdp2_h_bu_b(vec16, filt0);
    DUP4_ARG3(__lsx_vdp2add_h_bu_b, dst0, vec1, filt1, dst1, vec3, filt1, dst2,
              vec5, filt1, dst3, vec7, filt1, dst0, dst1, dst2, dst3);
    DUP4_ARG3(__lsx_vdp2add_h_bu_b, dst4, vec9, filt1, dst5, vec11, filt1, dst6,
              vec13, filt1, dst7, vec15, filt1, dst4, dst5, dst6, dst7);
    dst8 = __lsx_vdp2add_h_bu_b(dst8, vec17, filt1);
    DUP4_ARG2(__lsx_vilvl_h, dst1, dst0, dst2, dst1, dst3, dst2, dst4, dst3,
              dst10_r, dst21_r, dst32_r, dst43_r);
    DUP4_ARG2(__lsx_vilvh_h, dst1, dst0, dst2, dst1, dst3, dst2, dst4, dst3,
              dst10_l, dst21_l, dst32_l, dst43_l);
    DUP4_ARG2(__lsx_vilvl_h, dst5, dst4, dst6, dst5, dst7, dst6, dst8, dst7,
              dst54_r, dst65_r, dst76_r, dst87_r);
    DUP4_ARG2(__lsx_vilvh_h, dst5, dst4, dst6, dst5, dst7, dst6, dst8, dst7,
              dst54_l, dst65_l, dst76_l, dst87_l);
    DUP4_ARG2(__lsx_vdp2_w_h, dst10_r, filt_h0, dst10_l, filt_h0, dst21_r,
              filt_h0, dst21_l, filt_h0, dst0_r, dst0_l, dst1_r, dst1_l);
    DUP4_ARG2(__lsx_vdp2_w_h, dst32_r, filt_h0, dst32_l, filt_h0, dst43_r,
              filt_h0, dst43_l, filt_h0, dst2_r, dst2_l, dst3_r, dst3_l);
    DUP4_ARG2(__lsx_vdp2_w_h, dst54_r, filt_h0, dst54_l, filt_h0, dst65_r,
              filt_h0, dst65_l, filt_h0, dst4_r, dst4_l, dst5_r, dst5_l);
    DUP4_ARG3(__lsx_vdp2add_w_h, dst0_r, dst32_r, filt_h1, dst0_l, dst32_l,
              filt_h1, dst1_r, dst43_r, filt_h1, dst1_l, dst43_l, filt_h1,
              dst0_r, dst0_l, dst1_r, dst1_l);
    DUP4_ARG3(__lsx_vdp2add_w_h, dst2_r, dst54_r, filt_h1, dst2_l, dst54_l,
              filt_h1, dst3_r, dst65_r, filt_h1, dst3_l, dst65_l, filt_h1,
              dst2_r, dst2_l, dst3_r, dst3_l);
    DUP4_ARG3(__lsx_vdp2add_w_h, dst4_r, dst76_r, filt_h1, dst4_l, dst76_l,
              filt_h1, dst5_r, dst87_r, filt_h1, dst5_l, dst87_l, filt_h1,
              dst4_r, dst4_l, dst5_r, dst5_l);
    DUP4_ARG3(__lsx_vsrani_h_w, dst0_l, dst0_r, 6, dst1_l, dst1_r, 6, dst2_l,
              dst2_r, 6, dst3_l, dst3_r, 6, out0_r, out1_r, out2_r, out3_r);
    DUP2_ARG3(__lsx_vsrani_h_w, dst4_l, dst4_r, 6, dst5_l, dst5_r, 6,
    DUP2_ARG3(__lsx_vssrarni_bu_h, out1_r, out0_r, 6, out3_r, out2_r, 6,
    out2 = __lsx_vssrarni_bu_h(out5_r, out4_r, 6);
    __lsx_vstelm_d(out0, dst, 0, 0);
    __lsx_vstelm_d(out0, dst + dst_stride, 0, 1);
    __lsx_vstelm_d(out1, dst + dst_stride_2x, 0, 0);
    __lsx_vstelm_d(out1, dst + dst_stride_3x, 0, 1);
    dst += dst_stride_4x;
    __lsx_vstelm_d(out2, dst, 0, 0);
    __lsx_vstelm_d(out2, dst + dst_stride, 0, 1);
        int32_t dst_stride, const int8_t *filter_x,
    uint32_t loop_cnt, cnt;
    const uint8_t *src_tmp;
    const int32_t src_stride_2x = (src_stride << 1);
    const int32_t dst_stride_2x = (dst_stride << 1);
    const int32_t src_stride_4x = (src_stride << 2);
    const int32_t dst_stride_4x = (dst_stride << 2);
    const int32_t src_stride_3x = src_stride_2x + src_stride;
    const int32_t dst_stride_3x = dst_stride_2x + dst_stride;
    __m128i filt0, filt1;
    __m128i filt_h0, filt_h1, filter_vec;
    __m128i vec0, vec1, vec2, vec3, vec4, vec5, vec6, vec7;
    __m128i dst0, dst1, dst2, dst3, dst4, dst5;
    __m128i dst0_r, dst0_l, dst1_r, dst1_l, dst2_r, dst2_l, dst3_r, dst3_l;
    __m128i dst10_r, dst32_r, dst21_r, dst43_r;
    __m128i dst10_l, dst32_l, dst21_l, dst43_l;
    __m128i dst54_r, dst54_l, dst65_r, dst65_l, dst6;
    __m128i out0_r, out1_r, out2_r, out3_r;
    src -= (src_stride + 1);
    DUP2_ARG2(__lsx_vldrepl_h, filter_x, 0, filter_x, 2, filt0, filt1);
    filter_vec = __lsx_vld(filter_y, 0);
    filter_vec = __lsx_vsllwil_h_b(filter_vec, 0);
    DUP2_ARG2(__lsx_vreplvei_w, filter_vec, 0, filter_vec, 1, filt_h0, filt_h1);
    mask1 = __lsx_vaddi_bu(mask0, 2);
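    /* The image is processed in 8-pixel columns; each column keeps its own
     * three-row history (dst10/dst21) for the vertical 4-tap pass and
     * produces four output rows per inner-loop iteration. */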
    for (cnt = width8mult; cnt--;) {
        src0 = __lsx_vld(src_tmp, 0);
        DUP2_ARG2(__lsx_vldx, src_tmp, src_stride, src_tmp, src_stride_2x,
        src_tmp += src_stride_3x;
        DUP2_ARG2(__lsx_vdp2_h_bu_b, vec0, filt0, vec2, filt0, dst0, dst1);
        dst2 = __lsx_vdp2_h_bu_b(vec4, filt0);
        DUP2_ARG3(__lsx_vdp2add_h_bu_b, dst0, vec1, filt1, dst1, vec3, filt1,
        dst2 = __lsx_vdp2add_h_bu_b(dst2, vec5, filt1);
        DUP2_ARG2(__lsx_vilvl_h, dst1, dst0, dst2, dst1, dst10_r, dst21_r);
        DUP2_ARG2(__lsx_vilvh_h, dst1, dst0, dst2, dst1, dst10_l, dst21_l);
        for (loop_cnt = (height >> 2); loop_cnt--;) {
            src3 = __lsx_vld(src_tmp, 0);
            DUP2_ARG2(__lsx_vldx, src_tmp, src_stride, src_tmp, src_stride_2x,
            src6 = __lsx_vldx(src_tmp, src_stride_3x);
            src_tmp += src_stride_4x;
            DUP4_ARG3(__lsx_vshuf_b, src3, src3, mask0, src3, src3, mask1, src4,
                      src4, mask0, src4, src4, mask1, vec0, vec1, vec2, vec3);
            DUP4_ARG3(__lsx_vshuf_b, src5, src5, mask0, src5, src5, mask1, src6,
                      src6, mask0, src6, src6, mask1, vec4, vec5, vec6, vec7);
            DUP4_ARG2(__lsx_vdp2_h_bu_b, vec0, filt0, vec2, filt0, vec4, filt0,
                      vec6, filt0, dst3, dst4, dst5, dst6);
            DUP4_ARG3(__lsx_vdp2add_h_bu_b, dst3, vec1, filt1, dst4, vec3,
                      filt1, dst5, vec5, filt1, dst6, vec7, filt1,
                      dst3, dst4, dst5, dst6);
            DUP4_ARG2(__lsx_vilvl_h, dst3, dst2, dst4, dst3, dst5, dst4,
                      dst6, dst5, dst32_r, dst43_r, dst54_r, dst65_r);
            DUP4_ARG2(__lsx_vilvh_h, dst3, dst2, dst4, dst3, dst5, dst4,
                      dst6, dst5, dst32_l, dst43_l, dst54_l, dst65_l);
            DUP4_ARG2(__lsx_vdp2_w_h, dst10_r, filt_h0, dst10_l, filt_h0, dst21_r,
                      filt_h0, dst21_l, filt_h0, dst0_r, dst0_l, dst1_r, dst1_l);
            DUP4_ARG2(__lsx_vdp2_w_h, dst32_r, filt_h0, dst32_l, filt_h0, dst43_r,
                      filt_h0, dst43_l, filt_h0, dst2_r, dst2_l, dst3_r, dst3_l);
            DUP4_ARG3(__lsx_vdp2add_w_h, dst0_r, dst32_r, filt_h1, dst0_l,
                      dst32_l, filt_h1, dst1_r, dst43_r, filt_h1, dst1_l,
                      dst43_l, filt_h1, dst0_r, dst0_l, dst1_r, dst1_l);
            DUP4_ARG3(__lsx_vdp2add_w_h, dst2_r, dst54_r, filt_h1, dst2_l,
                      dst54_l, filt_h1, dst3_r, dst65_r, filt_h1, dst3_l,
                      dst65_l, filt_h1, dst2_r, dst2_l, dst3_r, dst3_l);
            DUP4_ARG3(__lsx_vsrani_h_w, dst0_l, dst0_r, 6, dst1_l, dst1_r, 6,
                      dst2_l, dst2_r, 6, dst3_l, dst3_r, 6, out0_r, out1_r,
            DUP2_ARG3(__lsx_vssrarni_bu_h, out1_r, out0_r, 6, out3_r, out2_r,
            __lsx_vstelm_d(out0, dst_tmp, 0, 0);
            __lsx_vstelm_d(out0, dst_tmp + dst_stride, 0, 1);
            __lsx_vstelm_d(out1, dst_tmp + dst_stride_2x, 0, 0);
            __lsx_vstelm_d(out1, dst_tmp + dst_stride_3x, 0, 1);
            dst_tmp += dst_stride_4x;
        int32_t dst_stride, const int8_t *filter_x,
    } else if (6 == height) {
    } else if (0 == (height & 0x03)) {
        int32_t dst_stride, const int8_t *filter_x,
    const uint8_t *src_tmp;
    const int32_t src_stride_2x = (src_stride << 1);
    const int32_t dst_stride_2x = (dst_stride << 1);
    const int32_t src_stride_4x = (src_stride << 2);
    const int32_t dst_stride_4x = (dst_stride << 2);
    const int32_t src_stride_3x = src_stride_2x + src_stride;
    const int32_t dst_stride_3x = dst_stride_2x + dst_stride;
    __m128i vec0, vec1, vec2, vec3, vec4, vec5, vec6, vec7;
    __m128i mask0, mask1;
    __m128i filt0, filt1, filt_h0, filt_h1, filter_vec, tmp0, tmp1, tmp2, tmp3;
    __m128i dsth0, dsth1, dsth2, dsth3, dsth4, dsth5, dsth6;
    __m128i dst10_r, dst32_r, dst54_r, dst21_r, dst43_r, dst65_r;
    __m128i dst10_l, dst32_l, dst54_l, dst21_l, dst43_l, dst65_l;
    __m128i dst0_r, dst0_l, dst1_r, dst1_l, dst2_r, dst2_l, dst3_r, dst3_l;
    src -= (src_stride + 1);
    DUP2_ARG2(__lsx_vldrepl_h, filter_x, 0, filter_x, 2, filt0, filt1);
    filter_vec = __lsx_vld(filter_y, 0);
    filter_vec = __lsx_vsllwil_h_b(filter_vec, 0);
    DUP2_ARG2(__lsx_vreplvei_w, filter_vec, 0, filter_vec, 1, filt_h0, filt_h1);
    mask1 = __lsx_vaddi_bu(mask0, 2);
    src0 = __lsx_vld(src_tmp, 0);
    src_tmp += src_stride_3x;
    DUP2_ARG2(__lsx_vdp2_h_bu_b, vec0, filt0, vec2, filt0, dsth0, dsth1);
    dsth2 = __lsx_vdp2_h_bu_b(vec4, filt0);
    DUP2_ARG3(__lsx_vdp2add_h_bu_b, dsth0, vec1, filt1, dsth1, vec3, filt1,
              dsth0, dsth1);
    dsth2 = __lsx_vdp2add_h_bu_b(dsth2, vec5, filt1);
    DUP2_ARG2(__lsx_vilvl_h, dsth1, dsth0, dsth2, dsth1, dst10_r, dst21_r);
    DUP2_ARG2(__lsx_vilvh_h, dsth1, dsth0, dsth2, dsth1, dst10_l, dst21_l);
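    /* 8-pixel-wide part: four output rows per iteration, each stored as
     * 8 bytes with __lsx_vstelm_d. */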
    for (loop_cnt = height >> 2; loop_cnt--;) {
        src3 = __lsx_vld(src_tmp, 0);
        DUP2_ARG2(__lsx_vldx, src_tmp, src_stride, src_tmp, src_stride_2x,
                  src4, src5);
        src6 = __lsx_vldx(src_tmp, src_stride_3x);
        src_tmp += src_stride_4x;
        DUP4_ARG3(__lsx_vshuf_b, src3, src3, mask0, src3, src3, mask1, src4,
                  src4, mask0, src4, src4, mask1, vec0, vec1, vec2, vec3);
        DUP4_ARG3(__lsx_vshuf_b, src5, src5, mask0, src5, src5, mask1, src6,
                  src6, mask0, src6, src6, mask1, vec4, vec5, vec6, vec7);
        DUP4_ARG2(__lsx_vdp2_h_bu_b, vec0, filt0, vec2, filt0, vec4, filt0,
                  vec6, filt0, dsth3, dsth4, dsth5, dsth6);
        DUP4_ARG3(__lsx_vdp2add_h_bu_b, dsth3, vec1, filt1, dsth4,
                  vec3, filt1, dsth5, vec5, filt1, dsth6, vec7, filt1,
                  dsth3, dsth4, dsth5, dsth6);
        DUP4_ARG2(__lsx_vilvl_h, dsth3, dsth2, dsth4, dsth3, dsth5, dsth4,
                  dsth6, dsth5, dst32_r, dst43_r, dst54_r, dst65_r);
        DUP4_ARG2(__lsx_vilvh_h, dsth3, dsth2, dsth4, dsth3, dsth5, dsth4,
                  dsth6, dsth5, dst32_l, dst43_l, dst54_l, dst65_l);
        DUP4_ARG2(__lsx_vdp2_w_h, dst10_r, filt_h0, dst10_l, filt_h0, dst21_r,
                  filt_h0, dst21_l, filt_h0, dst0_r, dst0_l, dst1_r, dst1_l);
        DUP4_ARG2(__lsx_vdp2_w_h, dst32_r, filt_h0, dst32_l, filt_h0, dst43_r,
                  filt_h0, dst43_l, filt_h0, dst2_r, dst2_l, dst3_r, dst3_l);
        DUP4_ARG3(__lsx_vdp2add_w_h, dst0_r, dst32_r, filt_h1, dst0_l, dst32_l,
                  filt_h1, dst1_r, dst43_r, filt_h1, dst1_l, dst43_l, filt_h1,
                  dst0_r, dst0_l, dst1_r, dst1_l);
        DUP4_ARG3(__lsx_vdp2add_w_h, dst2_r, dst54_r, filt_h1, dst2_l, dst54_l,
                  filt_h1, dst3_r, dst65_r, filt_h1, dst3_l, dst65_l, filt_h1,
                  dst2_r, dst2_l, dst3_r, dst3_l);
        DUP4_ARG3(__lsx_vsrani_h_w, dst0_l, dst0_r, 6, dst1_l, dst1_r, 6, dst2_l,
                  dst2_r, 6, dst3_l, dst3_r, 6, tmp0, tmp1, tmp2, tmp3);
        DUP2_ARG3(__lsx_vssrarni_bu_h, tmp1, tmp0, 6, tmp3, tmp2, 6, out0, out1);
        __lsx_vstelm_d(out0, dst_tmp, 0, 0);
        __lsx_vstelm_d(out0, dst_tmp + dst_stride, 0, 1);
        __lsx_vstelm_d(out1, dst_tmp + dst_stride_2x, 0, 0);
        __lsx_vstelm_d(out1, dst_tmp + dst_stride_3x, 0, 1);
        dst_tmp += dst_stride_4x;
    src0 = __lsx_vld(src_tmp, 0);
    src_tmp += src_stride_3x;
    DUP2_ARG2(__lsx_vdp2_h_bu_b, vec0, filt0, vec2, filt0, dsth0, dsth1);
    dsth2 = __lsx_vdp2_h_bu_b(vec4, filt0);
    DUP2_ARG3(__lsx_vdp2add_h_bu_b, dsth0, vec1, filt1, dsth1, vec3, filt1,
              dsth0, dsth1);
    dsth2 = __lsx_vdp2add_h_bu_b(dsth2, vec5, filt1);
    DUP2_ARG2(__lsx_vilvl_h, dsth1, dsth0, dsth2, dsth1, dst10_r, dst21_r);
    DUP2_ARG2(__lsx_vilvh_h, dsth1, dsth0, dsth2, dsth1, dst10_l, dst21_l);
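    /* Narrower 4-pixel column: the same filtering, but each output row
     * stores only 4 bytes (__lsx_vstelm_w, elements 0 and 2). */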
    for (loop_cnt = height >> 2; loop_cnt--;) {
        src3 = __lsx_vld(src_tmp, 0);
        DUP2_ARG2(__lsx_vldx, src_tmp, src_stride, src_tmp, src_stride_2x,
                  src4, src5);
        src6 = __lsx_vldx(src_tmp, src_stride_3x);
        src_tmp += src_stride_4x;
        DUP4_ARG3(__lsx_vshuf_b, src3, src3, mask0, src3, src3, mask1, src4,
                  src4, mask0, src4, src4, mask1, vec0, vec1, vec2, vec3);
        DUP4_ARG3(__lsx_vshuf_b, src5, src5, mask0, src5, src5, mask1, src6,
                  src6, mask0, src6, src6, mask1, vec4, vec5, vec6, vec7);
        DUP4_ARG2(__lsx_vdp2_h_bu_b, vec0, filt0, vec2, filt0, vec4, filt0,
                  vec6, filt0, dsth3, dsth4, dsth5, dsth6);
        DUP4_ARG3(__lsx_vdp2add_h_bu_b, dsth3, vec1, filt1, dsth4,
                  vec3, filt1, dsth5, vec5, filt1, dsth6, vec7, filt1,
                  dsth3, dsth4, dsth5, dsth6);
        DUP4_ARG2(__lsx_vilvl_h, dsth3, dsth2, dsth4, dsth3, dsth5, dsth4,
                  dsth6, dsth5, dst32_r, dst43_r, dst54_r, dst65_r);
        DUP4_ARG2(__lsx_vilvh_h, dsth3, dsth2, dsth4, dsth3, dsth5, dsth4,
                  dsth6, dsth5, dst32_l, dst43_l, dst54_l, dst65_l);
        DUP4_ARG2(__lsx_vdp2_w_h, dst10_r, filt_h0, dst10_l, filt_h0, dst21_r,
                  filt_h0, dst21_l, filt_h0, dst0_r, dst0_l, dst1_r, dst1_l);
        DUP4_ARG2(__lsx_vdp2_w_h, dst32_r, filt_h0, dst32_l, filt_h0, dst43_r,
                  filt_h0, dst43_l, filt_h0, dst2_r, dst2_l, dst3_r, dst3_l);
        DUP4_ARG3(__lsx_vdp2add_w_h, dst0_r, dst32_r, filt_h1, dst0_l, dst32_l,
                  filt_h1, dst1_r, dst43_r, filt_h1, dst1_l, dst43_l, filt_h1,
                  dst0_r, dst0_l, dst1_r, dst1_l);
        DUP4_ARG3(__lsx_vdp2add_w_h, dst2_r, dst54_r, filt_h1, dst2_l, dst54_l,
                  filt_h1, dst3_r, dst65_r, filt_h1, dst3_l, dst65_l, filt_h1,
                  dst2_r, dst2_l, dst3_r, dst3_l);
        DUP4_ARG3(__lsx_vsrani_h_w, dst0_l, dst0_r, 6, dst1_l, dst1_r, 6, dst2_l,
                  dst2_r, 6, dst3_l, dst3_r, 6, tmp0, tmp1, tmp2, tmp3);
        DUP2_ARG3(__lsx_vssrarni_bu_h, tmp1, tmp0, 6, tmp3, tmp2, 6, out0, out1);
        __lsx_vstelm_w(out0, dst_tmp, 0, 0);
        __lsx_vstelm_w(out0, dst_tmp + dst_stride, 0, 2);
        __lsx_vstelm_w(out1, dst_tmp + dst_stride_2x, 0, 0);
        __lsx_vstelm_w(out1, dst_tmp + dst_stride_3x, 0, 2);
        dst_tmp += dst_stride_4x;
        int32_t dst_stride, const int8_t *filter_x,
        int32_t dst_stride, const int8_t *filter_x,
        int32_t dst_stride, const int8_t *filter_x,
#define UNI_MC(PEL, DIR, WIDTH, TAP, DIR1, FILT_DIR) \
void ff_hevc_put_hevc_uni_##PEL##_##DIR##WIDTH##_8_lsx(uint8_t *dst, \
                                                       ptrdiff_t dst_stride, \
                                                       const uint8_t *src, \
                                                       ptrdiff_t src_stride, \
    const int8_t *filter = ff_hevc_##PEL##_filters[FILT_DIR]; \
    common_##DIR1##_##TAP##t_##WIDTH##w_lsx(src, src_stride, dst, dst_stride, \

#define UNI_MC_HV(PEL, WIDTH, TAP) \
void ff_hevc_put_hevc_uni_##PEL##_hv##WIDTH##_8_lsx(uint8_t *dst, \
                                                    ptrdiff_t dst_stride, \
                                                    const uint8_t *src, \
                                                    ptrdiff_t src_stride, \
    const int8_t *filter_x = ff_hevc_##PEL##_filters[mx]; \
    const int8_t *filter_y = ff_hevc_##PEL##_filters[my]; \
    hevc_hv_##TAP##t_##WIDTH##w_lsx(src, src_stride, dst, dst_stride, \
                                    filter_x, filter_y, height); \