0, 1, 1, 2, 2, 3, 3, 4, 4, 5, 5, 6, 6, 7, 7, 8,
0, 1, 1, 2, 2, 3, 3, 4, 16, 17, 17, 18, 18, 19, 19, 20
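/* Bi-prediction combine step used by the copy and filter paths below:
 * saturating-add the two 16-bit intermediate predictions, then apply a
 * rounding right shift by 7 and saturate-narrow to unsigned 8-bit pixels. */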
vec0 = __lsx_vsadd_h(in0, vec0);
vec1 = __lsx_vsadd_h(in1, vec1);
out = __lsx_vssrarni_bu_h(vec1, vec0, 7);
const int16_t *src1_ptr, int32_t src2_stride,
int32_t src_stride_2x = (src_stride << 1);
int32_t dst_stride_2x = (dst_stride << 1);
int32_t src_stride_4x = (src_stride << 2);
int32_t dst_stride_4x = (dst_stride << 2);
int32_t src2_stride_2x = (src2_stride << 1);
int32_t src2_stride_4x = (src2_stride << 2);
int32_t src_stride_3x = src_stride_2x + src_stride;
int32_t dst_stride_3x = dst_stride_2x + dst_stride;
int32_t src2_stride_3x = src2_stride_2x + src2_stride;
__m128i zero = __lsx_vldi(0);
__m128i in0, in1, in2, in3;
__m128i tmp0, tmp1, tmp2, tmp3;
__m128i reg0, reg1, reg2, reg3;
__m128i dst0, dst1, dst2, dst3;
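/* 4-pixel-wide copy: each pass gathers eight 4-byte source rows (word loads
 * interleaved into two vectors) and eight 4-sample rows of 16-bit reference
 * data, combines them, and stores 4 bytes per output row. */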
reg0 = __lsx_vldrepl_w(src0_ptr, 0);
reg1 = __lsx_vldrepl_w(src0_ptr + src_stride, 0);
reg2 = __lsx_vldrepl_w(src0_ptr + src_stride_2x, 0);
reg3 = __lsx_vldrepl_w(src0_ptr + src_stride_3x, 0);
src0_ptr += src_stride_4x;
DUP2_ARG2(__lsx_vilvl_w, reg1, reg0, reg3, reg2, tmp0, tmp1);
src0 = __lsx_vilvl_d(tmp1, tmp0);
reg0 = __lsx_vldrepl_w(src0_ptr, 0);
reg1 = __lsx_vldrepl_w(src0_ptr + src_stride, 0);
reg2 = __lsx_vldrepl_w(src0_ptr + src_stride_2x, 0);
reg3 = __lsx_vldrepl_w(src0_ptr + src_stride_3x, 0);
DUP2_ARG2(__lsx_vilvl_w, reg1, reg0, reg3, reg2, tmp0, tmp1);
src1 = __lsx_vilvl_d(tmp1, tmp0);
src0_ptr += src_stride_4x;
tmp0 = __lsx_vldrepl_d(src1_ptr, 0);
tmp1 = __lsx_vldrepl_d(src1_ptr + src2_stride, 0);
tmp2 = __lsx_vldrepl_d(src1_ptr + src2_stride_2x, 0);
tmp3 = __lsx_vldrepl_d(src1_ptr + src2_stride_3x, 0);
src1_ptr += src2_stride_4x;
DUP2_ARG2(__lsx_vilvl_d, tmp1, tmp0, tmp3, tmp2, in0, in1);
tmp0 = __lsx_vldrepl_d(src1_ptr, 0);
tmp1 = __lsx_vldrepl_d(src1_ptr + src2_stride, 0);
tmp2 = __lsx_vldrepl_d(src1_ptr + src2_stride_2x, 0);
tmp3 = __lsx_vldrepl_d(src1_ptr + src2_stride_3x, 0);
src1_ptr += src2_stride_4x;
DUP2_ARG2(__lsx_vilvl_d, tmp1, tmp0, tmp3, tmp2, in2, in3);
DUP2_ARG2(__lsx_vslli_h, dst1, 6, dst3, 6, dst1, dst3);
__lsx_vstelm_w(dst0, dst, 0, 0);
__lsx_vstelm_w(dst0, dst + dst_stride, 0, 1);
__lsx_vstelm_w(dst0, dst + dst_stride_2x, 0, 2);
__lsx_vstelm_w(dst0, dst + dst_stride_3x, 0, 3);
dst += dst_stride_4x;
__lsx_vstelm_w(dst1, dst, 0, 0);
__lsx_vstelm_w(dst1, dst + dst_stride, 0, 1);
__lsx_vstelm_w(dst1, dst + dst_stride_2x, 0, 2);
__lsx_vstelm_w(dst1, dst + dst_stride_3x, 0, 3);
dst += dst_stride_4x;
reg0 = __lsx_vldrepl_w(src0_ptr, 0);
reg1 = __lsx_vldrepl_w(src0_ptr + src_stride, 0);
reg2 = __lsx_vldrepl_d(src1_ptr, 0);
reg3 = __lsx_vldrepl_d(src1_ptr + src2_stride, 0);
src0 = __lsx_vilvl_w(reg1, reg0);
in0 = __lsx_vilvl_d(reg3, reg2);
dst0 = __lsx_vsllwil_hu_bu(src0, 6);
dst0 = __lsx_vsadd_h(dst0, in0);
dst0 = __lsx_vssrarni_bu_h(dst0, dst0, 7);
__lsx_vstelm_w(dst0, dst, 0, 0);
__lsx_vstelm_w(dst0, dst + dst_stride, 0, 1);
src0_ptr += src_stride_2x;
src1_ptr += src2_stride_2x;
dst += dst_stride_2x;
const int16_t *src1_ptr, int32_t src2_stride,
int32_t src_stride_2x = (src_stride << 1);
int32_t dst_stride_2x = (dst_stride << 1);
int32_t src_stride_4x = (src_stride << 2);
int32_t dst_stride_4x = (dst_stride << 2);
int32_t src2_stride_x = (src2_stride << 1);
int32_t src2_stride_2x = (src2_stride << 2);
int32_t src_stride_3x = src_stride_2x + src_stride;
int32_t dst_stride_3x = dst_stride_2x + dst_stride;
int32_t src2_stride_3x = src2_stride_2x + src2_stride_x;
__m128i out0, out1, out2, out3;
__m128i zero = __lsx_vldi(0);
__m128i in0, in1, in2, in3, in4, in5, in6, in7;
__m128i dst0, dst1, dst2, dst3, dst4, dst5, dst6, dst7;
__m128i reg0, reg1, reg2, reg3;
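/* 6-pixel-wide copy: eight rows per iteration. Each output row is written as
 * a 4-byte word at offset 0 plus a 2-byte halfword at offset 4. */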
for (loop_cnt = (height >> 3); loop_cnt--;) {
reg0 = __lsx_vldrepl_d(src0_ptr, 0);
reg1 = __lsx_vldrepl_d(src0_ptr + src_stride, 0);
reg2 = __lsx_vldrepl_d(src0_ptr + src_stride_2x, 0);
reg3 = __lsx_vldrepl_d(src0_ptr + src_stride_3x, 0);
src0_ptr += src_stride_4x;
reg0 = __lsx_vldrepl_d(src0_ptr, 0);
reg1 = __lsx_vldrepl_d(src0_ptr + src_stride, 0);
reg2 = __lsx_vldrepl_d(src0_ptr + src_stride_2x, 0);
reg3 = __lsx_vldrepl_d(src0_ptr + src_stride_3x, 0);
DUP2_ARG2(__lsx_vilvl_d, reg1, reg0, reg3, reg2, src2, src3);
src0_ptr += src_stride_4x;
in0 = __lsx_vld(src1_ptr, 0);
DUP2_ARG2(__lsx_vldx, src1_ptr, src2_stride_x, src1_ptr,
src2_stride_2x, in1, in2);
in3 = __lsx_vldx(src1_ptr, src2_stride_3x);
src1_ptr += src2_stride_2x;
in4 = __lsx_vld(src1_ptr, 0);
DUP2_ARG2(__lsx_vldx, src1_ptr, src2_stride_x, src1_ptr,
src2_stride_2x, in5, in6);
in7 = __lsx_vldx(src1_ptr, src2_stride_3x);
src1_ptr += src2_stride_2x;
dst0, dst2, dst4, dst6);
dst1, dst3, dst5, dst7);
DUP4_ARG2(__lsx_vslli_h, dst1, 6, dst3, 6, dst5, 6, dst7, 6, dst1, dst3,
__lsx_vstelm_w(out0, dst, 0, 0);
__lsx_vstelm_w(out0, dst + dst_stride, 0, 2);
__lsx_vstelm_h(out0, dst, 4, 2);
__lsx_vstelm_h(out0, dst + dst_stride, 4, 6);
__lsx_vstelm_w(out1, dst + dst_stride_2x, 0, 0);
__lsx_vstelm_w(out1, dst + dst_stride_3x, 0, 2);
__lsx_vstelm_h(out1, dst + dst_stride_2x, 4, 2);
__lsx_vstelm_h(out1, dst + dst_stride_3x, 4, 6);
dst += dst_stride_4x;
__lsx_vstelm_w(out2, dst, 0, 0);
__lsx_vstelm_w(out2, dst + dst_stride, 0, 2);
__lsx_vstelm_h(out2, dst, 4, 2);
__lsx_vstelm_h(out2, dst + dst_stride, 4, 6);
__lsx_vstelm_w(out3, dst + dst_stride_2x, 0, 0);
__lsx_vstelm_w(out3, dst + dst_stride_3x, 0, 2);
__lsx_vstelm_h(out3, dst + dst_stride_2x, 4, 2);
__lsx_vstelm_h(out3, dst + dst_stride_3x, 4, 6);
dst += dst_stride_4x;
reg0 = __lsx_vldrepl_d(src0_ptr, 0);
reg1 = __lsx_vldrepl_d(src0_ptr + src_stride, 0);
src0 = __lsx_vilvl_d(reg1, reg0);
src0_ptr += src_stride_2x;
in0 = __lsx_vld(src1_ptr, 0);
in1 = __lsx_vldx(src1_ptr, src2_stride_x);
src1_ptr += src2_stride_x;
dst0 = __lsx_vsllwil_hu_bu(src0, 6);
dst1 = __lsx_vslli_h(dst1, 6);
__lsx_vstelm_w(out0, dst, 0, 0);
__lsx_vstelm_h(out0, dst, 4, 2);
__lsx_vstelm_w(out0, dst, 0, 2);
__lsx_vstelm_h(out0, dst, 4, 6);
const int16_t *src1_ptr, int32_t src2_stride,
int32_t src_stride_2x = (src_stride << 1);
int32_t dst_stride_2x = (dst_stride << 1);
int32_t src_stride_4x = (src_stride << 2);
int32_t dst_stride_4x = (dst_stride << 2);
int32_t src2_stride_x = (src2_stride << 1);
int32_t src2_stride_2x = (src2_stride << 2);
int32_t src_stride_3x = src_stride_2x + src_stride;
int32_t dst_stride_3x = dst_stride_2x + dst_stride;
int32_t src2_stride_3x = src2_stride_2x + src2_stride_x;
__m128i out0, out1, out2, out3;
__m128i zero = __lsx_vldi(0);
__m128i in0, in1, in2, in3, in4, in5, in6, in7;
__m128i dst0, dst1, dst2, dst3, dst4, dst5, dst6, dst7;
__m128i reg0, reg1, reg2, reg3;
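/* 8-pixel-wide copy: eight rows per iteration, each output row stored as one
 * doubleword; a two-row tail covers heights that are not a multiple of 8. */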
for (loop_cnt = (height >> 3); loop_cnt--;) {
reg0 = __lsx_vldrepl_d(src0_ptr, 0);
reg1 = __lsx_vldrepl_d(src0_ptr + src_stride, 0);
reg2 = __lsx_vldrepl_d(src0_ptr + src_stride_2x, 0);
reg3 = __lsx_vldrepl_d(src0_ptr + src_stride_3x, 0);
src0_ptr += src_stride_4x;
reg0 = __lsx_vldrepl_d(src0_ptr, 0);
reg1 = __lsx_vldrepl_d(src0_ptr + src_stride, 0);
reg2 = __lsx_vldrepl_d(src0_ptr + src_stride_2x, 0);
reg3 = __lsx_vldrepl_d(src0_ptr + src_stride_3x, 0);
DUP2_ARG2(__lsx_vilvl_d, reg1, reg0, reg3, reg2, src2, src3);
src0_ptr += src_stride_4x;
dst0, dst2, dst4, dst6);
src3, dst1, dst3, dst5, dst7);
DUP4_ARG2(__lsx_vslli_h, dst1, 6, dst3, 6, dst5, 6, dst7, 6, dst1,
in0 = __lsx_vld(src1_ptr, 0);
DUP2_ARG2(__lsx_vldx, src1_ptr, src2_stride_x, src1_ptr,
src2_stride_2x, in1, in2);
in3 = __lsx_vldx(src1_ptr, src2_stride_3x);
src1_ptr += src2_stride_2x;
in4 = __lsx_vld(src1_ptr, 0);
DUP2_ARG2(__lsx_vldx, src1_ptr, src2_stride_x, src1_ptr,
src2_stride_2x, in5, in6);
in7 = __lsx_vldx(src1_ptr, src2_stride_3x);
src1_ptr += src2_stride_2x;
__lsx_vstelm_d(out0, dst, 0, 0);
__lsx_vstelm_d(out0, dst + dst_stride, 0, 1);
__lsx_vstelm_d(out1, dst + dst_stride_2x, 0, 0);
__lsx_vstelm_d(out1, dst + dst_stride_3x, 0, 1);
dst += dst_stride_4x;
__lsx_vstelm_d(out2, dst, 0, 0);
__lsx_vstelm_d(out2, dst + dst_stride, 0, 1);
__lsx_vstelm_d(out3, dst + dst_stride_2x, 0, 0);
__lsx_vstelm_d(out3, dst + dst_stride_3x, 0, 1);
dst += dst_stride_4x;
reg0 = __lsx_vldrepl_d(src0_ptr, 0);
reg1 = __lsx_vldrepl_d(src0_ptr + src_stride, 0);
src0 = __lsx_vilvl_d(reg1, reg0);
in0 = __lsx_vld(src1_ptr, 0);
in1 = __lsx_vldx(src1_ptr, src2_stride_x);
dst0 = __lsx_vsllwil_hu_bu(src0, 6);
dst1 = __lsx_vslli_h(dst1, 6);
__lsx_vstelm_d(out0, dst, 0, 0);
__lsx_vstelm_d(out0, dst + dst_stride, 0, 1);
src0_ptr += src_stride_2x;
src1_ptr += src2_stride_x;
dst += dst_stride_2x;
const int16_t *src1_ptr, int32_t src2_stride,
uint32_t loop_cnt = height >> 2;
uint32_t res = (height & 3) >> 1;
int32_t src_stride_2x = (src_stride << 1);
int32_t dst_stride_2x = (dst_stride << 1);
int32_t src_stride_4x = (src_stride << 2);
int32_t dst_stride_4x = (dst_stride << 2);
int32_t src2_stride_x = (src2_stride << 1);
int32_t src2_stride_2x = (src2_stride << 2);
int32_t src_stride_3x = src_stride_2x + src_stride;
int32_t dst_stride_3x = dst_stride_2x + dst_stride;
int32_t src2_stride_3x = src2_stride_2x + src2_stride_x;
const int16_t *_src1 = src1_ptr + 8;
__m128i out0, out1, out2;
__m128i in0, in1, in2, in3, in4, in5, in6, in7;
__m128i dst0, dst1, dst2, dst3, dst4, dst5;
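/* 12-pixel-wide copy: four rows per iteration. The left 8 columns use the
 * reference data at src1_ptr, the right 4 columns use _src1 (src1_ptr + 8);
 * each row is stored as a doubleword plus a word at byte offset 8. */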
for (; loop_cnt--;) {
src0 = __lsx_vld(src0_ptr, 0);
DUP2_ARG2(__lsx_vldx, src0_ptr, src_stride, src0_ptr, src_stride_2x, src1, src2);
src3 = __lsx_vldx(src0_ptr, src_stride_3x);
src0_ptr += src_stride_4x;
in0 = __lsx_vld(src1_ptr, 0);
DUP2_ARG2(__lsx_vldx, src1_ptr, src2_stride_x, src1_ptr,
src2_stride_2x, in1, in2);
in3 = __lsx_vldx(src1_ptr, src2_stride_3x);
src1_ptr += src2_stride_2x;
in4 = __lsx_vld(_src1, 0);
DUP2_ARG2(__lsx_vldx, _src1, src2_stride_x, _src1, src2_stride_2x, in5, in6);
in7 = __lsx_vldx(_src1, src2_stride_3x);
_src1 += src2_stride_2x;
DUP2_ARG2(__lsx_vilvl_d, in5, in4, in7, in6, in4, in5);
dst0, dst1, dst2, dst3)
__lsx_vstelm_d(out0, dst, 0, 0);
__lsx_vstelm_d(out0, dst + dst_stride, 0, 1);
__lsx_vstelm_d(out1, dst + dst_stride_2x, 0, 0);
__lsx_vstelm_d(out1, dst + dst_stride_3x, 0, 1);
__lsx_vstelm_w(out2, dst, 8, 0);
__lsx_vstelm_w(out2, dst + dst_stride, 8, 1);
__lsx_vstelm_w(out2, dst + dst_stride_2x, 8, 2);
__lsx_vstelm_w(out2, dst + dst_stride_3x, 8, 3);
dst += dst_stride_4x;
src0 = __lsx_vld(src0_ptr, 0);
src1 = __lsx_vld(src0_ptr + src_stride, 0);
in0 = __lsx_vld(src1_ptr, 0);
in1 = __lsx_vldx(src1_ptr, src2_stride_x);
dst0 = __lsx_vsllwil_hu_bu(src0, 6);
dst1 = __lsx_vsllwil_hu_bu(src1, 6);
__lsx_vstelm_d(out0, dst, 0, 0);
__lsx_vstelm_d(out0, dst + dst_stride, 0, 1);
in0 = __lsx_vldrepl_d(_src1, 0);
in1 = __lsx_vldrepl_d(_src1 + src2_stride, 0);
in0 = __lsx_vilvl_d(in1, in0);
dst0 = __lsx_vsllwil_hu_bu(src0, 6);
dst0 = __lsx_vsadd_h(dst0, in0);
dst0 = __lsx_vssrarni_bu_h(dst0, dst0, 7);
__lsx_vstelm_w(dst0, dst, 8, 0);
__lsx_vstelm_w(dst0, dst + dst_stride, 8, 1);
src0_ptr += src_stride_2x;
_src1 += src2_stride_x;
src1_ptr += src2_stride_x;
dst += dst_stride_2x;
const int16_t *src1_ptr, int32_t src2_stride,
uint32_t loop_cnt = height >> 2;
uint32_t res = (height & 3) >> 1;
int32_t src_stride_2x = (src_stride << 1);
int32_t dst_stride_2x = (dst_stride << 1);
int32_t src_stride_4x = (src_stride << 2);
int32_t dst_stride_4x = (dst_stride << 2);
int32_t src2_stride_x = (src2_stride << 1);
int32_t src2_stride_2x = (src2_stride << 2);
int32_t src_stride_3x = src_stride_2x + src_stride;
int32_t dst_stride_3x = dst_stride_2x + dst_stride;
int32_t src2_stride_3x = src2_stride_2x + src2_stride_x;
const int16_t *_src1 = src1_ptr + 8;
__m128i out0, out1, out2, out3;
__m128i in0, in1, in2, in3, in4, in5, in6, in7;
__m128i dst0_r, dst1_r, dst2_r, dst3_r, dst0_l, dst1_l, dst2_l, dst3_l;
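/* 16-pixel-wide copy: four rows per iteration; the low and high halves of
 * each 16-byte source row are widened separately (dst*_r / dst*_l) and the
 * combined rows are written with full-vector stores. */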
for (; loop_cnt--;) {
src0 = __lsx_vld(src0_ptr, 0);
DUP2_ARG2(__lsx_vldx, src0_ptr, src_stride, src0_ptr, src_stride_2x,
src3 = __lsx_vldx(src0_ptr, src_stride_3x);
src0_ptr += src_stride_4x;
in0 = __lsx_vld(src1_ptr, 0);
DUP2_ARG2(__lsx_vldx, src1_ptr, src2_stride_x, src1_ptr,
src2_stride_2x, in1, in2);
in3 = __lsx_vldx(src1_ptr, src2_stride_3x);
src1_ptr += src2_stride_2x;
in4 = __lsx_vld(_src1, 0);
DUP2_ARG2(__lsx_vldx, _src1, src2_stride_x, _src1, src2_stride_2x,
in7 = __lsx_vldx(_src1, src2_stride_3x);
_src1 += src2_stride_2x;
dst0_r, dst1_r, dst2_r, dst3_r)
dst0_l, dst1_l, dst2_l, dst3_l);
DUP4_ARG2(__lsx_vslli_h, dst0_l, 6, dst1_l, 6, dst2_l, 6, dst3_l, 6,
dst0_l, dst1_l, dst2_l, dst3_l);
__lsx_vst(out0, dst, 0);
__lsx_vstx(out1, dst, dst_stride);
__lsx_vstx(out2, dst, dst_stride_2x);
__lsx_vstx(out3, dst, dst_stride_3x);
dst += dst_stride_4x;
src0 = __lsx_vld(src0_ptr, 0);
src1 = __lsx_vldx(src0_ptr, src_stride);
in0 = __lsx_vld(src1_ptr, 0);
in1 = __lsx_vldx(src1_ptr, src2_stride_x);
in4 = __lsx_vld(_src1, 0);
in5 = __lsx_vldx(_src1, src2_stride_x);
DUP2_ARG2(__lsx_vslli_h, dst0_l, 6, dst1_l, 6, dst0_l, dst1_l);
__lsx_vst(out0, dst, 0);
__lsx_vstx(out1, dst, dst_stride);
src0_ptr += src_stride_2x;
_src1 += src2_stride_x;
src1_ptr += src2_stride_x;
dst += dst_stride_2x;
const int16_t *src1_ptr, int32_t src2_stride,
const int16_t *src1_ptr, int32_t src2_stride,
const int16_t *src1_ptr, int32_t src2_stride,
const int16_t *src1_ptr, int32_t src2_stride,
const int16_t *src1_ptr, int32_t src2_stride,
const int32_t dst_stride_2x = (dst_stride << 1);
__m128i filt0, filt1, filt2, filt3;
__m128i mask1, mask2, mask3;
__m128i vec0, vec1, vec2, vec3;
__m128i dst0, dst1, dst2, dst3;
__m128i in0, in1, in2, in3;
filt0, filt1, filt2, filt3);
DUP2_ARG2(__lsx_vaddi_bu, mask0, 2, mask0, 4, mask1, mask2);
mask3 = __lsx_vaddi_bu(mask0, 6);
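/* 8-tap horizontal filtering, 16 columns, two rows per iteration: four
 * shuffle + dp2/dp2add stages accumulate the eight filter taps two at a time
 * (filt0..filt3) before combining with the 16-bit reference rows. */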
for (loop_cnt = (height >> 1); loop_cnt--;) {
src0_ptr += src_stride;
src0_ptr += src_stride;
DUP2_ARG2(__lsx_vld, src1_ptr, 0, src1_ptr, 16, in0, in1);
src1_ptr += src2_stride;
DUP2_ARG2(__lsx_vld, src1_ptr, 0, src1_ptr, 16, in2, in3);
src1_ptr += src2_stride;
DUP4_ARG2(__lsx_vdp2_h_bu_b, vec0, filt0, vec1, filt0, vec2, filt0,
vec3, filt0, dst0, dst1, dst2, dst3);
DUP4_ARG3(__lsx_vdp2add_h_bu_b, dst0, vec0, filt1, dst1, vec1, filt1,
dst2, vec2, filt1, dst3, vec3, filt1, dst0, dst1, dst2, dst3);
DUP4_ARG3(__lsx_vdp2add_h_bu_b, dst0, vec0, filt2, dst1, vec1, filt2,
dst2, vec2, filt2, dst3, vec3, filt2, dst0, dst1, dst2, dst3);
DUP4_ARG3(__lsx_vdp2add_h_bu_b, dst0, vec0, filt3, dst1, vec1, filt3,
dst2, vec2, filt3, dst3, vec3, filt3, dst0, dst1, dst2, dst3);
__lsx_vst(dst0, dst, 0);
__lsx_vstx(dst1, dst, dst_stride);
dst += dst_stride_2x;
const int16_t *src1_ptr, int32_t src2_stride,
__m128i filt0, filt1, filt2, filt3;
__m128i mask1, mask2, mask3, mask4, mask5, mask6, mask7;
__m128i vec0, vec1, vec2, vec3;
__m128i dst0, dst1, dst2;
__m128i in0, in1, in2;
filt0, filt1, filt2, filt3);
DUP4_ARG2(__lsx_vaddi_bu, mask0, 2, mask0, 4, mask0, 6, mask0, 8, mask1,
mask2, mask3, mask4);
DUP2_ARG2(__lsx_vaddi_bu, mask0, 10, mask0, 12, mask5, mask6);
mask7 = __lsx_vaddi_bu(mask0, 14);
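/* 8-tap horizontal filtering over 24 columns, one row per iteration: masks
 * mask0..mask7 select the byte windows needed to filter across the boundary
 * between the two 16-byte source vectors. */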
for (loop_cnt = height; loop_cnt--;) {
src0_ptr += src_stride;
DUP2_ARG2(__lsx_vld, src1_ptr, 0, src1_ptr, 16, in0, in1);
in2 = __lsx_vld(src1_ptr, 32);
src1_ptr += src2_stride;
src1, mask0, src0, src0, mask1, vec0, vec1, vec2, vec3);
DUP2_ARG2(__lsx_vdp2_h_bu_b, vec0, filt0, vec1, filt0, dst0, dst1);
dst2 = __lsx_vdp2_h_bu_b(vec2, filt0);
dst0 = __lsx_vdp2add_h_bu_b(dst0, vec3, filt1);
src0, mask2, src1, src0, mask6, vec0, vec1, vec2, vec3);
DUP4_ARG3(__lsx_vdp2add_h_bu_b, dst1, vec0, filt1, dst2, vec1, filt1,
dst0, vec2, filt2, dst1, vec3, filt2, dst1, dst2, dst0, dst1);
mask7, src1, src1, mask3, vec0, vec1, vec2, vec3);
DUP4_ARG3(__lsx_vdp2add_h_bu_b, dst2, vec0, filt2, dst0, vec1, filt3,
dst1, vec2, filt3, dst2, vec3, filt3, dst2, dst0, dst1, dst2);
dst2 = __lsx_vsadd_h(dst2, in2);
tmp1 = __lsx_vssrarni_bu_h(dst2, dst2, 7);
__lsx_vst(tmp0, dst, 0);
__lsx_vstelm_d(tmp1, dst, 16, 0);
const int16_t *src1_ptr, int32_t src2_stride,
const int16_t *src1_ptr, int32_t src2_stride,
const int16_t *src1_ptr, int32_t src2_stride,
int32_t src_stride_2x = (src_stride << 1);
int32_t dst_stride_2x = (dst_stride << 1);
int32_t src_stride_4x = (src_stride << 2);
int32_t dst_stride_4x = (dst_stride << 2);
int32_t src2_stride_x = (src2_stride << 1);
int32_t src2_stride_2x = (src2_stride << 2);
int32_t src_stride_3x = src_stride_2x + src_stride;
int32_t dst_stride_3x = dst_stride_2x + dst_stride;
int32_t src2_stride_3x = src2_stride_2x + src2_stride_x;
__m128i src6, src7, src8, src9, src10;
__m128i in0, in1, in2, in3;
__m128i src10_r, src32_r, src54_r, src76_r, src98_r;
__m128i src21_r, src43_r, src65_r, src87_r, src109_r;
__m128i dst0_r, dst1_r, dst2_r, dst3_r;
__m128i filt0, filt1, filt2, filt3;
src0_ptr -= src_stride_3x;
filt0, filt1, filt2, filt3);
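/* 8-tap vertical filtering, 8 columns: rewind src0_ptr by three rows, load
 * the first seven rows and pre-interleave adjacent row pairs (src10_r,
 * src21_r, ...); the loop then loads only four new rows per iteration and
 * produces four 8-pixel output rows. */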
src0 = __lsx_vld(src0_ptr, 0);
DUP2_ARG2(__lsx_vldx, src0_ptr, src_stride, src0_ptr, src_stride_2x,
src3 = __lsx_vldx(src0_ptr, src_stride_3x);
src0_ptr += src_stride_4x;
src4 = __lsx_vld(src0_ptr, 0);
DUP2_ARG2(__lsx_vldx, src0_ptr, src_stride, src0_ptr, src_stride_2x,
src0_ptr += src_stride_3x;
src10_r, src32_r, src54_r, src21_r);
DUP2_ARG2(__lsx_vilvl_b, src4, src3, src6, src5, src43_r, src65_r);
for (loop_cnt = (height >> 2); loop_cnt--;) {
src7 = __lsx_vld(src0_ptr, 0);
DUP2_ARG2(__lsx_vldx, src0_ptr, src_stride, src0_ptr, src_stride_2x,
src10 = __lsx_vldx(src0_ptr, src_stride_3x);
src0_ptr += src_stride_4x;
in0 = __lsx_vld(src1_ptr, 0);
DUP2_ARG2(__lsx_vldx, src1_ptr, src2_stride_x, src1_ptr, src2_stride_2x,
in3 = __lsx_vldx(src1_ptr, src2_stride_3x);
src1_ptr += src2_stride_2x;
DUP4_ARG2(__lsx_vilvl_b, src7, src6, src8, src7, src9, src8, src10, src9,
src76_r, src87_r, src98_r, src109_r);
DUP4_ARG2(__lsx_vdp2_h_bu_b, src10_r, filt0, src21_r, filt0, src32_r,
filt0, src43_r, filt0, dst0_r, dst1_r, dst2_r, dst3_r);
DUP4_ARG3(__lsx_vdp2add_h_bu_b, dst0_r, src32_r, filt1, dst1_r, src43_r,
filt1, dst2_r, src54_r, filt1, dst3_r, src65_r, filt1,
dst0_r, dst1_r, dst2_r, dst3_r);
DUP4_ARG3(__lsx_vdp2add_h_bu_b, dst0_r, src54_r, filt2, dst1_r, src65_r,
filt2, dst2_r, src76_r, filt2, dst3_r, src87_r, filt2,
dst0_r, dst1_r, dst2_r, dst3_r);
DUP4_ARG3(__lsx_vdp2add_h_bu_b, dst0_r, src76_r, filt3, dst1_r, src87_r,
filt3, dst2_r, src98_r, filt3, dst3_r, src109_r, filt3,
dst0_r, dst1_r, dst2_r, dst3_r);
__lsx_vstelm_d(dst0_r, dst, 0, 0);
__lsx_vstelm_d(dst0_r, dst + dst_stride, 0, 1);
__lsx_vstelm_d(dst1_r, dst + dst_stride_2x, 0, 0);
__lsx_vstelm_d(dst1_r, dst + dst_stride_3x, 0, 1);
dst += dst_stride_4x;
const int16_t *src1_ptr, int32_t src2_stride,
const uint8_t *src0_ptr_tmp;
const int16_t *src1_ptr_tmp;
int32_t src_stride_2x = (src_stride << 1);
int32_t dst_stride_2x = (dst_stride << 1);
int32_t src_stride_4x = (src_stride << 2);
int32_t src_stride_3x = src_stride_2x + src_stride;
__m128i src0, src1, src2, src3, src4, src5, src6, src7, src8;
__m128i in0, in1, in2, in3;
__m128i src10_r, src32_r, src54_r, src76_r;
__m128i src21_r, src43_r, src65_r, src87_r;
__m128i dst0_r, dst1_r;
__m128i src10_l, src32_l, src54_l, src76_l;
__m128i src21_l, src43_l, src65_l, src87_l;
__m128i dst0_l, dst1_l;
__m128i filt0, filt1, filt2, filt3;
src0_ptr -= src_stride_3x;
filt0, filt1, filt2, filt3);
for (cnt = (width >> 4); cnt--;) {
src0_ptr_tmp = src0_ptr;
src1_ptr_tmp = src1_ptr;
src0 = __lsx_vld(src0_ptr_tmp, 0);
DUP2_ARG2(__lsx_vldx, src0_ptr_tmp, src_stride, src0_ptr_tmp,
src3 = __lsx_vldx(src0_ptr_tmp, src_stride_3x);
src0_ptr_tmp += src_stride_4x;
src4 = __lsx_vld(src0_ptr_tmp, 0);
DUP2_ARG2(__lsx_vldx, src0_ptr_tmp, src_stride, src0_ptr_tmp,
src_stride_2x, src5, src6);
src0_ptr_tmp += src_stride_3x;
src10_r, src32_r, src54_r, src21_r);
DUP2_ARG2(__lsx_vilvl_b, src4, src3, src6, src5, src43_r, src65_r);
src10_l, src32_l, src54_l, src21_l);
DUP2_ARG2(__lsx_vilvh_b, src4, src3, src6, src5, src43_l, src65_l);
for (loop_cnt = (height >> 1); loop_cnt--;) {
src7 = __lsx_vld(src0_ptr_tmp, 0);
src8 = __lsx_vldx(src0_ptr_tmp, src_stride);
src0_ptr_tmp += src_stride_2x;
DUP2_ARG2(__lsx_vld, src1_ptr_tmp, 0, src1_ptr_tmp, 16, in0, in2);
src1_ptr_tmp += src2_stride;
DUP2_ARG2(__lsx_vld, src1_ptr_tmp, 0, src1_ptr_tmp, 16, in1, in3);
src1_ptr_tmp += src2_stride;
DUP2_ARG2(__lsx_vilvl_b, src7, src6, src8, src7, src76_r, src87_r);
DUP2_ARG2(__lsx_vilvh_b, src7, src6, src8, src7, src76_l, src87_l);
DUP4_ARG2(__lsx_vdp2_h_bu_b, src10_r, filt0, src21_r, filt0, src10_l,
filt0, src21_l, filt0, dst0_r, dst1_r, dst0_l, dst1_l);
DUP4_ARG3(__lsx_vdp2add_h_bu_b, dst0_r, src32_r, filt1, dst1_r,
src43_r, filt1, dst0_l, src32_l, filt1, dst1_l, src43_l,
filt1, dst0_r, dst1_r, dst0_l, dst1_l);
DUP4_ARG3(__lsx_vdp2add_h_bu_b, dst0_r, src54_r, filt2, dst1_r,
src65_r, filt2, dst0_l, src54_l, filt2, dst1_l, src65_l,
filt2, dst0_r, dst1_r, dst0_l, dst1_l);
DUP4_ARG3(__lsx_vdp2add_h_bu_b, dst0_r, src76_r, filt3, dst1_r,
src87_r, filt3, dst0_l, src76_l, filt3, dst1_l, src87_l,
filt3, dst0_r, dst1_r, dst0_l, dst1_l);
__lsx_vst(dst0_r, dst_tmp, 0);
__lsx_vstx(dst1_r, dst_tmp, dst_stride);
dst_tmp += dst_stride_2x;
const int16_t *src1_ptr, int32_t src2_stride,
const int16_t *src1_ptr, int32_t src2_stride,
const int16_t *src1_ptr, int32_t src2_stride,
const int16_t *src1_ptr, int32_t src2_stride,
const int16_t *src1_ptr, int32_t src2_stride,
const int16_t *src1_ptr, int32_t src2_stride,
const int8_t *filter_x, const int8_t *filter_y,
const uint8_t *src0_ptr_tmp;
const int16_t *src1_ptr_tmp;
int32_t src_stride_2x = (src_stride << 1);
int32_t src_stride_4x = (src_stride << 2);
int32_t src_stride_3x = src_stride_2x + src_stride;
__m128i filt0, filt1, filt2, filt3;
__m128i filt_h0, filt_h1, filt_h2, filt_h3;
__m128i mask1, mask2, mask3;
__m128i vec0, vec1, vec2, vec3, vec4, vec5, vec6, vec7;
__m128i vec8, vec9, vec10, vec11, vec12, vec13, vec14, vec15;
__m128i dst0, dst1, dst2, dst3, dst4, dst5, dst6, dst7;
__m128i dst0_r, dst0_l;
__m128i dst10_r, dst32_r, dst54_r, dst76_r;
__m128i dst10_l, dst32_l, dst54_l, dst76_l;
src0_ptr -= src_stride_3x + 3;
DUP4_ARG2(__lsx_vldrepl_h, filter_x, 0, filter_x, 2, filter_x, 4, filter_x,
6, filt0, filt1, filt2, filt3);
filt_h3 = __lsx_vld(filter_y, 0);
filt_h3 = __lsx_vsllwil_h_b(filt_h3, 0);
DUP4_ARG2(__lsx_vreplvei_w, filt_h3, 0, filt_h3, 1, filt_h3, 2, filt_h3, 3,
filt_h0, filt_h1, filt_h2, filt_h3);
DUP2_ARG2(__lsx_vaddi_bu, mask0, 2, mask0, 4, mask1, mask2);
mask3 = __lsx_vaddi_bu(mask0, 6);
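/* 8-tap horizontal + 8-tap vertical (hv) filtering, processed in 8-pixel-wide
 * columns. The first seven rows of each column are filtered horizontally into
 * 16-bit intermediates up front; the per-row loop then filters one new row,
 * applies the vertical taps in 32-bit precision (>> 6) and combines the
 * result with the 16-bit reference row before storing 8 pixels. */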
for (cnt = width >> 3; cnt--;) {
src0_ptr_tmp = src0_ptr;
src1_ptr_tmp = src1_ptr;
src0 = __lsx_vld(src0_ptr_tmp, 0);
DUP2_ARG2(__lsx_vldx, src0_ptr_tmp, src_stride, src0_ptr_tmp,
src3 = __lsx_vldx(src0_ptr_tmp, src_stride_3x);
src0_ptr_tmp += src_stride_4x;
src4 = __lsx_vld(src0_ptr_tmp, 0);
DUP2_ARG2(__lsx_vldx, src0_ptr_tmp, src_stride, src0_ptr_tmp,
src_stride_2x, src5, src6);
src0_ptr_tmp += src_stride_3x;
src0, mask2, src0, src0, mask3, vec0, vec1, vec2, vec3);
src1, mask2, src1, src1, mask3, vec4, vec5, vec6, vec7);
src2, mask2, src2, src2, mask3, vec8, vec9, vec10, vec11);
DUP4_ARG3(__lsx_vshuf_b, src3, src3, mask0, src3, src3, mask1, src3,
src3, mask2, src3, src3, mask3, vec12, vec13, vec14, vec15);
DUP4_ARG2(__lsx_vdp2_h_bu_b, vec0, filt0, vec4, filt0, vec8, filt0,
vec12, filt0, dst0, dst1, dst2, dst3);
DUP4_ARG3(__lsx_vdp2add_h_bu_b, dst0, vec1, filt1, dst1, vec5, filt1,
dst2, vec9, filt1, dst3, vec13, filt1, dst0, dst1, dst2, dst3);
DUP4_ARG3(__lsx_vdp2add_h_bu_b, dst0, vec2, filt2, dst1, vec6, filt2,
dst2, vec10, filt2, dst3, vec14, filt2, dst0, dst1, dst2, dst3);
DUP4_ARG3(__lsx_vdp2add_h_bu_b, dst0, vec3, filt3, dst1, vec7, filt3,
dst2, vec11, filt3, dst3, vec15, filt3, dst0, dst1, dst2, dst3);
DUP4_ARG3(__lsx_vshuf_b, src4, src4, mask0, src4, src4, mask1, src4,
src4, mask2, src4, src4, mask3, vec0, vec1, vec2, vec3);
DUP4_ARG3(__lsx_vshuf_b, src5, src5, mask0, src5, src5, mask1, src5,
src5, mask2, src5, src5, mask3, vec4, vec5, vec6, vec7);
DUP4_ARG3(__lsx_vshuf_b, src6, src6, mask0, src6, src6, mask1, src6,
src6, mask2, src6, src6, mask3, vec8, vec9, vec10, vec11);
DUP2_ARG2(__lsx_vdp2_h_bu_b, vec0, filt0, vec4, filt0, dst4, dst5);
dst6 = __lsx_vdp2_h_bu_b(vec8, filt0);
DUP4_ARG3(__lsx_vdp2add_h_bu_b, dst4, vec1, filt1, dst5, vec5, filt1,
dst6, vec9, filt1, dst4, vec2, filt2, dst4, dst5, dst6, dst4);
DUP4_ARG3(__lsx_vdp2add_h_bu_b, dst5, vec6, filt2, dst6, vec10, filt2,
dst4, vec3, filt3, dst5, vec7, filt3, dst5, dst6, dst4, dst5);
dst6 = __lsx_vdp2add_h_bu_b(dst6, vec11, filt3);
for (loop_cnt = height; loop_cnt--;) {
src7 = __lsx_vld(src0_ptr_tmp, 0);
src0_ptr_tmp += src_stride;
in0 = __lsx_vld(src1_ptr_tmp, 0);
src1_ptr_tmp += src2_stride;
DUP4_ARG3(__lsx_vshuf_b, src7, src7, mask0, src7, src7, mask1, src7,
src7, mask2, src7, src7, mask3, vec0, vec1, vec2, vec3);
dst7 = __lsx_vdp2_h_bu_b(vec0, filt0);
DUP2_ARG3(__lsx_vdp2add_h_bu_b, dst7, vec1, filt1, dst7, vec2,
dst7 = __lsx_vdp2add_h_bu_b(dst7, vec3, filt3);
DUP4_ARG2(__lsx_vilvl_h, dst1, dst0, dst3, dst2, dst5, dst4, dst7,
dst6, dst10_r, dst32_r, dst54_r, dst76_r);
DUP4_ARG2(__lsx_vilvh_h, dst1, dst0, dst3, dst2, dst5, dst4, dst7,
dst6, dst10_l, dst32_l, dst54_l, dst76_l);
DUP2_ARG2(__lsx_vdp2_w_h, dst10_r, filt_h0, dst10_l, filt_h0,
DUP4_ARG3(__lsx_vdp2add_w_h, dst0_r, dst32_r, filt_h1, dst0_l,
dst32_l, filt_h1, dst0_r, dst54_r, filt_h2, dst0_l,
dst54_l, filt_h2, dst0_r, dst0_l, dst0_r, dst0_l);
DUP2_ARG3(__lsx_vdp2add_w_h, dst0_r, dst76_r, filt_h3, dst0_l,
dst76_l, filt_h3, dst0_r, dst0_l);
dst0_r = __lsx_vsrli_w(dst0_r, 6);
dst0_l = __lsx_vsrli_w(dst0_l, 6);
tmp = __lsx_vpickev_h(dst0_l, dst0_r);
tmp = __lsx_vsadd_h(tmp, in0);
tmp = __lsx_vmaxi_h(tmp, 0);
__lsx_vstelm_d(out, dst_tmp, 0, 0);
dst_tmp += dst_stride;
const int16_t *src1_ptr, int32_t src2_stride,
const int8_t *filter_x, const int8_t *filter_y,
dst, dst_stride, filter_x, filter_y, height, 8);
const int16_t *src1_ptr, int32_t src2_stride,
const int8_t *filter_x, const int8_t *filter_y,
dst, dst_stride, filter_x, filter_y, height, 16);
const int16_t *src1_ptr, int32_t src2_stride,
const int8_t *filter_x, const int8_t *filter_y,
dst, dst_stride, filter_x, filter_y, height, 24);
const int16_t *src1_ptr, int32_t src2_stride,
const int8_t *filter_x, const int8_t *filter_y,
dst, dst_stride, filter_x, filter_y, height, 32);
const int16_t *src1_ptr, int32_t src2_stride,
const int8_t *filter_x, const int8_t *filter_y,
dst, dst_stride, filter_x, filter_y, height, 48);
const int16_t *src1_ptr, int32_t src2_stride,
const int8_t *filter_x, const int8_t *filter_y,
dst, dst_stride, filter_x, filter_y, height, 64);
const int16_t *src1_ptr, int32_t src2_stride,
const int16_t *src1_ptr_tmp;
uint32_t loop_cnt = height >> 2;
uint32_t res = (height & 3) >> 1;
int32_t dst_stride_2x = (dst_stride << 1);
int32_t dst_stride_4x = (dst_stride << 2);
int32_t dst_stride_3x = dst_stride_2x + dst_stride;
int32_t src2_stride_x = src2_stride << 1;
int32_t src2_stride_2x = src2_stride << 2;
int32_t src2_stride_3x = src2_stride_2x + src2_stride_x;
__m128i src0, src1, src2, src3, src4, src5, src6, src7;
__m128i in0, in1, in2, in3, in4, in5, in6, in7;
__m128i filt0, filt1;
__m128i mask1, mask2, mask3;
__m128i vec0, vec1, vec2, vec3;
__m128i dst0, dst1, dst2, dst3, dst4, dst5, dst6, dst7;
DUP2_ARG2(__lsx_vaddi_bu, mask0, 2, mask0, 8, mask1, mask2);
mask3 = __lsx_vaddi_bu(mask0, 10);
src1_ptr_tmp = src1_ptr + 16;
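/* 4-tap horizontal filtering over 24 columns, four rows per iteration: the
 * left 16 columns use mask0/mask1 within a vector and mask2/mask3 across the
 * vector boundary, while the remaining 8 columns (reference data taken from
 * src1_ptr_tmp = src1_ptr + 16) are stored as doublewords. */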
for (; loop_cnt--;) {
src0_ptr += src_stride;
DUP2_ARG2(__lsx_vld, src0_ptr, 0, src0_ptr, 16, src2, src3);
src0_ptr += src_stride;
DUP2_ARG2(__lsx_vld, src0_ptr, 0, src0_ptr, 16, src4, src5);
src0_ptr += src_stride;
DUP2_ARG2(__lsx_vld, src0_ptr, 0, src0_ptr, 16, src6, src7);
src0_ptr += src_stride;
DUP2_ARG2(__lsx_vld, src1_ptr, 0, src1_ptr, 16, in0, in1);
src1_ptr += src2_stride;
DUP2_ARG2(__lsx_vld, src1_ptr, 0, src1_ptr, 16, in2, in3);
src1_ptr += src2_stride;
DUP2_ARG2(__lsx_vld, src1_ptr, 0, src1_ptr, 16, in4, in5);
src1_ptr += src2_stride;
DUP2_ARG2(__lsx_vld, src1_ptr, 0, src1_ptr, 16, in6, in7);
src1_ptr += src2_stride;
src2, mask0, src3, src2, mask2, vec0, vec1, vec2, vec3);
DUP4_ARG2(__lsx_vdp2_h_bu_b, vec0, filt0, vec1, filt0, vec2, filt0,
vec3, filt0, dst0, dst1, dst2, dst3);
src2, mask1, src3, src2, mask3, vec0, vec1, vec2, vec3);
DUP4_ARG3(__lsx_vdp2add_h_bu_b, dst0, vec0, filt1, dst1, vec1, filt1,
dst2, vec2, filt1, dst3, vec3, filt1, dst0, dst1, dst2, dst3);
DUP4_ARG3(__lsx_vshuf_b, src4, src4, mask0, src5, src4, mask2, src6,
src6, mask0, src7, src6, mask2, vec0, vec1, vec2, vec3);
DUP4_ARG2(__lsx_vdp2_h_bu_b, vec0, filt0, vec1, filt0, vec2, filt0,
vec3, filt0, dst4, dst5, dst6, dst7);
DUP4_ARG3(__lsx_vshuf_b, src4, src4, mask1, src5, src4, mask3, src6,
src6, mask1, src7, src6, mask3, vec0, vec1, vec2, vec3);
DUP4_ARG3(__lsx_vdp2add_h_bu_b, dst4, vec0, filt1, dst5, vec1, filt1,
dst6, vec2, filt1, dst7, vec3, filt1, dst4, dst5, dst6, dst7);
__lsx_vst(dst0, dst, 0);
__lsx_vstx(dst1, dst, dst_stride);
__lsx_vstx(dst2, dst, dst_stride_2x);
__lsx_vstx(dst3, dst, dst_stride_3x);
dst += dst_stride_4x;
in0 = __lsx_vld(src1_ptr_tmp, 0);
DUP2_ARG2(__lsx_vldx, src1_ptr_tmp, src2_stride_x, src1_ptr_tmp,
src2_stride_2x, in1, in2);
in3 = __lsx_vldx(src1_ptr_tmp, src2_stride_3x);
src1_ptr_tmp += src2_stride_2x;
src5, mask0, src7, src7, mask0, vec0, vec1, vec2, vec3);
DUP4_ARG2(__lsx_vdp2_h_bu_b, vec0, filt0, vec1, filt0, vec2, filt0,
vec3, filt0, dst0, dst1, dst2, dst3);
src5, mask1, src7, src7, mask1, vec0, vec1, vec2, vec3);
DUP4_ARG3(__lsx_vdp2add_h_bu_b, dst0, vec0, filt1, dst1, vec1, filt1,
dst2, vec2, filt1, dst3, vec3, filt1, dst0, dst1, dst2, dst3);
__lsx_vstelm_d(dst0, dst_tmp, 0, 0);
__lsx_vstelm_d(dst0, dst_tmp + dst_stride, 0, 1);
__lsx_vstelm_d(dst1, dst_tmp + dst_stride_2x, 0, 0);
__lsx_vstelm_d(dst1, dst_tmp + dst_stride_3x, 0, 1);
dst_tmp += dst_stride_4x;
src0_ptr += src_stride;
DUP2_ARG2(__lsx_vld, src0_ptr, 0, src0_ptr, 16, src2, src3);
src0_ptr += src_stride;
DUP2_ARG2(__lsx_vld, src1_ptr, 0, src1_ptr, 16, in0, in1);
src1_ptr += src2_stride;
DUP2_ARG2(__lsx_vld, src1_ptr, 0, src1_ptr, 16, in2, in3);
src1_ptr += src2_stride;
src2, mask0, src3, src2, mask2, vec0, vec1, vec2, vec3);
DUP4_ARG2(__lsx_vdp2_h_bu_b, vec0, filt0, vec1, filt0, vec2, filt0,
vec3, filt0, dst0, dst1, dst2, dst3);
src2, mask1, src3, src2, mask3, vec0, vec1, vec2, vec3);
DUP4_ARG3(__lsx_vdp2add_h_bu_b, dst0, vec0, filt1, dst1, vec1, filt1,
dst2, vec2, filt1, dst3, vec3, filt1, dst0, dst1, dst2, dst3);
__lsx_vst(dst0, dst, 0);
__lsx_vstx(dst1, dst, dst_stride);
dst += dst_stride_2x;
in0 = __lsx_vld(src1_ptr_tmp, 0);
in1 = __lsx_vldx(src1_ptr_tmp, src2_stride_x);
src1_ptr_tmp += src2_stride_x;
DUP2_ARG2(__lsx_vdp2_h_bu_b, vec0, filt0, vec1, filt0, dst0, dst1);
DUP2_ARG3(__lsx_vdp2add_h_bu_b, dst0, vec0, filt1, dst1, vec1, filt1, dst0, dst1);
__lsx_vstelm_d(dst0, dst_tmp, 0, 0);
__lsx_vstelm_d(dst0, dst_tmp + dst_stride, 0, 1);
dst_tmp += dst_stride_2x;
const int16_t *src1_ptr, int32_t src2_stride,
__m128i in0, in1, in2, in3;
__m128i filt0, filt1;
__m128i mask1, mask2, mask3;
__m128i dst0, dst1, dst2, dst3;
__m128i vec0, vec1, vec2, vec3;
DUP2_ARG2(__lsx_vaddi_bu, mask0, 2, mask0, 8, mask1, mask2);
mask3 = __lsx_vaddi_bu(mask0, 10);
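/* 4-tap horizontal filtering over 32 columns, one row per iteration: the row
 * is covered by overlapping 16-byte source loads (the third at byte offset
 * 24), filtered in four 8-pixel groups and combined with four vectors of
 * 16-bit reference data. */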
for (loop_cnt = height; loop_cnt--;) {
src2 = __lsx_vld(src0_ptr, 24);
src0_ptr += src_stride;
DUP4_ARG2(__lsx_vld, src1_ptr, 0, src1_ptr, 16, src1_ptr, 32,
src1_ptr, 48, in0, in1, in2, in3);
src1_ptr += src2_stride;
src1, mask0, src2, src2, mask0, vec0, vec1, vec2, vec3);
DUP4_ARG2(__lsx_vdp2_h_bu_b, vec0, filt0, vec1, filt0, vec2, filt0,
vec3, filt0, dst0, dst1, dst2, dst3);
src1, mask1, src2, src2, mask1, vec0, vec1, vec2, vec3);
DUP4_ARG3(__lsx_vdp2add_h_bu_b, dst0, vec0, filt1, dst1, vec1, filt1,
dst2, vec2, filt1, dst3, vec3, filt1, dst0, dst1, dst2, dst3);
__lsx_vst(dst0, dst, 0);
__lsx_vst(dst1, dst, 16);
const int16_t *src1_ptr, int32_t src2_stride,
int32_t src_stride_2x = (src_stride << 1);
int32_t dst_stride_2x = (dst_stride << 1);
int32_t dst_stride_4x = (dst_stride << 2);
int32_t src_stride_4x = (src_stride << 2);
int32_t src2_stride_x = (src2_stride << 1);
int32_t src2_stride_2x = (src2_stride << 2);
int32_t src_stride_3x = src_stride_2x + src_stride;
int32_t dst_stride_3x = dst_stride_2x + dst_stride;
int32_t src2_stride_3x = src2_stride_2x + src2_stride_x;
const int16_t *_src1 = src1_ptr + 8;
__m128i in0, in1, in2, in3, in4, in5, in6, in7;
__m128i src10_r, src32_r, src21_r, src43_r, src54_r, src65_r;
__m128i dst0_r, dst1_r, dst2_r, dst3_r;
__m128i src10_l, src32_l, src54_l, src21_l, src43_l, src65_l;
__m128i src2110, src4332, src6554;
__m128i dst0_l, dst1_l, filt0, filt1;
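/* 4-tap vertical filtering, 12 columns, four rows per iteration: the left 8
 * columns use the low-half interleaves (src10_r, ...), while the extra 4
 * columns are packed from the high halves into src2110/src4332/src6554 and
 * stored as 4-byte words at byte offset 8. */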
src0_ptr -= src_stride;
src0 = __lsx_vld(src0_ptr, 0);
DUP2_ARG2(__lsx_vldx, src0_ptr, src_stride, src0_ptr, src_stride_2x,
src0_ptr += src_stride_3x;
src2110 = __lsx_vilvl_d(src21_l, src10_l);
for (loop_cnt = (height >> 2); loop_cnt--;) {
src3 = __lsx_vld(src0_ptr, 0);
DUP2_ARG2(__lsx_vldx, src0_ptr, src_stride, src0_ptr, src_stride_2x,
src6 = __lsx_vldx(src0_ptr, src_stride_3x);
src0_ptr += src_stride_4x;
in0 = __lsx_vld(src1_ptr, 0);
DUP2_ARG2(__lsx_vldx, src1_ptr, src2_stride_x, src1_ptr,
src2_stride_2x, in1, in2);
in3 = __lsx_vldx(src1_ptr, src2_stride_3x);
src1_ptr += src2_stride_2x;
in4 = __lsx_vld(_src1, 0);
DUP2_ARG2(__lsx_vldx, _src1, src2_stride_x, _src1, src2_stride_2x,
in7 = __lsx_vldx(_src1, src2_stride_3x);
_src1 += src2_stride_2x;
DUP2_ARG2(__lsx_vilvl_d, in5, in4, in7, in6, in4, in5);
DUP2_ARG2(__lsx_vilvl_b, src3, src2, src4, src3, src32_r, src43_r);
DUP2_ARG2(__lsx_vilvh_b, src3, src2, src4, src3, src32_l, src43_l);
src4332 = __lsx_vilvl_d(src43_l, src32_l);
DUP2_ARG2(__lsx_vilvl_b, src5, src4, src6, src5, src54_r, src65_r);
DUP2_ARG2(__lsx_vilvh_b, src5, src4, src6, src5, src54_l, src65_l);
src6554 = __lsx_vilvl_d(src65_l, src54_l);
DUP4_ARG2(__lsx_vdp2_h_bu_b, src10_r, filt0, src21_r, filt0, src2110,
filt0, src32_r, filt0, dst0_r, dst1_r, dst0_l, dst2_r);
DUP2_ARG2(__lsx_vdp2_h_bu_b, src43_r, filt0, src4332, filt0,
DUP4_ARG3(__lsx_vdp2add_h_bu_b, dst0_r, src32_r, filt1, dst1_r,
src43_r, filt1, dst0_l, src4332, filt1, dst2_r, src54_r,
filt1, dst0_r, dst1_r, dst0_l, dst2_r);
DUP2_ARG3(__lsx_vdp2add_h_bu_b, dst3_r, src65_r, filt1, dst1_l,
src6554, filt1, dst3_r, dst1_l);
__lsx_vstelm_d(dst0_r, dst, 0, 0);
__lsx_vstelm_d(dst0_r, dst + dst_stride, 0, 1);
__lsx_vstelm_d(dst1_r, dst + dst_stride_2x, 0, 0);
__lsx_vstelm_d(dst1_r, dst + dst_stride_3x, 0, 1);
__lsx_vstelm_w(dst0_l, dst, 8, 0);
__lsx_vstelm_w(dst0_l, dst + dst_stride, 8, 1);
__lsx_vstelm_w(dst0_l, dst + dst_stride_2x, 8, 2);
__lsx_vstelm_w(dst0_l, dst + dst_stride_3x, 8, 3);
dst += dst_stride_4x;
const int16_t *src1_ptr, int32_t src2_stride,
uint32_t loop_cnt = height >> 2;
const int32_t src_stride_2x = (src_stride << 1);
const int32_t dst_stride_2x = (dst_stride << 1);
const int32_t src_stride_3x = src_stride_2x + src_stride;
__m128i in0, in1, in2, in3;
__m128i src10_r, src32_r, src21_r, src43_r;
__m128i src10_l, src32_l, src21_l, src43_l;
__m128i dst0_r, dst1_r, dst0_l, dst1_l;
__m128i filt0, filt1;
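/* 4-tap vertical filtering, 16 columns: each iteration covers four rows in
 * two halves; each half loads two new rows, filters the low and high byte
 * interleaves separately and writes two full-vector output rows. */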
src0_ptr -= src_stride;
src0 = __lsx_vld(src0_ptr, 0);
DUP2_ARG2(__lsx_vldx, src0_ptr, src_stride, src0_ptr, src_stride_2x,
src0_ptr += src_stride_3x;
for (; loop_cnt--;) {
src3 = __lsx_vld(src0_ptr, 0);
src4 = __lsx_vldx(src0_ptr, src_stride);
src0_ptr += src_stride_2x;
DUP2_ARG2(__lsx_vld, src1_ptr, 0, src1_ptr, 16, in0, in2);
src1_ptr += src2_stride;
DUP2_ARG2(__lsx_vld, src1_ptr, 0, src1_ptr, 16, in1, in3);
src1_ptr += src2_stride;
DUP2_ARG2(__lsx_vilvl_b, src3, src2, src4, src3, src32_r, src43_r);
DUP2_ARG2(__lsx_vilvh_b, src3, src2, src4, src3, src32_l, src43_l);
DUP4_ARG2(__lsx_vdp2_h_bu_b, src10_r, filt0, src21_r, filt0, src10_l,
filt0, src21_l, filt0, dst0_r, dst1_r, dst0_l, dst1_l);
DUP4_ARG3(__lsx_vdp2add_h_bu_b, dst0_r, src32_r, filt1, dst1_r, src43_r,
filt1, dst0_l, src32_l, filt1, dst1_l, src43_l, filt1,
dst0_r, dst1_r, dst0_l, dst1_l);
__lsx_vst(dst0_r, dst, 0);
__lsx_vstx(dst1_r, dst, dst_stride);
dst += dst_stride_2x;
src5 = __lsx_vld(src0_ptr, 0);
src2 = __lsx_vldx(src0_ptr, src_stride);
src0_ptr += src_stride_2x;
DUP2_ARG2(__lsx_vld, src1_ptr, 0, src1_ptr, 16, in0, in2);
src1_ptr += src2_stride;
DUP2_ARG2(__lsx_vld, src1_ptr, 0, src1_ptr, 16, in1, in3);
src1_ptr += src2_stride;
DUP2_ARG2(__lsx_vilvl_b, src5, src4, src2, src5, src10_r, src21_r);
DUP2_ARG2(__lsx_vilvh_b, src5, src4, src2, src5, src10_l, src21_l);
DUP4_ARG2(__lsx_vdp2_h_bu_b, src32_r, filt0, src32_l, filt0, src43_r,
filt0, src43_l, filt0, dst0_r, dst0_l, dst1_r, dst1_l);
DUP4_ARG3(__lsx_vdp2add_h_bu_b, dst0_r, src10_r, filt1, dst0_l,
src10_l, filt1, dst1_r, src21_r, filt1, dst1_l, src21_l,
filt1, dst0_r, dst0_l, dst1_r, dst1_l);
__lsx_vst(dst0_r, dst, 0);
__lsx_vstx(dst1_r, dst, dst_stride);
dst += dst_stride_2x;
const int16_t *src1_ptr, int32_t src2_stride,
int32_t dst_stride_2x = dst_stride << 1;
__m128i src6, src7, src8, src9, src10, src11;
__m128i in0, in1, in2, in3, in4, in5;
__m128i src10_r, src32_r, src76_r, src98_r;
__m128i src21_r, src43_r, src87_r, src109_r;
__m128i src10_l, src32_l, src21_l, src43_l;
__m128i dst0_r, dst1_r, dst2_r, dst3_r;
__m128i dst0_l, dst1_l;
__m128i filt0, filt1;
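/* 4-tap vertical filtering, 24 columns, two rows per iteration: the left 16
 * columns are filtered from low/high interleaves and written with full-vector
 * stores, while the right 8 columns (src6..src11) use their own interleaves
 * and are stored as doublewords at byte offset 16. */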
src0_ptr -= src_stride;
DUP2_ARG2(__lsx_vld, src0_ptr, 0, src0_ptr, 16, src0, src6);
src0_ptr += src_stride;
DUP2_ARG2(__lsx_vld, src0_ptr, 0, src0_ptr, 16, src1, src7);
src0_ptr += src_stride;
DUP2_ARG2(__lsx_vld, src0_ptr, 0, src0_ptr, 16, src2, src8);
src0_ptr += src_stride;
DUP2_ARG2(__lsx_vilvl_b, src7, src6, src8, src7, src76_r, src87_r);
for (loop_cnt = (height >> 2); loop_cnt--;) {
DUP2_ARG2(__lsx_vld, src0_ptr, 0, src0_ptr, 16, src3, src9);
src0_ptr += src_stride;
DUP2_ARG2(__lsx_vld, src0_ptr, 0, src0_ptr, 16, src4, src10);
src0_ptr += src_stride;
DUP2_ARG2(__lsx_vld, src1_ptr, 0, src1_ptr, 16, in0, in2);
in4 = __lsx_vld(src1_ptr, 32);
src1_ptr += src2_stride;
DUP2_ARG2(__lsx_vld, src1_ptr, 0, src1_ptr, 16, in1, in3);
in5 = __lsx_vld(src1_ptr, 32);
src1_ptr += src2_stride;
DUP2_ARG2(__lsx_vilvl_b, src3, src2, src4, src3, src32_r, src43_r);
DUP2_ARG2(__lsx_vilvh_b, src3, src2, src4, src3, src32_l, src43_l);
DUP2_ARG2(__lsx_vilvl_b, src9, src8, src10, src9, src98_r, src109_r);
DUP4_ARG2(__lsx_vdp2_h_bu_b, src10_r, filt0, src10_l, filt0, src21_r,
filt0, src21_l, filt0, dst0_r, dst0_l, dst1_r, dst1_l);
DUP4_ARG3(__lsx_vdp2add_h_bu_b, dst0_r, src32_r, filt1, dst0_l,
src32_l, filt1, dst1_r, src43_r, filt1, dst1_l, src43_l, filt1,
dst0_r, dst0_l, dst1_r, dst1_l);
DUP2_ARG2(__lsx_vdp2_h_bu_b, src76_r, filt0, src87_r, filt0,
DUP2_ARG3(__lsx_vdp2add_h_bu_b, dst2_r, src98_r, filt1, dst3_r,
src109_r, filt1, dst2_r, dst3_r);
__lsx_vst(dst0_r, dst, 0);
__lsx_vstx(dst1_r, dst, dst_stride);
__lsx_vstelm_d(dst2_r, dst, 16, 0);
__lsx_vstelm_d(dst2_r, dst + dst_stride, 16, 1);
dst += dst_stride_2x;
DUP4_ARG2(__lsx_vld, src0_ptr, 0, src1_ptr, 0, src1_ptr, 16, src1_ptr,
32, src5, in0, in2, in4);
src1_ptr += src2_stride;
DUP4_ARG2(__lsx_vld, src0_ptr, 16, src1_ptr, 0, src1_ptr, 16, src1_ptr,
32, src11, in1, in3, in5);
src1_ptr += src2_stride;
src0_ptr += src_stride;
DUP2_ARG2(__lsx_vld, src0_ptr, 0, src0_ptr, 16, src2, src8);
src0_ptr += src_stride;
DUP2_ARG2(__lsx_vilvl_b, src5, src4, src2, src5, src10_r, src21_r);
DUP2_ARG2(__lsx_vilvh_b, src5, src4, src2, src5, src10_l, src21_l);
DUP2_ARG2(__lsx_vilvl_b, src11, src10, src8, src11, src76_r, src87_r);
DUP4_ARG2(__lsx_vdp2_h_bu_b, src32_r, filt0, src32_l, filt0, src43_r,
filt0, src43_l, filt0, dst0_r, dst0_l, dst1_r, dst1_l);
DUP4_ARG3(__lsx_vdp2add_h_bu_b, dst0_r, src10_r, filt1, dst0_l,
src10_l, filt1, dst1_r, src21_r, filt1, dst1_l, src21_l,
filt1, dst0_r, dst0_l, dst1_r, dst1_l);
DUP2_ARG2(__lsx_vdp2_h_bu_b, src98_r, filt0, src109_r, filt0,
DUP2_ARG3(__lsx_vdp2add_h_bu_b, dst2_r, src76_r, filt1, dst3_r,
src87_r, filt1, dst2_r, dst3_r);
__lsx_vst(dst0_r, dst, 0);
__lsx_vstx(dst1_r, dst, dst_stride);
__lsx_vstelm_d(dst2_r, dst, 16, 0);
__lsx_vstelm_d(dst2_r, dst + dst_stride, 16, 1);
dst += dst_stride_2x;
const int16_t *src1_ptr, int32_t src2_stride,
const int8_t *filter_x, const int8_t *filter_y)
int32_t src_stride_2x = (src_stride << 1);
int32_t src_stride_4x = (src_stride << 2);
int32_t src_stride_3x = src_stride_2x + src_stride;
__m128i filt0, filt1;
__m128i filt_h0, filt_h1;
__m128i mask1, filter_vec;
__m128i vec0, vec1, vec2, vec3, vec4, vec5, vec6, vec7, vec8, vec9;
__m128i dst0, dst1, dst2, dst3, dst4;
__m128i dst0_r, dst0_l, dst1_r, dst1_l;
__m128i dst10_r, dst32_r, dst21_r, dst43_r;
__m128i dst10_l, dst32_l, dst21_l, dst43_l;
src0_ptr -= (src_stride + 1);
DUP2_ARG2(__lsx_vldrepl_h, filter_x, 0, filter_x, 2, filt0, filt1);
filter_vec = __lsx_vld(filter_y, 0);
filter_vec = __lsx_vsllwil_h_b(filter_vec, 0);
DUP2_ARG2(__lsx_vreplvei_w, filter_vec, 0, filter_vec, 1, filt_h0, filt_h1);
mask1 = __lsx_vaddi_bu(mask0, 2);
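/* 4-tap horizontal + 4-tap vertical (hv) filtering, 8 columns x 2 rows: five
 * source rows are filtered horizontally into 16-bit intermediates, the
 * vertical taps are applied in 32-bit precision (>> 6), and the two results
 * are combined with the reference rows, clipped and narrowed before storing. */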
src0 = __lsx_vld(src0_ptr, 0);
DUP4_ARG2(__lsx_vldx, src0_ptr, src_stride, src0_ptr, src_stride_2x,
src0_ptr, src_stride_3x, src0_ptr, src_stride_4x,
DUP2_ARG2(__lsx_vld, src1_ptr, 0, src1_ptr + src2_stride, 0, in0, in1);
DUP2_ARG3(__lsx_vshuf_b, src3, src3, mask0, src3, src3, mask1, vec6, vec7);
DUP2_ARG3(__lsx_vshuf_b, src4, src4, mask0, src4, src4, mask1, vec8, vec9);
DUP4_ARG2(__lsx_vdp2_h_bu_b, vec0, filt0, vec2, filt0, vec4, filt0, vec6,
filt0, dst0, dst1, dst2, dst3);
dst4 = __lsx_vdp2_h_bu_b(vec8, filt0);
DUP4_ARG3(__lsx_vdp2add_h_bu_b, dst0, vec1, filt1, dst1, vec3, filt1, dst2,
vec5, filt1, dst3, vec7, filt1, dst0, dst1, dst2, dst3);
dst4 = __lsx_vdp2add_h_bu_b(dst4, vec9, filt1);
DUP2_ARG2(__lsx_vilvl_h, dst1, dst0, dst2, dst1, dst10_r, dst21_r);
DUP2_ARG2(__lsx_vilvh_h, dst1, dst0, dst2, dst1, dst10_l, dst21_l);
DUP2_ARG2(__lsx_vilvl_h, dst3, dst2, dst4, dst3, dst32_r, dst43_r);
DUP2_ARG2(__lsx_vilvh_h, dst3, dst2, dst4, dst3, dst32_l, dst43_l);
DUP4_ARG2(__lsx_vdp2_w_h, dst10_r, filt_h0, dst10_l, filt_h0, dst21_r,
filt_h0, dst21_l, filt_h0, dst0_r, dst0_l, dst1_r, dst1_l);
DUP4_ARG3(__lsx_vdp2add_w_h, dst0_r, dst32_r, filt_h1, dst0_l, dst32_l,
filt_h1, dst1_r, dst43_r, filt_h1, dst1_l, dst43_l, filt_h1,
dst0_r, dst0_l, dst1_r, dst1_l);
DUP4_ARG2(__lsx_vsrai_w, dst0_r, 6, dst0_l, 6, dst1_r, 6, dst1_l, 6,
dst0_r, dst0_l, dst1_r, dst1_l);
DUP2_ARG2(__lsx_vpickev_h, dst0_l, dst0_r, dst1_l, dst1_r, tmp0, tmp1);
DUP2_ARG2(__lsx_vsadd_h, in0, tmp0, in1, tmp1, tmp0, tmp1);
DUP2_ARG2(__lsx_vmaxi_h, tmp0, 0, tmp1, 0, tmp0, tmp1);
out = __lsx_vssrlrni_bu_h(tmp1, tmp0, 7);
__lsx_vstelm_d(out, dst, 0, 0);
__lsx_vstelm_d(out, dst + dst_stride, 0, 1);
const int16_t *src1_ptr, int32_t src2_stride,
const int8_t *filter_x, const int8_t *filter_y,
int32_t src_stride_2x = (src_stride << 1);
int32_t dst_stride_2x = (dst_stride << 1);
int32_t src_stride_4x = (src_stride << 2);
int32_t src2_stride_x = (src2_stride << 1);
int32_t src2_stride_2x = (src2_stride << 2);
int32_t src_stride_3x = src_stride_2x + src_stride;
int32_t dst_stride_3x = dst_stride_2x + dst_stride;
int32_t src2_stride_3x = src2_stride_2x + src2_stride_x;
__m128i src0, src1, src2, src3, src4, src5, src6, mask0, mask1;
__m128i vec0, vec1, vec2, vec3, vec4, vec5, vec6, vec7;
__m128i filt0, filt1, filt_h0, filt_h1, filter_vec;
__m128i dst0, dst1, dst2, dst3, dst4, dst5, dst6, tmp0, tmp1, tmp2, tmp3;
__m128i in0, in1, in2, in3;
__m128i dst0_r, dst0_l, dst1_r, dst1_l, dst2_r, dst2_l, dst3_r, dst3_l;
__m128i dst10_r, dst32_r, dst54_r, dst21_r, dst43_r, dst65_r;
__m128i dst10_l, dst32_l, dst54_l, dst21_l, dst43_l, dst65_l;
src0_ptr -= (src_stride + 1);
DUP2_ARG2(__lsx_vldrepl_h, filter_x, 0, filter_x, 2, filt0, filt1);
filter_vec = __lsx_vld(filter_y, 0);
filter_vec = __lsx_vsllwil_h_b(filter_vec, 0);
DUP2_ARG2(__lsx_vreplvei_w, filter_vec, 0, filter_vec, 1, filt_h0, filt_h1);
mask1 = __lsx_vaddi_bu(mask0, 2);
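/* 4-tap hv filtering in 8-pixel-wide columns (width8mult counts the columns):
 * the first three horizontally filtered rows seed the vertical filter, four
 * more rows are filtered per column pass, and the four combined output rows
 * are stored as doublewords. */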
for (cnt = width8mult; cnt--;) {
src0 = __lsx_vld(src0_ptr, 0);
DUP2_ARG2(__lsx_vldx, src0_ptr, src_stride, src0_ptr, src_stride_2x,
src3 = __lsx_vldx(src0_ptr, src_stride_3x);
src0_ptr += src_stride_4x;
src4 = __lsx_vld(src0_ptr, 0);
DUP2_ARG2(__lsx_vldx, src0_ptr, src_stride, src0_ptr, src_stride_2x,
src0_ptr += (8 - src_stride_4x);
in0 = __lsx_vld(src1_ptr, 0);
DUP2_ARG2(__lsx_vldx, src1_ptr, src2_stride_x, src1_ptr,
src2_stride_2x, in1, in2);
in3 = __lsx_vldx(src1_ptr, src2_stride_3x);
DUP2_ARG2(__lsx_vdp2_h_bu_b, vec0, filt0, vec2, filt0, dst0, dst1);
dst2 = __lsx_vdp2_h_bu_b(vec4, filt0);
DUP2_ARG3(__lsx_vdp2add_h_bu_b, dst0, vec1, filt1, dst1, vec3, filt1,
dst2 = __lsx_vdp2add_h_bu_b(dst2, vec5, filt1);
DUP2_ARG2(__lsx_vilvl_h, dst1, dst0, dst2, dst1, dst10_r, dst21_r);
DUP2_ARG2(__lsx_vilvh_h, dst1, dst0, dst2, dst1, dst10_l, dst21_l);
DUP2_ARG3(__lsx_vshuf_b, src3, src3, mask0, src3, src3, mask1,
DUP2_ARG3(__lsx_vshuf_b, src4, src4, mask0, src4, src4, mask1,
DUP2_ARG3(__lsx_vshuf_b, src5, src5, mask0, src5, src5, mask1,
DUP2_ARG3(__lsx_vshuf_b, src6, src6, mask0, src6, src6, mask1,
DUP4_ARG2(__lsx_vdp2_h_bu_b, vec0, filt0, vec2, filt0, vec4, filt0,
vec6, filt0, dst3, dst4, dst5, dst6);
DUP4_ARG3(__lsx_vdp2add_h_bu_b, dst3, vec1, filt1, dst4, vec3, filt1,
dst5, vec5, filt1, dst6, vec7, filt1, dst3, dst4, dst5, dst6);
DUP2_ARG2(__lsx_vilvl_h, dst3, dst2, dst4, dst3, dst32_r, dst43_r);
DUP2_ARG2(__lsx_vilvh_h, dst3, dst2, dst4, dst3, dst32_l, dst43_l);
DUP2_ARG2(__lsx_vilvl_h, dst5, dst4, dst6, dst5, dst54_r, dst65_r);
DUP2_ARG2(__lsx_vilvh_h, dst5, dst4, dst6, dst5, dst54_l, dst65_l);
DUP4_ARG2(__lsx_vdp2_w_h, dst10_r, filt_h0, dst10_l, filt_h0, dst21_r,
filt_h0, dst21_l, filt_h0, dst0_r, dst0_l, dst1_r, dst1_l);
DUP4_ARG2(__lsx_vdp2_w_h, dst32_r, filt_h0, dst32_l, filt_h0, dst43_r,
filt_h0, dst43_l, filt_h0, dst2_r, dst2_l, dst3_r, dst3_l);
DUP4_ARG3(__lsx_vdp2add_w_h, dst0_r, dst32_r, filt_h1, dst0_l, dst32_l,
filt_h1, dst1_r, dst43_r, filt_h1, dst1_l, dst43_l, filt_h1,
dst0_r, dst0_l, dst1_r, dst1_l);
DUP4_ARG3(__lsx_vdp2add_w_h, dst2_r, dst54_r, filt_h1, dst2_l, dst54_l,
filt_h1, dst3_r, dst65_r, filt_h1, dst3_l, dst65_l, filt_h1,
dst2_r, dst2_l, dst3_r, dst3_l);
DUP4_ARG2(__lsx_vsrai_w, dst0_r, 6, dst0_l, 6, dst1_r, 6, dst1_l, 6,
dst0_r, dst0_l, dst1_r, dst1_l);
DUP4_ARG2(__lsx_vsrai_w, dst2_r, 6, dst2_l, 6, dst3_r, 6, dst3_l, 6,
dst2_r, dst2_l, dst3_r, dst3_l);
DUP4_ARG2(__lsx_vpickev_h, dst0_l, dst0_r, dst1_l, dst1_r, dst2_l,
dst2_r, dst3_l, dst3_r, tmp0, tmp1, tmp2, tmp3);
DUP4_ARG2(__lsx_vsadd_h, in0, tmp0, in1, tmp1, in2, tmp2, in3, tmp3,
tmp0, tmp1, tmp2, tmp3);
DUP4_ARG2(__lsx_vmaxi_h, tmp0, 0, tmp1, 0, tmp2, 0, tmp3, 0,
tmp0, tmp1, tmp2, tmp3);
DUP2_ARG3(__lsx_vssrlrni_bu_h, tmp1, tmp0, 7, tmp3, tmp2, 7, out0, out1);
__lsx_vstelm_d(out0, dst, 0, 0);
__lsx_vstelm_d(out0, dst + dst_stride, 0, 1);
__lsx_vstelm_d(out1, dst + dst_stride_2x, 0, 0);
__lsx_vstelm_d(out1, dst + dst_stride_3x, 0, 1);
const int8_t *filter_x, const int8_t *filter_y)
int32_t src_stride_2x = (src_stride << 1);
int32_t dst_stride_2x = (dst_stride << 1);
int32_t src_stride_4x = (src_stride << 2);
int32_t dst_stride_4x = (dst_stride << 2);
int32_t src2_stride_x = (src2_stride << 1);
int32_t src2_stride_2x = (src2_stride << 2);
int32_t src_stride_3x = src_stride_2x + src_stride;
int32_t dst_stride_3x = dst_stride_2x + dst_stride;
int32_t src2_stride_3x = src2_stride_2x + src2_stride_x;
__m128i out0, out1, out2;
__m128i src0, src1, src2, src3, src4, src5, src6, src7, src8;
__m128i in0, in1, in2, in3, in4, in5;
__m128i filt0, filt1;
__m128i filt_h0, filt_h1;
__m128i mask1, filter_vec;
__m128i vec0, vec1, vec2, vec3, vec4, vec5, vec6, vec7, vec8, vec9;
__m128i vec10, vec11, vec12, vec13, vec14, vec15, vec16, vec17;
__m128i tmp0, tmp1, tmp2, tmp3, tmp4, tmp5;
__m128i dst0, dst1, dst2, dst3, dst4, dst5, dst6, dst7, dst8;
__m128i dst0_r, dst0_l, dst1_r, dst1_l, dst2_r, dst2_l, dst3_r, dst3_l;
__m128i dst4_r, dst4_l, dst5_r, dst5_l;
__m128i dst10_r, dst32_r, dst10_l, dst32_l;
__m128i dst21_r, dst43_r, dst21_l, dst43_l;
__m128i dst54_r, dst54_l, dst65_r, dst65_l;
__m128i dst76_r, dst76_l, dst87_r, dst87_l;
src0_ptr -= (src_stride + 1);
DUP2_ARG2(__lsx_vldrepl_h, filter_x, 0, filter_x, 2, filt0, filt1);
filter_vec = __lsx_vld(filter_y, 0);
filter_vec = __lsx_vsllwil_h_b(filter_vec, 0);
DUP2_ARG2(__lsx_vreplvei_w, filter_vec, 0, filter_vec, 1, filt_h0, filt_h1);
mask1 = __lsx_vaddi_bu(mask0, 2);
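/* 4-tap hv filtering, 8 columns x 6 rows in a single pass: nine source rows
 * are filtered horizontally (dst0..dst8), the vertical taps then produce six
 * 32-bit row results that are narrowed, combined with in0..in5, clipped and
 * stored as doublewords. */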
src0 = __lsx_vld(src0_ptr, 0);
DUP2_ARG2(__lsx_vldx, src0_ptr, src_stride, src0_ptr, src_stride_2x,
src3 = __lsx_vldx(src0_ptr, src_stride_3x);
src0_ptr += src_stride_4x;
src4 = __lsx_vld(src0_ptr, 0);
DUP4_ARG2(__lsx_vldx, src0_ptr, src_stride, src0_ptr, src_stride_2x,
src0_ptr, src_stride_3x, src0_ptr, src_stride_4x,
src5, src6, src7, src8);
in0 = __lsx_vld(src1_ptr, 0);
DUP2_ARG2(__lsx_vldx, src1_ptr, src2_stride_x, src1_ptr, src2_stride_2x,
in3 = __lsx_vldx(src1_ptr, src2_stride_3x);
src1_ptr += src2_stride_2x;
in4 = __lsx_vld(src1_ptr, 0);
in5 = __lsx_vldx(src1_ptr, src2_stride_x);
DUP2_ARG3(__lsx_vshuf_b, src3, src3, mask0, src3, src3, mask1, vec6, vec7);
DUP2_ARG3(__lsx_vshuf_b, src4, src4, mask0, src4, src4, mask1, vec8, vec9);
DUP2_ARG3(__lsx_vshuf_b, src5, src5, mask0, src5, src5, mask1, vec10, vec11);
DUP2_ARG3(__lsx_vshuf_b, src6, src6, mask0, src6, src6, mask1, vec12, vec13);
DUP2_ARG3(__lsx_vshuf_b, src7, src7, mask0, src7, src7, mask1, vec14, vec15);
DUP2_ARG3(__lsx_vshuf_b, src8, src8, mask0, src8, src8, mask1, vec16, vec17);
DUP4_ARG2(__lsx_vdp2_h_bu_b, vec0, filt0, vec2, filt0, vec4, filt0, vec6,
filt0, dst0, dst1, dst2, dst3);
dst4 = __lsx_vdp2_h_bu_b(vec8, filt0);
DUP4_ARG2(__lsx_vdp2_h_bu_b, vec10, filt0, vec12, filt0, vec14, filt0,
vec16, filt0, dst5, dst6, dst7, dst8);
DUP4_ARG3(__lsx_vdp2add_h_bu_b, dst0, vec1, filt1, dst1, vec3, filt1, dst2,
vec5, filt1, dst3, vec7, filt1, dst0, dst1, dst2, dst3);
dst4 = __lsx_vdp2add_h_bu_b(dst4, vec9, filt1);
DUP4_ARG3(__lsx_vdp2add_h_bu_b, dst5, vec11, filt1, dst6, vec13, filt1,
dst7, vec15, filt1, dst8, vec17, filt1, dst5, dst6, dst7, dst8);
DUP4_ARG2(__lsx_vilvl_h, dst1, dst0, dst2, dst1, dst3, dst2, dst4, dst3,
dst10_r, dst21_r, dst32_r, dst43_r);
DUP4_ARG2(__lsx_vilvh_h, dst1, dst0, dst2, dst1, dst3, dst2, dst4, dst3,
dst10_l, dst21_l, dst32_l, dst43_l);
DUP4_ARG2(__lsx_vilvl_h, dst5, dst4, dst6, dst5, dst7, dst6, dst8, dst7,
dst54_r, dst65_r, dst76_r, dst87_r);
DUP4_ARG2(__lsx_vilvh_h, dst5, dst4, dst6, dst5, dst7, dst6, dst8, dst7,
dst54_l, dst65_l, dst76_l, dst87_l);
DUP4_ARG2(__lsx_vdp2_w_h, dst10_r, filt_h0, dst10_l, filt_h0, dst21_r,
filt_h0, dst21_l, filt_h0, dst0_r, dst0_l, dst1_r, dst1_l);
DUP4_ARG2(__lsx_vdp2_w_h, dst32_r, filt_h0, dst32_l, filt_h0, dst43_r,
filt_h0, dst43_l, filt_h0, dst2_r, dst2_l, dst3_r, dst3_l);
DUP4_ARG2(__lsx_vdp2_w_h, dst54_r, filt_h0, dst54_l, filt_h0, dst65_r,
filt_h0, dst65_l, filt_h0, dst4_r, dst4_l, dst5_r, dst5_l);
DUP4_ARG3(__lsx_vdp2add_w_h, dst0_r, dst32_r, filt_h1, dst0_l, dst32_l,
filt_h1, dst1_r, dst43_r, filt_h1, dst1_l, dst43_l, filt_h1,
dst0_r, dst0_l, dst1_r, dst1_l);
DUP4_ARG3(__lsx_vdp2add_w_h, dst2_r, dst54_r, filt_h1, dst2_l, dst54_l,
filt_h1, dst3_r, dst65_r, filt_h1, dst3_l, dst65_l, filt_h1,
dst2_r, dst2_l, dst3_r, dst3_l);
DUP4_ARG3(__lsx_vdp2add_w_h, dst4_r, dst76_r, filt_h1, dst4_l, dst76_l,
filt_h1, dst5_r, dst87_r, filt_h1, dst5_l, dst87_l, filt_h1,
dst4_r, dst4_l, dst5_r, dst5_l);
DUP4_ARG2(__lsx_vsrai_w, dst0_r, 6, dst0_l, 6, dst1_r, 6, dst1_l, 6,
dst0_r, dst0_l, dst1_r, dst1_l);
DUP4_ARG2(__lsx_vsrai_w, dst2_r, 6, dst2_l, 6, dst3_r, 6, dst3_l, 6,
dst2_r, dst2_l, dst3_r, dst3_l);
DUP4_ARG2(__lsx_vsrai_w, dst4_r, 6, dst4_l, 6, dst5_r, 6, dst5_l, 6,
dst4_r, dst4_l, dst5_r, dst5_l);
DUP4_ARG2(__lsx_vpickev_h, dst0_l, dst0_r, dst1_l, dst1_r, dst2_l, dst2_r,
dst3_l, dst3_r, tmp0, tmp1, tmp2, tmp3);
DUP2_ARG2(__lsx_vpickev_h, dst4_l, dst4_r, dst5_l, dst5_r, tmp4, tmp5);
DUP4_ARG2(__lsx_vsadd_h, in0, tmp0, in1, tmp1, in2, tmp2, in3, tmp3,
tmp0, tmp1, tmp2, tmp3);
DUP2_ARG2(__lsx_vsadd_h, in4, tmp4, in5, tmp5, tmp4, tmp5);
DUP4_ARG2(__lsx_vmaxi_h, tmp0, 0, tmp1, 0, tmp2, 0, tmp3, 0,
tmp0, tmp1, tmp2, tmp3);
DUP2_ARG2(__lsx_vmaxi_h, tmp4, 0, tmp5, 0, tmp4, tmp5);
DUP2_ARG3(__lsx_vssrlrni_bu_h, tmp1, tmp0, 7, tmp3, tmp2, 7, out0, out1);
out2 = __lsx_vssrlrni_bu_h(tmp5, tmp4, 7);
__lsx_vstelm_d(out0, dst, 0, 0);
__lsx_vstelm_d(out0, dst + dst_stride, 0, 1);
__lsx_vstelm_d(out1, dst + dst_stride_2x, 0, 0);
__lsx_vstelm_d(out1, dst + dst_stride_3x, 0, 1);
dst += dst_stride_4x;
__lsx_vstelm_d(out2, dst, 0, 0);
__lsx_vstelm_d(out2, dst + dst_stride, 0, 1);
1890 const int16_t *src1_ptr, int32_t src2_stride,
1892 const int8_t *filter_x, const int8_t *filter_y,
1895 uint32_t loop_cnt, cnt;
1896 const uint8_t *src0_ptr_tmp;
1897 const int16_t *src1_ptr_tmp;
1899 const int32_t src_stride_2x = (src_stride << 1);
1900 const int32_t dst_stride_2x = (dst_stride << 1);
1901 const int32_t src_stride_4x = (src_stride << 2);
1902 const int32_t dst_stride_4x = (dst_stride << 2);
1903 const int32_t src2_stride_x = (src2_stride << 1);
1904 const int32_t src2_stride_2x = (src2_stride << 2);
1905 const int32_t src_stride_3x = src_stride_2x + src_stride;
1906 const int32_t dst_stride_3x = dst_stride_2x + dst_stride;
1907 const int32_t src2_stride_3x = src2_stride_2x + src2_stride_x;
1910 __m128i in0, in1, in2, in3;
1911 __m128i filt0, filt1;
1912 __m128i filt_h0, filt_h1;
1914 __m128i mask1, filter_vec;
1915 __m128i vec0, vec1, vec2, vec3, vec4, vec5, vec6, vec7;
1916 __m128i dst0, dst1, dst2, dst3, dst4, dst5;
1917 __m128i dst0_r, dst0_l, dst1_r, dst1_l, dst2_r, dst2_l, dst3_r, dst3_l;
1918 __m128i tmp0, tmp1, tmp2, tmp3;
1919 __m128i dst10_r, dst32_r, dst21_r, dst43_r;
1920 __m128i dst10_l, dst32_l, dst21_l, dst43_l;
1921 __m128i dst54_r, dst54_l, dst65_r, dst65_l, dst6;
1923 src0_ptr -= (src_stride + 1);
1925 DUP2_ARG2(__lsx_vldrepl_h, filter_x, 0, filter_x, 2, filt0, filt1);
1927 filter_vec = __lsx_vld(filter_y, 0);
1928 filter_vec = __lsx_vsllwil_h_b(filter_vec, 0);
1930 DUP2_ARG2(__lsx_vreplvei_w, filter_vec, 0, filter_vec, 1, filt_h0, filt_h1);
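/* filt0/filt1 are the two horizontal tap pairs from filter_x; filter_y is widened to 16 bits and split into the vertical tap pairs filt_h0/filt_h1 */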
1932 mask1 = __lsx_vaddi_bu(mask0, 2);
1934 for (cnt = width >> 3; cnt--;) {
1935 src0_ptr_tmp = src0_ptr;
1937 src1_ptr_tmp = src1_ptr;
1939 src0 = __lsx_vld(src0_ptr_tmp, 0);
1940 DUP2_ARG2(__lsx_vldx, src0_ptr_tmp, src_stride, src0_ptr_tmp,
1941 src_stride_2x, src1, src2);
1942 src0_ptr_tmp += src_stride_3x;
1951 DUP2_ARG2(__lsx_vdp2_h_bu_b, vec0, filt0, vec2, filt0, dst0, dst1);
1952 dst2 = __lsx_vdp2_h_bu_b(vec4, filt0);
1953 DUP2_ARG3(__lsx_vdp2add_h_bu_b, dst0, vec1, filt1, dst1, vec3, filt1,
1954 dst0, dst1);
1955 dst2 = __lsx_vdp2add_h_bu_b(dst2, vec5, filt1);
1957 DUP2_ARG2(__lsx_vilvl_h, dst1, dst0, dst2, dst1, dst10_r, dst21_r);
1958 DUP2_ARG2(__lsx_vilvh_h, dst1, dst0, dst2, dst1, dst10_l, dst21_l);
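/* the first three rows are horizontally filtered once, outside the row loop, to prime the vertical filter context (dst10_*, dst21_*) */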
1960 for (loop_cnt = height >> 2; loop_cnt--;) {
1961 src3 = __lsx_vld(src0_ptr_tmp, 0);
1962 DUP2_ARG2(__lsx_vldx, src0_ptr_tmp, src_stride, src0_ptr_tmp,
1963 src_stride_2x, src4, src5);
1964 src6 = __lsx_vldx(src0_ptr_tmp, src_stride_3x);
1965 src0_ptr_tmp += src_stride_4x;
1966 in0 = __lsx_vld(src1_ptr_tmp, 0);
1967 DUP2_ARG2(__lsx_vldx, src1_ptr_tmp, src2_stride_x, src1_ptr_tmp,
1968 src2_stride_2x, in1, in2);
1969 in3 = __lsx_vldx(src1_ptr_tmp, src2_stride_3x);
1970 src1_ptr_tmp += src2_stride_2x;
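/* in0..in3: four rows of the 16-bit inter-prediction buffer (src1_ptr) for the current 8-column block */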
1972 DUP4_ARG3(__lsx_vshuf_b, src3, src3, mask0, src3, src3, mask1, src4,
1973 src4, mask0, src4, src4, mask1, vec0, vec1, vec2, vec3);
1974 DUP4_ARG3(__lsx_vshuf_b, src5, src5, mask0, src5, src5, mask1, src6,
1975 src6, mask0, src6, src6, mask1, vec4, vec5, vec6, vec7);
1977 DUP4_ARG2(__lsx_vdp2_h_bu_b, vec0, filt0, vec2, filt0, vec4, filt0,
1978 vec6, filt0, dst3, dst4, dst5, dst6);
1979 DUP4_ARG3(__lsx_vdp2add_h_bu_b, dst3, vec1, filt1, dst4, vec3,
1980 filt1, dst5, vec5, filt1, dst6, vec7, filt1,
1981 dst3, dst4, dst5, dst6);
1983 DUP2_ARG2(__lsx_vilvl_h, dst3, dst2, dst4, dst3, dst32_r, dst43_r);
1984 DUP2_ARG2(__lsx_vilvh_h, dst3, dst2, dst4, dst3, dst32_l, dst43_l);
1985 DUP2_ARG2(__lsx_vilvl_h, dst5, dst4, dst6, dst5, dst54_r, dst65_r);
1986 DUP2_ARG2(__lsx_vilvh_h, dst5, dst4, dst6, dst5, dst54_l, dst65_l);
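/* vertical pass for four output rows, combining the new rows with the dst10/dst21 context carried over from the prologue or the previous iteration */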
1988 DUP4_ARG2(__lsx_vdp2_w_h, dst10_r, filt_h0, dst10_l, filt_h0, dst21_r,
1989 filt_h0, dst21_l, filt_h0, dst0_r, dst0_l, dst1_r, dst1_l);
1990 DUP4_ARG2(__lsx_vdp2_w_h, dst32_r, filt_h0, dst32_l, filt_h0, dst43_r,
1991 filt_h0, dst43_l, filt_h0, dst2_r, dst2_l, dst3_r, dst3_l);
1992 DUP4_ARG3(__lsx_vdp2add_w_h, dst0_r, dst32_r, filt_h1, dst0_l,
1993 dst32_l, filt_h1, dst1_r, dst43_r, filt_h1, dst1_l,
1994 dst43_l, filt_h1, dst0_r, dst0_l, dst1_r, dst1_l);
1995 DUP4_ARG3(__lsx_vdp2add_w_h, dst2_r, dst54_r, filt_h1, dst2_l,
1996 dst54_l, filt_h1, dst3_r, dst65_r, filt_h1, dst3_l,
1997 dst65_l, filt_h1, dst2_r, dst2_l, dst3_r, dst3_l);
1999 DUP4_ARG2(__lsx_vsrai_w, dst0_r, 6, dst0_l, 6, dst1_r, 6, dst1_l, 6,
2000 dst0_r, dst0_l, dst1_r, dst1_l);
2001 DUP4_ARG2(__lsx_vsrai_w, dst2_r, 6, dst2_l, 6, dst3_r, 6, dst3_l, 6,
2002 dst2_r, dst2_l, dst3_r, dst3_l);
2003 DUP4_ARG2(__lsx_vpickev_h, dst0_l, dst0_r, dst1_l, dst1_r, dst2_l,
2004 dst2_r, dst3_l, dst3_r, tmp0, tmp1, tmp2, tmp3);
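/* add the bi-pred reference rows, clamp at zero, and narrow to 8 bits with a rounding shift by 7 */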
2005 DUP4_ARG2(__lsx_vsadd_h, in0, tmp0, in1, tmp1, in2, tmp2, in3, tmp3,
2006 tmp0, tmp1, tmp2, tmp3);
2007 DUP4_ARG2(__lsx_vmaxi_h, tmp0, 0, tmp1, 0, tmp2, 0, tmp3, 0, tmp0,
2008 tmp1, tmp2, tmp3);
2009 DUP2_ARG3(__lsx_vssrlrni_bu_h, tmp1, tmp0, 7, tmp3, tmp2, 7, out0, out1);
2010 __lsx_vstelm_d(out0, dst_tmp, 0, 0);
2011 __lsx_vstelm_d(out0, dst_tmp + dst_stride, 0, 1);
2012 __lsx_vstelm_d(out1, dst_tmp + dst_stride_2x, 0, 0);
2013 __lsx_vstelm_d(out1, dst_tmp + dst_stride_3x, 0, 1);
2014 dst_tmp += dst_stride_4x;
2030 const int16_t *src1_ptr, int32_t src2_stride,
2032 const int8_t *filter_x, const int8_t *filter_y,
2037 dst, dst_stride, filter_x, filter_y);
2038 } else if (4 == height) {
2040 dst, dst_stride, filter_x, filter_y, 1);
2041 } else if (6 == height) {
2043 dst, dst_stride, filter_x, filter_y);
2046 dst, dst_stride, filter_x, filter_y, height, 8);
2051 const int16_t *src1_ptr, int32_t src2_stride,
2053 const int8_t *filter_x, const int8_t *filter_y,
2058 dst, dst_stride, filter_x, filter_y, 2);
2061 dst, dst_stride, filter_x, filter_y, height, 16);
2066 const int16_t *src1_ptr, int32_t src2_stride,
2068 const int8_t *filter_x, const int8_t *filter_y,
2072 dst, dst_stride, filter_x, filter_y, height, 24);
2076 const int16_t *src1_ptr, int32_t src2_stride,
2078 const int8_t *filter_x, const int8_t *filter_y,
2082 dst, dst_stride, filter_x, filter_y, height, 32);
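/* The macros below stamp out the exported ff_hevc_put_hevc_bi_* entry points: BI_MC_COPY wraps the bi-pred pixel-copy kernels, BI_MC the one-dimensional filter kernels, and BI_MC_HV the 2-D (hv) kernels; the second reference is always read from a MAX_PB_SIZE-strided int16_t buffer. */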
2085 #define BI_MC_COPY(WIDTH) \
2086 void ff_hevc_put_hevc_bi_pel_pixels##WIDTH##_8_lsx(uint8_t *dst, \
2087 ptrdiff_t dst_stride, \
2088 const uint8_t *src, \
2089 ptrdiff_t src_stride, \
2090 const int16_t *src_16bit, \
2096 hevc_bi_copy_##WIDTH##w_lsx(src, src_stride, src_16bit, MAX_PB_SIZE, \
2097 dst, dst_stride, height); \
2112 #define BI_MC(PEL, DIR, WIDTH, TAP, DIR1, FILT_DIR) \
2113 void ff_hevc_put_hevc_bi_##PEL##_##DIR##WIDTH##_8_lsx(uint8_t *dst, \
2114 ptrdiff_t dst_stride, \
2115 const uint8_t *src, \
2116 ptrdiff_t src_stride, \
2117 const int16_t *src_16bit, \
2123 const int8_t *filter = ff_hevc_##PEL##_filters[FILT_DIR]; \
2125 hevc_##DIR1##_##TAP##t_##WIDTH##w_lsx(src, src_stride, src_16bit, \
2126 MAX_PB_SIZE, dst, dst_stride, \
2137 BI_MC(qpel, v, 16, 8, vt, my);
2138 BI_MC(qpel, v, 24, 8, vt, my);
2139 BI_MC(qpel, v, 32, 8, vt, my);
2140 BI_MC(qpel, v, 48, 8, vt, my);
2141 BI_MC(qpel, v, 64, 8, vt, my);
2146 BI_MC(epel, v, 12, 4, vt, my);
2147 BI_MC(epel, v, 16, 4, vt, my);
2148 BI_MC(epel, v, 24, 4, vt, my);
2149 BI_MC(epel, v, 32, 4, vt, my);
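/* BI_MC_HV: mx/my select the horizontal and vertical filters from ff_hevc_##PEL##_filters and forward them to the hevc_hv_*t_*w_lsx kernels above */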
2153 #define BI_MC_HV(PEL, WIDTH, TAP) \
2154 void ff_hevc_put_hevc_bi_##PEL##_hv##WIDTH##_8_lsx(uint8_t *dst, \
2155 ptrdiff_t dst_stride, \
2156 const uint8_t *src, \
2157 ptrdiff_t src_stride, \
2158 const int16_t *src_16bit, \
2164 const int8_t *filter_x = ff_hevc_##PEL##_filters[mx]; \
2165 const int8_t *filter_y = ff_hevc_##PEL##_filters[my]; \
2167 hevc_hv_##TAP##t_##WIDTH##w_lsx(src, src_stride, src_16bit, \
2168 MAX_PB_SIZE, dst, dst_stride, \
2169 filter_x, filter_y, height); \