/*
 * VP9_LPF_FILTER4_4W - derive filtered p1/p0/q0/q1 byte vectors from the
 * four pixel vectors nearest the edge.
 *
 * Inputs:
 *   p1_in, p0_in, q0_in, q1_in  pixel vectors; each byte is XORed with 0x80
 *                               (sign bit flipped) before the signed
 *                               saturating arithmetic below.
 *   mask_in                     ANDed into the accumulated filter value.
 *   hev_in                      ANDed into the initial (p1 - q1) term.
 *                               It is also ASSIGNED inside the macro (replaced
 *                               by its bitwise complement), so the argument
 *                               must be a modifiable lvalue and does not keep
 *                               its original value after expansion.
 * Outputs:
 *   p1_out, p0_out, q0_out, q1_out  results, XORed with 0x80 again on the
 *                                   way out.
 *
 * Visible steps:
 *   filt  = sat(p1 - q1) & hev_in
 *   filt  = sat(filt + 3 * sat(q0 - p0)) & mask_in   (three saturating adds)
 *   filt1 = sat(filt + 4), filt2 = sat(filt + 3)
 *   q0 -= filt1, p0 += filt2                          (saturating)
 *   filt  = rounding-shift-right(filt1, 1) & ~hev_in
 *   q1 -= filt, p1 += filt                            (saturating)
 *
 * NOTE(review): the embedded line numbers in this listing skip values
 * (e.g. 47 -> 50 -> 53), so statements of the macro may not be shown here,
 * including any opening/closing brace. Confirm against the full file before
 * relying on the step list above.
 */
25 #define VP9_LPF_FILTER4_4W(p1_in, p0_in, q0_in, q1_in, mask_in, hev_in, \
26 p1_out, p0_out, q0_out, q1_out) \
28 v16i8 p1_m, p0_m, q0_m, q1_m, q0_sub_p0, filt, filt1, filt2; \
29 const v16i8 cnst4b = __msa_ldi_b(4); \
30 const v16i8 cnst3b = __msa_ldi_b(3); \
32 p1_m = (v16i8) __msa_xori_b(p1_in, 0x80); \
33 p0_m = (v16i8) __msa_xori_b(p0_in, 0x80); \
34 q0_m = (v16i8) __msa_xori_b(q0_in, 0x80); \
35 q1_m = (v16i8) __msa_xori_b(q1_in, 0x80); \
37 filt = __msa_subs_s_b(p1_m, q1_m); \
39 filt = filt & (v16i8) hev_in; \
41 q0_sub_p0 = __msa_subs_s_b(q0_m, p0_m); \
42 filt = __msa_adds_s_b(filt, q0_sub_p0); \
43 filt = __msa_adds_s_b(filt, q0_sub_p0); \
44 filt = __msa_adds_s_b(filt, q0_sub_p0); \
45 filt = filt & (v16i8) mask_in; \
47 filt1 = __msa_adds_s_b(filt, cnst4b); \
50 filt2 = __msa_adds_s_b(filt, cnst3b); \
53 q0_m = __msa_subs_s_b(q0_m, filt1); \
54 q0_out = __msa_xori_b((v16u8) q0_m, 0x80); \
55 p0_m = __msa_adds_s_b(p0_m, filt2); \
56 p0_out = __msa_xori_b((v16u8) p0_m, 0x80); \
58 filt = __msa_srari_b(filt1, 1); \
59 hev_in = __msa_xori_b((v16u8) hev_in, 0xff); /* overwrites the caller's hev_in with its complement */ \
60 filt = filt & (v16i8) hev_in; \
62 q1_m = __msa_subs_s_b(q1_m, filt); \
63 q1_out = __msa_xori_b((v16u8) q1_m, 0x80); \
64 p1_m = __msa_adds_s_b(p1_m, filt); \
65 p1_out = __msa_xori_b((v16u8) p1_m, 0x80); \
/*
 * VP9_FLAT4 - build a per-byte "flat" mask from the differences of
 * p2/p3 against p0 and q2/q3 against q0.
 *
 * Inputs:
 *   p3_in, p2_in, p0_in, q0_in, q2_in, q3_in  unsigned byte pixel vectors.
 * In/out:
 *   flat_out  READ before it is written: its incoming value is folded into
 *             the running unsigned maximum, so it must already hold a valid
 *             vector when the macro is expanded. On exit it holds the mask.
 *
 * Visible steps:
 *   m        = max(|p2-p0|, |q2-q0|, |p3-p0|, |q3-q0|, incoming flat_out)
 *   flat_out = ~(1 < m) & mask
 * i.e. lanes stay set only where m <= 1 and the corresponding `mask` lane
 * is set.
 *
 * NOTE: `mask` is not a macro parameter; an identifier with that exact name
 * must be in scope at every expansion site.
 *
 * NOTE(review): the embedded line numbers skip values, so lines of this
 * macro (such as braces) may not be visible in this listing.
 */
68 #define VP9_FLAT4(p3_in, p2_in, p0_in, q0_in, q2_in, q3_in, flat_out) \
70 v16u8 tmp, p2_a_sub_p0, q2_a_sub_q0, p3_a_sub_p0, q3_a_sub_q0; \
71 v16u8 zero_in = { 0 }; \
73 tmp = __msa_ori_b(zero_in, 1); /* every byte = 1: the comparison threshold */ \
74 p2_a_sub_p0 = __msa_asub_u_b(p2_in, p0_in); \
75 q2_a_sub_q0 = __msa_asub_u_b(q2_in, q0_in); \
76 p3_a_sub_p0 = __msa_asub_u_b(p3_in, p0_in); \
77 q3_a_sub_q0 = __msa_asub_u_b(q3_in, q0_in); \
79 p2_a_sub_p0 = __msa_max_u_b(p2_a_sub_p0, q2_a_sub_q0); \
80 flat_out = __msa_max_u_b(p2_a_sub_p0, flat_out); /* first use reads the incoming flat_out */ \
81 p3_a_sub_p0 = __msa_max_u_b(p3_a_sub_p0, q3_a_sub_q0); \
82 flat_out = __msa_max_u_b(p3_a_sub_p0, flat_out); \
84 flat_out = (tmp < (v16u8) flat_out); \
85 flat_out = __msa_xori_b(flat_out, 0xff); \
86 flat_out = flat_out & (mask); /* `mask` comes from the expanding scope, not from a parameter */ \
/*
 * VP9_FLAT5 - build a second per-byte flatness mask from the outer pixels
 * p4..p7 (against p0) and q4..q7 (against q0).
 *
 * Inputs:
 *   p7_in..p4_in, p0_in, q0_in, q4_in..q7_in  unsigned byte pixel vectors.
 *   flat_in                                   mask ANDed into the result.
 * Output:
 *   flat2_out  written (not read) first, then refined:
 *     m         = max over i in 4..7 of |p_i - p0| and |q_i - q0|
 *     flat2_out = ~(1 < m) & flat_in
 *   i.e. lanes stay set only where m <= 1 and flat_in is set.
 *
 * NOTE(review): the embedded line numbers skip values, so lines of this
 * macro (such as braces) may not be visible in this listing.
 */
89 #define VP9_FLAT5(p7_in, p6_in, p5_in, p4_in, p0_in, q0_in, q4_in, \
90 q5_in, q6_in, q7_in, flat_in, flat2_out) \
92 v16u8 tmp, zero_in = { 0 }; \
93 v16u8 p4_a_sub_p0, q4_a_sub_q0, p5_a_sub_p0, q5_a_sub_q0; \
94 v16u8 p6_a_sub_p0, q6_a_sub_q0, p7_a_sub_p0, q7_a_sub_q0; \
96 tmp = __msa_ori_b(zero_in, 1); /* every byte = 1: the comparison threshold */ \
97 p4_a_sub_p0 = __msa_asub_u_b(p4_in, p0_in); \
98 q4_a_sub_q0 = __msa_asub_u_b(q4_in, q0_in); \
99 p5_a_sub_p0 = __msa_asub_u_b(p5_in, p0_in); \
100 q5_a_sub_q0 = __msa_asub_u_b(q5_in, q0_in); \
101 p6_a_sub_p0 = __msa_asub_u_b(p6_in, p0_in); \
102 q6_a_sub_q0 = __msa_asub_u_b(q6_in, q0_in); \
103 p7_a_sub_p0 = __msa_asub_u_b(p7_in, p0_in); \
104 q7_a_sub_q0 = __msa_asub_u_b(q7_in, q0_in); \
106 p4_a_sub_p0 = __msa_max_u_b(p4_a_sub_p0, q4_a_sub_q0); \
107 flat2_out = __msa_max_u_b(p5_a_sub_p0, q5_a_sub_q0); \
108 flat2_out = __msa_max_u_b(p4_a_sub_p0, flat2_out); \
109 p6_a_sub_p0 = __msa_max_u_b(p6_a_sub_p0, q6_a_sub_q0); \
110 flat2_out = __msa_max_u_b(p6_a_sub_p0, flat2_out); \
111 p7_a_sub_p0 = __msa_max_u_b(p7_a_sub_p0, q7_a_sub_q0); \
112 flat2_out = __msa_max_u_b(p7_a_sub_p0, flat2_out); \
114 flat2_out = (tmp < (v16u8) flat2_out); \
115 flat2_out = __msa_xori_b(flat2_out, 0xff); \
116 flat2_out = flat2_out & flat_in; \
/*
 * VP9_FILTER8 - compute six smoothed outputs (p2..q2) from eight inputs
 * (p3..q3) using 16-bit lane arithmetic and a rounding right shift by 3.
 *
 * Inputs:
 *   p3_in..q3_in  operands added into v8u16 temporaries.
 * Outputs:
 *   p2_filt8_out, p1_filt8_out, p0_filt8_out,
 *   q0_filt8_out, q1_filt8_out, q2_filt8_out  assigned as v8i16.
 *
 * Sums that the visible statements fully determine (each then goes through
 * __msa_srari_h(..., 3)):
 *   p0:  p3 + p2 + p1 + 2*p0 + q0 + q1 + q2
 *   q0:  p2 + p1 + p0 + 2*q0 + q1 + q2 + q3
 *   q1:  p1 + p0 + q0 + 2*q1 + q2 + 2*q3
 *   q2:  p0 + q0 + q1 + 2*q2 + 3*q3
 *
 * NOTE(review): tmp0 is read by the statement numbered 129 but no earlier
 * assignment to it is visible; the embedded numbers jump 126 -> 129, so its
 * initialisation is presumably on a line not shown in this listing. The p2
 * and p1 sums depend on that value and are therefore not spelled out above.
 * Confirm against the full file.
 *
 * Every *_in argument is expanded more than once, so arguments should be
 * side-effect free.
 */
119 #define VP9_FILTER8(p3_in, p2_in, p1_in, p0_in, \
120 q0_in, q1_in, q2_in, q3_in, \
121 p2_filt8_out, p1_filt8_out, p0_filt8_out, \
122 q0_filt8_out, q1_filt8_out, q2_filt8_out) \
124 v8u16 tmp0, tmp1, tmp2; \
126 tmp2 = p2_in + p1_in + p0_in; \
129 tmp0 = tmp0 + tmp2 + q0_in; \
130 tmp1 = tmp0 + p3_in + p2_in; \
131 p2_filt8_out = (v8i16) __msa_srari_h((v8i16) tmp1, 3); \
133 tmp1 = tmp0 + p1_in + q1_in; \
134 p1_filt8_out = (v8i16) __msa_srari_h((v8i16) tmp1, 3); \
136 tmp1 = q2_in + q1_in + q0_in; \
137 tmp2 = tmp2 + tmp1; /* tmp2 = p2 + p1 + p0 + q0 + q1 + q2 */ \
138 tmp0 = tmp2 + (p0_in); \
139 tmp0 = tmp0 + (p3_in); \
140 p0_filt8_out = (v8i16) __msa_srari_h((v8i16) tmp0, 3); \
142 tmp0 = q2_in + q3_in; \
143 tmp0 = p0_in + tmp1 + tmp0; \
144 tmp1 = q3_in + q3_in; \
145 tmp1 = tmp1 + tmp0; \
146 q2_filt8_out = (v8i16) __msa_srari_h((v8i16) tmp1, 3); \
148 tmp0 = tmp2 + q3_in; \
149 tmp1 = tmp0 + q0_in; \
150 q0_filt8_out = (v8i16) __msa_srari_h((v8i16) tmp1, 3); \
152 tmp1 = tmp0 - p2_in; \
153 tmp0 = q1_in + q3_in; \
154 tmp1 = tmp0 + tmp1; \
155 q1_filt8_out = (v8i16) __msa_srari_h((v8i16) tmp1, 3); \
/*
 * LPF_MASK_HEV - derive three per-byte vectors (hev_out, mask_out,
 * flat_out) from eight pixel vectors and three threshold vectors.
 *
 * Inputs:
 *   p3_in..q3_in  unsigned byte pixel vectors.
 *   limit_in, b_limit_in, thresh_in  threshold vectors compared lane-wise.
 * Outputs:
 *   flat_out  max(|p1-p0|, |q1-q0|)  (a magnitude, not a boolean mask).
 *   hev_out   thresh_in < flat_out, lane-wise.
 *   mask_out  built in stages:
 *               e        = sat(2*|p0-q0|) + (|p1-q1| >> 1)   (saturating add)
 *               mask_out = (b_limit_in < e)
 *               mask_out = max(mask_out, flat_out, |p3-p2|, |p2-p1|,
 *                              |q2-q1|, |q3-q2|)
 *               mask_out = ~(limit_in < mask_out)
 *
 * NOTE(review): the embedded line numbers skip values, so lines of this
 * macro (such as braces) may not be visible in this listing.
 */
158 #define LPF_MASK_HEV(p3_in, p2_in, p1_in, p0_in, \
159 q0_in, q1_in, q2_in, q3_in, \
160 limit_in, b_limit_in, thresh_in, \
161 hev_out, mask_out, flat_out) \
163 v16u8 p3_asub_p2_m, p2_asub_p1_m, p1_asub_p0_m, q1_asub_q0_m; \
164 v16u8 p1_asub_q1_m, p0_asub_q0_m, q3_asub_q2_m, q2_asub_q1_m; \
167 p3_asub_p2_m = __msa_asub_u_b(p3_in, p2_in); \
168 p2_asub_p1_m = __msa_asub_u_b(p2_in, p1_in); \
169 p1_asub_p0_m = __msa_asub_u_b(p1_in, p0_in); \
170 q1_asub_q0_m = __msa_asub_u_b(q1_in, q0_in); \
171 q2_asub_q1_m = __msa_asub_u_b(q2_in, q1_in); \
172 q3_asub_q2_m = __msa_asub_u_b(q3_in, q2_in); \
173 p0_asub_q0_m = __msa_asub_u_b(p0_in, q0_in); \
174 p1_asub_q1_m = __msa_asub_u_b(p1_in, q1_in); \
177 flat_out = __msa_max_u_b(p1_asub_p0_m, q1_asub_q0_m); \
178 hev_out = thresh_in < (v16u8) flat_out; \
181 p0_asub_q0_m = __msa_adds_u_b(p0_asub_q0_m, p0_asub_q0_m); /* saturating 2 * |p0 - q0| */ \
182 p1_asub_q1_m >>= 1; \
183 p0_asub_q0_m = __msa_adds_u_b(p0_asub_q0_m, p1_asub_q1_m); \
185 mask_out = b_limit_in < p0_asub_q0_m; \
186 mask_out = __msa_max_u_b(flat_out, mask_out); \
187 p3_asub_p2_m = __msa_max_u_b(p3_asub_p2_m, p2_asub_p1_m); \
188 mask_out = __msa_max_u_b(p3_asub_p2_m, mask_out); \
189 q2_asub_q1_m = __msa_max_u_b(q2_asub_q1_m, q3_asub_q2_m); \
190 mask_out = __msa_max_u_b(q2_asub_q1_m, mask_out); \
192 mask_out = limit_in < (v16u8) mask_out; \
193 mask_out = __msa_xori_b(mask_out, 0xff); /* invert the exceeds-limit comparison */ \
201 uint64_t p1_d, p0_d, q0_d, q1_d;
203 v16u8 p3, p2, p1, p0, q3, q2,
q1,
q0, p1_out, p0_out, q0_out, q1_out;
206 LD_UB8((src - 4 * pitch), pitch, p3, p2, p1, p0, q0, q1, q2, q3);
208 thresh = (v16u8) __msa_fill_b(thresh_ptr);
209 b_limit = (v16u8) __msa_fill_b(b_limit_ptr);
210 limit = (v16u8) __msa_fill_b(limit_ptr);
212 LPF_MASK_HEV(p3, p2, p1, p0, q0, q1, q2, q3, limit, b_limit, thresh,
217 p1_d = __msa_copy_u_d((v2i64) p1_out, 0);
218 p0_d = __msa_copy_u_d((v2i64) p0_out, 0);
219 q0_d = __msa_copy_u_d((v2i64) q0_out, 0);
220 q1_d = __msa_copy_u_d((v2i64) q1_out, 0);
221 SD4(p1_d, p0_d, q0_d, q1_d, (src - 2 * pitch), pitch);
230 v16u8
mask,
hev,
flat, thresh0, b_limit0, limit0, thresh1, b_limit1, limit1;
231 v16u8 p3, p2, p1, p0, q3, q2,
q1,
q0;
234 LD_UB8((src - 4 * pitch), pitch, p3, p2, p1, p0, q0, q1, q2, q3);
236 thresh0 = (v16u8) __msa_fill_b(thresh_ptr);
237 thresh1 = (v16u8) __msa_fill_b(thresh_ptr >> 8);
238 thresh0 = (v16u8) __msa_ilvr_d((v2i64) thresh1, (v2i64) thresh0);
240 b_limit0 = (v16u8) __msa_fill_b(b_limit_ptr);
241 b_limit1 = (v16u8) __msa_fill_b(b_limit_ptr >> 8);
242 b_limit0 = (v16u8) __msa_ilvr_d((v2i64) b_limit1, (v2i64) b_limit0);
244 limit0 = (v16u8) __msa_fill_b(limit_ptr);
245 limit1 = (v16u8) __msa_fill_b(limit_ptr >> 8);
246 limit0 = (v16u8) __msa_ilvr_d((v2i64) limit1, (v2i64) limit0);
248 LPF_MASK_HEV(p3, p2, p1, p0, q0, q1, q2, q3, limit0, b_limit0, thresh0,
252 ST_UB4(p1, p0, q0, q1, (src - 2 * pitch), pitch);
260 uint64_t p2_d, p1_d, p0_d, q0_d, q1_d, q2_d;
262 v16u8 p3, p2, p1, p0, q3, q2,
q1,
q0;
263 v16u8 p2_out, p1_out, p0_out, q0_out, q1_out, q2_out;
264 v8i16 p2_filter8, p1_filter8, p0_filter8;
265 v8i16 q0_filter8, q1_filter8, q2_filter8;
266 v8u16 p3_r, p2_r, p1_r, p0_r, q3_r, q2_r, q1_r, q0_r;
270 LD_UB8((src - 4 * pitch), pitch, p3, p2, p1, p0, q0, q1, q2, q3);
272 thresh = (v16u8) __msa_fill_b(thresh_ptr);
273 b_limit = (v16u8) __msa_fill_b(b_limit_ptr);
274 limit = (v16u8) __msa_fill_b(limit_ptr);
276 LPF_MASK_HEV(p3, p2, p1, p0, q0, q1, q2, q3, limit, b_limit, thresh,
282 flat = (v16u8) __msa_ilvr_d((v2i64)
zero, (v2i64) flat);
285 if (__msa_test_bz_v(flat)) {
286 p1_d = __msa_copy_u_d((v2i64) p1_out, 0);
287 p0_d = __msa_copy_u_d((v2i64) p0_out, 0);
288 q0_d = __msa_copy_u_d((v2i64) q0_out, 0);
289 q1_d = __msa_copy_u_d((v2i64) q1_out, 0);
290 SD4(p1_d, p0_d, q0_d, q1_d, (src - 2 * pitch), pitch);
292 ILVR_B8_UH(zero, p3, zero, p2, zero, p1, zero, p0, zero, q0, zero, q1,
293 zero, q2, zero, q3, p3_r, p2_r, p1_r, p0_r, q0_r, q1_r,
295 VP9_FILTER8(p3_r, p2_r, p1_r, p0_r, q0_r, q1_r, q2_r, q3_r, p2_filter8,
296 p1_filter8, p0_filter8, q0_filter8, q1_filter8, q2_filter8);
299 PCKEV_B4_SH(zero, p2_filter8, zero, p1_filter8, zero, p0_filter8,
300 zero, q0_filter8, p2_filter8, p1_filter8, p0_filter8,
302 PCKEV_B2_SH(zero, q1_filter8, zero, q2_filter8, q1_filter8, q2_filter8);
305 p2_out = __msa_bmnz_v(p2, (v16u8) p2_filter8, flat);
306 p1_out = __msa_bmnz_v(p1_out, (v16u8) p1_filter8, flat);
307 p0_out = __msa_bmnz_v(p0_out, (v16u8) p0_filter8, flat);
308 q0_out = __msa_bmnz_v(q0_out, (v16u8) q0_filter8, flat);
309 q1_out = __msa_bmnz_v(q1_out, (v16u8) q1_filter8, flat);
310 q2_out = __msa_bmnz_v(q2, (v16u8) q2_filter8, flat);
312 p2_d = __msa_copy_u_d((v2i64) p2_out, 0);
313 p1_d = __msa_copy_u_d((v2i64) p1_out, 0);
314 p0_d = __msa_copy_u_d((v2i64) p0_out, 0);
315 q0_d = __msa_copy_u_d((v2i64) q0_out, 0);
316 q1_d = __msa_copy_u_d((v2i64) q1_out, 0);
317 q2_d = __msa_copy_u_d((v2i64) q2_out, 0);
321 SD4(p2_d, p1_d, p0_d, q0_d, src, pitch);
334 v16u8 p3, p2, p1, p0, q3, q2,
q1,
q0;
335 v16u8 p2_out, p1_out, p0_out, q0_out, q1_out, q2_out;
337 v8u16 p3_r, p2_r, p1_r, p0_r, q0_r, q1_r, q2_r, q3_r;
338 v8u16 p3_l, p2_l, p1_l, p0_l, q0_l, q1_l, q2_l, q3_l;
339 v8i16 p2_filt8_r, p1_filt8_r, p0_filt8_r;
340 v8i16 q0_filt8_r, q1_filt8_r, q2_filt8_r;
341 v8i16 p2_filt8_l, p1_filt8_l, p0_filt8_l;
342 v8i16 q0_filt8_l, q1_filt8_l, q2_filt8_l;
346 LD_UB8(src - (4 * pitch), pitch, p3, p2, p1, p0, q0, q1, q2, q3);
348 thresh = (v16u8) __msa_fill_b(thresh_ptr);
349 tmp = (v16u8) __msa_fill_b(thresh_ptr >> 8);
350 thresh = (v16u8) __msa_ilvr_d((v2i64)
tmp, (v2i64) thresh);
352 b_limit = (v16u8) __msa_fill_b(b_limit_ptr);
353 tmp = (v16u8) __msa_fill_b(b_limit_ptr >> 8);
354 b_limit = (v16u8) __msa_ilvr_d((v2i64)
tmp, (v2i64) b_limit);
356 limit = (v16u8) __msa_fill_b(limit_ptr);
357 tmp = (v16u8) __msa_fill_b(limit_ptr >> 8);
358 limit = (v16u8) __msa_ilvr_d((v2i64)
tmp, (v2i64) limit);
361 LPF_MASK_HEV(p3, p2, p1, p0, q0, q1, q2, q3, limit, b_limit, thresh,
368 if (__msa_test_bz_v(flat)) {
369 ST_UB4(p1_out, p0_out, q0_out, q1_out, (src - 2 * pitch), pitch);
371 ILVR_B8_UH(zero, p3, zero, p2, zero, p1, zero, p0, zero, q0, zero, q1,
372 zero, q2, zero, q3, p3_r, p2_r, p1_r, p0_r, q0_r, q1_r,
374 VP9_FILTER8(p3_r, p2_r, p1_r, p0_r, q0_r, q1_r, q2_r, q3_r, p2_filt8_r,
375 p1_filt8_r, p0_filt8_r, q0_filt8_r, q1_filt8_r, q2_filt8_r);
377 ILVL_B4_UH(zero, p3, zero, p2, zero, p1, zero, p0, p3_l, p2_l, p1_l,
379 ILVL_B4_UH(zero, q0, zero, q1, zero, q2, zero, q3, q0_l, q1_l, q2_l,
381 VP9_FILTER8(p3_l, p2_l, p1_l, p0_l, q0_l, q1_l, q2_l, q3_l, p2_filt8_l,
382 p1_filt8_l, p0_filt8_l, q0_filt8_l, q1_filt8_l, q2_filt8_l);
385 PCKEV_B4_SH(p2_filt8_l, p2_filt8_r, p1_filt8_l, p1_filt8_r, p0_filt8_l,
386 p0_filt8_r, q0_filt8_l, q0_filt8_r, p2_filt8_r, p1_filt8_r,
387 p0_filt8_r, q0_filt8_r);
388 PCKEV_B2_SH(q1_filt8_l, q1_filt8_r, q2_filt8_l, q2_filt8_r,
389 q1_filt8_r, q2_filt8_r);
392 p2_out = __msa_bmnz_v(p2, (v16u8) p2_filt8_r, flat);
393 p1_out = __msa_bmnz_v(p1_out, (v16u8) p1_filt8_r, flat);
394 p0_out = __msa_bmnz_v(p0_out, (v16u8) p0_filt8_r, flat);
395 q0_out = __msa_bmnz_v(q0_out, (v16u8) q0_filt8_r, flat);
396 q1_out = __msa_bmnz_v(q1_out, (v16u8) q1_filt8_r, flat);
397 q2_out = __msa_bmnz_v(q2, (v16u8) q2_filt8_r, flat);
401 ST_UB4(p2_out, p1_out, p0_out, q0_out, src, pitch);
403 ST_UB2(q1_out, q2_out, src, pitch);
413 v16u8 p3, p2, p1, p0, q3, q2,
q1,
q0;
414 v16u8 p2_out, p1_out, p0_out, q0_out, q1_out, q2_out;
416 v8u16 p3_r, p2_r, p1_r, p0_r, q0_r, q1_r, q2_r, q3_r;
417 v8i16 p2_filt8_r, p1_filt8_r, p0_filt8_r;
418 v8i16 q0_filt8_r, q1_filt8_r, q2_filt8_r;
422 LD_UB8(src - (4 * pitch), pitch, p3, p2, p1, p0, q0, q1, q2, q3);
424 thresh = (v16u8) __msa_fill_b(thresh_ptr);
425 tmp = (v16u8) __msa_fill_b(thresh_ptr >> 8);
426 thresh = (v16u8) __msa_ilvr_d((v2i64)
tmp, (v2i64) thresh);
428 b_limit = (v16u8) __msa_fill_b(b_limit_ptr);
429 tmp = (v16u8) __msa_fill_b(b_limit_ptr >> 8);
430 b_limit = (v16u8) __msa_ilvr_d((v2i64)
tmp, (v2i64) b_limit);
432 limit = (v16u8) __msa_fill_b(limit_ptr);
433 tmp = (v16u8) __msa_fill_b(limit_ptr >> 8);
434 limit = (v16u8) __msa_ilvr_d((v2i64)
tmp, (v2i64) limit);
437 LPF_MASK_HEV(p3, p2, p1, p0, q0, q1, q2, q3, limit, b_limit, thresh,
443 flat = (v16u8) __msa_ilvr_d((v2i64)
zero, (v2i64) flat);
446 if (__msa_test_bz_v(flat)) {
447 ST_UB4(p1_out, p0_out, q0_out, q1_out, (src - 2 * pitch), pitch);
449 ILVR_B8_UH(zero, p3, zero, p2, zero, p1, zero, p0, zero, q0, zero, q1,
450 zero, q2, zero, q3, p3_r, p2_r, p1_r, p0_r, q0_r, q1_r,
452 VP9_FILTER8(p3_r, p2_r, p1_r, p0_r, q0_r, q1_r, q2_r, q3_r, p2_filt8_r,
453 p1_filt8_r, p0_filt8_r, q0_filt8_r, q1_filt8_r, q2_filt8_r);
456 PCKEV_B4_SH(p2_filt8_r, p2_filt8_r, p1_filt8_r, p1_filt8_r,
457 p0_filt8_r, p0_filt8_r, q0_filt8_r, q0_filt8_r,
458 p2_filt8_r, p1_filt8_r, p0_filt8_r, q0_filt8_r);
459 PCKEV_B2_SH(q1_filt8_r, q1_filt8_r, q2_filt8_r, q2_filt8_r,
460 q1_filt8_r, q2_filt8_r);
463 p2_out = __msa_bmnz_v(p2, (v16u8) p2_filt8_r, flat);
464 p1_out = __msa_bmnz_v(p1_out, (v16u8) p1_filt8_r, flat);
465 p0_out = __msa_bmnz_v(p0_out, (v16u8) p0_filt8_r, flat);
466 q0_out = __msa_bmnz_v(q0_out, (v16u8) q0_filt8_r, flat);
467 q1_out = __msa_bmnz_v(q1_out, (v16u8) q1_filt8_r, flat);
468 q2_out = __msa_bmnz_v(q2, (v16u8) q2_filt8_r, flat);
472 ST_UB4(p2_out, p1_out, p0_out, q0_out, src, pitch);
474 ST_UB2(q1_out, q2_out, src, pitch);
484 v16u8 p3, p2, p1, p0, q3, q2,
q1,
q0;
485 v16u8 p2_out, p1_out, p0_out, q0_out, q1_out, q2_out;
487 v8u16 p3_l, p2_l, p1_l, p0_l, q0_l, q1_l, q2_l, q3_l;
488 v8i16 p2_filt8_l, p1_filt8_l, p0_filt8_l;
489 v8i16 q0_filt8_l, q1_filt8_l, q2_filt8_l;
493 LD_UB8(src - (4 * pitch), pitch, p3, p2, p1, p0, q0, q1, q2, q3);
495 thresh = (v16u8) __msa_fill_b(thresh_ptr);
496 tmp = (v16u8) __msa_fill_b(thresh_ptr >> 8);
497 thresh = (v16u8) __msa_ilvr_d((v2i64)
tmp, (v2i64) thresh);
499 b_limit = (v16u8) __msa_fill_b(b_limit_ptr);
500 tmp = (v16u8) __msa_fill_b(b_limit_ptr >> 8);
501 b_limit = (v16u8) __msa_ilvr_d((v2i64)
tmp, (v2i64) b_limit);
503 limit = (v16u8) __msa_fill_b(limit_ptr);
504 tmp = (v16u8) __msa_fill_b(limit_ptr >> 8);
505 limit = (v16u8) __msa_ilvr_d((v2i64)
tmp, (v2i64) limit);
508 LPF_MASK_HEV(p3, p2, p1, p0, q0, q1, q2, q3, limit, b_limit, thresh,
514 flat = (v16u8) __msa_insve_d((v2i64)
flat, 0, (v2i64) zero);
517 if (__msa_test_bz_v(flat)) {
518 ST_UB4(p1_out, p0_out, q0_out, q1_out, (src - 2 * pitch), pitch);
520 ILVL_B4_UH(zero, p3, zero, p2, zero, p1, zero, p0, p3_l, p2_l, p1_l,
522 ILVL_B4_UH(zero, q0, zero, q1, zero, q2, zero, q3, q0_l, q1_l, q2_l,
524 VP9_FILTER8(p3_l, p2_l, p1_l, p0_l, q0_l, q1_l, q2_l, q3_l, p2_filt8_l,
525 p1_filt8_l, p0_filt8_l, q0_filt8_l, q1_filt8_l, q2_filt8_l);
528 PCKEV_B4_SH(p2_filt8_l, p2_filt8_l, p1_filt8_l, p1_filt8_l,
529 p0_filt8_l, p0_filt8_l, q0_filt8_l, q0_filt8_l,
530 p2_filt8_l, p1_filt8_l, p0_filt8_l, q0_filt8_l);
531 PCKEV_B2_SH(q1_filt8_l, q1_filt8_l, q2_filt8_l, q2_filt8_l,
532 q1_filt8_l, q2_filt8_l);
535 p2_out = __msa_bmnz_v(p2, (v16u8) p2_filt8_l, flat);
536 p1_out = __msa_bmnz_v(p1_out, (v16u8) p1_filt8_l, flat);
537 p0_out = __msa_bmnz_v(p0_out, (v16u8) p0_filt8_l, flat);
538 q0_out = __msa_bmnz_v(q0_out, (v16u8) q0_filt8_l, flat);
539 q1_out = __msa_bmnz_v(q1_out, (v16u8) q1_filt8_l, flat);
540 q2_out = __msa_bmnz_v(q2, (v16u8) q2_filt8_l, flat);
544 ST_UB4(p2_out, p1_out, p0_out, q0_out, src, pitch);
546 ST_UB2(q1_out, q2_out, src, pitch);
557 v16u8 p3, p2, p1, p0, q3, q2,
q1,
q0;
558 v16u8 p2_out, p1_out, p0_out, q0_out, q1_out, q2_out;
560 v8u16 p3_r, p2_r, p1_r, p0_r, q0_r, q1_r, q2_r, q3_r;
561 v8u16 p3_l, p2_l, p1_l, p0_l, q0_l, q1_l, q2_l, q3_l;
562 v8i16 p2_filt8_r, p1_filt8_r, p0_filt8_r;
563 v8i16 q0_filt8_r, q1_filt8_r, q2_filt8_r;
564 v8i16 p2_filt8_l, p1_filt8_l, p0_filt8_l;
565 v8i16 q0_filt8_l, q1_filt8_l, q2_filt8_l;
569 LD_UB8(src - (4 * pitch), pitch, p3, p2, p1, p0, q0, q1, q2, q3);
571 thresh = (v16u8) __msa_fill_b(thresh_ptr);
572 b_limit = (v16u8) __msa_fill_b(b_limit_ptr);
573 limit = (v16u8) __msa_fill_b(limit_ptr);
576 LPF_MASK_HEV(p3, p2, p1, p0, q0, q1, q2, q3, limit, b_limit, thresh,
583 if (__msa_test_bz_v(flat)) {
584 ST_UB4(p1_out, p0_out, q0_out, q1_out, (src - 2 * pitch), pitch);
588 ILVR_B8_UH(zero, p3, zero, p2, zero, p1, zero, p0, zero, q0, zero, q1,
589 zero, q2, zero, q3, p3_r, p2_r, p1_r, p0_r, q0_r, q1_r,
591 VP9_FILTER8(p3_r, p2_r, p1_r, p0_r, q0_r, q1_r, q2_r, q3_r, p2_filt8_r,
592 p1_filt8_r, p0_filt8_r, q0_filt8_r, q1_filt8_r, q2_filt8_r);
594 ILVL_B4_UH(zero, p3, zero, p2, zero, p1, zero, p0, p3_l, p2_l, p1_l,
596 ILVL_B4_UH(zero, q0, zero, q1, zero, q2, zero, q3, q0_l, q1_l, q2_l,
598 VP9_FILTER8(p3_l, p2_l, p1_l, p0_l, q0_l, q1_l, q2_l, q3_l, p2_filt8_l,
599 p1_filt8_l, p0_filt8_l, q0_filt8_l, q1_filt8_l, q2_filt8_l);
602 PCKEV_B4_SH(p2_filt8_l, p2_filt8_r, p1_filt8_l, p1_filt8_r, p0_filt8_l,
603 p0_filt8_r, q0_filt8_l, q0_filt8_r, p2_filt8_r, p1_filt8_r,
604 p0_filt8_r, q0_filt8_r);
605 PCKEV_B2_SH(q1_filt8_l, q1_filt8_r, q2_filt8_l, q2_filt8_r, q1_filt8_r,
609 p2_out = __msa_bmnz_v(p2, (v16u8) p2_filt8_r, flat);
610 p1_out = __msa_bmnz_v(p1_out, (v16u8) p1_filt8_r, flat);
611 p0_out = __msa_bmnz_v(p0_out, (v16u8) p0_filt8_r, flat);
612 q0_out = __msa_bmnz_v(q0_out, (v16u8) q0_filt8_r, flat);
613 q1_out = __msa_bmnz_v(q1_out, (v16u8) q1_filt8_r, flat);
614 q2_out = __msa_bmnz_v(q2, (v16u8) q2_filt8_r, flat);
616 ST_UB4(p2_out, p1_out, p0_out, q0_out, filter48, 16);
617 filter48 += (4 * 16);
618 ST_UB2(q1_out, q2_out, filter48, 16);
619 filter48 += (2 * 16);
620 ST_UB(flat, filter48);
628 v16u8
flat, flat2, filter8;
630 v16u8 p7, p6, p5, p4, p3, p2, p1, p0,
q0,
q1, q2, q3, q4, q5, q6, q7;
631 v8u16 p7_r_in, p6_r_in, p5_r_in, p4_r_in;
632 v8u16 p3_r_in, p2_r_in, p1_r_in, p0_r_in;
633 v8u16 q7_r_in, q6_r_in, q5_r_in, q4_r_in;
634 v8u16 q3_r_in, q2_r_in, q1_r_in, q0_r_in;
635 v8u16 p7_l_in, p6_l_in, p5_l_in, p4_l_in;
636 v8u16 p3_l_in, p2_l_in, p1_l_in, p0_l_in;
637 v8u16 q7_l_in, q6_l_in, q5_l_in, q4_l_in;
638 v8u16 q3_l_in, q2_l_in, q1_l_in, q0_l_in;
639 v8u16 tmp0_r, tmp1_r, tmp0_l, tmp1_l;
642 flat =
LD_UB(filter48 + 96);
644 LD_UB8((src - 8 * pitch), pitch, p7, p6, p5, p4, p3, p2, p1, p0);
645 LD_UB8(src, pitch, q0, q1, q2, q3, q4, q5, q6, q7);
646 VP9_FLAT5(p7, p6, p5, p4, p0, q0, q4, q5, q6, q7, flat, flat2);
649 if (__msa_test_bz_v(flat2)) {
650 LD_UB4(filter48, 16, p2, p1, p0, q0);
651 LD_UB2(filter48 + 4 * 16, 16, q1, q2);
654 ST_UB4(p2, p1, p0, q0, src, pitch);
656 ST_UB2(q1, q2, src, pitch);
660 ILVR_B8_UH(zero, p7, zero, p6, zero, p5, zero, p4, zero, p3, zero, p2,
661 zero, p1, zero, p0, p7_r_in, p6_r_in, p5_r_in, p4_r_in,
662 p3_r_in, p2_r_in, p1_r_in, p0_r_in);
664 q0_r_in = (v8u16) __msa_ilvr_b(zero, (v16i8)
q0);
666 tmp0_r = p7_r_in << 3;
670 tmp1_r = p6_r_in + p5_r_in;
677 r_out = __msa_srari_h((v8i16) tmp1_r, 4);
679 ILVL_B4_UH(zero, p7, zero, p6, zero, p5, zero, p4, p7_l_in, p6_l_in,
681 ILVL_B4_UH(zero, p3, zero, p2, zero, p1, zero, p0, p3_l_in, p2_l_in,
683 q0_l_in = (v8u16) __msa_ilvl_b(zero, (v16i8)
q0);
685 tmp0_l = p7_l_in << 3;
689 tmp1_l = p6_l_in + p5_l_in;
696 l_out = __msa_srari_h((v8i16) tmp1_l, 4);
698 r_out = (v8i16) __msa_pckev_b((v16i8) l_out, (v16i8) r_out);
699 p6 = __msa_bmnz_v(p6, (v16u8) r_out, flat2);
704 q1_r_in = (v8u16) __msa_ilvr_b(zero, (v16i8)
q1);
705 tmp0_r = p5_r_in - p6_r_in;
709 r_out = __msa_srari_h((v8i16) tmp1_r, 4);
711 q1_l_in = (v8u16) __msa_ilvl_b(zero, (v16i8)
q1);
712 tmp0_l = p5_l_in - p6_l_in;
716 l_out = __msa_srari_h((v8i16) tmp1_l, 4);
718 r_out = (v8i16) __msa_pckev_b((v16i8) l_out, (v16i8) r_out);
719 p5 = __msa_bmnz_v(p5, (v16u8) r_out, flat2);
724 q2_r_in = (v8u16) __msa_ilvr_b(zero, (v16i8) q2);
725 tmp0_r = p4_r_in - p5_r_in;
729 r_out = (v8i16) __msa_srari_h((v8i16) tmp1_r, 4);
731 q2_l_in = (v8u16) __msa_ilvl_b(zero, (v16i8) q2);
732 tmp0_l = p4_l_in - p5_l_in;
736 l_out = __msa_srari_h((v8i16) tmp1_l, 4);
738 r_out = (v8i16) __msa_pckev_b((v16i8) l_out, (v16i8) r_out);
739 p4 = __msa_bmnz_v(p4, (v16u8) r_out, flat2);
744 q3_r_in = (v8u16) __msa_ilvr_b(zero, (v16i8) q3);
745 tmp0_r = p3_r_in - p4_r_in;
749 r_out = __msa_srari_h((v8i16) tmp1_r, 4);
751 q3_l_in = (v8u16) __msa_ilvl_b(zero, (v16i8) q3);
752 tmp0_l = p3_l_in - p4_l_in;
756 l_out = __msa_srari_h((v8i16) tmp1_l, 4);
758 r_out = (v8i16) __msa_pckev_b((v16i8) l_out, (v16i8) r_out);
759 p3 = __msa_bmnz_v(p3, (v16u8) r_out, flat2);
764 q4_r_in = (v8u16) __msa_ilvr_b(zero, (v16i8) q4);
765 filter8 =
LD_UB(filter48);
766 tmp0_r = p2_r_in - p3_r_in;
770 r_out = __msa_srari_h((v8i16) tmp1_r, 4);
772 q4_l_in = (v8u16) __msa_ilvl_b(zero, (v16i8) q4);
773 tmp0_l = p2_l_in - p3_l_in;
777 l_out = __msa_srari_h((v8i16) tmp1_l, 4);
779 r_out = (v8i16) __msa_pckev_b((v16i8) l_out, (v16i8) r_out);
780 filter8 = __msa_bmnz_v(filter8, (v16u8) r_out, flat2);
785 q5_r_in = (v8u16) __msa_ilvr_b(zero, (v16i8) q5);
786 filter8 =
LD_UB(filter48 + 16);
787 tmp0_r = p1_r_in - p2_r_in;
791 r_out = __msa_srari_h((v8i16) tmp1_r, 4);
793 q5_l_in = (v8u16) __msa_ilvl_b(zero, (v16i8) q5);
794 tmp0_l = p1_l_in - p2_l_in;
798 l_out = __msa_srari_h((v8i16) tmp1_l, 4);
800 r_out = (v8i16) __msa_pckev_b((v16i8) l_out, (v16i8) r_out);
801 filter8 = __msa_bmnz_v(filter8, (v16u8) r_out, flat2);
806 q6_r_in = (v8u16) __msa_ilvr_b(zero, (v16i8) q6);
807 filter8 =
LD_UB(filter48 + 32);
808 tmp0_r = p0_r_in - p1_r_in;
812 r_out = __msa_srari_h((v8i16) tmp1_r, 4);
814 q6_l_in = (v8u16) __msa_ilvl_b(zero, (v16i8) q6);
815 tmp0_l = p0_l_in - p1_l_in;
819 l_out = __msa_srari_h((v8i16) tmp1_l, 4);
821 r_out = (v8i16) __msa_pckev_b((v16i8) l_out, (v16i8) r_out);
822 filter8 = __msa_bmnz_v(filter8, (v16u8) r_out, flat2);
827 q7_r_in = (v8u16) __msa_ilvr_b(zero, (v16i8) q7);
828 filter8 =
LD_UB(filter48 + 48);
829 tmp0_r = q7_r_in - p0_r_in;
833 r_out = __msa_srari_h((v8i16) tmp1_r, 4);
835 q7_l_in = (v8u16) __msa_ilvl_b(zero, (v16i8) q7);
836 tmp0_l = q7_l_in - p0_l_in;
840 l_out = __msa_srari_h((v8i16) tmp1_l, 4);
842 r_out = (v8i16) __msa_pckev_b((v16i8) l_out, (v16i8) r_out);
843 filter8 = __msa_bmnz_v(filter8, (v16u8) r_out, flat2);
848 filter8 =
LD_UB(filter48 + 64);
849 tmp0_r = q7_r_in - q0_r_in;
853 r_out = __msa_srari_h((v8i16) tmp1_r, 4);
855 tmp0_l = q7_l_in - q0_l_in;
859 l_out = __msa_srari_h((v8i16) tmp1_l, 4);
861 r_out = (v8i16) __msa_pckev_b((v16i8) l_out, (v16i8) r_out);
862 filter8 = __msa_bmnz_v(filter8, (v16u8) r_out, flat2);
867 filter8 =
LD_UB(filter48 + 80);
868 tmp0_r = q7_r_in - q1_r_in;
872 r_out = __msa_srari_h((v8i16) tmp1_r, 4);
874 tmp0_l = q7_l_in - q1_l_in;
878 l_out = __msa_srari_h((v8i16) tmp1_l, 4);
880 r_out = (v8i16) __msa_pckev_b((v16i8) l_out, (v16i8) r_out);
881 filter8 = __msa_bmnz_v(filter8, (v16u8) r_out, flat2);
886 tmp0_r = q7_r_in - q2_r_in;
890 r_out = __msa_srari_h((v8i16) tmp1_r, 4);
892 tmp0_l = q7_l_in - q2_l_in;
896 l_out = __msa_srari_h((v8i16) tmp1_l, 4);
898 r_out = (v8i16) __msa_pckev_b((v16i8) l_out, (v16i8) r_out);
899 q3 = __msa_bmnz_v(q3, (v16u8) r_out, flat2);
904 tmp0_r = q7_r_in - q3_r_in;
908 r_out = __msa_srari_h((v8i16) tmp1_r, 4);
910 tmp0_l = q7_l_in - q3_l_in;
914 l_out = __msa_srari_h((v8i16) tmp1_l, 4);
916 r_out = (v8i16) __msa_pckev_b((v16i8) l_out, (v16i8) r_out);
917 q4 = __msa_bmnz_v(q4, (v16u8) r_out, flat2);
922 tmp0_r = q7_r_in - q4_r_in;
926 r_out = __msa_srari_h((v8i16) tmp1_r, 4);
928 tmp0_l = q7_l_in - q4_l_in;
932 l_out = __msa_srari_h((v8i16) tmp1_l, 4);
934 r_out = (v8i16) __msa_pckev_b((v16i8) l_out, (v16i8) r_out);
935 q5 = __msa_bmnz_v(q5, (v16u8) r_out, flat2);
940 tmp0_r = q7_r_in - q5_r_in;
944 r_out = __msa_srari_h((v8i16) tmp1_r, 4);
946 tmp0_l = q7_l_in - q5_l_in;
950 l_out = __msa_srari_h((v8i16) tmp1_l, 4);
952 r_out = (v8i16) __msa_pckev_b((v16i8) l_out, (v16i8) r_out);
953 q6 = __msa_bmnz_v(q6, (v16u8) r_out, flat2);
967 b_limit_ptr, limit_ptr, thresh_ptr);
969 if (0 == early_exit) {
979 uint64_t p2_d, p1_d, p0_d, q0_d, q1_d, q2_d;
980 uint64_t dword0, dword1;
981 v16u8 flat2,
mask,
hev,
flat, thresh, b_limit, limit;
982 v16u8 p3, p2, p1, p0, q3, q2,
q1,
q0, p7, p6, p5, p4, q4, q5, q6, q7;
983 v16u8 p2_out, p1_out, p0_out, q0_out, q1_out, q2_out;
984 v16u8 p0_filter16, p1_filter16;
985 v8i16 p2_filter8, p1_filter8, p0_filter8;
986 v8i16 q0_filter8, q1_filter8, q2_filter8;
987 v8u16 p7_r, p6_r, p5_r, p4_r, q7_r, q6_r, q5_r, q4_r;
988 v8u16 p3_r, p2_r, p1_r, p0_r, q3_r, q2_r, q1_r, q0_r;
990 v8u16 tmp0, tmp1, tmp2;
993 LD_UB8((src - 4 * pitch), pitch, p3, p2, p1, p0, q0, q1, q2, q3);
995 thresh = (v16u8) __msa_fill_b(thresh_ptr);
996 b_limit = (v16u8) __msa_fill_b(b_limit_ptr);
997 limit = (v16u8) __msa_fill_b(limit_ptr);
999 LPF_MASK_HEV(p3, p2, p1, p0, q0, q1, q2, q3, limit, b_limit, thresh,
1001 VP9_FLAT4(p3, p2, p0, q0, q2, q3, flat);
1005 flat = (v16u8) __msa_ilvr_d((v2i64)
zero, (v2i64) flat);
1008 if (__msa_test_bz_v(flat)) {
1009 p1_d = __msa_copy_u_d((v2i64) p1_out, 0);
1010 p0_d = __msa_copy_u_d((v2i64) p0_out, 0);
1011 q0_d = __msa_copy_u_d((v2i64) q0_out, 0);
1012 q1_d = __msa_copy_u_d((v2i64) q1_out, 0);
1013 SD4(p1_d, p0_d, q0_d, q1_d, src - 2 * pitch, pitch);
1016 ILVR_B8_UH(zero, p3, zero, p2, zero, p1, zero, p0, zero, q0, zero,
1017 q1, zero, q2, zero, q3, p3_r, p2_r, p1_r, p0_r, q0_r,
1019 VP9_FILTER8(p3_r, p2_r, p1_r, p0_r, q0_r, q1_r, q2_r, q3_r,
1020 p2_filter8, p1_filter8, p0_filter8, q0_filter8,
1021 q1_filter8, q2_filter8);
1024 PCKEV_B4_SH(zero, p2_filter8, zero, p1_filter8, zero, p0_filter8,
1025 zero, q0_filter8, p2_filter8, p1_filter8, p0_filter8,
1027 PCKEV_B2_SH(zero, q1_filter8, zero, q2_filter8, q1_filter8,
1031 p2_out = __msa_bmnz_v(p2, (v16u8) p2_filter8, flat);
1032 p1_out = __msa_bmnz_v(p1_out, (v16u8) p1_filter8, flat);
1033 p0_out = __msa_bmnz_v(p0_out, (v16u8) p0_filter8, flat);
1034 q0_out = __msa_bmnz_v(q0_out, (v16u8) q0_filter8, flat);
1035 q1_out = __msa_bmnz_v(q1_out, (v16u8) q1_filter8, flat);
1036 q2_out = __msa_bmnz_v(q2, (v16u8) q2_filter8, flat);
1039 LD_UB4((src - 8 * pitch), pitch, p7, p6, p5, p4);
1040 LD_UB4(src + (4 * pitch), pitch, q4, q5, q6, q7);
1042 VP9_FLAT5(p7, p6, p5, p4, p0, q0, q4, q5, q6, q7, flat, flat2);
1045 if (__msa_test_bz_v(flat2)) {
1046 p2_d = __msa_copy_u_d((v2i64) p2_out, 0);
1047 p1_d = __msa_copy_u_d((v2i64) p1_out, 0);
1048 p0_d = __msa_copy_u_d((v2i64) p0_out, 0);
1049 q0_d = __msa_copy_u_d((v2i64) q0_out, 0);
1050 q1_d = __msa_copy_u_d((v2i64) q1_out, 0);
1051 q2_d = __msa_copy_u_d((v2i64) q2_out, 0);
1053 SD4(p2_d, p1_d, p0_d, q0_d, src - 3 * pitch, pitch);
1054 SD(q1_d, src + pitch);
1055 SD(q2_d, src + 2 * pitch);
1058 ILVR_B8_UH(zero, p7, zero, p6, zero, p5, zero, p4, zero, q4,
1059 zero, q5, zero, q6, zero, q7, p7_r, p6_r, p5_r, p4_r,
1060 q4_r, q5_r, q6_r, q7_r);
1070 tmp1 = p6_r + p5_r + p4_r + p3_r;
1071 tmp1 += (p2_r + p1_r + p0_r);
1073 p0_filter16 = (v16u8) __msa_srari_h((v8i16) tmp1, 4);
1074 tmp0 = p5_r - p6_r + q1_r - p7_r;
1076 p1_filter16 = (v16u8) __msa_srari_h((v8i16) tmp1, 4);
1078 p0_filter16, p1_filter16);
1079 p0_filter16 = __msa_bmnz_v(p6, p0_filter16, flat2);
1080 p1_filter16 = __msa_bmnz_v(p5, p1_filter16, flat2);
1081 dword0 = __msa_copy_u_d((v2i64) p0_filter16, 0);
1082 dword1 = __msa_copy_u_d((v2i64) p1_filter16, 0);
1089 tmp0 = p4_r - p5_r + q2_r - p7_r;
1090 tmp2 = p3_r - p4_r + q3_r - p7_r;
1092 p0_filter16 = (v16u8) __msa_srari_h((v8i16) tmp1, 4);
1094 p1_filter16 = (v16u8) __msa_srari_h((v8i16) tmp1, 4);
1096 p0_filter16, p1_filter16);
1097 p0_filter16 = __msa_bmnz_v(p4, p0_filter16, flat2);
1098 p1_filter16 = __msa_bmnz_v(p3, p1_filter16, flat2);
1099 dword0 = __msa_copy_u_d((v2i64) p0_filter16, 0);
1100 dword1 = __msa_copy_u_d((v2i64) p1_filter16, 0);
1107 tmp0 = p2_r - p3_r + q4_r - p7_r;
1108 tmp2 = p1_r - p2_r + q5_r - p7_r;
1110 p0_filter16 = (v16u8) __msa_srari_h((v8i16) tmp1, 4);
1112 p1_filter16 = (v16u8) __msa_srari_h((v8i16) tmp1, 4);
1114 p0_filter16, p1_filter16);
1115 p0_filter16 = __msa_bmnz_v(p2_out, p0_filter16, flat2);
1116 p1_filter16 = __msa_bmnz_v(p1_out, p1_filter16, flat2);
1117 dword0 = __msa_copy_u_d((v2i64) p0_filter16, 0);
1118 dword1 = __msa_copy_u_d((v2i64) p1_filter16, 0);
1125 tmp0 = (p0_r - p1_r) + (q6_r - p7_r);
1126 tmp2 = (q7_r - p0_r) + (q0_r - p7_r);
1128 p0_filter16 = (v16u8) __msa_srari_h((v8i16) tmp1, 4);
1130 p1_filter16 = (v16u8) __msa_srari_h((v8i16) tmp1, 4);
1132 p0_filter16, p1_filter16);
1133 p0_filter16 = __msa_bmnz_v(p0_out, p0_filter16, flat2);
1134 p1_filter16 = __msa_bmnz_v(q0_out, p1_filter16, flat2);
1135 dword0 = __msa_copy_u_d((v2i64) p0_filter16, 0);
1136 dword1 = __msa_copy_u_d((v2i64) p1_filter16, 0);
1143 tmp0 = q7_r - q0_r + q1_r - p6_r;
1144 tmp2 = q7_r - q1_r + q2_r - p5_r;
1146 p0_filter16 = (v16u8) __msa_srari_h((v8i16) tmp1, 4);
1148 p1_filter16 = (v16u8) __msa_srari_h((v8i16) tmp1, 4);
1150 p0_filter16, p1_filter16);
1151 p0_filter16 = __msa_bmnz_v(q1_out, p0_filter16, flat2);
1152 p1_filter16 = __msa_bmnz_v(q2_out, p1_filter16, flat2);
1153 dword0 = __msa_copy_u_d((v2i64) p0_filter16, 0);
1154 dword1 = __msa_copy_u_d((v2i64) p1_filter16, 0);
1161 tmp0 = (q7_r - q2_r) + (q3_r - p4_r);
1162 tmp2 = (q7_r - q3_r) + (q4_r - p3_r);
1164 p0_filter16 = (v16u8) __msa_srari_h((v8i16) tmp1, 4);
1166 p1_filter16 = (v16u8) __msa_srari_h((v8i16) tmp1, 4);
1168 p0_filter16, p1_filter16);
1169 p0_filter16 = __msa_bmnz_v(q3, p0_filter16, flat2);
1170 p1_filter16 = __msa_bmnz_v(q4, p1_filter16, flat2);
1171 dword0 = __msa_copy_u_d((v2i64) p0_filter16, 0);
1172 dword1 = __msa_copy_u_d((v2i64) p1_filter16, 0);
1179 tmp0 = (q7_r - q4_r) + (q5_r - p2_r);
1180 tmp2 = (q7_r - q5_r) + (q6_r - p1_r);
1182 p0_filter16 = (v16u8) __msa_srari_h((v8i16) tmp1, 4);
1184 p1_filter16 = (v16u8) __msa_srari_h((v8i16) tmp1, 4);
1186 p0_filter16, p1_filter16);
1187 p0_filter16 = __msa_bmnz_v(q5, p0_filter16, flat2);
1188 p1_filter16 = __msa_bmnz_v(q6, p1_filter16, flat2);
1189 dword0 = __msa_copy_u_d((v2i64) p0_filter16, 0);
1190 dword1 = __msa_copy_u_d((v2i64) p1_filter16, 0);
1204 v16u8 p3, p2, p1, p0, q3, q2,
q1,
q0;
1205 v8i16 vec0, vec1, vec2, vec3;
1207 LD_UB8((src - 4), pitch, p3, p2, p1, p0, q0, q1, q2, q3);
1209 thresh = (v16u8) __msa_fill_b(thresh_ptr);
1210 b_limit = (v16u8) __msa_fill_b(b_limit_ptr);
1211 limit = (v16u8) __msa_fill_b(limit_ptr);
1214 p3, p2, p1, p0, q0, q1, q2, q3);
1215 LPF_MASK_HEV(p3, p2, p1, p0, q0, q1, q2, q3, limit, b_limit, thresh,
1222 ST4x4_UB(vec2, vec2, 0, 1, 2, 3, src, pitch);
1224 ST4x4_UB(vec3, vec3, 0, 1, 2, 3, src, pitch);
1233 v16u8 thresh0, b_limit0, limit0, thresh1, b_limit1, limit1;
1234 v16u8 p3, p2, p1, p0, q3, q2,
q1,
q0;
1235 v16u8 row0, row1, row2, row3, row4, row5, row6, row7;
1236 v16u8 row8, row9, row10, row11, row12, row13, row14, row15;
1237 v8i16 tmp0, tmp1, tmp2, tmp3, tmp4, tmp5;
1239 LD_UB8(src - 4, pitch, row0, row1, row2, row3, row4, row5, row6, row7);
1240 LD_UB8(src - 4 + (8 * pitch), pitch,
1241 row8, row9, row10, row11, row12, row13, row14, row15);
1244 row8, row9, row10, row11, row12, row13, row14, row15,
1245 p3, p2, p1, p0, q0, q1, q2, q3);
1247 thresh0 = (v16u8) __msa_fill_b(thresh_ptr);
1248 thresh1 = (v16u8) __msa_fill_b(thresh_ptr >> 8);
1249 thresh0 = (v16u8) __msa_ilvr_d((v2i64) thresh1, (v2i64) thresh0);
1251 b_limit0 = (v16u8) __msa_fill_b(b_limit_ptr);
1252 b_limit1 = (v16u8) __msa_fill_b(b_limit_ptr >> 8);
1253 b_limit0 = (v16u8) __msa_ilvr_d((v2i64) b_limit1, (v2i64) b_limit0);
1255 limit0 = (v16u8) __msa_fill_b(limit_ptr);
1256 limit1 = (v16u8) __msa_fill_b(limit_ptr >> 8);
1257 limit0 = (v16u8) __msa_ilvr_d((v2i64) limit1, (v2i64) limit0);
1259 LPF_MASK_HEV(p3, p2, p1, p0, q0, q1, q2, q3, limit0, b_limit0, thresh0,
1279 v16u8 p3, p2, p1, p0, q3, q2,
q1,
q0;
1280 v16u8 p1_out, p0_out, q0_out, q1_out;
1282 v8u16 p3_r, p2_r, p1_r, p0_r, q0_r, q1_r, q2_r, q3_r;
1283 v8i16 p2_filt8_r, p1_filt8_r, p0_filt8_r;
1284 v8i16 q0_filt8_r, q1_filt8_r, q2_filt8_r;
1286 v8i16 vec0, vec1, vec2, vec3, vec4;
1289 LD_UB8(src - 4, pitch, p3, p2, p1, p0, q0, q1, q2, q3);
1292 p3, p2, p1, p0, q0, q1, q2, q3);
1294 thresh = (v16u8) __msa_fill_b(thresh_ptr);
1295 b_limit = (v16u8) __msa_fill_b(b_limit_ptr);
1296 limit = (v16u8) __msa_fill_b(limit_ptr);
1299 LPF_MASK_HEV(p3, p2, p1, p0, q0, q1, q2, q3, limit, b_limit, thresh,
1302 VP9_FLAT4(p3, p2, p0, q0, q2, q3, flat);
1307 flat = (v16u8) __msa_ilvr_d((v2i64)
zero, (v2i64) flat);
1310 if (__msa_test_bz_v(flat)) {
1312 ILVR_B2_SH(p0_out, p1_out, q1_out, q0_out, vec0, vec1);
1316 ST4x4_UB(vec2, vec2, 0, 1, 2, 3, src, pitch);
1318 ST4x4_UB(vec3, vec3, 0, 1, 2, 3, src, pitch);
1320 ILVR_B8_UH(zero, p3, zero, p2, zero, p1, zero, p0, zero, q0, zero, q1,
1321 zero, q2, zero, q3, p3_r, p2_r, p1_r, p0_r, q0_r, q1_r, q2_r,
1323 VP9_FILTER8(p3_r, p2_r, p1_r, p0_r, q0_r, q1_r, q2_r, q3_r, p2_filt8_r,
1324 p1_filt8_r, p0_filt8_r, q0_filt8_r, q1_filt8_r, q2_filt8_r);
1326 PCKEV_B4_SH(p2_filt8_r, p2_filt8_r, p1_filt8_r, p1_filt8_r, p0_filt8_r,
1327 p0_filt8_r, q0_filt8_r, q0_filt8_r, p2_filt8_r, p1_filt8_r,
1328 p0_filt8_r, q0_filt8_r);
1329 PCKEV_B2_SH(q1_filt8_r, q1_filt8_r, q2_filt8_r, q2_filt8_r, q1_filt8_r,
1333 p2 = __msa_bmnz_v(p2, (v16u8) p2_filt8_r, flat);
1334 p1 = __msa_bmnz_v(p1_out, (v16u8) p1_filt8_r, flat);
1335 p0 = __msa_bmnz_v(p0_out, (v16u8) p0_filt8_r, flat);
1336 q0 = __msa_bmnz_v(q0_out, (v16u8) q0_filt8_r, flat);
1337 q1 = __msa_bmnz_v(q1_out, (v16u8) q1_filt8_r, flat);
1338 q2 = __msa_bmnz_v(q2, (v16u8) q2_filt8_r, flat);
1343 vec4 = (v8i16) __msa_ilvr_b((v16i8) q2, (v16i8) q1);
1346 ST4x4_UB(vec2, vec2, 0, 1, 2, 3, src, pitch);
1349 ST4x4_UB(vec3, vec3, 0, 1, 2, 3, src, pitch);
1360 v16u8 p3, p2, p1, p0, q3, q2,
q1,
q0;
1361 v16u8 p1_out, p0_out, q0_out, q1_out;
1363 v16u8 row4, row5, row6, row7, row12, row13, row14, row15;
1364 v8u16 p3_r, p2_r, p1_r, p0_r, q0_r, q1_r, q2_r, q3_r;
1365 v8u16 p3_l, p2_l, p1_l, p0_l, q0_l, q1_l, q2_l, q3_l;
1366 v8i16 p2_filt8_r, p1_filt8_r, p0_filt8_r;
1367 v8i16 q0_filt8_r, q1_filt8_r, q2_filt8_r;
1368 v8i16 p2_filt8_l, p1_filt8_l, p0_filt8_l;
1369 v8i16 q0_filt8_l, q1_filt8_l, q2_filt8_l;
1371 v8i16 vec0, vec1, vec2, vec3, vec4, vec5, vec6, vec7;
1375 LD_UB8(temp_src, pitch, p0, p1, p2, p3, row4, row5, row6, row7);
1376 temp_src += (8 * pitch);
1377 LD_UB8(temp_src, pitch, q3, q2, q1, q0, row12, row13, row14, row15);
1381 q3, q2, q1, q0, row12, row13, row14, row15,
1382 p3, p2, p1, p0, q0, q1, q2, q3);
1384 thresh = (v16u8) __msa_fill_b(thresh_ptr);
1385 vec0 = (v8i16) __msa_fill_b(thresh_ptr >> 8);
1386 thresh = (v16u8) __msa_ilvr_d((v2i64) vec0, (v2i64) thresh);
1388 b_limit = (v16u8) __msa_fill_b(b_limit_ptr);
1389 vec0 = (v8i16) __msa_fill_b(b_limit_ptr >> 8);
1390 b_limit = (v16u8) __msa_ilvr_d((v2i64) vec0, (v2i64) b_limit);
1392 limit = (v16u8) __msa_fill_b(limit_ptr);
1393 vec0 = (v8i16) __msa_fill_b(limit_ptr >> 8);
1394 limit = (v16u8) __msa_ilvr_d((v2i64) vec0, (v2i64) limit);
1397 LPF_MASK_HEV(p3, p2, p1, p0, q0, q1, q2, q3, limit, b_limit, thresh,
1400 VP9_FLAT4(p3, p2, p0, q0, q2, q3, flat);
1406 if (__msa_test_bz_v(flat)) {
1407 ILVR_B2_SH(p0_out, p1_out, q1_out, q0_out, vec0, vec1);
1409 ILVL_B2_SH(p0_out, p1_out, q1_out, q0_out, vec0, vec1);
1417 ILVR_B8_UH(zero, p3, zero, p2, zero, p1, zero, p0, zero, q0, zero, q1,
1418 zero, q2, zero, q3, p3_r, p2_r, p1_r, p0_r, q0_r, q1_r, q2_r,
1420 VP9_FILTER8(p3_r, p2_r, p1_r, p0_r, q0_r, q1_r, q2_r, q3_r, p2_filt8_r,
1421 p1_filt8_r, p0_filt8_r, q0_filt8_r, q1_filt8_r, q2_filt8_r);
1423 ILVL_B4_UH(zero, p3, zero, p2, zero, p1, zero, p0, p3_l, p2_l, p1_l,
1425 ILVL_B4_UH(zero, q0, zero, q1, zero, q2, zero, q3, q0_l, q1_l, q2_l,
1429 VP9_FILTER8(p3_l, p2_l, p1_l, p0_l, q0_l, q1_l, q2_l, q3_l, p2_filt8_l,
1430 p1_filt8_l, p0_filt8_l, q0_filt8_l, q1_filt8_l, q2_filt8_l);
1433 PCKEV_B4_SH(p2_filt8_l, p2_filt8_r, p1_filt8_l, p1_filt8_r, p0_filt8_l,
1434 p0_filt8_r, q0_filt8_l, q0_filt8_r, p2_filt8_r, p1_filt8_r,
1435 p0_filt8_r, q0_filt8_r);
1436 PCKEV_B2_SH(q1_filt8_l, q1_filt8_r, q2_filt8_l, q2_filt8_r, q1_filt8_r,
1440 p2 = __msa_bmnz_v(p2, (v16u8) p2_filt8_r, flat);
1441 p1 = __msa_bmnz_v(p1_out, (v16u8) p1_filt8_r, flat);
1442 p0 = __msa_bmnz_v(p0_out, (v16u8) p0_filt8_r, flat);
1443 q0 = __msa_bmnz_v(q0_out, (v16u8) q0_filt8_r, flat);
1444 q1 = __msa_bmnz_v(q1_out, (v16u8) q1_filt8_r, flat);
1445 q2 = __msa_bmnz_v(q2, (v16u8) q2_filt8_r, flat);
1454 ST4x4_UB(vec3, vec3, 0, 1, 2, 3, src, pitch);
1457 ST4x4_UB(vec4, vec4, 0, 1, 2, 3, src, pitch);
1460 ST4x4_UB(vec6, vec6, 0, 1, 2, 3, src, pitch);
1463 ST4x4_UB(vec7, vec7, 0, 1, 2, 3, src, pitch);
1474 v16u8 p3, p2, p1, p0, q3, q2,
q1,
q0;
1475 v16u8 p1_out, p0_out, q0_out, q1_out;
1477 v16u8 row4, row5, row6, row7, row12, row13, row14, row15;
1478 v8u16 p3_r, p2_r, p1_r, p0_r, q0_r, q1_r, q2_r, q3_r;
1479 v8i16 p2_filt8_r, p1_filt8_r, p0_filt8_r;
1480 v8i16 q0_filt8_r, q1_filt8_r, q2_filt8_r;
1482 v8i16 vec0, vec1, vec2, vec3, vec4, vec5, vec6, vec7;
1486 LD_UB8(temp_src, pitch, p0, p1, p2, p3, row4, row5, row6, row7);
1487 temp_src += (8 * pitch);
1488 LD_UB8(temp_src, pitch, q3, q2, q1, q0, row12, row13, row14, row15);
1492 q3, q2, q1, q0, row12, row13, row14, row15,
1493 p3, p2, p1, p0, q0, q1, q2, q3);
1495 thresh = (v16u8) __msa_fill_b(thresh_ptr);
1496 vec0 = (v8i16) __msa_fill_b(thresh_ptr >> 8);
1497 thresh = (v16u8) __msa_ilvr_d((v2i64) vec0, (v2i64) thresh);
1499 b_limit = (v16u8) __msa_fill_b(b_limit_ptr);
1500 vec0 = (v8i16) __msa_fill_b(b_limit_ptr >> 8);
1501 b_limit = (v16u8) __msa_ilvr_d((v2i64) vec0, (v2i64) b_limit);
1503 limit = (v16u8) __msa_fill_b(limit_ptr);
1504 vec0 = (v8i16) __msa_fill_b(limit_ptr >> 8);
1505 limit = (v16u8) __msa_ilvr_d((v2i64) vec0, (v2i64) limit);
1508 LPF_MASK_HEV(p3, p2, p1, p0, q0, q1, q2, q3, limit, b_limit, thresh,
1511 VP9_FLAT4(p3, p2, p0, q0, q2, q3, flat);
1516 flat = (v16u8) __msa_ilvr_d((v2i64)
zero, (v2i64) flat);
1519 if (__msa_test_bz_v(flat)) {
1520 ILVR_B2_SH(p0_out, p1_out, q1_out, q0_out, vec0, vec1);
1522 ILVL_B2_SH(p0_out, p1_out, q1_out, q0_out, vec0, vec1);
1530 ILVR_B8_UH(zero, p3, zero, p2, zero, p1, zero, p0, zero, q0, zero, q1,
1531 zero, q2, zero, q3, p3_r, p2_r, p1_r, p0_r, q0_r, q1_r, q2_r,
1533 VP9_FILTER8(p3_r, p2_r, p1_r, p0_r, q0_r, q1_r, q2_r, q3_r, p2_filt8_r,
1534 p1_filt8_r, p0_filt8_r, q0_filt8_r, q1_filt8_r, q2_filt8_r);
1537 PCKEV_B4_SH(p2_filt8_r, p2_filt8_r, p1_filt8_r, p1_filt8_r,
1538 p0_filt8_r, p0_filt8_r, q0_filt8_r, q0_filt8_r,
1539 p2_filt8_r, p1_filt8_r, p0_filt8_r, q0_filt8_r);
1540 PCKEV_B2_SH(q1_filt8_r, q1_filt8_r, q2_filt8_r, q2_filt8_r,
1541 q1_filt8_r, q2_filt8_r);
1544 p2 = __msa_bmnz_v(p2, (v16u8) p2_filt8_r, flat);
1545 p1 = __msa_bmnz_v(p1_out, (v16u8) p1_filt8_r, flat);
1546 p0 = __msa_bmnz_v(p0_out, (v16u8) p0_filt8_r, flat);
1547 q0 = __msa_bmnz_v(q0_out, (v16u8) q0_filt8_r, flat);
1548 q1 = __msa_bmnz_v(q1_out, (v16u8) q1_filt8_r, flat);
1549 q2 = __msa_bmnz_v(q2, (v16u8) q2_filt8_r, flat);
1558 ST4x4_UB(vec3, vec3, 0, 1, 2, 3, src, pitch);
1561 ST4x4_UB(vec4, vec4, 0, 1, 2, 3, src, pitch);
1564 ST4x4_UB(vec6, vec6, 0, 1, 2, 3, src, pitch);
1567 ST4x4_UB(vec7, vec7, 0, 1, 2, 3, src, pitch);
1578 v16u8 p3, p2, p1, p0, q3, q2,
q1,
q0;
1579 v16u8 p1_out, p0_out, q0_out, q1_out;
1581 v16u8 row4, row5, row6, row7, row12, row13, row14, row15;
1582 v8u16 p3_l, p2_l, p1_l, p0_l, q0_l, q1_l, q2_l, q3_l;
1583 v8i16 p2_filt8_l, p1_filt8_l, p0_filt8_l;
1584 v8i16 q0_filt8_l, q1_filt8_l, q2_filt8_l;
1586 v8i16 vec0, vec1, vec2, vec3, vec4, vec5, vec6, vec7;
1590 LD_UB8(temp_src, pitch, p0, p1, p2, p3, row4, row5, row6, row7);
1591 temp_src += (8 * pitch);
1592 LD_UB8(temp_src, pitch, q3, q2, q1, q0, row12, row13, row14, row15);
1596 q3, q2, q1, q0, row12, row13, row14, row15,
1597 p3, p2, p1, p0, q0, q1, q2, q3);
1599 thresh = (v16u8) __msa_fill_b(thresh_ptr);
1600 vec0 = (v8i16) __msa_fill_b(thresh_ptr >> 8);
1601 thresh = (v16u8) __msa_ilvr_d((v2i64) vec0, (v2i64) thresh);
1603 b_limit = (v16u8) __msa_fill_b(b_limit_ptr);
1604 vec0 = (v8i16) __msa_fill_b(b_limit_ptr >> 8);
1605 b_limit = (v16u8) __msa_ilvr_d((v2i64) vec0, (v2i64) b_limit);
1607 limit = (v16u8) __msa_fill_b(limit_ptr);
1608 vec0 = (v8i16) __msa_fill_b(limit_ptr >> 8);
1609 limit = (v16u8) __msa_ilvr_d((v2i64) vec0, (v2i64) limit);
1612 LPF_MASK_HEV(p3, p2, p1, p0, q0, q1, q2, q3, limit, b_limit, thresh,
1615 VP9_FLAT4(p3, p2, p0, q0, q2, q3, flat);
1620 flat = (v16u8) __msa_insve_d((v2i64)
flat, 0, (v2i64) zero);
1623 if (__msa_test_bz_v(flat)) {
1624 ILVR_B2_SH(p0_out, p1_out, q1_out, q0_out, vec0, vec1);
1626 ILVL_B2_SH(p0_out, p1_out, q1_out, q0_out, vec0, vec1);
1634 ILVL_B4_UH(zero, p3, zero, p2, zero, p1, zero, p0, p3_l, p2_l, p1_l,
1636 ILVL_B4_UH(zero, q0, zero, q1, zero, q2, zero, q3, q0_l, q1_l, q2_l,
1639 VP9_FILTER8(p3_l, p2_l, p1_l, p0_l, q0_l, q1_l, q2_l, q3_l, p2_filt8_l,
1640 p1_filt8_l, p0_filt8_l, q0_filt8_l, q1_filt8_l, q2_filt8_l);
1643 PCKEV_B4_SH(p2_filt8_l, p2_filt8_l, p1_filt8_l, p1_filt8_l,
1644 p0_filt8_l, p0_filt8_l, q0_filt8_l, q0_filt8_l,
1645 p2_filt8_l, p1_filt8_l, p0_filt8_l, q0_filt8_l);
1646 PCKEV_B2_SH(q1_filt8_l, q1_filt8_l, q2_filt8_l, q2_filt8_l,
1647 q1_filt8_l, q2_filt8_l);
1650 p2 = __msa_bmnz_v(p2, (v16u8) p2_filt8_l, flat);
1651 p1 = __msa_bmnz_v(p1_out, (v16u8) p1_filt8_l, flat);
1652 p0 = __msa_bmnz_v(p0_out, (v16u8) p0_filt8_l, flat);
1653 q0 = __msa_bmnz_v(q0_out, (v16u8) q0_filt8_l, flat);
1654 q1 = __msa_bmnz_v(q1_out, (v16u8) q1_filt8_l, flat);
1655 q2 = __msa_bmnz_v(q2, (v16u8) q2_filt8_l, flat);
1664 ST4x4_UB(vec3, vec3, 0, 1, 2, 3, src, pitch);
1667 ST4x4_UB(vec4, vec4, 0, 1, 2, 3, src, pitch);
1670 ST4x4_UB(vec6, vec6, 0, 1, 2, 3, src, pitch);
1673 ST4x4_UB(vec7, vec7, 0, 1, 2, 3, src, pitch);
1681 v16u8 p7_org, p6_org, p5_org, p4_org, p3_org, p2_org, p1_org, p0_org;
1682 v16i8 tmp0, tmp1, tmp2, tmp3, tmp4, tmp5, tmp6, tmp7;
1683 v16u8 p7, p6, p5, p4, p3, p2, p1, p0,
q0,
q1, q2, q3, q4, q5, q6, q7;
1686 p7_org, p6_org, p5_org, p4_org, p3_org, p2_org, p1_org, p0_org);
1689 p0_org, p7, p6, p5, p4, p3, p2, p1, p0);
1691 ILVL_B4_SB(p5_org, p7_org, p4_org, p6_org, p1_org, p3_org, p0_org, p2_org,
1692 tmp0, tmp1, tmp2, tmp3);
1693 ILVR_B2_SB(tmp1, tmp0, tmp3, tmp2, tmp4, tmp6);
1694 ILVL_B2_SB(tmp1, tmp0, tmp3, tmp2, tmp5, tmp7);
1699 ST_UB8(p7, p6, p5, p4, p3, p2, p1, p0, output, out_pitch);
1700 output += (8 * out_pitch);
1701 ST_UB8(q0, q1, q2, q3, q4, q5, q6, q7, output, out_pitch);
1707 v16u8 p7_o, p6_o, p5_o, p4_o, p3_o, p2_o, p1_o, p0_o;
1708 v16u8 p7, p6, p5, p4, p3, p2, p1, p0,
q0,
q1, q2, q3, q4, q5, q6, q7;
1710 LD_UB8(input, in_pitch, p7, p6, p5, p4, p3, p2, p1, p0);
1711 LD_UB8(input + (8 * in_pitch), in_pitch, q0, q1, q2, q3, q4, q5, q6, q7);
1712 TRANSPOSE16x8_UB_UB(p7, p6, p5, p4, p3, p2, p1, p0, q0, q1, q2, q3, q4, q5,
1713 q6, q7, p7_o, p6_o, p5_o, p4_o, p3_o, p2_o, p1_o, p0_o);
1714 ST_UB8(p7_o, p6_o, p5_o, p4_o, p3_o, p2_o, p1_o, p0_o, output, out_pitch);
1720 v16u8 row0, row1, row2, row3, row4, row5, row6, row7;
1721 v16u8 row8, row9, row10, row11, row12, row13, row14, row15;
1722 v8i16 tmp0, tmp1, tmp4, tmp5, tmp6, tmp7;
1724 v16u8 p7, p6, p5, p4, p3, p2, p1, p0,
q0,
q1, q2, q3, q4, q5, q6, q7;
1726 LD_UB8(input, in_pitch, row0, row1, row2, row3, row4, row5, row6, row7);
1727 input += (8 * in_pitch);
1729 row8, row9, row10, row11, row12, row13, row14, row15);
1732 row8, row9, row10, row11, row12, row13, row14, row15,
1733 p7, p6, p5, p4, p3, p2, p1, p0);
1737 q7 = (v16u8) __msa_ilvod_d((v2i64) row8, (v2i64) row0);
1738 q6 = (v16u8) __msa_ilvod_d((v2i64) row9, (v2i64) row1);
1739 q5 = (v16u8) __msa_ilvod_d((v2i64) row10, (v2i64) row2);
1740 q4 = (v16u8) __msa_ilvod_d((v2i64) row11, (v2i64) row3);
1741 q3 = (v16u8) __msa_ilvod_d((v2i64) row12, (v2i64) row4);
1742 q2 = (v16u8) __msa_ilvod_d((v2i64) row13, (v2i64) row5);
1743 q1 = (v16u8) __msa_ilvod_d((v2i64) row14, (v2i64) row6);
1744 q0 = (v16u8) __msa_ilvod_d((v2i64) row15, (v2i64) row7);
1747 tmp4 = (v8i16) __msa_ilvod_b((v16i8) q6, (v16i8) q7);
1748 tmp5 = (v8i16) __msa_ilvod_b((v16i8) q4, (v16i8) q5);
1751 tmp6 = (v8i16) __msa_ilvod_b((v16i8) q2, (v16i8) q3);
1752 tmp7 = (v8i16) __msa_ilvod_b((v16i8)
q0, (v16i8) q1);
1755 q0 = (v16u8) __msa_ilvev_w(tmp3, tmp2);
1756 q4 = (v16u8) __msa_ilvod_w(tmp3, tmp2);
1758 tmp2 = (v4i32) __msa_ilvod_h(tmp1, tmp0);
1759 tmp3 = (v4i32) __msa_ilvod_h((v8i16) q7, (v8i16) q5);
1760 q2 = (v16u8) __msa_ilvev_w(tmp3, tmp2);
1761 q6 = (v16u8) __msa_ilvod_w(tmp3, tmp2);
1764 q1 = (v16u8) __msa_ilvev_w(tmp3, tmp2);
1765 q5 = (v16u8) __msa_ilvod_w(tmp3, tmp2);
1767 tmp2 = (v4i32) __msa_ilvod_h(tmp5, tmp4);
1768 tmp3 = (v4i32) __msa_ilvod_h(tmp7, tmp6);
1769 q3 = (v16u8) __msa_ilvev_w(tmp3, tmp2);
1770 q7 = (v16u8) __msa_ilvod_w(tmp3, tmp2);
1772 ST_UB8(p7, p6, p5, p4, p3, p2, p1, p0, output, out_pitch);
1773 output += (8 * out_pitch);
1774 ST_UB8(q0, q1, q2, q3, q4, q5, q6, q7, output, out_pitch);
1783 v16u8 p3, p2, p1, p0, q3, q2,
q1,
q0;
1784 v16u8 p2_out, p1_out, p0_out, q0_out, q1_out, q2_out;
1786 v8u16 p3_r, p2_r, p1_r, p0_r, q0_r, q1_r, q2_r, q3_r;
1787 v8i16 p2_filt8_r, p1_filt8_r, p0_filt8_r;
1788 v8i16 q0_filt8_r, q1_filt8_r, q2_filt8_r;
1790 v8i16 vec0, vec1, vec2, vec3;
1793 LD_UB8(src - (4 * 16), 16, p3, p2, p1, p0, q0, q1, q2, q3);
1795 thresh = (v16u8) __msa_fill_b(thresh_ptr);
1796 b_limit = (v16u8) __msa_fill_b(b_limit_ptr);
1797 limit = (v16u8) __msa_fill_b(limit_ptr);
1800 LPF_MASK_HEV(p3, p2, p1, p0, q0, q1, q2, q3, limit, b_limit, thresh,
1803 VP9_FLAT4(p3, p2, p0, q0, q2, q3, flat);
1808 flat = (v16u8) __msa_ilvr_d((v2i64)
zero, (v2i64) flat);
1811 if (__msa_test_bz_v(flat)) {
1812 ILVR_B2_SH(p0_out, p1_out, q1_out, q0_out, vec0, vec1);
1814 ST4x8_UB(vec2, vec3, (src_org - 2), pitch_org);
1817 ILVR_B8_UH(zero, p3, zero, p2, zero, p1, zero, p0, zero, q0, zero, q1,
1818 zero, q2, zero, q3, p3_r, p2_r, p1_r, p0_r, q0_r, q1_r, q2_r,
1820 VP9_FILTER8(p3_r, p2_r, p1_r, p0_r, q0_r, q1_r, q2_r, q3_r, p2_filt8_r,
1821 p1_filt8_r, p0_filt8_r, q0_filt8_r, q1_filt8_r, q2_filt8_r);
1824 p2_r = (v8u16) __msa_pckev_b((v16i8) p2_filt8_r, (v16i8) p2_filt8_r);
1825 p1_r = (v8u16) __msa_pckev_b((v16i8) p1_filt8_r, (v16i8) p1_filt8_r);
1826 p0_r = (v8u16) __msa_pckev_b((v16i8) p0_filt8_r, (v16i8) p0_filt8_r);
1827 q0_r = (v8u16) __msa_pckev_b((v16i8) q0_filt8_r, (v16i8) q0_filt8_r);
1828 q1_r = (v8u16) __msa_pckev_b((v16i8) q1_filt8_r, (v16i8) q1_filt8_r);
1829 q2_r = (v8u16) __msa_pckev_b((v16i8) q2_filt8_r, (v16i8) q2_filt8_r);
1832 p2_out = __msa_bmnz_v(p2, (v16u8) p2_r, flat);
1833 p1_out = __msa_bmnz_v(p1_out, (v16u8) p1_r, flat);
1834 p0_out = __msa_bmnz_v(p0_out, (v16u8) p0_r, flat);
1835 q0_out = __msa_bmnz_v(q0_out, (v16u8) q0_r, flat);
1836 q1_out = __msa_bmnz_v(q1_out, (v16u8) q1_r, flat);
1837 q2_out = __msa_bmnz_v(q2, (v16u8) q2_r, flat);
1839 ST_UB4(p2_out, p1_out, p0_out, q0_out, filter48, 16);
1840 filter48 += (4 * 16);
1841 ST_UB2(q1_out, q2_out, filter48, 16);
1842 filter48 += (2 * 16);
1843 ST_UB(flat, filter48);
1853 v16u8 filter8,
flat, flat2;
1854 v16u8 p7, p6, p5, p4, p3, p2, p1, p0,
q0,
q1, q2, q3, q4, q5, q6, q7;
1855 v8u16 p7_r_in, p6_r_in, p5_r_in, p4_r_in;
1856 v8u16 p3_r_in, p2_r_in, p1_r_in, p0_r_in;
1857 v8u16 q7_r_in, q6_r_in, q5_r_in, q4_r_in;
1858 v8u16 q3_r_in, q2_r_in, q1_r_in, q0_r_in;
1859 v8u16 tmp0_r, tmp1_r;
1862 flat =
LD_UB(filter48 + 6 * 16);
1864 LD_UB8((src - 8 * 16), 16, p7, p6, p5, p4, p3, p2, p1, p0);
1865 LD_UB8(src, 16, q0, q1, q2, q3, q4, q5, q6, q7);
1867 VP9_FLAT5(p7, p6, p5, p4, p0, q0, q4, q5, q6, q7, flat, flat2);
1870 if (__msa_test_bz_v(flat2)) {
1871 v8i16 vec0, vec1, vec2, vec3, vec4;
1873 LD_UB4(filter48, 16, p2, p1, p0, q0);
1874 LD_UB2(filter48 + 4 * 16, 16, q1, q2);
1878 vec2 = (v8i16) __msa_ilvr_b((v16i8) q2, (v16i8) q1);
1881 ST4x4_UB(vec3, vec3, 0, 1, 2, 3, src_org, pitch);
1882 ST2x4_UB(vec2, 0, (src_org + 4), pitch);
1883 src_org += (4 * pitch);
1884 ST4x4_UB(vec4, vec4, 0, 1, 2, 3, src_org, pitch);
1885 ST2x4_UB(vec2, 4, (src_org + 4), pitch);
1891 ILVR_B8_UH(zero, p7, zero, p6, zero, p5, zero, p4, zero, p3, zero, p2,
1892 zero, p1, zero, p0, p7_r_in, p6_r_in, p5_r_in, p4_r_in,
1893 p3_r_in, p2_r_in, p1_r_in, p0_r_in);
1894 q0_r_in = (v8u16) __msa_ilvr_b(zero, (v16i8)
q0);
1896 tmp0_r = p7_r_in << 3;
1900 tmp1_r = p6_r_in + p5_r_in;
1908 r_out = __msa_srari_h((v8i16) tmp1_r, 4);
1909 r_out = (v8i16) __msa_pckev_b((v16i8) r_out, (v16i8) r_out);
1910 p6 = __msa_bmnz_v(p6, (v16u8) r_out, flat2);
1915 q1_r_in = (v8u16) __msa_ilvr_b(zero, (v16i8)
q1);
1916 tmp0_r = p5_r_in - p6_r_in;
1920 r_out = __msa_srari_h((v8i16) tmp1_r, 4);
1921 r_out = (v8i16) __msa_pckev_b((v16i8) r_out, (v16i8) r_out);
1922 p5 = __msa_bmnz_v(p5, (v16u8) r_out, flat2);
1927 q2_r_in = (v8u16) __msa_ilvr_b(zero, (v16i8) q2);
1928 tmp0_r = p4_r_in - p5_r_in;
1932 r_out = __msa_srari_h((v8i16) tmp1_r, 4);
1933 r_out = (v8i16) __msa_pckev_b((v16i8) r_out, (v16i8) r_out);
1934 p4 = __msa_bmnz_v(p4, (v16u8) r_out, flat2);
1939 q3_r_in = (v8u16) __msa_ilvr_b(zero, (v16i8) q3);
1940 tmp0_r = p3_r_in - p4_r_in;
1944 r_out = __msa_srari_h((v8i16) tmp1_r, 4);
1945 r_out = (v8i16) __msa_pckev_b((v16i8) r_out, (v16i8) r_out);
1946 p3 = __msa_bmnz_v(p3, (v16u8) r_out, flat2);
1951 q4_r_in = (v8u16) __msa_ilvr_b(zero, (v16i8) q4);
1952 filter8 =
LD_UB(filter48);
1953 tmp0_r = p2_r_in - p3_r_in;
1957 r_out = __msa_srari_h((v8i16) tmp1_r, 4);
1958 r_out = (v8i16) __msa_pckev_b((v16i8) r_out, (v16i8) r_out);
1959 filter8 = __msa_bmnz_v(filter8, (v16u8) r_out, flat2);
1964 q5_r_in = (v8u16) __msa_ilvr_b(zero, (v16i8) q5);
1965 filter8 =
LD_UB(filter48 + 16);
1966 tmp0_r = p1_r_in - p2_r_in;
1970 r_out = __msa_srari_h((v8i16) tmp1_r, 4);
1971 r_out = (v8i16) __msa_pckev_b((v16i8) r_out, (v16i8) r_out);
1972 filter8 = __msa_bmnz_v(filter8, (v16u8) r_out, flat2);
1977 q6_r_in = (v8u16) __msa_ilvr_b(zero, (v16i8) q6);
1978 filter8 =
LD_UB(filter48 + 32);
1979 tmp0_r = p0_r_in - p1_r_in;
1983 r_out = __msa_srari_h((v8i16) tmp1_r, 4);
1984 r_out = (v8i16) __msa_pckev_b((v16i8) r_out, (v16i8) r_out);
1985 filter8 = __msa_bmnz_v(filter8, (v16u8) r_out, flat2);
1990 q7_r_in = (v8u16) __msa_ilvr_b(zero, (v16i8) q7);
1991 filter8 =
LD_UB(filter48 + 48);
1992 tmp0_r = q7_r_in - p0_r_in;
1996 r_out = __msa_srari_h((v8i16) tmp1_r, 4);
1997 r_out = (v8i16) __msa_pckev_b((v16i8) r_out, (v16i8) r_out);
1998 filter8 = __msa_bmnz_v(filter8, (v16u8) r_out, flat2);
2003 filter8 =
LD_UB(filter48 + 64);
2004 tmp0_r = q7_r_in - q0_r_in;
2008 r_out = __msa_srari_h((v8i16) tmp1_r, 4);
2009 r_out = (v8i16) __msa_pckev_b((v16i8) r_out, (v16i8) r_out);
2010 filter8 = __msa_bmnz_v(filter8, (v16u8) r_out, flat2);
2015 filter8 =
LD_UB(filter48 + 80);
2016 tmp0_r = q7_r_in - q1_r_in;
2020 r_out = __msa_srari_h((v8i16) tmp1_r, 4);
2021 r_out = (v8i16) __msa_pckev_b((v16i8) r_out, (v16i8) r_out);
2022 filter8 = __msa_bmnz_v(filter8, (v16u8) r_out, flat2);
2027 tmp0_r = q7_r_in - q2_r_in;
2031 r_out = __msa_srari_h((v8i16) tmp1_r, 4);
2032 r_out = (v8i16) __msa_pckev_b((v16i8) r_out, (v16i8) r_out);
2033 q3 = __msa_bmnz_v(q3, (v16u8) r_out, flat2);
2038 tmp0_r = q7_r_in - q3_r_in;
2042 r_out = __msa_srari_h((v8i16) tmp1_r, 4);
2043 r_out = (v8i16) __msa_pckev_b((v16i8) r_out, (v16i8) r_out);
2044 q4 = __msa_bmnz_v(q4, (v16u8) r_out, flat2);
2049 tmp0_r = q7_r_in - q4_r_in;
2053 r_out = __msa_srari_h((v8i16) tmp1_r, 4);
2054 r_out = (v8i16) __msa_pckev_b((v16i8) r_out, (v16i8) r_out);
2055 q5 = __msa_bmnz_v(q5, (v16u8) r_out, flat2);
2060 tmp0_r = q7_r_in - q5_r_in;
2064 r_out = __msa_srari_h((v8i16) tmp1_r, 4);
2065 r_out = (v8i16) __msa_pckev_b((v16i8) r_out, (v16i8) r_out);
2066 q6 = __msa_bmnz_v(q6, (v16u8) r_out, flat2);
2080 uint8_t *filter48 = &transposed_input[16 * 16];
2085 &filter48[0], src, pitch,
2086 b_limit_ptr, limit_ptr, thresh_ptr);
2088 if (0 == early_exit) {
2092 if (0 == early_exit) {
2099 uint8_t *src_org, ptrdiff_t pitch,
2104 v16u8 p3, p2, p1, p0, q3, q2,
q1,
q0;
2105 v16u8 p2_out, p1_out, p0_out, q0_out, q1_out, q2_out;
2107 v8u16 p3_r, p2_r, p1_r, p0_r, q0_r, q1_r, q2_r, q3_r;
2108 v8u16 p3_l, p2_l, p1_l, p0_l, q0_l, q1_l, q2_l, q3_l;
2109 v8i16 p2_filt8_r, p1_filt8_r, p0_filt8_r;
2110 v8i16 q0_filt8_r, q1_filt8_r, q2_filt8_r;
2111 v8i16 p2_filt8_l, p1_filt8_l, p0_filt8_l;
2112 v8i16 q0_filt8_l, q1_filt8_l, q2_filt8_l;
2114 v8i16 vec0, vec1, vec2, vec3, vec4, vec5;
2117 LD_UB8(src - (4 * 16), 16, p3, p2, p1, p0, q0, q1, q2, q3);
2119 thresh = (v16u8) __msa_fill_b(thresh_ptr);
2120 b_limit = (v16u8) __msa_fill_b(b_limit_ptr);
2121 limit = (v16u8) __msa_fill_b(limit_ptr);
2124 LPF_MASK_HEV(p3, p2, p1, p0, q0, q1, q2, q3, limit, b_limit, thresh,
2127 VP9_FLAT4(p3, p2, p0, q0, q2, q3, flat);
2133 if (__msa_test_bz_v(flat)) {
2134 ILVR_B2_SH(p0_out, p1_out, q1_out, q0_out, vec0, vec1);
2136 ILVL_B2_SH(p0_out, p1_out, q1_out, q0_out, vec0, vec1);
2140 ST4x8_UB(vec2, vec3, src_org, pitch);
2141 src_org += 8 * pitch;
2142 ST4x8_UB(vec4, vec5, src_org, pitch);
2146 ILVR_B8_UH(zero, p3, zero, p2, zero, p1, zero, p0, zero, q0, zero, q1,
2147 zero, q2, zero, q3, p3_r, p2_r, p1_r, p0_r, q0_r, q1_r, q2_r,
2149 VP9_FILTER8(p3_r, p2_r, p1_r, p0_r, q0_r, q1_r, q2_r, q3_r, p2_filt8_r,
2150 p1_filt8_r, p0_filt8_r, q0_filt8_r, q1_filt8_r, q2_filt8_r);
2151 ILVL_B4_UH(zero, p3, zero, p2, zero, p1, zero, p0, p3_l, p2_l, p1_l,
2153 ILVL_B4_UH(zero, q0, zero, q1, zero, q2, zero, q3, q0_l, q1_l, q2_l,
2155 VP9_FILTER8(p3_l, p2_l, p1_l, p0_l, q0_l, q1_l, q2_l, q3_l, p2_filt8_l,
2156 p1_filt8_l, p0_filt8_l, q0_filt8_l, q1_filt8_l, q2_filt8_l);
2159 PCKEV_B4_SH(p2_filt8_l, p2_filt8_r, p1_filt8_l, p1_filt8_r, p0_filt8_l,
2160 p0_filt8_r, q0_filt8_l, q0_filt8_r, p2_filt8_r, p1_filt8_r,
2161 p0_filt8_r, q0_filt8_r);
2162 PCKEV_B2_SH(q1_filt8_l, q1_filt8_r, q2_filt8_l, q2_filt8_r, q1_filt8_r,
2166 p2_out = __msa_bmnz_v(p2, (v16u8) p2_filt8_r, flat);
2167 p1_out = __msa_bmnz_v(p1_out, (v16u8) p1_filt8_r, flat);
2168 p0_out = __msa_bmnz_v(p0_out, (v16u8) p0_filt8_r, flat);
2169 q0_out = __msa_bmnz_v(q0_out, (v16u8) q0_filt8_r, flat);
2170 q1_out = __msa_bmnz_v(q1_out, (v16u8) q1_filt8_r, flat);
2171 q2_out = __msa_bmnz_v(q2, (v16u8) q2_filt8_r, flat);
2173 ST_UB4(p2_out, p1_out, p0_out, q0_out, filter48, 16);
2174 filter48 += (4 * 16);
2175 ST_UB2(q1_out, q2_out, filter48, 16);
2176 filter48 += (2 * 16);
2177 ST_UB(flat, filter48);
2186 v16u8
flat, flat2, filter8;
2188 v16u8 p7, p6, p5, p4, p3, p2, p1, p0,
q0,
q1, q2, q3, q4, q5, q6, q7;
2189 v8u16 p7_r_in, p6_r_in, p5_r_in, p4_r_in;
2190 v8u16 p3_r_in, p2_r_in, p1_r_in, p0_r_in;
2191 v8u16 q7_r_in, q6_r_in, q5_r_in, q4_r_in;
2192 v8u16 q3_r_in, q2_r_in, q1_r_in, q0_r_in;
2193 v8u16 p7_l_in, p6_l_in, p5_l_in, p4_l_in;
2194 v8u16 p3_l_in, p2_l_in, p1_l_in, p0_l_in;
2195 v8u16 q7_l_in, q6_l_in, q5_l_in, q4_l_in;
2196 v8u16 q3_l_in, q2_l_in, q1_l_in, q0_l_in;
2197 v8u16 tmp0_r, tmp1_r, tmp0_l, tmp1_l;
2200 flat =
LD_UB(filter48 + 6 * 16);
2202 LD_UB8((src - 8 * 16), 16, p7, p6, p5, p4, p3, p2, p1, p0);
2203 LD_UB8(src, 16, q0, q1, q2, q3, q4, q5, q6, q7);
2205 VP9_FLAT5(p7, p6, p5, p4, p0, q0, q4, q5, q6, q7, flat, flat2);
2208 if (__msa_test_bz_v(flat2)) {
2209 v8i16 vec0, vec1, vec2, vec3, vec4, vec5, vec6, vec7;
2211 LD_UB4(filter48, 16, p2, p1, p0, q0);
2212 LD_UB2(filter48 + 4 * 16, 16, q1, q2);
2221 ST4x4_UB(vec3, vec3, 0, 1, 2, 3, src_org, pitch);
2222 ST2x4_UB(vec2, 0, (src_org + 4), pitch);
2223 src_org += (4 * pitch);
2224 ST4x4_UB(vec4, vec4, 0, 1, 2, 3, src_org, pitch);
2225 ST2x4_UB(vec2, 4, (src_org + 4), pitch);
2226 src_org += (4 * pitch);
2227 ST4x4_UB(vec6, vec6, 0, 1, 2, 3, src_org, pitch);
2228 ST2x4_UB(vec5, 0, (src_org + 4), pitch);
2229 src_org += (4 * pitch);
2230 ST4x4_UB(vec7, vec7, 0, 1, 2, 3, src_org, pitch);
2231 ST2x4_UB(vec5, 4, (src_org + 4), pitch);
2237 ILVR_B8_UH(zero, p7, zero, p6, zero, p5, zero, p4, zero, p3, zero, p2,
2238 zero, p1, zero, p0, p7_r_in, p6_r_in, p5_r_in, p4_r_in,
2239 p3_r_in, p2_r_in, p1_r_in, p0_r_in);
2240 q0_r_in = (v8u16) __msa_ilvr_b(zero, (v16i8)
q0);
2242 tmp0_r = p7_r_in << 3;
2246 tmp1_r = p6_r_in + p5_r_in;
2253 r_out = __msa_srari_h((v8i16) tmp1_r, 4);
2255 ILVL_B4_UH(zero, p7, zero, p6, zero, p5, zero, p4, p7_l_in, p6_l_in,
2257 ILVL_B4_UH(zero, p3, zero, p2, zero, p1, zero, p0, p3_l_in, p2_l_in,
2259 q0_l_in = (v8u16) __msa_ilvl_b(zero, (v16i8)
q0);
2261 tmp0_l = p7_l_in << 3;
2265 tmp1_l = p6_l_in + p5_l_in;
2272 l_out = __msa_srari_h((v8i16) tmp1_l, 4);
2274 r_out = (v8i16) __msa_pckev_b((v16i8) l_out, (v16i8) r_out);
2275 p6 = __msa_bmnz_v(p6, (v16u8) r_out, flat2);
2280 q1_r_in = (v8u16) __msa_ilvr_b(zero, (v16i8)
q1);
2281 tmp0_r = p5_r_in - p6_r_in;
2285 r_out = __msa_srari_h((v8i16) tmp1_r, 4);
2286 q1_l_in = (v8u16) __msa_ilvl_b(zero, (v16i8)
q1);
2287 tmp0_l = p5_l_in - p6_l_in;
2291 l_out = __msa_srari_h((v8i16) tmp1_l, 4);
2292 r_out = (v8i16) __msa_pckev_b((v16i8) l_out, (v16i8) r_out);
2293 p5 = __msa_bmnz_v(p5, (v16u8) r_out, flat2);
2298 q2_r_in = (v8u16) __msa_ilvr_b(zero, (v16i8) q2);
2299 tmp0_r = p4_r_in - p5_r_in;
2303 r_out = __msa_srari_h((v8i16) tmp1_r, 4);
2304 q2_l_in = (v8u16) __msa_ilvl_b(zero, (v16i8) q2);
2305 tmp0_l = p4_l_in - p5_l_in;
2309 l_out = __msa_srari_h((v8i16) tmp1_l, 4);
2310 r_out = (v8i16) __msa_pckev_b((v16i8) l_out, (v16i8) r_out);
2311 p4 = __msa_bmnz_v(p4, (v16u8) r_out, flat2);
2316 q3_r_in = (v8u16) __msa_ilvr_b(zero, (v16i8) q3);
2317 tmp0_r = p3_r_in - p4_r_in;
2321 r_out = __msa_srari_h((v8i16) tmp1_r, 4);
2322 q3_l_in = (v8u16) __msa_ilvl_b(zero, (v16i8) q3);
2323 tmp0_l = p3_l_in - p4_l_in;
2327 l_out = __msa_srari_h((v8i16) tmp1_l, 4);
2328 r_out = (v8i16) __msa_pckev_b((v16i8) l_out, (v16i8) r_out);
2329 p3 = __msa_bmnz_v(p3, (v16u8) r_out, flat2);
2334 q4_r_in = (v8u16) __msa_ilvr_b(zero, (v16i8) q4);
2335 filter8 =
LD_UB(filter48);
2336 tmp0_r = p2_r_in - p3_r_in;
2340 r_out = __msa_srari_h((v8i16) tmp1_r, 4);
2341 q4_l_in = (v8u16) __msa_ilvl_b(zero, (v16i8) q4);
2342 tmp0_l = p2_l_in - p3_l_in;
2346 l_out = __msa_srari_h((v8i16) tmp1_l, 4);
2347 r_out = (v8i16) __msa_pckev_b((v16i8) l_out, (v16i8) r_out);
2348 filter8 = __msa_bmnz_v(filter8, (v16u8) r_out, flat2);
2349 ST_UB(filter8, src);
2353 q5_r_in = (v8u16) __msa_ilvr_b(zero, (v16i8) q5);
2354 filter8 =
LD_UB(filter48 + 16);
2355 tmp0_r = p1_r_in - p2_r_in;
2359 r_out = __msa_srari_h((v8i16) tmp1_r, 4);
2360 q5_l_in = (v8u16) __msa_ilvl_b(zero, (v16i8) q5);
2361 tmp0_l = p1_l_in - p2_l_in;
2365 l_out = __msa_srari_h((v8i16) (tmp1_l), 4);
2366 r_out = (v8i16) __msa_pckev_b((v16i8) l_out, (v16i8) r_out);
2367 filter8 = __msa_bmnz_v(filter8, (v16u8) r_out, flat2);
2368 ST_UB(filter8, src);
2372 q6_r_in = (v8u16) __msa_ilvr_b(zero, (v16i8) q6);
2373 filter8 =
LD_UB(filter48 + 32);
2374 tmp0_r = p0_r_in - p1_r_in;
2378 r_out = __msa_srari_h((v8i16) tmp1_r, 4);
2379 q6_l_in = (v8u16) __msa_ilvl_b(zero, (v16i8) q6);
2380 tmp0_l = p0_l_in - p1_l_in;
2384 l_out = __msa_srari_h((v8i16) tmp1_l, 4);
2385 r_out = (v8i16) __msa_pckev_b((v16i8) l_out, (v16i8) r_out);
2386 filter8 = __msa_bmnz_v(filter8, (v16u8) r_out, flat2);
2387 ST_UB(filter8, src);
2391 q7_r_in = (v8u16) __msa_ilvr_b(zero, (v16i8) q7);
2392 filter8 =
LD_UB(filter48 + 48);
2393 tmp0_r = q7_r_in - p0_r_in;
2397 r_out = __msa_srari_h((v8i16) tmp1_r, 4);
2398 q7_l_in = (v8u16) __msa_ilvl_b(zero, (v16i8) q7);
2399 tmp0_l = q7_l_in - p0_l_in;
2403 l_out = __msa_srari_h((v8i16) tmp1_l, 4);
2404 r_out = (v8i16) __msa_pckev_b((v16i8) l_out, (v16i8) r_out);
2405 filter8 = __msa_bmnz_v(filter8, (v16u8) r_out, flat2);
2406 ST_UB(filter8, src);
2410 filter8 =
LD_UB(filter48 + 64);
2411 tmp0_r = q7_r_in - q0_r_in;
2415 r_out = __msa_srari_h((v8i16) tmp1_r, 4);
2416 tmp0_l = q7_l_in - q0_l_in;
2420 l_out = __msa_srari_h((v8i16) tmp1_l, 4);
2421 r_out = (v8i16) __msa_pckev_b((v16i8) l_out, (v16i8) r_out);
2422 filter8 = __msa_bmnz_v(filter8, (v16u8) r_out, flat2);
2423 ST_UB(filter8, src);
2427 filter8 =
LD_UB(filter48 + 80);
2428 tmp0_r = q7_r_in - q1_r_in;
2432 r_out = __msa_srari_h((v8i16) tmp1_r, 4);
2433 tmp0_l = q7_l_in - q1_l_in;
2437 l_out = __msa_srari_h((v8i16) tmp1_l, 4);
2438 r_out = (v8i16) __msa_pckev_b((v16i8) l_out, (v16i8) r_out);
2439 filter8 = __msa_bmnz_v(filter8, (v16u8) r_out, flat2);
2440 ST_UB(filter8, src);
2444 tmp0_r = q7_r_in - q2_r_in;
2448 r_out = __msa_srari_h((v8i16) tmp1_r, 4);
2449 tmp0_l = q7_l_in - q2_l_in;
2453 l_out = __msa_srari_h((v8i16) tmp1_l, 4);
2454 r_out = (v8i16) __msa_pckev_b((v16i8) l_out, (v16i8) r_out);
2455 q3 = __msa_bmnz_v(q3, (v16u8) r_out, flat2);
2460 tmp0_r = q7_r_in - q3_r_in;
2464 r_out = __msa_srari_h((v8i16) tmp1_r, 4);
2465 tmp0_l = q7_l_in - q3_l_in;
2469 l_out = __msa_srari_h((v8i16) tmp1_l, 4);
2470 r_out = (v8i16) __msa_pckev_b((v16i8) l_out, (v16i8) r_out);
2471 q4 = __msa_bmnz_v(q4, (v16u8) r_out, flat2);
2476 tmp0_r = q7_r_in - q4_r_in;
2480 r_out = __msa_srari_h((v8i16) tmp1_r, 4);
2481 tmp0_l = q7_l_in - q4_l_in;
2485 l_out = __msa_srari_h((v8i16) tmp1_l, 4);
2486 r_out = (v8i16) __msa_pckev_b((v16i8) l_out, (v16i8) r_out);
2487 q5 = __msa_bmnz_v(q5, (v16u8) r_out, flat2);
2492 tmp0_r = q7_r_in - q5_r_in;
2496 r_out = __msa_srari_h((v8i16) tmp1_r, 4);
2497 tmp0_l = q7_l_in - q5_l_in;
2501 l_out = __msa_srari_h((v8i16) tmp1_l, 4);
2502 r_out = (v8i16) __msa_pckev_b((v16i8) l_out, (v16i8) r_out);
2503 q6 = __msa_bmnz_v(q6, (v16u8) r_out, flat2);
2517 uint8_t *filter48 = &transposed_input[16 * 16];
2522 &filter48[0], src, pitch,
2523 b_limit_ptr, limit_ptr, thresh_ptr);
2525 if (0 == early_exit) {
2529 if (0 == early_exit) {
void ff_loop_filter_h_8_8_msa(uint8_t *src, ptrdiff_t pitch, int32_t b_limit_ptr, int32_t limit_ptr, int32_t thresh_ptr)
static void vp9_transpose_8x16_to_16x8(uint8_t *input, int32_t in_pitch, uint8_t *output, int32_t out_pitch)
static void vp9_transpose_16x16(uint8_t *input, int32_t in_pitch, uint8_t *output, int32_t out_pitch)
void ff_loop_filter_v_44_16_msa(uint8_t *src, ptrdiff_t pitch, int32_t b_limit_ptr, int32_t limit_ptr, int32_t thresh_ptr)
void ff_loop_filter_v_4_8_msa(uint8_t *src, ptrdiff_t pitch, int32_t b_limit_ptr, int32_t limit_ptr, int32_t thresh_ptr)
#define VP9_FILTER8(p3_in, p2_in, p1_in, p0_in,q0_in, q1_in, q2_in, q3_in,p2_filt8_out, p1_filt8_out, p0_filt8_out,q0_filt8_out, q1_filt8_out, q2_filt8_out)
void ff_loop_filter_h_48_16_msa(uint8_t *src, ptrdiff_t pitch, int32_t b_limit_ptr, int32_t limit_ptr, int32_t thresh_ptr)
static const uint8_t q1[256]
#define ST4x4_UB(in0, in1, idx0, idx1, idx2, idx3, pdst, stride)
#define TRANSPOSE16x8_UB_UB(in0, in1, in2, in3, in4, in5, in6, in7,in8, in9, in10, in11, in12, in13, in14, in15,out0, out1, out2, out3, out4, out5, out6, out7)
#define VP9_FLAT4(p3_in, p2_in, p0_in, q0_in, q2_in, q3_in, flat_out)
#define SLDI_B4_0_UB(...)
static int32_t vp9_hz_lpf_t4_and_t8_16w(uint8_t *src, ptrdiff_t pitch, uint8_t *filter48, int32_t b_limit_ptr, int32_t limit_ptr, int32_t thresh_ptr)
static void vp9_hz_lpf_t16_16w(uint8_t *src, ptrdiff_t pitch, uint8_t *filter48)
static av_always_inline int hev(uint8_t *p, ptrdiff_t stride, int thresh)
static const uint16_t mask[17]
void ff_loop_filter_h_84_16_msa(uint8_t *src, ptrdiff_t pitch, int32_t b_limit_ptr, int32_t limit_ptr, int32_t thresh_ptr)
static int32_t vp9_vt_lpf_t4_and_t8_8w(uint8_t *src, uint8_t *filter48, uint8_t *src_org, int32_t pitch_org, int32_t b_limit_ptr, int32_t limit_ptr, int32_t thresh_ptr)
void ff_loop_filter_h_4_8_msa(uint8_t *src, ptrdiff_t pitch, int32_t b_limit_ptr, int32_t limit_ptr, int32_t thresh_ptr)
void ff_loop_filter_v_48_16_msa(uint8_t *src, ptrdiff_t pitch, int32_t b_limit_ptr, int32_t limit_ptr, int32_t thresh_ptr)
static const uint8_t q0[256]
void ff_loop_filter_v_16_16_msa(uint8_t *src, ptrdiff_t pitch, int32_t b_limit_ptr, int32_t limit_ptr, int32_t thresh_ptr)
#define VP9_FLAT5(p7_in, p6_in, p5_in, p4_in, p0_in, q0_in, q4_in,q5_in, q6_in, q7_in, flat_in, flat2_out)
void ff_loop_filter_h_16_8_msa(uint8_t *src, ptrdiff_t pitch, int32_t b_limit_ptr, int32_t limit_ptr, int32_t thresh_ptr)
static int32_t vp9_vt_lpf_t16_16w(uint8_t *src, uint8_t *src_org, ptrdiff_t pitch, uint8_t *filter48)
#define ST2x4_UB(in, stidx, pdst, stride)
#define LPF_MASK_HEV(p3_in, p2_in, p1_in, p0_in,q0_in, q1_in, q2_in, q3_in,limit_in, b_limit_in, thresh_in,hev_out, mask_out, flat_out)
static int32_t vp9_vt_lpf_t16_8w(uint8_t *src, uint8_t *src_org, ptrdiff_t pitch, uint8_t *filter48)
void ff_loop_filter_v_16_8_msa(uint8_t *src, ptrdiff_t pitch, int32_t b_limit_ptr, int32_t limit_ptr, int32_t thresh_ptr)
#define TRANSPOSE8x8_UB_UB(...)
void ff_loop_filter_h_44_16_msa(uint8_t *src, ptrdiff_t pitch, int32_t b_limit_ptr, int32_t limit_ptr, int32_t thresh_ptr)
static void vp9_transpose_16x8_to_8x16(uint8_t *input, int32_t in_pitch, uint8_t *output, int32_t out_pitch)
void ff_loop_filter_v_84_16_msa(uint8_t *src, ptrdiff_t pitch, int32_t b_limit_ptr, int32_t limit_ptr, int32_t thresh_ptr)
#define SD4(in0, in1, in2, in3, pdst, stride)
#define ST4x8_UB(in0, in1, pdst, stride)
#define ALLOC_ALIGNED(align)
void ff_loop_filter_v_88_16_msa(uint8_t *src, ptrdiff_t pitch, int32_t b_limit_ptr, int32_t limit_ptr, int32_t thresh_ptr)
void ff_loop_filter_v_8_8_msa(uint8_t *src, ptrdiff_t pitch, int32_t b_limit_ptr, int32_t limit_ptr, int32_t thresh_ptr)
void ff_loop_filter_h_16_16_msa(uint8_t *src, ptrdiff_t pitch, int32_t b_limit_ptr, int32_t limit_ptr, int32_t thresh_ptr)
#define VP9_LPF_FILTER4_4W(p1_in, p0_in, q0_in, q1_in, mask_in, hev_in,p1_out, p0_out, q0_out, q1_out)
#define ST8x1_UB(in, pdst)
void ff_loop_filter_h_88_16_msa(uint8_t *src, ptrdiff_t pitch, int32_t b_limit_ptr, int32_t limit_ptr, int32_t thresh_ptr)
static int32_t vp9_vt_lpf_t4_and_t8_16w(uint8_t *src, uint8_t *filter48, uint8_t *src_org, ptrdiff_t pitch, int32_t b_limit_ptr, int32_t limit_ptr, int32_t thresh_ptr)