/*
 * Copyright (c) 2015 - 2017 Manojkumar Bhosale (Manojkumar.Bhosale@imgtec.com)
 *
 * This file is part of FFmpeg.
 *
 * FFmpeg is free software; you can redistribute it and/or
 * modify it under the terms of the GNU Lesser General Public
 * License as published by the Free Software Foundation; either
 * version 2.1 of the License, or (at your option) any later version.
 *
 * FFmpeg is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
 * Lesser General Public License for more details.
 *
 * You should have received a copy of the GNU Lesser General Public
 * License along with FFmpeg; if not, write to the Free Software
 * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
 */

/* The MSA vector helper macros (LD_SB, ILVR_*, SLLI_*, PCKEV_*, ...) and the
 * MIPS HEVC DSP prototypes come from these headers (restored here; the
 * include lines were lost in extraction). */
#include "libavutil/mips/generic_macros_msa.h"
#include "libavcodec/mips/hevcdsp_mips.h"
#include "libavcodec/mips/hevc_macros_msa.h"

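/* Byte-shuffle control patterns for VSHF_B4_SB: each index pair (n, n + 1)
 * gathers two adjacent samples for the horizontal dot products. The first
 * 16 bytes serve the 8-pixel-wide cases; the second 16 serve the
 * 4-pixel-wide cases, where indices of 16 and up pull the window from the
 * other source register of the two-operand shuffle. */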
static const uint8_t ff_hevc_mask_arr[16 * 2] __attribute__((aligned(0x40))) = {
    /* 8 width cases */
    0, 1, 1, 2, 2, 3, 3, 4, 4, 5, 5, 6, 6, 7, 7, 8,
    /* 4 width cases */
    0, 1, 1, 2, 2, 3, 3, 4, 16, 17, 17, 18, 18, 19, 19, 20
};

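/* HEVC uni-directional weighting of two v8i16 vectors of 14-bit
 * intermediates: widen each sample to 32 bits, multiply by the weight,
 * shift right with rounding (SRAR) by rnd_w, add the offset, clip to
 * [0, 255] and pack the results back to halfwords. */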
#define HEVC_UNIW_RND_CLIP2_MAX_SATU_H(in0_h, in1_h, wgt_w, offset_h, rnd_w,  \
                                       out0_h, out1_h)                        \
{                                                                             \
    v4i32 in0_r_m, in0_l_m, in1_r_m, in1_l_m;                                 \
    v8i16 zero = { 0 };                                                       \
                                                                              \
    ILVRL_H2_SW(zero, in0_h, in0_r_m, in0_l_m);                               \
    ILVRL_H2_SW(zero, in1_h, in1_r_m, in1_l_m);                               \
    MUL4(in0_r_m, wgt_w, in0_l_m, wgt_w, in1_r_m, wgt_w, in1_l_m, wgt_w,      \
         in0_r_m, in0_l_m, in1_r_m, in1_l_m);                                 \
    SRAR_W4_SW(in0_r_m, in0_l_m, in1_r_m, in1_l_m, rnd_w);                    \
    ADD4(in0_r_m, offset_h, in0_l_m, offset_h, in1_r_m, offset_h, in1_l_m,    \
         offset_h, in0_r_m, in0_l_m, in1_r_m, in1_l_m);                       \
    CLIP_SW4_0_255(in0_r_m, in0_l_m, in1_r_m, in1_l_m);                       \
    PCKEV_H2_SH(in0_l_m, in0_r_m, in1_l_m, in1_r_m, out0_h, out1_h);          \
}

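/* Four-vector variant, expressed as two passes of the two-vector macro. */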
#define HEVC_UNIW_RND_CLIP4_MAX_SATU_H(in0_h, in1_h, in2_h, in3_h, wgt_w,  \
                                       offset_h, rnd_w, out0_h, out1_h,    \
                                       out2_h, out3_h)                     \
{                                                                          \
    HEVC_UNIW_RND_CLIP2_MAX_SATU_H(in0_h, in1_h, wgt_w, offset_h, rnd_w,   \
                                   out0_h, out1_h);                        \
    HEVC_UNIW_RND_CLIP2_MAX_SATU_H(in2_h, in3_h, wgt_w, offset_h, rnd_w,   \
                                   out2_h, out3_h);                        \
}

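/* 8-tap filtering of four byte vectors into two v4i32 accumulators: each
 * input is zero-extended to halfwords (a variable named 'zero' must be
 * declared in the enclosing scope) and folded in with one dot product
 * plus three dot-product-add steps, one per filter-coefficient pair. */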
#define HEVC_FILT_8TAP_4W_SH(in0, in1, in2, in3, filt0, filt1,  \
                             filt2, filt3, dst0, dst1)          \
{                                                               \
    v8i16 tmp0, tmp1, tmp2, tmp3, tmp4, tmp5, tmp6, tmp7;       \
    ILVRL_B2_SH(zero, in0, tmp0, tmp4);                         \
    ILVRL_B2_SH(zero, in1, tmp1, tmp5);                         \
    ILVRL_B2_SH(zero, in2, tmp2, tmp6);                         \
    ILVRL_B2_SH(zero, in3, tmp3, tmp7);                         \
    dst0 = __msa_dotp_s_w((v8i16) tmp0, (v8i16) filt0);         \
    dst1 = __msa_dotp_s_w((v8i16) tmp4, (v8i16) filt0);         \
    DPADD_SH2_SW(tmp1, tmp5, filt1, filt1, dst0, dst1);         \
    DPADD_SH2_SW(tmp2, tmp6, filt2, filt2, dst0, dst1);         \
    DPADD_SH2_SW(tmp3, tmp7, filt3, filt3, dst0, dst1);         \
}

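/* Weighted copy (no filtering) for 4-pixel-wide blocks. Samples are
 * shifted up by 6 into the HEVC 14-bit intermediate range, then weighted,
 * rounded, offset and clipped. Heights 2 and 4 are unrolled; larger
 * heights run in multiples of 8 rows. */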
static void hevc_uniwgt_copy_4w_msa(const uint8_t *src,
                                    int32_t src_stride,
                                    uint8_t *dst,
                                    int32_t dst_stride,
                                    int32_t height,
                                    int32_t weight,
                                    int32_t offset,
                                    int32_t rnd_val)
{
    uint32_t loop_cnt, tp0, tp1, tp2, tp3;
    v16i8 zero = { 0 };
    v16u8 out0, out1;
    v16i8 src0 = { 0 }, src1 = { 0 };
    v8i16 dst0, dst1, dst2, dst3, offset_vec;
    v4i32 weight_vec, rnd_vec;

    weight_vec = __msa_fill_w(weight);
    offset_vec = __msa_fill_w(offset);
    rnd_vec = __msa_fill_w(rnd_val);

    if (2 == height) {
        v4i32 dst0_r, dst0_l;

        LW2(src, src_stride, tp0, tp1);
        INSERT_W2_SB(tp0, tp1, src0);
        dst0 = (v8i16) __msa_ilvr_b(zero, src0);
        dst0 <<= 6;

        ILVRL_H2_SW(zero, dst0, dst0_r, dst0_l);
        DOTP_SH2_SW(dst0_r, dst0_l, weight_vec, weight_vec, dst0_r, dst0_l);
        SRAR_W2_SW(dst0_r, dst0_l, rnd_vec);
        dst0_r += offset_vec;
        dst0_l += offset_vec;
        CLIP_SW2_0_255(dst0_r, dst0_l);
        dst0 = __msa_pckev_h((v8i16) dst0_l, (v8i16) dst0_r);
        out0 = (v16u8) __msa_pckev_b((v16i8) dst0, (v16i8) dst0);
        ST_W2(out0, 0, 1, dst, dst_stride);
    } else if (4 == height) {
        LW4(src, src_stride, tp0, tp1, tp2, tp3);
        INSERT_W4_SB(tp0, tp1, tp2, tp3, src0);
        ILVRL_B2_SH(zero, src0, dst0, dst1);
        SLLI_2V(dst0, dst1, 6);
        HEVC_UNIW_RND_CLIP2_MAX_SATU_H(dst0, dst1, weight_vec, offset_vec,
                                       rnd_vec, dst0, dst1);
        out0 = (v16u8) __msa_pckev_b((v16i8) dst1, (v16i8) dst0);
        ST_W4(out0, 0, 1, 2, 3, dst, dst_stride);
    } else if (0 == (height % 8)) {
        for (loop_cnt = (height >> 3); loop_cnt--;) {
            LW4(src, src_stride, tp0, tp1, tp2, tp3);
            src += 4 * src_stride;
            INSERT_W4_SB(tp0, tp1, tp2, tp3, src0);
            LW4(src, src_stride, tp0, tp1, tp2, tp3);
            src += 4 * src_stride;
            INSERT_W4_SB(tp0, tp1, tp2, tp3, src1);
            ILVRL_B2_SH(zero, src0, dst0, dst1);
            ILVRL_B2_SH(zero, src1, dst2, dst3);
            SLLI_4V(dst0, dst1, dst2, dst3, 6);
            HEVC_UNIW_RND_CLIP4_MAX_SATU_H(dst0, dst1, dst2, dst3, weight_vec,
                                           offset_vec, rnd_vec, dst0, dst1,
                                           dst2, dst3);
            PCKEV_B2_UB(dst1, dst0, dst3, dst2, out0, out1);
            ST_W8(out0, out1, 0, 1, 2, 3, 0, 1, 2, 3, dst, dst_stride);
            dst += 8 * dst_stride;
        }
    }
}

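/* Weighted copy for 6-pixel-wide blocks: each output row is stored as one
 * word plus one halfword. Eight rows per main-loop iteration, with a tail
 * for a 2-, 4- or 6-row remainder. */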
static void hevc_uniwgt_copy_6w_msa(const uint8_t *src,
                                    int32_t src_stride,
                                    uint8_t *dst,
                                    int32_t dst_stride,
                                    int32_t height,
                                    int32_t weight,
                                    int32_t offset,
                                    int32_t rnd_val)
{
    uint32_t loop_cnt;
    int32_t res = height & 0x07;
    uint64_t tp0, tp1, tp2, tp3;
    v16i8 zero = { 0 };
    v16u8 out0, out1, out2, out3;
    v16i8 src0, src1, src2, src3;
    v8i16 dst0, dst1, dst2, dst3, dst4, dst5, dst6, dst7, offset_vec;
    v4i32 weight_vec, rnd_vec;

    weight_vec = __msa_fill_w(weight);
    offset_vec = __msa_fill_w(offset);
    rnd_vec = __msa_fill_w(rnd_val);

    for (loop_cnt = (height >> 3); loop_cnt--;) {
        LD4(src, src_stride, tp0, tp1, tp2, tp3);
        src += (4 * src_stride);
        INSERT_D2_SB(tp0, tp1, src0);
        INSERT_D2_SB(tp2, tp3, src1);
        LD4(src, src_stride, tp0, tp1, tp2, tp3);
        src += (4 * src_stride);
        INSERT_D2_SB(tp0, tp1, src2);
        INSERT_D2_SB(tp2, tp3, src3);

        ILVRL_B2_SH(zero, src0, dst0, dst1);
        ILVRL_B2_SH(zero, src1, dst2, dst3);
        ILVRL_B2_SH(zero, src2, dst4, dst5);
        ILVRL_B2_SH(zero, src3, dst6, dst7);

        SLLI_4V(dst0, dst1, dst2, dst3, 6);
        SLLI_4V(dst4, dst5, dst6, dst7, 6);

        HEVC_UNIW_RND_CLIP4_MAX_SATU_H(dst0, dst1, dst2, dst3, weight_vec,
                                       offset_vec, rnd_vec, dst0, dst1, dst2,
                                       dst3);
        HEVC_UNIW_RND_CLIP4_MAX_SATU_H(dst4, dst5, dst6, dst7, weight_vec,
                                       offset_vec, rnd_vec, dst4, dst5, dst6,
                                       dst7);
        PCKEV_B2_UB(dst1, dst0, dst3, dst2, out0, out1);
        PCKEV_B2_UB(dst5, dst4, dst7, dst6, out2, out3);

        ST_W2(out0, 0, 2, dst, dst_stride);
        ST_H2(out0, 2, 6, dst + 4, dst_stride);
        ST_W2(out1, 0, 2, dst + 2 * dst_stride, dst_stride);
        ST_H2(out1, 2, 6, dst + 2 * dst_stride + 4, dst_stride);
        dst += (4 * dst_stride);
        ST_W2(out2, 0, 2, dst, dst_stride);
        ST_H2(out2, 2, 6, dst + 4, dst_stride);
        ST_W2(out3, 0, 2, dst + 2 * dst_stride, dst_stride);
        ST_H2(out3, 2, 6, dst + 2 * dst_stride + 4, dst_stride);
        dst += (4 * dst_stride);
    }
    if (res) {
        LD4(src, src_stride, tp0, tp1, tp2, tp3);
        src += (4 * src_stride);
        INSERT_D2_SB(tp0, tp1, src0);
        INSERT_D2_SB(tp2, tp3, src1);
        LD4(src, src_stride, tp0, tp1, tp2, tp3);
        src += (4 * src_stride);
        INSERT_D2_SB(tp0, tp1, src2);
        INSERT_D2_SB(tp2, tp3, src3);

        ILVRL_B2_SH(zero, src0, dst0, dst1);
        ILVRL_B2_SH(zero, src1, dst2, dst3);
        ILVRL_B2_SH(zero, src2, dst4, dst5);
        ILVRL_B2_SH(zero, src3, dst6, dst7);

        SLLI_4V(dst0, dst1, dst2, dst3, 6);
        SLLI_4V(dst4, dst5, dst6, dst7, 6);

        HEVC_UNIW_RND_CLIP4_MAX_SATU_H(dst0, dst1, dst2, dst3, weight_vec,
                                       offset_vec, rnd_vec, dst0, dst1, dst2,
                                       dst3);
        HEVC_UNIW_RND_CLIP4_MAX_SATU_H(dst4, dst5, dst6, dst7, weight_vec,
                                       offset_vec, rnd_vec, dst4, dst5, dst6,
                                       dst7);
        PCKEV_B2_UB(dst1, dst0, dst3, dst2, out0, out1);
        PCKEV_B2_UB(dst5, dst4, dst7, dst6, out2, out3);

        if (res == 2) {
            ST_W2(out0, 0, 2, dst, dst_stride);
            ST_H2(out0, 2, 6, dst + 4, dst_stride);
        } else if (res == 4) {
            ST_W2(out0, 0, 2, dst, dst_stride);
            ST_H2(out0, 2, 6, dst + 4, dst_stride);
            ST_W2(out1, 0, 2, dst + 2 * dst_stride, dst_stride);
            ST_H2(out1, 2, 6, dst + 2 * dst_stride + 4, dst_stride);
        } else {
            ST_W2(out0, 0, 2, dst, dst_stride);
            ST_H2(out0, 2, 6, dst + 4, dst_stride);
            ST_W2(out1, 0, 2, dst + 2 * dst_stride, dst_stride);
            ST_H2(out1, 2, 6, dst + 2 * dst_stride + 4, dst_stride);
            dst += (4 * dst_stride);
            ST_W2(out2, 0, 2, dst, dst_stride);
            ST_H2(out2, 2, 6, dst + 4, dst_stride);
        }
    }
}

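/* Weighted copy for 8-pixel-wide blocks; heights 2, 4 and 6 are
 * special-cased, everything else runs 8 rows at a time. */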
static void hevc_uniwgt_copy_8w_msa(const uint8_t *src,
                                    int32_t src_stride,
                                    uint8_t *dst,
                                    int32_t dst_stride,
                                    int32_t height,
                                    int32_t weight,
                                    int32_t offset,
                                    int32_t rnd_val)
{
    uint32_t loop_cnt;
    uint64_t tp0, tp1, tp2, tp3;
    v16i8 src0 = { 0 }, src1 = { 0 }, src2 = { 0 }, src3 = { 0 };
    v16i8 zero = { 0 };
    v16u8 out0, out1, out2, out3;
    v8i16 dst0, dst1, dst2, dst3, dst4, dst5, dst6, dst7, offset_vec;
    v4i32 weight_vec, rnd_vec;

    weight_vec = __msa_fill_w(weight);
    offset_vec = __msa_fill_w(offset);
    rnd_vec = __msa_fill_w(rnd_val);

    if (2 == height) {
        LD2(src, src_stride, tp0, tp1);
        INSERT_D2_SB(tp0, tp1, src0);
        ILVRL_B2_SH(zero, src0, dst0, dst1);
        SLLI_2V(dst0, dst1, 6);
        HEVC_UNIW_RND_CLIP2_MAX_SATU_H(dst0, dst1, weight_vec, offset_vec,
                                       rnd_vec, dst0, dst1);
        out0 = (v16u8) __msa_pckev_b((v16i8) dst1, (v16i8) dst0);
        ST_D2(out0, 0, 1, dst, dst_stride);
    } else if (4 == height) {
        LD4(src, src_stride, tp0, tp1, tp2, tp3);
        INSERT_D2_SB(tp0, tp1, src0);
        INSERT_D2_SB(tp2, tp3, src1);
        ILVRL_B2_SH(zero, src0, dst0, dst1);
        ILVRL_B2_SH(zero, src1, dst2, dst3);
        SLLI_4V(dst0, dst1, dst2, dst3, 6);
        HEVC_UNIW_RND_CLIP4_MAX_SATU_H(dst0, dst1, dst2, dst3, weight_vec,
                                       offset_vec, rnd_vec, dst0, dst1, dst2,
                                       dst3);
        PCKEV_B2_UB(dst1, dst0, dst3, dst2, out0, out1);
        ST_D4(out0, out1, 0, 1, 0, 1, dst, dst_stride);
    } else if (6 == height) {
        LD4(src, src_stride, tp0, tp1, tp2, tp3);
        src += 4 * src_stride;
        INSERT_D2_SB(tp0, tp1, src0);
        INSERT_D2_SB(tp2, tp3, src1);
        LD2(src, src_stride, tp0, tp1);
        INSERT_D2_SB(tp0, tp1, src2);
        ILVRL_B2_SH(zero, src0, dst0, dst1);
        ILVRL_B2_SH(zero, src1, dst2, dst3);
        ILVRL_B2_SH(zero, src2, dst4, dst5);
        SLLI_4V(dst0, dst1, dst2, dst3, 6);
        SLLI_2V(dst4, dst5, 6);
        HEVC_UNIW_RND_CLIP4_MAX_SATU_H(dst0, dst1, dst2, dst3, weight_vec,
                                       offset_vec, rnd_vec, dst0, dst1, dst2,
                                       dst3);
        HEVC_UNIW_RND_CLIP2_MAX_SATU_H(dst4, dst5, weight_vec, offset_vec,
                                       rnd_vec, dst4, dst5);
        PCKEV_B3_UB(dst1, dst0, dst3, dst2, dst5, dst4, out0, out1, out2);
        ST_D4(out0, out1, 0, 1, 0, 1, dst, dst_stride);
        ST_D2(out2, 0, 1, dst + 4 * dst_stride, dst_stride);
    } else if (0 == height % 8) {
        for (loop_cnt = (height >> 3); loop_cnt--;) {
            LD4(src, src_stride, tp0, tp1, tp2, tp3);
            src += 4 * src_stride;
            INSERT_D2_SB(tp0, tp1, src0);
            INSERT_D2_SB(tp2, tp3, src1);
            LD4(src, src_stride, tp0, tp1, tp2, tp3);
            src += 4 * src_stride;
            INSERT_D2_SB(tp0, tp1, src2);
            INSERT_D2_SB(tp2, tp3, src3);

            ILVRL_B2_SH(zero, src0, dst0, dst1);
            ILVRL_B2_SH(zero, src1, dst2, dst3);
            ILVRL_B2_SH(zero, src2, dst4, dst5);
            ILVRL_B2_SH(zero, src3, dst6, dst7);
            SLLI_4V(dst0, dst1, dst2, dst3, 6);
            SLLI_4V(dst4, dst5, dst6, dst7, 6);
            HEVC_UNIW_RND_CLIP4_MAX_SATU_H(dst0, dst1, dst2, dst3, weight_vec,
                                           offset_vec, rnd_vec, dst0, dst1,
                                           dst2, dst3);
            HEVC_UNIW_RND_CLIP4_MAX_SATU_H(dst4, dst5, dst6, dst7, weight_vec,
                                           offset_vec, rnd_vec, dst4, dst5,
                                           dst6, dst7);
            PCKEV_B2_UB(dst1, dst0, dst3, dst2, out0, out1);
            PCKEV_B2_UB(dst5, dst4, dst7, dst6, out2, out3);
            ST_D8(out0, out1, out2, out3, 0, 1, 0, 1, 0, 1, 0, 1,
                  dst, dst_stride);
            dst += (8 * dst_stride);
        }
    }
}

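/* Weighted copy for 12-pixel-wide blocks, handled as an 8-wide and a
 * 4-wide column per row. The height argument is effectively unused: the
 * hardcoded 4 iterations of 4 rows each always process 16 rows. */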
static void hevc_uniwgt_copy_12w_msa(const uint8_t *src,
                                     int32_t src_stride,
                                     uint8_t *dst,
                                     int32_t dst_stride,
                                     int32_t height,
                                     int32_t weight,
                                     int32_t offset,
                                     int32_t rnd_val)
{
    uint32_t loop_cnt;
    v16u8 out0, out1, out2;
    v16i8 src0, src1, src2, src3;
    v8i16 dst0, dst1, dst2, dst3, dst4, dst5;
    v8i16 offset_vec;
    v16i8 zero = { 0 };
    v4i32 weight_vec, rnd_vec;

    weight_vec = __msa_fill_w(weight);
    offset_vec = __msa_fill_w(offset);
    rnd_vec = __msa_fill_w(rnd_val);

    for (loop_cnt = 4; loop_cnt--;) {
        LD_SB4(src, src_stride, src0, src1, src2, src3);
        src += (4 * src_stride);
        ILVR_B4_SH(zero, src0, zero, src1, zero, src2, zero, src3,
                   dst0, dst1, dst2, dst3);

        ILVL_W2_SB(src1, src0, src3, src2, src0, src1);
        ILVR_B2_SH(zero, src0, zero, src1, dst4, dst5);
        SLLI_4V(dst0, dst1, dst2, dst3, 6);
        SLLI_2V(dst4, dst5, 6);
        HEVC_UNIW_RND_CLIP4_MAX_SATU_H(dst0, dst1, dst2, dst3, weight_vec,
                                       offset_vec, rnd_vec, dst0, dst1, dst2,
                                       dst3);
        HEVC_UNIW_RND_CLIP2_MAX_SATU_H(dst4, dst5, weight_vec, offset_vec,
                                       rnd_vec, dst4, dst5);

        PCKEV_B3_UB(dst1, dst0, dst3, dst2, dst5, dst4, out0, out1, out2);
        ST_D4(out0, out1, 0, 1, 0, 1, dst, dst_stride);
        ST_W4(out2, 0, 1, 2, 3, dst + 8, dst_stride);
        dst += (4 * dst_stride);
    }
}

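/* Weighted copy for 16-pixel-wide blocks, four rows (four full vectors)
 * per iteration. */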
static void hevc_uniwgt_copy_16w_msa(const uint8_t *src,
                                     int32_t src_stride,
                                     uint8_t *dst,
                                     int32_t dst_stride,
                                     int32_t height,
                                     int32_t weight,
                                     int32_t offset,
                                     int32_t rnd_val)
{
    uint32_t loop_cnt;
    v16u8 out0, out1, out2, out3;
    v16i8 src0, src1, src2, src3;
    v16i8 zero = { 0 };
    v8i16 dst0, dst1, dst2, dst3, dst4, dst5, dst6, dst7, offset_vec;
    v4i32 weight_vec, rnd_vec;

    weight_vec = __msa_fill_w(weight);
    offset_vec = __msa_fill_w(offset);
    rnd_vec = __msa_fill_w(rnd_val);

    for (loop_cnt = height >> 2; loop_cnt--;) {
        LD_SB4(src, src_stride, src0, src1, src2, src3);
        src += (4 * src_stride);
        ILVRL_B2_SH(zero, src0, dst0, dst1);
        ILVRL_B2_SH(zero, src1, dst2, dst3);
        ILVRL_B2_SH(zero, src2, dst4, dst5);
        ILVRL_B2_SH(zero, src3, dst6, dst7);
        SLLI_4V(dst0, dst1, dst2, dst3, 6);
        SLLI_4V(dst4, dst5, dst6, dst7, 6);
        HEVC_UNIW_RND_CLIP4_MAX_SATU_H(dst0, dst1, dst2, dst3, weight_vec,
                                       offset_vec, rnd_vec, dst0, dst1, dst2,
                                       dst3);
        HEVC_UNIW_RND_CLIP4_MAX_SATU_H(dst4, dst5, dst6, dst7, weight_vec,
                                       offset_vec, rnd_vec, dst4, dst5, dst6,
                                       dst7);
        PCKEV_B2_UB(dst1, dst0, dst3, dst2, out0, out1);
        PCKEV_B2_UB(dst5, dst4, dst7, dst6, out2, out3);
        ST_UB4(out0, out1, out2, out3, dst, dst_stride);
        dst += (4 * dst_stride);
    }
}

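/* Weighted copy for 24-pixel-wide blocks: a 16-wide vector store plus an
 * 8-wide doubleword store per row, four rows per iteration. */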
static void hevc_uniwgt_copy_24w_msa(const uint8_t *src,
                                     int32_t src_stride,
                                     uint8_t *dst,
                                     int32_t dst_stride,
                                     int32_t height,
                                     int32_t weight,
                                     int32_t offset,
                                     int32_t rnd_val)
{
    uint32_t loop_cnt;
    v16u8 out0, out1, out2, out3, out4, out5;
    v16i8 src0, src1, src2, src3, src4, src5, src6, src7;
    v16i8 zero = { 0 };
    v8i16 dst0, dst1, dst2, dst3, dst4, dst5, dst6, dst7, offset_vec;
    v8i16 dst8, dst9, dst10, dst11;
    v4i32 weight_vec, rnd_vec;

    weight_vec = __msa_fill_w(weight);
    offset_vec = __msa_fill_w(offset);
    rnd_vec = __msa_fill_w(rnd_val);

    for (loop_cnt = (height >> 2); loop_cnt--;) {
        LD_SB4(src, src_stride, src0, src1, src4, src5);
        LD_SB4(src + 16, src_stride, src2, src3, src6, src7);
        src += (4 * src_stride);

        ILVRL_B2_SH(zero, src0, dst0, dst1);
        ILVRL_B2_SH(zero, src1, dst2, dst3);
        ILVR_B2_SH(zero, src2, zero, src3, dst4, dst5);
        ILVRL_B2_SH(zero, src4, dst6, dst7);
        ILVRL_B2_SH(zero, src5, dst8, dst9);
        ILVR_B2_SH(zero, src6, zero, src7, dst10, dst11);
        SLLI_4V(dst0, dst1, dst2, dst3, 6);
        SLLI_4V(dst4, dst5, dst6, dst7, 6);
        SLLI_4V(dst8, dst9, dst10, dst11, 6);
        HEVC_UNIW_RND_CLIP4_MAX_SATU_H(dst0, dst1, dst2, dst3, weight_vec,
                                       offset_vec, rnd_vec, dst0, dst1, dst2,
                                       dst3);
        HEVC_UNIW_RND_CLIP4_MAX_SATU_H(dst4, dst5, dst6, dst7, weight_vec,
                                       offset_vec, rnd_vec, dst4, dst5, dst6,
                                       dst7);
        HEVC_UNIW_RND_CLIP4_MAX_SATU_H(dst8, dst9, dst10, dst11, weight_vec,
                                       offset_vec, rnd_vec, dst8, dst9, dst10,
                                       dst11);
        PCKEV_B3_UB(dst1, dst0, dst3, dst2, dst5, dst4, out0, out1, out2);
        PCKEV_B3_UB(dst7, dst6, dst9, dst8, dst11, dst10, out3, out4, out5);
        ST_UB4(out0, out1, out3, out4, dst, dst_stride);
        ST_D4(out2, out5, 0, 1, 0, 1, dst + 16, dst_stride);
        dst += (4 * dst_stride);
    }
}

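/* Weighted copy for 32-pixel-wide blocks, two rows per iteration. */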
static void hevc_uniwgt_copy_32w_msa(const uint8_t *src,
                                     int32_t src_stride,
                                     uint8_t *dst,
                                     int32_t dst_stride,
                                     int32_t height,
                                     int32_t weight,
                                     int32_t offset,
                                     int32_t rnd_val)
{
    uint32_t loop_cnt;
    v16u8 out0, out1, out2, out3;
    v16i8 src0, src1, src2, src3;
    v16i8 zero = { 0 };
    v8i16 dst0, dst1, dst2, dst3, dst4, dst5, dst6, dst7, offset_vec;
    v4i32 weight_vec, rnd_vec;

    weight_vec = __msa_fill_w(weight);
    offset_vec = __msa_fill_w(offset);
    rnd_vec = __msa_fill_w(rnd_val);

    for (loop_cnt = (height >> 1); loop_cnt--;) {
        LD_SB2(src, src_stride, src0, src1);
        LD_SB2(src + 16, src_stride, src2, src3);
        src += (2 * src_stride);

        ILVRL_B2_SH(zero, src0, dst0, dst1);
        ILVRL_B2_SH(zero, src1, dst2, dst3);
        ILVRL_B2_SH(zero, src2, dst4, dst5);
        ILVRL_B2_SH(zero, src3, dst6, dst7);
        SLLI_4V(dst0, dst1, dst2, dst3, 6);
        SLLI_4V(dst4, dst5, dst6, dst7, 6);
        HEVC_UNIW_RND_CLIP4_MAX_SATU_H(dst0, dst1, dst2, dst3, weight_vec,
                                       offset_vec, rnd_vec, dst0, dst1, dst2,
                                       dst3);
        HEVC_UNIW_RND_CLIP4_MAX_SATU_H(dst4, dst5, dst6, dst7, weight_vec,
                                       offset_vec, rnd_vec, dst4, dst5, dst6,
                                       dst7);
        PCKEV_B2_UB(dst1, dst0, dst3, dst2, out0, out1);
        PCKEV_B2_UB(dst5, dst4, dst7, dst6, out2, out3);
        ST_UB2(out0, out1, dst, dst_stride);
        ST_UB2(out2, out3, dst + 16, dst_stride);
        dst += (2 * dst_stride);
    }
}

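/* Weighted copy for 48-pixel-wide blocks: three vectors per row, two rows
 * per iteration. */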
static void hevc_uniwgt_copy_48w_msa(const uint8_t *src,
                                     int32_t src_stride,
                                     uint8_t *dst,
                                     int32_t dst_stride,
                                     int32_t height,
                                     int32_t weight,
                                     int32_t offset,
                                     int32_t rnd_val)
{
    uint32_t loop_cnt;
    v16u8 out0, out1, out2, out3, out4, out5;
    v16i8 src0, src1, src2, src3, src4, src5;
    v16i8 zero = { 0 };
    v8i16 dst0, dst1, dst2, dst3, dst4, dst5, offset_vec;
    v8i16 dst6, dst7, dst8, dst9, dst10, dst11;
    v4i32 weight_vec, rnd_vec;

    weight_vec = __msa_fill_w(weight);
    offset_vec = __msa_fill_w(offset);
    rnd_vec = __msa_fill_w(rnd_val);

    for (loop_cnt = (height >> 1); loop_cnt--;) {
        LD_SB3(src, 16, src0, src1, src2);
        src += src_stride;
        LD_SB3(src, 16, src3, src4, src5);
        src += src_stride;

        ILVRL_B2_SH(zero, src0, dst0, dst1);
        ILVRL_B2_SH(zero, src1, dst2, dst3);
        ILVRL_B2_SH(zero, src2, dst4, dst5);
        ILVRL_B2_SH(zero, src3, dst6, dst7);
        ILVRL_B2_SH(zero, src4, dst8, dst9);
        ILVRL_B2_SH(zero, src5, dst10, dst11);
        SLLI_4V(dst0, dst1, dst2, dst3, 6);
        SLLI_4V(dst4, dst5, dst6, dst7, 6);
        SLLI_4V(dst8, dst9, dst10, dst11, 6);
        HEVC_UNIW_RND_CLIP4_MAX_SATU_H(dst0, dst1, dst2, dst3, weight_vec,
                                       offset_vec, rnd_vec, dst0, dst1, dst2,
                                       dst3);
        HEVC_UNIW_RND_CLIP4_MAX_SATU_H(dst4, dst5, dst6, dst7, weight_vec,
                                       offset_vec, rnd_vec, dst4, dst5, dst6,
                                       dst7);
        HEVC_UNIW_RND_CLIP4_MAX_SATU_H(dst8, dst9, dst10, dst11, weight_vec,
                                       offset_vec, rnd_vec, dst8, dst9, dst10,
                                       dst11);
        PCKEV_B3_UB(dst1, dst0, dst3, dst2, dst5, dst4, out0, out1, out2);
        PCKEV_B3_UB(dst7, dst6, dst9, dst8, dst11, dst10, out3, out4, out5);
        ST_UB2(out0, out1, dst, 16);
        ST_UB(out2, dst + 32);
        dst += dst_stride;
        ST_UB2(out3, out4, dst, 16);
        ST_UB(out5, dst + 32);
        dst += dst_stride;
    }
}

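/* Weighted copy for 64-pixel-wide blocks: four vectors per row, two rows
 * per iteration. */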
static void hevc_uniwgt_copy_64w_msa(const uint8_t *src,
                                     int32_t src_stride,
                                     uint8_t *dst,
                                     int32_t dst_stride,
                                     int32_t height,
                                     int32_t weight,
                                     int32_t offset,
                                     int32_t rnd_val)
{
    uint32_t loop_cnt;
    v16u8 out0, out1, out2, out3, out4, out5, out6, out7;
    v16i8 src0, src1, src2, src3, src4, src5, src6, src7;
    v16i8 zero = { 0 };
    v8i16 dst0, dst1, dst2, dst3, dst4, dst5, dst6, dst7, offset_vec;
    v8i16 dst8, dst9, dst10, dst11, dst12, dst13, dst14, dst15;
    v4i32 weight_vec, rnd_vec;

    weight_vec = __msa_fill_w(weight);
    offset_vec = __msa_fill_w(offset);
    rnd_vec = __msa_fill_w(rnd_val);

    for (loop_cnt = (height >> 1); loop_cnt--;) {
        LD_SB4(src, 16, src0, src1, src2, src3);
        src += src_stride;
        LD_SB4(src, 16, src4, src5, src6, src7);
        src += src_stride;

        ILVRL_B2_SH(zero, src0, dst0, dst1);
        ILVRL_B2_SH(zero, src1, dst2, dst3);
        ILVRL_B2_SH(zero, src2, dst4, dst5);
        ILVRL_B2_SH(zero, src3, dst6, dst7);
        ILVRL_B2_SH(zero, src4, dst8, dst9);
        ILVRL_B2_SH(zero, src5, dst10, dst11);
        ILVRL_B2_SH(zero, src6, dst12, dst13);
        ILVRL_B2_SH(zero, src7, dst14, dst15);
        SLLI_4V(dst0, dst1, dst2, dst3, 6);
        SLLI_4V(dst4, dst5, dst6, dst7, 6);
        SLLI_4V(dst8, dst9, dst10, dst11, 6);
        SLLI_4V(dst12, dst13, dst14, dst15, 6);
        HEVC_UNIW_RND_CLIP4_MAX_SATU_H(dst0, dst1, dst2, dst3, weight_vec,
                                       offset_vec, rnd_vec, dst0, dst1, dst2,
                                       dst3);
        HEVC_UNIW_RND_CLIP4_MAX_SATU_H(dst4, dst5, dst6, dst7, weight_vec,
                                       offset_vec, rnd_vec, dst4, dst5, dst6,
                                       dst7);
        HEVC_UNIW_RND_CLIP4_MAX_SATU_H(dst8, dst9, dst10, dst11, weight_vec,
                                       offset_vec, rnd_vec, dst8, dst9, dst10,
                                       dst11);
        HEVC_UNIW_RND_CLIP4_MAX_SATU_H(dst12, dst13, dst14, dst15, weight_vec,
                                       offset_vec, rnd_vec, dst12, dst13,
                                       dst14, dst15);
        PCKEV_B2_UB(dst1, dst0, dst3, dst2, out0, out1);
        PCKEV_B2_UB(dst5, dst4, dst7, dst6, out2, out3);
        PCKEV_B2_UB(dst9, dst8, dst11, dst10, out4, out5);
        PCKEV_B2_UB(dst13, dst12, dst15, dst14, out6, out7);
        ST_UB4(out0, out1, out2, out3, dst, 16);
        dst += dst_stride;
        ST_UB4(out4, out5, out6, out7, dst, 16);
        dst += dst_stride;
    }
}

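/* Horizontal 8-tap filter plus uni-weighting for 4-pixel-wide blocks.
 * src is rewound by 3 columns so the shuffle masks can gather the full
 * 8-tap window; the 4-width masks let two rows share one shuffle. Eight
 * rows per iteration, with a 2-, 4- or 6-row tail. */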
static void hevc_hz_uniwgt_8t_4w_msa(const uint8_t *src,
                                     int32_t src_stride,
                                     uint8_t *dst,
                                     int32_t dst_stride,
                                     const int8_t *filter,
                                     int32_t height,
                                     int32_t weight,
                                     int32_t offset,
                                     int32_t rnd_val)
{
    uint32_t loop_cnt;
    uint32_t res = height & 0x07;
    v16u8 out0, out1;
    v16i8 src0, src1, src2, src3, src4, src5, src6, src7;
    v8i16 vec0, vec1, vec2, vec3, vec4, vec5, vec6, vec7;
    v8i16 vec8, vec9, vec10, vec11, vec12, vec13, vec14, vec15;
    v16i8 mask0, mask1, mask2, mask3;
    v8i16 filter_vec, filt0, filt1, filt2, filt3;
    v4i32 dst0, dst1, dst2, dst3, dst4, dst5, dst6, dst7;
    v4i32 weight_vec, rnd_vec, offset_vec;
    v8i16 zero = { 0 };

    src -= 3;

    weight_vec = __msa_fill_w(weight);
    rnd_vec = __msa_fill_w(rnd_val);
    offset_vec = __msa_fill_w(offset);

    filter_vec = LD_SH(filter);
    UNPCK_R_SB_SH(filter_vec, filter_vec);
    SPLATI_W4_SH(filter_vec, filt0, filt1, filt2, filt3);

    mask0 = LD_SB(&ff_hevc_mask_arr[16]);
    mask1 = mask0 + 2;
    mask2 = mask0 + 4;
    mask3 = mask0 + 6;

    for (loop_cnt = (height >> 3); loop_cnt--;) {
        LD_SB8(src, src_stride, src0, src1, src2, src3, src4, src5, src6, src7);
        src += (8 * src_stride);
        VSHF_B4_SB(src0, src1, mask0, mask1, mask2, mask3,
                   vec0, vec1, vec2, vec3);
        VSHF_B4_SB(src2, src3, mask0, mask1, mask2, mask3,
                   vec4, vec5, vec6, vec7);
        VSHF_B4_SB(src4, src5, mask0, mask1, mask2, mask3,
                   vec8, vec9, vec10, vec11);
        VSHF_B4_SB(src6, src7, mask0, mask1, mask2, mask3,
                   vec12, vec13, vec14, vec15);

        HEVC_FILT_8TAP_4W_SH(vec0, vec1, vec2, vec3, filt0, filt1, filt2,
                             filt3, dst0, dst1);
        HEVC_FILT_8TAP_4W_SH(vec4, vec5, vec6, vec7, filt0, filt1, filt2,
                             filt3, dst2, dst3);
        HEVC_FILT_8TAP_4W_SH(vec8, vec9, vec10, vec11, filt0, filt1, filt2,
                             filt3, dst4, dst5);
        HEVC_FILT_8TAP_4W_SH(vec12, vec13, vec14, vec15, filt0, filt1, filt2,
                             filt3, dst6, dst7);

        MUL4(dst0, weight_vec, dst1, weight_vec, dst2, weight_vec, dst3,
             weight_vec, dst0, dst1, dst2, dst3);
        MUL4(dst4, weight_vec, dst5, weight_vec, dst6, weight_vec, dst7,
             weight_vec, dst4, dst5, dst6, dst7);
        SRAR_W4_SW(dst0, dst1, dst2, dst3, rnd_vec);
        SRAR_W4_SW(dst4, dst5, dst6, dst7, rnd_vec);
        ADD4(dst0, offset_vec, dst1, offset_vec, dst2, offset_vec, dst3,
             offset_vec, dst0, dst1, dst2, dst3);
        ADD4(dst4, offset_vec, dst5, offset_vec, dst6, offset_vec, dst7,
             offset_vec, dst4, dst5, dst6, dst7);
        CLIP_SW8_0_255(dst0, dst1, dst2, dst3, dst4, dst5, dst6, dst7);
        PCKEV_H4_SH(dst1, dst0, dst3, dst2, dst5, dst4, dst7, dst6,
                    vec0, vec1, vec2, vec3);
        PCKEV_B2_UB(vec1, vec0, vec3, vec2, out0, out1);

        ST_W8(out0, out1, 0, 1, 2, 3, 0, 1, 2, 3, dst, dst_stride);
        dst += (8 * dst_stride);
    }
    if (res) {
        LD_SB8(src, src_stride, src0, src1, src2, src3, src4, src5, src6, src7);
        VSHF_B4_SB(src0, src1, mask0, mask1, mask2, mask3,
                   vec0, vec1, vec2, vec3);
        VSHF_B4_SB(src2, src3, mask0, mask1, mask2, mask3,
                   vec4, vec5, vec6, vec7);
        VSHF_B4_SB(src4, src5, mask0, mask1, mask2, mask3,
                   vec8, vec9, vec10, vec11);
        VSHF_B4_SB(src6, src7, mask0, mask1, mask2, mask3,
                   vec12, vec13, vec14, vec15);

        HEVC_FILT_8TAP_4W_SH(vec0, vec1, vec2, vec3, filt0, filt1, filt2,
                             filt3, dst0, dst1);
        HEVC_FILT_8TAP_4W_SH(vec4, vec5, vec6, vec7, filt0, filt1, filt2,
                             filt3, dst2, dst3);
        HEVC_FILT_8TAP_4W_SH(vec8, vec9, vec10, vec11, filt0, filt1, filt2,
                             filt3, dst4, dst5);
        HEVC_FILT_8TAP_4W_SH(vec12, vec13, vec14, vec15, filt0, filt1, filt2,
                             filt3, dst6, dst7);

        MUL4(dst0, weight_vec, dst1, weight_vec, dst2, weight_vec, dst3,
             weight_vec, dst0, dst1, dst2, dst3);
        MUL4(dst4, weight_vec, dst5, weight_vec, dst6, weight_vec, dst7,
             weight_vec, dst4, dst5, dst6, dst7);
        SRAR_W4_SW(dst0, dst1, dst2, dst3, rnd_vec);
        SRAR_W4_SW(dst4, dst5, dst6, dst7, rnd_vec);
        ADD4(dst0, offset_vec, dst1, offset_vec, dst2, offset_vec, dst3,
             offset_vec, dst0, dst1, dst2, dst3);
        ADD4(dst4, offset_vec, dst5, offset_vec, dst6, offset_vec, dst7,
             offset_vec, dst4, dst5, dst6, dst7);
        CLIP_SW8_0_255(dst0, dst1, dst2, dst3, dst4, dst5, dst6, dst7);
        PCKEV_H4_SH(dst1, dst0, dst3, dst2, dst5, dst4, dst7, dst6,
                    vec0, vec1, vec2, vec3);
        PCKEV_B2_UB(vec1, vec0, vec3, vec2, out0, out1);

        if (res == 2) {
            ST_W2(out0, 0, 1, dst, dst_stride);
        } else if (res == 4) {
            ST_W4(out0, 0, 1, 2, 3, dst, dst_stride);
        } else {
            ST_W4(out0, 0, 1, 2, 3, dst, dst_stride);
            ST_W2(out1, 0, 1, dst + 4 * dst_stride, dst_stride);
        }
    }
}

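/* Horizontal 8-tap filter plus uni-weighting for 8-pixel-wide blocks,
 * four rows per iteration with a two-row tail. */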
static void hevc_hz_uniwgt_8t_8w_msa(const uint8_t *src,
                                     int32_t src_stride,
                                     uint8_t *dst,
                                     int32_t dst_stride,
                                     const int8_t *filter,
                                     int32_t height,
                                     int32_t weight,
                                     int32_t offset,
                                     int32_t rnd_val)
{
    uint32_t loop_cnt;
    uint32_t res = height & 0x03;
    v16u8 out0, out1;
    v16i8 src0, src1, src2, src3;
    v8i16 filt0, filt1, filt2, filt3;
    v16i8 mask0, mask1, mask2, mask3;
    v8i16 filter_vec;
    v8i16 vec0, vec1, vec2, vec3, vec4, vec5, vec6, vec7;
    v8i16 vec8, vec9, vec10, vec11, vec12, vec13, vec14, vec15;
    v4i32 dst0, dst1, dst2, dst3, dst4, dst5, dst6, dst7;
    v4i32 weight_vec, rnd_vec, offset_vec;
    v8i16 zero = { 0 };

    src -= 3;

    weight_vec = __msa_fill_w(weight);
    rnd_vec = __msa_fill_w(rnd_val);
    offset_vec = __msa_fill_w(offset);

    filter_vec = LD_SH(filter);
    UNPCK_R_SB_SH(filter_vec, filter_vec);
    SPLATI_W4_SH(filter_vec, filt0, filt1, filt2, filt3);

    mask0 = LD_SB(&ff_hevc_mask_arr[0]);
    mask1 = mask0 + 2;
    mask2 = mask0 + 4;
    mask3 = mask0 + 6;

    for (loop_cnt = (height >> 2); loop_cnt--;) {
        LD_SB4(src, src_stride, src0, src1, src2, src3);
        src += (4 * src_stride);

        VSHF_B4_SB(src0, src0, mask0, mask1, mask2, mask3,
                   vec0, vec1, vec2, vec3);
        VSHF_B4_SB(src1, src1, mask0, mask1, mask2, mask3,
                   vec4, vec5, vec6, vec7);
        VSHF_B4_SB(src2, src2, mask0, mask1, mask2, mask3,
                   vec8, vec9, vec10, vec11);
        VSHF_B4_SB(src3, src3, mask0, mask1, mask2, mask3,
                   vec12, vec13, vec14, vec15);
        HEVC_FILT_8TAP_4W_SH(vec0, vec1, vec2, vec3, filt0, filt1, filt2,
                             filt3, dst0, dst1);
        HEVC_FILT_8TAP_4W_SH(vec4, vec5, vec6, vec7, filt0, filt1, filt2,
                             filt3, dst2, dst3);
        HEVC_FILT_8TAP_4W_SH(vec8, vec9, vec10, vec11, filt0, filt1, filt2,
                             filt3, dst4, dst5);
        HEVC_FILT_8TAP_4W_SH(vec12, vec13, vec14, vec15, filt0, filt1, filt2,
                             filt3, dst6, dst7);

        MUL4(dst0, weight_vec, dst1, weight_vec, dst2, weight_vec, dst3,
             weight_vec, dst0, dst1, dst2, dst3);
        MUL4(dst4, weight_vec, dst5, weight_vec, dst6, weight_vec, dst7,
             weight_vec, dst4, dst5, dst6, dst7);
        SRAR_W4_SW(dst0, dst1, dst2, dst3, rnd_vec);
        SRAR_W4_SW(dst4, dst5, dst6, dst7, rnd_vec);
        ADD4(dst0, offset_vec, dst1, offset_vec, dst2, offset_vec, dst3,
             offset_vec, dst0, dst1, dst2, dst3);
        ADD4(dst4, offset_vec, dst5, offset_vec, dst6, offset_vec, dst7,
             offset_vec, dst4, dst5, dst6, dst7);
        CLIP_SW8_0_255(dst0, dst1, dst2, dst3, dst4, dst5, dst6, dst7);
        PCKEV_H4_SH(dst1, dst0, dst3, dst2, dst5, dst4, dst7, dst6,
                    vec0, vec1, vec2, vec3);
        PCKEV_B2_UB(vec1, vec0, vec3, vec2, out0, out1);
        ST_D4(out0, out1, 0, 1, 0, 1, dst, dst_stride);
        dst += (4 * dst_stride);
    }
    if (res) {
        LD_SB2(src, src_stride, src0, src1);

        VSHF_B4_SB(src0, src0, mask0, mask1, mask2, mask3,
                   vec0, vec1, vec2, vec3);
        VSHF_B4_SB(src1, src1, mask0, mask1, mask2, mask3,
                   vec4, vec5, vec6, vec7);
        HEVC_FILT_8TAP_4W_SH(vec0, vec1, vec2, vec3, filt0, filt1, filt2,
                             filt3, dst0, dst1);
        HEVC_FILT_8TAP_4W_SH(vec4, vec5, vec6, vec7, filt0, filt1, filt2,
                             filt3, dst2, dst3);

        MUL4(dst0, weight_vec, dst1, weight_vec, dst2, weight_vec, dst3,
             weight_vec, dst0, dst1, dst2, dst3);
        SRAR_W4_SW(dst0, dst1, dst2, dst3, rnd_vec);
        ADD4(dst0, offset_vec, dst1, offset_vec, dst2, offset_vec, dst3,
             offset_vec, dst0, dst1, dst2, dst3);
        CLIP_SW4_0_255(dst0, dst1, dst2, dst3);
        PCKEV_H2_SH(dst1, dst0, dst3, dst2, vec0, vec1);
        out0 = __msa_pckev_b((v16i8) vec1, (v16i8) vec0);
        ST_D2(out0, 0, 1, dst, dst_stride);
    }
}

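/* Horizontal 8-tap filter plus uni-weighting for 12-pixel-wide blocks:
 * the left 8 columns use the single-source masks, the right 4 columns use
 * the two-source 4-width masks across a row pair. */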
static void hevc_hz_uniwgt_8t_12w_msa(const uint8_t *src,
                                      int32_t src_stride,
                                      uint8_t *dst,
                                      int32_t dst_stride,
                                      const int8_t *filter,
                                      int32_t height,
                                      int32_t weight,
                                      int32_t offset,
                                      int32_t rnd_val)
{
    uint32_t loop_cnt;
    v16u8 out0, out1;
    v8i16 filt0, filt1, filt2, filt3;
    v16i8 src0, src1, src2, src3;
    v16i8 mask0, mask1, mask2, mask3, mask4, mask5, mask6, mask7;
    v16i8 vec0, vec1, vec2, vec3, vec4, vec5, vec6, vec7;
    v8i16 filter_vec;
    v4i32 dst0, dst1, dst2, dst3;
    v4i32 dst00, dst01;
    v4i32 weight_vec, rnd_vec, offset_vec;
    v8i16 zero = { 0 };

    src -= 3;

    weight_vec = __msa_fill_w(weight);
    rnd_vec = __msa_fill_w(rnd_val);
    offset_vec = __msa_fill_w(offset);

    filter_vec = LD_SH(filter);
    UNPCK_R_SB_SH(filter_vec, filter_vec);
    SPLATI_W4_SH(filter_vec, filt0, filt1, filt2, filt3);

    mask0 = LD_SB(&ff_hevc_mask_arr[0]);
    mask1 = mask0 + 2;
    mask2 = mask0 + 4;
    mask3 = mask0 + 6;
    mask4 = LD_SB(&ff_hevc_mask_arr[16]);
    mask5 = mask4 + 2;
    mask6 = mask4 + 4;
    mask7 = mask4 + 6;

    for (loop_cnt = (height >> 1); loop_cnt--;) {
        LD_SB2(src, src_stride, src0, src1);
        LD_SB2(src + 8, src_stride, src2, src3);
        src += (2 * src_stride);

        VSHF_B4_SB(src0, src0, mask0, mask1, mask2, mask3,
                   vec0, vec1, vec2, vec3);
        VSHF_B4_SB(src1, src1, mask0, mask1, mask2, mask3,
                   vec4, vec5, vec6, vec7);
        HEVC_FILT_8TAP_4W_SH(vec0, vec1, vec2, vec3, filt0, filt1, filt2,
                             filt3, dst0, dst1);
        HEVC_FILT_8TAP_4W_SH(vec4, vec5, vec6, vec7, filt0, filt1, filt2,
                             filt3, dst2, dst3);
        VSHF_B4_SB(src2, src3, mask4, mask5, mask6, mask7,
                   vec0, vec1, vec2, vec3);
        HEVC_FILT_8TAP_4W_SH(vec0, vec1, vec2, vec3, filt0, filt1, filt2,
                             filt3, dst00, dst01);

        MUL4(dst0, weight_vec, dst1, weight_vec, dst2, weight_vec, dst3,
             weight_vec, dst0, dst1, dst2, dst3);
        MUL2(dst00, weight_vec, dst01, weight_vec, dst00, dst01);
        SRAR_W4_SW(dst0, dst1, dst2, dst3, rnd_vec);
        SRAR_W2_SW(dst00, dst01, rnd_vec);
        ADD4(dst0, offset_vec, dst1, offset_vec, dst2, offset_vec, dst3,
             offset_vec, dst0, dst1, dst2, dst3);
        ADD2(dst00, offset_vec, dst01, offset_vec, dst00, dst01);
        CLIP_SW4_0_255(dst0, dst1, dst2, dst3);
        CLIP_SW2_0_255(dst00, dst01);
        PCKEV_H2_SH(dst1, dst0, dst3, dst2, vec0, vec1);
        vec2 = __msa_pckev_h((v8i16) dst01, (v8i16) dst00);
        PCKEV_B2_UB(vec1, vec0, zero, vec2, out0, out1);

        ST_D2(out0, 0, 1, dst, dst_stride);
        ST_W2(out1, 0, 1, dst + 8, dst_stride);
        dst += (2 * dst_stride);
    }
}

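/* Horizontal 8-tap filter plus uni-weighting for 16-pixel-wide blocks,
 * two rows per iteration. */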
static void hevc_hz_uniwgt_8t_16w_msa(const uint8_t *src,
                                      int32_t src_stride,
                                      uint8_t *dst,
                                      int32_t dst_stride,
                                      const int8_t *filter,
                                      int32_t height,
                                      int32_t weight,
                                      int32_t offset,
                                      int32_t rnd_val)
{
    uint32_t loop_cnt;
    v16u8 out0, out1;
    v16i8 src0, src1, src2, src3;
    v8i16 filt0, filt1, filt2, filt3;
    v16i8 mask0, mask1, mask2, mask3;
    v8i16 filter_vec;
    v16i8 vec0, vec1, vec2, vec3, vec4, vec5, vec6, vec7;
    v16i8 vec8, vec9, vec10, vec11, vec12, vec13, vec14, vec15;
    v4i32 dst0, dst1, dst2, dst3, dst4, dst5, dst6, dst7;
    v4i32 weight_vec, rnd_vec, offset_vec;
    v8i16 zero = { 0 };

    src -= 3;

    weight_vec = __msa_fill_w(weight);
    rnd_vec = __msa_fill_w(rnd_val);
    offset_vec = __msa_fill_w(offset);

    filter_vec = LD_SH(filter);
    UNPCK_R_SB_SH(filter_vec, filter_vec);
    SPLATI_W4_SH(filter_vec, filt0, filt1, filt2, filt3);

    mask0 = LD_SB(&ff_hevc_mask_arr[0]);
    mask1 = mask0 + 2;
    mask2 = mask0 + 4;
    mask3 = mask0 + 6;

    for (loop_cnt = (height >> 1); loop_cnt--;) {
        LD_SB2(src, src_stride, src0, src2);
        LD_SB2(src + 8, src_stride, src1, src3);
        src += (2 * src_stride);

        VSHF_B4_SB(src0, src0, mask0, mask1, mask2, mask3,
                   vec0, vec1, vec2, vec3);
        VSHF_B4_SB(src1, src1, mask0, mask1, mask2, mask3,
                   vec4, vec5, vec6, vec7);
        VSHF_B4_SB(src2, src2, mask0, mask1, mask2, mask3,
                   vec8, vec9, vec10, vec11);
        VSHF_B4_SB(src3, src3, mask0, mask1, mask2, mask3,
                   vec12, vec13, vec14, vec15);
        HEVC_FILT_8TAP_4W_SH(vec0, vec1, vec2, vec3, filt0, filt1, filt2,
                             filt3, dst0, dst1);
        HEVC_FILT_8TAP_4W_SH(vec4, vec5, vec6, vec7, filt0, filt1, filt2,
                             filt3, dst2, dst3);
        HEVC_FILT_8TAP_4W_SH(vec8, vec9, vec10, vec11, filt0, filt1, filt2,
                             filt3, dst4, dst5);
        HEVC_FILT_8TAP_4W_SH(vec12, vec13, vec14, vec15, filt0, filt1, filt2,
                             filt3, dst6, dst7);

        MUL4(dst0, weight_vec, dst1, weight_vec, dst2, weight_vec, dst3,
             weight_vec, dst0, dst1, dst2, dst3);
        MUL4(dst4, weight_vec, dst5, weight_vec, dst6, weight_vec, dst7,
             weight_vec, dst4, dst5, dst6, dst7);
        SRAR_W4_SW(dst0, dst1, dst2, dst3, rnd_vec);
        SRAR_W4_SW(dst4, dst5, dst6, dst7, rnd_vec);
        ADD4(dst0, offset_vec, dst1, offset_vec, dst2, offset_vec, dst3,
             offset_vec, dst0, dst1, dst2, dst3);
        ADD4(dst4, offset_vec, dst5, offset_vec, dst6, offset_vec, dst7,
             offset_vec, dst4, dst5, dst6, dst7);
        CLIP_SW8_0_255(dst0, dst1, dst2, dst3, dst4, dst5, dst6, dst7);
        PCKEV_H4_SH(dst1, dst0, dst3, dst2, dst5, dst4, dst7, dst6,
                    vec0, vec1, vec2, vec3);
        PCKEV_B2_UB(vec1, vec0, vec3, vec2, out0, out1);

        ST_UB2(out0, out1, dst, dst_stride);
        dst += (2 * dst_stride);
    }
}

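/* Horizontal 8-tap filter plus uni-weighting for 24-pixel-wide blocks;
 * mask4..mask7 straddle the two 16-byte loads of a row. The height
 * argument is effectively unused: the hardcoded 16 iterations of 2 rows
 * each always process 32 rows. */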
static void hevc_hz_uniwgt_8t_24w_msa(const uint8_t *src,
                                      int32_t src_stride,
                                      uint8_t *dst,
                                      int32_t dst_stride,
                                      const int8_t *filter,
                                      int32_t height,
                                      int32_t weight,
                                      int32_t offset,
                                      int32_t rnd_val)
{
    uint32_t loop_cnt;
    v16u8 out0, out1, out2;
    v16i8 src0, src1, src2, src3;
    v8i16 filt0, filt1, filt2, filt3;
    v16i8 mask0, mask1, mask2, mask3, mask4, mask5, mask6, mask7;
    v16i8 vec0, vec1, vec2, vec3, vec4, vec5, vec6, vec7;
    v16i8 vec8, vec9, vec10, vec11, vec12, vec13, vec14, vec15;
    v4i32 dst0, dst1, dst2, dst3, dst4, dst5, dst6, dst7;
    v4i32 dst8, dst9, dst10, dst11;
    v8i16 filter_vec;
    v4i32 weight_vec, rnd_vec, offset_vec;
    v8i16 zero = { 0 };

    src -= 3;

    weight_vec = __msa_fill_w(weight);
    rnd_vec = __msa_fill_w(rnd_val);
    offset_vec = __msa_fill_w(offset);

    filter_vec = LD_SH(filter);
    UNPCK_R_SB_SH(filter_vec, filter_vec);
    SPLATI_W4_SH(filter_vec, filt0, filt1, filt2, filt3);

    mask0 = LD_SB(&ff_hevc_mask_arr[0]);
    mask1 = mask0 + 2;
    mask2 = mask0 + 4;
    mask3 = mask0 + 6;
    mask4 = mask0 + 8;
    mask5 = mask0 + 10;
    mask6 = mask0 + 12;
    mask7 = mask0 + 14;

    for (loop_cnt = 16; loop_cnt--;) {
        LD_SB2(src, 16, src0, src1);
        src += src_stride;
        LD_SB2(src, 16, src2, src3);
        src += src_stride;
        VSHF_B4_SB(src0, src0, mask0, mask1, mask2, mask3,
                   vec0, vec1, vec2, vec3);
        VSHF_B4_SB(src0, src1, mask4, mask5, mask6, mask7,
                   vec4, vec5, vec6, vec7);
        VSHF_B4_SB(src1, src1, mask0, mask1, mask2, mask3,
                   vec8, vec9, vec10, vec11);
        VSHF_B4_SB(src2, src2, mask0, mask1, mask2, mask3,
                   vec12, vec13, vec14, vec15);
        HEVC_FILT_8TAP_4W_SH(vec0, vec1, vec2, vec3, filt0, filt1, filt2,
                             filt3, dst0, dst1);
        HEVC_FILT_8TAP_4W_SH(vec4, vec5, vec6, vec7, filt0, filt1, filt2,
                             filt3, dst2, dst3);
        HEVC_FILT_8TAP_4W_SH(vec8, vec9, vec10, vec11, filt0, filt1, filt2,
                             filt3, dst4, dst5);
        HEVC_FILT_8TAP_4W_SH(vec12, vec13, vec14, vec15, filt0, filt1, filt2,
                             filt3, dst6, dst7);

        VSHF_B4_SB(src2, src3, mask4, mask5, mask6, mask7,
                   vec0, vec1, vec2, vec3);
        VSHF_B4_SB(src3, src3, mask0, mask1, mask2, mask3,
                   vec4, vec5, vec6, vec7);
        HEVC_FILT_8TAP_4W_SH(vec0, vec1, vec2, vec3, filt0, filt1, filt2,
                             filt3, dst8, dst9);
        HEVC_FILT_8TAP_4W_SH(vec4, vec5, vec6, vec7, filt0, filt1, filt2,
                             filt3, dst10, dst11);

        MUL4(dst0, weight_vec, dst1, weight_vec, dst2, weight_vec, dst3,
             weight_vec, dst0, dst1, dst2, dst3);
        MUL4(dst4, weight_vec, dst5, weight_vec, dst6, weight_vec, dst7,
             weight_vec, dst4, dst5, dst6, dst7);
        MUL4(dst8, weight_vec, dst9, weight_vec, dst10, weight_vec, dst11,
             weight_vec, dst8, dst9, dst10, dst11);
        SRAR_W4_SW(dst0, dst1, dst2, dst3, rnd_vec);
        SRAR_W4_SW(dst4, dst5, dst6, dst7, rnd_vec);
        SRAR_W4_SW(dst8, dst9, dst10, dst11, rnd_vec);
        ADD4(dst0, offset_vec, dst1, offset_vec, dst2, offset_vec, dst3,
             offset_vec, dst0, dst1, dst2, dst3);
        ADD4(dst4, offset_vec, dst5, offset_vec, dst6, offset_vec, dst7,
             offset_vec, dst4, dst5, dst6, dst7);
        ADD4(dst8, offset_vec, dst9, offset_vec, dst10, offset_vec, dst11,
             offset_vec, dst8, dst9, dst10, dst11);
        CLIP_SW8_0_255(dst0, dst1, dst2, dst3, dst4, dst5, dst6, dst7);
        CLIP_SW4_0_255(dst8, dst9, dst10, dst11);
        PCKEV_H4_SH(dst1, dst0, dst3, dst2, dst5, dst4, dst7, dst6,
                    vec0, vec1, vec2, vec3);
        PCKEV_H2_SH(dst9, dst8, dst11, dst10, vec4, vec5);

        PCKEV_B3_UB(vec1, vec0, vec4, vec3, vec5, vec2, out0, out1, out2);
        ST_UB2(out0, out1, dst, dst_stride);
        ST_D2(out2, 0, 1, dst + 16, dst_stride);
        dst += (2 * dst_stride);
    }
}

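/* Horizontal 8-tap filter plus uni-weighting for 32-pixel-wide blocks,
 * loaded as four overlapping vectors at 8-byte steps per row, two rows
 * per iteration. */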
static void hevc_hz_uniwgt_8t_32w_msa(const uint8_t *src,
                                      int32_t src_stride,
                                      uint8_t *dst,
                                      int32_t dst_stride,
                                      const int8_t *filter,
                                      int32_t height,
                                      int32_t weight,
                                      int32_t offset,
                                      int32_t rnd_val)
{
    uint32_t loop_cnt;
    v16u8 out0, out1, out2, out3;
    v16i8 src0, src1, src2, src3, src4, src5, src6, src7;
    v8i16 filt0, filt1, filt2, filt3;
    v16i8 mask0, mask1, mask2, mask3;
    v16i8 vec0, vec1, vec2, vec3, vec4, vec5, vec6, vec7;
    v16i8 vec8, vec9, vec10, vec11, vec12, vec13, vec14, vec15;
    v8i16 filter_vec;
    v4i32 dst0, dst1, dst2, dst3, dst4, dst5, dst6, dst7;
    v4i32 dst10, dst11, dst12, dst13, dst14, dst15, dst16, dst17;
    v4i32 weight_vec, rnd_vec, offset_vec;
    v8i16 zero = { 0 };

    src -= 3;

    weight_vec = __msa_fill_w(weight);
    rnd_vec = __msa_fill_w(rnd_val);
    offset_vec = __msa_fill_w(offset);

    filter_vec = LD_SH(filter);
    UNPCK_R_SB_SH(filter_vec, filter_vec);
    SPLATI_W4_SH(filter_vec, filt0, filt1, filt2, filt3);

    mask0 = LD_SB(&ff_hevc_mask_arr[0]);
    mask1 = mask0 + 2;
    mask2 = mask0 + 4;
    mask3 = mask0 + 6;

    for (loop_cnt = height >> 1; loop_cnt--;) {
        LD_SB4(src, 8, src0, src1, src2, src3);
        src += src_stride;
        LD_SB4(src, 8, src4, src5, src6, src7);
        src += src_stride;

        VSHF_B4_SB(src0, src0, mask0, mask1, mask2, mask3,
                   vec0, vec1, vec2, vec3);
        VSHF_B4_SB(src1, src1, mask0, mask1, mask2, mask3,
                   vec4, vec5, vec6, vec7);
        VSHF_B4_SB(src2, src2, mask0, mask1, mask2, mask3,
                   vec8, vec9, vec10, vec11);
        VSHF_B4_SB(src3, src3, mask0, mask1, mask2, mask3,
                   vec12, vec13, vec14, vec15);
        HEVC_FILT_8TAP_4W_SH(vec0, vec1, vec2, vec3, filt0, filt1, filt2,
                             filt3, dst0, dst1);
        HEVC_FILT_8TAP_4W_SH(vec4, vec5, vec6, vec7, filt0, filt1, filt2,
                             filt3, dst2, dst3);
        HEVC_FILT_8TAP_4W_SH(vec8, vec9, vec10, vec11, filt0, filt1, filt2,
                             filt3, dst4, dst5);
        HEVC_FILT_8TAP_4W_SH(vec12, vec13, vec14, vec15, filt0, filt1, filt2,
                             filt3, dst6, dst7);

        VSHF_B4_SB(src4, src4, mask0, mask1, mask2, mask3,
                   vec0, vec1, vec2, vec3);
        VSHF_B4_SB(src5, src5, mask0, mask1, mask2, mask3,
                   vec4, vec5, vec6, vec7);
        VSHF_B4_SB(src6, src6, mask0, mask1, mask2, mask3,
                   vec8, vec9, vec10, vec11);
        VSHF_B4_SB(src7, src7, mask0, mask1, mask2, mask3,
                   vec12, vec13, vec14, vec15);
        HEVC_FILT_8TAP_4W_SH(vec0, vec1, vec2, vec3, filt0, filt1, filt2,
                             filt3, dst10, dst11);
        HEVC_FILT_8TAP_4W_SH(vec4, vec5, vec6, vec7, filt0, filt1, filt2,
                             filt3, dst12, dst13);
        HEVC_FILT_8TAP_4W_SH(vec8, vec9, vec10, vec11, filt0, filt1, filt2,
                             filt3, dst14, dst15);
        HEVC_FILT_8TAP_4W_SH(vec12, vec13, vec14, vec15, filt0, filt1, filt2,
                             filt3, dst16, dst17);

        MUL4(dst0, weight_vec, dst1, weight_vec, dst2, weight_vec, dst3,
             weight_vec, dst0, dst1, dst2, dst3);
        MUL4(dst4, weight_vec, dst5, weight_vec, dst6, weight_vec, dst7,
             weight_vec, dst4, dst5, dst6, dst7);
        MUL4(dst10, weight_vec, dst11, weight_vec, dst12, weight_vec, dst13,
             weight_vec, dst10, dst11, dst12, dst13);
        MUL4(dst14, weight_vec, dst15, weight_vec, dst16, weight_vec, dst17,
             weight_vec, dst14, dst15, dst16, dst17);
        SRAR_W4_SW(dst0, dst1, dst2, dst3, rnd_vec);
        SRAR_W4_SW(dst4, dst5, dst6, dst7, rnd_vec);
        SRAR_W4_SW(dst10, dst11, dst12, dst13, rnd_vec);
        SRAR_W4_SW(dst14, dst15, dst16, dst17, rnd_vec);
        ADD4(dst0, offset_vec, dst1, offset_vec, dst2, offset_vec, dst3,
             offset_vec, dst0, dst1, dst2, dst3);
        ADD4(dst4, offset_vec, dst5, offset_vec, dst6, offset_vec, dst7,
             offset_vec, dst4, dst5, dst6, dst7);
        ADD4(dst10, offset_vec, dst11, offset_vec, dst12, offset_vec, dst13,
             offset_vec, dst10, dst11, dst12, dst13);
        ADD4(dst14, offset_vec, dst15, offset_vec, dst16, offset_vec, dst17,
             offset_vec, dst14, dst15, dst16, dst17);
        CLIP_SW8_0_255(dst0, dst1, dst2, dst3, dst4, dst5, dst6, dst7);
        CLIP_SW8_0_255(dst10, dst11, dst12, dst13, dst14, dst15, dst16, dst17);
        PCKEV_H4_SH(dst1, dst0, dst3, dst2, dst5, dst4, dst7, dst6,
                    vec0, vec1, vec2, vec3);
        PCKEV_H4_SH(dst11, dst10, dst13, dst12, dst15, dst14, dst17, dst16,
                    vec4, vec5, vec6, vec7);

        PCKEV_B2_UB(vec1, vec0, vec3, vec2, out0, out1);
        PCKEV_B2_UB(vec5, vec4, vec7, vec6, out2, out3);
        ST_UB2(out0, out1, dst, 16);
        dst += dst_stride;
        ST_UB2(out2, out3, dst, 16);
        dst += dst_stride;
    }
}

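/* Horizontal 8-tap filter plus uni-weighting for 48-pixel-wide blocks,
 * one row per iteration. The height argument is effectively unused: the
 * hardcoded 64 iterations always process 64 rows. */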
static void hevc_hz_uniwgt_8t_48w_msa(const uint8_t *src,
                                      int32_t src_stride,
                                      uint8_t *dst,
                                      int32_t dst_stride,
                                      const int8_t *filter,
                                      int32_t height,
                                      int32_t weight,
                                      int32_t offset,
                                      int32_t rnd_val)
{
    uint32_t loop_cnt;
    v16u8 out0, out1, out2;
    v16i8 src0, src1, src2, src3;
    v8i16 filt0, filt1, filt2, filt3;
    v16i8 mask0, mask1, mask2, mask3, mask4, mask5, mask6, mask7;
    v16i8 vec0, vec1, vec2, vec3, vec4, vec5, vec6, vec7;
    v16i8 vec8, vec9, vec10, vec11, vec12, vec13, vec14, vec15;
    v4i32 dst0, dst1, dst2, dst3, dst4, dst5, dst6, dst7;
    v4i32 dst8, dst9, dst10, dst11;
    v8i16 filter_vec;
    v4i32 weight_vec, rnd_vec, offset_vec;
    v8i16 zero = { 0 };

    src -= 3;

    weight_vec = __msa_fill_w(weight);
    rnd_vec = __msa_fill_w(rnd_val);
    offset_vec = __msa_fill_w(offset);

    filter_vec = LD_SH(filter);
    UNPCK_R_SB_SH(filter_vec, filter_vec);
    SPLATI_W4_SH(filter_vec, filt0, filt1, filt2, filt3);

    mask0 = LD_SB(&ff_hevc_mask_arr[0]);
    mask1 = mask0 + 2;
    mask2 = mask0 + 4;
    mask3 = mask0 + 6;
    mask4 = mask0 + 8;
    mask5 = mask0 + 10;
    mask6 = mask0 + 12;
    mask7 = mask0 + 14;

    for (loop_cnt = 64; loop_cnt--;) {
        LD_SB3(src, 16, src0, src1, src2);
        src3 = LD_SB(src + 40);
        src += src_stride;

        VSHF_B4_SB(src0, src0, mask0, mask1, mask2, mask3,
                   vec0, vec1, vec2, vec3);
        VSHF_B4_SB(src0, src1, mask4, mask5, mask6, mask7,
                   vec4, vec5, vec6, vec7);
        VSHF_B4_SB(src1, src1, mask0, mask1, mask2, mask3,
                   vec8, vec9, vec10, vec11);
        VSHF_B4_SB(src1, src2, mask4, mask5, mask6, mask7,
                   vec12, vec13, vec14, vec15);
        HEVC_FILT_8TAP_4W_SH(vec0, vec1, vec2, vec3, filt0, filt1, filt2,
                             filt3, dst0, dst1);
        HEVC_FILT_8TAP_4W_SH(vec4, vec5, vec6, vec7, filt0, filt1, filt2,
                             filt3, dst2, dst3);
        HEVC_FILT_8TAP_4W_SH(vec8, vec9, vec10, vec11, filt0, filt1, filt2,
                             filt3, dst4, dst5);
        HEVC_FILT_8TAP_4W_SH(vec12, vec13, vec14, vec15, filt0, filt1, filt2,
                             filt3, dst6, dst7);

        VSHF_B4_SB(src2, src2, mask0, mask1, mask2, mask3,
                   vec0, vec1, vec2, vec3);
        VSHF_B4_SB(src3, src3, mask0, mask1, mask2, mask3,
                   vec4, vec5, vec6, vec7);
        HEVC_FILT_8TAP_4W_SH(vec0, vec1, vec2, vec3, filt0, filt1, filt2,
                             filt3, dst8, dst9);
        HEVC_FILT_8TAP_4W_SH(vec4, vec5, vec6, vec7, filt0, filt1, filt2,
                             filt3, dst10, dst11);

        MUL4(dst0, weight_vec, dst1, weight_vec, dst2, weight_vec, dst3,
             weight_vec, dst0, dst1, dst2, dst3);
        MUL4(dst4, weight_vec, dst5, weight_vec, dst6, weight_vec, dst7,
             weight_vec, dst4, dst5, dst6, dst7);
        MUL4(dst8, weight_vec, dst9, weight_vec, dst10, weight_vec, dst11,
             weight_vec, dst8, dst9, dst10, dst11);
        SRAR_W4_SW(dst0, dst1, dst2, dst3, rnd_vec);
        SRAR_W4_SW(dst4, dst5, dst6, dst7, rnd_vec);
        SRAR_W4_SW(dst8, dst9, dst10, dst11, rnd_vec);
        ADD4(dst0, offset_vec, dst1, offset_vec, dst2, offset_vec, dst3,
             offset_vec, dst0, dst1, dst2, dst3);
        ADD4(dst4, offset_vec, dst5, offset_vec, dst6, offset_vec, dst7,
             offset_vec, dst4, dst5, dst6, dst7);
        ADD4(dst8, offset_vec, dst9, offset_vec, dst10, offset_vec, dst11,
             offset_vec, dst8, dst9, dst10, dst11);
        CLIP_SW8_0_255(dst0, dst1, dst2, dst3, dst4, dst5, dst6, dst7);
        CLIP_SW4_0_255(dst8, dst9, dst10, dst11);
        PCKEV_H4_SH(dst1, dst0, dst3, dst2, dst5, dst4, dst7, dst6,
                    vec0, vec1, vec2, vec3);
        PCKEV_H2_SH(dst9, dst8, dst11, dst10, vec4, vec5);
        PCKEV_B2_UB(vec1, vec0, vec3, vec2, out0, out1);
        out2 = __msa_pckev_b((v16i8) vec5, (v16i8) vec4);
        ST_UB2(out0, out1, dst, 16);
        ST_UB(out2, dst + 32);
        dst += dst_stride;
    }
}

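/* Horizontal 8-tap filter plus uni-weighting for 64-pixel-wide blocks,
 * processed as two 32-pixel halves per row. */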
static void hevc_hz_uniwgt_8t_64w_msa(const uint8_t *src,
                                      int32_t src_stride,
                                      uint8_t *dst,
                                      int32_t dst_stride,
                                      const int8_t *filter,
                                      int32_t height,
                                      int32_t weight,
                                      int32_t offset,
                                      int32_t rnd_val)
{
    const uint8_t *src_tmp;
    uint8_t *dst_tmp;
    uint32_t loop_cnt, cnt;
    v16u8 out0, out1;
    v16i8 src0, src1, src2;
    v8i16 filt0, filt1, filt2, filt3;
    v16i8 mask0, mask1, mask2, mask3, mask4, mask5, mask6, mask7;
    v16i8 vec0, vec1, vec2, vec3, vec4, vec5, vec6, vec7;
    v16i8 vec8, vec9, vec10, vec11, vec12, vec13, vec14, vec15;
    v4i32 dst0, dst1, dst2, dst3, dst4, dst5, dst6, dst7;
    v8i16 filter_vec;
    v4i32 weight_vec, rnd_vec, offset_vec;
    v8i16 zero = { 0 };

    src -= 3;

    weight_vec = __msa_fill_w(weight);
    rnd_vec = __msa_fill_w(rnd_val);
    offset_vec = __msa_fill_w(offset);

    filter_vec = LD_SH(filter);
    UNPCK_R_SB_SH(filter_vec, filter_vec);
    SPLATI_W4_SH(filter_vec, filt0, filt1, filt2, filt3);

    mask0 = LD_SB(&ff_hevc_mask_arr[0]);
    mask1 = mask0 + 2;
    mask2 = mask0 + 4;
    mask3 = mask0 + 6;
    mask4 = mask0 + 8;
    mask5 = mask0 + 10;
    mask6 = mask0 + 12;
    mask7 = mask0 + 14;

    for (loop_cnt = height; loop_cnt--;) {
        src_tmp = src;
        dst_tmp = dst;

        for (cnt = 2; cnt--;) {
            LD_SB2(src_tmp, 16, src0, src1);
            src2 = LD_SB(src_tmp + 24);
            src_tmp += 32;

            VSHF_B4_SB(src0, src0, mask0, mask1, mask2, mask3,
                       vec0, vec1, vec2, vec3);
            VSHF_B4_SB(src0, src1, mask4, mask5, mask6, mask7,
                       vec4, vec5, vec6, vec7);
            VSHF_B4_SB(src1, src1, mask0, mask1, mask2, mask3,
                       vec8, vec9, vec10, vec11);
            VSHF_B4_SB(src2, src2, mask0, mask1, mask2, mask3,
                       vec12, vec13, vec14, vec15);
            HEVC_FILT_8TAP_4W_SH(vec0, vec1, vec2, vec3, filt0, filt1, filt2,
                                 filt3, dst0, dst1);
            HEVC_FILT_8TAP_4W_SH(vec4, vec5, vec6, vec7, filt0, filt1, filt2,
                                 filt3, dst2, dst3);
            HEVC_FILT_8TAP_4W_SH(vec8, vec9, vec10, vec11, filt0, filt1, filt2,
                                 filt3, dst4, dst5);
            HEVC_FILT_8TAP_4W_SH(vec12, vec13, vec14, vec15, filt0, filt1,
                                 filt2, filt3, dst6, dst7);
            MUL4(dst0, weight_vec, dst1, weight_vec, dst2, weight_vec, dst3,
                 weight_vec, dst0, dst1, dst2, dst3);
            MUL4(dst4, weight_vec, dst5, weight_vec, dst6, weight_vec, dst7,
                 weight_vec, dst4, dst5, dst6, dst7);
            SRAR_W4_SW(dst0, dst1, dst2, dst3, rnd_vec);
            SRAR_W4_SW(dst4, dst5, dst6, dst7, rnd_vec);
            ADD4(dst0, offset_vec, dst1, offset_vec, dst2, offset_vec, dst3,
                 offset_vec, dst0, dst1, dst2, dst3);
            ADD4(dst4, offset_vec, dst5, offset_vec, dst6, offset_vec, dst7,
                 offset_vec, dst4, dst5, dst6, dst7);
            CLIP_SW8_0_255(dst0, dst1, dst2, dst3, dst4, dst5, dst6, dst7);
            PCKEV_H4_SH(dst1, dst0, dst3, dst2, dst5, dst4, dst7, dst6,
                        vec0, vec1, vec2, vec3);
            PCKEV_B2_UB(vec1, vec0, vec3, vec2, out0, out1);
            ST_UB2(out0, out1, dst_tmp, 16);
            dst_tmp += 32;
        }

        src += src_stride;
        dst += dst_stride;
    }
}

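/* Vertical 8-tap filter plus uni-weighting for 4-pixel-wide blocks. Rows
 * are interleaved pairwise and packed two-deep into doubleword lanes so
 * each dot product covers two output rows; eight rows per iteration with
 * a 2-, 4- or 6-row tail. */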
1430 static void hevc_vt_uniwgt_8t_4w_msa(const uint8_t *src,
1431  int32_t src_stride,
1432  uint8_t *dst,
1433  int32_t dst_stride,
1434  const int8_t *filter,
1435  int32_t height,
1436  int32_t weight,
1437  int32_t offset,
1438  int32_t rnd_val)
1439 {
1440  int32_t loop_cnt;
1441  int32_t res = height & 0x07;
1442  v16u8 out0, out1;
1443  v16i8 src0, src1, src2, src3, src4, src5, src6, src7, src8;
1444  v16i8 src9, src10, src11, src12, src13, src14;
1445  v16i8 src10_r, src32_r, src54_r, src76_r, src98_r;
1446  v16i8 src21_r, src43_r, src65_r, src87_r, src109_r;
1447  v16i8 src1110_r, src1211_r, src1312_r, src1413_r;
1448  v16i8 src2110, src4332, src6554, src8776, src10998;
1449  v16i8 src12111110, src14131312;
1450  v8i16 filter_vec;
1451  v4i32 dst0, dst1, dst2, dst3, dst4, dst5, dst6, dst7;
1452  v8i16 filt0, filt1, filt2, filt3;
1453  v8i16 vec0, vec1, vec2, vec3;
1454  v4i32 weight_vec, rnd_vec, offset_vec;
1455  v8i16 zero = { 0 };
1456 
1457  src -= (3 * src_stride);
1458 
1459 
1460  weight_vec = __msa_fill_w(weight);
1461  rnd_vec = __msa_fill_w(rnd_val);
1462  offset_vec = __msa_fill_w(offset);
1463 
1464  filter_vec = LD_SH(filter);
1465  UNPCK_R_SB_SH(filter_vec, filter_vec);
1466  SPLATI_W4_SH(filter_vec, filt0, filt1, filt2, filt3);
1467 
1468  LD_SB7(src, src_stride, src0, src1, src2, src3, src4, src5, src6);
1469  src += (7 * src_stride);
1470 
1471  ILVR_B4_SB(src1, src0, src3, src2, src5, src4, src2, src1,
1472  src10_r, src32_r, src54_r, src21_r);
1473 
1474  ILVR_B2_SB(src4, src3, src6, src5, src43_r, src65_r);
1475 
1476  ILVR_D3_SB(src21_r, src10_r, src43_r,
1477  src32_r, src65_r, src54_r, src2110, src4332, src6554);
1478 
1479 
1480  for (loop_cnt = (height >> 3); loop_cnt--;) {
1481  LD_SB8(src, src_stride,
1482  src7, src8, src9, src10, src11, src12, src13, src14);
1483  src += (8 * src_stride);
1484  ILVR_B4_SB(src7, src6, src8, src7, src9, src8, src10, src9,
1485  src76_r, src87_r, src98_r, src109_r);
1486  ILVR_B4_SB(src11, src10, src12, src11, src13, src12, src14, src13,
1487  src1110_r, src1211_r, src1312_r, src1413_r);
1488  ILVR_D4_SB(src87_r, src76_r, src109_r, src98_r, src1211_r, src1110_r,
1489  src1413_r, src1312_r,
1490  src8776, src10998, src12111110, src14131312);
1491  HEVC_FILT_8TAP_4W_SH(src2110, src4332, src6554, src8776, filt0,
1492  filt1, filt2, filt3, dst0, dst1);
1493  HEVC_FILT_8TAP_4W_SH(src4332, src6554, src8776, src10998, filt0,
1494  filt1, filt2, filt3, dst2, dst3);
1495  HEVC_FILT_8TAP_4W_SH(src6554, src8776, src10998, src12111110,
1496  filt0, filt1, filt2, filt3, dst4, dst5);
1497  HEVC_FILT_8TAP_4W_SH(src8776, src10998, src12111110, src14131312,
1498  filt0, filt1, filt2, filt3, dst6, dst7);
1499  MUL4(dst0, weight_vec, dst1, weight_vec, dst2, weight_vec, dst3,
1500  weight_vec, dst0, dst1, dst2, dst3)
1501  MUL4(dst4, weight_vec, dst5, weight_vec, dst6, weight_vec, dst7,
1502  weight_vec, dst4, dst5, dst6, dst7);
1503  SRAR_W4_SW(dst0, dst1, dst2, dst3, rnd_vec);
1504  SRAR_W4_SW(dst4, dst5, dst6, dst7, rnd_vec);
1505  ADD4(dst0, offset_vec, dst1, offset_vec, dst2, offset_vec, dst3,
1506  offset_vec, dst0, dst1, dst2, dst3);
1507  ADD4(dst4, offset_vec, dst5, offset_vec, dst6, offset_vec, dst7,
1508  offset_vec, dst4, dst5, dst6, dst7);
1509  CLIP_SW8_0_255(dst0, dst1, dst2, dst3, dst4, dst5, dst6, dst7);
1510  PCKEV_H4_SH(dst1, dst0, dst3, dst2, dst5, dst4, dst7, dst6,
1511  vec0, vec1, vec2, vec3);
1512  PCKEV_B2_UB(vec1, vec0, vec3, vec2, out0, out1);
1513 
1514 
1515  ST_W8(out0, out1, 0, 1, 2, 3, 0, 1, 2, 3, dst, dst_stride);
1516  dst += (8 * dst_stride);
1517 
1518  src2110 = src10998;
1519  src4332 = src12111110;
1520  src6554 = src14131312;
1521  src6 = src14;
1522  }
1523  if (res) {
1524  LD_SB8(src, src_stride,
1525  src7, src8, src9, src10, src11, src12, src13, src14);
1526  ILVR_B4_SB(src7, src6, src8, src7, src9, src8, src10, src9,
1527  src76_r, src87_r, src98_r, src109_r);
1528  ILVR_B4_SB(src11, src10, src12, src11, src13, src12, src14, src13,
1529  src1110_r, src1211_r, src1312_r, src1413_r);
1530  ILVR_D4_SB(src87_r, src76_r, src109_r, src98_r, src1211_r, src1110_r,
1531  src1413_r, src1312_r,
1532  src8776, src10998, src12111110, src14131312);
1533  HEVC_FILT_8TAP_4W_SH(src2110, src4332, src6554, src8776, filt0,
1534  filt1, filt2, filt3, dst0, dst1);
1535  HEVC_FILT_8TAP_4W_SH(src4332, src6554, src8776, src10998, filt0,
1536  filt1, filt2, filt3, dst2, dst3);
1537  HEVC_FILT_8TAP_4W_SH(src6554, src8776, src10998, src12111110,
1538  filt0, filt1, filt2, filt3, dst4, dst5);
1539  HEVC_FILT_8TAP_4W_SH(src8776, src10998, src12111110, src14131312,
1540  filt0, filt1, filt2, filt3, dst6, dst7);
1541  MUL4(dst0, weight_vec, dst1, weight_vec, dst2, weight_vec, dst3,
1542  weight_vec, dst0, dst1, dst2, dst3);
1543  MUL4(dst4, weight_vec, dst5, weight_vec, dst6, weight_vec, dst7,
1544  weight_vec, dst4, dst5, dst6, dst7);
1545  SRAR_W4_SW(dst0, dst1, dst2, dst3, rnd_vec);
1546  SRAR_W4_SW(dst4, dst5, dst6, dst7, rnd_vec);
1547  ADD4(dst0, offset_vec, dst1, offset_vec, dst2, offset_vec, dst3,
1548  offset_vec, dst0, dst1, dst2, dst3);
1549  ADD4(dst4, offset_vec, dst5, offset_vec, dst6, offset_vec, dst7,
1550  offset_vec, dst4, dst5, dst6, dst7);
1551  CLIP_SW8_0_255(dst0, dst1, dst2, dst3, dst4, dst5, dst6, dst7);
1552  PCKEV_H4_SH(dst1, dst0, dst3, dst2, dst5, dst4, dst7, dst6,
1553  vec0, vec1, vec2, vec3);
1554  PCKEV_B2_UB(vec1, vec0, vec3, vec2, out0, out1);
1555 
1556  if (res == 2) {
1557  ST_W2(out0, 0, 1, dst, dst_stride);
1558  } else if (res == 4) {
1559  ST_W4(out0, 0, 1, 2, 3, dst, dst_stride);
1560  } else {
1561  ST_W4(out0, 0, 1, 2, 3, dst, dst_stride);
1562  ST_W2(out1, 0, 1, dst + 4 * dst_stride, dst_stride);
1563  }
1564  }
1565 }
1566 
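/* Vertical 8-tap uni-weighted filter, 8 columns wide: each pass filters
 * four rows down the columns, then multiplies by the weight, rounds by
 * rnd_val, adds the offset and clips to [0, 255]; a 2-row tail covers
 * heights that are not a multiple of 4. */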
1567 static void hevc_vt_uniwgt_8t_8w_msa(const uint8_t *src,
1568  int32_t src_stride,
1569  uint8_t *dst,
1570  int32_t dst_stride,
1571  const int8_t *filter,
1572  int32_t height,
1573  int32_t weight,
1574  int32_t offset,
1575  int32_t rnd_val)
1576 {
1577  int32_t loop_cnt;
1578  int32_t res = height & 0x03;
1579  v16u8 out0, out1;
1580  v16i8 src0, src1, src2, src3, src4, src5, src6, src7, src8, src9, src10;
1581  v16i8 src10_r, src32_r, src54_r, src76_r, src98_r;
1582  v16i8 src21_r, src43_r, src65_r, src87_r, src109_r;
1583  v8i16 filt0, filt1, filt2, filt3;
1584  v8i16 filter_vec, vec0, vec1, vec2, vec3;
1585  v4i32 dst0, dst1, dst2, dst3, dst4, dst5, dst6, dst7;
1586  v4i32 weight_vec, rnd_vec, offset_vec;
1587  v8i16 zero = { 0 };
1588 
1589  src -= (3 * src_stride);
1590 
1591  weight_vec = __msa_fill_w(weight);
1592  rnd_vec = __msa_fill_w(rnd_val);
1593  offset_vec = __msa_fill_w(offset);
1594 
1595  filter_vec = LD_SH(filter);
1596  UNPCK_R_SB_SH(filter_vec, filter_vec);
1597  SPLATI_W4_SH(filter_vec, filt0, filt1, filt2, filt3);
1598 
1599  LD_SB7(src, src_stride, src0, src1, src2, src3, src4, src5, src6);
1600  src += (7 * src_stride);
1601 
1602  ILVR_B4_SB(src1, src0, src3, src2, src5, src4, src2, src1,
1603  src10_r, src32_r, src54_r, src21_r);
1604  ILVR_B2_SB(src4, src3, src6, src5, src43_r, src65_r);
1605 
1606  for (loop_cnt = (height >> 2); loop_cnt--;) {
1607  LD_SB4(src, src_stride, src7, src8, src9, src10);
1608  src += (4 * src_stride);
1609  ILVR_B4_SB(src7, src6, src8, src7, src9, src8, src10, src9,
1610  src76_r, src87_r, src98_r, src109_r);
1611  HEVC_FILT_8TAP_4W_SH(src10_r, src32_r, src54_r, src76_r, filt0,
1612  filt1, filt2, filt3, dst0, dst1);
1613  HEVC_FILT_8TAP_4W_SH(src21_r, src43_r, src65_r, src87_r, filt0,
1614  filt1, filt2, filt3, dst2, dst3);
1615  HEVC_FILT_8TAP_4W_SH(src32_r, src54_r, src76_r, src98_r,
1616  filt0, filt1, filt2, filt3, dst4, dst5);
1617  HEVC_FILT_8TAP_4W_SH(src43_r, src65_r, src87_r, src109_r,
1618  filt0, filt1, filt2, filt3, dst6, dst7);
1619  MUL4(dst0, weight_vec, dst1, weight_vec, dst2, weight_vec, dst3,
1620  weight_vec, dst0, dst1, dst2, dst3);
1621  MUL4(dst4, weight_vec, dst5, weight_vec, dst6, weight_vec, dst7,
1622  weight_vec, dst4, dst5, dst6, dst7);
1623  SRAR_W4_SW(dst0, dst1, dst2, dst3, rnd_vec);
1624  SRAR_W4_SW(dst4, dst5, dst6, dst7, rnd_vec);
1625  ADD4(dst0, offset_vec, dst1, offset_vec, dst2, offset_vec, dst3,
1626  offset_vec, dst0, dst1, dst2, dst3);
1627  ADD4(dst4, offset_vec, dst5, offset_vec, dst6, offset_vec, dst7,
1628  offset_vec, dst4, dst5, dst6, dst7);
1629  CLIP_SW8_0_255(dst0, dst1, dst2, dst3, dst4, dst5, dst6, dst7);
1630  PCKEV_H4_SH(dst1, dst0, dst3, dst2, dst5, dst4, dst7, dst6,
1631  vec0, vec1, vec2, vec3);
1632  PCKEV_B2_UB(vec1, vec0, vec3, vec2, out0, out1);
1633  ST_D4(out0, out1, 0, 1, 0, 1, dst, dst_stride);
1634  dst += (4 * dst_stride);
1635 
1636  src10_r = src54_r;
1637  src32_r = src76_r;
1638  src54_r = src98_r;
1639  src21_r = src65_r;
1640  src43_r = src87_r;
1641  src65_r = src109_r;
1642  src6 = src10;
1643  }
1644  if (res) {
1645  LD_SB2(src, src_stride, src7, src8);
1646  ILVR_B2_SB(src7, src6, src8, src7, src76_r, src87_r);
1647  HEVC_FILT_8TAP_4W_SH(src10_r, src32_r, src54_r, src76_r, filt0,
1648  filt1, filt2, filt3, dst0, dst1);
1649  HEVC_FILT_8TAP_4W_SH(src21_r, src43_r, src65_r, src87_r, filt0,
1650  filt1, filt2, filt3, dst2, dst3);
1651  MUL4(dst0, weight_vec, dst1, weight_vec, dst2, weight_vec, dst3,
1652  weight_vec, dst0, dst1, dst2, dst3);
1653  SRAR_W4_SW(dst0, dst1, dst2, dst3, rnd_vec);
1654  ADD4(dst0, offset_vec, dst1, offset_vec, dst2, offset_vec, dst3,
1655  offset_vec, dst0, dst1, dst2, dst3);
1656  CLIP_SW4_0_255(dst0, dst1, dst2, dst3);
1657  PCKEV_H2_SH(dst1, dst0, dst3, dst2, vec0, vec1);
1658  out0 = __msa_pckev_b((v16i8) vec1, (v16i8) vec0);
1659  ST_D2(out0, 0, 1, dst, dst_stride);
1660  }
1661 }
1662 
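/* 12 columns wide: the right byte-interleaves cover the first 8 columns,
 * while the left interleaves, packed pairwise into double-width vectors,
 * cover the remaining 4; the row count is fixed at 16 (4 passes of 4). */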
1663 static void hevc_vt_uniwgt_8t_12w_msa(const uint8_t *src,
1664  int32_t src_stride,
1665  uint8_t *dst,
1666  int32_t dst_stride,
1667  const int8_t *filter,
1668  int32_t height,
1669  int32_t weight,
1670  int32_t offset,
1671  int32_t rnd_val)
1672 {
1673  int32_t loop_cnt;
1674  v16u8 out0, out1, out2;
1675  v16i8 src0, src1, src2, src3, src4, src5, src6, src7, src8, src9, src10;
1676  v16i8 src10_r, src32_r, src54_r, src76_r, src98_r;
1677  v16i8 src21_r, src43_r, src65_r, src87_r, src109_r;
1678  v16i8 src10_l, src32_l, src54_l, src76_l, src98_l;
1679  v16i8 src21_l, src43_l, src65_l, src87_l, src109_l;
1680  v16i8 src2110, src4332, src6554, src8776, src10998;
1681  v8i16 filt0, filt1, filt2, filt3;
1682  v4i32 dst0, dst1, dst2, dst3, dst4, dst5;
1683  v4i32 dst6, dst7, dst8, dst9, dst10, dst11;
1684  v8i16 filter_vec, vec0, vec1, vec2, vec3, vec4, vec5;
1685  v4i32 weight_vec, rnd_vec, offset_vec;
1686  v8i16 zero = { 0 };
1687 
1688  src -= (3 * src_stride);
1689 
1690  weight_vec = __msa_fill_w(weight);
1691  rnd_vec = __msa_fill_w(rnd_val);
1692  offset_vec = __msa_fill_w(offset);
1693 
1694  filter_vec = LD_SH(filter);
1695  UNPCK_R_SB_SH(filter_vec, filter_vec);
1696  SPLATI_W4_SH(filter_vec, filt0, filt1, filt2, filt3);
1697 
1698  LD_SB7(src, src_stride, src0, src1, src2, src3, src4, src5, src6);
1699  src += (7 * src_stride);
1700 
1701  ILVR_B4_SB(src1, src0, src3, src2, src5, src4, src2, src1,
1702  src10_r, src32_r, src54_r, src21_r);
1703  ILVR_B2_SB(src4, src3, src6, src5, src43_r, src65_r);
1704  ILVL_B4_SB(src1, src0, src3, src2, src5, src4, src2, src1,
1705  src10_l, src32_l, src54_l, src21_l);
1706  ILVL_B2_SB(src4, src3, src6, src5, src43_l, src65_l);
1707  ILVR_D3_SB(src21_l, src10_l, src43_l, src32_l, src65_l, src54_l,
1708  src2110, src4332, src6554);
1709 
1710  for (loop_cnt = 4; loop_cnt--;) {
1711  LD_SB4(src, src_stride, src7, src8, src9, src10);
1712  src += (4 * src_stride);
1713 
1714  ILVR_B4_SB(src7, src6, src8, src7, src9, src8, src10, src9,
1715  src76_r, src87_r, src98_r, src109_r);
1716  ILVL_B4_SB(src7, src6, src8, src7, src9, src8, src10, src9,
1717  src76_l, src87_l, src98_l, src109_l);
1718  ILVR_D2_SB(src87_l, src76_l, src109_l, src98_l, src8776, src10998);
1719 
1720  HEVC_FILT_8TAP_4W_SH(src10_r, src32_r, src54_r, src76_r, filt0,
1721  filt1, filt2, filt3, dst0, dst1);
1722  HEVC_FILT_8TAP_4W_SH(src21_r, src43_r, src65_r, src87_r, filt0,
1723  filt1, filt2, filt3, dst2, dst3);
1724  HEVC_FILT_8TAP_4W_SH(src32_r, src54_r, src76_r, src98_r,
1725  filt0, filt1, filt2, filt3, dst4, dst5);
1726  HEVC_FILT_8TAP_4W_SH(src43_r, src65_r, src87_r, src109_r,
1727  filt0, filt1, filt2, filt3, dst6, dst7);
1728  HEVC_FILT_8TAP_4W_SH(src2110, src4332, src6554, src8776,
1729  filt0, filt1, filt2, filt3, dst8, dst9);
1730  HEVC_FILT_8TAP_4W_SH(src4332, src6554, src8776, src10998,
1731  filt0, filt1, filt2, filt3, dst10, dst11);
1732 
1733  MUL4(dst0, weight_vec, dst1, weight_vec, dst2, weight_vec, dst3,
1734  weight_vec, dst0, dst1, dst2, dst3);
1735  MUL4(dst4, weight_vec, dst5, weight_vec, dst6, weight_vec, dst7,
1736  weight_vec, dst4, dst5, dst6, dst7);
1737  MUL4(dst8, weight_vec, dst9, weight_vec, dst10, weight_vec, dst11,
1738  weight_vec, dst8, dst9, dst10, dst11);
1739  SRAR_W4_SW(dst0, dst1, dst2, dst3, rnd_vec);
1740  SRAR_W4_SW(dst4, dst5, dst6, dst7, rnd_vec);
1741  SRAR_W4_SW(dst8, dst9, dst10, dst11, rnd_vec);
1742  ADD4(dst0, offset_vec, dst1, offset_vec, dst2, offset_vec, dst3,
1743  offset_vec, dst0, dst1, dst2, dst3);
1744  ADD4(dst4, offset_vec, dst5, offset_vec, dst6, offset_vec, dst7,
1745  offset_vec, dst4, dst5, dst6, dst7);
1746  ADD4(dst8, offset_vec, dst9, offset_vec, dst10, offset_vec, dst11,
1747  offset_vec, dst8, dst9, dst10, dst11);
1748  CLIP_SW8_0_255(dst0, dst1, dst2, dst3, dst4, dst5, dst6, dst7);
1749  CLIP_SW4_0_255(dst8, dst9, dst10, dst11);
1750  PCKEV_H4_SH(dst1, dst0, dst3, dst2, dst5, dst4, dst7, dst6,
1751  vec0, vec1, vec2, vec3);
1752  PCKEV_H2_SH(dst9, dst8, dst11, dst10, vec4, vec5);
1753  PCKEV_B3_UB(vec1, vec0, vec3, vec2, vec5, vec4, out0, out1, out2);
1754  ST_D4(out0, out1, 0, 1, 0, 1, dst, dst_stride);
1755  ST_W4(out2, 0, 1, 2, 3, dst + 8, dst_stride);
1756  dst += (4 * dst_stride);
1757 
1758  src10_r = src54_r;
1759  src32_r = src76_r;
1760  src54_r = src98_r;
1761  src21_r = src65_r;
1762  src43_r = src87_r;
1763  src65_r = src109_r;
1764  src2110 = src6554;
1765  src4332 = src8776;
1766  src6554 = src10998;
1767  src6 = src10;
1768  }
1769 }
1770 
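/* Core vertical 8-tap uni-weighted filter for widths that are multiples
 * of 16: the outer loop walks 16-column stripes (weightmul16 = width / 16)
 * and the inner loop emits 4 rows per pass, with a 2-row tail for heights
 * that are not a multiple of 4. */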
1771 static void hevc_vt_uniwgt_8t_16multx4mult_msa(const uint8_t *src,
1772  int32_t src_stride,
1773  uint8_t *dst,
1774  int32_t dst_stride,
1775  const int8_t *filter,
1776  int32_t height,
1777  int32_t weight,
1778  int32_t offset,
1779  int32_t rnd_val,
1780  int32_t weightmul16)
1781 {
1782  const uint8_t *src_tmp;
1783  uint8_t *dst_tmp;
1784  int32_t loop_cnt, cnt;
1785  int32_t res = height & 0x03;
1786  v16u8 out0, out1, out2, out3;
1787  v16i8 src0, src1, src2, src3, src4, src5, src6, src7, src8, src9, src10;
1788  v16i8 src10_r, src32_r, src54_r, src76_r;
1789  v16i8 src21_r, src43_r, src65_r, src87_r;
1790  v16i8 src10_l, src32_l, src54_l, src76_l;
1791  v16i8 src21_l, src43_l, src65_l, src87_l;
1792  v16i8 src98_r, src109_r, src98_l, src109_l;
1793  v8i16 filt0, filt1, filt2, filt3;
1794  v8i16 filter_vec, vec0, vec1, vec2, vec3, vec4, vec5, vec6, vec7;
1795  v4i32 dst0, dst1, dst2, dst3, dst4, dst5, dst6, dst7;
1796  v4i32 dst8, dst9, dst10, dst11, dst12, dst13, dst14, dst15;
1797  v4i32 weight_vec, rnd_vec, offset_vec;
1798  v8i16 zero = { 0 };
1799 
1800  src -= (3 * src_stride);
1801 
1802  weight_vec = __msa_fill_w(weight);
1803  rnd_vec = __msa_fill_w(rnd_val);
1804  offset_vec = __msa_fill_w(offset);
1805 
1806 
1807  filter_vec = LD_SH(filter);
1808  UNPCK_R_SB_SH(filter_vec, filter_vec);
1809  SPLATI_W4_SH(filter_vec, filt0, filt1, filt2, filt3);
1810 
1811  for (cnt = weightmul16; cnt--;) {
1812  src_tmp = src;
1813  dst_tmp = dst;
1814 
1815  LD_SB7(src_tmp, src_stride, src0, src1, src2, src3, src4, src5, src6);
1816  src_tmp += (7 * src_stride);
1817 
1818  for (loop_cnt = (height >> 2); loop_cnt--;) {
1819  LD_SB4(src_tmp, src_stride, src7, src8, src9, src10);
1820  src_tmp += (4 * src_stride);
1821 
1822  ILVR_B4_SB(src1, src0, src3, src2, src5, src4, src2, src1,
1823  src10_r, src32_r, src54_r, src21_r);
1824  ILVR_B2_SB(src4, src3, src6, src5, src43_r, src65_r);
1825  ILVL_B4_SB(src1, src0, src3, src2, src5, src4, src2, src1,
1826  src10_l, src32_l, src54_l, src21_l);
1827  ILVL_B2_SB(src4, src3, src6, src5, src43_l, src65_l);
1828  ILVR_B4_SB(src7, src6, src8, src7, src9, src8, src10, src9,
1829  src76_r, src87_r, src98_r, src109_r);
1830  ILVL_B4_SB(src7, src6, src8, src7, src9, src8, src10, src9,
1831  src76_l, src87_l, src98_l, src109_l);
1832 
1833  HEVC_FILT_8TAP_4W_SH(src10_r, src32_r, src54_r, src76_r, filt0,
1834  filt1, filt2, filt3, dst0, dst1);
1835  HEVC_FILT_8TAP_4W_SH(src10_l, src32_l, src54_l, src76_l, filt0,
1836  filt1, filt2, filt3, dst2, dst3);
1837  HEVC_FILT_8TAP_4W_SH(src21_r, src43_r, src65_r, src87_r, filt0,
1838  filt1, filt2, filt3, dst4, dst5);
1839  HEVC_FILT_8TAP_4W_SH(src21_l, src43_l, src65_l, src87_l, filt0,
1840  filt1, filt2, filt3, dst6, dst7);
1841  HEVC_FILT_8TAP_4W_SH(src32_r, src54_r, src76_r, src98_r, filt0,
1842  filt1, filt2, filt3, dst8, dst9);
1843  HEVC_FILT_8TAP_4W_SH(src32_l, src54_l, src76_l, src98_l, filt0,
1844  filt1, filt2, filt3, dst10, dst11);
1845  HEVC_FILT_8TAP_4W_SH(src43_r, src65_r, src87_r, src109_r, filt0,
1846  filt1, filt2, filt3, dst12, dst13);
1847  HEVC_FILT_8TAP_4W_SH(src43_l, src65_l, src87_l, src109_l, filt0,
1848  filt1, filt2, filt3, dst14, dst15);
1849 
1850  MUL4(dst0, weight_vec, dst1, weight_vec, dst2, weight_vec, dst3,
1851  weight_vec, dst0, dst1, dst2, dst3);
1852  MUL4(dst4, weight_vec, dst5, weight_vec, dst6, weight_vec, dst7,
1853  weight_vec, dst4, dst5, dst6, dst7);
1854  MUL4(dst8, weight_vec, dst9, weight_vec, dst10, weight_vec, dst11,
1855  weight_vec, dst8, dst9, dst10, dst11);
1856  MUL4(dst12, weight_vec, dst13, weight_vec, dst14, weight_vec, dst15,
1857  weight_vec, dst12, dst13, dst14, dst15);
1858  SRAR_W4_SW(dst0, dst1, dst2, dst3, rnd_vec);
1859  SRAR_W4_SW(dst4, dst5, dst6, dst7, rnd_vec);
1860  SRAR_W4_SW(dst8, dst9, dst10, dst11, rnd_vec);
1861  SRAR_W4_SW(dst12, dst13, dst14, dst15, rnd_vec);
1862  ADD4(dst0, offset_vec, dst1, offset_vec, dst2, offset_vec, dst3,
1863  offset_vec, dst0, dst1, dst2, dst3);
1864  ADD4(dst4, offset_vec, dst5, offset_vec, dst6, offset_vec, dst7,
1865  offset_vec, dst4, dst5, dst6, dst7);
1866  ADD4(dst8, offset_vec, dst9, offset_vec, dst10, offset_vec, dst11,
1867  offset_vec, dst8, dst9, dst10, dst11);
1868  ADD4(dst12, offset_vec, dst13, offset_vec, dst14, offset_vec, dst15,
1869  offset_vec, dst12, dst13, dst14, dst15);
1870  CLIP_SW8_0_255(dst0, dst1, dst2, dst3, dst4, dst5, dst6, dst7);
1871  CLIP_SW8_0_255(dst8, dst9, dst10, dst11, dst12, dst13, dst14, dst15);
1872  PCKEV_H4_SH(dst1, dst0, dst3, dst2, dst5, dst4, dst7, dst6,
1873  vec0, vec1, vec2, vec3);
1874  PCKEV_H4_SH(dst9, dst8, dst11, dst10, dst13, dst12, dst15,
1875  dst14, vec4, vec5, vec6, vec7);
1876  PCKEV_B4_UB(vec1, vec0, vec3, vec2, vec5, vec4, vec7, vec6,
1877  out0, out1, out2, out3);
1878 
1879  ST_UB4(out0, out1, out2, out3, dst_tmp, dst_stride);
1880  dst_tmp += (4 * dst_stride);
1881 
1882  src0 = src4;
1883  src1 = src5;
1884  src2 = src6;
1885  src3 = src7;
1886  src4 = src8;
1887  src5 = src9;
1888  src6 = src10;
1889  }
1890  if (res) {
1891  LD_SB2(src_tmp, src_stride, src7, src8);
1892 
1893  ILVR_B4_SB(src1, src0, src3, src2, src5, src4, src2, src1,
1894  src10_r, src32_r, src54_r, src21_r);
1895  ILVR_B2_SB(src4, src3, src6, src5, src43_r, src65_r);
1896  ILVL_B4_SB(src1, src0, src3, src2, src5, src4, src2, src1,
1897  src10_l, src32_l, src54_l, src21_l);
1898  ILVL_B2_SB(src4, src3, src6, src5, src43_l, src65_l);
1899  ILVR_B2_SB(src7, src6, src8, src7, src76_r, src87_r);
1900  ILVL_B2_SB(src7, src6, src8, src7, src76_l, src87_l);
1901 
1902  HEVC_FILT_8TAP_4W_SH(src10_r, src32_r, src54_r, src76_r, filt0,
1903  filt1, filt2, filt3, dst0, dst1);
1904  HEVC_FILT_8TAP_4W_SH(src10_l, src32_l, src54_l, src76_l, filt0,
1905  filt1, filt2, filt3, dst2, dst3);
1906  HEVC_FILT_8TAP_4W_SH(src21_r, src43_r, src65_r, src87_r, filt0,
1907  filt1, filt2, filt3, dst4, dst5);
1908  HEVC_FILT_8TAP_4W_SH(src21_l, src43_l, src65_l, src87_l, filt0,
1909  filt1, filt2, filt3, dst6, dst7);
1910  MUL4(dst0, weight_vec, dst1, weight_vec, dst2, weight_vec, dst3,
1911  weight_vec, dst0, dst1, dst2, dst3);
1912  MUL4(dst4, weight_vec, dst5, weight_vec, dst6, weight_vec, dst7,
1913  weight_vec, dst4, dst5, dst6, dst7);
1914  SRAR_W4_SW(dst0, dst1, dst2, dst3, rnd_vec);
1915  SRAR_W4_SW(dst4, dst5, dst6, dst7, rnd_vec);
1916  ADD4(dst0, offset_vec, dst1, offset_vec, dst2, offset_vec, dst3,
1917  offset_vec, dst0, dst1, dst2, dst3);
1918  ADD4(dst4, offset_vec, dst5, offset_vec, dst6, offset_vec, dst7,
1919  offset_vec, dst4, dst5, dst6, dst7);
1920  CLIP_SW8_0_255(dst0, dst1, dst2, dst3, dst4, dst5, dst6, dst7);
1921  PCKEV_H4_SH(dst1, dst0, dst3, dst2, dst5, dst4, dst7, dst6,
1922  vec0, vec1, vec2, vec3);
1923  PCKEV_B2_UB(vec1, vec0, vec3, vec2, out0, out1);
1924 
1925  ST_UB2(out0, out1, dst_tmp, dst_stride);
1926  }
1927 
1928  src += 16;
1929  dst += 16;
1930  }
1931 }
1932 
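/* The width-specific wrappers below delegate to the 16-column core; the
 * 24-wide case also runs the 8-wide filter on its rightmost 8 columns,
 * and the 24- and 48-wide cases pass the fixed heights (32 and 64) of
 * those HEVC block sizes. */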
1933 static void hevc_vt_uniwgt_8t_16w_msa(const uint8_t *src,
1934  int32_t src_stride,
1935  uint8_t *dst,
1936  int32_t dst_stride,
1937  const int8_t *filter,
1938  int32_t height,
1939  int32_t weight,
1940  int32_t offset,
1941  int32_t rnd_val)
1942 {
1943  hevc_vt_uniwgt_8t_16multx4mult_msa(src, src_stride, dst, dst_stride,
1944  filter, height, weight,
1945  offset, rnd_val, 1);
1946 }
1947 
1948 static void hevc_vt_uniwgt_8t_24w_msa(const uint8_t *src,
1949  int32_t src_stride,
1950  uint8_t *dst,
1951  int32_t dst_stride,
1952  const int8_t *filter,
1953  int32_t height,
1954  int32_t weight,
1955  int32_t offset,
1956  int32_t rnd_val)
1957 {
1958  hevc_vt_uniwgt_8t_16multx4mult_msa(src, src_stride, dst, dst_stride,
1959  filter, 32, weight,
1960  offset, rnd_val, 1);
1961 
1962  hevc_vt_uniwgt_8t_8w_msa(src + 16, src_stride, dst + 16, dst_stride,
1963  filter, 32, weight, offset, rnd_val);
1964 }
1965 
1966 static void hevc_vt_uniwgt_8t_32w_msa(const uint8_t *src,
1967  int32_t src_stride,
1968  uint8_t *dst,
1969  int32_t dst_stride,
1970  const int8_t *filter,
1971  int32_t height,
1972  int32_t weight,
1973  int32_t offset,
1974  int32_t rnd_val)
1975 {
1976  hevc_vt_uniwgt_8t_16multx4mult_msa(src, src_stride, dst, dst_stride,
1977  filter, height, weight,
1978  offset, rnd_val, 2);
1979 }
1980 
1981 static void hevc_vt_uniwgt_8t_48w_msa(const uint8_t *src,
1982  int32_t src_stride,
1983  uint8_t *dst,
1984  int32_t dst_stride,
1985  const int8_t *filter,
1986  int32_t height,
1987  int32_t weight,
1988  int32_t offset,
1989  int32_t rnd_val)
1990 {
1991  hevc_vt_uniwgt_8t_16multx4mult_msa(src, src_stride, dst, dst_stride,
1992  filter, 64, weight,
1993  offset, rnd_val, 3);
1994 }
1995 
1996 static void hevc_vt_uniwgt_8t_64w_msa(const uint8_t *src,
1997  int32_t src_stride,
1998  uint8_t *dst,
1999  int32_t dst_stride,
2000  const int8_t *filter,
2001  int32_t height,
2002  int32_t weight,
2003  int32_t offset,
2004  int32_t rnd_val)
2005 {
2006  hevc_vt_uniwgt_8t_16multx4mult_msa(src, src_stride, dst, dst_stride,
2007  filter, height, weight,
2008  offset, rnd_val, 4);
2009 }
2010 
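/* 2-D (horizontal + vertical) 8-tap uni-weighted filter, 4 columns wide:
 * the 4-width shuffle masks pack two input rows per vector for the
 * horizontal pass, and the 32-bit intermediates are filtered vertically
 * and shifted by 6 before the weight/round/offset/clip sequence. */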
2011 static void hevc_hv_uniwgt_8t_4w_msa(const uint8_t *src,
2012  int32_t src_stride,
2013  uint8_t *dst,
2014  int32_t dst_stride,
2015  const int8_t *filter_x,
2016  const int8_t *filter_y,
2017  int32_t height,
2018  int32_t weight,
2019  int32_t offset,
2020  int32_t rnd_val)
2021 {
2022  uint32_t loop_cnt;
2023  v16u8 out;
2024  v16i8 src0, src1, src2, src3, src4, src5, src6, src7, src8, src9, src10;
2025  v8i16 filt0, filt1, filt2, filt3;
2026  v8i16 filt_h0, filt_h1, filt_h2, filt_h3;
2027  v16i8 mask1, mask2, mask3;
2028  v8i16 filter_vec;
2029  v16i8 vec0, vec1, vec2, vec3, vec4, vec5, vec6, vec7;
2030  v16i8 vec8, vec9, vec10, vec11, vec12, vec13, vec14, vec15;
2031  v4i32 dst0, dst1, dst2, dst3, dst4, dst5, dst6, dst7, dst8, dst9, dst10;
2032  v8i16 dst10_r, dst32_r, dst54_r, dst76_r, dst98_r;
2033  v8i16 dst21_r, dst43_r, dst65_r, dst87_r, dst109_r;
2034  v4i32 dst0_r, dst1_r, dst2_r, dst3_r;
2035  v4i32 weight_vec, offset_vec, rnd_vec;
2036  v16i8 mask0 = LD_SB(ff_hevc_mask_arr + 16);
2037  v8i16 zero = { 0 };
2038 
2039  src -= ((3 * src_stride) + 3);
2040  filter_vec = LD_SH(filter_x);
2041  UNPCK_R_SB_SH(filter_vec, filter_vec);
2042  SPLATI_W4_SH(filter_vec, filt0, filt1, filt2, filt3);
2043 
2044  filter_vec = LD_SH(filter_y);
2045  UNPCK_R_SB_SH(filter_vec, filter_vec);
2046  SPLATI_W4_SH(filter_vec, filt_h0, filt_h1, filt_h2, filt_h3);
2047 
2048  mask1 = mask0 + 2;
2049  mask2 = mask0 + 4;
2050  mask3 = mask0 + 6;
2051 
2052  weight_vec = __msa_fill_w(weight);
2053  offset_vec = __msa_fill_w(offset);
2054  rnd_vec = __msa_fill_w(rnd_val);
2055 
2056  LD_SB7(src, src_stride, src0, src1, src2, src3, src4, src5, src6);
2057  src += (7 * src_stride);
2058 
2059  /* shuffle row pairs (0,3), (1,4), (2,5), (3,6) */
2060  VSHF_B4_SB(src0, src3, mask0, mask1, mask2, mask3, vec0, vec1, vec2, vec3);
2061  VSHF_B4_SB(src1, src4, mask0, mask1, mask2, mask3, vec4, vec5, vec6, vec7);
2062  VSHF_B4_SB(src2, src5, mask0, mask1, mask2, mask3,
2063  vec8, vec9, vec10, vec11);
2064  VSHF_B4_SB(src3, src6, mask0, mask1, mask2, mask3,
2065  vec12, vec13, vec14, vec15);
2066  HEVC_FILT_8TAP_4W_SH(vec0, vec1, vec2, vec3, filt0, filt1, filt2,
2067  filt3, dst0, dst3);
2068  HEVC_FILT_8TAP_4W_SH(vec4, vec5, vec6, vec7, filt0, filt1, filt2,
2069  filt3, dst1, dst4);
2070  HEVC_FILT_8TAP_4W_SH(vec8, vec9, vec10, vec11, filt0, filt1, filt2,
2071  filt3, dst2, dst5);
2072  vec0 = __msa_ilvl_b((v16i8) zero, (v16i8) vec12);
2073  vec1 = __msa_ilvl_b((v16i8) zero, (v16i8) vec13);
2074  vec2 = __msa_ilvl_b((v16i8) zero, (v16i8) vec14);
2075  vec3 = __msa_ilvl_b((v16i8) zero, (v16i8) vec15);
2076  dst6 = __msa_dotp_s_w((v8i16) vec0, (v8i16) filt0);
2077  dst6 = __msa_dpadd_s_w((v4i32) dst6, (v8i16) vec1, (v8i16) filt1);
2078  dst6 = __msa_dpadd_s_w((v4i32) dst6, (v8i16) vec2, (v8i16) filt2);
2079  dst6 = __msa_dpadd_s_w((v4i32) dst6, (v8i16) vec3, (v8i16) filt3);
2080 
2081  ILVEV_H2_SH(dst0, dst1, dst3, dst4, dst10_r, dst43_r);
2082  ILVEV_H2_SH(dst1, dst2, dst4, dst5, dst21_r, dst54_r);
2083  ILVEV_H2_SH(dst2, dst3, dst5, dst6, dst32_r, dst65_r);
2084 
2085  for (loop_cnt = height >> 2; loop_cnt--;) {
2086  LD_SB4(src, src_stride, src7, src8, src9, src10);
2087  src += (4 * src_stride);
2088 
2089  VSHF_B4_SB(src7, src9, mask0, mask1, mask2, mask3,
2090  vec0, vec1, vec2, vec3);
2091  VSHF_B4_SB(src8, src10, mask0, mask1, mask2, mask3,
2092  vec4, vec5, vec6, vec7);
2093  HEVC_FILT_8TAP_4W_SH(vec0, vec1, vec2, vec3, filt0, filt1, filt2,
2094  filt3, dst7, dst9);
2095  HEVC_FILT_8TAP_4W_SH(vec4, vec5, vec6, vec7, filt0, filt1, filt2,
2096  filt3, dst8, dst10);
2097 
2098  dst76_r = __msa_ilvev_h((v8i16) dst7, (v8i16) dst6);
2099  ILVEV_H2_SH(dst7, dst8, dst9, dst10, dst87_r, dst109_r);
2100  dst98_r = __msa_ilvev_h((v8i16) dst9, (v8i16) dst8);
2101 
2102  dst0_r = HEVC_FILT_8TAP(dst10_r, dst32_r, dst54_r, dst76_r, filt_h0,
2103  filt_h1, filt_h2, filt_h3);
2104  dst1_r = HEVC_FILT_8TAP(dst21_r, dst43_r, dst65_r, dst87_r, filt_h0,
2105  filt_h1, filt_h2, filt_h3);
2106  dst2_r = HEVC_FILT_8TAP(dst32_r, dst54_r, dst76_r, dst98_r, filt_h0,
2107  filt_h1, filt_h2, filt_h3);
2108  dst3_r = HEVC_FILT_8TAP(dst43_r, dst65_r, dst87_r, dst109_r, filt_h0,
2109  filt_h1, filt_h2, filt_h3);
2110 
2111  SRA_4V(dst0_r, dst1_r, dst2_r, dst3_r, 6);
2112  MUL2(dst0_r, weight_vec, dst1_r, weight_vec, dst0_r, dst1_r);
2113  MUL2(dst2_r, weight_vec, dst3_r, weight_vec, dst2_r, dst3_r);
2114  SRAR_W4_SW(dst0_r, dst1_r, dst2_r, dst3_r, rnd_vec);
2115  ADD2(dst0_r, offset_vec, dst1_r, offset_vec, dst0_r, dst1_r);
2116  ADD2(dst2_r, offset_vec, dst3_r, offset_vec, dst2_r, dst3_r);
2117  CLIP_SW4_0_255(dst0_r, dst1_r, dst2_r, dst3_r);
2118  PCKEV_H2_SW(dst1_r, dst0_r, dst3_r, dst2_r, dst0_r, dst1_r);
2119  out = (v16u8) __msa_pckev_b((v16i8) dst1_r, (v16i8) dst0_r);
2120  ST_W4(out, 0, 1, 2, 3, dst, dst_stride);
2121  dst += (4 * dst_stride);
2122 
2123  dst10_r = dst54_r;
2124  dst32_r = dst76_r;
2125  dst54_r = dst98_r;
2126  dst21_r = dst65_r;
2127  dst43_r = dst87_r;
2128  dst65_r = dst109_r;
2129  dst6 = dst10;
2130  }
2131 }
2132 
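/* Core 2-D 8-tap uni-weighted filter for widths that are multiples of 8:
 * seven rows of horizontally filtered context are kept as even-halfword
 * interleaves, and each inner pass filters two further rows vertically to
 * produce two 8-pixel output rows. */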
2133 static void hevc_hv_uniwgt_8t_8multx2mult_msa(const uint8_t *src,
2134  int32_t src_stride,
2135  uint8_t *dst,
2136  int32_t dst_stride,
2137  const int8_t *filter_x,
2138  const int8_t *filter_y,
2139  int32_t height,
2140  int32_t weight,
2141  int32_t offset,
2142  int32_t rnd_val,
2143  int32_t width)
2144 {
2145  uint32_t loop_cnt, cnt;
2146  const uint8_t *src_tmp;
2147  uint8_t *dst_tmp;
2148  v16i8 src0, src1, src2, src3, src4, src5, src6, src7, src8;
2149  v8i16 filt0, filt1, filt2, filt3;
2150  v8i16 filt_h0, filt_h1, filt_h2, filt_h3;
2151  v16i8 mask1, mask2, mask3;
2152  v8i16 filter_vec;
2153  v16i8 vec0, vec1, vec2, vec3, vec4, vec5, vec6, vec7;
2154  v16i8 vec8, vec9, vec10, vec11, vec12, vec13, vec14, vec15;
2155  v4i32 dst0_r, dst0_l, dst1_r, dst1_l;
2156  v8i16 dst10_r, dst32_r, dst54_r, dst76_r;
2157  v8i16 dst10_l, dst32_l, dst54_l, dst76_l;
2158  v8i16 dst21_r, dst43_r, dst65_r, dst87_r;
2159  v8i16 dst21_l, dst43_l, dst65_l, dst87_l;
2160  v4i32 weight_vec, offset_vec, rnd_vec;
2161  v4i32 dst0, dst1, dst2, dst3, dst4, dst5, dst6, dst7, dst8, dst9, dst10;
2162  v4i32 dst11, dst12, dst13, dst14, dst15;
2163  v16i8 mask0 = LD_SB(ff_hevc_mask_arr);
2164  v8i16 zero = { 0 };
2165 
2166  src -= ((3 * src_stride) + 3);
2167 
2168  weight_vec = __msa_fill_w(weight);
2169  offset_vec = __msa_fill_w(offset);
2170  rnd_vec = __msa_fill_w(rnd_val);
2171 
2172 
2173  filter_vec = LD_SH(filter_x);
2174  UNPCK_R_SB_SH(filter_vec, filter_vec);
2175  SPLATI_W4_SH(filter_vec, filt0, filt1, filt2, filt3);
2176 
2177  filter_vec = LD_SH(filter_y);
2178  UNPCK_R_SB_SH(filter_vec, filter_vec);
2179  SPLATI_W4_SH(filter_vec, filt_h0, filt_h1, filt_h2, filt_h3);
2180 
2181  mask1 = mask0 + 2;
2182  mask2 = mask0 + 4;
2183  mask3 = mask0 + 6;
2184 
2185  for (cnt = width >> 3; cnt--;) {
2186  src_tmp = src;
2187  dst_tmp = dst;
2188 
2189  LD_SB7(src_tmp, src_stride, src0, src1, src2, src3, src4, src5, src6);
2190  src_tmp += (7 * src_stride);
2191 
2192  VSHF_B4_SB(src0, src0, mask0, mask1, mask2, mask3,
2193  vec0, vec1, vec2, vec3);
2194  VSHF_B4_SB(src1, src1, mask0, mask1, mask2, mask3,
2195  vec4, vec5, vec6, vec7);
2196  VSHF_B4_SB(src2, src2, mask0, mask1, mask2, mask3,
2197  vec8, vec9, vec10, vec11);
2198  VSHF_B4_SB(src3, src3, mask0, mask1, mask2, mask3,
2199  vec12, vec13, vec14, vec15);
2200  HEVC_FILT_8TAP_4W_SH(vec0, vec1, vec2, vec3, filt0, filt1, filt2,
2201  filt3, dst0, dst1);
2202  HEVC_FILT_8TAP_4W_SH(vec4, vec5, vec6, vec7, filt0, filt1, filt2,
2203  filt3, dst2, dst3);
2204  HEVC_FILT_8TAP_4W_SH(vec8, vec9, vec10, vec11, filt0, filt1, filt2,
2205  filt3, dst4, dst5);
2206  HEVC_FILT_8TAP_4W_SH(vec12, vec13, vec14, vec15, filt0, filt1,
2207  filt2, filt3, dst6, dst7);
2208 
2209  VSHF_B4_SB(src4, src4, mask0, mask1, mask2, mask3,
2210  vec0, vec1, vec2, vec3);
2211  VSHF_B4_SB(src5, src5, mask0, mask1, mask2, mask3,
2212  vec4, vec5, vec6, vec7);
2213  VSHF_B4_SB(src6, src6, mask0, mask1, mask2, mask3,
2214  vec8, vec9, vec10, vec11);
2215  HEVC_FILT_8TAP_4W_SH(vec0, vec1, vec2, vec3, filt0, filt1, filt2,
2216  filt3, dst8, dst9);
2217  HEVC_FILT_8TAP_4W_SH(vec4, vec5, vec6, vec7, filt0, filt1, filt2,
2218  filt3, dst10, dst11);
2219  HEVC_FILT_8TAP_4W_SH(vec8, vec9, vec10, vec11, filt0, filt1, filt2,
2220  filt3, dst12, dst13);
2221 
2222  ILVEV_H2_SH(dst0, dst2, dst1, dst3, dst10_r, dst10_l);
2223  ILVEV_H2_SH(dst2, dst4, dst3, dst5, dst21_r, dst21_l);
2224  ILVEV_H2_SH(dst4, dst6, dst5, dst7, dst32_r, dst32_l);
2225  ILVEV_H2_SH(dst6, dst8, dst7, dst9, dst43_r, dst43_l);
2226  ILVEV_H2_SH(dst8, dst10, dst9, dst11, dst54_r, dst54_l);
2227  ILVEV_H2_SH(dst10, dst12, dst11, dst13, dst65_r, dst65_l);
2228 
2229  for (loop_cnt = height >> 1; loop_cnt--;) {
2230  LD_SB2(src_tmp, src_stride, src7, src8);
2231  src_tmp += 2 * src_stride;
2232 
2233  VSHF_B4_SB(src7, src7, mask0, mask1, mask2, mask3,
2234  vec0, vec1, vec2, vec3);
2235  HEVC_FILT_8TAP_4W_SH(vec0, vec1, vec2, vec3, filt0, filt1,
2236  filt2, filt3, dst14, dst15);
2237 
2238  ILVEV_H2_SH(dst12, dst14, dst13, dst15, dst76_r, dst76_l);
2239  dst0_r = HEVC_FILT_8TAP(dst10_r, dst32_r, dst54_r, dst76_r,
2240  filt_h0, filt_h1, filt_h2, filt_h3);
2241  dst0_l = HEVC_FILT_8TAP(dst10_l, dst32_l, dst54_l, dst76_l,
2242  filt_h0, filt_h1, filt_h2, filt_h3);
2243  dst0_r >>= 6;
2244  dst0_l >>= 6;
2245 
2246  /* row 8 */
2247  VSHF_B4_SB(src8, src8, mask0, mask1, mask2, mask3,
2248  vec0, vec1, vec2, vec3);
2249  HEVC_FILT_8TAP_4W_SH(vec0, vec1, vec2, vec3, filt0, filt1,
2250  filt2, filt3, dst0, dst1);
2251 
2252  ILVEV_H2_SH(dst14, dst0, dst15, dst1, dst87_r, dst87_l);
2253  dst1_r = HEVC_FILT_8TAP(dst21_r, dst43_r, dst65_r, dst87_r,
2254  filt_h0, filt_h1, filt_h2, filt_h3);
2255  dst1_l = HEVC_FILT_8TAP(dst21_l, dst43_l, dst65_l, dst87_l,
2256  filt_h0, filt_h1, filt_h2, filt_h3);
2257  dst1_r >>= 6;
2258  dst1_l >>= 6;
2259 
2260  MUL2(dst0_r, weight_vec, dst0_l, weight_vec, dst0_r, dst0_l);
2261  MUL2(dst1_r, weight_vec, dst1_l, weight_vec, dst1_r, dst1_l);
2262  SRAR_W4_SW(dst0_r, dst1_r, dst0_l, dst1_l, rnd_vec);
2263  ADD2(dst0_r, offset_vec, dst0_l, offset_vec, dst0_r, dst0_l);
2264  ADD2(dst1_r, offset_vec, dst1_l, offset_vec, dst1_r, dst1_l);
2265  CLIP_SW4_0_255(dst0_r, dst1_r, dst0_l, dst1_l);
2266 
2267  PCKEV_H2_SW(dst0_l, dst0_r, dst1_l, dst1_r, dst0_r, dst1_r);
2268  dst0_r = (v4i32) __msa_pckev_b((v16i8) dst1_r, (v16i8) dst0_r);
2269  ST_D2(dst0_r, 0, 1, dst_tmp, dst_stride);
2270  dst_tmp += (2 * dst_stride);
2271 
2272  dst10_r = dst32_r;
2273  dst32_r = dst54_r;
2274  dst54_r = dst76_r;
2275  dst10_l = dst32_l;
2276  dst32_l = dst54_l;
2277  dst54_l = dst76_l;
2278  dst21_r = dst43_r;
2279  dst43_r = dst65_r;
2280  dst65_r = dst87_r;
2281  dst21_l = dst43_l;
2282  dst43_l = dst65_l;
2283  dst65_l = dst87_l;
2284  dst12 = dst0;
2285  dst13 = dst1;
2286  }
2287 
2288  src += 8;
2289  dst += 8;
2290  }
2291 }
2292 
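/* The 8/16/24/32/48/64-wide 2-D wrappers invoke the 8-column core above
 * with the corresponding width; the 12-wide case needs its own routine
 * below. */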
2293 static void hevc_hv_uniwgt_8t_8w_msa(const uint8_t *src,
2294  int32_t src_stride,
2295  uint8_t *dst,
2296  int32_t dst_stride,
2297  const int8_t *filter_x,
2298  const int8_t *filter_y,
2299  int32_t height,
2300  int32_t weight,
2301  int32_t offset,
2302  int32_t rnd_val)
2303 {
2304  hevc_hv_uniwgt_8t_8multx2mult_msa(src, src_stride, dst, dst_stride,
2305  filter_x, filter_y, height, weight,
2306  offset, rnd_val, 8);
2307 }
2308 
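/* 12 columns wide: the left 8 columns go through the 8-column flow one
 * row at a time for 16 rows, after which the remaining 4 columns are
 * filtered with the 4-width shuffle masks, four rows per pass. */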
2309 static void hevc_hv_uniwgt_8t_12w_msa(const uint8_t *src,
2310  int32_t src_stride,
2311  uint8_t *dst,
2312  int32_t dst_stride,
2313  const int8_t *filter_x,
2314  const int8_t *filter_y,
2315  int32_t height,
2316  int32_t weight,
2317  int32_t offset,
2318  int32_t rnd_val)
2319 {
2320  uint32_t loop_cnt;
2321  const uint8_t *src_tmp;
2322  uint8_t *dst_tmp;
2323  v16u8 out;
2324  v16i8 src0, src1, src2, src3, src4, src5, src6, src7, src8, src9, src10;
2325  v16i8 mask0, mask1, mask2, mask3, mask4, mask5, mask6, mask7;
2326  v16i8 vec0, vec1, vec2, vec3, vec4, vec5, vec6, vec7;
2327  v16i8 vec8, vec9, vec10, vec11, vec12, vec13, vec14, vec15;
2328  v4i32 dst0, dst1, dst2, dst3, dst4, dst5, dst6, dst7;
2329  v4i32 dst8, dst9, dst10, dst11, dst12, dst13, dst14, dst15;
2330  v8i16 filt0, filt1, filt2, filt3, filt_h0, filt_h1, filt_h2, filt_h3;
2331  v8i16 dst10_r, dst32_r, dst54_r, dst76_r, dst10_l, dst32_l, dst54_l;
2332  v8i16 dst98_r, dst21_r, dst43_r, dst65_r, dst87_r, dst109_r;
2333  v8i16 dst76_l, filter_vec;
2334  v4i32 dst0_r, dst0_l, dst1_r, dst2_r, dst3_r;
2335  v4i32 weight_vec, offset_vec, rnd_vec;
2336  v8i16 zero = { 0 };
2337 
2338  src -= ((3 * src_stride) + 3);
2339 
2340  filter_vec = LD_SH(filter_x);
2341  UNPCK_R_SB_SH(filter_vec, filter_vec);
2342  SPLATI_W4_SH(filter_vec, filt0, filt1, filt2, filt3);
2343 
2344  filter_vec = LD_SH(filter_y);
2345  UNPCK_R_SB_SH(filter_vec, filter_vec);
2346 
2347  SPLATI_W4_SH(filter_vec, filt_h0, filt_h1, filt_h2, filt_h3);
2348 
2349  weight_vec = __msa_fill_w(weight);
2350  offset_vec = __msa_fill_w(offset);
2351  rnd_vec = __msa_fill_w(rnd_val);
2352 
2353  mask0 = LD_SB(ff_hevc_mask_arr);
2354  mask1 = mask0 + 2;
2355  mask2 = mask0 + 4;
2356  mask3 = mask0 + 6;
2357 
2358  src_tmp = src;
2359  dst_tmp = dst;
2360 
2361  LD_SB7(src_tmp, src_stride, src0, src1, src2, src3, src4, src5, src6);
2362  src_tmp += (7 * src_stride);
2363 
2364  /* rows 0-3 */
2365  VSHF_B4_SB(src0, src0, mask0, mask1, mask2, mask3, vec0, vec1, vec2, vec3);
2366  VSHF_B4_SB(src1, src1, mask0, mask1, mask2, mask3, vec4, vec5, vec6, vec7);
2367  VSHF_B4_SB(src2, src2, mask0, mask1, mask2, mask3, vec8, vec9, vec10,
2368  vec11);
2369  VSHF_B4_SB(src3, src3, mask0, mask1, mask2, mask3, vec12, vec13, vec14,
2370  vec15);
2371  HEVC_FILT_8TAP_4W_SH(vec0, vec1, vec2, vec3, filt0, filt1, filt2,
2372  filt3, dst0, dst1);
2373  HEVC_FILT_8TAP_4W_SH(vec4, vec5, vec6, vec7, filt0, filt1, filt2,
2374  filt3, dst2, dst3);
2375  HEVC_FILT_8TAP_4W_SH(vec8, vec9, vec10, vec11, filt0, filt1, filt2,
2376  filt3, dst4, dst5);
2377  HEVC_FILT_8TAP_4W_SH(vec12, vec13, vec14, vec15, filt0, filt1,
2378  filt2, filt3, dst6, dst7);
2379  VSHF_B4_SB(src4, src4, mask0, mask1, mask2, mask3, vec0, vec1, vec2, vec3);
2380  VSHF_B4_SB(src5, src5, mask0, mask1, mask2, mask3, vec4, vec5, vec6, vec7);
2381  VSHF_B4_SB(src6, src6, mask0, mask1, mask2, mask3, vec8, vec9, vec10,
2382  vec11);
2383  HEVC_FILT_8TAP_4W_SH(vec0, vec1, vec2, vec3, filt0, filt1, filt2,
2384  filt3, dst8, dst9);
2385  HEVC_FILT_8TAP_4W_SH(vec4, vec5, vec6, vec7, filt0, filt1, filt2,
2386  filt3, dst10, dst11);
2387  HEVC_FILT_8TAP_4W_SH(vec8, vec9, vec10, vec11, filt0, filt1, filt2,
2388  filt3, dst12, dst13);
2389 
2390  for (loop_cnt = 16; loop_cnt--;) {
2391  src7 = LD_SB(src_tmp);
2392  src_tmp += src_stride;
2393 
2394  VSHF_B4_SB(src7, src7, mask0, mask1, mask2, mask3, vec0, vec1, vec2,
2395  vec3);
2396  HEVC_FILT_8TAP_4W_SH(vec0, vec1, vec2, vec3, filt0, filt1, filt2,
2397  filt3, dst14, dst15);
2398  ILVEV_H2_SH(dst0, dst2, dst1, dst3, dst10_r, dst10_l);
2399  ILVEV_H2_SH(dst4, dst6, dst5, dst7, dst32_r, dst32_l);
2400  ILVEV_H2_SH(dst8, dst10, dst9, dst11, dst54_r, dst54_l);
2401  ILVEV_H2_SH(dst12, dst14, dst13, dst15, dst76_r, dst76_l);
2402 
2403  dst0_r = HEVC_FILT_8TAP(dst10_r, dst32_r, dst54_r, dst76_r,
2404  filt_h0, filt_h1, filt_h2, filt_h3);
2405  dst0_l = HEVC_FILT_8TAP(dst10_l, dst32_l, dst54_l, dst76_l,
2406  filt_h0, filt_h1, filt_h2, filt_h3);
2407  dst0_r >>= 6;
2408  dst0_l >>= 6;
2409 
2410  MUL2(dst0_r, weight_vec, dst0_l, weight_vec, dst0_r, dst0_l);
2411  SRAR_W2_SW(dst0_r, dst0_l, rnd_vec);
2412  ADD2(dst0_r, offset_vec, dst0_l, offset_vec, dst0_r, dst0_l);
2413  CLIP_SW2_0_255(dst0_r, dst0_l);
2414  dst0_r = (v4i32) __msa_pckev_h((v8i16) dst0_l, (v8i16) dst0_r);
2415  out = (v16u8) __msa_pckev_b((v16i8) dst0_r, (v16i8) dst0_r);
2416  ST_D1(out, 0, dst_tmp);
2417  dst_tmp += dst_stride;
2418 
2419  dst0 = dst2;
2420  dst1 = dst3;
2421  dst2 = dst4;
2422  dst3 = dst5;
2423  dst4 = dst6;
2424  dst5 = dst7;
2425  dst6 = dst8;
2426  dst7 = dst9;
2427  dst8 = dst10;
2428  dst9 = dst11;
2429  dst10 = dst12;
2430  dst11 = dst13;
2431  dst12 = dst14;
2432  dst13 = dst15;
2433  }
2434 
2435  src += 8;
2436  dst += 8;
2437 
2438  mask4 = LD_SB(ff_hevc_mask_arr + 16);
2439  mask5 = mask4 + 2;
2440  mask6 = mask4 + 4;
2441  mask7 = mask4 + 6;
2442 
2443  LD_SB7(src, src_stride, src0, src1, src2, src3, src4, src5, src6);
2444  src += (7 * src_stride);
2445 
2446  VSHF_B4_SB(src0, src3, mask4, mask5, mask6, mask7, vec0, vec1, vec2, vec3);
2447  VSHF_B4_SB(src1, src4, mask4, mask5, mask6, mask7, vec4, vec5, vec6, vec7);
2448  VSHF_B4_SB(src2, src5, mask4, mask5, mask6, mask7, vec8, vec9, vec10,
2449  vec11);
2450  VSHF_B4_SB(src3, src6, mask4, mask5, mask6, mask7, vec12, vec13, vec14,
2451  vec15);
2452  HEVC_FILT_8TAP_4W_SH(vec0, vec1, vec2, vec3, filt0, filt1, filt2,
2453  filt3, dst0, dst3);
2454  HEVC_FILT_8TAP_4W_SH(vec4, vec5, vec6, vec7, filt0, filt1, filt2,
2455  filt3, dst1, dst4);
2456  HEVC_FILT_8TAP_4W_SH(vec8, vec9, vec10, vec11, filt0, filt1, filt2,
2457  filt3, dst2, dst5);
2458  HEVC_FILT_8TAP_4W_SH(vec12, vec13, vec14, vec15, filt0, filt1, filt2,
2459  filt3, dst3, dst6);
2460  ILVEV_H2_SH(dst0, dst1, dst3, dst4, dst10_r, dst43_r);
2461  ILVEV_H2_SH(dst1, dst2, dst4, dst5, dst21_r, dst54_r);
2462  ILVEV_H2_SH(dst2, dst3, dst5, dst6, dst32_r, dst65_r);
2463 
2464  for (loop_cnt = 4; loop_cnt--;) {
2465  LD_SB4(src, src_stride, src7, src8, src9, src10);
2466  src += (4 * src_stride);
2467 
2468  VSHF_B4_SB(src7, src9, mask4, mask5, mask6, mask7, vec0, vec1, vec2,
2469  vec3);
2470  VSHF_B4_SB(src8, src10, mask4, mask5, mask6, mask7, vec4, vec5, vec6,
2471  vec7);
2472  HEVC_FILT_8TAP_4W_SH(vec0, vec1, vec2, vec3, filt0, filt1, filt2,
2473  filt3, dst7, dst9);
2474  HEVC_FILT_8TAP_4W_SH(vec4, vec5, vec6, vec7, filt0, filt1, filt2,
2475  filt3, dst8, dst10);
2476 
2477  ILVEV_H2_SH(dst6, dst7, dst7, dst8, dst76_r, dst87_r);
2478  ILVEV_H2_SH(dst9, dst10, dst8, dst9, dst109_r, dst98_r);
2479 
2480  dst0_r = HEVC_FILT_8TAP(dst10_r, dst32_r, dst54_r, dst76_r, filt_h0,
2481  filt_h1, filt_h2, filt_h3);
2482  dst1_r = HEVC_FILT_8TAP(dst21_r, dst43_r, dst65_r, dst87_r, filt_h0,
2483  filt_h1, filt_h2, filt_h3);
2484  dst2_r = HEVC_FILT_8TAP(dst32_r, dst54_r, dst76_r, dst98_r, filt_h0,
2485  filt_h1, filt_h2, filt_h3);
2486  dst3_r = HEVC_FILT_8TAP(dst43_r, dst65_r, dst87_r, dst109_r, filt_h0,
2487  filt_h1, filt_h2, filt_h3);
2488 
2489  SRA_4V(dst0_r, dst1_r, dst2_r, dst3_r, 6);
2490  MUL2(dst0_r, weight_vec, dst1_r, weight_vec, dst0_r, dst1_r);
2491  MUL2(dst2_r, weight_vec, dst3_r, weight_vec, dst2_r, dst3_r);
2492  SRAR_W4_SW(dst0_r, dst1_r, dst2_r, dst3_r, rnd_vec);
2493  ADD2(dst0_r, offset_vec, dst1_r, offset_vec, dst0_r, dst1_r);
2494  ADD2(dst2_r, offset_vec, dst3_r, offset_vec, dst2_r, dst3_r);
2495  CLIP_SW4_0_255(dst0_r, dst1_r, dst2_r, dst3_r);
2496  PCKEV_H2_SW(dst1_r, dst0_r, dst3_r, dst2_r, dst0_r, dst1_r);
2497  out = (v16u8) __msa_pckev_b((v16i8) dst1_r, (v16i8) dst0_r);
2498  ST_W4(out, 0, 1, 2, 3, dst, dst_stride);
2499  dst += (4 * dst_stride);
2500 
2501  dst10_r = dst54_r;
2502  dst32_r = dst76_r;
2503  dst54_r = dst98_r;
2504  dst21_r = dst65_r;
2505  dst43_r = dst87_r;
2506  dst65_r = dst109_r;
2507  dst6 = dst10;
2508  }
2509 }
2510 
2511 static void hevc_hv_uniwgt_8t_16w_msa(const uint8_t *src,
2512  int32_t src_stride,
2513  uint8_t *dst,
2514  int32_t dst_stride,
2515  const int8_t *filter_x,
2516  const int8_t *filter_y,
2517  int32_t height,
2518  int32_t weight,
2519  int32_t offset,
2520  int32_t rnd_val)
2521 {
2522  hevc_hv_uniwgt_8t_8multx2mult_msa(src, src_stride, dst, dst_stride,
2523  filter_x, filter_y, height, weight,
2524  offset, rnd_val, 16);
2525 }
2526 
2527 static void hevc_hv_uniwgt_8t_24w_msa(const uint8_t *src,
2528  int32_t src_stride,
2529  uint8_t *dst,
2530  int32_t dst_stride,
2531  const int8_t *filter_x,
2532  const int8_t *filter_y,
2533  int32_t height,
2534  int32_t weight,
2535  int32_t offset,
2536  int32_t rnd_val)
2537 {
2538  hevc_hv_uniwgt_8t_8multx2mult_msa(src, src_stride, dst, dst_stride,
2539  filter_x, filter_y, height, weight,
2540  offset, rnd_val, 24);
2541 }
2542 
2543 static void hevc_hv_uniwgt_8t_32w_msa(const uint8_t *src,
2544  int32_t src_stride,
2545  uint8_t *dst,
2546  int32_t dst_stride,
2547  const int8_t *filter_x,
2548  const int8_t *filter_y,
2549  int32_t height,
2550  int32_t weight,
2551  int32_t offset,
2552  int32_t rnd_val)
2553 {
2554  hevc_hv_uniwgt_8t_8multx2mult_msa(src, src_stride, dst, dst_stride,
2555  filter_x, filter_y, height, weight,
2556  offset, rnd_val, 32);
2557 }
2558 
2559 static void hevc_hv_uniwgt_8t_48w_msa(const uint8_t *src,
2560  int32_t src_stride,
2561  uint8_t *dst,
2562  int32_t dst_stride,
2563  const int8_t *filter_x,
2564  const int8_t *filter_y,
2565  int32_t height,
2566  int32_t weight,
2567  int32_t offset,
2568  int32_t rnd_val)
2569 {
2570  hevc_hv_uniwgt_8t_8multx2mult_msa(src, src_stride, dst, dst_stride,
2571  filter_x, filter_y, height, weight,
2572  offset, rnd_val, 48);
2573 }
2574 
2575 static void hevc_hv_uniwgt_8t_64w_msa(const uint8_t *src,
2576  int32_t src_stride,
2577  uint8_t *dst,
2578  int32_t dst_stride,
2579  const int8_t *filter_x,
2580  const int8_t *filter_y,
2581  int32_t height,
2582  int32_t weight,
2583  int32_t offset,
2584  int32_t rnd_val)
2585 {
2586  hevc_hv_uniwgt_8t_8multx2mult_msa(src, src_stride, dst, dst_stride,
2587  filter_x, filter_y, height, weight,
2588  offset, rnd_val, 64);
2589 }
2590 
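/* Horizontal 4-tap (EPEL) uni-weighted filters follow; the 4-wide
 * variants shuffle two input rows into each vector, so a single pass
 * covers two rows. */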
2591 static void hevc_hz_uniwgt_4t_4x2_msa(const uint8_t *src,
2592  int32_t src_stride,
2593  uint8_t *dst,
2594  int32_t dst_stride,
2595  const int8_t *filter,
2596  int32_t weight,
2597  int32_t offset,
2598  int32_t rnd_val)
2599 {
2600  v16u8 out;
2601  v8i16 filt0, filt1, filter_vec;
2602  v16i8 src0, src1, vec0, vec1;
2603  v8i16 tmp0, tmp1, tmp2, tmp3;
2604  v16i8 mask1;
2605  v4i32 dst0, dst1;
2606  v4i32 weight_vec, rnd_vec, offset_vec;
2607  v16i8 mask0 = LD_SB(&ff_hevc_mask_arr[16]);
2608  v8i16 zero = { 0 };
2609 
2610  src -= 1;
2611 
2612  filter_vec = LD_SH(filter);
2613  UNPCK_R_SB_SH(filter_vec, filter_vec);
2614  SPLATI_W2_SH(filter_vec, 0, filt0, filt1);
2615 
2616  mask1 = mask0 + 2;
2617 
2618  weight_vec = __msa_fill_w(weight);
2619  rnd_vec = __msa_fill_w(rnd_val);
2620  offset_vec = __msa_fill_w(offset);
2621 
2622  LD_SB2(src, src_stride, src0, src1);
2623 
2624  VSHF_B2_SB(src0, src1, src0, src1, mask0, mask1, vec0, vec1);
2625  ILVRL_B2_SH(zero, vec0, tmp0, tmp1);
2626  ILVRL_B2_SH(zero, vec1, tmp2, tmp3);
2627  dst0 = HEVC_FILT_4TAP_SW(tmp0, tmp2, filt0, filt1);
2628  dst1 = HEVC_FILT_4TAP_SW(tmp1, tmp3, filt0, filt1);
2629 
2630  MUL2(dst0, weight_vec, dst1, weight_vec, dst0, dst1);
2631  SRAR_W2_SW(dst0, dst1, rnd_vec);
2632  ADD2(dst0, offset_vec, dst1, offset_vec, dst0, dst1);
2633  CLIP_SW2_0_255(dst0, dst1);
2634  tmp0 = __msa_pckev_h((v8i16) dst1, (v8i16) dst0);
2635  out = (v16u8) __msa_pckev_b((v16i8) tmp0, (v16i8) tmp0);
2636  ST_W2(out, 0, 1, dst, dst_stride);
2638 }
2639 
2640 static void hevc_hz_uniwgt_4t_4x4_msa(const uint8_t *src,
2641  int32_t src_stride,
2642  uint8_t *dst,
2643  int32_t dst_stride,
2644  const int8_t *filter,
2645  int32_t weight,
2646  int32_t offset,
2647  int32_t rnd_val)
2648 {
2649  v16u8 out;
2650  v8i16 filt0, filt1;
2651  v16i8 src0, src1, src2, src3;
2652  v16i8 mask1, vec0, vec1, vec2, vec3;
2653  v8i16 tmp0, tmp1, tmp2, tmp3, tmp4, tmp5, tmp6, tmp7;
2654  v4i32 dst0, dst1, dst2, dst3;
2655  v8i16 filter_vec;
2656  v4i32 weight_vec, rnd_vec, offset_vec;
2657  v16i8 mask0 = LD_SB(&ff_hevc_mask_arr[16]);
2658  v8i16 zero = { 0 };
2659 
2660  src -= 1;
2661 
2662  /* rearranging filter */
2663  filter_vec = LD_SH(filter);
2664  UNPCK_R_SB_SH(filter_vec, filter_vec);
2665  SPLATI_W2_SH(filter_vec, 0, filt0, filt1);
2666 
2667  weight_vec = __msa_fill_w(weight);
2668  rnd_vec = __msa_fill_w(rnd_val);
2669  offset_vec = __msa_fill_w(offset);
2670  mask1 = mask0 + 2;
2671 
2672  LD_SB4(src, src_stride, src0, src1, src2, src3);
2673 
2674  VSHF_B2_SB(src0, src1, src0, src1, mask0, mask1, vec0, vec1);
2675  VSHF_B2_SB(src2, src3, src2, src3, mask0, mask1, vec2, vec3);
2676  ILVRL_B2_SH(zero, vec0, tmp0, tmp1);
2677  ILVRL_B2_SH(zero, vec1, tmp2, tmp3);
2678  ILVRL_B2_SH(zero, vec2, tmp4, tmp5);
2679  ILVRL_B2_SH(zero, vec3, tmp6, tmp7);
2680  dst0 = HEVC_FILT_4TAP_SW(tmp0, tmp2, filt0, filt1);
2681  dst1 = HEVC_FILT_4TAP_SW(tmp1, tmp3, filt0, filt1);
2682  dst2 = HEVC_FILT_4TAP_SW(tmp4, tmp6, filt0, filt1);
2683  dst3 = HEVC_FILT_4TAP_SW(tmp5, tmp7, filt0, filt1);
2684 
2685  MUL4(dst0, weight_vec, dst1, weight_vec, dst2, weight_vec, dst3,
2686  weight_vec, dst0, dst1, dst2, dst3);
2687  SRAR_W4_SW(dst0, dst1, dst2, dst3, rnd_vec);
2688  ADD4(dst0, offset_vec, dst1, offset_vec, dst2, offset_vec, dst3,
2689  offset_vec, dst0, dst1, dst2, dst3);
2690  CLIP_SW4_0_255(dst0, dst1, dst2, dst3);
2691  tmp0 = __msa_pckev_h((v8i16) dst1, (v8i16) dst0);
2692  tmp1 = __msa_pckev_h((v8i16) dst3, (v8i16) dst2);
2693 
2694  out = (v16u8) __msa_pckev_b((v16i8) tmp1, (v16i8) tmp0);
2695  ST_W4(out, 0, 1, 2, 3, dst, dst_stride);
2696 }
2697 
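/* 4 columns wide, eight rows per iteration; used for heights 8 and 16. */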
2698 static void hevc_hz_uniwgt_4t_4x8multiple_msa(const uint8_t *src,
2699  int32_t src_stride,
2700  uint8_t *dst,
2701  int32_t dst_stride,
2702  const int8_t *filter,
2703  int32_t height,
2704  int32_t weight,
2705  int32_t offset,
2706  int32_t rnd_val)
2707 {
2708  uint32_t loop_cnt;
2709  v16u8 out0, out1;
2710  v8i16 filt0, filt1;
2711  v16i8 src0, src1, src2, src3, src4, src5, src6, src7;
2712  v16i8 mask1, vec0, vec1, vec2, vec3, vec4, vec5, vec6, vec7;
2713  v8i16 tmp0, tmp1, tmp2, tmp3, tmp4, tmp5, tmp6, tmp7;
2714  v4i32 dst0, dst1, dst2, dst3, dst4, dst5, dst6, dst7;
2715  v8i16 filter_vec;
2716  v4i32 weight_vec, rnd_vec, offset_vec;
2717  v16i8 mask0 = LD_SB(&ff_hevc_mask_arr[16]);
2718  v8i16 zero = { 0 };
2719 
2720  src -= 1;
2721 
2722  filter_vec = LD_SH(filter);
2723  UNPCK_R_SB_SH(filter_vec, filter_vec);
2724  SPLATI_W2_SH(filter_vec, 0, filt0, filt1);
2725 
2726  weight_vec = __msa_fill_w(weight);
2727  rnd_vec = __msa_fill_w(rnd_val);
2728  offset_vec = __msa_fill_w(offset);
2729 
2730  mask1 = mask0 + 2;
2731 
2732  for (loop_cnt = (height >> 3); loop_cnt--;) {
2733  LD_SB8(src, src_stride, src0, src1, src2, src3, src4, src5, src6, src7);
2734  src += (8 * src_stride);
2735 
2736  VSHF_B2_SB(src0, src1, src0, src1, mask0, mask1, vec0, vec1);
2737  VSHF_B2_SB(src2, src3, src2, src3, mask0, mask1, vec2, vec3);
2738  VSHF_B2_SB(src4, src5, src4, src5, mask0, mask1, vec4, vec5);
2739  VSHF_B2_SB(src6, src7, src6, src7, mask0, mask1, vec6, vec7);
2740  ILVRL_B2_SH(zero, vec0, tmp0, tmp1);
2741  ILVRL_B2_SH(zero, vec1, tmp2, tmp3);
2742  ILVRL_B2_SH(zero, vec2, tmp4, tmp5);
2743  ILVRL_B2_SH(zero, vec3, tmp6, tmp7);
2744  dst0 = HEVC_FILT_4TAP_SW(tmp0, tmp2, filt0, filt1);
2745  dst1 = HEVC_FILT_4TAP_SW(tmp1, tmp3, filt0, filt1);
2746  dst2 = HEVC_FILT_4TAP_SW(tmp4, tmp6, filt0, filt1);
2747  dst3 = HEVC_FILT_4TAP_SW(tmp5, tmp7, filt0, filt1);
2748  ILVRL_B2_SH(zero, vec4, tmp0, tmp1);
2749  ILVRL_B2_SH(zero, vec5, tmp2, tmp3);
2750  ILVRL_B2_SH(zero, vec6, tmp4, tmp5);
2751  ILVRL_B2_SH(zero, vec7, tmp6, tmp7);
2752  dst4 = HEVC_FILT_4TAP_SW(tmp0, tmp2, filt0, filt1);
2753  dst5 = HEVC_FILT_4TAP_SW(tmp1, tmp3, filt0, filt1);
2754  dst6 = HEVC_FILT_4TAP_SW(tmp4, tmp6, filt0, filt1);
2755  dst7 = HEVC_FILT_4TAP_SW(tmp5, tmp7, filt0, filt1);
2756 
2757  MUL4(dst0, weight_vec, dst1, weight_vec, dst2, weight_vec, dst3,
2758  weight_vec, dst0, dst1, dst2, dst3);
2759  MUL4(dst4, weight_vec, dst5, weight_vec, dst6, weight_vec,
2760  dst7, weight_vec, dst4, dst5, dst6, dst7);
2761  SRAR_W4_SW(dst0, dst1, dst2, dst3, rnd_vec);
2762  SRAR_W4_SW(dst4, dst5, dst6, dst7, rnd_vec);
2763  ADD4(dst0, offset_vec, dst1, offset_vec, dst2, offset_vec, dst3,
2764  offset_vec, dst0, dst1, dst2, dst3);
2765  ADD4(dst4, offset_vec, dst5, offset_vec, dst6, offset_vec,
2766  dst7, offset_vec, dst4, dst5, dst6, dst7);
2767  CLIP_SW8_0_255(dst0, dst1, dst2, dst3, dst4, dst5, dst6, dst7);
2768  tmp0 = __msa_pckev_h((v8i16) dst1, (v8i16) dst0);
2769  tmp1 = __msa_pckev_h((v8i16) dst3, (v8i16) dst2);
2770  tmp2 = __msa_pckev_h((v8i16) dst5, (v8i16) dst4);
2771  tmp3 = __msa_pckev_h((v8i16) dst7, (v8i16) dst6);
2772  PCKEV_B2_UB(tmp1, tmp0, tmp3, tmp2, out0, out1);
2773  ST_W8(out0, out1, 0, 1, 2, 3, 0, 1, 2, 3, dst, dst_stride);
2774  dst += (8 * dst_stride);
2775  }
2776 }
2777 
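/* Dispatch on block height: 4-wide HEVC blocks come with heights
 * 2, 4, 8 or 16. */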
2778 static void hevc_hz_uniwgt_4t_4w_msa(const uint8_t *src,
2779  int32_t src_stride,
2780  uint8_t *dst,
2781  int32_t dst_stride,
2782  const int8_t *filter,
2783  int32_t height,
2784  int32_t weight,
2785  int32_t offset,
2786  int32_t rnd_val)
2787 {
2788  if (2 == height) {
2789  hevc_hz_uniwgt_4t_4x2_msa(src, src_stride, dst, dst_stride,
2790  filter, weight, offset, rnd_val);
2791  } else if (4 == height) {
2792  hevc_hz_uniwgt_4t_4x4_msa(src, src_stride, dst, dst_stride,
2793  filter, weight, offset, rnd_val);
2794  } else if (8 == height || 16 == height) {
2795  hevc_hz_uniwgt_4t_4x8multiple_msa(src, src_stride, dst, dst_stride,
2796  filter, height, weight,
2797  offset, rnd_val);
2798  }
2799 }
2800 
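/* 6 columns wide: eight rows are filtered as full 8-wide vectors and each
 * row is stored as one word plus one halfword. */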
2801 static void hevc_hz_uniwgt_4t_6w_msa(const uint8_t *src,
2802  int32_t src_stride,
2803  uint8_t *dst,
2804  int32_t dst_stride,
2805  const int8_t *filter,
2806  int32_t height,
2807  int32_t weight,
2808  int32_t offset,
2809  int32_t rnd_val)
2810 {
2811  v16u8 out0, out1, out2, out3;
2812  v8i16 filter_vec, filt0, filt1;
2813  v16i8 src0, src1, src2, src3, src4, src5, src6, src7;
2814  v16i8 mask0 = LD_SB(ff_hevc_mask_arr);
2815  v16i8 mask1;
2816  v16i8 vec0, vec1, vec2, vec3, vec4, vec5, vec6, vec7;
2817  v8i16 tmp0, tmp1, tmp2, tmp3, tmp4, tmp5, tmp6, tmp7;
2818  v4i32 dst0, dst1, dst2, dst3, dst4, dst5, dst6, dst7;
2819  v4i32 dst8, dst9, dst10, dst11, dst12, dst13, dst14, dst15;
2820  v4i32 weight_vec, rnd_vec, offset_vec;
2821  v8i16 zero = { 0 };
2822 
2823  src -= 1;
2824 
2825  filter_vec = LD_SH(filter);
2826  UNPCK_R_SB_SH(filter_vec, filter_vec);
2827  SPLATI_W2_SH(filter_vec, 0, filt0, filt1);
2828 
2829  weight_vec = __msa_fill_w(weight);
2830  rnd_vec = __msa_fill_w(rnd_val);
2831  offset_vec = __msa_fill_w(offset);
2832 
2833  mask1 = mask0 + 2;
2834 
2835  LD_SB8(src, src_stride, src0, src1, src2, src3, src4, src5, src6, src7);
2836  VSHF_B2_SB(src0, src0, src0, src0, mask0, mask1, vec0, vec1);
2837  VSHF_B2_SB(src1, src1, src1, src1, mask0, mask1, vec2, vec3);
2838  VSHF_B2_SB(src2, src2, src2, src2, mask0, mask1, vec4, vec5);
2839  VSHF_B2_SB(src3, src3, src3, src3, mask0, mask1, vec6, vec7);
2840  ILVRL_B2_SH(zero, vec0, tmp0, tmp1);
2841  ILVRL_B2_SH(zero, vec1, tmp2, tmp3);
2842  ILVRL_B2_SH(zero, vec2, tmp4, tmp5);
2843  ILVRL_B2_SH(zero, vec3, tmp6, tmp7);
2844  dst0 = HEVC_FILT_4TAP_SW(tmp0, tmp2, filt0, filt1);
2845  dst1 = HEVC_FILT_4TAP_SW(tmp1, tmp3, filt0, filt1);
2846  dst2 = HEVC_FILT_4TAP_SW(tmp4, tmp6, filt0, filt1);
2847  dst3 = HEVC_FILT_4TAP_SW(tmp5, tmp7, filt0, filt1);
2848  ILVRL_B2_SH(zero, vec4, tmp0, tmp1);
2849  ILVRL_B2_SH(zero, vec5, tmp2, tmp3);
2850  ILVRL_B2_SH(zero, vec6, tmp4, tmp5);
2851  ILVRL_B2_SH(zero, vec7, tmp6, tmp7);
2852  dst4 = HEVC_FILT_4TAP_SW(tmp0, tmp2, filt0, filt1);
2853  dst5 = HEVC_FILT_4TAP_SW(tmp1, tmp3, filt0, filt1);
2854  dst6 = HEVC_FILT_4TAP_SW(tmp4, tmp6, filt0, filt1);
2855  dst7 = HEVC_FILT_4TAP_SW(tmp5, tmp7, filt0, filt1);
2856  VSHF_B2_SB(src4, src4, src4, src4, mask0, mask1, vec0, vec1);
2857  VSHF_B2_SB(src5, src5, src5, src5, mask0, mask1, vec2, vec3);
2858  VSHF_B2_SB(src6, src6, src6, src6, mask0, mask1, vec4, vec5);
2859  VSHF_B2_SB(src7, src7, src7, src7, mask0, mask1, vec6, vec7);
2860  ILVRL_B2_SH(zero, vec0, tmp0, tmp1);
2861  ILVRL_B2_SH(zero, vec1, tmp2, tmp3);
2862  ILVRL_B2_SH(zero, vec2, tmp4, tmp5);
2863  ILVRL_B2_SH(zero, vec3, tmp6, tmp7);
2864  dst8 = HEVC_FILT_4TAP_SW(tmp0, tmp2, filt0, filt1);
2865  dst9 = HEVC_FILT_4TAP_SW(tmp1, tmp3, filt0, filt1);
2866  dst10 = HEVC_FILT_4TAP_SW(tmp4, tmp6, filt0, filt1);
2867  dst11 = HEVC_FILT_4TAP_SW(tmp5, tmp7, filt0, filt1);
2868  ILVRL_B2_SH(zero, vec4, tmp0, tmp1);
2869  ILVRL_B2_SH(zero, vec5, tmp2, tmp3);
2870  ILVRL_B2_SH(zero, vec6, tmp4, tmp5);
2871  ILVRL_B2_SH(zero, vec7, tmp6, tmp7);
2872  dst12 = HEVC_FILT_4TAP_SW(tmp0, tmp2, filt0, filt1);
2873  dst13 = HEVC_FILT_4TAP_SW(tmp1, tmp3, filt0, filt1);
2874  dst14 = HEVC_FILT_4TAP_SW(tmp4, tmp6, filt0, filt1);
2875  dst15 = HEVC_FILT_4TAP_SW(tmp5, tmp7, filt0, filt1);
2876 
2877  MUL4(dst0, weight_vec, dst1, weight_vec, dst2, weight_vec, dst3,
2878  weight_vec, dst0, dst1, dst2, dst3);
2879  MUL4(dst4, weight_vec, dst5, weight_vec, dst6, weight_vec,
2880  dst7, weight_vec, dst4, dst5, dst6, dst7);
2881  SRAR_W4_SW(dst0, dst1, dst2, dst3, rnd_vec);
2882  SRAR_W4_SW(dst4, dst5, dst6, dst7, rnd_vec);
2883  ADD4(dst0, offset_vec, dst1, offset_vec, dst2, offset_vec, dst3,
2884  offset_vec, dst0, dst1, dst2, dst3);
2885  ADD4(dst4, offset_vec, dst5, offset_vec, dst6, offset_vec,
2886  dst7, offset_vec, dst4, dst5, dst6, dst7);
2887  CLIP_SW8_0_255(dst0, dst1, dst2, dst3, dst4, dst5, dst6, dst7);
2888 
2889  MUL4(dst8, weight_vec, dst9, weight_vec, dst10, weight_vec, dst11,
2890  weight_vec, dst8, dst9, dst10, dst11);
2891  MUL4(dst12, weight_vec, dst13, weight_vec, dst14, weight_vec,
2892  dst15, weight_vec, dst12, dst13, dst14, dst15);
2893  SRAR_W4_SW(dst8, dst9, dst10, dst11, rnd_vec);
2894  SRAR_W4_SW(dst12, dst13, dst14, dst15, rnd_vec);
2895  ADD4(dst8, offset_vec, dst9, offset_vec, dst10, offset_vec, dst11,
2896  offset_vec, dst8, dst9, dst10, dst11);
2897  ADD4(dst12, offset_vec, dst13, offset_vec, dst14, offset_vec,
2898  dst15, offset_vec, dst12, dst13, dst14, dst15);
2899  CLIP_SW8_0_255(dst8, dst9, dst10, dst11, dst12, dst13, dst14, dst15);
2900 
2901  PCKEV_H2_SH(dst1, dst0, dst3, dst2, tmp0, tmp1);
2902  PCKEV_H2_SH(dst5, dst4, dst7, dst6, tmp2, tmp3);
2903  PCKEV_H2_SH(dst9, dst8, dst11, dst10, tmp4, tmp5);
2904  PCKEV_H2_SH(dst13, dst12, dst15, dst14, tmp6, tmp7);
2905  PCKEV_B2_UB(tmp1, tmp0, tmp3, tmp2, out0, out1);
2906  PCKEV_B2_UB(tmp5, tmp4, tmp7, tmp6, out2, out3);
2907  ST_W2(out0, 0, 2, dst, dst_stride);
2908  ST_H2(out0, 2, 6, dst + 4, dst_stride);
2909  ST_W2(out1, 0, 2, dst + 2 * dst_stride, dst_stride);
2910  ST_H2(out1, 2, 6, dst + 2 * dst_stride + 4, dst_stride);
2911  dst += (4 * dst_stride);
2912  ST_W2(out2, 0, 2, dst, dst_stride);
2913  ST_H2(out2, 2, 6, dst + 4, dst_stride);
2914  ST_W2(out3, 0, 2, dst + 2 * dst_stride, dst_stride);
2915  ST_H2(out3, 2, 6, dst + 2 * dst_stride + 4, dst_stride);
2916 }
2917 
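/* 8 columns wide, 2 rows: every row is computed as two 4-lane halves
 * (right and left zero-extended byte interleaves) before the common
 * weighting sequence. */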
2918 static void hevc_hz_uniwgt_4t_8x2_msa(const uint8_t *src,
2919  int32_t src_stride,
2920  uint8_t *dst,
2921  int32_t dst_stride,
2922  const int8_t *filter,
2923  int32_t weight,
2924  int32_t offset,
2925  int32_t rnd_val)
2926 {
2927  v16u8 out;
2928  v8i16 filter_vec, filt0, filt1;
2929  v16i8 src0, src1;
2930  v16i8 mask0 = LD_SB(ff_hevc_mask_arr);
2931  v16i8 mask1;
2932  v16i8 vec0, vec1, vec2, vec3;
2933  v8i16 tmp0, tmp1, tmp2, tmp3, tmp4, tmp5, tmp6, tmp7;
2934  v4i32 weight_vec, rnd_vec, offset_vec;
2935  v4i32 dst0, dst1, dst2, dst3;
2936  v8i16 zero = { 0 };
2937 
2938  src -= 1;
2939 
2940  filter_vec = LD_SH(filter);
2941  UNPCK_R_SB_SH(filter_vec, filter_vec);
2942  SPLATI_W2_SH(filter_vec, 0, filt0, filt1);
2943 
2944  weight_vec = __msa_fill_w(weight);
2945  rnd_vec = __msa_fill_w(rnd_val);
2946  offset_vec = __msa_fill_w(offset);
2947  mask1 = mask0 + 2;
2948 
2949  LD_SB2(src, src_stride, src0, src1);
2950 
2951  VSHF_B2_SB(src0, src0, src0, src0, mask0, mask1, vec0, vec1);
2952  VSHF_B2_SB(src1, src1, src1, src1, mask0, mask1, vec2, vec3);
2953  ILVRL_B2_SH(zero, vec0, tmp0, tmp1);
2954  ILVRL_B2_SH(zero, vec1, tmp2, tmp3);
2955  ILVRL_B2_SH(zero, vec2, tmp4, tmp5);
2956  ILVRL_B2_SH(zero, vec3, tmp6, tmp7);
2957  dst0 = HEVC_FILT_4TAP_SW(tmp0, tmp2, filt0, filt1);
2958  dst1 = HEVC_FILT_4TAP_SW(tmp1, tmp3, filt0, filt1);
2959  dst2 = HEVC_FILT_4TAP_SW(tmp4, tmp6, filt0, filt1);
2960  dst3 = HEVC_FILT_4TAP_SW(tmp5, tmp7, filt0, filt1);
2961 
2962  MUL4(dst0, weight_vec, dst1, weight_vec, dst2, weight_vec, dst3,
2963  weight_vec, dst0, dst1, dst2, dst3);
2964  SRAR_W4_SW(dst0, dst1, dst2, dst3, rnd_vec);
2965  ADD4(dst0, offset_vec, dst1, offset_vec, dst2, offset_vec, dst3,
2966  offset_vec, dst0, dst1, dst2, dst3);
2967  CLIP_SW4_0_255(dst0, dst1, dst2, dst3);
2968  PCKEV_H2_SH(dst1, dst0, dst3, dst2, tmp0, tmp1);
2969 
2970  out = (v16u8) __msa_pckev_b((v16i8) tmp1, (v16i8) tmp0);
2971  ST_D2(out, 0, 1, dst, dst_stride);
2972 }
2973 
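/* 8 columns wide, 4 rows. */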
2974 static void hevc_hz_uniwgt_4t_8x4_msa(const uint8_t *src,
2975  int32_t src_stride,
2976  uint8_t *dst,
2977  int32_t dst_stride,
2978  const int8_t *filter,
2979  int32_t weight,
2980  int32_t offset,
2981  int32_t rnd_val)
2982 {
2983  v16u8 out0, out1;
2984  v16i8 src0, src1, src2, src3;
2985  v16i8 mask0, mask1, vec0, vec1, vec2, vec3, vec4, vec5, vec6, vec7;
2986  v8i16 filter_vec, filt0, filt1;
2987  v4i32 dst0, dst1, dst2, dst3, dst4, dst5, dst6, dst7;
2988  v8i16 tmp0, tmp1, tmp2, tmp3, tmp4, tmp5, tmp6, tmp7;
2989  v4i32 weight_vec, rnd_vec, offset_vec;
2990  v8i16 zero = { 0 };
2991 
2992  src -= 1;
2993 
2994  filter_vec = LD_SH(filter);
2995  UNPCK_R_SB_SH(filter_vec, filter_vec);
2996  SPLATI_W2_SH(filter_vec, 0, filt0, filt1);
2997 
2998  weight_vec = __msa_fill_w(weight);
2999  rnd_vec = __msa_fill_w(rnd_val);
3000  offset_vec = __msa_fill_w(offset);
3001 
3002  mask0 = LD_SB(&ff_hevc_mask_arr[0]);
3003  mask1 = mask0 + 2;
3004 
3005  LD_SB4(src, src_stride, src0, src1, src2, src3);
3006  VSHF_B2_SB(src0, src0, src0, src0, mask0, mask1, vec0, vec1);
3007  VSHF_B2_SB(src1, src1, src1, src1, mask0, mask1, vec2, vec3);
3008  VSHF_B2_SB(src2, src2, src2, src2, mask0, mask1, vec4, vec5);
3009  VSHF_B2_SB(src3, src3, src3, src3, mask0, mask1, vec6, vec7);
3010  ILVRL_B2_SH(zero, vec0, tmp0, tmp1);
3011  ILVRL_B2_SH(zero, vec1, tmp2, tmp3);
3012  ILVRL_B2_SH(zero, vec2, tmp4, tmp5);
3013  ILVRL_B2_SH(zero, vec3, tmp6, tmp7);
3014  dst0 = HEVC_FILT_4TAP_SW(tmp0, tmp2, filt0, filt1);
3015  dst1 = HEVC_FILT_4TAP_SW(tmp1, tmp3, filt0, filt1);
3016  dst2 = HEVC_FILT_4TAP_SW(tmp4, tmp6, filt0, filt1);
3017  dst3 = HEVC_FILT_4TAP_SW(tmp5, tmp7, filt0, filt1);
3018  ILVRL_B2_SH(zero, vec4, tmp0, tmp1);
3019  ILVRL_B2_SH(zero, vec5, tmp2, tmp3);
3020  ILVRL_B2_SH(zero, vec6, tmp4, tmp5);
3021  ILVRL_B2_SH(zero, vec7, tmp6, tmp7);
3022  dst4 = HEVC_FILT_4TAP_SW(tmp0, tmp2, filt0, filt1);
3023  dst5 = HEVC_FILT_4TAP_SW(tmp1, tmp3, filt0, filt1);
3024  dst6 = HEVC_FILT_4TAP_SW(tmp4, tmp6, filt0, filt1);
3025  dst7 = HEVC_FILT_4TAP_SW(tmp5, tmp7, filt0, filt1);
3026 
3027  MUL4(dst0, weight_vec, dst1, weight_vec, dst2, weight_vec, dst3,
3028  weight_vec, dst0, dst1, dst2, dst3);
3029  MUL4(dst4, weight_vec, dst5, weight_vec, dst6, weight_vec,
3030  dst7, weight_vec, dst4, dst5, dst6, dst7);
3031  SRAR_W4_SW(dst0, dst1, dst2, dst3, rnd_vec);
3032  SRAR_W4_SW(dst4, dst5, dst6, dst7, rnd_vec);
3033  ADD4(dst0, offset_vec, dst1, offset_vec, dst2, offset_vec, dst3,
3034  offset_vec, dst0, dst1, dst2, dst3);
3035  ADD4(dst4, offset_vec, dst5, offset_vec, dst6, offset_vec,
3036  dst7, offset_vec, dst4, dst5, dst6, dst7);
3037  CLIP_SW8_0_255(dst0, dst1, dst2, dst3, dst4, dst5, dst6, dst7);
3038 
3039  PCKEV_H2_SH(dst1, dst0, dst3, dst2, tmp0, tmp1);
3040  PCKEV_H2_SH(dst5, dst4, dst7, dst6, tmp2, tmp3);
3041 
3042  PCKEV_B2_UB(tmp1, tmp0, tmp3, tmp2, out0, out1);
3043  ST_D4(out0, out1, 0, 1, 0, 1, dst, dst_stride);
3044 }
3045 
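/* 8x6 horizontal variant: the last two rows do not fill a group of eight,
 * so they take a separate weight/round/offset pass ending in
 * CLIP_SW4_0_255. */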
3046 static void hevc_hz_uniwgt_4t_8x6_msa(const uint8_t *src,
3047  int32_t src_stride,
3048  uint8_t *dst,
3049  int32_t dst_stride,
3050  const int8_t *filter,
3051  int32_t weight,
3052  int32_t offset,
3053  int32_t rnd_val)
3054 {
3055  v16u8 out0, out1, out2;
3056  v8i16 filter_vec, filt0, filt1;
3057  v16i8 src0, src1, src2, src3, src4, src5;
3058  v16i8 mask0 = LD_SB(&ff_hevc_mask_arr[0]);
3059  v16i8 mask1;
3060  v16i8 vec11;
3061  v16i8 vec0, vec1, vec2, vec3, vec4, vec5, vec6, vec7, vec8, vec9, vec10;
3062  v4i32 dst0, dst1, dst2, dst3, dst4, dst5, dst6, dst7;
3063  v4i32 dst8, dst9, dst10, dst11;
3064  v8i16 tmp0, tmp1, tmp2, tmp3, tmp4, tmp5, tmp6, tmp7;
3065  v4i32 weight_vec, rnd_vec, offset_vec;
3066  v8i16 zero = { 0 };
3067 
3068  src -= 1;
3069 
3070  filter_vec = LD_SH(filter);
3071  UNPCK_R_SB_SH(filter_vec, filter_vec);
3072  SPLATI_W2_SH(filter_vec, 0, filt0, filt1);
3073 
3074  weight_vec = __msa_fill_w(weight);
3075  rnd_vec = __msa_fill_w(rnd_val);
3076  offset_vec = __msa_fill_w(offset);
3077 
3078  mask1 = mask0 + 2;
3079 
3080  LD_SB6(src, src_stride, src0, src1, src2, src3, src4, src5);
3081 
3082  VSHF_B2_SB(src0, src0, src0, src0, mask0, mask1, vec0, vec1);
3083  VSHF_B2_SB(src1, src1, src1, src1, mask0, mask1, vec2, vec3);
3084  VSHF_B2_SB(src2, src2, src2, src2, mask0, mask1, vec4, vec5);
3085  VSHF_B2_SB(src3, src3, src3, src3, mask0, mask1, vec6, vec7);
3086  VSHF_B2_SB(src4, src4, src4, src4, mask0, mask1, vec8, vec9);
3087  VSHF_B2_SB(src5, src5, src5, src5, mask0, mask1, vec10, vec11);
3088  ILVRL_B2_SH(zero, vec0, tmp0, tmp1);
3089  ILVRL_B2_SH(zero, vec1, tmp2, tmp3);
3090  ILVRL_B2_SH(zero, vec2, tmp4, tmp5);
3091  ILVRL_B2_SH(zero, vec3, tmp6, tmp7);
3092  dst0 = HEVC_FILT_4TAP_SW(tmp0, tmp2, filt0, filt1);
3093  dst1 = HEVC_FILT_4TAP_SW(tmp1, tmp3, filt0, filt1);
3094  dst2 = HEVC_FILT_4TAP_SW(tmp4, tmp6, filt0, filt1);
3095  dst3 = HEVC_FILT_4TAP_SW(tmp5, tmp7, filt0, filt1);
3096  ILVRL_B2_SH(zero, vec4, tmp0, tmp1);
3097  ILVRL_B2_SH(zero, vec5, tmp2, tmp3);
3098  ILVRL_B2_SH(zero, vec6, tmp4, tmp5);
3099  ILVRL_B2_SH(zero, vec7, tmp6, tmp7);
3100  dst4 = HEVC_FILT_4TAP_SW(tmp0, tmp2, filt0, filt1);
3101  dst5 = HEVC_FILT_4TAP_SW(tmp1, tmp3, filt0, filt1);
3102  dst6 = HEVC_FILT_4TAP_SW(tmp4, tmp6, filt0, filt1);
3103  dst7 = HEVC_FILT_4TAP_SW(tmp5, tmp7, filt0, filt1);
3104  ILVRL_B2_SH(zero, vec8, tmp0, tmp1);
3105  ILVRL_B2_SH(zero, vec9, tmp2, tmp3);
3106  ILVRL_B2_SH(zero, vec10, tmp4, tmp5);
3107  ILVRL_B2_SH(zero, vec11, tmp6, tmp7);
3108  dst8 = HEVC_FILT_4TAP_SW(tmp0, tmp2, filt0, filt1);
3109  dst9 = HEVC_FILT_4TAP_SW(tmp1, tmp3, filt0, filt1);
3110  dst10 = HEVC_FILT_4TAP_SW(tmp4, tmp6, filt0, filt1);
3111  dst11 = HEVC_FILT_4TAP_SW(tmp5, tmp7, filt0, filt1);
3112 
3113  MUL4(dst0, weight_vec, dst1, weight_vec, dst2, weight_vec, dst3,
3114  weight_vec, dst0, dst1, dst2, dst3);
3115  MUL4(dst4, weight_vec, dst5, weight_vec, dst6, weight_vec,
3116  dst7, weight_vec, dst4, dst5, dst6, dst7);
3117  SRAR_W4_SW(dst0, dst1, dst2, dst3, rnd_vec);
3118  SRAR_W4_SW(dst4, dst5, dst6, dst7, rnd_vec);
3119  ADD4(dst0, offset_vec, dst1, offset_vec, dst2, offset_vec, dst3,
3120  offset_vec, dst0, dst1, dst2, dst3);
3121  ADD4(dst4, offset_vec, dst5, offset_vec, dst6, offset_vec,
3122  dst7, offset_vec, dst4, dst5, dst6, dst7);
3123  CLIP_SW8_0_255(dst0, dst1, dst2, dst3, dst4, dst5, dst6, dst7);
3124 
3125  MUL4(dst8, weight_vec, dst9, weight_vec, dst10, weight_vec, dst11,
3126  weight_vec, dst8, dst9, dst10, dst11);
3127  SRAR_W4_SW(dst8, dst9, dst10, dst11, rnd_vec);
3128  ADD4(dst8, offset_vec, dst9, offset_vec, dst10, offset_vec, dst11,
3129  offset_vec, dst8, dst9, dst10, dst11);
3130  CLIP_SW4_0_255(dst8, dst9, dst10, dst11);
3131 
3132  PCKEV_H2_SH(dst1, dst0, dst3, dst2, tmp0, tmp1);
3133  PCKEV_H2_SH(dst5, dst4, dst7, dst6, tmp2, tmp3);
3134  PCKEV_H2_SH(dst9, dst8, dst11, dst10, tmp4, tmp5);
3135 
3136  PCKEV_B3_UB(tmp1, tmp0, tmp3, tmp2, tmp5, tmp4, out0, out1, out2);
3137  ST_D4(out0, out1, 0, 1, 0, 1, dst, dst_stride);
3138  ST_D2(out2, 0, 1, dst + 4 * dst_stride, dst_stride);
3139 }
3140 
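/* 8xN horizontal (N a multiple of 8): eight rows per loop iteration. */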
3141 static void hevc_hz_uniwgt_4t_8x8multiple_msa(const uint8_t *src,
3142  int32_t src_stride,
3143  uint8_t *dst,
3144  int32_t dst_stride,
3145  const int8_t *filter,
3146  int32_t height,
3147  int32_t weight,
3148  int32_t offset,
3149  int32_t rnd_val)
3150 {
3151  uint32_t loop_cnt;
3152  v8i16 filter_vec, filt0, filt1;
3153  v16u8 out0, out1, out2, out3;
3154  v16i8 src0, src1, src2, src3, src4, src5, src6, src7;
3155  v16i8 mask0 = LD_SB(&ff_hevc_mask_arr[0]);
3156  v16i8 mask1;
3157  v16i8 vec0, vec1, vec2, vec3, vec4, vec5, vec6, vec7;
3158  v8i16 tmp0, tmp1, tmp2, tmp3, tmp4, tmp5, tmp6, tmp7;
3159  v4i32 dst0, dst1, dst2, dst3, dst4, dst5, dst6, dst7;
3160  v4i32 dst8, dst9, dst10, dst11, dst12, dst13, dst14, dst15;
3161  v4i32 weight_vec, rnd_vec, offset_vec;
3162  v8i16 zero = { 0 };
3163 
3164  src -= 1;
3165 
3166  filter_vec = LD_SH(filter);
3167  UNPCK_R_SB_SH(filter_vec, filter_vec);
3168  SPLATI_W2_SH(filter_vec, 0, filt0, filt1);
3169 
3170  weight_vec = __msa_fill_w(weight);
3171  rnd_vec = __msa_fill_w(rnd_val);
3172  offset_vec = __msa_fill_w(offset);
3173 
3174  mask1 = mask0 + 2;
3175 
3176  for (loop_cnt = (height >> 3); loop_cnt--;) {
3177  LD_SB8(src, src_stride, src0, src1, src2, src3, src4, src5, src6, src7);
3178  src += (8 * src_stride);
3179 
3180  VSHF_B2_SB(src0, src0, src0, src0, mask0, mask1, vec0, vec1);
3181  VSHF_B2_SB(src1, src1, src1, src1, mask0, mask1, vec2, vec3);
3182  VSHF_B2_SB(src2, src2, src2, src2, mask0, mask1, vec4, vec5);
3183  VSHF_B2_SB(src3, src3, src3, src3, mask0, mask1, vec6, vec7);
3184 
3185  ILVRL_B2_SH(zero, vec0, tmp0, tmp1);
3186  ILVRL_B2_SH(zero, vec1, tmp2, tmp3);
3187  ILVRL_B2_SH(zero, vec2, tmp4, tmp5);
3188  ILVRL_B2_SH(zero, vec3, tmp6, tmp7);
3189  dst0 = HEVC_FILT_4TAP_SW(tmp0, tmp2, filt0, filt1);
3190  dst1 = HEVC_FILT_4TAP_SW(tmp1, tmp3, filt0, filt1);
3191  dst2 = HEVC_FILT_4TAP_SW(tmp4, tmp6, filt0, filt1);
3192  dst3 = HEVC_FILT_4TAP_SW(tmp5, tmp7, filt0, filt1);
3193  ILVRL_B2_SH(zero, vec4, tmp0, tmp1);
3194  ILVRL_B2_SH(zero, vec5, tmp2, tmp3);
3195  ILVRL_B2_SH(zero, vec6, tmp4, tmp5);
3196  ILVRL_B2_SH(zero, vec7, tmp6, tmp7);
3197  dst4 = HEVC_FILT_4TAP_SW(tmp0, tmp2, filt0, filt1);
3198  dst5 = HEVC_FILT_4TAP_SW(tmp1, tmp3, filt0, filt1);
3199  dst6 = HEVC_FILT_4TAP_SW(tmp4, tmp6, filt0, filt1);
3200  dst7 = HEVC_FILT_4TAP_SW(tmp5, tmp7, filt0, filt1);
3201  VSHF_B2_SB(src4, src4, src4, src4, mask0, mask1, vec0, vec1);
3202  VSHF_B2_SB(src5, src5, src5, src5, mask0, mask1, vec2, vec3);
3203  VSHF_B2_SB(src6, src6, src6, src6, mask0, mask1, vec4, vec5);
3204  VSHF_B2_SB(src7, src7, src7, src7, mask0, mask1, vec6, vec7);
3205  ILVRL_B2_SH(zero, vec0, tmp0, tmp1);
3206  ILVRL_B2_SH(zero, vec1, tmp2, tmp3);
3207  ILVRL_B2_SH(zero, vec2, tmp4, tmp5);
3208  ILVRL_B2_SH(zero, vec3, tmp6, tmp7);
3209  dst8 = HEVC_FILT_4TAP_SW(tmp0, tmp2, filt0, filt1);
3210  dst9 = HEVC_FILT_4TAP_SW(tmp1, tmp3, filt0, filt1);
3211  dst10 = HEVC_FILT_4TAP_SW(tmp4, tmp6, filt0, filt1);
3212  dst11 = HEVC_FILT_4TAP_SW(tmp5, tmp7, filt0, filt1);
3213  ILVRL_B2_SH(zero, vec4, tmp0, tmp1);
3214  ILVRL_B2_SH(zero, vec5, tmp2, tmp3);
3215  ILVRL_B2_SH(zero, vec6, tmp4, tmp5);
3216  ILVRL_B2_SH(zero, vec7, tmp6, tmp7);
3217  dst12 = HEVC_FILT_4TAP_SW(tmp0, tmp2, filt0, filt1);
3218  dst13 = HEVC_FILT_4TAP_SW(tmp1, tmp3, filt0, filt1);
3219  dst14 = HEVC_FILT_4TAP_SW(tmp4, tmp6, filt0, filt1);
3220  dst15 = HEVC_FILT_4TAP_SW(tmp5, tmp7, filt0, filt1);
3221 
3222  MUL4(dst0, weight_vec, dst1, weight_vec, dst2, weight_vec, dst3,
3223  weight_vec, dst0, dst1, dst2, dst3);
3224  MUL4(dst4, weight_vec, dst5, weight_vec, dst6, weight_vec,
3225  dst7, weight_vec, dst4, dst5, dst6, dst7);
3226  SRAR_W4_SW(dst0, dst1, dst2, dst3, rnd_vec);
3227  SRAR_W4_SW(dst4, dst5, dst6, dst7, rnd_vec);
3228  ADD4(dst0, offset_vec, dst1, offset_vec, dst2, offset_vec, dst3,
3229  offset_vec, dst0, dst1, dst2, dst3);
3230  ADD4(dst4, offset_vec, dst5, offset_vec, dst6, offset_vec,
3231  dst7, offset_vec, dst4, dst5, dst6, dst7);
3232  CLIP_SW8_0_255(dst0, dst1, dst2, dst3, dst4, dst5, dst6, dst7);
3233 
3234  MUL4(dst8, weight_vec, dst9, weight_vec, dst10, weight_vec, dst11,
3235  weight_vec, dst8, dst9, dst10, dst11);
3236  MUL4(dst12, weight_vec, dst13, weight_vec, dst14, weight_vec,
3237  dst15, weight_vec, dst12, dst13, dst14, dst15);
3238  SRAR_W4_SW(dst8, dst9, dst10, dst11, rnd_vec);
3239  SRAR_W4_SW(dst12, dst13, dst14, dst15, rnd_vec);
3240  ADD4(dst8, offset_vec, dst9, offset_vec, dst10, offset_vec, dst11,
3241  offset_vec, dst8, dst9, dst10, dst11);
3242  ADD4(dst12, offset_vec, dst13, offset_vec, dst14, offset_vec,
3243  dst15, offset_vec, dst12, dst13, dst14, dst15);
3244  CLIP_SW8_0_255(dst8, dst9, dst10, dst11, dst12, dst13, dst14, dst15);
3245 
3246  PCKEV_H2_SH(dst1, dst0, dst3, dst2, tmp0, tmp1);
3247  PCKEV_H2_SH(dst5, dst4, dst7, dst6, tmp2, tmp3);
3248  PCKEV_H2_SH(dst9, dst8, dst11, dst10, tmp4, tmp5);
3249  PCKEV_H2_SH(dst13, dst12, dst15, dst14, tmp6, tmp7);
3250 
3251  PCKEV_B2_UB(tmp1, tmp0, tmp3, tmp2, out0, out1);
3252  PCKEV_B2_UB(tmp5, tmp4, tmp7, tmp6, out2, out3);
3253  ST_D8(out0, out1, out2, out3, 0, 1, 0, 1, 0, 1, 0, 1, dst, dst_stride);
3254  dst += (8 * dst_stride);
3255  }
3256 }
3257 
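/* Width-8 horizontal dispatcher: fixed-height kernels for heights 2, 4
 * and 6, otherwise the multiple-of-8 loop. */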
3258 static void hevc_hz_uniwgt_4t_8w_msa(const uint8_t *src,
3259  int32_t src_stride,
3260  uint8_t *dst,
3261  int32_t dst_stride,
3262  const int8_t *filter,
3263  int32_t height,
3264  int32_t weight,
3265  int32_t offset,
3266  int32_t rnd_val)
3267 {
3268  if (2 == height) {
3269  hevc_hz_uniwgt_4t_8x2_msa(src, src_stride, dst, dst_stride,
3270  filter, weight, offset, rnd_val);
3271  } else if (4 == height) {
3272  hevc_hz_uniwgt_4t_8x4_msa(src, src_stride, dst, dst_stride,
3273  filter, weight, offset, rnd_val);
3274  } else if (6 == height) {
3275  hevc_hz_uniwgt_4t_8x6_msa(src, src_stride, dst, dst_stride,
3276  filter, weight, offset, rnd_val);
3277  } else {
3278  hevc_hz_uniwgt_4t_8x8multiple_msa(src, src_stride, dst, dst_stride,
3279                                           filter, height, weight, offset,
3280                                           rnd_val);
3281  }
3282 }
3283 
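/* Width 12: mask0/mask1 cover the left 8 columns of each row, while
 * mask2/mask3 gather the remaining 4 columns from a pair of rows. Four
 * rows per iteration; loop_cnt is fixed at 4 (the height argument is not
 * consulted, so the kernel always produces 16 rows). */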
3284 static void hevc_hz_uniwgt_4t_12w_msa(const uint8_t *src,
3285  int32_t src_stride,
3286  uint8_t *dst,
3287  int32_t dst_stride,
3288  const int8_t *filter,
3289  int32_t height,
3290  int32_t weight,
3291  int32_t offset,
3292  int32_t rnd_val)
3293 {
3294  uint32_t loop_cnt;
3295  v16u8 out0, out1, out2;
3296  v8i16 filter_vec, filt0, filt1;
3297  v16i8 src0, src1, src2, src3;
3298  v16i8 mask0 = LD_SB(&ff_hevc_mask_arr[0]);
3299  v16i8 mask2 = { 8, 9, 9, 10, 10, 11, 11, 12,
3300                  24, 25, 25, 26, 26, 27, 27, 28 };
3301  v16i8 mask1;
3302  v16i8 vec0, vec1, vec2, vec3, vec4, vec5, vec6, vec7, vec8, vec9, vec10;
3303  v4i32 dst0, dst1, dst2, dst3, dst4, dst5, dst6, dst7;
3304  v4i32 dst8, dst9, dst10, dst11;
3305  v8i16 tmp0, tmp1, tmp2, tmp3, tmp4, tmp5, tmp6, tmp7;
3306  v16i8 mask3, vec11;
3307  v4i32 weight_vec, rnd_vec, offset_vec;
3308  v8i16 zero = { 0 };
3309 
3310  src -= 1;
3311 
3312  filter_vec = LD_SH(filter);
3313  UNPCK_R_SB_SH(filter_vec, filter_vec);
3314  SPLATI_W2_SH(filter_vec, 0, filt0, filt1);
3315 
3316  weight_vec = __msa_fill_w(weight);
3317  rnd_vec = __msa_fill_w(rnd_val);
3318  offset_vec = __msa_fill_w(offset);
3319 
3320  mask1 = mask0 + 2;
3321  mask3 = mask2 + 2;
3322 
3323  for (loop_cnt = 4; loop_cnt--;) {
3324  LD_SB4(src, src_stride, src0, src1, src2, src3);
3325  src += (4 * src_stride);
3326 
3327  VSHF_B2_SB(src0, src0, src0, src0, mask0, mask1, vec0, vec1);
3328  VSHF_B2_SB(src1, src1, src1, src1, mask0, mask1, vec2, vec3);
3329  VSHF_B2_SB(src2, src2, src2, src2, mask0, mask1, vec4, vec5);
3330  VSHF_B2_SB(src3, src3, src3, src3, mask0, mask1, vec6, vec7);
3331  VSHF_B2_SB(src0, src1, src0, src1, mask2, mask3, vec8, vec9);
3332  VSHF_B2_SB(src2, src3, src2, src3, mask2, mask3, vec10, vec11);
3333 
3334  ILVRL_B2_SH(zero, vec0, tmp0, tmp1);
3335  ILVRL_B2_SH(zero, vec1, tmp2, tmp3);
3336  ILVRL_B2_SH(zero, vec2, tmp4, tmp5);
3337  ILVRL_B2_SH(zero, vec3, tmp6, tmp7);
3338  dst0 = HEVC_FILT_4TAP_SW(tmp0, tmp2, filt0, filt1);
3339  dst1 = HEVC_FILT_4TAP_SW(tmp1, tmp3, filt0, filt1);
3340  dst2 = HEVC_FILT_4TAP_SW(tmp4, tmp6, filt0, filt1);
3341  dst3 = HEVC_FILT_4TAP_SW(tmp5, tmp7, filt0, filt1);
3342  ILVRL_B2_SH(zero, vec4, tmp0, tmp1);
3343  ILVRL_B2_SH(zero, vec5, tmp2, tmp3);
3344  ILVRL_B2_SH(zero, vec6, tmp4, tmp5);
3345  ILVRL_B2_SH(zero, vec7, tmp6, tmp7);
3346  dst4 = HEVC_FILT_4TAP_SW(tmp0, tmp2, filt0, filt1);
3347  dst5 = HEVC_FILT_4TAP_SW(tmp1, tmp3, filt0, filt1);
3348  dst6 = HEVC_FILT_4TAP_SW(tmp4, tmp6, filt0, filt1);
3349  dst7 = HEVC_FILT_4TAP_SW(tmp5, tmp7, filt0, filt1);
3350  ILVRL_B2_SH(zero, vec8, tmp0, tmp1);
3351  ILVRL_B2_SH(zero, vec9, tmp2, tmp3);
3352  ILVRL_B2_SH(zero, vec10, tmp4, tmp5);
3353  ILVRL_B2_SH(zero, vec11, tmp6, tmp7);
3354  dst8 = HEVC_FILT_4TAP_SW(tmp0, tmp2, filt0, filt1);
3355  dst9 = HEVC_FILT_4TAP_SW(tmp1, tmp3, filt0, filt1);
3356  dst10 = HEVC_FILT_4TAP_SW(tmp4, tmp6, filt0, filt1);
3357  dst11 = HEVC_FILT_4TAP_SW(tmp5, tmp7, filt0, filt1);
3358 
3359  MUL4(dst0, weight_vec, dst1, weight_vec, dst2, weight_vec, dst3,
3360  weight_vec, dst0, dst1, dst2, dst3);
3361  MUL4(dst4, weight_vec, dst5, weight_vec, dst6, weight_vec,
3362  dst7, weight_vec, dst4, dst5, dst6, dst7);
3363  SRAR_W4_SW(dst0, dst1, dst2, dst3, rnd_vec);
3364  SRAR_W4_SW(dst4, dst5, dst6, dst7, rnd_vec);
3365  ADD4(dst0, offset_vec, dst1, offset_vec, dst2, offset_vec, dst3,
3366  offset_vec, dst0, dst1, dst2, dst3);
3367  ADD4(dst4, offset_vec, dst5, offset_vec, dst6, offset_vec,
3368  dst7, offset_vec, dst4, dst5, dst6, dst7);
3369  CLIP_SW8_0_255(dst0, dst1, dst2, dst3, dst4, dst5, dst6, dst7);
3370 
3371  MUL4(dst8, weight_vec, dst9, weight_vec, dst10, weight_vec, dst11,
3372  weight_vec, dst8, dst9, dst10, dst11);
3373  SRAR_W4_SW(dst8, dst9, dst10, dst11, rnd_vec);
3374  ADD4(dst8, offset_vec, dst9, offset_vec, dst10, offset_vec, dst11,
3375  offset_vec, dst8, dst9, dst10, dst11);
3376  CLIP_SW4_0_255(dst8, dst9, dst10, dst11);
3377 
3378  PCKEV_H2_SH(dst1, dst0, dst3, dst2, tmp0, tmp1);
3379  PCKEV_H2_SH(dst5, dst4, dst7, dst6, tmp2, tmp3);
3380  PCKEV_H2_SH(dst9, dst8, dst11, dst10, tmp4, tmp5);
3381 
3382  PCKEV_B3_UB(tmp1, tmp0, tmp3, tmp2, tmp5, tmp4, out0, out1, out2);
3383  ST_D4(out0, out1, 0, 1, 0, 1, dst, dst_stride);
3384  ST_W4(out2, 0, 1, 2, 3, dst + 8, dst_stride);
3385  dst += (4 * dst_stride);
3386  }
3387 }
3388 
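/* Width 16: each row is loaded as two 8-byte halves (src and src + 8);
 * four rows per loop iteration. */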
3389 static void hevc_hz_uniwgt_4t_16w_msa(const uint8_t *src,
3390  int32_t src_stride,
3391  uint8_t *dst,
3392  int32_t dst_stride,
3393  const int8_t *filter,
3394  int32_t height,
3395  int32_t weight,
3396  int32_t offset,
3397  int32_t rnd_val)
3398 {
3399  uint32_t loop_cnt;
3400  v16u8 out0, out1, out2, out3;
3401  v16i8 src0, src1, src2, src3, src4, src5, src6, src7;
3402  v8i16 filter_vec, filt0, filt1;
3403  v16i8 mask0 = LD_SB(&ff_hevc_mask_arr[0]);
3404  v16i8 mask1;
3405  v16i8 vec0, vec1, vec2, vec3, vec4, vec5, vec6, vec7;
3406  v4i32 dst0, dst1, dst2, dst3, dst4, dst5, dst6, dst7;
3407  v4i32 dst8, dst9, dst10, dst11, dst12, dst13, dst14, dst15;
3408  v8i16 tmp0, tmp1, tmp2, tmp3, tmp4, tmp5, tmp6, tmp7;
3409  v4i32 weight_vec, rnd_vec, offset_vec;
3410  v8i16 zero = { 0 };
3411 
3412  src -= 1;
3413 
3414  filter_vec = LD_SH(filter);
3415  UNPCK_R_SB_SH(filter_vec, filter_vec);
3416  SPLATI_W2_SH(filter_vec, 0, filt0, filt1);
3417 
3418  weight_vec = __msa_fill_w(weight);
3419  rnd_vec = __msa_fill_w(rnd_val);
3420  offset_vec = __msa_fill_w(offset);
3421 
3422  mask1 = mask0 + 2;
3423 
3424  for (loop_cnt = (height >> 2); loop_cnt--;) {
3425  LD_SB4(src, src_stride, src0, src2, src4, src6);
3426  LD_SB4(src + 8, src_stride, src1, src3, src5, src7);
3427  src += (4 * src_stride);
3428 
3429  VSHF_B2_SB(src0, src0, src0, src0, mask0, mask1, vec0, vec1);
3430  VSHF_B2_SB(src1, src1, src1, src1, mask0, mask1, vec2, vec3);
3431  VSHF_B2_SB(src2, src2, src2, src2, mask0, mask1, vec4, vec5);
3432  VSHF_B2_SB(src3, src3, src3, src3, mask0, mask1, vec6, vec7);
3433  ILVRL_B2_SH(zero, vec0, tmp0, tmp1);
3434  ILVRL_B2_SH(zero, vec1, tmp2, tmp3);
3435  ILVRL_B2_SH(zero, vec2, tmp4, tmp5);
3436  ILVRL_B2_SH(zero, vec3, tmp6, tmp7);
3437  dst0 = HEVC_FILT_4TAP_SW(tmp0, tmp2, filt0, filt1);
3438  dst1 = HEVC_FILT_4TAP_SW(tmp1, tmp3, filt0, filt1);
3439  dst2 = HEVC_FILT_4TAP_SW(tmp4, tmp6, filt0, filt1);
3440  dst3 = HEVC_FILT_4TAP_SW(tmp5, tmp7, filt0, filt1);
3441  ILVRL_B2_SH(zero, vec4, tmp0, tmp1);
3442  ILVRL_B2_SH(zero, vec5, tmp2, tmp3);
3443  ILVRL_B2_SH(zero, vec6, tmp4, tmp5);
3444  ILVRL_B2_SH(zero, vec7, tmp6, tmp7);
3445  dst4 = HEVC_FILT_4TAP_SW(tmp0, tmp2, filt0, filt1);
3446  dst5 = HEVC_FILT_4TAP_SW(tmp1, tmp3, filt0, filt1);
3447  dst6 = HEVC_FILT_4TAP_SW(tmp4, tmp6, filt0, filt1);
3448  dst7 = HEVC_FILT_4TAP_SW(tmp5, tmp7, filt0, filt1);
3449 
3450  VSHF_B2_SB(src4, src4, src4, src4, mask0, mask1, vec0, vec1);
3451  VSHF_B2_SB(src5, src5, src5, src5, mask0, mask1, vec2, vec3);
3452  VSHF_B2_SB(src6, src6, src6, src6, mask0, mask1, vec4, vec5);
3453  VSHF_B2_SB(src7, src7, src7, src7, mask0, mask1, vec6, vec7);
3454  ILVRL_B2_SH(zero, vec0, tmp0, tmp1);
3455  ILVRL_B2_SH(zero, vec1, tmp2, tmp3);
3456  ILVRL_B2_SH(zero, vec2, tmp4, tmp5);
3457  ILVRL_B2_SH(zero, vec3, tmp6, tmp7);
3458  dst8 = HEVC_FILT_4TAP_SW(tmp0, tmp2, filt0, filt1);
3459  dst9 = HEVC_FILT_4TAP_SW(tmp1, tmp3, filt0, filt1);
3460  dst10 = HEVC_FILT_4TAP_SW(tmp4, tmp6, filt0, filt1);
3461  dst11 = HEVC_FILT_4TAP_SW(tmp5, tmp7, filt0, filt1);
3462  ILVRL_B2_SH(zero, vec4, tmp0, tmp1);
3463  ILVRL_B2_SH(zero, vec5, tmp2, tmp3);
3464  ILVRL_B2_SH(zero, vec6, tmp4, tmp5);
3465  ILVRL_B2_SH(zero, vec7, tmp6, tmp7);
3466  dst12 = HEVC_FILT_4TAP_SW(tmp0, tmp2, filt0, filt1);
3467  dst13 = HEVC_FILT_4TAP_SW(tmp1, tmp3, filt0, filt1);
3468  dst14 = HEVC_FILT_4TAP_SW(tmp4, tmp6, filt0, filt1);
3469  dst15 = HEVC_FILT_4TAP_SW(tmp5, tmp7, filt0, filt1);
3470 
3471  MUL4(dst0, weight_vec, dst1, weight_vec, dst2, weight_vec, dst3,
3472  weight_vec, dst0, dst1, dst2, dst3);
3473  MUL4(dst4, weight_vec, dst5, weight_vec, dst6, weight_vec,
3474  dst7, weight_vec, dst4, dst5, dst6, dst7);
3475  SRAR_W4_SW(dst0, dst1, dst2, dst3, rnd_vec);
3476  SRAR_W4_SW(dst4, dst5, dst6, dst7, rnd_vec);
3477  ADD4(dst0, offset_vec, dst1, offset_vec, dst2, offset_vec, dst3,
3478  offset_vec, dst0, dst1, dst2, dst3);
3479  ADD4(dst4, offset_vec, dst5, offset_vec, dst6, offset_vec,
3480  dst7, offset_vec, dst4, dst5, dst6, dst7);
3481  CLIP_SW8_0_255(dst0, dst1, dst2, dst3, dst4, dst5, dst6, dst7);
3482 
3483  MUL4(dst8, weight_vec, dst9, weight_vec, dst10, weight_vec, dst11,
3484  weight_vec, dst8, dst9, dst10, dst11);
3485  MUL4(dst12, weight_vec, dst13, weight_vec, dst14, weight_vec,
3486  dst15, weight_vec, dst12, dst13, dst14, dst15);
3487  SRAR_W4_SW(dst8, dst9, dst10, dst11, rnd_vec);
3488  SRAR_W4_SW(dst12, dst13, dst14, dst15, rnd_vec);
3489  ADD4(dst8, offset_vec, dst9, offset_vec, dst10, offset_vec, dst11,
3490  offset_vec, dst8, dst9, dst10, dst11);
3491  ADD4(dst12, offset_vec, dst13, offset_vec, dst14, offset_vec,
3492  dst15, offset_vec, dst12, dst13, dst14, dst15);
3493  CLIP_SW8_0_255(dst8, dst9, dst10, dst11, dst12, dst13, dst14, dst15);
3494 
3495  PCKEV_H2_SH(dst1, dst0, dst3, dst2, tmp0, tmp1);
3496  PCKEV_H2_SH(dst5, dst4, dst7, dst6, tmp2, tmp3);
3497  PCKEV_H2_SH(dst9, dst8, dst11, dst10, tmp4, tmp5);
3498  PCKEV_H2_SH(dst13, dst12, dst15, dst14, tmp6, tmp7);
3499  PCKEV_B4_UB(tmp1, tmp0, tmp3, tmp2, tmp5, tmp4, tmp7, tmp6,
3500  out0, out1, out2, out3);
3501 
3502  ST_UB4(out0, out1, out2, out3, dst, dst_stride);
3503  dst += (4 * dst_stride);
3504  }
3505 }
3506 
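/* Width 24: the first 16 columns of each row come from mask0-mask3 over
 * the two 16-byte loads; the last 8 columns are filtered from the second
 * load alone. Two rows per iteration. */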
3507 static void hevc_hz_uniwgt_4t_24w_msa(const uint8_t *src,
3508  int32_t src_stride,
3509  uint8_t *dst,
3510  int32_t dst_stride,
3511  const int8_t *filter,
3512  int32_t height,
3513  int32_t weight,
3514  int32_t offset,
3515  int32_t rnd_val)
3516 {
3517  uint32_t loop_cnt;
3518  v16u8 out0, out1, out2;
3519  v16i8 src0, src1, src2, src3;
3520  v8i16 filter_vec, filt0, filt1;
3521  v16i8 mask0, mask1, mask2, mask3;
3522  v16i8 vec0, vec1, vec2, vec3, vec4, vec5, vec6, vec7;
3523  v8i16 tmp0, tmp1, tmp2, tmp3, tmp4, tmp5, tmp6, tmp7;
3524  v4i32 dst0, dst1, dst2, dst3, dst4, dst5, dst6, dst7;
3525  v4i32 dst8, dst9, dst10, dst11;
3526  v4i32 weight_vec, rnd_vec, offset_vec;
3527  v8i16 zero = { 0 };
3528 
3529  src -= 1;
3530 
3531  filter_vec = LD_SH(filter);
3532  UNPCK_R_SB_SH(filter_vec, filter_vec);
3533  SPLATI_W2_SH(filter_vec, 0, filt0, filt1);
3534 
3535  weight_vec = __msa_fill_w(weight);
3536  rnd_vec = __msa_fill_w(rnd_val);
3537  offset_vec = __msa_fill_w(offset);
3538 
3539  mask0 = LD_SB(&ff_hevc_mask_arr[0]);
3540  mask1 = mask0 + 2;
3541  mask2 = mask0 + 8;
3542  mask3 = mask0 + 10;
3543 
3544  for (loop_cnt = 16; loop_cnt--;) {
3545  LD_SB2(src, src_stride, src0, src2);
3546  LD_SB2(src + 16, src_stride, src1, src3);
3547  src += (2 * src_stride);
3548 
3549  VSHF_B2_SB(src0, src0, src0, src0, mask0, mask1, vec0, vec1);
3550  VSHF_B2_SB(src0, src1, src0, src1, mask2, mask3, vec2, vec3);
3551  VSHF_B2_SB(src2, src2, src2, src2, mask0, mask1, vec4, vec5);
3552  VSHF_B2_SB(src2, src3, src2, src3, mask2, mask3, vec6, vec7);
3553  ILVRL_B2_SH(zero, vec0, tmp0, tmp1);
3554  ILVRL_B2_SH(zero, vec1, tmp2, tmp3);
3555  ILVRL_B2_SH(zero, vec2, tmp4, tmp5);
3556  ILVRL_B2_SH(zero, vec3, tmp6, tmp7);
3557  dst0 = HEVC_FILT_4TAP_SW(tmp0, tmp2, filt0, filt1);
3558  dst1 = HEVC_FILT_4TAP_SW(tmp1, tmp3, filt0, filt1);
3559  dst2 = HEVC_FILT_4TAP_SW(tmp4, tmp6, filt0, filt1);
3560  dst3 = HEVC_FILT_4TAP_SW(tmp5, tmp7, filt0, filt1);
3561  ILVRL_B2_SH(zero, vec4, tmp0, tmp1);
3562  ILVRL_B2_SH(zero, vec5, tmp2, tmp3);
3563  ILVRL_B2_SH(zero, vec6, tmp4, tmp5);
3564  ILVRL_B2_SH(zero, vec7, tmp6, tmp7);
3565  dst4 = HEVC_FILT_4TAP_SW(tmp0, tmp2, filt0, filt1);
3566  dst5 = HEVC_FILT_4TAP_SW(tmp1, tmp3, filt0, filt1);
3567  dst6 = HEVC_FILT_4TAP_SW(tmp4, tmp6, filt0, filt1);
3568  dst7 = HEVC_FILT_4TAP_SW(tmp5, tmp7, filt0, filt1);
3569 
3570  VSHF_B2_SB(src1, src1, src1, src1, mask0, mask1, vec0, vec1);
3571  VSHF_B2_SB(src3, src3, src3, src3, mask0, mask1, vec2, vec3);
3572  ILVRL_B2_SH(zero, vec0, tmp0, tmp1);
3573  ILVRL_B2_SH(zero, vec1, tmp2, tmp3);
3574  ILVRL_B2_SH(zero, vec2, tmp4, tmp5);
3575  ILVRL_B2_SH(zero, vec3, tmp6, tmp7);
3576  dst8 = HEVC_FILT_4TAP_SW(tmp0, tmp2, filt0, filt1);
3577  dst9 = HEVC_FILT_4TAP_SW(tmp1, tmp3, filt0, filt1);
3578  dst10 = HEVC_FILT_4TAP_SW(tmp4, tmp6, filt0, filt1);
3579  dst11 = HEVC_FILT_4TAP_SW(tmp5, tmp7, filt0, filt1);
3580 
3581  MUL4(dst0, weight_vec, dst1, weight_vec, dst2, weight_vec, dst3,
3582  weight_vec, dst0, dst1, dst2, dst3);
3583  MUL4(dst4, weight_vec, dst5, weight_vec, dst6, weight_vec,
3584  dst7, weight_vec, dst4, dst5, dst6, dst7);
3585  SRAR_W4_SW(dst0, dst1, dst2, dst3, rnd_vec);
3586  SRAR_W4_SW(dst4, dst5, dst6, dst7, rnd_vec);
3587  ADD4(dst0, offset_vec, dst1, offset_vec, dst2, offset_vec, dst3,
3588  offset_vec, dst0, dst1, dst2, dst3);
3589  ADD4(dst4, offset_vec, dst5, offset_vec, dst6, offset_vec,
3590  dst7, offset_vec, dst4, dst5, dst6, dst7);
3591  CLIP_SW8_0_255(dst0, dst1, dst2, dst3, dst4, dst5, dst6, dst7);
3592 
3593  MUL4(dst8, weight_vec, dst9, weight_vec, dst10, weight_vec, dst11,
3594  weight_vec, dst8, dst9, dst10, dst11);
3595  SRAR_W4_SW(dst8, dst9, dst10, dst11, rnd_vec);
3596  ADD4(dst8, offset_vec, dst9, offset_vec, dst10, offset_vec, dst11,
3597  offset_vec, dst8, dst9, dst10, dst11);
3598  CLIP_SW4_0_255(dst8, dst9, dst10, dst11);
3599 
3600  PCKEV_H2_SH(dst1, dst0, dst3, dst2, tmp0, tmp1);
3601  PCKEV_H2_SH(dst5, dst4, dst7, dst6, tmp2, tmp3);
3602  PCKEV_H2_SH(dst9, dst8, dst11, dst10, tmp4, tmp5);
3603 
3604  PCKEV_B3_UB(tmp1, tmp0, tmp3, tmp2, tmp5, tmp4, out0, out1, out2);
3605  ST_UB2(out0, out1, dst, dst_stride);
3606  ST_D2(out2, 0, 1, dst + 16, dst_stride);
3607  dst += (2 * dst_stride);
3608  }
3609 }
3610 
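/* Width 32: three loads per row (offsets 0, 16 and 24) so every shuffle
 * has the bytes it needs; two rows per iteration. */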
3611 static void hevc_hz_uniwgt_4t_32w_msa(const uint8_t *src,
3612  int32_t src_stride,
3613  uint8_t *dst,
3614  int32_t dst_stride,
3615  const int8_t *filter,
3616  int32_t height,
3617  int32_t weight,
3618  int32_t offset,
3619  int32_t rnd_val)
3620 {
3621  uint32_t loop_cnt;
3622  v16u8 out0, out1, out2, out3;
3623  v16i8 src0, src1, src2, src3, src4, src5;
3624  v8i16 filter_vec, filt0, filt1;
3625  v16i8 mask0 = LD_SB(&ff_hevc_mask_arr[0]);
3626  v16i8 mask1, mask2, mask3;
3627  v16i8 vec0, vec1, vec2, vec3, vec4, vec5, vec6, vec7;
3628  v8i16 tmp0, tmp1, tmp2, tmp3, tmp4, tmp5, tmp6, tmp7;
3629  v4i32 dst0, dst1, dst2, dst3, dst4, dst5, dst6, dst7;
3630  v4i32 dst8, dst9, dst10, dst11, dst12, dst13, dst14, dst15;
3631  v4i32 weight_vec, rnd_vec, offset_vec;
3632  v8i16 zero = { 0 };
3633 
3634  src -= 1;
3635 
3636  filter_vec = LD_SH(filter);
3637  UNPCK_R_SB_SH(filter_vec, filter_vec);
3638  SPLATI_W2_SH(filter_vec, 0, filt0, filt1);
3639 
3640  weight_vec = __msa_fill_w(weight);
3641  rnd_vec = __msa_fill_w(rnd_val);
3642  offset_vec = __msa_fill_w(offset);
3643 
3644  mask1 = mask0 + 2;
3645  mask2 = mask0 + 8;
3646  mask3 = mask0 + 10;
3647 
3648  for (loop_cnt = (height >> 1); loop_cnt--;) {
3649  LD_SB2(src, 16, src0, src1);
3650  src2 = LD_SB(src + 24);
3651  src += src_stride;
3652  LD_SB2(src, 16, src3, src4);
3653  src5 = LD_SB(src + 24);
3654  src += src_stride;
3655  VSHF_B2_SB(src0, src0, src0, src0, mask0, mask1, vec0, vec1);
3656  VSHF_B2_SB(src0, src1, src0, src1, mask2, mask3, vec2, vec3);
3657  VSHF_B2_SB(src1, src1, src1, src1, mask0, mask1, vec4, vec5);
3658  VSHF_B2_SB(src2, src2, src2, src2, mask0, mask1, vec6, vec7);
3659  ILVRL_B2_SH(zero, vec0, tmp0, tmp1);
3660  ILVRL_B2_SH(zero, vec1, tmp2, tmp3);
3661  ILVRL_B2_SH(zero, vec2, tmp4, tmp5);
3662  ILVRL_B2_SH(zero, vec3, tmp6, tmp7);
3663  dst0 = HEVC_FILT_4TAP_SW(tmp0, tmp2, filt0, filt1);
3664  dst1 = HEVC_FILT_4TAP_SW(tmp1, tmp3, filt0, filt1);
3665  dst2 = HEVC_FILT_4TAP_SW(tmp4, tmp6, filt0, filt1);
3666  dst3 = HEVC_FILT_4TAP_SW(tmp5, tmp7, filt0, filt1);
3667  ILVRL_B2_SH(zero, vec4, tmp0, tmp1);
3668  ILVRL_B2_SH(zero, vec5, tmp2, tmp3);
3669  ILVRL_B2_SH(zero, vec6, tmp4, tmp5);
3670  ILVRL_B2_SH(zero, vec7, tmp6, tmp7);
3671  dst4 = HEVC_FILT_4TAP_SW(tmp0, tmp2, filt0, filt1);
3672  dst5 = HEVC_FILT_4TAP_SW(tmp1, tmp3, filt0, filt1);
3673  dst6 = HEVC_FILT_4TAP_SW(tmp4, tmp6, filt0, filt1);
3674  dst7 = HEVC_FILT_4TAP_SW(tmp5, tmp7, filt0, filt1);
3675 
3676  VSHF_B2_SB(src3, src3, src3, src3, mask0, mask1, vec0, vec1);
3677  VSHF_B2_SB(src3, src4, src3, src4, mask2, mask3, vec2, vec3);
3678  VSHF_B2_SB(src4, src4, src4, src4, mask0, mask1, vec4, vec5);
3679  VSHF_B2_SB(src5, src5, src5, src5, mask0, mask1, vec6, vec7);
3680  ILVRL_B2_SH(zero, vec0, tmp0, tmp1);
3681  ILVRL_B2_SH(zero, vec1, tmp2, tmp3);
3682  ILVRL_B2_SH(zero, vec2, tmp4, tmp5);
3683  ILVRL_B2_SH(zero, vec3, tmp6, tmp7);
3684  dst8 = HEVC_FILT_4TAP_SW(tmp0, tmp2, filt0, filt1);
3685  dst9 = HEVC_FILT_4TAP_SW(tmp1, tmp3, filt0, filt1);
3686  dst10 = HEVC_FILT_4TAP_SW(tmp4, tmp6, filt0, filt1);
3687  dst11 = HEVC_FILT_4TAP_SW(tmp5, tmp7, filt0, filt1);
3688  ILVRL_B2_SH(zero, vec4, tmp0, tmp1);
3689  ILVRL_B2_SH(zero, vec5, tmp2, tmp3);
3690  ILVRL_B2_SH(zero, vec6, tmp4, tmp5);
3691  ILVRL_B2_SH(zero, vec7, tmp6, tmp7);
3692  dst12 = HEVC_FILT_4TAP_SW(tmp0, tmp2, filt0, filt1);
3693  dst13 = HEVC_FILT_4TAP_SW(tmp1, tmp3, filt0, filt1);
3694  dst14 = HEVC_FILT_4TAP_SW(tmp4, tmp6, filt0, filt1);
3695  dst15 = HEVC_FILT_4TAP_SW(tmp5, tmp7, filt0, filt1);
3696 
3697  MUL4(dst0, weight_vec, dst1, weight_vec, dst2, weight_vec, dst3,
3698  weight_vec, dst0, dst1, dst2, dst3);
3699  MUL4(dst4, weight_vec, dst5, weight_vec, dst6, weight_vec,
3700  dst7, weight_vec, dst4, dst5, dst6, dst7);
3701  SRAR_W4_SW(dst0, dst1, dst2, dst3, rnd_vec);
3702  SRAR_W4_SW(dst4, dst5, dst6, dst7, rnd_vec);
3703  ADD4(dst0, offset_vec, dst1, offset_vec, dst2, offset_vec, dst3,
3704  offset_vec, dst0, dst1, dst2, dst3);
3705  ADD4(dst4, offset_vec, dst5, offset_vec, dst6, offset_vec,
3706  dst7, offset_vec, dst4, dst5, dst6, dst7);
3707  CLIP_SW8_0_255(dst0, dst1, dst2, dst3, dst4, dst5, dst6, dst7);
3708 
3709  MUL4(dst8, weight_vec, dst9, weight_vec, dst10, weight_vec, dst11,
3710  weight_vec, dst8, dst9, dst10, dst11);
3711  MUL4(dst12, weight_vec, dst13, weight_vec, dst14, weight_vec,
3712  dst15, weight_vec, dst12, dst13, dst14, dst15);
3713  SRAR_W4_SW(dst8, dst9, dst10, dst11, rnd_vec);
3714  SRAR_W4_SW(dst12, dst13, dst14, dst15, rnd_vec);
3715  ADD4(dst8, offset_vec, dst9, offset_vec, dst10, offset_vec, dst11,
3716  offset_vec, dst8, dst9, dst10, dst11);
3717  ADD4(dst12, offset_vec, dst13, offset_vec, dst14, offset_vec,
3718  dst15, offset_vec, dst12, dst13, dst14, dst15);
3719  CLIP_SW8_0_255(dst8, dst9, dst10, dst11, dst12, dst13, dst14, dst15);
3720 
3721  PCKEV_H2_SH(dst1, dst0, dst3, dst2, tmp0, tmp1);
3722  PCKEV_H2_SH(dst5, dst4, dst7, dst6, tmp2, tmp3);
3723  PCKEV_H2_SH(dst9, dst8, dst11, dst10, tmp4, tmp5);
3724  PCKEV_H2_SH(dst13, dst12, dst15, dst14, tmp6, tmp7);
3725  PCKEV_B4_UB(tmp1, tmp0, tmp3, tmp2, tmp5, tmp4, tmp7, tmp6,
3726  out0, out1, out2, out3);
3727  ST_UB2(out0, out1, dst, 16);
3728  dst += dst_stride;
3729  ST_UB2(out2, out3, dst, 16);
3730  dst += dst_stride;
3731  }
3732 }
3733 
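/* Vertical 4-tap filter with uni-directional weighted prediction for a
 * 4x2 block: consecutive rows are interleaved and packed two per vector. */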
3734 static void hevc_vt_uniwgt_4t_4x2_msa(const uint8_t *src,
3735  int32_t src_stride,
3736  uint8_t *dst,
3737  int32_t dst_stride,
3738  const int8_t *filter,
3739  int32_t weight,
3740  int32_t offset,
3741  int32_t rnd_val)
3742 {
3743  v16u8 out;
3744  v16i8 src0, src1, src2, src3, src4;
3745  v16i8 src10_r, src32_r, src21_r, src43_r;
3746  v16i8 src2110, src4332;
3747  v8i16 dst0;
3748  v4i32 dst0_r, dst0_l;
3749  v8i16 filter_vec, filt0, filt1;
3750  v8i16 tmp0, tmp1, tmp2, tmp3;
3751  v4i32 weight_vec, rnd_vec, offset_vec;
3752  v8i16 zero = { 0 };
3753 
3754  src -= src_stride;
3755 
3756  weight_vec = __msa_fill_w(weight);
3757  rnd_vec = __msa_fill_w(rnd_val);
3758  offset_vec = __msa_fill_w(offset);
3759 
3760  filter_vec = LD_SH(filter);
3761  UNPCK_R_SB_SH(filter_vec, filter_vec);
3762  SPLATI_W2_SH(filter_vec, 0, filt0, filt1);
3763 
3764  LD_SB5(src, src_stride, src0, src1, src2, src3, src4);
3765  ILVR_B2_SB(src1, src0, src2, src1, src10_r, src21_r);
3766  ILVR_B2_SB(src3, src2, src4, src3, src32_r, src43_r);
3767  ILVR_D2_SB(src21_r, src10_r, src43_r, src32_r, src2110, src4332);
3768  ILVRL_B2_SH(zero, src2110, tmp0, tmp1);
3769  ILVRL_B2_SH(zero, src4332, tmp2, tmp3);
3770 
3771  dst0_r = HEVC_FILT_4TAP_SW(tmp0, tmp2, filt0, filt1);
3772  dst0_l = HEVC_FILT_4TAP_SW(tmp1, tmp3, filt0, filt1);
3773 
3774  MUL2(dst0_r, weight_vec, dst0_l, weight_vec, dst0_r, dst0_l);
3775  SRAR_W2_SW(dst0_r, dst0_l, rnd_vec);
3776  ADD2(dst0_r, offset_vec, dst0_l, offset_vec, dst0_r, dst0_l);
3777  CLIP_SW2_0_255(dst0_r, dst0_l);
3778  dst0 = __msa_pckev_h((v8i16) dst0_l, (v8i16) dst0_r);
3779  out = (v16u8) __msa_pckev_b((v16i8) dst0, (v16i8) dst0);
3780  ST_W2(out, 0, 1, dst, dst_stride);
3781 }
3782 
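/* 4x4 vertical variant: seven input rows produce four weighted output
 * rows. */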
3783 static void hevc_vt_uniwgt_4t_4x4_msa(const uint8_t *src,
3784  int32_t src_stride,
3785  uint8_t *dst,
3786  int32_t dst_stride,
3787  const int8_t *filter,
3788  int32_t weight,
3789  int32_t offset,
3790  int32_t rnd_val)
3791 {
3792  v16u8 out;
3793  v16i8 src0, src1, src2, src3, src4, src5, src6;
3794  v16i8 src10_r, src32_r, src54_r, src21_r, src43_r, src65_r;
3795  v16i8 src2110, src4332, src6554;
3796  v4i32 dst0, dst1, dst2, dst3;
3797  v8i16 tmp0, tmp1, tmp2, tmp3, tmp4, tmp5;
3798  v8i16 filter_vec, filt0, filt1;
3799  v4i32 weight_vec, rnd_vec, offset_vec;
3800  v8i16 zero = { 0 };
3801 
3802  src -= src_stride;
3803 
3804  weight_vec = __msa_fill_w(weight);
3805  rnd_vec = __msa_fill_w(rnd_val);
3806  offset_vec = __msa_fill_w(offset);
3807 
3808  filter_vec = LD_SH(filter);
3809  UNPCK_R_SB_SH(filter_vec, filter_vec);
3810  SPLATI_W2_SH(filter_vec, 0, filt0, filt1);
3811 
3812  LD_SB7(src, src_stride, src0, src1, src2, src3, src4, src5, src6);
3813  ILVR_B2_SB(src1, src0, src2, src1, src10_r, src21_r);
3814  ILVR_B4_SB(src3, src2, src4, src3, src5, src4, src6, src5,
3815  src32_r, src43_r, src54_r, src65_r);
3816  ILVR_D3_SB(src21_r, src10_r, src43_r, src32_r, src65_r, src54_r,
3817  src2110, src4332, src6554);
3818 
3819  ILVRL_B2_SH(zero, src2110, tmp0, tmp1);
3820  ILVRL_B2_SH(zero, src4332, tmp2, tmp3);
3821  ILVRL_B2_SH(zero, src6554, tmp4, tmp5);
3822 
3823  dst0 = HEVC_FILT_4TAP_SW(tmp0, tmp2, filt0, filt1);
3824  dst1 = HEVC_FILT_4TAP_SW(tmp1, tmp3, filt0, filt1);
3825  dst2 = HEVC_FILT_4TAP_SW(tmp2, tmp4, filt0, filt1);
3826  dst3 = HEVC_FILT_4TAP_SW(tmp3, tmp5, filt0, filt1);
3827 
3828  MUL4(dst0, weight_vec, dst1, weight_vec, dst2, weight_vec, dst3,
3829  weight_vec, dst0, dst1, dst2, dst3);
3830  SRAR_W4_SW(dst0, dst1, dst2, dst3, rnd_vec);
3831  ADD4(dst0, offset_vec, dst1, offset_vec, dst2, offset_vec, dst3,
3832  offset_vec, dst0, dst1, dst2, dst3);
3833  CLIP_SW4_0_255(dst0, dst1, dst2, dst3);
3834 
3835  PCKEV_H2_SH(dst1, dst0, dst3, dst2, tmp0, tmp1);
3836  out = (v16u8) __msa_pckev_b((v16i8) tmp1, (v16i8) tmp0);
3837  ST_W4(out, 0, 1, 2, 3, dst, dst_stride);
3838 }
3839 
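/* 4xN vertical (N a multiple of 8): carries the last interleaved pair
 * across iterations (src2110 = src10998), so each pass loads only eight
 * new rows. */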
3840 static void hevc_vt_uniwgt_4t_4x8multiple_msa(const uint8_t *src,
3841  int32_t src_stride,
3842  uint8_t *dst,
3843  int32_t dst_stride,
3844  const int8_t *filter,
3845  int32_t height,
3846  int32_t weight,
3847  int32_t offset,
3848  int32_t rnd_val)
3849 {
3850  int32_t loop_cnt;
3851  v16u8 out0, out1;
3852  v16i8 src0, src1, src2, src3, src4, src5, src6, src7, src8, src9, src10;
3853  v16i8 src10_r, src32_r, src54_r, src76_r, src98_r;
3854  v16i8 src21_r, src43_r, src65_r, src87_r, src109_r;
3855  v16i8 src2110, src4332, src6554, src8776;
3856  v16i8 src10998;
3857  v4i32 dst0, dst1, dst2, dst3, dst4, dst5, dst6, dst7;
3858  v8i16 tmp0, tmp1, tmp2, tmp3, tmp4, tmp5, tmp6, tmp7;
3859  v8i16 filter_vec, filt0, filt1;
3860  v4i32 weight_vec, rnd_vec, offset_vec;
3861  v8i16 zero = { 0 };
3862 
3863  src -= src_stride;
3864 
3865  weight_vec = __msa_fill_w(weight);
3866  rnd_vec = __msa_fill_w(rnd_val);
3867  offset_vec = __msa_fill_w(offset);
3868 
3869  filter_vec = LD_SH(filter);
3870  UNPCK_R_SB_SH(filter_vec, filter_vec);
3871  SPLATI_W2_SH(filter_vec, 0, filt0, filt1);
3872 
3873  LD_SB3(src, src_stride, src0, src1, src2);
3874  src += (3 * src_stride);
3875  ILVR_B2_SB(src1, src0, src2, src1, src10_r, src21_r);
3876  src2110 = (v16i8) __msa_ilvr_d((v2i64) src21_r, (v2i64) src10_r);
3877 
3878  for (loop_cnt = (height >> 3); loop_cnt--;) {
3879  LD_SB8(src, src_stride,
3880  src3, src4, src5, src6, src7, src8, src9, src10);
3881  src += (8 * src_stride);
3882  ILVR_B4_SB(src3, src2, src4, src3, src5, src4, src6, src5,
3883  src32_r, src43_r, src54_r, src65_r);
3884  ILVR_B2_SB(src7, src6, src8, src7, src76_r, src87_r);
3885  ILVR_B2_SB(src9, src8, src10, src9, src98_r, src109_r);
3886  ILVR_D4_SB(src43_r, src32_r, src65_r, src54_r, src87_r, src76_r,
3887  src109_r, src98_r, src4332, src6554, src8776, src10998);
3888 
3889  ILVRL_B2_SH(zero, src2110, tmp0, tmp1);
3890  ILVRL_B2_SH(zero, src4332, tmp2, tmp3);
3891  ILVRL_B2_SH(zero, src6554, tmp4, tmp5);
3892  ILVRL_B2_SH(zero, src8776, tmp6, tmp7);
3893  dst0 = HEVC_FILT_4TAP_SW(tmp0, tmp2, filt0, filt1);
3894  dst1 = HEVC_FILT_4TAP_SW(tmp1, tmp3, filt0, filt1);
3895  dst2 = HEVC_FILT_4TAP_SW(tmp2, tmp4, filt0, filt1);
3896  dst3 = HEVC_FILT_4TAP_SW(tmp3, tmp5, filt0, filt1);
3897  ILVRL_B2_SH(zero, src10998, tmp0, tmp1);
3898  dst4 = HEVC_FILT_4TAP_SW(tmp4, tmp6, filt0, filt1);
3899  dst5 = HEVC_FILT_4TAP_SW(tmp5, tmp7, filt0, filt1);
3900  dst6 = HEVC_FILT_4TAP_SW(tmp6, tmp0, filt0, filt1);
3901  dst7 = HEVC_FILT_4TAP_SW(tmp7, tmp1, filt0, filt1);
3902 
3903  MUL4(dst0, weight_vec, dst1, weight_vec, dst2, weight_vec, dst3,
3904  weight_vec, dst0, dst1, dst2, dst3);
3905  MUL4(dst4, weight_vec, dst5, weight_vec, dst6, weight_vec,
3906  dst7, weight_vec, dst4, dst5, dst6, dst7);
3907  SRAR_W4_SW(dst0, dst1, dst2, dst3, rnd_vec);
3908  SRAR_W4_SW(dst4, dst5, dst6, dst7, rnd_vec);
3909  ADD4(dst0, offset_vec, dst1, offset_vec, dst2, offset_vec, dst3,
3910  offset_vec, dst0, dst1, dst2, dst3);
3911  ADD4(dst4, offset_vec, dst5, offset_vec, dst6, offset_vec,
3912  dst7, offset_vec, dst4, dst5, dst6, dst7);
3913  CLIP_SW8_0_255(dst0, dst1, dst2, dst3, dst4, dst5, dst6, dst7);
3914  PCKEV_H2_SH(dst1, dst0, dst3, dst2, tmp0, tmp1);
3915  PCKEV_H2_SH(dst5, dst4, dst7, dst6, tmp2, tmp3);
3916 
3917  PCKEV_B2_UB(tmp1, tmp0, tmp3, tmp2, out0, out1);
3918  ST_W8(out0, out1, 0, 1, 2, 3, 0, 1, 2, 3, dst, dst_stride);
3919  dst += (8 * dst_stride);
3920 
3921  src2 = src10;
3922  src2110 = src10998;
3923  }
3924 }
3925 
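/* Width-4 vertical dispatcher on height (2, 4, or a multiple of 8). */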
3926 static void hevc_vt_uniwgt_4t_4w_msa(const uint8_t *src,
3927  int32_t src_stride,
3928  uint8_t *dst,
3929  int32_t dst_stride,
3930  const int8_t *filter,
3931  int32_t height,
3932  int32_t weight,
3933  int32_t offset,
3934  int32_t rnd_val)
3935 {
3936  if (2 == height) {
3937  hevc_vt_uniwgt_4t_4x2_msa(src, src_stride, dst, dst_stride,
3938  filter, weight, offset, rnd_val);
3939  } else if (4 == height) {
3940  hevc_vt_uniwgt_4t_4x4_msa(src, src_stride, dst, dst_stride,
3941  filter, weight, offset, rnd_val);
3942  } else if (0 == (height % 8)) {
3943  hevc_vt_uniwgt_4t_4x8multiple_msa(src, src_stride, dst, dst_stride,
3944                                           filter, height, weight, offset,
3945                                           rnd_val);
3946  }
3947 }
3948 
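/* Width 6 vertical, fixed 8 rows (the height argument is not used): each
 * output row is stored as a 4-byte word plus a 2-byte halfword
 * (ST_W2/ST_H2). */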
3949 static void hevc_vt_uniwgt_4t_6w_msa(const uint8_t *src,
3950  int32_t src_stride,
3951  uint8_t *dst,
3952  int32_t dst_stride,
3953  const int8_t *filter,
3954  int32_t height,
3955  int32_t weight,
3956  int32_t offset,
3957  int32_t rnd_val)
3958 {
3959  v16u8 out0, out1, out2, out3;
3960  v16i8 src0, src1, src2, src3, src4, src5, src6, src7, src8, src9, src10;
3961  v16i8 src10_r, src32_r, src21_r, src43_r;
3962  v16i8 src54_r, src65_r, src76_r, src87_r, src98_r, src109_r;
3963  v8i16 filter_vec, filt0, filt1;
3964  v8i16 tmp0, tmp1, tmp2, tmp3, tmp4, tmp5, tmp6, tmp7;
3965  v4i32 dst0, dst1, dst2, dst3, dst4, dst5, dst6, dst7;
3966  v4i32 dst8, dst9, dst10, dst11, dst12, dst13, dst14, dst15;
3967  v4i32 weight_vec, rnd_vec, offset_vec;
3968  v8i16 zero = { 0 };
3969 
3970  src -= src_stride;
3971 
3972  weight_vec = __msa_fill_w(weight);
3973  rnd_vec = __msa_fill_w(rnd_val);
3974  offset_vec = __msa_fill_w(offset);
3975 
3976  filter_vec = LD_SH(filter);
3977  UNPCK_R_SB_SH(filter_vec, filter_vec);
3978  SPLATI_W2_SH(filter_vec, 0, filt0, filt1);
3979 
3980  LD_SB3(src, src_stride, src0, src1, src2);
3981  src += (3 * src_stride);
3982  LD_SB8(src, src_stride, src3, src4, src5, src6, src7, src8, src9, src10);
3983  ILVR_B2_SB(src1, src0, src2, src1, src10_r, src21_r);
3984  ILVR_B2_SB(src3, src2, src4, src3, src32_r, src43_r);
3985  ILVR_B2_SB(src5, src4, src6, src5, src54_r, src65_r);
3986  ILVR_B2_SB(src7, src6, src8, src7, src76_r, src87_r);
3987  ILVR_B2_SB(src9, src8, src10, src9, src98_r, src109_r);
3988 
3989  ILVRL_B2_SH(zero, src10_r, tmp0, tmp1);
3990  ILVRL_B2_SH(zero, src21_r, tmp2, tmp3);
3991  ILVRL_B2_SH(zero, src32_r, tmp4, tmp5);
3992  ILVRL_B2_SH(zero, src43_r, tmp6, tmp7);
3993  dst0 = HEVC_FILT_4TAP_SW(tmp0, tmp4, filt0, filt1);
3994  dst1 = HEVC_FILT_4TAP_SW(tmp1, tmp5, filt0, filt1);
3995  dst2 = HEVC_FILT_4TAP_SW(tmp2, tmp6, filt0, filt1);
3996  dst3 = HEVC_FILT_4TAP_SW(tmp3, tmp7, filt0, filt1);
3997  ILVRL_B2_SH(zero, src54_r, tmp0, tmp1);
3998  ILVRL_B2_SH(zero, src65_r, tmp2, tmp3);
3999  dst4 = HEVC_FILT_4TAP_SW(tmp4, tmp0, filt0, filt1);
4000  dst5 = HEVC_FILT_4TAP_SW(tmp5, tmp1, filt0, filt1);
4001  dst6 = HEVC_FILT_4TAP_SW(tmp6, tmp2, filt0, filt1);
4002  dst7 = HEVC_FILT_4TAP_SW(tmp7, tmp3, filt0, filt1);
4003  ILVRL_B2_SH(zero, src76_r, tmp4, tmp5);
4004  ILVRL_B2_SH(zero, src87_r, tmp6, tmp7);
4005  dst8 = HEVC_FILT_4TAP_SW(tmp0, tmp4, filt0, filt1);
4006  dst9 = HEVC_FILT_4TAP_SW(tmp1, tmp5, filt0, filt1);
4007  dst10 = HEVC_FILT_4TAP_SW(tmp2, tmp6, filt0, filt1);
4008  dst11 = HEVC_FILT_4TAP_SW(tmp3, tmp7, filt0, filt1);
4009  ILVRL_B2_SH(zero, src98_r, tmp0, tmp1);
4010  ILVRL_B2_SH(zero, src109_r, tmp2, tmp3);
4011  dst12 = HEVC_FILT_4TAP_SW(tmp4, tmp0, filt0, filt1);
4012  dst13 = HEVC_FILT_4TAP_SW(tmp5, tmp1, filt0, filt1);
4013  dst14 = HEVC_FILT_4TAP_SW(tmp6, tmp2, filt0, filt1);
4014  dst15 = HEVC_FILT_4TAP_SW(tmp7, tmp3, filt0, filt1);
4015 
4016  MUL4(dst0, weight_vec, dst1, weight_vec, dst2, weight_vec, dst3,
4017  weight_vec, dst0, dst1, dst2, dst3);
4018  MUL4(dst4, weight_vec, dst5, weight_vec, dst6, weight_vec,
4019  dst7, weight_vec, dst4, dst5, dst6, dst7);
4020  SRAR_W4_SW(dst0, dst1, dst2, dst3, rnd_vec);
4021  SRAR_W4_SW(dst4, dst5, dst6, dst7, rnd_vec);
4022  ADD4(dst0, offset_vec, dst1, offset_vec, dst2, offset_vec, dst3,
4023  offset_vec, dst0, dst1, dst2, dst3);
4024  ADD4(dst4, offset_vec, dst5, offset_vec, dst6, offset_vec,
4025  dst7, offset_vec, dst4, dst5, dst6, dst7);
4026  CLIP_SW8_0_255(dst0, dst1, dst2, dst3, dst4, dst5, dst6, dst7);
4027 
4028  MUL4(dst8, weight_vec, dst9, weight_vec, dst10, weight_vec, dst11,
4029  weight_vec, dst8, dst9, dst10, dst11);
4030  MUL4(dst12, weight_vec, dst13, weight_vec, dst14, weight_vec,
4031  dst15, weight_vec, dst12, dst13, dst14, dst15);
4032  SRAR_W4_SW(dst8, dst9, dst10, dst11, rnd_vec);
4033  SRAR_W4_SW(dst12, dst13, dst14, dst15, rnd_vec);
4034  ADD4(dst8, offset_vec, dst9, offset_vec, dst10, offset_vec, dst11,
4035  offset_vec, dst8, dst9, dst10, dst11);
4036  ADD4(dst12, offset_vec, dst13, offset_vec, dst14, offset_vec,
4037  dst15, offset_vec, dst12, dst13, dst14, dst15);
4038  CLIP_SW8_0_255(dst8, dst9, dst10, dst11, dst12, dst13, dst14, dst15);
4039 
4040  PCKEV_H2_SH(dst1, dst0, dst3, dst2, tmp0, tmp1);
4041  PCKEV_H2_SH(dst5, dst4, dst7, dst6, tmp2, tmp3);
4042  PCKEV_H2_SH(dst9, dst8, dst11, dst10, tmp4, tmp5);
4043  PCKEV_H2_SH(dst13, dst12, dst15, dst14, tmp6, tmp7);
4044 
4045  PCKEV_B2_UB(tmp1, tmp0, tmp3, tmp2, out0, out1);
4046  PCKEV_B2_UB(tmp5, tmp4, tmp7, tmp6, out2, out3);
4047  ST_W2(out0, 0, 2, dst, dst_stride);
4048  ST_H2(out0, 2, 6, dst + 4, dst_stride);
4049  ST_W2(out1, 0, 2, dst + 2 * dst_stride, dst_stride);
4050  ST_H2(out1, 2, 6, dst + 2 * dst_stride + 4, dst_stride);
4051  dst += (4 * dst_stride);
4052  ST_W2(out2, 0, 2, dst, dst_stride);
4053  ST_H2(out2, 2, 6, dst + 4, dst_stride);
4054  ST_W2(out3, 0, 2, dst + 2 * dst_stride, dst_stride);
4055  ST_H2(out3, 2, 6, dst + 2 * dst_stride + 4, dst_stride);
4056 }
4057 
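/* Vertical 8x2: five input rows, two weighted output rows. */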
4058 static void hevc_vt_uniwgt_4t_8x2_msa(const uint8_t *src,
4059  int32_t src_stride,
4060  uint8_t *dst,
4061  int32_t dst_stride,
4062  const int8_t *filter,
4063  int32_t weight,
4064  int32_t offset,
4065  int32_t rnd_val)
4066 {
4067  v16u8 out;
4068  v16i8 src0, src1, src2, src3, src4;
4069  v16i8 src10_r, src32_r, src21_r, src43_r;
4070  v4i32 dst0, dst1, dst2, dst3;
4071  v8i16 tmp0, tmp1, tmp2, tmp3, tmp4, tmp5, tmp6, tmp7;
4072  v8i16 filter_vec, filt0, filt1;
4073  v4i32 weight_vec, rnd_vec, offset_vec;
4074  v8i16 zero = { 0 };
4075 
4076  src -= src_stride;
4077 
4078  weight_vec = __msa_fill_w(weight);
4079  rnd_vec = __msa_fill_w(rnd_val);
4080  offset_vec = __msa_fill_w(offset);
4081 
4082  filter_vec = LD_SH(filter);
4083  UNPCK_R_SB_SH(filter_vec, filter_vec);
4084  SPLATI_W2_SH(filter_vec, 0, filt0, filt1);
4085 
4086  LD_SB5(src, src_stride, src0, src1, src2, src3, src4);
4087  ILVR_B2_SB(src1, src0, src2, src1, src10_r, src21_r);
4088  ILVR_B2_SB(src3, src2, src4, src3, src32_r, src43_r);
4089 
4090  ILVRL_B2_SH(zero, src10_r, tmp0, tmp1);
4091  ILVRL_B2_SH(zero, src21_r, tmp2, tmp3);
4092  ILVRL_B2_SH(zero, src32_r, tmp4, tmp5);
4093  ILVRL_B2_SH(zero, src43_r, tmp6, tmp7);
4094 
4095  dst0 = HEVC_FILT_4TAP_SW(tmp0, tmp4, filt0, filt1);
4096  dst1 = HEVC_FILT_4TAP_SW(tmp1, tmp5, filt0, filt1);
4097  dst2 = HEVC_FILT_4TAP_SW(tmp2, tmp6, filt0, filt1);
4098  dst3 = HEVC_FILT_4TAP_SW(tmp3, tmp7, filt0, filt1);
4099  MUL4(dst0, weight_vec, dst1, weight_vec, dst2, weight_vec, dst3,
4100  weight_vec, dst0, dst1, dst2, dst3);
4101  SRAR_W4_SW(dst0, dst1, dst2, dst3, rnd_vec);
4102  ADD4(dst0, offset_vec, dst1, offset_vec, dst2, offset_vec, dst3,
4103  offset_vec, dst0, dst1, dst2, dst3);
4104  CLIP_SW4_0_255(dst0, dst1, dst2, dst3);
4105  PCKEV_H2_SH(dst1, dst0, dst3, dst2, tmp0, tmp1);
4106 
4107  out = (v16u8) __msa_pckev_b((v16i8) tmp1, (v16i8) tmp0);
4108  ST_D2(out, 0, 1, dst, dst_stride);
4109 }
4110 
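/* Vertical 8x4: seven input rows, four output rows stored with ST_D4. */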
4111 static void hevc_vt_uniwgt_4t_8x4_msa(const uint8_t *src,
4112  int32_t src_stride,
4113  uint8_t *dst,
4114  int32_t dst_stride,
4115  const int8_t *filter,
4116  int32_t weight,
4117  int32_t offset,
4118  int32_t rnd_val)
4119 {
4120  v16u8 out0, out1;
4121  v16i8 src0, src1, src2, src3, src4;
4122  v16i8 src10_r, src32_r, src21_r, src43_r;
4123  v16i8 src5, src6, src54_r, src65_r;
4124  v8i16 filter_vec, filt0, filt1;
4125  v4i32 dst0, dst1, dst2, dst3, dst4, dst5, dst6, dst7;
4126  v8i16 tmp0, tmp1, tmp2, tmp3, tmp4, tmp5, tmp6, tmp7;
4127  v4i32 weight_vec, rnd_vec, offset_vec;
4128  v8i16 zero = { 0 };
4129 
4130  src -= src_stride;
4131 
4132  weight_vec = __msa_fill_w(weight);
4133  rnd_vec = __msa_fill_w(rnd_val);
4134  offset_vec = __msa_fill_w(offset);
4135 
4136  filter_vec = LD_SH(filter);
4137  UNPCK_R_SB_SH(filter_vec, filter_vec);
4138  SPLATI_W2_SH(filter_vec, 0, filt0, filt1);
4139 
4140  LD_SB7(src, src_stride, src0, src1, src2, src3, src4, src5, src6);
4141  ILVR_B2_SB(src1, src0, src2, src1, src10_r, src21_r);
4142  ILVR_B2_SB(src3, src2, src4, src3, src32_r, src43_r);
4143  ILVR_B2_SB(src5, src4, src6, src5, src54_r, src65_r);
4144 
4145  ILVRL_B2_SH(zero, src10_r, tmp0, tmp1);
4146  ILVRL_B2_SH(zero, src21_r, tmp2, tmp3);
4147  ILVRL_B2_SH(zero, src32_r, tmp4, tmp5);
4148  ILVRL_B2_SH(zero, src43_r, tmp6, tmp7);
4149  dst0 = HEVC_FILT_4TAP_SW(tmp0, tmp4, filt0, filt1);
4150  dst1 = HEVC_FILT_4TAP_SW(tmp1, tmp5, filt0, filt1);
4151  dst2 = HEVC_FILT_4TAP_SW(tmp2, tmp6, filt0, filt1);
4152  dst3 = HEVC_FILT_4TAP_SW(tmp3, tmp7, filt0, filt1);
4153  ILVRL_B2_SH(zero, src54_r, tmp0, tmp1);
4154  ILVRL_B2_SH(zero, src65_r, tmp2, tmp3);
4155  dst4 = HEVC_FILT_4TAP_SW(tmp4, tmp0, filt0, filt1);
4156  dst5 = HEVC_FILT_4TAP_SW(tmp5, tmp1, filt0, filt1);
4157  dst6 = HEVC_FILT_4TAP_SW(tmp6, tmp2, filt0, filt1);
4158  dst7 = HEVC_FILT_4TAP_SW(tmp7, tmp3, filt0, filt1);
4159 
4160  MUL4(dst0, weight_vec, dst1, weight_vec, dst2, weight_vec, dst3,
4161  weight_vec, dst0, dst1, dst2, dst3);
4162  MUL4(dst4, weight_vec, dst5, weight_vec, dst6, weight_vec,
4163  dst7, weight_vec, dst4, dst5, dst6, dst7);
4164  SRAR_W4_SW(dst0, dst1, dst2, dst3, rnd_vec);
4165  SRAR_W4_SW(dst4, dst5, dst6, dst7, rnd_vec);
4166  ADD4(dst0, offset_vec, dst1, offset_vec, dst2, offset_vec, dst3,
4167  offset_vec, dst0, dst1, dst2, dst3);
4168  ADD4(dst4, offset_vec, dst5, offset_vec, dst6, offset_vec,
4169  dst7, offset_vec, dst4, dst5, dst6, dst7);
4170  CLIP_SW8_0_255(dst0, dst1, dst2, dst3, dst4, dst5, dst6, dst7);
4171  PCKEV_H2_SH(dst1, dst0, dst3, dst2, tmp0, tmp1);
4172  PCKEV_H2_SH(dst5, dst4, dst7, dst6, tmp2, tmp3);
4173  PCKEV_B2_UB(tmp1, tmp0, tmp3, tmp2, out0, out1);
4174  ST_D4(out0, out1, 0, 1, 0, 1, dst, dst_stride);
4175 }
4176 
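/* Vertical 8x6: nine input rows; as in the horizontal 8x6 kernel, the
 * final two rows take a separate pass ending in CLIP_SW4_0_255. */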
4177 static void hevc_vt_uniwgt_4t_8x6_msa(const uint8_t *src,
4178  int32_t src_stride,
4179  uint8_t *dst,
4180  int32_t dst_stride,
4181  const int8_t *filter,
4182  int32_t weight,
4183  int32_t offset,
4184  int32_t rnd_val)
4185 {
4186  v16u8 out0, out1, out2;
4187  v16i8 src0, src1, src2, src3, src4, src5, src6, src7, src8;
4188  v16i8 src10_r, src32_r, src54_r, src76_r;
4189  v16i8 src21_r, src43_r, src65_r, src87_r;
4190  v4i32 dst0, dst1, dst2, dst3, dst4, dst5, dst6, dst7;
4191  v4i32 dst8, dst9, dst10, dst11;
4192  v8i16 tmp0, tmp1, tmp2, tmp3, tmp4, tmp5, tmp6, tmp7;
4193  v8i16 filter_vec, filt0, filt1;
4194  v4i32 weight_vec, rnd_vec, offset_vec;
4195  v8i16 zero = { 0 };
4196 
4197  src -= src_stride;
4198  weight_vec = __msa_fill_w(weight);
4199  rnd_vec = __msa_fill_w(rnd_val);
4200  offset_vec = __msa_fill_w(offset);
4201 
4202  filter_vec = LD_SH(filter);
4203  UNPCK_R_SB_SH(filter_vec, filter_vec);
4204  SPLATI_W2_SH(filter_vec, 0, filt0, filt1);
4205 
4206  LD_SB3(src, src_stride, src0, src1, src2);
4207  src += (3 * src_stride);
4208  LD_SB6(src, src_stride, src3, src4, src5, src6, src7, src8);
4209 
4210  ILVR_B4_SB(src1, src0, src2, src1, src3, src2, src4, src3, src10_r, src21_r,
4211  src32_r, src43_r);
4212  ILVR_B4_SB(src5, src4, src6, src5, src7, src6, src8, src7, src54_r, src65_r,
4213  src76_r, src87_r);
4214 
4215  ILVRL_B2_SH(zero, src10_r, tmp0, tmp1);
4216  ILVRL_B2_SH(zero, src21_r, tmp2, tmp3);
4217  ILVRL_B2_SH(zero, src32_r, tmp4, tmp5);
4218  ILVRL_B2_SH(zero, src43_r, tmp6, tmp7);
4219  dst0 = HEVC_FILT_4TAP_SW(tmp0, tmp4, filt0, filt1);
4220  dst1 = HEVC_FILT_4TAP_SW(tmp1, tmp5, filt0, filt1);
4221  dst2 = HEVC_FILT_4TAP_SW(tmp2, tmp6, filt0, filt1);
4222  dst3 = HEVC_FILT_4TAP_SW(tmp3, tmp7, filt0, filt1);
4223  ILVRL_B2_SH(zero, src54_r, tmp0, tmp1);
4224  ILVRL_B2_SH(zero, src65_r, tmp2, tmp3);
4225  dst4 = HEVC_FILT_4TAP_SW(tmp4, tmp0, filt0, filt1);
4226  dst5 = HEVC_FILT_4TAP_SW(tmp5, tmp1, filt0, filt1);
4227  dst6 = HEVC_FILT_4TAP_SW(tmp6, tmp2, filt0, filt1);
4228  dst7 = HEVC_FILT_4TAP_SW(tmp7, tmp3, filt0, filt1);
4229  ILVRL_B2_SH(zero, src76_r, tmp4, tmp5);
4230  ILVRL_B2_SH(zero, src87_r, tmp6, tmp7);
4231  dst8 = HEVC_FILT_4TAP_SW(tmp0, tmp4, filt0, filt1);
4232  dst9 = HEVC_FILT_4TAP_SW(tmp1, tmp5, filt0, filt1);
4233  dst10 = HEVC_FILT_4TAP_SW(tmp2, tmp6, filt0, filt1);
4234  dst11 = HEVC_FILT_4TAP_SW(tmp3, tmp7, filt0, filt1);
4235 
4236  MUL4(dst0, weight_vec, dst1, weight_vec, dst2, weight_vec, dst3,
4237  weight_vec, dst0, dst1, dst2, dst3);
4238  MUL4(dst4, weight_vec, dst5, weight_vec, dst6, weight_vec,
4239  dst7, weight_vec, dst4, dst5, dst6, dst7);
4240  SRAR_W4_SW(dst0, dst1, dst2, dst3, rnd_vec);
4241  SRAR_W4_SW(dst4, dst5, dst6, dst7, rnd_vec);
4242  ADD4(dst0, offset_vec, dst1, offset_vec, dst2, offset_vec, dst3,
4243  offset_vec, dst0, dst1, dst2, dst3);
4244  ADD4(dst4, offset_vec, dst5, offset_vec, dst6, offset_vec,
4245  dst7, offset_vec, dst4, dst5, dst6, dst7);
4246  CLIP_SW8_0_255(dst0, dst1, dst2, dst3, dst4, dst5, dst6, dst7);
4247  MUL4(dst8, weight_vec, dst9, weight_vec, dst10, weight_vec, dst11,
4248  weight_vec, dst8, dst9, dst10, dst11);
4249  SRAR_W4_SW(dst8, dst9, dst10, dst11, rnd_vec);
4250  ADD4(dst8, offset_vec, dst9, offset_vec, dst10, offset_vec, dst11,
4251  offset_vec, dst8, dst9, dst10, dst11);
4252  CLIP_SW4_0_255(dst8, dst9, dst10, dst11);
4253 
4254  PCKEV_H2_SH(dst1, dst0, dst3, dst2, tmp0, tmp1);
4255  PCKEV_H2_SH(dst5, dst4, dst7, dst6, tmp2, tmp3);
4256  PCKEV_H2_SH(dst9, dst8, dst11, dst10, tmp4, tmp5);
4257  PCKEV_B3_UB(tmp1, tmp0, tmp3, tmp2, tmp5, tmp4, out0, out1, out2);
4258  ST_D4(out0, out1, 0, 1, 0, 1, dst, dst_stride);
4259  ST_D2(out2, 0, 1, dst + 4 * dst_stride, dst_stride);
4260 }
4261 
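/* 8xN vertical (N a multiple of 8): carries src10_r/src21_r between
 * iterations so each pass loads just eight new rows. */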
4262 static void hevc_vt_uniwgt_4t_8x8mult_msa(const uint8_t *src,
4263  int32_t src_stride,
4264  uint8_t *dst,
4265  int32_t dst_stride,
4266  const int8_t *filter,
4267  int32_t height,
4268  int32_t weight,
4269  int32_t offset,
4270  int32_t rnd_val)
4271 {
4272  int32_t loop_cnt;
4273  v16u8 out0, out1, out2, out3;
4274  v16i8 src0, src1, src2, src3, src4, src5, src6, src7, src8, src9, src10;
4275  v16i8 src10_r, src32_r, src21_r, src43_r;
4276  v16i8 src54_r, src65_r, src76_r, src87_r, src98_r, src109_r;
4277  v8i16 filter_vec, filt0, filt1;
4278  v8i16 tmp0, tmp1, tmp2, tmp3, tmp4, tmp5, tmp6, tmp7;
4279  v4i32 dst0, dst1, dst2, dst3, dst4, dst5, dst6, dst7;
4280  v4i32 dst8, dst9, dst10, dst11, dst12, dst13, dst14, dst15;
4281  v4i32 weight_vec, rnd_vec, offset_vec;
4282  v8i16 zero = { 0 };
4283 
4284  src -= src_stride;
4285 
4286  weight_vec = __msa_fill_w(weight);
4287  rnd_vec = __msa_fill_w(rnd_val);
4288  offset_vec = __msa_fill_w(offset);
4289 
4290  filter_vec = LD_SH(filter);
4291  UNPCK_R_SB_SH(filter_vec, filter_vec);
4292  SPLATI_W2_SH(filter_vec, 0, filt0, filt1);
4293 
4294  LD_SB3(src, src_stride, src0, src1, src2);
4295  src += (3 * src_stride);
4296  ILVR_B2_SB(src1, src0, src2, src1, src10_r, src21_r);
4297 
4298  for (loop_cnt = (height >> 3); loop_cnt--;) {
4299  LD_SB8(src, src_stride,
4300  src3, src4, src5, src6, src7, src8, src9, src10);
4301  src += (8 * src_stride);
4302  ILVR_B2_SB(src3, src2, src4, src3, src32_r, src43_r);
4303  ILVR_B2_SB(src5, src4, src6, src5, src54_r, src65_r);
4304  ILVR_B2_SB(src7, src6, src8, src7, src76_r, src87_r);
4305  ILVR_B2_SB(src9, src8, src10, src9, src98_r, src109_r);
4306 
4307  ILVRL_B2_SH(zero, src10_r, tmp0, tmp1);
4308  ILVRL_B2_SH(zero, src21_r, tmp2, tmp3);
4309  ILVRL_B2_SH(zero, src32_r, tmp4, tmp5);
4310  ILVRL_B2_SH(zero, src43_r, tmp6, tmp7);
4311  dst0 = HEVC_FILT_4TAP_SW(tmp0, tmp4, filt0, filt1);
4312  dst1 = HEVC_FILT_4TAP_SW(tmp1, tmp5, filt0, filt1);
4313  dst2 = HEVC_FILT_4TAP_SW(tmp2, tmp6, filt0, filt1);
4314  dst3 = HEVC_FILT_4TAP_SW(tmp3, tmp7, filt0, filt1);
4315  ILVRL_B2_SH(zero, src54_r, tmp0, tmp1);
4316  ILVRL_B2_SH(zero, src65_r, tmp2, tmp3);
4317  dst4 = HEVC_FILT_4TAP_SW(tmp4, tmp0, filt0, filt1);
4318  dst5 = HEVC_FILT_4TAP_SW(tmp5, tmp1, filt0, filt1);
4319  dst6 = HEVC_FILT_4TAP_SW(tmp6, tmp2, filt0, filt1);
4320  dst7 = HEVC_FILT_4TAP_SW(tmp7, tmp3, filt0, filt1);
4321  ILVRL_B2_SH(zero, src76_r, tmp4, tmp5);
4322  ILVRL_B2_SH(zero, src87_r, tmp6, tmp7);
4323  dst8 = HEVC_FILT_4TAP_SW(tmp0, tmp4, filt0, filt1);
4324  dst9 = HEVC_FILT_4TAP_SW(tmp1, tmp5, filt0, filt1);
4325  dst10 = HEVC_FILT_4TAP_SW(tmp2, tmp6, filt0, filt1);
4326  dst11 = HEVC_FILT_4TAP_SW(tmp3, tmp7, filt0, filt1);
4327  ILVRL_B2_SH(zero, src98_r, tmp0, tmp1);
4328  ILVRL_B2_SH(zero, src109_r, tmp2, tmp3);
4329  dst12 = HEVC_FILT_4TAP_SW(tmp4, tmp0, filt0, filt1);
4330  dst13 = HEVC_FILT_4TAP_SW(tmp5, tmp1, filt0, filt1);
4331  dst14 = HEVC_FILT_4TAP_SW(tmp6, tmp2, filt0, filt1);
4332  dst15 = HEVC_FILT_4TAP_SW(tmp7, tmp3, filt0, filt1);
4333 
4334  MUL4(dst0, weight_vec, dst1, weight_vec, dst2, weight_vec, dst3,
4335  weight_vec, dst0, dst1, dst2, dst3);
4336  MUL4(dst4, weight_vec, dst5, weight_vec, dst6, weight_vec,
4337  dst7, weight_vec, dst4, dst5, dst6, dst7);
4338  SRAR_W4_SW(dst0, dst1, dst2, dst3, rnd_vec);
4339  SRAR_W4_SW(dst4, dst5, dst6, dst7, rnd_vec);
4340  ADD4(dst0, offset_vec, dst1, offset_vec, dst2, offset_vec, dst3,
4341  offset_vec, dst0, dst1, dst2, dst3);
4342  ADD4(dst4, offset_vec, dst5, offset_vec, dst6, offset_vec,
4343  dst7, offset_vec, dst4, dst5, dst6, dst7);
4344  CLIP_SW8_0_255(dst0, dst1, dst2, dst3, dst4, dst5, dst6, dst7);
4345 
4346  MUL4(dst8, weight_vec, dst9, weight_vec, dst10, weight_vec, dst11,
4347  weight_vec, dst8, dst9, dst10, dst11);
4348  MUL4(dst12, weight_vec, dst13, weight_vec, dst14, weight_vec,
4349  dst15, weight_vec, dst12, dst13, dst14, dst15);
4350  SRAR_W4_SW(dst8, dst9, dst10, dst11, rnd_vec);
4351  SRAR_W4_SW(dst12, dst13, dst14, dst15, rnd_vec);
4352  ADD4(dst8, offset_vec, dst9, offset_vec, dst10, offset_vec, dst11,
4353  offset_vec, dst8, dst9, dst10, dst11);
4354  ADD4(dst12, offset_vec, dst13, offset_vec, dst14, offset_vec,
4355  dst15, offset_vec, dst12, dst13, dst14, dst15);
4356  CLIP_SW8_0_255(dst8, dst9, dst10, dst11, dst12, dst13, dst14, dst15);
4357 
4358  PCKEV_H2_SH(dst1, dst0, dst3, dst2, tmp0, tmp1);
4359  PCKEV_H2_SH(dst5, dst4, dst7, dst6, tmp2, tmp3);
4360  PCKEV_H2_SH(dst9, dst8, dst11, dst10, tmp4, tmp5);
4361  PCKEV_H2_SH(dst13, dst12, dst15, dst14, tmp6, tmp7);
4362 
4363  PCKEV_B2_UB(tmp1, tmp0, tmp3, tmp2, out0, out1);
4364  PCKEV_B2_UB(tmp5, tmp4, tmp7, tmp6, out2, out3);
4365  ST_D8(out0, out1, out2, out3, 0, 1, 0, 1, 0, 1, 0, 1, dst, dst_stride);
4366  dst += (8 * dst_stride);
4367 
4368  src2 = src10;
4369  src10_r = src98_r;
4370  src21_r = src109_r;
4371  }
4372 }
4373 
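/* Vertical 4-tap uniform-weighted MC (the 4-tap path is HEVC's chroma
 * interpolation), 8 columns wide: heights 2, 4 and 6 are handed to
 * dedicated routines; any other height goes through the
 * eight-rows-per-iteration loop above, so it is presumably always a
 * multiple of 8 here. */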
4374 static void hevc_vt_uniwgt_4t_8w_msa(const uint8_t *src,
4375  int32_t src_stride,
4376  uint8_t *dst,
4377  int32_t dst_stride,
4378  const int8_t *filter,
4379  int32_t height,
4380  int32_t weight,
4381  int32_t offset,
4382  int32_t rnd_val)
4383 {
4384  if (2 == height) {
4385  hevc_vt_uniwgt_4t_8x2_msa(src, src_stride, dst, dst_stride,
4386  filter, weight, offset, rnd_val);
4387  } else if (4 == height) {
4388  hevc_vt_uniwgt_4t_8x4_msa(src, src_stride, dst, dst_stride,
4389  filter, weight, offset, rnd_val);
4390  } else if (6 == height) {
4391  hevc_vt_uniwgt_4t_8x6_msa(src, src_stride, dst, dst_stride,
4392  filter, weight, offset, rnd_val);
4393  } else {
4394  hevc_vt_uniwgt_4t_8x8mult_msa(src, src_stride, dst, dst_stride,
4395  filter, height, weight, offset,
4396  rnd_val);
4397  }
4398 }
4399 
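/* Vertical 4-tap uniform-weighted MC, 12 columns wide, fixed at 16 rows
 * (loop_cnt == 2, eight output rows per pass).  The left 8 columns use
 * the right byte-interleaves; the extra 4 columns are filtered from the
 * packed left halves (src2110, src4332, ...) and stored as 8-byte plus
 * 4-byte chunks per row. */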
4400 static void hevc_vt_uniwgt_4t_12w_msa(const uint8_t *src,
4401  int32_t src_stride,
4402  uint8_t *dst,
4403  int32_t dst_stride,
4404  const int8_t *filter,
4405  int32_t height,
4406  int32_t weight,
4407  int32_t offset,
4408  int32_t rnd_val)
4409 {
4410  int32_t loop_cnt;
4411  v16u8 out0, out1, out2, out3, out4, out5;
4412  v16i8 src0, src1, src2, src3, src4, src5, src6, src7, src8, src9, src10;
4413  v16i8 src10_r, src32_r, src21_r, src43_r;
4414  v16i8 src10_l, src32_l, src54_l, src21_l, src43_l, src65_l;
4415  v16i8 src2110, src4332;
4416  v16i8 src54_r, src76_r, src98_r, src65_r, src87_r, src109_r;
4417  v16i8 src76_l, src98_l, src87_l, src109_l, src6554, src8776, src10998;
4418  v8i16 filter_vec, filt0, filt1;
4419  v4i32 dst0, dst1, dst2, dst3, dst4, dst5, dst6, dst7, dst8;
4420  v4i32 dst9, dst10, dst11;
4421  v8i16 tmp0, tmp1, tmp2, tmp3, tmp4, tmp5, tmp6, tmp7, tmp8, tmp9;
4422  v4i32 weight_vec, rnd_vec, offset_vec;
4423  v8i16 zero = { 0 };
4424 
4425  src -= src_stride;
4426 
4427  weight_vec = __msa_fill_w(weight);
4428  rnd_vec = __msa_fill_w(rnd_val);
4429  offset_vec = __msa_fill_w(offset);
4430 
4431  filter_vec = LD_SH(filter);
4432  UNPCK_R_SB_SH(filter_vec, filter_vec);
4433  SPLATI_W2_SH(filter_vec, 0, filt0, filt1);
4434 
4435  LD_SB3(src, src_stride, src0, src1, src2);
4436  src += (3 * src_stride);
4437  ILVR_B2_SB(src1, src0, src2, src1, src10_r, src21_r);
4438  ILVL_B2_SB(src1, src0, src2, src1, src10_l, src21_l);
4439  src2110 = (v16i8) __msa_ilvr_d((v2i64) src21_l, (v2i64) src10_l);
4440 
4441  for (loop_cnt = 2; loop_cnt--;) {
4442  LD_SB8(src, src_stride, src3, src4, src5, src6, src7, src8, src9, src10);
4443  src += (8 * src_stride);
4444  ILVRL_B2_SB(src3, src2, src32_r, src32_l);
4445  ILVRL_B2_SB(src4, src3, src43_r, src43_l);
4446  ILVRL_B2_SB(src5, src4, src54_r, src54_l);
4447  ILVRL_B2_SB(src6, src5, src65_r, src65_l);
4448  src4332 = (v16i8) __msa_ilvr_d((v2i64) src43_l, (v2i64) src32_l);
4449  src6554 = (v16i8) __msa_ilvr_d((v2i64) src65_l, (v2i64) src54_l);
4450 
4451  ILVRL_B2_SH(zero, src10_r, tmp0, tmp1);
4452  ILVRL_B2_SH(zero, src21_r, tmp2, tmp3);
4453  ILVRL_B2_SH(zero, src32_r, tmp4, tmp5);
4454  ILVRL_B2_SH(zero, src43_r, tmp6, tmp7);
4455  dst0 = HEVC_FILT_4TAP_SW(tmp0, tmp4, filt0, filt1);
4456  dst1 = HEVC_FILT_4TAP_SW(tmp1, tmp5, filt0, filt1);
4457  dst2 = HEVC_FILT_4TAP_SW(tmp2, tmp6, filt0, filt1);
4458  dst3 = HEVC_FILT_4TAP_SW(tmp3, tmp7, filt0, filt1);
4459  ILVRL_B2_SH(zero, src54_r, tmp0, tmp1);
4460  ILVRL_B2_SH(zero, src65_r, tmp2, tmp3);
4461  dst4 = HEVC_FILT_4TAP_SW(tmp4, tmp0, filt0, filt1);
4462  dst5 = HEVC_FILT_4TAP_SW(tmp5, tmp1, filt0, filt1);
4463  dst6 = HEVC_FILT_4TAP_SW(tmp6, tmp2, filt0, filt1);
4464  dst7 = HEVC_FILT_4TAP_SW(tmp7, tmp3, filt0, filt1);
4465  ILVRL_B2_SH(zero, src2110, tmp4, tmp5);
4466  ILVRL_B2_SH(zero, src4332, tmp6, tmp7);
4467  ILVRL_B2_SH(zero, src6554, tmp8, tmp9);
4468  dst8 = HEVC_FILT_4TAP_SW(tmp4, tmp6, filt0, filt1);
4469  dst9 = HEVC_FILT_4TAP_SW(tmp5, tmp7, filt0, filt1);
4470  dst10 = HEVC_FILT_4TAP_SW(tmp6, tmp8, filt0, filt1);
4471  dst11 = HEVC_FILT_4TAP_SW(tmp7, tmp9, filt0, filt1);
4472 
4473  MUL4(dst0, weight_vec, dst1, weight_vec, dst2, weight_vec, dst3,
4474  weight_vec, dst0, dst1, dst2, dst3);
4475  MUL4(dst4, weight_vec, dst5, weight_vec, dst6, weight_vec,
4476  dst7, weight_vec, dst4, dst5, dst6, dst7);
4477  SRAR_W4_SW(dst0, dst1, dst2, dst3, rnd_vec);
4478  SRAR_W4_SW(dst4, dst5, dst6, dst7, rnd_vec);
4479  ADD4(dst0, offset_vec, dst1, offset_vec, dst2, offset_vec, dst3,
4480  offset_vec, dst0, dst1, dst2, dst3);
4481  ADD4(dst4, offset_vec, dst5, offset_vec, dst6, offset_vec,
4482  dst7, offset_vec, dst4, dst5, dst6, dst7);
4483  CLIP_SW8_0_255(dst0, dst1, dst2, dst3, dst4, dst5, dst6, dst7);
4484 
4485  MUL4(dst8, weight_vec, dst9, weight_vec, dst10, weight_vec, dst11,
4486  weight_vec, dst8, dst9, dst10, dst11);
4487  SRAR_W4_SW(dst8, dst9, dst10, dst11, rnd_vec);
4488  ADD4(dst8, offset_vec, dst9, offset_vec, dst10, offset_vec, dst11,
4489  offset_vec, dst8, dst9, dst10, dst11);
4490  CLIP_SW4_0_255(dst8, dst9, dst10, dst11);
4491 
4492  PCKEV_H2_SH(dst1, dst0, dst3, dst2, dst0, dst1);
4493  PCKEV_H2_SH(dst5, dst4, dst7, dst6, dst2, dst3);
4494  PCKEV_H2_SH(dst9, dst8, dst11, dst10, dst4, dst5);
4495 
4496  PCKEV_B3_UB(dst1, dst0, dst3, dst2, dst5, dst4, out0, out1, out2);
4497  ST_D4(out0, out1, 0, 1, 0, 1, dst, dst_stride);
4498  ST_W4(out2, 0, 1, 2, 3, dst + 8, dst_stride);
4499  dst += (4 * dst_stride);
4500 
4501  ILVRL_B2_SB(src7, src6, src76_r, src76_l);
4502  ILVRL_B2_SB(src8, src7, src87_r, src87_l);
4503  ILVRL_B2_SB(src9, src8, src98_r, src98_l);
4504  ILVRL_B2_SB(src10, src9, src109_r, src109_l);
4505  src8776 = (v16i8) __msa_ilvr_d((v2i64) src87_l, (v2i64) src76_l);
4506  src10998 = (v16i8) __msa_ilvr_d((v2i64) src109_l, (v2i64) src98_l);
4507 
4508  ILVRL_B2_SH(zero, src76_r, tmp4, tmp5);
4509  ILVRL_B2_SH(zero, src87_r, tmp6, tmp7);
4510  dst0 = HEVC_FILT_4TAP_SW(tmp0, tmp4, filt0, filt1);
4511  dst1 = HEVC_FILT_4TAP_SW(tmp1, tmp5, filt0, filt1);
4512  dst2 = HEVC_FILT_4TAP_SW(tmp2, tmp6, filt0, filt1);
4513  dst3 = HEVC_FILT_4TAP_SW(tmp3, tmp7, filt0, filt1);
4514  ILVRL_B2_SH(zero, src98_r, tmp0, tmp1);
4515  ILVRL_B2_SH(zero, src109_r, tmp2, tmp3);
4516  dst4 = HEVC_FILT_4TAP_SW(tmp4, tmp0, filt0, filt1);
4517  dst5 = HEVC_FILT_4TAP_SW(tmp5, tmp1, filt0, filt1);
4518  dst6 = HEVC_FILT_4TAP_SW(tmp6, tmp2, filt0, filt1);
4519  dst7 = HEVC_FILT_4TAP_SW(tmp7, tmp3, filt0, filt1);
4520  ILVRL_B2_SH(zero, src8776, tmp4, tmp5);
4521  ILVRL_B2_SH(zero, src10998, tmp6, tmp7);
4522  ILVRL_B2_SH(zero, src6554, tmp8, tmp9);
4523  dst8 = HEVC_FILT_4TAP_SW(tmp8, tmp4, filt0, filt1);
4524  dst9 = HEVC_FILT_4TAP_SW(tmp9, tmp5, filt0, filt1);
4525  dst10 = HEVC_FILT_4TAP_SW(tmp4, tmp6, filt0, filt1);
4526  dst11 = HEVC_FILT_4TAP_SW(tmp5, tmp7, filt0, filt1);
4527 
4528  MUL4(dst0, weight_vec, dst1, weight_vec, dst2, weight_vec, dst3,
4529  weight_vec, dst0, dst1, dst2, dst3);
4530  MUL4(dst4, weight_vec, dst5, weight_vec, dst6, weight_vec,
4531  dst7, weight_vec, dst4, dst5, dst6, dst7);
4532  SRAR_W4_SW(dst0, dst1, dst2, dst3, rnd_vec);
4533  SRAR_W4_SW(dst4, dst5, dst6, dst7, rnd_vec);
4534  ADD4(dst0, offset_vec, dst1, offset_vec, dst2, offset_vec, dst3,
4535  offset_vec, dst0, dst1, dst2, dst3);
4536  ADD4(dst4, offset_vec, dst5, offset_vec, dst6, offset_vec,
4537  dst7, offset_vec, dst4, dst5, dst6, dst7);
4538  CLIP_SW8_0_255(dst0, dst1, dst2, dst3, dst4, dst5, dst6, dst7);
4539 
4540  MUL4(dst8, weight_vec, dst9, weight_vec, dst10, weight_vec, dst11,
4541  weight_vec, dst8, dst9, dst10, dst11);
4542  SRAR_W4_SW(dst8, dst9, dst10, dst11, rnd_vec);
4543  ADD4(dst8, offset_vec, dst9, offset_vec, dst10, offset_vec, dst11,
4544  offset_vec, dst8, dst9, dst10, dst11);
4545  CLIP_SW4_0_255(dst8, dst9, dst10, dst11);
4546 
4547  PCKEV_H2_SH(dst1, dst0, dst3, dst2, dst0, dst1);
4548  PCKEV_H2_SH(dst5, dst4, dst7, dst6, dst2, dst3);
4549  PCKEV_H2_SH(dst9, dst8, dst11, dst10, dst4, dst5);
4550 
4551  PCKEV_B3_UB(dst1, dst0, dst3, dst2, dst5, dst4, out3, out4, out5);
4552  ST_D4(out3, out4, 0, 1, 0, 1, dst, dst_stride);
4553  ST_W4(out5, 0, 1, 2, 3, dst + 8, dst_stride);
4554  dst += (4 * dst_stride);
4555 
4556  src2 = src10;
4557  src10_r = src98_r;
4558  src21_r = src109_r;
4559  src2110 = src10998;
4560  }
4561 }
4562 
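/* Vertical 4-tap uniform-weighted MC, 16 columns wide, four rows per
 * iteration.  Each row pair is byte-interleaved (right and left halves),
 * zero-extended to 16 bits, filtered into 32-bit accumulators, then
 * weighted, rounded, offset, clipped to [0, 255] and packed back to
 * bytes. */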
4563 static void hevc_vt_uniwgt_4t_16w_msa(const uint8_t *src,
4564  int32_t src_stride,
4565  uint8_t *dst,
4566  int32_t dst_stride,
4567  const int8_t *filter,
4568  int32_t height,
4569  int32_t weight,
4570  int32_t offset,
4571  int32_t rnd_val)
4572 {
4573  int32_t loop_cnt;
4574  v16u8 out0, out1, out2, out3;
4575  v16i8 src0, src1, src2, src3, src4, src5;
4576  v16i8 src10_r, src32_r, src21_r, src43_r;
4577  v16i8 src10_l, src32_l, src21_l, src43_l;
4578  v16i8 src54_r, src54_l, src65_r, src65_l, src6;
4579  v8i16 filter_vec, filt0, filt1;
4580  v8i16 tmp0, tmp1, tmp2, tmp3, tmp4, tmp5, tmp6, tmp7;
4581  v4i32 dst0, dst1, dst2, dst3, dst4, dst5, dst6, dst7;
4582  v4i32 dst8, dst9, dst10, dst11, dst12, dst13, dst14, dst15;
4583  v4i32 weight_vec, rnd_vec, offset_vec;
4584  v8i16 zero = { 0 };
4585 
4586  src -= src_stride;
4587 
4588  weight_vec = __msa_fill_w(weight);
4589  rnd_vec = __msa_fill_w(rnd_val);
4590  offset_vec = __msa_fill_w(offset);
4591 
4592  filter_vec = LD_SH(filter);
4593  UNPCK_R_SB_SH(filter_vec, filter_vec);
4594  SPLATI_W2_SH(filter_vec, 0, filt0, filt1);
4595 
4596  LD_SB3(src, src_stride, src0, src1, src2);
4597  src += (3 * src_stride);
4598  ILVR_B2_SB(src1, src0, src2, src1, src10_r, src21_r);
4599  ILVL_B2_SB(src1, src0, src2, src1, src10_l, src21_l);
4600 
4601  for (loop_cnt = (height >> 2); loop_cnt--;) {
4602  LD_SB4(src, src_stride, src3, src4, src5, src6);
4603  src += (4 * src_stride);
4604  ILVRL_B2_SB(src3, src2, src32_r, src32_l);
4605  ILVRL_B2_SB(src4, src3, src43_r, src43_l);
4606  ILVRL_B2_SB(src5, src4, src54_r, src54_l);
4607  ILVRL_B2_SB(src6, src5, src65_r, src65_l);
4608 
4609  ILVRL_B2_SH(zero, src10_r, tmp0, tmp1);
4610  ILVRL_B2_SH(zero, src21_r, tmp2, tmp3);
4611  ILVRL_B2_SH(zero, src32_r, tmp4, tmp5);
4612  ILVRL_B2_SH(zero, src43_r, tmp6, tmp7);
4613  dst0 = HEVC_FILT_4TAP_SW(tmp0, tmp4, filt0, filt1);
4614  dst1 = HEVC_FILT_4TAP_SW(tmp1, tmp5, filt0, filt1);
4615  dst2 = HEVC_FILT_4TAP_SW(tmp2, tmp6, filt0, filt1);
4616  dst3 = HEVC_FILT_4TAP_SW(tmp3, tmp7, filt0, filt1);
4617  ILVRL_B2_SH(zero, src54_r, tmp0, tmp1);
4618  ILVRL_B2_SH(zero, src65_r, tmp2, tmp3);
4619  dst4 = HEVC_FILT_4TAP_SW(tmp4, tmp0, filt0, filt1);
4620  dst5 = HEVC_FILT_4TAP_SW(tmp5, tmp1, filt0, filt1);
4621  dst6 = HEVC_FILT_4TAP_SW(tmp6, tmp2, filt0, filt1);
4622  dst7 = HEVC_FILT_4TAP_SW(tmp7, tmp3, filt0, filt1);
4623  ILVRL_B2_SH(zero, src10_l, tmp0, tmp1);
4624  ILVRL_B2_SH(zero, src21_l, tmp2, tmp3);
4625  ILVRL_B2_SH(zero, src32_l, tmp4, tmp5);
4626  ILVRL_B2_SH(zero, src43_l, tmp6, tmp7);
4627  dst8 = HEVC_FILT_4TAP_SW(tmp0, tmp4, filt0, filt1);
4628  dst9 = HEVC_FILT_4TAP_SW(tmp1, tmp5, filt0, filt1);
4629  dst10 = HEVC_FILT_4TAP_SW(tmp2, tmp6, filt0, filt1);
4630  dst11 = HEVC_FILT_4TAP_SW(tmp3, tmp7, filt0, filt1);
4631  ILVRL_B2_SH(zero, src54_l, tmp0, tmp1);
4632  ILVRL_B2_SH(zero, src65_l, tmp2, tmp3);
4633  dst12 = HEVC_FILT_4TAP_SW(tmp4, tmp0, filt0, filt1);
4634  dst13 = HEVC_FILT_4TAP_SW(tmp5, tmp1, filt0, filt1);
4635  dst14 = HEVC_FILT_4TAP_SW(tmp6, tmp2, filt0, filt1);
4636  dst15 = HEVC_FILT_4TAP_SW(tmp7, tmp3, filt0, filt1);
4637 
4638  MUL4(dst0, weight_vec, dst1, weight_vec, dst2, weight_vec, dst3,
4639  weight_vec, dst0, dst1, dst2, dst3);
4640  MUL4(dst4, weight_vec, dst5, weight_vec, dst6, weight_vec,
4641  dst7, weight_vec, dst4, dst5, dst6, dst7);
4642  SRAR_W4_SW(dst0, dst1, dst2, dst3, rnd_vec);
4643  SRAR_W4_SW(dst4, dst5, dst6, dst7, rnd_vec);
4644  ADD4(dst0, offset_vec, dst1, offset_vec, dst2, offset_vec, dst3,
4645  offset_vec, dst0, dst1, dst2, dst3);
4646  ADD4(dst4, offset_vec, dst5, offset_vec, dst6, offset_vec,
4647  dst7, offset_vec, dst4, dst5, dst6, dst7);
4648  CLIP_SW8_0_255(dst0, dst1, dst2, dst3, dst4, dst5, dst6, dst7);
4649 
4650  MUL4(dst8, weight_vec, dst9, weight_vec, dst10, weight_vec, dst11,
4651  weight_vec, dst8, dst9, dst10, dst11);
4652  MUL4(dst12, weight_vec, dst13, weight_vec, dst14, weight_vec,
4653  dst15, weight_vec, dst12, dst13, dst14, dst15);
4654  SRAR_W4_SW(dst8, dst9, dst10, dst11, rnd_vec);
4655  SRAR_W4_SW(dst12, dst13, dst14, dst15, rnd_vec);
4656  ADD4(dst8, offset_vec, dst9, offset_vec, dst10, offset_vec, dst11,
4657  offset_vec, dst8, dst9, dst10, dst11);
4658  ADD4(dst12, offset_vec, dst13, offset_vec, dst14, offset_vec,
4659  dst15, offset_vec, dst12, dst13, dst14, dst15);
4660  CLIP_SW8_0_255(dst8, dst9, dst10, dst11, dst12, dst13, dst14, dst15);
4661 
4662  PCKEV_H2_SH(dst1, dst0, dst3, dst2, tmp0, tmp1);
4663  PCKEV_H2_SH(dst5, dst4, dst7, dst6, tmp2, tmp3);
4664  PCKEV_H2_SH(dst9, dst8, dst11, dst10, tmp4, tmp5);
4665  PCKEV_H2_SH(dst13, dst12, dst15, dst14, tmp6, tmp7);
4666  PCKEV_B4_UB(tmp4, tmp0, tmp5, tmp1, tmp6, tmp2, tmp7, tmp3, out0, out1,
4667  out2, out3);
4668  ST_UB4(out0, out1, out2, out3, dst, dst_stride);
4669  dst += (4 * dst_stride);
4670 
4671  src2 = src6;
4672  src10_r = src54_r;
4673  src21_r = src65_r;
4674  src10_l = src54_l;
4675  src21_l = src65_l;
4676  }
4677 }
4678 
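/* Vertical 4-tap uniform-weighted MC, 24 columns wide: the first 16
 * columns follow the 16-wide scheme while a second set of right
 * interleaves (src87_r, src98_r, ...) covers columns 16..23.  The loop
 * is fixed at eight passes of four rows, i.e. a 24x32 block. */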
4679 static void hevc_vt_uniwgt_4t_24w_msa(const uint8_t *src,
4680  int32_t src_stride,
4681  uint8_t *dst,
4682  int32_t dst_stride,
4683  const int8_t *filter,
4684  int32_t height,
4685  int32_t weight,
4686  int32_t offset,
4687  int32_t rnd_val)
4688 {
4689  uint32_t loop_cnt;
4690  v16u8 out0, out1, out2, out3, out4, out5;
4691  v16i8 src0, src1, src2, src3, src4, src5;
4692  v16i8 src6, src7, src8, src9, src10, src11, src12, src13;
4693  v16i8 src10_r, src32_r, src54_r, src21_r, src43_r, src65_r;
4694  v16i8 src10_l, src32_l, src54_l, src21_l, src43_l, src65_l;
4695  v16i8 src87_r, src98_r, src109_r, src1110_r, src1211_r, src1312_r;
4696  v8i16 filter_vec, filt0, filt1;
4697  v8i16 tmp0, tmp1, tmp2, tmp3, tmp4, tmp5, tmp6, tmp7;
4698  v4i32 dst0, dst1, dst2, dst3, dst4, dst5, dst6, dst7;
4699  v4i32 dst8, dst9, dst10, dst11, dst12, dst13, dst14, dst15;
4700  v4i32 dst16, dst17, dst18, dst19, dst20, dst21, dst22, dst23;
4701  v4i32 weight_vec, rnd_vec, offset_vec;
4702  v8i16 zero = { 0 };
4703 
4704  src -= src_stride;
4705 
4706  weight_vec = __msa_fill_w(weight);
4707  rnd_vec = __msa_fill_w(rnd_val);
4708  offset_vec = __msa_fill_w(offset);
4709 
4710  filter_vec = LD_SH(filter);
4711  UNPCK_R_SB_SH(filter_vec, filter_vec);
4712  SPLATI_W2_SH(filter_vec, 0, filt0, filt1);
4713 
4714  LD_SB3(src, src_stride, src0, src1, src2);
4715  LD_SB3(src + 16, src_stride, src7, src8, src9);
4716  src += (3 * src_stride);
4717  ILVR_B2_SB(src1, src0, src2, src1, src10_r, src21_r);
4718  ILVL_B2_SB(src1, src0, src2, src1, src10_l, src21_l);
4719  ILVR_B2_SB(src8, src7, src9, src8, src87_r, src98_r);
4720 
4721  for (loop_cnt = 8; loop_cnt--;) {
4722  LD_SB4(src, src_stride, src3, src4, src5, src6);
4723  LD_SB4(src + 16, src_stride, src10, src11, src12, src13);
4724  src += (4 * src_stride);
4725  ILVR_B2_SB(src3, src2, src4, src3, src32_r, src43_r);
4726  ILVL_B2_SB(src3, src2, src4, src3, src32_l, src43_l);
4727  ILVRL_B2_SB(src5, src4, src54_r, src54_l);
4728  ILVRL_B2_SB(src6, src5, src65_r, src65_l);
4729  ILVR_B2_SB(src10, src9, src11, src10, src109_r, src1110_r);
4730  ILVR_B2_SB(src12, src11, src13, src12, src1211_r, src1312_r);
4731 
4732  ILVRL_B2_SH(zero, src10_r, tmp0, tmp1);
4733  ILVRL_B2_SH(zero, src21_r, tmp2, tmp3);
4734  ILVRL_B2_SH(zero, src32_r, tmp4, tmp5);
4735  ILVRL_B2_SH(zero, src43_r, tmp6, tmp7);
4736  dst0 = HEVC_FILT_4TAP_SW(tmp0, tmp4, filt0, filt1);
4737  dst1 = HEVC_FILT_4TAP_SW(tmp1, tmp5, filt0, filt1);
4738  dst2 = HEVC_FILT_4TAP_SW(tmp2, tmp6, filt0, filt1);
4739  dst3 = HEVC_FILT_4TAP_SW(tmp3, tmp7, filt0, filt1);
4740  ILVRL_B2_SH(zero, src54_r, tmp0, tmp1);
4741  ILVRL_B2_SH(zero, src65_r, tmp2, tmp3);
4742  dst4 = HEVC_FILT_4TAP_SW(tmp4, tmp0, filt0, filt1);
4743  dst5 = HEVC_FILT_4TAP_SW(tmp5, tmp1, filt0, filt1);
4744  dst6 = HEVC_FILT_4TAP_SW(tmp6, tmp2, filt0, filt1);
4745  dst7 = HEVC_FILT_4TAP_SW(tmp7, tmp3, filt0, filt1);
4746  ILVRL_B2_SH(zero, src10_l, tmp0, tmp1);
4747  ILVRL_B2_SH(zero, src21_l, tmp2, tmp3);
4748  ILVRL_B2_SH(zero, src32_l, tmp4, tmp5);
4749  ILVRL_B2_SH(zero, src43_l, tmp6, tmp7);
4750  dst8 = HEVC_FILT_4TAP_SW(tmp0, tmp4, filt0, filt1);
4751  dst9 = HEVC_FILT_4TAP_SW(tmp1, tmp5, filt0, filt1);
4752  dst10 = HEVC_FILT_4TAP_SW(tmp2, tmp6, filt0, filt1);
4753  dst11 = HEVC_FILT_4TAP_SW(tmp3, tmp7, filt0, filt1);
4754  ILVRL_B2_SH(zero, src54_l, tmp0, tmp1);
4755  ILVRL_B2_SH(zero, src65_l, tmp2, tmp3);
4756  dst12 = HEVC_FILT_4TAP_SW(tmp4, tmp0, filt0, filt1);
4757  dst13 = HEVC_FILT_4TAP_SW(tmp5, tmp1, filt0, filt1);
4758  dst14 = HEVC_FILT_4TAP_SW(tmp6, tmp2, filt0, filt1);
4759  dst15 = HEVC_FILT_4TAP_SW(tmp7, tmp3, filt0, filt1);
4760  ILVRL_B2_SH(zero, src87_r, tmp0, tmp1);
4761  ILVRL_B2_SH(zero, src98_r, tmp2, tmp3);
4762  ILVRL_B2_SH(zero, src109_r, tmp4, tmp5);
4763  ILVRL_B2_SH(zero, src1110_r, tmp6, tmp7);
4764  dst16 = HEVC_FILT_4TAP_SW(tmp0, tmp4, filt0, filt1);
4765  dst17 = HEVC_FILT_4TAP_SW(tmp1, tmp5, filt0, filt1);
4766  dst18 = HEVC_FILT_4TAP_SW(tmp2, tmp6, filt0, filt1);
4767  dst19 = HEVC_FILT_4TAP_SW(tmp3, tmp7, filt0, filt1);
4768  ILVRL_B2_SH(zero, src1211_r, tmp0, tmp1);
4769  ILVRL_B2_SH(zero, src1312_r, tmp2, tmp3);
4770  dst20 = HEVC_FILT_4TAP_SW(tmp4, tmp0, filt0, filt1);
4771  dst21 = HEVC_FILT_4TAP_SW(tmp5, tmp1, filt0, filt1);
4772  dst22 = HEVC_FILT_4TAP_SW(tmp6, tmp2, filt0, filt1);
4773  dst23 = HEVC_FILT_4TAP_SW(tmp7, tmp3, filt0, filt1);
4774 
4775  MUL4(dst0, weight_vec, dst1, weight_vec, dst2, weight_vec, dst3,
4776  weight_vec, dst0, dst1, dst2, dst3);
4777  MUL4(dst4, weight_vec, dst5, weight_vec, dst6, weight_vec,
4778  dst7, weight_vec, dst4, dst5, dst6, dst7);
4779  SRAR_W4_SW(dst0, dst1, dst2, dst3, rnd_vec);
4780  SRAR_W4_SW(dst4, dst5, dst6, dst7, rnd_vec);
4781  ADD4(dst0, offset_vec, dst1, offset_vec, dst2, offset_vec, dst3,
4782  offset_vec, dst0, dst1, dst2, dst3);
4783  ADD4(dst4, offset_vec, dst5, offset_vec, dst6, offset_vec,
4784  dst7, offset_vec, dst4, dst5, dst6, dst7);
4785  CLIP_SW8_0_255(dst0, dst1, dst2, dst3, dst4, dst5, dst6, dst7);
4786 
4787  MUL4(dst8, weight_vec, dst9, weight_vec, dst10, weight_vec, dst11,
4788  weight_vec, dst8, dst9, dst10, dst11);
4789  MUL4(dst12, weight_vec, dst13, weight_vec, dst14, weight_vec,
4790  dst15, weight_vec, dst12, dst13, dst14, dst15);
4791  SRAR_W4_SW(dst8, dst9, dst10, dst11, rnd_vec);
4792  SRAR_W4_SW(dst12, dst13, dst14, dst15, rnd_vec);
4793  ADD4(dst8, offset_vec, dst9, offset_vec, dst10, offset_vec, dst11,
4794  offset_vec, dst8, dst9, dst10, dst11);
4795  ADD4(dst12, offset_vec, dst13, offset_vec, dst14, offset_vec,
4796  dst15, offset_vec, dst12, dst13, dst14, dst15);
4797  CLIP_SW8_0_255(dst8, dst9, dst10, dst11, dst12, dst13, dst14, dst15);
4798 
4799  MUL4(dst16, weight_vec, dst17, weight_vec, dst18, weight_vec, dst19,
4800  weight_vec, dst16, dst17, dst18, dst19);
4801  MUL4(dst20, weight_vec, dst21, weight_vec, dst22, weight_vec,
4802  dst23, weight_vec, dst20, dst21, dst22, dst23);
4803  SRAR_W4_SW(dst16, dst17, dst18, dst19, rnd_vec);
4804  SRAR_W4_SW(dst20, dst21, dst22, dst23, rnd_vec);
4805  ADD4(dst16, offset_vec, dst17, offset_vec, dst18, offset_vec, dst19,
4806  offset_vec, dst16, dst17, dst18, dst19);
4807  ADD4(dst20, offset_vec, dst21, offset_vec, dst22, offset_vec,
4808  dst23, offset_vec, dst20, dst21, dst22, dst23);
4809  CLIP_SW8_0_255(dst16, dst17, dst18, dst19, dst20, dst21, dst22, dst23);
4810 
4811  PCKEV_H2_SH(dst1, dst0, dst3, dst2, tmp0, tmp1);
4812  PCKEV_H2_SH(dst5, dst4, dst7, dst6, tmp2, tmp3);
4813  PCKEV_H2_SH(dst9, dst8, dst11, dst10, tmp4, tmp5);
4814  PCKEV_H2_SH(dst13, dst12, dst15, dst14, tmp6, tmp7);
4815  PCKEV_B4_UB(tmp4, tmp0, tmp5, tmp1, tmp6, tmp2, tmp7, tmp3, out0, out1,
4816  out2, out3);
4817 
4818  PCKEV_H2_SH(dst17, dst16, dst19, dst18, tmp0, tmp1);
4819  PCKEV_H2_SH(dst21, dst20, dst23, dst22, tmp2, tmp3);
4820 
4821  PCKEV_B2_UB(tmp1, tmp0, tmp3, tmp2, out4, out5);
4822  ST_UB4(out0, out1, out2, out3, dst, dst_stride);
4823  ST_D4(out4, out5, 0, 1, 0, 1, dst + 16, dst_stride);
4824  dst += (4 * dst_stride);
4825 
4826  src2 = src6;
4827  src9 = src13;
4828  src10_r = src54_r;
4829  src21_r = src65_r;
4830  src10_l = src54_l;
4831  src21_l = src65_l;
4832  src87_r = src1211_r;
4833  src98_r = src1312_r;
4834  }
4835 }
4836 
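/* Vertical 4-tap uniform-weighted MC, 32 columns wide, two rows per
 * iteration, handled as two independent 16-wide stripes at src and
 * src + 16. */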
4837 static void hevc_vt_uniwgt_4t_32w_msa(const uint8_t *src,
4838  int32_t src_stride,
4839  uint8_t *dst,
4840  int32_t dst_stride,
4841  const int8_t *filter,
4842  int32_t height,
4843  int32_t weight,
4844  int32_t offset,
4845  int32_t rnd_val)
4846 {
4847  uint32_t loop_cnt;
4848  v16u8 out0, out1, out2, out3;
4849  v16i8 src0, src1, src2, src3, src4, src5, src6, src7, src8, src9;
4850  v16i8 src10_r, src32_r, src76_r, src98_r;
4851  v16i8 src21_r, src43_r, src65_r, src87_r;
4852  v16i8 src10_l, src32_l, src76_l, src98_l;
4853  v16i8 src21_l, src43_l, src65_l, src87_l;
4854  v8i16 filter_vec, filt0, filt1;
4855  v4i32 weight_vec, rnd_vec, offset_vec;
4856  v4i32 dst0, dst1, dst2, dst3, dst4, dst5, dst6, dst7;
4857  v4i32 dst8, dst9, dst10, dst11, dst12, dst13, dst14, dst15;
4858  v8i16 tmp0, tmp1, tmp2, tmp3, tmp4, tmp5, tmp6, tmp7;
4859  v8i16 zero = { 0 };
4860 
4861  src -= src_stride;
4862 
4863  weight_vec = __msa_fill_w(weight);
4864  rnd_vec = __msa_fill_w(rnd_val);
4865  offset_vec = __msa_fill_w(offset);
4866 
4867  filter_vec = LD_SH(filter);
4868  UNPCK_R_SB_SH(filter_vec, filter_vec);
4869  SPLATI_W2_SH(filter_vec, 0, filt0, filt1);
4870 
4871  LD_SB3(src, src_stride, src0, src1, src2);
4872  LD_SB3(src + 16, src_stride, src5, src6, src7);
4873  src += (3 * src_stride);
4874  ILVR_B2_SB(src1, src0, src2, src1, src10_r, src21_r);
4875  ILVL_B2_SB(src1, src0, src2, src1, src10_l, src21_l);
4876  ILVR_B2_SB(src6, src5, src7, src6, src65_r, src76_r);
4877  ILVL_B2_SB(src6, src5, src7, src6, src65_l, src76_l);
4878 
4879  for (loop_cnt = (height >> 1); loop_cnt--;) {
4880  LD_SB2(src, src_stride, src3, src4);
4881  LD_SB2(src + 16, src_stride, src8, src9);
4882  src += (2 * src_stride);
4883  ILVR_B2_SB(src3, src2, src4, src3, src32_r, src43_r);
4884  ILVL_B2_SB(src3, src2, src4, src3, src32_l, src43_l);
4885  ILVRL_B2_SB(src8, src7, src87_r, src87_l);
4886  ILVRL_B2_SB(src9, src8, src98_r, src98_l);
4887 
4888  ILVRL_B2_SH(zero, src10_r, tmp0, tmp1);
4889  ILVRL_B2_SH(zero, src21_r, tmp2, tmp3);
4890  ILVRL_B2_SH(zero, src32_r, tmp4, tmp5);
4891  ILVRL_B2_SH(zero, src43_r, tmp6, tmp7);
4892  dst0 = HEVC_FILT_4TAP_SW(tmp0, tmp4, filt0, filt1);
4893  dst1 = HEVC_FILT_4TAP_SW(tmp1, tmp5, filt0, filt1);
4894  dst2 = HEVC_FILT_4TAP_SW(tmp2, tmp6, filt0, filt1);
4895  dst3 = HEVC_FILT_4TAP_SW(tmp3, tmp7, filt0, filt1);
4896 
4897  ILVRL_B2_SH(zero, src10_l, tmp0, tmp1);
4898  ILVRL_B2_SH(zero, src21_l, tmp2, tmp3);
4899  ILVRL_B2_SH(zero, src32_l, tmp4, tmp5);
4900  ILVRL_B2_SH(zero, src43_l, tmp6, tmp7);
4901  dst4 = HEVC_FILT_4TAP_SW(tmp0, tmp4, filt0, filt1);
4902  dst5 = HEVC_FILT_4TAP_SW(tmp1, tmp5, filt0, filt1);
4903  dst6 = HEVC_FILT_4TAP_SW(tmp2, tmp6, filt0, filt1);
4904  dst7 = HEVC_FILT_4TAP_SW(tmp3, tmp7, filt0, filt1);
4905 
4906  ILVRL_B2_SH(zero, src65_r, tmp0, tmp1);
4907  ILVRL_B2_SH(zero, src76_r, tmp2, tmp3);
4908  ILVRL_B2_SH(zero, src87_r, tmp4, tmp5);
4909  ILVRL_B2_SH(zero, src98_r, tmp6, tmp7);
4910  dst8 = HEVC_FILT_4TAP_SW(tmp0, tmp4, filt0, filt1);
4911  dst9 = HEVC_FILT_4TAP_SW(tmp1, tmp5, filt0, filt1);
4912  dst10 = HEVC_FILT_4TAP_SW(tmp2, tmp6, filt0, filt1);
4913  dst11 = HEVC_FILT_4TAP_SW(tmp3, tmp7, filt0, filt1);
4914  ILVRL_B2_SH(zero, src65_l, tmp0, tmp1);
4915  ILVRL_B2_SH(zero, src76_l, tmp2, tmp3);
4916  ILVRL_B2_SH(zero, src87_l, tmp4, tmp5);
4917  ILVRL_B2_SH(zero, src98_l, tmp6, tmp7);
4918  dst12 = HEVC_FILT_4TAP_SW(tmp0, tmp4, filt0, filt1);
4919  dst13 = HEVC_FILT_4TAP_SW(tmp1, tmp5, filt0, filt1);
4920  dst14 = HEVC_FILT_4TAP_SW(tmp2, tmp6, filt0, filt1);
4921  dst15 = HEVC_FILT_4TAP_SW(tmp3, tmp7, filt0, filt1);
4922 
4923  MUL4(dst0, weight_vec, dst1, weight_vec, dst2, weight_vec, dst3,
4924  weight_vec, dst0, dst1, dst2, dst3);
4925  MUL4(dst4, weight_vec, dst5, weight_vec, dst6, weight_vec,
4926  dst7, weight_vec, dst4, dst5, dst6, dst7);
4927  SRAR_W4_SW(dst0, dst1, dst2, dst3, rnd_vec);
4928  SRAR_W4_SW(dst4, dst5, dst6, dst7, rnd_vec);
4929  ADD4(dst0, offset_vec, dst1, offset_vec, dst2, offset_vec, dst3,
4930  offset_vec, dst0, dst1, dst2, dst3);
4931  ADD4(dst4, offset_vec, dst5, offset_vec, dst6, offset_vec,
4932  dst7, offset_vec, dst4, dst5, dst6, dst7);
4933  CLIP_SW8_0_255(dst0, dst1, dst2, dst3, dst4, dst5, dst6, dst7);
4934 
4935  MUL4(dst8, weight_vec, dst9, weight_vec, dst10, weight_vec, dst11,
4936  weight_vec, dst8, dst9, dst10, dst11);
4937  MUL4(dst12, weight_vec, dst13, weight_vec, dst14, weight_vec,
4938  dst15, weight_vec, dst12, dst13, dst14, dst15);
4939  SRAR_W4_SW(dst8, dst9, dst10, dst11, rnd_vec);
4940  SRAR_W4_SW(dst12, dst13, dst14, dst15, rnd_vec);
4941  ADD4(dst8, offset_vec, dst9, offset_vec, dst10, offset_vec, dst11,
4942  offset_vec, dst8, dst9, dst10, dst11);
4943  ADD4(dst12, offset_vec, dst13, offset_vec, dst14, offset_vec,
4944  dst15, offset_vec, dst12, dst13, dst14, dst15);
4945  CLIP_SW8_0_255(dst8, dst9, dst10, dst11, dst12, dst13, dst14, dst15);
4946 
4947  PCKEV_H2_SH(dst1, dst0, dst3, dst2, tmp0, tmp1);
4948  PCKEV_H2_SH(dst5, dst4, dst7, dst6, tmp2, tmp3);
4949  PCKEV_H2_SH(dst9, dst8, dst11, dst10, tmp4, tmp5);
4950  PCKEV_H2_SH(dst13, dst12, dst15, dst14, tmp6, tmp7);
4951  PCKEV_B4_UB(tmp2, tmp0, tmp3, tmp1, tmp6, tmp4, tmp7, tmp5, out0, out1,
4952  out2, out3);
4953  ST_UB2(out0, out2, dst, 16);
4954  dst += dst_stride;
4955  ST_UB2(out1, out3, dst, 16);
4956  dst += dst_stride;
4957 
4958  src2 = src4;
4959  src7 = src9;
4960  src10_r = src32_r;
4961  src21_r = src43_r;
4962  src10_l = src32_l;
4963  src21_l = src43_l;
4964  src65_r = src87_r;
4965  src76_r = src98_r;
4966  src65_l = src87_l;
4967  src76_l = src98_l;
4968  }
4969 }
4970 
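/* Separable 4-tap horizontal + vertical (HV) uniform-weighted MC for a
 * 4x2 block.  The 4-width shuffle mask packs two source rows per vector;
 * the vertical output is right-shifted by 6 and then run through the
 * usual weight/round/offset/clip chain. */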
4971 static void hevc_hv_uniwgt_4t_4x2_msa(const uint8_t *src,
4972  int32_t src_stride,
4973  uint8_t *dst,
4974  int32_t dst_stride,
4975  const int8_t *filter_x,
4976  const int8_t *filter_y,
4977  int32_t weight,
4978  int32_t offset,
4979  int32_t rnd_val)
4980 {
4981  v16u8 out;
4982  v16i8 src0, src1, src2, src3, src4;
4983  v8i16 filt0, filt1;
4984  v16i8 mask0 = LD_SB(ff_hevc_mask_arr + 16);
4985  v16i8 mask1;
4986  v8i16 filt_h0, filt_h1, filter_vec, tmp;
4987  v16i8 vec0, vec1, vec2, vec3, vec4, vec5;
4988  v8i16 dst10, dst21, dst32, dst43;
4989  v8i16 tmp0, tmp1, tmp2, tmp3, tmp4, tmp5, tmp6, tmp7;
4990  v4i32 dst0, dst1, dst2, dst3, dst4;
4991  v4i32 weight_vec, rnd_vec, offset_vec;
4992  v8i16 zero = { 0 };
4993 
4994  src -= (src_stride + 1);
4995 
4996  filter_vec = LD_SH(filter_x);
4997  UNPCK_R_SB_SH(filter_vec, filter_vec);
4998  SPLATI_W2_SH(filter_vec, 0, filt0, filt1);
4999 
5000  filter_vec = LD_SH(filter_y);
5001  UNPCK_R_SB_SH(filter_vec, filter_vec);
5002  SPLATI_W2_SH(filter_vec, 0, filt_h0, filt_h1);
5003 
5004  mask1 = mask0 + 2;
5005 
5006  weight_vec = __msa_fill_w(weight);
5007  rnd_vec = __msa_fill_w(rnd_val);
5008  offset_vec = __msa_fill_w(offset);
5009 
5010  LD_SB5(src, src_stride, src0, src1, src2, src3, src4);
5011  VSHF_B2_SB(src0, src2, src0, src2, mask0, mask1, vec0, vec1);
5012  VSHF_B2_SB(src1, src3, src1, src3, mask0, mask1, vec2, vec3);
5013  VSHF_B2_SB(src2, src4, src2, src4, mask0, mask1, vec4, vec5);
5014  ILVRL_B2_SH(zero, vec0, tmp0, tmp1);
5015  ILVRL_B2_SH(zero, vec1, tmp2, tmp3);
5016  ILVRL_B2_SH(zero, vec2, tmp4, tmp5);
5017  ILVRL_B2_SH(zero, vec3, tmp6, tmp7);
5018  dst0 = HEVC_FILT_4TAP_SW(tmp0, tmp2, filt0, filt1);
5019  dst2 = HEVC_FILT_4TAP_SW(tmp1, tmp3, filt0, filt1);
5020  dst1 = HEVC_FILT_4TAP_SW(tmp4, tmp6, filt0, filt1);
5021  dst3 = HEVC_FILT_4TAP_SW(tmp5, tmp7, filt0, filt1);
5022  ILVL_B2_SH(zero, vec4, zero, vec5, tmp1, tmp3);
5023  dst4 = HEVC_FILT_4TAP_SW(tmp1, tmp3, filt0, filt1);
5024 
5025  ILVEV_H2_SH(dst0, dst1, dst2, dst3, dst10, dst32);
5026  ILVEV_H2_SH(dst1, dst2, dst3, dst4, dst21, dst43);
5027  dst0 = HEVC_FILT_4TAP(dst10, dst32, filt_h0, filt_h1);
5028  dst1 = HEVC_FILT_4TAP(dst21, dst43, filt_h0, filt_h1);
5029  dst0 >>= 6;
5030  dst1 >>= 6;
5031  MUL2(dst0, weight_vec, dst1, weight_vec, dst0, dst1);
5032  SRAR_W2_SW(dst0, dst1, rnd_vec);
5033  ADD2(dst0, offset_vec, dst1, offset_vec, dst0, dst1);
5034  CLIP_SW2_0_255(dst0, dst1);
5035  tmp = __msa_pckev_h((v8i16) dst1, (v8i16) dst0);
5036  out = (v16u8) __msa_pckev_b((v16i8) tmp, (v16i8) tmp);
5037  ST_W2(out, 0, 1, dst, dst_stride);
5038 }
5039 
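/* HV 4-tap uniform-weighted MC for a 4x4 block: seven source rows yield
 * four vertical results, which are shifted down by 6 and weighted as
 * above. */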
5040 static void hevc_hv_uniwgt_4t_4x4_msa(const uint8_t *src,
5041  int32_t src_stride,
5042  uint8_t *dst,
5043  int32_t dst_stride,
5044  const int8_t *filter_x,
5045  const int8_t *filter_y,
5046  int32_t weight,
5047  int32_t offset,
5048  int32_t rnd_val)
5049 {
5050  v16u8 out;
5051  v16i8 src0, src1, src2, src3, src4, src5, src6;
5052  v8i16 filt0, filt1;
5053  v8i16 filt_h0, filt_h1, filter_vec;
5054  v16i8 mask0 = LD_SB(ff_hevc_mask_arr + 16);
5055  v16i8 mask1;
5056  v16i8 vec0, vec1, vec2, vec3, vec4, vec5, vec6, vec7;
5057  v8i16 dst10, dst32, dst54, dst21, dst43, dst65;
5058  v4i32 dst0, dst1, dst2, dst3, dst4, dst5, dst6;
5059  v8i16 tmp0, tmp1, tmp2, tmp3, tmp4, tmp5, tmp6, tmp7;
5060  v4i32 weight_vec, rnd_vec, offset_vec;
5061  v8i16 zero = { 0 };
5062 
5063  src -= (src_stride + 1);
5064 
5065  filter_vec = LD_SH(filter_x);
5066  UNPCK_R_SB_SH(filter_vec, filter_vec);
5067  SPLATI_W2_SH(filter_vec, 0, filt0, filt1);
5068 
5069  filter_vec = LD_SH(filter_y);
5070  UNPCK_R_SB_SH(filter_vec, filter_vec);
5071  SPLATI_W2_SH(filter_vec, 0, filt_h0, filt_h1);
5072 
5073  mask1 = mask0 + 2;
5074 
5075  weight_vec = __msa_fill_w(weight);
5076  rnd_vec = __msa_fill_w(rnd_val);
5077  offset_vec = __msa_fill_w(offset);
5078 
5079  LD_SB7(src, src_stride, src0, src1, src2, src3, src4, src5, src6);
5080  VSHF_B2_SB(src0, src3, src0, src3, mask0, mask1, vec0, vec1);
5081  VSHF_B2_SB(src1, src4, src1, src4, mask0, mask1, vec2, vec3);
5082  VSHF_B2_SB(src2, src5, src2, src5, mask0, mask1, vec4, vec5);
5083  VSHF_B2_SB(src3, src6, src3, src6, mask0, mask1, vec6, vec7);
5084 
5085  ILVRL_B2_SH(zero, vec0, tmp0, tmp1);
5086  ILVRL_B2_SH(zero, vec1, tmp2, tmp3);
5087  ILVRL_B2_SH(zero, vec2, tmp4, tmp5);
5088  ILVRL_B2_SH(zero, vec3, tmp6, tmp7);
5089  dst0 = HEVC_FILT_4TAP_SW(tmp0, tmp2, filt0, filt1);
5090  dst3 = HEVC_FILT_4TAP_SW(tmp1, tmp3, filt0, filt1);
5091  dst1 = HEVC_FILT_4TAP_SW(tmp4, tmp6, filt0, filt1);
5092  dst4 = HEVC_FILT_4TAP_SW(tmp5, tmp7, filt0, filt1);
5093  ILVRL_B2_SH(zero, vec4, tmp0, tmp1);
5094  ILVRL_B2_SH(zero, vec5, tmp2, tmp3);
5095  ILVL_B2_SH(zero, vec6, zero, vec7, tmp5, tmp7);
5096  dst2 = HEVC_FILT_4TAP_SW(tmp0, tmp2, filt0, filt1);
5097  dst5 = HEVC_FILT_4TAP_SW(tmp1, tmp3, filt0, filt1);
5098  dst6 = HEVC_FILT_4TAP_SW(tmp5, tmp7, filt0, filt1);
5099  ILVEV_H2_SH(dst0, dst1, dst3, dst4, dst10, dst43);
5100  ILVEV_H2_SH(dst1, dst2, dst4, dst5, dst21, dst54);
5101  ILVEV_H2_SH(dst2, dst3, dst5, dst6, dst32, dst65);
5102  dst0 = HEVC_FILT_4TAP(dst10, dst32, filt_h0, filt_h1);
5103  dst1 = HEVC_FILT_4TAP(dst21, dst43, filt_h0, filt_h1);
5104  dst2 = HEVC_FILT_4TAP(dst32, dst54, filt_h0, filt_h1);
5105  dst3 = HEVC_FILT_4TAP(dst43, dst65, filt_h0, filt_h1);
5106  SRA_4V(dst0, dst1, dst2, dst3, 6);
5107  MUL2(dst0, weight_vec, dst1, weight_vec, dst0, dst1);
5108  MUL2(dst2, weight_vec, dst3, weight_vec, dst2, dst3);
5109  SRAR_W4_SW(dst0, dst1, dst2, dst3, rnd_vec);
5110  ADD4(dst0, offset_vec, dst1, offset_vec, dst2, offset_vec, dst3,
5111  offset_vec, dst0, dst1, dst2, dst3);
5112  CLIP_SW4_0_255(dst0, dst1, dst2, dst3);
5113  PCKEV_H2_SH(dst1, dst0, dst3, dst2, tmp0, tmp1);
5114  out = (v16u8) __msa_pckev_b((v16i8) tmp1, (v16i8) tmp0);
5115  ST_W4(out, 0, 1, 2, 3, dst, dst_stride);
5116 }
5117 
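/* HV 4-tap uniform-weighted MC for 4-wide blocks whose height is a
 * multiple of 8.  Rows four lines apart share one shuffle, so each loop
 * pass performs eight horizontal filterings and emits eight output
 * rows. */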
5118 static void hevc_hv_uniwgt_4t_4multx8mult_msa(const uint8_t *src,
5119  int32_t src_stride,
5120  uint8_t *dst,
5121  int32_t dst_stride,
5122  const int8_t *filter_x,
5123  const int8_t *filter_y,
5124  int32_t height,
5125  int32_t weight,
5126  int32_t offset,
5127  int32_t rnd_val)
5128 {
5129  uint32_t loop_cnt;
5130  v16u8 out0, out1;
5131  v16i8 src0, src1, src2, src3, src4, src5, src6, src7, src8, src9, src10;
5132  v8i16 filt0, filt1;
5133  v16i8 mask0 = LD_SB(ff_hevc_mask_arr + 16);
5134  v16i8 mask1;
5135  v8i16 filter_vec, filt_h0, filt_h1;
5136  v16i8 vec0, vec1, vec2, vec3, vec4, vec5, vec6, vec7;
5137  v8i16 dst10_r, dst32_r, dst54_r, dst76_r;
5138  v8i16 dst21_r, dst43_r, dst65_r, dst87_r;
5139  v8i16 dst98_r, dst109_r;
5140  v8i16 tmp0, tmp1, tmp2, tmp3, tmp4, tmp5, tmp6, tmp7;
5141  v4i32 dst0, dst1, dst2, dst3, dst4, dst5, dst6, dst7, dst8, dst9, dst10;
5142  v4i32 offset_vec, weight_vec, rnd_vec;
5143  v8i16 zero = { 0 };
5144 
5145  src -= (src_stride + 1);
5146 
5147  filter_vec = LD_SH(filter_x);
5148  UNPCK_R_SB_SH(filter_vec, filter_vec);
5149  SPLATI_W2_SH(filter_vec, 0, filt0, filt1);
5150 
5151  filter_vec = LD_SH(filter_y);
5152  UNPCK_R_SB_SH(filter_vec, filter_vec);
5153  SPLATI_W2_SH(filter_vec, 0, filt_h0, filt_h1);
5154 
5155  mask1 = mask0 + 2;
5156 
5157  weight_vec = __msa_fill_w(weight);
5158  rnd_vec = __msa_fill_w(rnd_val);
5159  offset_vec = __msa_fill_w(offset);
5160 
5161  LD_SB3(src, src_stride, src0, src1, src2);
5162  src += (3 * src_stride);
5163 
5164  VSHF_B2_SB(src0, src1, src0, src1, mask0, mask1, vec0, vec1);
5165  VSHF_B2_SB(src1, src2, src1, src2, mask0, mask1, vec2, vec3);
5166  ILVRL_B2_SH(zero, vec0, tmp0, tmp1);
5167  ILVRL_B2_SH(zero, vec1, tmp2, tmp3);
5168  ILVL_B2_SH(zero, vec2, zero, vec3, tmp5, tmp7);
5169  dst0 = HEVC_FILT_4TAP_SW(tmp0, tmp2, filt0, filt1);
5170  dst1 = HEVC_FILT_4TAP_SW(tmp1, tmp3, filt0, filt1);
5171  dst2 = HEVC_FILT_4TAP_SW(tmp5, tmp7, filt0, filt1);
5172 
5173  ILVEV_H2_SH(dst0, dst1, dst1, dst2, dst10_r, dst21_r);
5174 
5175  for (loop_cnt = height >> 3; loop_cnt--;) {
5176  LD_SB8(src, src_stride,
5177  src3, src4, src5, src6, src7, src8, src9, src10);
5178  src += (8 * src_stride);
5179 
5180  VSHF_B2_SB(src3, src7, src3, src7, mask0, mask1, vec0, vec1);
5181  VSHF_B2_SB(src4, src8, src4, src8, mask0, mask1, vec2, vec3);
5182  VSHF_B2_SB(src5, src9, src5, src9, mask0, mask1, vec4, vec5);
5183  VSHF_B2_SB(src6, src10, src6, src10, mask0, mask1, vec6, vec7);
5184 
5185  ILVRL_B2_SH(zero, vec0, tmp0, tmp1);
5186  ILVRL_B2_SH(zero, vec1, tmp2, tmp3);
5187  ILVRL_B2_SH(zero, vec2, tmp4, tmp5);
5188  ILVRL_B2_SH(zero, vec3, tmp6, tmp7);
5189  dst3 = HEVC_FILT_4TAP_SW(tmp0, tmp2, filt0, filt1);
5190  dst7 = HEVC_FILT_4TAP_SW(tmp1, tmp3, filt0, filt1);
5191  dst4 = HEVC_FILT_4TAP_SW(tmp4, tmp6, filt0, filt1);
5192  dst8 = HEVC_FILT_4TAP_SW(tmp5, tmp7, filt0, filt1);
5193  ILVRL_B2_SH(zero, vec4, tmp0, tmp1);
5194  ILVRL_B2_SH(zero, vec5, tmp2, tmp3);
5195  ILVRL_B2_SH(zero, vec6, tmp4, tmp5);
5196  ILVRL_B2_SH(zero, vec7, tmp6, tmp7);
5197  dst5 = HEVC_FILT_4TAP_SW(tmp0, tmp2, filt0, filt1);
5198  dst9 = HEVC_FILT_4TAP_SW(tmp1, tmp3, filt0, filt1);
5199  dst6 = HEVC_FILT_4TAP_SW(tmp4, tmp6, filt0, filt1);
5200  dst10 = HEVC_FILT_4TAP_SW(tmp5, tmp7, filt0, filt1);
5201 
5202  dst32_r = __msa_ilvev_h(dst3, dst2);
5203  ILVEV_H2_SH(dst3, dst4, dst7, dst8, dst43_r, dst87_r);
5204  ILVEV_H2_SH(dst4, dst5, dst8, dst9, dst54_r, dst98_r);
5205  ILVEV_H2_SH(dst5, dst6, dst9, dst10, dst65_r, dst109_r);
5206  dst76_r = __msa_ilvev_h(dst7, dst6);
5207  dst0 = HEVC_FILT_4TAP(dst10_r, dst32_r, filt_h0, filt_h1);
5208  dst1 = HEVC_FILT_4TAP(dst21_r, dst43_r, filt_h0, filt_h1);
5209  dst2 = HEVC_FILT_4TAP(dst32_r, dst54_r, filt_h0, filt_h1);
5210  dst3 = HEVC_FILT_4TAP(dst43_r, dst65_r, filt_h0, filt_h1);
5211  dst4 = HEVC_FILT_4TAP(dst54_r, dst76_r, filt_h0, filt_h1);
5212  dst5 = HEVC_FILT_4TAP(dst65_r, dst87_r, filt_h0, filt_h1);
5213  dst6 = HEVC_FILT_4TAP(dst76_r, dst98_r, filt_h0, filt_h1);
5214  dst7 = HEVC_FILT_4TAP(dst87_r, dst109_r, filt_h0, filt_h1);
5215  SRA_4V(dst0, dst1, dst2, dst3, 6);
5216  SRA_4V(dst4, dst5, dst6, dst7, 6);
5217  MUL2(dst0, weight_vec, dst1, weight_vec, dst0, dst1);
5218  MUL2(dst2, weight_vec, dst3, weight_vec, dst2, dst3);
5219  MUL2(dst4, weight_vec, dst5, weight_vec, dst4, dst5);
5220  MUL2(dst6, weight_vec, dst7, weight_vec, dst6, dst7);
5221  SRAR_W4_SW(dst0, dst1, dst2, dst3, rnd_vec);
5222  SRAR_W4_SW(dst4, dst5, dst6, dst7, rnd_vec);
5223  ADD4(dst0, offset_vec, dst1, offset_vec, dst2, offset_vec, dst3,
5224  offset_vec, dst0, dst1, dst2, dst3);
5225  ADD4(dst4, offset_vec, dst5, offset_vec, dst6, offset_vec, dst7,
5226  offset_vec, dst4, dst5, dst6, dst7);
5227  CLIP_SW8_0_255(dst0, dst1, dst2, dst3, dst4, dst5, dst6, dst7);
5228  PCKEV_H4_SH(dst1, dst0, dst3, dst2, dst5, dst4, dst7, dst6, tmp0, tmp1,
5229  tmp2, tmp3);
5230  PCKEV_B2_UB(tmp1, tmp0, tmp3, tmp2, out0, out1);
5231  ST_W8(out0, out1, 0, 1, 2, 3, 0, 1, 2, 3, dst, dst_stride);
5232  dst += (8 * dst_stride);
5233 
5234  dst10_r = dst98_r;
5235  dst21_r = dst109_r;
5236  dst2 = dst10;
5237  }
5238 }
5239 
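/* HV 4-tap uniform-weighted MC dispatcher for 4-wide blocks: fixed
 * routines for heights 2 and 4, the x8 loop for multiples of 8; other
 * heights are presumably never requested. */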
5240 static void hevc_hv_uniwgt_4t_4w_msa(const uint8_t *src,
5241  int32_t src_stride,
5242  uint8_t *dst,
5243  int32_t dst_stride,
5244  const int8_t *filter_x,
5245  const int8_t *filter_y,
5246  int32_t height,
5247  int32_t weight,
5248  int32_t offset,
5249  int32_t rnd_val)
5250 {
5251  if (2 == height) {
5252  hevc_hv_uniwgt_4t_4x2_msa(src, src_stride, dst, dst_stride,
5253  filter_x, filter_y, weight,
5254  offset, rnd_val);
5255  } else if (4 == height) {
5256  hevc_hv_uniwgt_4t_4x4_msa(src, src_stride, dst, dst_stride,
5257  filter_x, filter_y, weight,
5258  offset, rnd_val);
5259  } else if (0 == (height % 8)) {
5260  hevc_hv_uniwgt_4t_4multx8mult_msa(src, src_stride, dst, dst_stride,
5261  filter_x, filter_y, height, weight,
5262  offset, rnd_val);
5263  }
5264 }
5265 
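/* HV 4-tap uniform-weighted MC for a 6-wide block, fixed at 8 rows (the
 * height argument is not used).  The left 4 columns come from the
 * right-half vertical filters, the remaining 2 columns from the packed
 * left halves, stored as 4-byte plus 2-byte chunks per row. */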
5266 static void hevc_hv_uniwgt_4t_6w_msa(const uint8_t *src,
5267  int32_t src_stride,
5268  uint8_t *dst,
5269  int32_t dst_stride,
5270  const int8_t *filter_x,
5271  const int8_t *filter_y,
5272  int32_t height,
5273  int32_t weight,
5274  int32_t offset,
5275  int32_t rnd_val)
5276 {
5277  v16u8 out0, out1, out2;
5278  v16i8 src0, src1, src2, src3, src4, src5, src6, src7, src8, src9, src10;
5279  v8i16 filt0, filt1;
5280  v16i8 mask0 = LD_SB(ff_hevc_mask_arr);
5281  v16i8 mask1;
5282  v8i16 filt_h0, filt_h1, filter_vec;
5283  v16i8 vec0, vec1, vec2, vec3, vec4, vec5, vec6, vec7;
5284  v8i16 dst0, dst1, dst2, dst3, dst4, dst5, dst6, dst7, dst8, dst9;
5285  v8i16 tmp0, tmp1, tmp2, tmp3, tmp4, tmp5, tmp6, tmp7;
5286  v8i16 dst10_r, dst32_r, dst54_r, dst76_r, dst98_r, dst21_r, dst43_r;
5287  v8i16 dst65_r, dst87_r, dst109_r, dst10_l, dst32_l, dst54_l, dst76_l;
5288  v8i16 dst98_l, dst21_l, dst43_l, dst65_l, dst87_l, dst109_l;
5289  v8i16 dst1021_l, dst3243_l, dst5465_l, dst7687_l, dst98109_l;
5290  v4i32 dst0_r, dst1_r, dst2_r, dst3_r, dst4_r, dst5_r, dst6_r, dst7_r;
5291  v4i32 dst0_l, dst1_l, dst2_l, dst3_l, weight_vec, rnd_vec, offset_vec;
5292  v8i16 zero = { 0 };
5293 
5294  src -= (src_stride + 1);
5295 
5296  filter_vec = LD_SH(filter_x);
5297  UNPCK_R_SB_SH(filter_vec, filter_vec);
5298  SPLATI_W2_SH(filter_vec, 0, filt0, filt1);
5299 
5300  filter_vec = LD_SH(filter_y);
5301  UNPCK_R_SB_SH(filter_vec, filter_vec);
5302  SPLATI_W2_SH(filter_vec, 0, filt_h0, filt_h1);
5303 
5304  mask1 = mask0 + 2;
5305 
5306  weight_vec = __msa_fill_w(weight);
5307  rnd_vec = __msa_fill_w(rnd_val);
5308  offset_vec = __msa_fill_w(offset);
5309 
5310  LD_SB3(src, src_stride, src0, src1, src2);
5311  src += (3 * src_stride);
5312 
5313  VSHF_B2_SB(src0, src0, src0, src0, mask0, mask1, vec0, vec1);
5314  VSHF_B2_SB(src1, src1, src1, src1, mask0, mask1, vec2, vec3);
5315  VSHF_B2_SB(src2, src2, src2, src2, mask0, mask1, vec4, vec5);
5316  ILVRL_B2_SH(zero, vec0, tmp0, tmp1);
5317  ILVRL_B2_SH(zero, vec1, tmp2, tmp3);
5318  ILVRL_B2_SH(zero, vec2, tmp4, tmp5);
5319  ILVRL_B2_SH(zero, vec3, tmp6, tmp7);
5320  dst0 = HEVC_FILT_4TAP_SW(tmp0, tmp2, filt0, filt1);
5321  dst1 = HEVC_FILT_4TAP_SW(tmp1, tmp3, filt0, filt1);
5322  dst2 = HEVC_FILT_4TAP_SW(tmp4, tmp6, filt0, filt1);
5323  dst3 = HEVC_FILT_4TAP_SW(tmp5, tmp7, filt0, filt1);
5324  ILVRL_B2_SH(zero, vec4, tmp0, tmp1);
5325  ILVRL_B2_SH(zero, vec5, tmp2, tmp3);
5326  dst4 = HEVC_FILT_4TAP_SW(tmp0, tmp2, filt0, filt1);
5327  dst5 = HEVC_FILT_4TAP_SW(tmp1, tmp3, filt0, filt1);
5328 
5329  ILVEV_H2_SH(dst0, dst2, dst1, dst3, dst10_r, dst10_l);
5330  ILVEV_H2_SH(dst2, dst4, dst3, dst5, dst21_r, dst21_l);
5331 
5332  LD_SB8(src, src_stride, src3, src4, src5, src6, src7, src8, src9, src10);
5333  VSHF_B2_SB(src3, src3, src3, src3, mask0, mask1, vec0, vec1);
5334  VSHF_B2_SB(src4, src4, src4, src4, mask0, mask1, vec2, vec3);
5335  VSHF_B2_SB(src5, src5, src5, src5, mask0, mask1, vec4, vec5);
5336  VSHF_B2_SB(src6, src6, src6, src6, mask0, mask1, vec6, vec7);
5337  ILVRL_B2_SH(zero, vec0, tmp0, tmp1);
5338  ILVRL_B2_SH(zero, vec1, tmp2, tmp3);
5339  ILVRL_B2_SH(zero, vec2, tmp4, tmp5);
5340  ILVRL_B2_SH(zero, vec3, tmp6, tmp7);
5341  dst6 = HEVC_FILT_4TAP_SW(tmp0, tmp2, filt0, filt1);
5342  dst7 = HEVC_FILT_4TAP_SW(tmp1, tmp3, filt0, filt1);
5343  dst8 = HEVC_FILT_4TAP_SW(tmp4, tmp6, filt0, filt1);
5344  dst9 = HEVC_FILT_4TAP_SW(tmp5, tmp7, filt0, filt1);
5345  ILVEV_H2_SH(dst4, dst6, dst5, dst7, dst32_r, dst32_l);
5346  ILVEV_H2_SH(dst6, dst8, dst7, dst9, dst43_r, dst43_l);
5347  ILVRL_B2_SH(zero, vec4, tmp0, tmp1);
5348  ILVRL_B2_SH(zero, vec5, tmp2, tmp3);
5349  ILVRL_B2_SH(zero, vec6, tmp4, tmp5);
5350  ILVRL_B2_SH(zero, vec7, tmp6, tmp7);
5351  dst0 = HEVC_FILT_4TAP_SW(tmp0, tmp2, filt0, filt1);
5352  dst1 = HEVC_FILT_4TAP_SW(tmp1, tmp3, filt0, filt1);
5353  dst2 = HEVC_FILT_4TAP_SW(tmp4, tmp6, filt0, filt1);
5354  dst3 = HEVC_FILT_4TAP_SW(tmp5, tmp7, filt0, filt1);
5355  ILVEV_H2_SH(dst8, dst0, dst9, dst1, dst54_r, dst54_l);
5356  ILVEV_H2_SH(dst0, dst2, dst1, dst3, dst65_r, dst65_l);
5357  VSHF_B2_SB(src7, src7, src7, src7, mask0, mask1, vec0, vec1);
5358  VSHF_B2_SB(src8, src8, src8, src8, mask0, mask1, vec2, vec3);
5359  VSHF_B2_SB(src9, src9, src9, src9, mask0, mask1, vec4, vec5);
5360  VSHF_B2_SB(src10, src10, src10, src10, mask0, mask1, vec6, vec7);
5361  ILVRL_B2_SH(zero, vec0, tmp0, tmp1);
5362  ILVRL_B2_SH(zero, vec1, tmp2, tmp3);
5363  ILVRL_B2_SH(zero, vec2, tmp4, tmp5);
5364  ILVRL_B2_SH(zero, vec3, tmp6, tmp7);
5365  dst4 = HEVC_FILT_4TAP_SW(tmp0, tmp2, filt0, filt1);
5366  dst5 = HEVC_FILT_4TAP_SW(tmp1, tmp3, filt0, filt1);
5367  dst6 = HEVC_FILT_4TAP_SW(tmp4, tmp6, filt0, filt1);
5368  dst7 = HEVC_FILT_4TAP_SW(tmp5, tmp7, filt0, filt1);
5369  ILVEV_H2_SH(dst2, dst4, dst3, dst5, dst76_r, dst76_l);
5370  ILVEV_H2_SH(dst4, dst6, dst5, dst7, dst87_r, dst87_l);
5371  ILVRL_B2_SH(zero, vec4, tmp0, tmp1);
5372  ILVRL_B2_SH(zero, vec5, tmp2, tmp3);
5373  ILVRL_B2_SH(zero, vec6, tmp4, tmp5);
5374  ILVRL_B2_SH(zero, vec7, tmp6, tmp7);
5375  dst0 = HEVC_FILT_4TAP_SW(tmp0, tmp2, filt0, filt1);
5376  dst1 = HEVC_FILT_4TAP_SW(tmp1, tmp3, filt0, filt1);
5377  dst2 = HEVC_FILT_4TAP_SW(tmp4, tmp6, filt0, filt1);
5378  dst3 = HEVC_FILT_4TAP_SW(tmp5, tmp7, filt0, filt1);
5379  ILVEV_H2_SH(dst6, dst0, dst7, dst1, dst98_r, dst98_l);
5380  ILVEV_H2_SH(dst0, dst2, dst1, dst3, dst109_r, dst109_l);
5381  PCKEV_D2_SH(dst21_l, dst10_l, dst43_l, dst32_l, dst1021_l, dst3243_l);
5382  PCKEV_D2_SH(dst65_l, dst54_l, dst87_l, dst76_l, dst5465_l, dst7687_l);
5383  dst98109_l = (v8i16) __msa_pckev_d((v2i64) dst109_l, (v2i64) dst98_l);
5384  dst0_r = HEVC_FILT_4TAP(dst10_r, dst32_r, filt_h0, filt_h1);
5385  dst1_r = HEVC_FILT_4TAP(dst21_r, dst43_r, filt_h0, filt_h1);
5386  dst2_r = HEVC_FILT_4TAP(dst32_r, dst54_r, filt_h0, filt_h1);
5387  dst3_r = HEVC_FILT_4TAP(dst43_r, dst65_r, filt_h0, filt_h1);
5388  dst4_r = HEVC_FILT_4TAP(dst54_r, dst76_r, filt_h0, filt_h1);
5389  dst5_r = HEVC_FILT_4TAP(dst65_r, dst87_r, filt_h0, filt_h1);
5390  dst6_r = HEVC_FILT_4TAP(dst76_r, dst98_r, filt_h0, filt_h1);
5391  dst7_r = HEVC_FILT_4TAP(dst87_r, dst109_r, filt_h0, filt_h1);
5392  dst0_l = HEVC_FILT_4TAP(dst1021_l, dst3243_l, filt_h0, filt_h1);
5393  dst1_l = HEVC_FILT_4TAP(dst3243_l, dst5465_l, filt_h0, filt_h1);
5394  dst2_l = HEVC_FILT_4TAP(dst5465_l, dst7687_l, filt_h0, filt_h1);
5395  dst3_l = HEVC_FILT_4TAP(dst7687_l, dst98109_l, filt_h0, filt_h1);
5396  SRA_4V(dst0_r, dst1_r, dst2_r, dst3_r, 6);
5397  SRA_4V(dst4_r, dst5_r, dst6_r, dst7_r, 6);
5398  SRA_4V(dst0_l, dst1_l, dst2_l, dst3_l, 6);
5399  MUL2(dst0_r, weight_vec, dst1_r, weight_vec, dst0_r, dst1_r);
5400  MUL2(dst2_r, weight_vec, dst3_r, weight_vec, dst2_r, dst3_r);
5401  MUL2(dst4_r, weight_vec, dst5_r, weight_vec, dst4_r, dst5_r);
5402  MUL2(dst6_r, weight_vec, dst7_r, weight_vec, dst6_r, dst7_r);
5403  MUL2(dst0_l, weight_vec, dst1_l, weight_vec, dst0_l, dst1_l);
5404  MUL2(dst2_l, weight_vec, dst3_l, weight_vec, dst2_l, dst3_l);
5405  SRAR_W4_SW(dst0_r, dst1_r, dst2_r, dst3_r, rnd_vec);
5406  SRAR_W4_SW(dst4_r, dst5_r, dst6_r, dst7_r, rnd_vec);
5407  SRAR_W4_SW(dst0_l, dst1_l, dst2_l, dst3_l, rnd_vec);
5408  ADD4(dst0_r, offset_vec, dst1_r, offset_vec, dst2_r, offset_vec, dst3_r,
5409  offset_vec, dst0_r, dst1_r, dst2_r, dst3_r);
5410  ADD4(dst4_r, offset_vec, dst5_r, offset_vec, dst6_r, offset_vec, dst7_r,
5411  offset_vec, dst4_r, dst5_r, dst6_r, dst7_r);
5412  ADD4(dst0_l, offset_vec, dst1_l, offset_vec, dst2_l, offset_vec, dst3_l,
5413  offset_vec, dst0_l, dst1_l, dst2_l, dst3_l);
5414  CLIP_SW8_0_255(dst1_r, dst0_r, dst3_r, dst2_r,
5415  dst4_r, dst5_r, dst6_r, dst7_r);
5416  CLIP_SW4_0_255(dst0_l, dst1_l, dst2_l, dst3_l);
5417  PCKEV_H2_SH(dst1_r, dst0_r, dst3_r, dst2_r, tmp0, tmp1);
5418  PCKEV_H2_SH(dst5_r, dst4_r, dst7_r, dst6_r, tmp2, tmp3);
5419  PCKEV_H2_SH(dst1_l, dst0_l, dst3_l, dst2_l, tmp4, tmp5);
5420  PCKEV_B3_UB(tmp1, tmp0, tmp3, tmp2, tmp5, tmp4, out0, out1, out2);
5421  ST_W8(out0, out1, 0, 1, 2, 3, 0, 1, 2, 3, dst, dst_stride);
5422  ST_H8(out2, 0, 1, 2, 3, 4, 5, 6, 7, dst + 4, dst_stride);
5423 }
5424 
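/* HV 4-tap uniform-weighted MC for an 8x2 block: five source rows are
 * horizontally filtered across the full 8-column width, then two output
 * rows (right and left 4-pixel halves each) are weighted, rounded,
 * offset and clipped. */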
5425 static void hevc_hv_uniwgt_4t_8x2_msa(const uint8_t *src,
5426  int32_t src_stride,
5427  uint8_t *dst,
5428  int32_t dst_stride,
5429  const int8_t *filter_x,
5430  const int8_t *filter_y,
5431  int32_t weight,
5432  int32_t offset,
5433  int32_t rnd_val)
5434 {
5435  v16u8 out;
5436  v16i8 src0, src1, src2, src3, src4;
5437  v8i16 filt0, filt1;
5438  v8i16 filt_h0, filt_h1, filter_vec;
5439  v16i8 mask0 = LD_SB(ff_hevc_mask_arr);
5440  v16i8 mask1;
5441  v16i8 vec0, vec1, vec2, vec3, vec4, vec5, vec6, vec7, vec8, vec9;
5442  v4i32 dst0, dst1, dst2, dst3, dst4, dst5, dst6, dst7;
5443  v8i16 tmp0, tmp1, tmp2, tmp3, tmp4, tmp5, tmp6, tmp7;
5444  v4i32 dst0_r, dst0_l, dst1_r, dst1_l;
5445  v8i16 dst10_r, dst32_r, dst21_r, dst43_r;
5446  v8i16 dst10_l, dst32_l, dst21_l, dst43_l;
5447  v4i32 weight_vec, rnd_vec, offset_vec;
5448  v8i16 zero = { 0 };
5449 
5450  src -= (src_stride + 1);
5451 
5452  filter_vec = LD_SH(filter_x);
5453  UNPCK_R_SB_SH(filter_vec, filter_vec);
5454  SPLATI_W2_SH(filter_vec, 0, filt0, filt1);
5455 
5456  filter_vec = LD_SH(filter_y);
5457  UNPCK_R_SB_SH(filter_vec, filter_vec);
5458  SPLATI_W2_SH(filter_vec, 0, filt_h0, filt_h1);
5459 
5460  mask1 = mask0 + 2;
5461 
5462  weight_vec = __msa_fill_w(weight);
5463  rnd_vec = __msa_fill_w(rnd_val);
5464  offset_vec = __msa_fill_w(offset);
5465 
5466  LD_SB5(src, src_stride, src0, src1, src2, src3, src4);
5467  VSHF_B2_SB(src0, src0, src0, src0, mask0, mask1, vec0, vec1);
5468  VSHF_B2_SB(src1, src1, src1, src1, mask0, mask1, vec2, vec3);
5469  VSHF_B2_SB(src2, src2, src2, src2, mask0, mask1, vec4, vec5);
5470  VSHF_B2_SB(src3, src3, src3, src3, mask0, mask1, vec6, vec7);
5471  VSHF_B2_SB(src4, src4, src4, src4, mask0, mask1, vec8, vec9);
5472 
5473  ILVRL_B2_SH(zero, vec0, tmp0, tmp1);
5474  ILVRL_B2_SH(zero, vec1, tmp2, tmp3);
5475  ILVRL_B2_SH(zero, vec2, tmp4, tmp5);
5476  ILVRL_B2_SH(zero, vec3, tmp6, tmp7);
5477  dst0 = HEVC_FILT_4TAP_SW(tmp0, tmp2, filt0, filt1);
5478  dst1 = HEVC_FILT_4TAP_SW(tmp1, tmp3, filt0, filt1);
5479  dst2 = HEVC_FILT_4TAP_SW(tmp4, tmp6, filt0, filt1);
5480  dst3 = HEVC_FILT_4TAP_SW(tmp5, tmp7, filt0, filt1);
5481  ILVRL_B2_SH(zero, vec4, tmp0, tmp1);
5482  ILVRL_B2_SH(zero, vec5, tmp2, tmp3);
5483  ILVRL_B2_SH(zero, vec6, tmp4, tmp5);
5484  ILVRL_B2_SH(zero, vec7, tmp6, tmp7);
5485  dst4 = HEVC_FILT_4TAP_SW(tmp0, tmp2, filt0, filt1);
5486  dst5 = HEVC_FILT_4TAP_SW(tmp1, tmp3, filt0, filt1);
5487  dst6 = HEVC_FILT_4TAP_SW(tmp4, tmp6, filt0, filt1);
5488  dst7 = HEVC_FILT_4TAP_SW(tmp5, tmp7, filt0, filt1);
5489 
5490  ILVEV_H2_SH(dst0, dst2, dst1, dst3, dst10_r, dst10_l);
5491  ILVEV_H2_SH(dst2, dst4, dst3, dst5, dst21_r, dst21_l);
5492  ILVEV_H2_SH(dst4, dst6, dst5, dst7, dst32_r, dst32_l);
5493  ILVRL_B2_SH(zero, vec8, tmp0, tmp1);
5494  ILVRL_B2_SH(zero, vec9, tmp2, tmp3);
5495  dst0 = HEVC_FILT_4TAP_SW(tmp0, tmp2, filt0, filt1);
5496  dst1 = HEVC_FILT_4TAP_SW(tmp1, tmp3, filt0, filt1);
5497  ILVEV_H2_SH(dst6, dst0, dst7, dst1, dst43_r, dst43_l);
5498 
5499  dst0_r = HEVC_FILT_4TAP(dst10_r, dst32_r, filt_h0, filt_h1);
5500  dst0_l = HEVC_FILT_4TAP(dst10_l, dst32_l, filt_h0, filt_h1);
5501  dst1_r = HEVC_FILT_4TAP(dst21_r, dst43_r, filt_h0, filt_h1);
5502  dst1_l = HEVC_FILT_4TAP(dst21_l, dst43_l, filt_h0, filt_h1);
5503  SRA_4V(dst0_r, dst0_l, dst1_r, dst1_l, 6);
5504  MUL2(dst0_r, weight_vec, dst1_r, weight_vec, dst0_r, dst1_r);
5505  MUL2(dst0_l, weight_vec, dst1_l, weight_vec, dst0_l, dst1_l);
5506  SRAR_W4_SW(dst0_r, dst0_l, dst1_r, dst1_l, rnd_vec);
5507  ADD4(dst0_r, offset_vec, dst0_l, offset_vec, dst1_r, offset_vec,
5508  dst1_l, offset_vec, dst0_r, dst0_l, dst1_r, dst1_l);
5509  CLIP_SW4_0_255(dst0_r, dst0_l, dst1_r, dst1_l);
5510  PCKEV_H2_SH(dst0_l, dst0_r, dst1_l, dst1_r, tmp0, tmp1);
5511  out = (v16u8) __msa_pckev_b((v16i8) tmp1, (v16i8) tmp0);
5512  ST_D2(out, 0, 1, dst, dst_stride);
5513 }
5514 
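/* HV 4-tap uniform-weighted MC for 8N-wide blocks of height 4:
 * width8mult selects the number of independent 8-column stripes, each
 * processed from seven source rows. */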
5515 static void hevc_hv_uniwgt_4t_8multx4_msa(const uint8_t *src,
5516  int32_t src_stride,
5517  uint8_t *dst,
5518  int32_t dst_stride,
5519  const int8_t *filter_x,
5520  const int8_t *filter_y,
5521  int32_t width8mult,
5522  int32_t weight,
5523  int32_t offset,
5524  int32_t rnd_val)
5525 {
5526  uint32_t cnt;
5527  v16u8 out0, out1;
5528  v16i8 src0, src1, src2, src3, src4, src5, src6, mask0, mask1;
5529  v16i8 vec0, vec1, vec2, vec3, vec4, vec5, vec6, vec7;
5530  v8i16 filt0, filt1, filt_h0, filt_h1, filter_vec;
5531  v4i32 dst0, dst1, dst2, dst3, dst4, dst5, dst6, dst7;
5532  v8i16 tmp0, tmp1, tmp2, tmp3, tmp4, tmp5, tmp6, tmp7;
5533  v8i16 dst10_r, dst32_r, dst54_r, dst21_r, dst43_r, dst65_r;
5534  v8i16 dst10_l, dst32_l, dst54_l, dst21_l, dst43_l, dst65_l;
5535  v4i32 dst0_r, dst0_l, dst1_r, dst1_l, dst2_r, dst2_l, dst3_r, dst3_l;
5536  v4i32 weight_vec, rnd_vec, offset_vec;
5537  v8i16 zero = { 0 };
5538 
5539  src -= (src_stride + 1);
5540 
5541  filter_vec = LD_SH(filter_x);
5542  UNPCK_R_SB_SH(filter_vec, filter_vec);
5543  SPLATI_W2_SH(filter_vec, 0, filt0, filt1);
5544 
5545  filter_vec = LD_SH(filter_y);
5546  UNPCK_R_SB_SH(filter_vec, filter_vec);
5547  SPLATI_W2_SH(filter_vec, 0, filt_h0, filt_h1);
5548 
5549  mask0 = LD_SB(ff_hevc_mask_arr);
5550  mask1 = mask0 + 2;
5551 
5552  weight_vec = __msa_fill_w(weight);
5553  rnd_vec = __msa_fill_w(rnd_val);
5554  offset_vec = __msa_fill_w(offset);
5555 
5556  for (cnt = width8mult; cnt--;) {
5557  LD_SB7(src, src_stride, src0, src1, src2, src3, src4, src5, src6);
5558  src += 8;
5559  VSHF_B2_SB(src0, src0, src0, src0, mask0, mask1, vec0, vec1);
5560  VSHF_B2_SB(src1, src1, src1, src1, mask0, mask1, vec2, vec3);
5561  VSHF_B2_SB(src2, src2, src2, src2, mask0, mask1, vec4, vec5);
5562  ILVRL_B2_SH(zero, vec0, tmp0, tmp1);
5563  ILVRL_B2_SH(zero, vec1, tmp2, tmp3);
5564  ILVRL_B2_SH(zero, vec2, tmp4, tmp5);
5565  ILVRL_B2_SH(zero, vec3, tmp6, tmp7);
5566  dst0 = HEVC_FILT_4TAP_SW(tmp0, tmp2, filt0, filt1);
5567  dst1 = HEVC_FILT_4TAP_SW(tmp1, tmp3, filt0, filt1);
5568  dst2 = HEVC_FILT_4TAP_SW(tmp4, tmp6, filt0, filt1);
5569  dst3 = HEVC_FILT_4TAP_SW(tmp5, tmp7, filt0, filt1);
5570  ILVRL_B2_SH(zero, vec4, tmp0, tmp1);
5571  ILVRL_B2_SH(zero, vec5, tmp2, tmp3);
5572  dst4 = HEVC_FILT_4TAP_SW(tmp0, tmp2, filt0, filt1);
5573  dst5 = HEVC_FILT_4TAP_SW(tmp1, tmp3, filt0, filt1);
5574  ILVEV_H2_SH(dst0, dst2, dst1, dst3, dst10_r, dst10_l);
5575  ILVEV_H2_SH(dst2, dst4, dst3, dst5, dst21_r, dst21_l);
5576  VSHF_B2_SB(src3, src3, src3, src3, mask0, mask1, vec0, vec1);
5577  VSHF_B2_SB(src4, src4, src4, src4, mask0, mask1, vec2, vec3);
5578  VSHF_B2_SB(src5, src5, src5, src5, mask0, mask1, vec4, vec5);
5579  VSHF_B2_SB(src6, src6, src6, src6, mask0, mask1, vec6, vec7);
5580  ILVRL_B2_SH(zero, vec0, tmp0, tmp1);
5581  ILVRL_B2_SH(zero, vec1, tmp2, tmp3);
5582  ILVRL_B2_SH(zero, vec2, tmp4, tmp5);
5583  ILVRL_B2_SH(zero, vec3, tmp6, tmp7);
5584  dst0 = HEVC_FILT_4TAP_SW(tmp0, tmp2, filt0, filt1);
5585  dst1 = HEVC_FILT_4TAP_SW(tmp1, tmp3, filt0, filt1);
5586  dst2 = HEVC_FILT_4TAP_SW(tmp4, tmp6, filt0, filt1);
5587  dst3 = HEVC_FILT_4TAP_SW(tmp5, tmp7, filt0, filt1);
5588  ILVEV_H2_SH(dst4, dst0, dst5, dst1, dst32_r, dst32_l);
5589  ILVEV_H2_SH(dst0, dst2, dst1, dst3, dst43_r, dst43_l);
5590  ILVRL_B2_SH(zero, vec4, tmp0, tmp1);
5591  ILVRL_B2_SH(zero, vec5, tmp2, tmp3);
5592  ILVRL_B2_SH(zero, vec6, tmp4, tmp5);
5593  ILVRL_B2_SH(zero, vec7, tmp6, tmp7);
5594  dst4 = HEVC_FILT_4TAP_SW(tmp0, tmp2, filt0, filt1);
5595  dst5 = HEVC_FILT_4TAP_SW(tmp1, tmp3, filt0, filt1);
5596  dst6 = HEVC_FILT_4TAP_SW(tmp4, tmp6, filt0, filt1);
5597  dst7 = HEVC_FILT_4TAP_SW(tmp5, tmp7, filt0, filt1);
5598  ILVEV_H2_SH(dst2, dst4, dst3, dst5, dst54_r, dst54_l);
5599  ILVEV_H2_SH(dst4, dst6, dst5, dst7, dst65_r, dst65_l);
5600 
5601  dst0_r = HEVC_FILT_4TAP(dst10_r, dst32_r, filt_h0, filt_h1);
5602  dst0_l = HEVC_FILT_4TAP(dst10_l, dst32_l, filt_h0, filt_h1);
5603  dst1_r = HEVC_FILT_4TAP(dst21_r, dst43_r, filt_h0, filt_h1);
5604  dst1_l = HEVC_FILT_4TAP(dst21_l, dst43_l, filt_h0, filt_h1);
5605  dst2_r = HEVC_FILT_4TAP(dst32_r, dst54_r, filt_h0, filt_h1);
5606  dst2_l = HEVC_FILT_4TAP(dst32_l, dst54_l, filt_h0, filt_h1);
5607  dst3_r = HEVC_FILT_4TAP(dst43_r, dst65_r, filt_h0, filt_h1);
5608  dst3_l = HEVC_FILT_4TAP(dst43_l, dst65_l, filt_h0, filt_h1);
5609  SRA_4V(dst0_r, dst0_l, dst1_r, dst1_l, 6);
5610  SRA_4V(dst2_r, dst2_l, dst3_r, dst3_l, 6);
5611  MUL2(dst0_r, weight_vec, dst1_r, weight_vec, dst0_r, dst1_r);
5612  MUL2(dst2_r, weight_vec, dst3_r, weight_vec, dst2_r, dst3_r);
5613  MUL2(dst0_l, weight_vec, dst1_l, weight_vec, dst0_l, dst1_l);
5614  MUL2(dst2_l, weight_vec, dst3_l, weight_vec, dst2_l, dst3_l);
5615  SRAR_W4_SW(dst0_r, dst0_l, dst1_r, dst1_l, rnd_vec);
5616  SRAR_W4_SW(dst2_r, dst2_l, dst3_r, dst3_l, rnd_vec);
5617  ADD2(dst0_r, offset_vec, dst1_r, offset_vec, dst0_r, dst1_r);
5618  ADD2(dst2_r, offset_vec, dst3_r, offset_vec, dst2_r, dst3_r);
5619  ADD2(dst0_l, offset_vec, dst1_l, offset_vec, dst0_l, dst1_l);
5620  ADD2(dst2_l, offset_vec, dst3_l, offset_vec, dst2_l, dst3_l);
5621  CLIP_SW8_0_255(dst0_r, dst0_l, dst1_r, dst1_l,
5622  dst2_r, dst2_l, dst3_r, dst3_l);
5623  PCKEV_H4_SH(dst0_l, dst0_r, dst1_l, dst1_r, dst2_l, dst2_r, dst3_l,
5624  dst3_r, tmp0, tmp1, tmp2, tmp3);
5625  PCKEV_B2_UB(tmp1, tmp0, tmp3, tmp2, out0, out1);
5626  ST_D4(out0, out1, 0, 1, 0, 1, dst, dst_stride);
5627  dst += 8;
5628  }
5629 }
5630 
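/* HV 4-tap uniform-weighted MC for an 8x6 block: nine source rows are
 * horizontally filtered up front, then six vertical results per half
 * are weighted, rounded, offset and clipped. */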
5631 static void hevc_hv_uniwgt_4t_8x6_msa(const uint8_t *src,
5632  int32_t src_stride,
5633  uint8_t *dst,
5634  int32_t dst_stride,
5635  const int8_t *filter_x,
5636  const int8_t *filter_y,
5637  int32_t weight,
5638  int32_t offset,
5639  int32_t rnd_val)
5640 {
5641  v16u8 out0, out1, out2;
5642  v16i8 src0, src1, src2, src3, src4, src5, src6, src7, src8;
5643  v8i16 filt0, filt1;
5644  v8i16 filt_h0, filt_h1, filter_vec;
5645  v16i8 mask0 = LD_SB(ff_hevc_mask_arr);
5646  v16i8 mask1;
5647  v16i8 vec0, vec1, vec2, vec3, vec4, vec5, vec6, vec7, vec8, vec9;
5648  v16i8 vec10, vec11, vec12, vec13, vec14, vec15, vec16, vec17;
5649  v4i32 dst0, dst1, dst2, dst3, dst4, dst5, dst6, dst7;
5650  v4i32 dst0_r, dst0_l, dst1_r, dst1_l, dst2_r, dst2_l, dst3_r, dst3_l;
5651  v4i32 dst4_r, dst4_l, dst5_r, dst5_l, weight_vec, rnd_vec, offset_vec;
5652  v8i16 dst10_r, dst32_r, dst10_l, dst32_l;
5653  v8i16 dst21_r, dst43_r, dst21_l, dst43_l;
5654  v8i16 dst54_r, dst54_l, dst65_r, dst65_l;
5655  v8i16 dst76_r, dst76_l, dst87_r, dst87_l;
5656  v8i16 tmp0, tmp1, tmp2, tmp3, tmp4, tmp5, tmp6, tmp7;
5657  v8i16 zero = { 0 };
5658 
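      /* back up one row and one column: the 4-tap window spans taps -1..2 */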
5659  src -= (src_stride + 1);
5660 
5661  filter_vec = LD_SH(filter_x);
5662  UNPCK_R_SB_SH(filter_vec, filter_vec);
5663  SPLATI_W2_SH(filter_vec, 0, filt0, filt1);
5664 
5665  filter_vec = LD_SH(filter_y);
5666  UNPCK_R_SB_SH(filter_vec, filter_vec);
5667  SPLATI_W2_SH(filter_vec, 0, filt_h0, filt_h1);
5668 
5669  mask1 = mask0 + 2;
5670 
5671  weight_vec = __msa_fill_w(weight);
5672  rnd_vec = __msa_fill_w(rnd_val);
5673  offset_vec = __msa_fill_w(offset);
5674 
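      /* six output rows with a 4-tap vertical filter need 6 + 3 = 9 input rows */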
5675  LD_SB5(src, src_stride, src0, src1, src2, src3, src4);
5676  src += (5 * src_stride);
5677  LD_SB4(src, src_stride, src5, src6, src7, src8);
5678  VSHF_B2_SB(src0, src0, src0, src0, mask0, mask1, vec0, vec1);
5679  VSHF_B2_SB(src1, src1, src1, src1, mask0, mask1, vec2, vec3);
5680  VSHF_B2_SB(src2, src2, src2, src2, mask0, mask1, vec4, vec5);
5681  VSHF_B2_SB(src3, src3, src3, src3, mask0, mask1, vec6, vec7);
5682  VSHF_B2_SB(src4, src4, src4, src4, mask0, mask1, vec8, vec9);
5683  VSHF_B2_SB(src5, src5, src5, src5, mask0, mask1, vec10, vec11);
5684  VSHF_B2_SB(src6, src6, src6, src6, mask0, mask1, vec12, vec13);
5685  VSHF_B2_SB(src7, src7, src7, src7, mask0, mask1, vec14, vec15);
5686  VSHF_B2_SB(src8, src8, src8, src8, mask0, mask1, vec16, vec17);
5687 
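      /* horizontal pass: zero-extend bytes to 16 bits, then 4-tap dot products */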
5688  ILVRL_B2_SH(zero, vec0, tmp0, tmp1);
5689  ILVRL_B2_SH(zero, vec1, tmp2, tmp3);
5690  ILVRL_B2_SH(zero, vec2, tmp4, tmp5);
5691  ILVRL_B2_SH(zero, vec3, tmp6, tmp7);
5692  dst0 = HEVC_FILT_4TAP_SW(tmp0, tmp2, filt0, filt1);
5693  dst1 = HEVC_FILT_4TAP_SW(tmp1, tmp3, filt0, filt1);
5694  dst2 = HEVC_FILT_4TAP_SW(tmp4, tmp6, filt0, filt1);
5695  dst3 = HEVC_FILT_4TAP_SW(tmp5, tmp7, filt0, filt1);
5696  ILVRL_B2_SH(zero, vec4, tmp0, tmp1);
5697  ILVRL_B2_SH(zero, vec5, tmp2, tmp3);
5698  ILVRL_B2_SH(zero, vec6, tmp4, tmp5);
5699  ILVRL_B2_SH(zero, vec7, tmp6, tmp7);
5700  dst4 = HEVC_FILT_4TAP_SW(tmp0, tmp2, filt0, filt1);
5701  dst5 = HEVC_FILT_4TAP_SW(tmp1, tmp3, filt0, filt1);
5702  dst6 = HEVC_FILT_4TAP_SW(tmp4, tmp6, filt0, filt1);
5703  dst7 = HEVC_FILT_4TAP_SW(tmp5, tmp7, filt0, filt1);
5704  ILVEV_H2_SH(dst0, dst2, dst1, dst3, dst10_r, dst10_l);
5705  ILVEV_H2_SH(dst2, dst4, dst3, dst5, dst21_r, dst21_l);
5706  ILVEV_H2_SH(dst4, dst6, dst5, dst7, dst32_r, dst32_l);
5707  ILVRL_B2_SH(zero, vec8, tmp0, tmp1);
5708  ILVRL_B2_SH(zero, vec9, tmp2, tmp3);
5709  ILVRL_B2_SH(zero, vec10, tmp4, tmp5);
5710  ILVRL_B2_SH(zero, vec11, tmp6, tmp7);
5711  dst0 = HEVC_FILT_4TAP_SW(tmp0, tmp2, filt0, filt1);
5712  dst1 = HEVC_FILT_4TAP_SW(tmp1, tmp3, filt0, filt1);
5713  dst2 = HEVC_FILT_4TAP_SW(tmp4, tmp6, filt0, filt1);
5714  dst3 = HEVC_FILT_4TAP_SW(tmp5, tmp7, filt0, filt1);
5715  ILVEV_H2_SH(dst6, dst0, dst7, dst1, dst43_r, dst43_l);
5716  ILVEV_H2_SH(dst0, dst2, dst1, dst3, dst54_r, dst54_l);
5717  ILVRL_B2_SH(zero, vec12, tmp0, tmp1);
5718  ILVRL_B2_SH(zero, vec13, tmp2, tmp3);
5719  ILVRL_B2_SH(zero, vec14, tmp4, tmp5);
5720  ILVRL_B2_SH(zero, vec15, tmp6, tmp7);
5721  dst4 = HEVC_FILT_4TAP_SW(tmp0, tmp2, filt0, filt1);
5722  dst5 = HEVC_FILT_4TAP_SW(tmp1, tmp3, filt0, filt1);
5723  dst6 = HEVC_FILT_4TAP_SW(tmp4, tmp6, filt0, filt1);
5724  dst7 = HEVC_FILT_4TAP_SW(tmp5, tmp7, filt0, filt1);
5725  ILVEV_H2_SH(dst2, dst4, dst3, dst5, dst65_r, dst65_l);
5726  ILVEV_H2_SH(dst4, dst6, dst5, dst7, dst76_r, dst76_l);
5727  ILVRL_B2_SH(zero, vec16, tmp0, tmp1);
5728  ILVRL_B2_SH(zero, vec17, tmp2, tmp3);
5729  dst0 = HEVC_FILT_4TAP_SW(tmp0, tmp2, filt0, filt1);
5730  dst1 = HEVC_FILT_4TAP_SW(tmp1, tmp3, filt0, filt1);
5731  ILVEV_H2_SH(dst6, dst0, dst7, dst1, dst87_r, dst87_l);
5732  dst0_r = HEVC_FILT_4TAP(dst10_r, dst32_r, filt_h0, filt_h1);
5733  dst0_l = HEVC_FILT_4TAP(dst10_l, dst32_l, filt_h0, filt_h1);
5734  dst1_r = HEVC_FILT_4TAP(dst21_r, dst43_r, filt_h0, filt_h1);
5735  dst1_l = HEVC_FILT_4TAP(dst21_l, dst43_l, filt_h0, filt_h1);
5736  dst2_r = HEVC_FILT_4TAP(dst32_r, dst54_r, filt_h0, filt_h1);
5737  dst2_l = HEVC_FILT_4TAP(dst32_l, dst54_l, filt_h0, filt_h1);
5738  dst3_r = HEVC_FILT_4TAP(dst43_r, dst65_r, filt_h0, filt_h1);
5739  dst3_l = HEVC_FILT_4TAP(dst43_l, dst65_l, filt_h0, filt_h1);
5740  dst4_r = HEVC_FILT_4TAP(dst54_r, dst76_r, filt_h0, filt_h1);
5741  dst4_l = HEVC_FILT_4TAP(dst54_l, dst76_l, filt_h0, filt_h1);
5742  dst5_r = HEVC_FILT_4TAP(dst65_r, dst87_r, filt_h0, filt_h1);
5743  dst5_l = HEVC_FILT_4TAP(dst65_l, dst87_l, filt_h0, filt_h1);
5744  SRA_4V(dst0_r, dst0_l, dst1_r, dst1_l, 6);
5745  SRA_4V(dst2_r, dst2_l, dst3_r, dst3_l, 6);
5746  SRA_4V(dst4_r, dst4_l, dst5_r, dst5_l, 6);
5747  MUL2(dst0_r, weight_vec, dst1_r, weight_vec, dst0_r, dst1_r);
5748  MUL2(dst2_r, weight_vec, dst3_r, weight_vec, dst2_r, dst3_r);
5749  MUL2(dst4_r, weight_vec, dst5_r, weight_vec, dst4_r, dst5_r);
5750  MUL2(dst0_l, weight_vec, dst1_l, weight_vec, dst0_l, dst1_l);
5751  MUL2(dst2_l, weight_vec, dst3_l, weight_vec, dst2_l, dst3_l);
5752  MUL2(dst4_l, weight_vec, dst5_l, weight_vec, dst4_l, dst5_l);
5753  SRAR_W4_SW(dst0_r, dst0_l, dst1_r, dst1_l, rnd_vec);
5754  SRAR_W4_SW(dst2_r, dst2_l, dst3_r, dst3_l, rnd_vec);
5755  SRAR_W4_SW(dst4_r, dst4_l, dst5_r, dst5_l, rnd_vec);
5756  ADD2(dst0_r, offset_vec, dst1_r, offset_vec, dst0_r, dst1_r);
5757  ADD2(dst2_r, offset_vec, dst3_r, offset_vec, dst2_r, dst3_r);
5758  ADD2(dst4_r, offset_vec, dst5_r, offset_vec, dst4_r, dst5_r);
5759  ADD2(dst0_l, offset_vec, dst1_l, offset_vec, dst0_l, dst1_l);
5760  ADD2(dst2_l, offset_vec, dst3_l, offset_vec, dst2_l, dst3_l);
5761  ADD2(dst4_l, offset_vec, dst5_l, offset_vec, dst4_l, dst5_l);
5762  CLIP_SW8_0_255(dst0_r, dst1_r, dst2_r, dst3_r,
5763  dst4_r, dst5_r, dst0_l, dst1_l);
5764  CLIP_SW4_0_255(dst2_l, dst3_l, dst4_l, dst5_l);
5765  PCKEV_H4_SH(dst0_l, dst0_r, dst1_l, dst1_r, dst2_l, dst2_r, dst3_l, dst3_r,
5766  tmp0, tmp1, tmp2, tmp3);
5767  PCKEV_H2_SH(dst4_l, dst4_r, dst5_l, dst5_r, tmp4, tmp5);
5768  PCKEV_B3_UB(tmp1, tmp0, tmp3, tmp2, tmp5, tmp4, out0, out1, out2);
5769  ST_D4(out0, out1, 0, 1, 0, 1, dst, dst_stride);
5770  ST_D2(out2, 0, 1, dst + 4 * dst_stride, dst_stride);
5771 }
5772 
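 /* generic H+V 4-tap uni-weighted kernel for widths that are a multiple of 8:
  * each 8-wide column keeps a three-row vertical history and produces four
  * output rows per inner-loop iteration */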
5773 static void hevc_hv_uniwgt_4t_8multx4mult_msa(const uint8_t *src,
5774  int32_t src_stride,
5775  uint8_t *dst,
5776  int32_t dst_stride,
5777  const int8_t *filter_x,
5778  const int8_t *filter_y,
5779  int32_t height,
5780  int32_t weight,
5781  int32_t offset,
5782  int32_t rnd_val,
5783  int32_t width8mult)
5784 {
5785  uint32_t loop_cnt, cnt;
5786  const uint8_t *src_tmp;
5787  uint8_t *dst_tmp;
5788  v16u8 out0, out1;
5789  v16i8 src0, src1, src2, src3, src4, src5, src6;
5790  v8i16 filt0, filt1;
5791  v8i16 filt_h0, filt_h1, filter_vec;
5792  v16i8 mask0 = LD_SB(ff_hevc_mask_arr);
5793  v16i8 mask1;
5794  v16i8 vec0, vec1, vec2, vec3, vec4, vec5, vec6, vec7;
5795  v4i32 dst0, dst1, dst2, dst3, dst4, dst5, dst6, dst7;
5796  v8i16 tmp0, tmp1, tmp2, tmp3, tmp4, tmp5, tmp6, tmp7;
5797  v8i16 dst10_r, dst32_r, dst54_r, dst21_r, dst43_r, dst65_r;
5798  v8i16 dst10_l, dst32_l, dst54_l, dst21_l, dst43_l, dst65_l;
5799  v4i32 dst0_r, dst0_l, dst1_r, dst1_l;
5800  v4i32 dst2_r, dst2_l, dst3_r, dst3_l;
5801  v4i32 weight_vec, rnd_vec, offset_vec;
5802  v8i16 zero = { 0 };
5803 
5804  src -= (src_stride + 1);
5805 
5806  filter_vec = LD_SH(filter_x);
5807  UNPCK_R_SB_SH(filter_vec, filter_vec);
5808  SPLATI_W2_SH(filter_vec, 0, filt0, filt1);
5809 
5810  filter_vec = LD_SH(filter_y);
5811  UNPCK_R_SB_SH(filter_vec, filter_vec);
5812  SPLATI_W2_SH(filter_vec, 0, filt_h0, filt_h1);
5813 
5814  mask1 = mask0 + 2;
5815 
5816  weight_vec = __msa_fill_w(weight);
5817  rnd_vec = __msa_fill_w(rnd_val);
5818  offset_vec = __msa_fill_w(offset);
5819 
5820  for (cnt = width8mult; cnt--;) {
5821  src_tmp = src;
5822  dst_tmp = dst;
5823 
5824  LD_SB3(src_tmp, src_stride, src0, src1, src2);
5825  src_tmp += (3 * src_stride);
5826 
5827  VSHF_B2_SB(src0, src0, src0, src0, mask0, mask1, vec0, vec1);
5828  VSHF_B2_SB(src1, src1, src1, src1, mask0, mask1, vec2, vec3);
5829  VSHF_B2_SB(src2, src2, src2, src2, mask0, mask1, vec4, vec5);
5830 
5831  ILVRL_B2_SH(zero, vec0, tmp0, tmp1);
5832  ILVRL_B2_SH(zero, vec1, tmp2, tmp3);
5833  ILVRL_B2_SH(zero, vec2, tmp4, tmp5);
5834  ILVRL_B2_SH(zero, vec3, tmp6, tmp7);
5835  dst0 = HEVC_FILT_4TAP_SW(tmp0, tmp2, filt0, filt1);
5836  dst1 = HEVC_FILT_4TAP_SW(tmp1, tmp3, filt0, filt1);
5837  dst2 = HEVC_FILT_4TAP_SW(tmp4, tmp6, filt0, filt1);
5838  dst3 = HEVC_FILT_4TAP_SW(tmp5, tmp7, filt0, filt1);
5839  ILVRL_B2_SH(zero, vec4, tmp0, tmp1);
5840  ILVRL_B2_SH(zero, vec5, tmp2, tmp3);
5841  dst4 = HEVC_FILT_4TAP_SW(tmp0, tmp2, filt0, filt1);
5842  dst5 = HEVC_FILT_4TAP_SW(tmp1, tmp3, filt0, filt1);
5843  ILVEV_H2_SH(dst0, dst2, dst1, dst3, dst10_r, dst10_l);
5844  ILVEV_H2_SH(dst2, dst4, dst3, dst5, dst21_r, dst21_l);
5845 
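          /* steady state: four new input rows in, four weighted output rows out */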
5846  for (loop_cnt = height >> 2; loop_cnt--;) {
5847  LD_SB4(src_tmp, src_stride, src3, src4, src5, src6);
5848  src_tmp += (4 * src_stride);
5849 
5850  VSHF_B2_SB(src3, src3, src3, src3, mask0, mask1, vec0, vec1);
5851  VSHF_B2_SB(src4, src4, src4, src4, mask0, mask1, vec2, vec3);
5852  VSHF_B2_SB(src5, src5, src5, src5, mask0, mask1, vec4, vec5);
5853  VSHF_B2_SB(src6, src6, src6, src6, mask0, mask1, vec6, vec7);
5854 
5855  ILVRL_B2_SH(zero, vec0, tmp0, tmp1);
5856  ILVRL_B2_SH(zero, vec1, tmp2, tmp3);
5857  ILVRL_B2_SH(zero, vec2, tmp4, tmp5);
5858  ILVRL_B2_SH(zero, vec3, tmp6, tmp7);
5859  dst0 = HEVC_FILT_4TAP_SW(tmp0, tmp2, filt0, filt1);
5860  dst1 = HEVC_FILT_4TAP_SW(tmp1, tmp3, filt0, filt1);
5861  dst2 = HEVC_FILT_4TAP_SW(tmp4, tmp6, filt0, filt1);
5862  dst3 = HEVC_FILT_4TAP_SW(tmp5, tmp7, filt0, filt1);
5863  ILVEV_H2_SH(dst4, dst0, dst5, dst1, dst32_r, dst32_l);
5864  ILVEV_H2_SH(dst0, dst2, dst1, dst3, dst43_r, dst43_l);
5865  ILVRL_B2_SH(zero, vec4, tmp0, tmp1);
5866  ILVRL_B2_SH(zero, vec5, tmp2, tmp3);
5867  ILVRL_B2_SH(zero, vec6, tmp4, tmp5);
5868  ILVRL_B2_SH(zero, vec7, tmp6, tmp7);
5869  dst4 = HEVC_FILT_4TAP_SW(tmp0, tmp2, filt0, filt1);
5870  dst5 = HEVC_FILT_4TAP_SW(tmp1, tmp3, filt0, filt1);
5871  dst6 = HEVC_FILT_4TAP_SW(tmp4, tmp6, filt0, filt1);
5872  dst7 = HEVC_FILT_4TAP_SW(tmp5, tmp7, filt0, filt1);
5873  ILVEV_H2_SH(dst2, dst4, dst3, dst5, dst54_r, dst54_l);
5874  ILVEV_H2_SH(dst4, dst6, dst5, dst7, dst65_r, dst65_l);
5875  dst0_r = HEVC_FILT_4TAP(dst10_r, dst32_r, filt_h0, filt_h1);
5876  dst0_l = HEVC_FILT_4TAP(dst10_l, dst32_l, filt_h0, filt_h1);
5877  dst1_r = HEVC_FILT_4TAP(dst21_r, dst43_r, filt_h0, filt_h1);
5878  dst1_l = HEVC_FILT_4TAP(dst21_l, dst43_l, filt_h0, filt_h1);
5879  dst2_r = HEVC_FILT_4TAP(dst32_r, dst54_r, filt_h0, filt_h1);
5880  dst2_l = HEVC_FILT_4TAP(dst32_l, dst54_l, filt_h0, filt_h1);
5881  dst3_r = HEVC_FILT_4TAP(dst43_r, dst65_r, filt_h0, filt_h1);
5882  dst3_l = HEVC_FILT_4TAP(dst43_l, dst65_l, filt_h0, filt_h1);
5883  SRA_4V(dst0_r, dst0_l, dst1_r, dst1_l, 6);
5884  SRA_4V(dst2_r, dst2_l, dst3_r, dst3_l, 6);
5885  MUL2(dst0_r, weight_vec, dst1_r, weight_vec, dst0_r, dst1_r);
5886  MUL2(dst2_r, weight_vec, dst3_r, weight_vec, dst2_r, dst3_r);
5887  MUL2(dst0_l, weight_vec, dst1_l, weight_vec, dst0_l, dst1_l);
5888  MUL2(dst2_l, weight_vec, dst3_l, weight_vec, dst2_l, dst3_l);
5889  SRAR_W4_SW(dst0_r, dst0_l, dst1_r, dst1_l, rnd_vec);
5890  SRAR_W4_SW(dst2_r, dst2_l, dst3_r, dst3_l, rnd_vec);
5891  ADD2(dst0_r, offset_vec, dst1_r, offset_vec, dst0_r, dst1_r);
5892  ADD2(dst2_r, offset_vec, dst3_r, offset_vec, dst2_r, dst3_r);
5893  ADD2(dst0_l, offset_vec, dst1_l, offset_vec, dst0_l, dst1_l);
5894  ADD2(dst2_l, offset_vec, dst3_l, offset_vec, dst2_l, dst3_l);
5895  CLIP_SW8_0_255(dst0_r, dst0_l, dst1_r, dst1_l,
5896  dst2_r, dst2_l, dst3_r, dst3_l);
5897  PCKEV_H4_SH(dst0_l, dst0_r, dst1_l, dst1_r, dst2_l, dst2_r, dst3_l,
5898  dst3_r, tmp0, tmp1, tmp2, tmp3);
5899  PCKEV_B2_UB(tmp1, tmp0, tmp3, tmp2, out0, out1);
5900  ST_D4(out0, out1, 0, 1, 0, 1, dst_tmp, dst_stride);
5901  dst_tmp += (4 * dst_stride);
5902 
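              /* rotate the vertical history so the last three filtered rows
               * seed the next pass */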
5903  dst10_r = dst54_r;
5904  dst10_l = dst54_l;
5905  dst21_r = dst65_r;
5906  dst21_l = dst65_l;
5907  dst4 = dst6;
5908  dst5 = dst7;
5909  }
5910 
5911  src += 8;
5912  dst += 8;
5913  }
5914 }
5915 
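 /* 8-wide dispatcher: fixed-size kernels for heights 2, 4 and 6, otherwise
  * the generic multiple-of-4 loop */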
5916 static void hevc_hv_uniwgt_4t_8w_msa(const uint8_t *src,
5917  int32_t src_stride,
5918  uint8_t *dst,
5919  int32_t dst_stride,
5920  const int8_t *filter_x,
5921  const int8_t *filter_y,
5922  int32_t height,
5923  int32_t weight,
5924  int32_t offset,
5925  int32_t rnd_val)
5926 {
5927 
5928  if (2 == height) {
5929  hevc_hv_uniwgt_4t_8x2_msa(src, src_stride, dst, dst_stride,
5930  filter_x, filter_y, weight,
5931  offset, rnd_val);
5932  } else if (4 == height) {
5933  hevc_hv_uniwgt_4t_8multx4_msa(src, src_stride, dst, dst_stride,
5934  filter_x, filter_y, 1, weight,
5935  offset, rnd_val);
5936  } else if (6 == height) {
5937  hevc_hv_uniwgt_4t_8x6_msa(src, src_stride, dst, dst_stride,
5938  filter_x, filter_y, weight,
5939  offset, rnd_val);
5940  } else if (0 == (height % 4)) {
5941  hevc_hv_uniwgt_4t_8multx4mult_msa(src, src_stride, dst, dst_stride,
5942  filter_x, filter_y, height, weight,
5943  offset, rnd_val, 1);
5944  }
5945 }
5946 
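 /* 12-wide: an 8-wide left column followed by a 4-wide right column. The loop
  * counts below hard-code 16 rows, so the height argument is unused and the
  * caller presumably always passes height == 16 here. */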
5947 static void hevc_hv_uniwgt_4t_12w_msa(const uint8_t *src,
5948  int32_t src_stride,
5949  uint8_t *dst,
5950  int32_t dst_stride,
5951  const int8_t *filter_x,
5952  const int8_t *filter_y,
5953  int32_t height,
5954  int32_t weight,
5955  int32_t offset,
5956  int32_t rnd_val)
5957 {
5958  uint32_t loop_cnt;
5959  const uint8_t *src_tmp;
5960  uint8_t *dst_tmp;
5961  v16u8 out0, out1;
5962  v16i8 src0, src1, src2, src3, src4, src5, src6, src7, src8, src9, src10;
5963  v16i8 vec0, vec1, vec2, vec3, vec4, vec5, vec6, vec7;
5964  v16i8 mask0, mask1, mask2, mask3;
5965  v8i16 filt0, filt1, filt_h0, filt_h1, filter_vec;
5966  v4i32 dsth0, dsth1, dsth2, dsth3, dsth4, dsth5, dsth6, dsth7;
5967  v8i16 dst76_r, dst98_r, dst87_r, dst109_r;
5968  v8i16 dst10_r, dst32_r, dst54_r, dst21_r, dst43_r, dst65_r;
5969  v8i16 dst10_l, dst32_l, dst54_l, dst21_l, dst43_l, dst65_l;
5970  v4i32 dst0_r, dst0_l, dst1_r, dst1_l, dst2_r, dst2_l, dst3_r, dst3_l;
5971  v4i32 dst0, dst1, dst2, dst3, dst4, dst5, dst6, dst7, weight_vec, rnd_vec;
5972  v4i32 dst8, dst9, dst10, offset_vec;
5973  v8i16 tmp0, tmp1, tmp2, tmp3, tmp4, tmp5, tmp6, tmp7;
5974  v8i16 zero = { 0 };
5975 
5976  src -= (src_stride + 1);
5977 
5978  filter_vec = LD_SH(filter_x);
5979  UNPCK_R_SB_SH(filter_vec, filter_vec);
5980  SPLATI_W2_SH(filter_vec, 0, filt0, filt1);
5981 
5982  filter_vec = LD_SH(filter_y);
5983  UNPCK_R_SB_SH(filter_vec, filter_vec);
5984  SPLATI_W2_SH(filter_vec, 0, filt_h0, filt_h1);
5985 
5986  mask0 = LD_SB(ff_hevc_mask_arr);
5987  mask1 = mask0 + 2;
5988 
5989  weight_vec = __msa_fill_w(weight);
5990  rnd_vec = __msa_fill_w(rnd_val);
5991  offset_vec = __msa_fill_w(offset);
5992 
5993  src_tmp = src;
5994  dst_tmp = dst;
5995 
5996  LD_SB3(src_tmp, src_stride, src0, src1, src2);
5997  src_tmp += (3 * src_stride);
5998  VSHF_B2_SB(src0, src0, src0, src0, mask0, mask1, vec0, vec1);
5999  VSHF_B2_SB(src1, src1, src1, src1, mask0, mask1, vec2, vec3);
6000  VSHF_B2_SB(src2, src2, src2, src2, mask0, mask1, vec4, vec5);
6001 
6002  ILVRL_B2_SH(zero, vec0, tmp0, tmp1);
6003  ILVRL_B2_SH(zero, vec1, tmp2, tmp3);
6004  ILVRL_B2_SH(zero, vec2, tmp4, tmp5);
6005  ILVRL_B2_SH(zero, vec3, tmp6, tmp7);
6006  dsth0 = HEVC_FILT_4TAP_SW(tmp0, tmp2, filt0, filt1);
6007  dsth1 = HEVC_FILT_4TAP_SW(tmp1, tmp3, filt0, filt1);
6008  dsth2 = HEVC_FILT_4TAP_SW(tmp4, tmp6, filt0, filt1);
6009  dsth3 = HEVC_FILT_4TAP_SW(tmp5, tmp7, filt0, filt1);
6010  ILVRL_B2_SH(zero, vec4, tmp0, tmp1);
6011  ILVRL_B2_SH(zero, vec5, tmp2, tmp3);
6012  dsth4 = HEVC_FILT_4TAP_SW(tmp0, tmp2, filt0, filt1);
6013  dsth5 = HEVC_FILT_4TAP_SW(tmp1, tmp3, filt0, filt1);
6014  ILVEV_H2_SH(dsth0, dsth2, dsth1, dsth3, dst10_r, dst10_l);
6015  ILVEV_H2_SH(dsth2, dsth4, dsth3, dsth5, dst21_r, dst21_l);
6016 
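      /* left 8-wide column: four passes of four rows each (16 rows) */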
6017  for (loop_cnt = 4; loop_cnt--;) {
6018  LD_SB4(src_tmp, src_stride, src3, src4, src5, src6);
6019  src_tmp += (4 * src_stride);
6020  VSHF_B2_SB(src3, src3, src3, src3, mask0, mask1, vec0, vec1);
6021  VSHF_B2_SB(src4, src4, src4, src4, mask0, mask1, vec2, vec3);
6022  VSHF_B2_SB(src5, src5, src5, src5, mask0, mask1, vec4, vec5);
6023  VSHF_B2_SB(src6, src6, src6, src6, mask0, mask1, vec6, vec7);
6024 
6025  ILVRL_B2_SH(zero, vec0, tmp0, tmp1);
6026  ILVRL_B2_SH(zero, vec1, tmp2, tmp3);
6027  ILVRL_B2_SH(zero, vec2, tmp4, tmp5);
6028  ILVRL_B2_SH(zero, vec3, tmp6, tmp7);
6029  dsth0 = HEVC_FILT_4TAP_SW(tmp0, tmp2, filt0, filt1);
6030  dsth1 = HEVC_FILT_4TAP_SW(tmp1, tmp3, filt0, filt1);
6031  dsth2 = HEVC_FILT_4TAP_SW(tmp4, tmp6, filt0, filt1);
6032  dsth3 = HEVC_FILT_4TAP_SW(tmp5, tmp7, filt0, filt1);
6033  ILVEV_H2_SH(dsth4, dsth0, dsth5, dsth1, dst32_r, dst32_l);
6034  ILVEV_H2_SH(dsth0, dsth2, dsth1, dsth3, dst43_r, dst43_l);
6035  ILVRL_B2_SH(zero, vec4, tmp0, tmp1);
6036  ILVRL_B2_SH(zero, vec5, tmp2, tmp3);
6037  ILVRL_B2_SH(zero, vec6, tmp4, tmp5);
6038  ILVRL_B2_SH(zero, vec7, tmp6, tmp7);
6039  dsth4 = HEVC_FILT_4TAP_SW(tmp0, tmp2, filt0, filt1);
6040  dsth5 = HEVC_FILT_4TAP_SW(tmp1, tmp3, filt0, filt1);
6041  dsth6 = HEVC_FILT_4TAP_SW(tmp4, tmp6, filt0, filt1);
6042  dsth7 = HEVC_FILT_4TAP_SW(tmp5, tmp7, filt0, filt1);
6043  ILVEV_H2_SH(dsth2, dsth4, dsth3, dsth5, dst54_r, dst54_l);
6044  ILVEV_H2_SH(dsth4, dsth6, dsth5, dsth7, dst65_r, dst65_l);
6045 
6046  dst0_r = HEVC_FILT_4TAP(dst10_r, dst32_r, filt_h0, filt_h1);
6047  dst0_l = HEVC_FILT_4TAP(dst10_l, dst32_l, filt_h0, filt_h1);
6048  dst1_r = HEVC_FILT_4TAP(dst21_r, dst43_r, filt_h0, filt_h1);
6049  dst1_l = HEVC_FILT_4TAP(dst21_l, dst43_l, filt_h0, filt_h1);
6050  dst2_r = HEVC_FILT_4TAP(dst32_r, dst54_r, filt_h0, filt_h1);
6051  dst2_l = HEVC_FILT_4TAP(dst32_l, dst54_l, filt_h0, filt_h1);
6052  dst3_r = HEVC_FILT_4TAP(dst43_r, dst65_r, filt_h0, filt_h1);
6053  dst3_l = HEVC_FILT_4TAP(dst43_l, dst65_l, filt_h0, filt_h1);
6054  SRA_4V(dst0_r, dst0_l, dst1_r, dst1_l, 6);
6055  SRA_4V(dst2_r, dst2_l, dst3_r, dst3_l, 6);
6056  MUL2(dst0_r, weight_vec, dst1_r, weight_vec, dst0_r, dst1_r);
6057  MUL2(dst2_r, weight_vec, dst3_r, weight_vec, dst2_r, dst3_r);
6058  MUL2(dst0_l, weight_vec, dst1_l, weight_vec, dst0_l, dst1_l);
6059  MUL2(dst2_l, weight_vec, dst3_l, weight_vec, dst2_l, dst3_l);
6060  SRAR_W4_SW(dst0_r, dst0_l, dst1_r, dst1_l, rnd_vec);
6061  SRAR_W4_SW(dst2_r, dst2_l, dst3_r, dst3_l, rnd_vec);
6062  ADD2(dst0_r, offset_vec, dst1_r, offset_vec, dst0_r, dst1_r);
6063  ADD2(dst2_r, offset_vec, dst3_r, offset_vec, dst2_r, dst3_r);
6064  ADD2(dst0_l, offset_vec, dst1_l, offset_vec, dst0_l, dst1_l);
6065  ADD2(dst2_l, offset_vec, dst3_l, offset_vec, dst2_l, dst3_l);
6066  CLIP_SW8_0_255(dst0_r, dst0_l, dst1_r, dst1_l,
6067  dst2_r, dst2_l, dst3_r, dst3_l);
6068  PCKEV_H4_SH(dst0_l, dst0_r, dst1_l, dst1_r, dst2_l, dst2_r, dst3_l,
6069  dst3_r, tmp0, tmp1, tmp2, tmp3);
6070  PCKEV_B2_UB(tmp1, tmp0, tmp3, tmp2, out0, out1);
6071  ST_D4(out0, out1, 0, 1, 0, 1, dst_tmp, dst_stride);
6072  dst_tmp += (4 * dst_stride);
6073 
6074  dst10_r = dst54_r;
6075  dst10_l = dst54_l;
6076  dst21_r = dst65_r;
6077  dst21_l = dst65_l;
6078  dsth4 = dsth6;
6079  dsth5 = dsth7;
6080  }
6081 
6082  src += 8;
6083  dst += 8;
6084 
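      /* right 4-wide column: switch to the 4-width shuffle masks, which pack
       * two source rows into each shuffle */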
6085  mask2 = LD_SB(ff_hevc_mask_arr + 16);
6086  mask3 = mask2 + 2;
6087 
6088  LD_SB3(src, src_stride, src0, src1, src2);
6089  src += (3 * src_stride);
6090  VSHF_B2_SB(src0, src1, src0, src1, mask2, mask3, vec0, vec1);
6091  VSHF_B2_SB(src1, src2, src1, src2, mask2, mask3, vec2, vec3);
6092 
6093  ILVRL_B2_SH(zero, vec0, tmp0, tmp1);
6094  ILVRL_B2_SH(zero, vec1, tmp2, tmp3);
6095  ILVL_B2_SH(zero, vec2, zero, vec3, tmp4, tmp6);
6096  dst0 = HEVC_FILT_4TAP_SW(tmp0, tmp2, filt0, filt1);
6097  dst1 = HEVC_FILT_4TAP_SW(tmp1, tmp3, filt0, filt1);
6098  dst2 = HEVC_FILT_4TAP_SW(tmp4, tmp6, filt0, filt1);
6099 
6100  ILVEV_H2_SH(dst0, dst1, dst1, dst2, dst10_r, dst21_r);
6101 
6102  for (loop_cnt = 2; loop_cnt--;) {
6103  LD_SB8(src, src_stride, src3, src4, src5, src6, src7, src8, src9,
6104  src10);
6105  src += (8 * src_stride);
6106  VSHF_B2_SB(src3, src7, src3, src7, mask2, mask3, vec0, vec1);
6107  VSHF_B2_SB(src4, src8, src4, src8, mask2, mask3, vec2, vec3);
6108  VSHF_B2_SB(src5, src9, src5, src9, mask2, mask3, vec4, vec5);
6109  VSHF_B2_SB(src6, src10, src6, src10, mask2, mask3, vec6, vec7);
6110 
6111  ILVRL_B2_SH(zero, vec0, tmp0, tmp1);
6112  ILVRL_B2_SH(zero, vec1, tmp2, tmp3);
6113  ILVRL_B2_SH(zero, vec2, tmp4, tmp5);
6114  ILVRL_B2_SH(zero, vec3, tmp6, tmp7);
6115  dst3 = HEVC_FILT_4TAP_SW(tmp0, tmp2, filt0, filt1);
6116  dst7 = HEVC_FILT_4TAP_SW(tmp1, tmp3, filt0, filt1);
6117  dst4 = HEVC_FILT_4TAP_SW(tmp4, tmp6, filt0, filt1);
6118  dst8 = HEVC_FILT_4TAP_SW(tmp5, tmp7, filt0, filt1);
6119  ILVRL_B2_SH(zero, vec4, tmp0, tmp1);
6120  ILVRL_B2_SH(zero, vec5, tmp2, tmp3);
6121  ILVRL_B2_SH(zero, vec6, tmp4, tmp5);
6122  ILVRL_B2_SH(zero, vec7, tmp6, tmp7);
6123  dst5 = HEVC_FILT_4TAP_SW(tmp0, tmp2, filt0, filt1);
6124  dst9 = HEVC_FILT_4TAP_SW(tmp1, tmp3, filt0, filt1);
6125  dst6 = HEVC_FILT_4TAP_SW(tmp4, tmp6, filt0, filt1);
6126  dst10 = HEVC_FILT_4TAP_SW(tmp5, tmp7, filt0, filt1);
6127 
6128  dst32_r = __msa_ilvev_h(dst3, dst2);
6129  ILVEV_H2_SH(dst3, dst4, dst7, dst8, dst43_r, dst87_r);
6130  ILVEV_H2_SH(dst4, dst5, dst8, dst9, dst54_r, dst98_r);
6131  ILVEV_H2_SH(dst5, dst6, dst9, dst10, dst65_r, dst109_r);
6132  dst76_r = __msa_ilvev_h(dst7, dst6);
6133  dst0 = HEVC_FILT_4TAP(dst10_r, dst32_r, filt_h0, filt_h1);
6134  dst1 = HEVC_FILT_4TAP(dst21_r, dst43_r, filt_h0, filt_h1);
6135  dst2 = HEVC_FILT_4TAP(dst32_r, dst54_r, filt_h0, filt_h1);
6136  dst3 = HEVC_FILT_4TAP(dst43_r, dst65_r, filt_h0, filt_h1);
6137  dst4 = HEVC_FILT_4TAP(dst54_r, dst76_r, filt_h0, filt_h1);
6138  dst5 = HEVC_FILT_4TAP(dst65_r, dst87_r, filt_h0, filt_h1);
6139  dst6 = HEVC_FILT_4TAP(dst76_r, dst98_r, filt_h0, filt_h1);
6140  dst7 = HEVC_FILT_4TAP(dst87_r, dst109_r, filt_h0, filt_h1);
6141  SRA_4V(dst0, dst1, dst2, dst3, 6);
6142  SRA_4V(dst4, dst5, dst6, dst7, 6);
6143  MUL2(dst0, weight_vec, dst1, weight_vec, dst0, dst1);
6144  MUL2(dst2, weight_vec, dst3, weight_vec, dst2, dst3);
6145  MUL2(dst4, weight_vec, dst5, weight_vec, dst4, dst5);
6146  MUL2(dst6, weight_vec, dst7, weight_vec, dst6, dst7);
6147  SRAR_W4_SW(dst0, dst1, dst2, dst3, rnd_vec);
6148  SRAR_W4_SW(dst4, dst5, dst6, dst7, rnd_vec);
6149  ADD2(dst0, offset_vec, dst1, offset_vec, dst0, dst1);
6150  ADD2(dst2, offset_vec, dst3, offset_vec, dst2, dst3);
6151  ADD2(dst4, offset_vec, dst5, offset_vec, dst4, dst5);
6152  ADD2(dst6, offset_vec, dst7, offset_vec, dst6, dst7);
6153  CLIP_SW8_0_255(dst0, dst1, dst2, dst3,
6154  dst4, dst5, dst6, dst7);
6155  PCKEV_H4_SH(dst1, dst0, dst3, dst2, dst5, dst4, dst7, dst6, tmp0, tmp1,
6156  tmp2, tmp3);
6157  PCKEV_B2_UB(tmp1, tmp0, tmp3, tmp2, out0, out1);
6158  ST_W8(out0, out1, 0, 1, 2, 3, 0, 1, 2, 3, dst, dst_stride);
6159  dst += (8 * dst_stride);
6160 
6161  dst10_r = dst98_r;
6162  dst21_r = dst109_r;
6163  dst2 = dst10;
6164  }
6165 }
6166 
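 /* 16-, 24- and 32-wide variants reuse the 8-column kernel with
  * width8mult = 2, 3 and 4 */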
6167 static void hevc_hv_uniwgt_4t_16w_msa(const uint8_t *src,
6168  int32_t src_stride,
6169  uint8_t *dst,
6170  int32_t dst_stride,
6171  const int8_t *filter_x,
6172  const int8_t *filter_y,
6173  int32_t height,
6174  int32_t weight,
6175  int32_t offset,
6176  int32_t rnd_val)
6177 {
6178  if (4 == height) {
6179  hevc_hv_uniwgt_4t_8multx4_msa(src, src_stride, dst, dst_stride,
6180  filter_x, filter_y, 2, weight, offset,
6181  rnd_val);
6182  } else {
6183  hevc_hv_uniwgt_4t_8multx4mult_msa(src, src_stride, dst, dst_stride,
6184  filter_x, filter_y, height, weight,
6185  offset, rnd_val, 2);
6186  }
6187 }
6188 
6189 static void hevc_hv_uniwgt_4t_24w_msa(const uint8_t *src,
6190  int32_t src_stride,
6191  uint8_t *dst,
6192  int32_t dst_stride,
6193  const int8_t *filter_x,
6194  const int8_t *filter_y,
6195  int32_t height,
6196  int32_t weight,
6197  int32_t offset,
6198  int32_t rnd_val)
6199 {
6200  hevc_hv_uniwgt_4t_8multx4mult_msa(src, src_stride, dst, dst_stride,
6201  filter_x, filter_y, height, weight,
6202  offset, rnd_val, 3);
6203 }
6204 
6205 static void hevc_hv_uniwgt_4t_32w_msa(const uint8_t *src,
6206  int32_t src_stride,
6207  uint8_t *dst,
6208  int32_t dst_stride,
6209  const int8_t *filter_x,
6210  const int8_t *filter_y,
6211  int32_t height,
6212  int32_t weight,
6213  int32_t offset,
6214  int32_t rnd_val)
6215 {
6216  hevc_hv_uniwgt_4t_8multx4mult_msa(src, src_stride, dst, dst_stride,
6217  filter_x, filter_y, height, weight,
6218  offset, rnd_val, 4);
6219 }
6220 
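 /*
  * Public entry points. For 8-bit content every kernel is handed the HEVC
  * uni-weighted rounding shift denom + 14 - 8 = denom + 6 (14-bit intermediate
  * precision); e.g. the hv kernels above compute, per sample,
  *   dst = clip3(0, 255, (((filt >> 6) * weight + (1 << (shift - 1))) >> shift) + offset)
  * with the rounding add performed inside SRAR_W4_SW.
  */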
6221 #define UNIWGT_MC_COPY(WIDTH) \
6222 void ff_hevc_put_hevc_uni_w_pel_pixels##WIDTH##_8_msa(uint8_t *dst, \
6223  ptrdiff_t dst_stride, \
6224  const uint8_t *src, \
6225  ptrdiff_t src_stride, \
6226  int height, \
6227  int denom, \
6228  int weight, \
6229  int offset, \
6230  intptr_t mx, \
6231  intptr_t my, \
6232  int width) \
6233 { \
6234  int shift = denom + 14 - 8; \
6235  hevc_uniwgt_copy_##WIDTH##w_msa(src, src_stride, dst, dst_stride, \
6236  height, weight, offset, shift); \
6237 }
6238 
6239 UNIWGT_MC_COPY(4);
6240 UNIWGT_MC_COPY(6);
6241 UNIWGT_MC_COPY(8);
6242 UNIWGT_MC_COPY(12);
6243 UNIWGT_MC_COPY(16);
6244 UNIWGT_MC_COPY(24);
6245 UNIWGT_MC_COPY(32);
6246 UNIWGT_MC_COPY(48);
6247 UNIWGT_MC_COPY(64);
6248 
6249 #undef UNIWGT_MC_COPY
6250 
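 /* single-direction (h or v) wrappers: FILT_DIR (mx or my) selects the
  * qpel/epel filter phase */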
6251 #define UNI_W_MC(PEL, DIR, WIDTH, TAP, DIR1, FILT_DIR) \
6252 void ff_hevc_put_hevc_uni_w_##PEL##_##DIR##WIDTH##_8_msa(uint8_t *dst, \
6253  ptrdiff_t \
6254  dst_stride, \
6255  const uint8_t *src, \
6256  ptrdiff_t \
6257  src_stride, \
6258  int height, \
6259  int denom, \
6260  int weight, \
6261  int offset, \
6262  intptr_t mx, \
6263  intptr_t my, \
6264  int width) \
6265 { \
6266  const int8_t *filter = ff_hevc_##PEL##_filters[FILT_DIR - 1]; \
6267  int shift = denom + 14 - 8; \
6268  \
6269  hevc_##DIR1##_uniwgt_##TAP##t_##WIDTH##w_msa(src, src_stride, dst, \
6270  dst_stride, filter, height, \
6271  weight, offset, shift); \
6272 }
6273 
6274 UNI_W_MC(qpel, h, 4, 8, hz, mx);
6275 UNI_W_MC(qpel, h, 8, 8, hz, mx);
6276 UNI_W_MC(qpel, h, 12, 8, hz, mx);
6277 UNI_W_MC(qpel, h, 16, 8, hz, mx);
6278 UNI_W_MC(qpel, h, 24, 8, hz, mx);
6279 UNI_W_MC(qpel, h, 32, 8, hz, mx);
6280 UNI_W_MC(qpel, h, 48, 8, hz, mx);
6281 UNI_W_MC(qpel, h, 64, 8, hz, mx);
6282 
6283 UNI_W_MC(qpel, v, 4, 8, vt, my);
6284 UNI_W_MC(qpel, v, 8, 8, vt, my);
6285 UNI_W_MC(qpel, v, 12, 8, vt, my);
6286 UNI_W_MC(qpel, v, 16, 8, vt, my);
6287 UNI_W_MC(qpel, v, 24, 8, vt, my);
6288 UNI_W_MC(qpel, v, 32, 8, vt, my);
6289 UNI_W_MC(qpel, v, 48, 8, vt, my);
6290 UNI_W_MC(qpel, v, 64, 8, vt, my);
6291 
6292 UNI_W_MC(epel, h, 4, 4, hz, mx);
6293 UNI_W_MC(epel, h, 6, 4, hz, mx);
6294 UNI_W_MC(epel, h, 8, 4, hz, mx);
6295 UNI_W_MC(epel, h, 12, 4, hz, mx);
6296 UNI_W_MC(epel, h, 16, 4, hz, mx);
6297 UNI_W_MC(epel, h, 24, 4, hz, mx);
6298 UNI_W_MC(epel, h, 32, 4, hz, mx);
6299 
6300 UNI_W_MC(epel, v, 4, 4, vt, my);
6301 UNI_W_MC(epel, v, 6, 4, vt, my);
6302 UNI_W_MC(epel, v, 8, 4, vt, my);
6303 UNI_W_MC(epel, v, 12, 4, vt, my);
6304 UNI_W_MC(epel, v, 16, 4, vt, my);
6305 UNI_W_MC(epel, v, 24, 4, vt, my);
6306 UNI_W_MC(epel, v, 32, 4, vt, my);
6307 
6308 #undef UNI_W_MC
6309 
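 /* 2-D (hv) wrappers: mx and my each select a filter phase */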
6310 #define UNI_W_MC_HV(PEL, WIDTH, TAP) \
6311 void ff_hevc_put_hevc_uni_w_##PEL##_hv##WIDTH##_8_msa(uint8_t *dst, \
6312  ptrdiff_t dst_stride, \
6313  const uint8_t *src, \
6314  ptrdiff_t src_stride, \
6315  int height, \
6316  int denom, \
6317  int weight, \
6318  int offset, \
6319  intptr_t mx, \
6320  intptr_t my, \
6321  int width) \
6322 { \
6323  const int8_t *filter_x = ff_hevc_##PEL##_filters[mx - 1]; \
6324  const int8_t *filter_y = ff_hevc_##PEL##_filters[my - 1]; \
6325  int shift = denom + 14 - 8; \
6326  \
6327  hevc_hv_uniwgt_##TAP##t_##WIDTH##w_msa(src, src_stride, dst, dst_stride, \
6328  filter_x, filter_y, height, \
6329  weight, offset, shift); \
6330 }
6331 
6332 UNI_W_MC_HV(qpel, 4, 8);
6333 UNI_W_MC_HV(qpel, 8, 8);
6334 UNI_W_MC_HV(qpel, 12, 8);
6335 UNI_W_MC_HV(qpel, 16, 8);
6336 UNI_W_MC_HV(qpel, 24, 8);
6337 UNI_W_MC_HV(qpel, 32, 8);
6338 UNI_W_MC_HV(qpel, 48, 8);
6339 UNI_W_MC_HV(qpel, 64, 8);
6340 
6341 UNI_W_MC_HV(epel, 4, 4);
6342 UNI_W_MC_HV(epel, 6, 4);
6343 UNI_W_MC_HV(epel, 8, 4);
6344 UNI_W_MC_HV(epel, 12, 4);
6345 UNI_W_MC_HV(epel, 16, 4);
6346 UNI_W_MC_HV(epel, 24, 4);
6347 UNI_W_MC_HV(epel, 32, 4);
6348 
6349 #undef UNI_W_MC_HV