static void h263_dct_unquantize_msa(int16_t *block, int16_t qmul,
                                    int16_t qadd, int8_t n_coeffs,
                                    uint8_t loop_start)
{
    int32_t cnt, level;
    int16_t *block_dup = block;
    v8i16 block_vec, qmul_vec, qadd_vec, sub;
    v8i16 add, mask, mul, zero_mask;
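
    /*
     * Vector path: eight coefficients per pass. Sign and zero lane masks
     * are built up front so __msa_bmnz_v can pick the +qadd or -qadd
     * result per lane while leaving zero coefficients untouched.
     * loop_start is 1 for intra blocks (the DC coefficient is scaled
     * separately by the caller) and 0 for inter blocks.
     */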
    qmul_vec = __msa_fill_h(qmul);
    qadd_vec = __msa_fill_h(qadd);
    for (cnt = 0; cnt < (n_coeffs >> 3); cnt++) {
        block_vec = LD_SH(block_dup + loop_start);
        mask = __msa_clti_s_h(block_vec, 0);      /* lanes with level < 0  */
        zero_mask = __msa_ceqi_h(block_vec, 0);   /* lanes with level == 0 */
        mul = block_vec * qmul_vec;
        sub = mul - qadd_vec;                     /* level * qmul - qadd */
        add = mul + qadd_vec;                     /* level * qmul + qadd */
        add = (v8i16) __msa_bmnz_v((v16u8) add, (v16u8) sub, (v16u8) mask);
        block_vec = (v8i16) __msa_bmnz_v((v16u8) add, (v16u8) block_vec,
                                         (v16u8) zero_mask);
        ST_SH(block_vec, block_dup + loop_start);
        block_dup += 8;
    }
    cnt = ((n_coeffs >> 3) * 8) + loop_start;

    /* Scalar tail for the remaining (up to seven) coefficients. */
    for (; cnt <= n_coeffs; cnt++) {
        level = block[cnt];
        if (level) {
            if (level < 0)
                level = level * qmul - qadd;
            else
                level = level * qmul + qadd;
            block[cnt] = level;
        }
    }
}
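
/*
 * Worked example (illustrative): with qscale = 4 the H.263 wrappers below
 * pass qmul = qscale << 1 = 8 and qadd = (qscale - 1) | 1 = 3, so level 2
 * dequantizes to 2 * 8 + 3 = 19, level -2 to -2 * 8 - 3 = -19, and level 0
 * stays 0.
 */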

static int32_t mpeg2_dct_unquantize_inter_msa(int16_t *block,
                                              int32_t qscale,
                                              const int16_t *quant_matrix)
{
    int32_t cnt, sum_res = -1;
    v8i16 block_vec, block_neg, qscale_vec, mask;
    v8i16 block_org0, block_org1, block_org2, block_org3;
    v8i16 quant_m0, quant_m1, quant_m2, quant_m3;
    v8i16 sum, mul, zero_mask;
    v4i32 mul_vec, qscale_l, qscale_r, quant_m_r, quant_m_l;
    v4i32 block_l, block_r, sad;
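
    /*
     * Strategy: two passes of four 8-coefficient rows cover all 64
     * coefficients. Each row is widened to two 4 x 32-bit halves before
     * the qscale * quant_matrix multiply so the products cannot overflow
     * 16 bits, then packed back to halfwords. sum_res starts at -1 and
     * accumulates every output value for the final mismatch control.
     */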
    qscale_vec = __msa_fill_h(qscale);
    for (cnt = 0; cnt < 2; cnt++) {
        LD_SH4(block, 8, block_org0, block_org1, block_org2, block_org3);
        LD_SH4(quant_matrix, 8, quant_m0, quant_m1, quant_m2, quant_m3);
        /* Row 0: dequantize |level| and restore the sign afterwards. */
        mask = __msa_clti_s_h(block_org0, 0);
        zero_mask = __msa_ceqi_h(block_org0, 0);
        block_neg = -block_org0;
        block_vec = (v8i16) __msa_bmnz_v((v16u8) block_org0, (v16u8) block_neg,
                                         (v16u8) mask);
        block_vec <<= 1;                       /* 2 * |level| + 1 */
        block_vec += 1;
        UNPCK_SH_SW(block_vec, block_r, block_l);
        UNPCK_SH_SW(qscale_vec, qscale_r, qscale_l);
        UNPCK_SH_SW(quant_m0, quant_m_r, quant_m_l);
        mul_vec = block_l * qscale_l;
        mul_vec *= quant_m_l;
        block_l = mul_vec >> 4;
        mul_vec = block_r * qscale_r;
        mul_vec *= quant_m_r;
        block_r = mul_vec >> 4;
        mul = __msa_pckev_h((v8i16) block_l, (v8i16) block_r);
        block_neg = -mul;
        sum = (v8i16) __msa_bmnz_v((v16u8) mul, (v16u8) block_neg,
                                   (v16u8) mask);
        sum = (v8i16) __msa_bmnz_v((v16u8) sum, (v16u8) block_org0,
                                   (v16u8) zero_mask);
        ST_SH(sum, block);
        block += 8;
        quant_matrix += 8;
        sad = __msa_hadd_s_w(sum, sum);
        sum_res += HADD_SW_S32(sad);
        /* Row 1: same pattern with quant_m1. */
        mask = __msa_clti_s_h(block_org1, 0);
        zero_mask = __msa_ceqi_h(block_org1, 0);
        block_neg = -block_org1;
        block_vec = (v8i16) __msa_bmnz_v((v16u8) block_org1, (v16u8) block_neg,
                                         (v16u8) mask);
        block_vec <<= 1;
        block_vec += 1;
        UNPCK_SH_SW(block_vec, block_r, block_l);
        UNPCK_SH_SW(qscale_vec, qscale_r, qscale_l);
        UNPCK_SH_SW(quant_m1, quant_m_r, quant_m_l);
        mul_vec = block_l * qscale_l;
        mul_vec *= quant_m_l;
        block_l = mul_vec >> 4;
        mul_vec = block_r * qscale_r;
        mul_vec *= quant_m_r;
        block_r = mul_vec >> 4;
        mul = __msa_pckev_h((v8i16) block_l, (v8i16) block_r);
        block_neg = -mul;
        sum = (v8i16) __msa_bmnz_v((v16u8) mul, (v16u8) block_neg,
                                   (v16u8) mask);
        sum = (v8i16) __msa_bmnz_v((v16u8) sum, (v16u8) block_org1,
                                   (v16u8) zero_mask);
        ST_SH(sum, block);
        block += 8;
        quant_matrix += 8;
        sad = __msa_hadd_s_w(sum, sum);
        sum_res += HADD_SW_S32(sad);
        /* Row 2: same pattern with quant_m2. */
        mask = __msa_clti_s_h(block_org2, 0);
        zero_mask = __msa_ceqi_h(block_org2, 0);
        block_neg = -block_org2;
        block_vec = (v8i16) __msa_bmnz_v((v16u8) block_org2, (v16u8) block_neg,
                                         (v16u8) mask);
        block_vec <<= 1;
        block_vec += 1;
        UNPCK_SH_SW(block_vec, block_r, block_l);
        UNPCK_SH_SW(qscale_vec, qscale_r, qscale_l);
        UNPCK_SH_SW(quant_m2, quant_m_r, quant_m_l);
        mul_vec = block_l * qscale_l;
        mul_vec *= quant_m_l;
        block_l = mul_vec >> 4;
        mul_vec = block_r * qscale_r;
        mul_vec *= quant_m_r;
        block_r = mul_vec >> 4;
        mul = __msa_pckev_h((v8i16) block_l, (v8i16) block_r);
        block_neg = -mul;
        sum = (v8i16) __msa_bmnz_v((v16u8) mul, (v16u8) block_neg,
                                   (v16u8) mask);
        sum = (v8i16) __msa_bmnz_v((v16u8) sum, (v16u8) block_org2,
                                   (v16u8) zero_mask);
        ST_SH(sum, block);
        block += 8;
        quant_matrix += 8;
        sad = __msa_hadd_s_w(sum, sum);
        sum_res += HADD_SW_S32(sad);
        /* Row 3: same pattern with quant_m3. */
        mask = __msa_clti_s_h(block_org3, 0);
        zero_mask = __msa_ceqi_h(block_org3, 0);
        block_neg = -block_org3;
        block_vec = (v8i16) __msa_bmnz_v((v16u8) block_org3, (v16u8) block_neg,
                                         (v16u8) mask);
        block_vec <<= 1;
        block_vec += 1;
        UNPCK_SH_SW(block_vec, block_r, block_l);
        UNPCK_SH_SW(qscale_vec, qscale_r, qscale_l);
        UNPCK_SH_SW(quant_m3, quant_m_r, quant_m_l);
        mul_vec = block_l * qscale_l;
        mul_vec *= quant_m_l;
        block_l = mul_vec >> 4;
        mul_vec = block_r * qscale_r;
        mul_vec *= quant_m_r;
        block_r = mul_vec >> 4;
        mul = __msa_pckev_h((v8i16) block_l, (v8i16) block_r);
        block_neg = -mul;
        sum = (v8i16) __msa_bmnz_v((v16u8) mul, (v16u8) block_neg,
                                   (v16u8) mask);
        sum = (v8i16) __msa_bmnz_v((v16u8) sum, (v16u8) block_org3,
                                   (v16u8) zero_mask);
        ST_SH(sum, block);
        block += 8;
        quant_matrix += 8;
        sad = __msa_hadd_s_w(sum, sum);
        sum_res += HADD_SW_S32(sad);
    }

    return sum_res;
}
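
/*
 * Reference sketch (illustrative, not part of the original file): the
 * scalar MPEG-2 inter rule that each unrolled row above vectorizes. The
 * helper name is an assumption; it assumes <stdint.h> is available.
 */
static inline int32_t mpeg2_inter_dequant_one(int32_t level, int32_t qscale,
                                              int32_t matrix)
{
    int32_t mag;

    if (level == 0)
        return 0;                             /* zero lanes pass through */
    mag = level < 0 ? -level : level;
    mag = ((2 * mag + 1) * qscale * matrix) >> 4;   /* same >> 4 as above */
    return level < 0 ? -mag : mag;            /* restore the sign */
}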

void ff_dct_unquantize_h263_intra_msa(MpegEncContext *s,
                                      int16_t *block, int32_t index,
                                      int32_t qscale)
{
    int32_t qmul, qadd;
    int32_t nCoeffs;

    qmul = qscale << 1;

    if (!s->h263_aic) {
        block[0] *= index < 4 ? s->y_dc_scale : s->c_dc_scale;
        qadd = (qscale - 1) | 1;    /* odd rounding offset */
    } else {
        qadd = 0;
    }
    if (s->ac_pred)
        nCoeffs = 63;
    else
        nCoeffs = s->inter_scantable.raster_end[s->block_last_index[index]];

    /* loop_start = 1: the intra DC coefficient was already scaled above. */
    h263_dct_unquantize_msa(block, qmul, qadd, nCoeffs, 1);
}

void ff_dct_unquantize_h263_inter_msa(MpegEncContext *s,
                                      int16_t *block, int32_t index,
                                      int32_t qscale)
{
    int32_t qmul, qadd;
    int32_t nCoeffs;

    qadd = (qscale - 1) | 1;    /* odd rounding offset */
    qmul = qscale << 1;
    nCoeffs = s->inter_scantable.raster_end[s->block_last_index[index]];

    /* loop_start = 0: inter blocks have no separately coded DC term. */
    h263_dct_unquantize_msa(block, qmul, qadd, nCoeffs, 0);
}

void ff_dct_unquantize_mpeg2_inter_msa(MpegEncContext *s,
                                       int16_t *block, int32_t index,
                                       int32_t qscale)
{
    const uint16_t *quant_matrix;
    int32_t sum;

    quant_matrix = s->inter_matrix;

    sum = mpeg2_dct_unquantize_inter_msa(block, qscale,
                                         (const int16_t *) quant_matrix);

    /* MPEG-2 mismatch control: force the coefficient sum to be odd. */
    block[63] ^= sum & 1;
}
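
/*
 * Usage sketch (illustrative): these entry points are installed as the
 * MpegEncContext dequantize hooks during MIPS-specific init, roughly:
 *
 *     s->dct_unquantize_h263_intra  = ff_dct_unquantize_h263_intra_msa;
 *     s->dct_unquantize_h263_inter  = ff_dct_unquantize_h263_inter_msa;
 *     s->dct_unquantize_mpeg2_inter = ff_dct_unquantize_mpeg2_inter_msa;
 *
 * Field names follow the generic MpegEncContext function-pointer table;
 * the exact init function is not part of this excerpt.
 */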