23 #include <wasm_simd128.h>
27 #define HEVC_MAX_PB_SIZE 64
/*
 * HEVC Sample Adaptive Offset (SAO) band filter, narrow variant:
 * 8 pixels per row, two rows folded into one 128-bit vector per
 * iteration, using WebAssembly SIMD128 intrinsics.
 *
 * NOTE(review): this fragment is incomplete — the opening of the
 * function (name, dst/src/stride parameters), the declarations of
 * offset_table/v0/v1, the construction of offset_low/offset_high from
 * offset_table, the per-row src/dst pointer advances and the closing
 * braces are not visible here, and each surviving line still carries a
 * stray number from a broken paste. Code left byte-identical; restore
 * the missing lines from the original file before compiling.
 */
32 const int16_t *sao_offset_val,
33 int sao_left_class,
int width,
37 v128_t offset_low, offset_high;
/* Scatter the four signed SAO offsets (sao_offset_val[1..4]) into a
 * 32-entry band table at positions (sao_left_class + k) mod 32; all
 * other bands keep offset 0. */
39 for (
int k = 0; k < 4; k++)
40 offset_table[(k + sao_left_class) & 31] = (int8_t)sao_offset_val[k + 1];
/* Two 8-pixel rows are processed per iteration. */
45 for (
int y =
height; y > 0; y -= 2) {
46 v128_t src_v, src_high;
/* Row 0 into the low 64 bits, row 1 into the high 64 bits.
 * NOTE(review): src presumably advances by the row stride between the
 * two loads — that line is missing from this fragment; confirm against
 * the original. */
49 src_v = wasm_v128_load64_zero(
src);
51 src_v = wasm_v128_load64_lane(
src, src_v, 1);
/* Band index of each pixel: top 5 bits of the 8-bit sample. */
54 v0 = wasm_u8x16_shr(src_v, 3);
/* 32-entry table lookup split across two 16-lane swizzles: offset_low
 * serves bands 0-15, offset_high bands 16-31 (indices rebased by -16).
 * Out-of-range swizzle lanes yield 0, so OR merges the two halves. */
55 v1 = wasm_i8x16_sub(v0, wasm_i8x16_const_splat(16));
56 v0 = wasm_i8x16_swizzle(offset_low, v0);
57 v1 = wasm_i8x16_swizzle(offset_high, v1);
58 v0 = wasm_v128_or(v0, v1);
/* Widen pixels (unsigned) and offsets (signed) to 16 bits, add with
 * saturation, then narrow back with unsigned saturation so results are
 * clamped into 0..255. */
59 src_high = wasm_u16x8_extend_high_u8x16(src_v);
60 v1 = wasm_i16x8_extend_high_i8x16(v0);
61 src_v = wasm_u16x8_extend_low_u8x16(src_v);
62 v0 = wasm_i16x8_extend_low_i8x16(v0);
64 v0 = wasm_i16x8_add_sat(src_v, v0);
65 v1 = wasm_i16x8_add_sat(src_high, v1);
66 v0 = wasm_u8x16_narrow_i16x8(v0, v1);
/* Store the two filtered 8-byte rows.
 * NOTE(review): the dst advance between the two stores is also missing
 * from this fragment. */
68 wasm_v128_store64_lane(
dst, v0, 0);
70 wasm_v128_store64_lane(
dst, v0, 1);
/*
 * HEVC SAO band filter, wide variant: rows processed 16 pixels at a
 * time with WebAssembly SIMD128 intrinsics. Same band-offset algorithm
 * as the narrow variant in this file.
 *
 * NOTE(review): incomplete fragment — the function opening (name,
 * dst/src/stride parameters), the offset_table/v0/v1 declarations, the
 * offset_low/offset_high setup, the per-row pointer advances and the
 * closing braces are missing, and lines carry stray pasted numbers.
 * Code left byte-identical.
 */
78 const int16_t *sao_offset_val,
79 int sao_left_class,
int width,
83 v128_t offset_low, offset_high;
/* Build the 32-entry band-offset table: the four signed offsets land
 * at (sao_left_class + k) mod 32, every other band stays 0. */
85 for (
int k = 0; k < 4; k++)
86 offset_table[(k + sao_left_class) & 31] = (int8_t)sao_offset_val[k + 1];
91 for (
int y =
height; y > 0; y--) {
/* 16 pixels per step; assumes width is a multiple of 16 — TODO confirm
 * against the callers. */
92 for (
int x = 0; x <
width; x += 16) {
93 v128_t src_v, src_high;
96 src_v = wasm_v128_load(&
src[x]);
/* Band index = sample >> 3; lookup split across offset_low (bands
 * 0-15) and offset_high (bands 16-31, index rebased by -16);
 * out-of-range swizzle lanes give 0, so OR combines the halves. */
98 v0 = wasm_u8x16_shr(src_v, 3);
99 v1 = wasm_i8x16_sub(v0, wasm_i8x16_const_splat(16));
100 v0 = wasm_i8x16_swizzle(offset_low, v0);
101 v1 = wasm_i8x16_swizzle(offset_high, v1);
102 v0 = wasm_v128_or(v0, v1);
/* Widen to 16 bits, saturating add, then unsigned-saturating narrow
 * clamps the filtered samples back to 0..255. */
103 src_high = wasm_u16x8_extend_high_u8x16(src_v);
104 v1 = wasm_i16x8_extend_high_i8x16(v0);
105 src_v = wasm_u16x8_extend_low_u8x16(src_v);
106 v0 = wasm_i16x8_extend_low_i8x16(v0);
108 v0 = wasm_i16x8_add_sat(src_v, v0);
109 v1 = wasm_i16x8_add_sat(src_high, v1);
110 v0 = wasm_u8x16_narrow_i16x8(v0, v1);
111 wasm_v128_store(&
dst[x], v0);
/*
 * HEVC SAO edge filter, narrow variant: 8 pixels per row, two rows per
 * iteration, WebAssembly SIMD128 intrinsics.
 *
 * NOTE(review): incomplete fragment — the function opening (name,
 * dst/src/stride_src/eo/width/height parameters), the
 * v0/v1/v2/diff0/diff1 declarations, the tail of the edge_idx
 * constant, the per-iteration src/dst advances and the closing braces
 * are missing, and each line carries a stray pasted number. Code left
 * byte-identical.
 */
120 ptrdiff_t stride_dst,
121 const int16_t *sao_offset_val,
/* {x, y} offsets of the two neighbour samples (a, b) for each of the
 * four edge-offset classes: horizontal, vertical, 135-degree and
 * 45-degree diagonals. */
124 static const int8_t
pos[4][2][2] = {
125 { { -1, 0 }, { 1, 0 } },
126 { { 0, -1 }, { 0, 1 } },
127 { { -1, -1 }, { 1, 1 } },
128 { { 1, -1 }, { -1, 1 } },
130 int a_stride, b_stride;
/* Remaps the sign-based category (sign(c-a) + sign(c-b) + 2, range
 * 0..4) to SAO edgeIdx order; the constant is truncated in this
 * fragment but starts {1, 2, 0, 3, ...}. */
132 const v128_t edge_idx = wasm_u8x16_make(1, 2, 0, 3,
136 v128_t sao_offset = wasm_v128_load(sao_offset_val);
137 v128_t one = wasm_i8x16_const_splat(1);
138 v128_t two = wasm_i8x16_const_splat(2);
/* Byte offsets of the two neighbours relative to the current sample. */
140 a_stride =
pos[eo][0][0] +
pos[eo][0][1] * stride_src;
141 b_stride =
pos[eo][1][0] +
pos[eo][1][1] * stride_src;
142 for (
int y =
height; y > 0; y -= 2) {
/* Row 0 in the low 64 bits, row 1 in the high 64 bits, for the centre
 * sample and both neighbours.
 * NOTE(review): the src advance between the _zero and _lane loads is
 * missing from this fragment. */
146 v0 = wasm_v128_load64_zero(
src);
147 v1 = wasm_v128_load64_zero(
src + a_stride);
148 v2 = wasm_v128_load64_zero(
src + b_stride);
150 v0 = wasm_v128_load64_lane(
src, v0, 1);
151 v1 = wasm_v128_load64_lane(
src + a_stride, v1, 1);
152 v2 = wasm_v128_load64_lane(
src + b_stride, v2, 1);
/* Per-lane sign(c - neighbour): the unsigned gt/lt compares give
 * all-ones (-1) masks, and (lt - gt) folds them into -1/0/+1. */
155 diff0 = wasm_u8x16_gt(v0, v1);
156 v1 = wasm_u8x16_lt(v0, v1);
157 diff0 = wasm_i8x16_sub(v1, diff0);
159 diff1 = wasm_u8x16_gt(v0, v2);
160 v2 = wasm_u8x16_lt(v0, v2);
161 diff1 = wasm_i8x16_sub(v2, diff1);
/* Category = sign(c-a) + sign(c-b) + 2, in 0..4. */
163 v1 = wasm_i8x16_add(diff0, two);
164 v1 = wasm_i8x16_add(v1, diff1);
/* Translate category to edgeIdx, then to byte indices 2*idx and
 * 2*idx+1 of the int16 entries of sao_offset; the two shuffles
 * interleave those byte indices so each swizzle below gathers complete
 * little-endian 16-bit offsets for the low/high pixel halves. */
166 v2 = wasm_i8x16_swizzle(edge_idx, v1);
167 v1 = wasm_i8x16_shl(v2, 1);
168 v2 = wasm_i8x16_add(v1, one);
169 diff0 = wasm_i8x16_shuffle(v1, v2, 0, 16, 1, 17, 2, 18, 3, 19, 4,
170 20, 5, 21, 6, 22, 7, 23);
171 diff1 = wasm_i8x16_shuffle(v1, v2, 8, 24, 9, 25, 10, 26, 11, 27,
172 12, 28, 13, 29, 14, 30, 15, 31);
/* Widen the centre pixels to 16 bits, gather the 16-bit offsets, add
 * with saturation and narrow back with unsigned saturation (clamp to
 * 0..255). */
173 v1 = wasm_u16x8_extend_high_u8x16(v0);
174 v0 = wasm_u16x8_extend_low_u8x16(v0);
175 diff0 = wasm_i8x16_swizzle(sao_offset, diff0);
176 diff1 = wasm_i8x16_swizzle(sao_offset, diff1);
178 v0 = wasm_i16x8_add_sat(v0, diff0);
179 v1 = wasm_i16x8_add_sat(v1, diff1);
180 v0 = wasm_u8x16_narrow_i16x8(v0, v1);
/* Write back both 8-byte rows.
 * NOTE(review): the dst advance between the two stores is missing
 * here. */
182 wasm_v128_store64_lane(
dst, v0, 0);
184 wasm_v128_store64_lane(
dst, v0, 1);
/*
 * HEVC SAO edge filter, wide variant: rows processed 16 pixels at a
 * time, WebAssembly SIMD128 intrinsics. Same edge-offset algorithm as
 * the narrow variant in this file.
 *
 * NOTE(review): incomplete fragment — the function opening (name,
 * dst/src/stride_src/eo parameters), the v0/v1/v2/diff0/diff1
 * declarations, the tail of the edge_idx constant, the per-row pointer
 * advances and the closing braces are missing; lines carry stray
 * pasted numbers. Code left byte-identical.
 */
190 ptrdiff_t stride_dst,
191 const int16_t *sao_offset_val,
/* Neighbour {x, y} offset pairs per edge-offset class: horizontal,
 * vertical, 135-degree and 45-degree diagonals. */
194 static const int8_t
pos[4][2][2] = {
195 { { -1, 0 }, { 1, 0 } },
196 { { 0, -1 }, { 0, 1 } },
197 { { -1, -1 }, { 1, 1 } },
198 { { 1, -1 }, { -1, 1 } },
200 int a_stride, b_stride;
/* Maps the sign-based category (0..4) to SAO edgeIdx; the constant is
 * truncated in this fragment but starts {1, 2, 0, 3, ...}. */
202 const v128_t edge_idx = wasm_u8x16_make(1, 2, 0, 3,
206 v128_t sao_offset = wasm_v128_load(sao_offset_val);
207 v128_t one = wasm_i8x16_const_splat(1);
208 v128_t two = wasm_i8x16_const_splat(2);
/* Byte offsets of the two neighbour samples. */
210 a_stride =
pos[eo][0][0] +
pos[eo][0][1] * stride_src;
211 b_stride =
pos[eo][1][0] +
pos[eo][1][1] * stride_src;
212 for (
int y =
height; y > 0; y--) {
/* 16 pixels per step; assumes width is a multiple of 16 — TODO
 * confirm against the callers. */
213 for (
int x = 0; x <
width; x += 16) {
217 v0 = wasm_v128_load(&
src[x]);
218 v1 = wasm_v128_load(&
src[x + a_stride]);
219 v2 = wasm_v128_load(&
src[x + b_stride]);
/* Per-lane sign(c - neighbour): unsigned gt/lt give all-ones (-1)
 * masks, and (lt - gt) folds them into -1/0/+1. */
221 diff0 = wasm_u8x16_gt(v0, v1);
222 v1 = wasm_u8x16_lt(v0, v1);
223 diff0 = wasm_i8x16_sub(v1, diff0);
225 diff1 = wasm_u8x16_gt(v0, v2);
226 v2 = wasm_u8x16_lt(v0, v2);
227 diff1 = wasm_i8x16_sub(v2, diff1);
/* Category = sign(c-a) + sign(c-b) + 2, in 0..4. */
229 v1 = wasm_i8x16_add(diff0, two);
230 v1 = wasm_i8x16_add(v1, diff1);
/* Category -> edgeIdx -> byte indices 2*idx and 2*idx+1 into the
 * int16 sao_offset entries; the shuffles interleave them so each
 * swizzle below gathers complete little-endian 16-bit offsets. */
232 v2 = wasm_i8x16_swizzle(edge_idx, v1);
233 v1 = wasm_i8x16_shl(v2, 1);
234 v2 = wasm_i8x16_add(v1, one);
235 diff0 = wasm_i8x16_shuffle(v1, v2, 0, 16, 1, 17, 2, 18, 3, 19, 4,
236 20, 5, 21, 6, 22, 7, 23);
237 diff1 = wasm_i8x16_shuffle(v1, v2, 8, 24, 9, 25, 10, 26, 11, 27,
238 12, 28, 13, 29, 14, 30, 15, 31);
/* Widen pixels, gather offsets, saturating add, then unsigned-
 * saturating narrow clamps the results to 0..255. */
239 v1 = wasm_u16x8_extend_high_u8x16(v0);
240 v0 = wasm_u16x8_extend_low_u8x16(v0);
241 diff0 = wasm_i8x16_swizzle(sao_offset, diff0);
242 diff1 = wasm_i8x16_swizzle(sao_offset, diff1);
244 v0 = wasm_i16x8_add_sat(v0, diff0);
245 v1 = wasm_i16x8_add_sat(v1, diff1);
246 v0 = wasm_u8x16_narrow_i16x8(v0, v1);
247 wasm_v128_store(&
dst[x], v0);