FFmpeg
sao.c
Go to the documentation of this file.
1 /*
2  * Copyright (c) 2025 Zhao Zhili
3  *
4  * This file is part of FFmpeg.
5  *
6  * FFmpeg is free software; you can redistribute it and/or
7  * modify it under the terms of the GNU Lesser General Public
8  * License as published by the Free Software Foundation; either
9  * version 2.1 of the License, or (at your option) any later version.
10  *
11  * FFmpeg is distributed in the hope that it will be useful,
12  * but WITHOUT ANY WARRANTY; without even the implied warranty of
13  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
14  * Lesser General Public License for more details.
15  *
16  * You should have received a copy of the GNU Lesser General Public
17  * License along with FFmpeg; if not, write to the Free Software
18  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
19  */
20 
21 #include "sao.h"
22 
23 #include <wasm_simd128.h>
24 
25 #include "libavcodec/defs.h"
26 
27 #define HEVC_MAX_PB_SIZE 64
28 
29 void ff_hevc_sao_band_filter_8x8_8_simd128(uint8_t *dst, const uint8_t *src,
30  ptrdiff_t stride_dst,
31  ptrdiff_t stride_src,
32  const int16_t *sao_offset_val,
33  int sao_left_class, int width,
34  int height)
35 {
36  int8_t offset_table[32] = {0};
37  v128_t offset_low, offset_high;
38 
39  for (int k = 0; k < 4; k++)
40  offset_table[(k + sao_left_class) & 31] = (int8_t)sao_offset_val[k + 1];
41 
42  offset_low = wasm_v128_load(offset_table);
43  offset_high = wasm_v128_load(&offset_table[16]);
44 
45  for (int y = height; y > 0; y -= 2) {
46  v128_t src_v, src_high;
47  v128_t v0, v1;
48 
49  src_v = wasm_v128_load64_zero(src);
50  src += stride_src;
51  src_v = wasm_v128_load64_lane(src, src_v, 1);
52  src += stride_src;
53 
54  v0 = wasm_u8x16_shr(src_v, 3);
55  v1 = wasm_i8x16_sub(v0, wasm_i8x16_const_splat(16));
56  v0 = wasm_i8x16_swizzle(offset_low, v0);
57  v1 = wasm_i8x16_swizzle(offset_high, v1);
58  v0 = wasm_v128_or(v0, v1);
59  src_high = wasm_u16x8_extend_high_u8x16(src_v);
60  v1 = wasm_i16x8_extend_high_i8x16(v0);
61  src_v = wasm_u16x8_extend_low_u8x16(src_v);
62  v0 = wasm_i16x8_extend_low_i8x16(v0);
63 
64  v0 = wasm_i16x8_add_sat(src_v, v0);
65  v1 = wasm_i16x8_add_sat(src_high, v1);
66  v0 = wasm_u8x16_narrow_i16x8(v0, v1);
67 
68  wasm_v128_store64_lane(dst, v0, 0);
69  dst += stride_dst;
70  wasm_v128_store64_lane(dst, v0, 1);
71  dst += stride_dst;
72  }
73 }
74 
75 void ff_hevc_sao_band_filter_16x16_8_simd128(uint8_t *dst, const uint8_t *src,
76  ptrdiff_t stride_dst,
77  ptrdiff_t stride_src,
78  const int16_t *sao_offset_val,
79  int sao_left_class, int width,
80  int height)
81 {
82  int8_t offset_table[32] = {0};
83  v128_t offset_low, offset_high;
84 
85  for (int k = 0; k < 4; k++)
86  offset_table[(k + sao_left_class) & 31] = (int8_t)sao_offset_val[k + 1];
87 
88  offset_low = wasm_v128_load(offset_table);
89  offset_high = wasm_v128_load(&offset_table[16]);
90 
91  for (int y = height; y > 0; y--) {
92  for (int x = 0; x < width; x += 16) {
93  v128_t src_v, src_high;
94  v128_t v0, v1;
95 
96  src_v = wasm_v128_load(&src[x]);
97 
98  v0 = wasm_u8x16_shr(src_v, 3);
99  v1 = wasm_i8x16_sub(v0, wasm_i8x16_const_splat(16));
100  v0 = wasm_i8x16_swizzle(offset_low, v0);
101  v1 = wasm_i8x16_swizzle(offset_high, v1);
102  v0 = wasm_v128_or(v0, v1);
103  src_high = wasm_u16x8_extend_high_u8x16(src_v);
104  v1 = wasm_i16x8_extend_high_i8x16(v0);
105  src_v = wasm_u16x8_extend_low_u8x16(src_v);
106  v0 = wasm_i16x8_extend_low_i8x16(v0);
107 
108  v0 = wasm_i16x8_add_sat(src_v, v0);
109  v1 = wasm_i16x8_add_sat(src_high, v1);
110  v0 = wasm_u8x16_narrow_i16x8(v0, v1);
111  wasm_v128_store(&dst[x], v0);
112  }
113 
114  dst += stride_dst;
115  src += stride_src;
116  }
117 }
118 
119 void ff_hevc_sao_edge_filter_8x8_8_simd128(uint8_t *dst, const uint8_t *src,
120  ptrdiff_t stride_dst,
121  const int16_t *sao_offset_val,
122  int eo, int width, int height)
123 {
124  static const int8_t pos[4][2][2] = {
125  { { -1, 0 }, { 1, 0 } }, // horizontal
126  { { 0, -1 }, { 0, 1 } }, // vertical
127  { { -1, -1 }, { 1, 1 } }, // 45 degree
128  { { 1, -1 }, { -1, 1 } }, // 135 degree
129  };
130  int a_stride, b_stride;
131  ptrdiff_t stride_src = (2 * HEVC_MAX_PB_SIZE + AV_INPUT_BUFFER_PADDING_SIZE);
132  const v128_t edge_idx = wasm_u8x16_make(1, 2, 0, 3,
133  4, 0, 0, 0,
134  0, 0, 0, 0,
135  0, 0, 0, 0);
136  v128_t sao_offset = wasm_v128_load(sao_offset_val);
137  v128_t one = wasm_i8x16_const_splat(1);
138  v128_t two = wasm_i8x16_const_splat(2);
139 
140  a_stride = pos[eo][0][0] + pos[eo][0][1] * stride_src;
141  b_stride = pos[eo][1][0] + pos[eo][1][1] * stride_src;
142  for (int y = height; y > 0; y -= 2) {
143  v128_t v0, v1, v2;
144  v128_t diff0, diff1;
145 
146  v0 = wasm_v128_load64_zero(src);
147  v1 = wasm_v128_load64_zero(src + a_stride);
148  v2 = wasm_v128_load64_zero(src + b_stride);
149  src += stride_src;
150  v0 = wasm_v128_load64_lane(src, v0, 1);
151  v1 = wasm_v128_load64_lane(src + a_stride, v1, 1);
152  v2 = wasm_v128_load64_lane(src + b_stride, v2, 1);
153  src += stride_src;
154 
155  diff0 = wasm_u8x16_gt(v0, v1);
156  v1 = wasm_u8x16_lt(v0, v1);
157  diff0 = wasm_i8x16_sub(v1, diff0);
158 
159  diff1 = wasm_u8x16_gt(v0, v2);
160  v2 = wasm_u8x16_lt(v0, v2);
161  diff1 = wasm_i8x16_sub(v2, diff1);
162 
163  v1 = wasm_i8x16_add(diff0, two);
164  v1 = wasm_i8x16_add(v1, diff1);
165 
166  v2 = wasm_i8x16_swizzle(edge_idx, v1); // offset_val
167  v1 = wasm_i8x16_shl(v2, 1); // Access int16_t
168  v2 = wasm_i8x16_add(v1, one); // Access upper half of int16_t
169  diff0 = wasm_i8x16_shuffle(v1, v2, 0, 16, 1, 17, 2, 18, 3, 19, 4,
170  20, 5, 21, 6, 22, 7, 23);
171  diff1 = wasm_i8x16_shuffle(v1, v2, 8, 24, 9, 25, 10, 26, 11, 27,
172  12, 28, 13, 29, 14, 30, 15, 31);
173  v1 = wasm_u16x8_extend_high_u8x16(v0);
174  v0 = wasm_u16x8_extend_low_u8x16(v0);
175  diff0 = wasm_i8x16_swizzle(sao_offset, diff0);
176  diff1 = wasm_i8x16_swizzle(sao_offset, diff1);
177 
178  v0 = wasm_i16x8_add_sat(v0, diff0);
179  v1 = wasm_i16x8_add_sat(v1, diff1);
180  v0 = wasm_u8x16_narrow_i16x8(v0, v1);
181 
182  wasm_v128_store64_lane(dst, v0, 0);
183  dst += stride_dst;
184  wasm_v128_store64_lane(dst, v0, 1);
185  dst += stride_dst;
186  }
187 }
188 
189 void ff_hevc_sao_edge_filter_16x16_8_simd128(uint8_t *dst, const uint8_t *src,
190  ptrdiff_t stride_dst,
191  const int16_t *sao_offset_val,
192  int eo, int width, int height)
193 {
194  static const int8_t pos[4][2][2] = {
195  { { -1, 0 }, { 1, 0 } }, // horizontal
196  { { 0, -1 }, { 0, 1 } }, // vertical
197  { { -1, -1 }, { 1, 1 } }, // 45 degree
198  { { 1, -1 }, { -1, 1 } }, // 135 degree
199  };
200  int a_stride, b_stride;
201  ptrdiff_t stride_src = (2 * HEVC_MAX_PB_SIZE + AV_INPUT_BUFFER_PADDING_SIZE);
202  const v128_t edge_idx = wasm_u8x16_make(1, 2, 0, 3,
203  4, 0, 0, 0,
204  0, 0, 0, 0,
205  0, 0, 0, 0);
206  v128_t sao_offset = wasm_v128_load(sao_offset_val);
207  v128_t one = wasm_i8x16_const_splat(1);
208  v128_t two = wasm_i8x16_const_splat(2);
209 
210  a_stride = pos[eo][0][0] + pos[eo][0][1] * stride_src;
211  b_stride = pos[eo][1][0] + pos[eo][1][1] * stride_src;
212  for (int y = height; y > 0; y--) {
213  for (int x = 0; x < width; x += 16) {
214  v128_t v0, v1, v2;
215  v128_t diff0, diff1;
216 
217  v0 = wasm_v128_load(&src[x]);
218  v1 = wasm_v128_load(&src[x + a_stride]);
219  v2 = wasm_v128_load(&src[x + b_stride]);
220 
221  diff0 = wasm_u8x16_gt(v0, v1);
222  v1 = wasm_u8x16_lt(v0, v1);
223  diff0 = wasm_i8x16_sub(v1, diff0);
224 
225  diff1 = wasm_u8x16_gt(v0, v2);
226  v2 = wasm_u8x16_lt(v0, v2);
227  diff1 = wasm_i8x16_sub(v2, diff1);
228 
229  v1 = wasm_i8x16_add(diff0, two);
230  v1 = wasm_i8x16_add(v1, diff1);
231 
232  v2 = wasm_i8x16_swizzle(edge_idx, v1); // offset_val
233  v1 = wasm_i8x16_shl(v2, 1); // Access int16_t
234  v2 = wasm_i8x16_add(v1, one); // Access upper half of int16_t
235  diff0 = wasm_i8x16_shuffle(v1, v2, 0, 16, 1, 17, 2, 18, 3, 19, 4,
236  20, 5, 21, 6, 22, 7, 23);
237  diff1 = wasm_i8x16_shuffle(v1, v2, 8, 24, 9, 25, 10, 26, 11, 27,
238  12, 28, 13, 29, 14, 30, 15, 31);
239  v1 = wasm_u16x8_extend_high_u8x16(v0);
240  v0 = wasm_u16x8_extend_low_u8x16(v0);
241  diff0 = wasm_i8x16_swizzle(sao_offset, diff0);
242  diff1 = wasm_i8x16_swizzle(sao_offset, diff1);
243 
244  v0 = wasm_i16x8_add_sat(v0, diff0);
245  v1 = wasm_i16x8_add_sat(v1, diff1);
246  v0 = wasm_u8x16_narrow_i16x8(v0, v1);
247  wasm_v128_store(&dst[x], v0);
248  }
249 
250  src += stride_src;
251  dst += stride_dst;
252  }
253 }
254 
HEVC_MAX_PB_SIZE
#define HEVC_MAX_PB_SIZE
Definition: sao.c:27
ff_hevc_sao_edge_filter_8x8_8_simd128
void ff_hevc_sao_edge_filter_8x8_8_simd128(uint8_t *dst, const uint8_t *src, ptrdiff_t stride_dst, const int16_t *sao_offset_val, int eo, int width, int height)
Definition: sao.c:119
height
#define height
Definition: dsp.h:89
dst
uint8_t ptrdiff_t const uint8_t ptrdiff_t int intptr_t intptr_t int int16_t * dst
Definition: dsp.h:87
offset_table
static const uint8_t offset_table[]
Definition: escape130.c:42
ff_hevc_sao_band_filter_8x8_8_simd128
void ff_hevc_sao_band_filter_8x8_8_simd128(uint8_t *dst, const uint8_t *src, ptrdiff_t stride_dst, ptrdiff_t stride_src, const int16_t *sao_offset_val, int sao_left_class, int width, int height)
Definition: sao.c:29
pos
unsigned int pos
Definition: spdifenc.c:414
AV_INPUT_BUFFER_PADDING_SIZE
#define AV_INPUT_BUFFER_PADDING_SIZE
Definition: defs.h:40
defs.h
ff_hevc_sao_edge_filter_16x16_8_simd128
void ff_hevc_sao_edge_filter_16x16_8_simd128(uint8_t *dst, const uint8_t *src, ptrdiff_t stride_dst, const int16_t *sao_offset_val, int eo, int width, int height)
Definition: sao.c:189
width
#define width
Definition: dsp.h:89
sao.h
ff_hevc_sao_band_filter_16x16_8_simd128
void ff_hevc_sao_band_filter_16x16_8_simd128(uint8_t *dst, const uint8_t *src, ptrdiff_t stride_dst, ptrdiff_t stride_src, const int16_t *sao_offset_val, int sao_left_class, int width, int height)
Definition: sao.c:75
src
#define src
Definition: vp8dsp.c:248