FFmpeg
pixblockdsp.c
Go to the documentation of this file.
1 /*
2  * Copyright (c) 2002 Brian Foley
3  * Copyright (c) 2002 Dieter Shirley
4  * Copyright (c) 2003-2004 Romain Dolbeau <romain@dolbeau.org>
5  *
6  * This file is part of FFmpeg.
7  *
8  * FFmpeg is free software; you can redistribute it and/or
9  * modify it under the terms of the GNU Lesser General Public
10  * License as published by the Free Software Foundation; either
11  * version 2.1 of the License, or (at your option) any later version.
12  *
13  * FFmpeg is distributed in the hope that it will be useful,
14  * but WITHOUT ANY WARRANTY; without even the implied warranty of
15  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
16  * Lesser General Public License for more details.
17  *
18  * You should have received a copy of the GNU Lesser General Public
19  * License along with FFmpeg; if not, write to the Free Software
20  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
21  */
22 
23 #include "config.h"
24 
25 #include "libavutil/attributes.h"
26 #include "libavutil/cpu.h"
27 #include "libavutil/ppc/cpu.h"
29 
30 #include "libavcodec/pixblockdsp.h"
31 
32 #if HAVE_ALTIVEC
33 
34 #if HAVE_VSX
/**
 * Copy an 8x8 block of 8-bit pixels into 64 consecutive 16-bit values.
 *
 * This is the variant built when HAVE_VSX is set; it uses VSX loads and
 * stores.
 *
 * @param block  destination; 8 rows of 8 int16_t each, written back to back
 * @param pixels first source pixel of the first row
 * @param stride distance in bytes between the starts of two source rows
 */
static void get_pixels_altivec(int16_t *restrict block, const uint8_t *pixels,
                               ptrdiff_t stride)
{
    /* vec_perm selector: indices 0x00..0x07 pick bytes 0..7 of the first
     * operand, 0x10..0x17 pick bytes 0..7 of the second one, so the result
     * alternates "source byte, zero byte". */
    const vector unsigned char interleave =
        (vector unsigned char) { 0x00, 0x10, 0x01, 0x11, 0x02, 0x12, 0x03, 0x13,
                                 0x04, 0x14, 0x05, 0x15, 0x06, 0x16, 0x07, 0x17 };
    const vector unsigned char zero =
        (const vector unsigned char) vec_splat_u8(0);
    int16_t *dst = block;
    int row;

    for (row = 0; row < 8; row++) {
        /* A full 16-byte vector is fetched from a possibly unaligned
         * address; only its first 8 bytes end up in the output. */
        vector unsigned char bytes = vec_vsx_ld(0, pixels);

        /* Pair every wanted byte with a zero byte to form 16-bit elements. */
        vector signed short widened =
            (vector signed short) vec_perm(bytes, zero, interleave);

        /* One output row is 8 shorts = 16 bytes. */
        vec_vsx_st(widened, 0, (vector signed short *) dst);

        pixels += stride;
        dst    += 8;
    }
}
61 #else
/**
 * Copy an 8x8 block of 8-bit pixels into 64 consecutive 16-bit values.
 *
 * This is the variant built when HAVE_VSX is not set; it uses only
 * classic AltiVec loads and stores.
 *
 * @param block  destination; 8 rows of 8 int16_t each, written back to back
 *               (the original code assumes it is 16-byte aligned)
 * @param pixels first source pixel of the first row, may be unaligned
 * @param stride distance in bytes between the starts of two source rows
 */
static void get_pixels_altivec(int16_t *restrict block, const uint8_t *pixels,
                               ptrdiff_t stride)
{
    const vec_u8 zero = (const vec_u8)vec_splat_u8(0);
    int16_t *dst = block;
    int row;

    for (row = 0; row < 8; row++, pixels += stride, dst += 8) {
        /* Realignment mask derived from the low address bits of this row. */
        vec_u8 align = vec_lvsl(0, pixels);

        /* Two loads, at offsets 0 and 7, cover the 8 bytes that are
         * actually wanted; whatever else they bring in is ignored. */
        vec_u8 lo    = vec_ld(0, pixels);
        vec_u8 hi    = vec_ld(7, pixels);
        vec_u8 bytes = vec_perm(lo, hi, align);

        /* Interleave zero bytes with the pixels to form 16-bit elements. */
        vec_s16 widened = (vec_s16)vec_mergeh(zero, bytes);

        /* One output row is 8 shorts = 16 bytes. */
        vec_st(widened, 0, (vec_s16 *)dst);
    }
}
86 
87 #endif /* HAVE_VSX */
88 
89 #if HAVE_VSX
/**
 * Store the element-wise difference s1 - s2 of two 8x8 blocks of 8-bit
 * pixels as 64 consecutive 16-bit values.
 *
 * This is the variant built when HAVE_VSX is set; it uses VSX loads and
 * stores.
 *
 * @param block  destination; 8 rows of 8 int16_t each, written back to back
 * @param s1     first pixel of the minuend block
 * @param s2     first pixel of the subtrahend block
 * @param stride distance in bytes between the starts of two rows; the same
 *               value is applied to both s1 and s2
 */
static void diff_pixels_altivec(int16_t *restrict block, const uint8_t *s1,
                                const uint8_t *s2, ptrdiff_t stride)
{
    const vector unsigned char zero =
        (const vector unsigned char) vec_splat_u8(0);
    int row;

    /* One row per iteration, eight rows in total. */
    for (row = 0; row < 8; row++) {
        vector unsigned char bytes;
        vector signed short minuend, subtrahend;

        /* Each load fetches 16 bytes from a possibly unaligned address;
         * vec_mergeh keeps only half of them, paired with zero bytes so
         * that they become 16-bit elements. */
        bytes      = vec_vsx_ld(0, s1);
        minuend    = (vector signed short) vec_mergeh(bytes, zero);

        bytes      = vec_vsx_ld(0, s2);
        subtrahend = (vector signed short) vec_mergeh(bytes, zero);

        /* One output row is 8 shorts = 16 bytes. */
        vec_vsx_st(vec_sub(minuend, subtrahend), 0,
                   (vector signed short *) block);

        s1    += stride;
        s2    += stride;
        block += 8;
    }
}
151 #else
/**
 * Store the element-wise difference s1 - s2 of two 8x8 blocks of 8-bit
 * pixels as 64 consecutive 16-bit values.
 *
 * This is the variant built when HAVE_VSX is not set; it uses only
 * classic AltiVec loads and stores.
 *
 * @param block  destination; 8 rows of 8 int16_t each, written back to back
 *               (the original code assumes it is 16-byte aligned)
 * @param s1     first pixel of the minuend block, may be unaligned
 * @param s2     first pixel of the subtrahend block, may be unaligned
 * @param stride distance in bytes between the starts of two rows; the same
 *               value is applied to both s1 and s2
 */
static void diff_pixels_altivec(int16_t *restrict block, const uint8_t *s1,
                                const uint8_t *s2, ptrdiff_t stride)
{
    const vec_u8 zero = (const vec_u8)vec_splat_u8(0);
    int i;

    for (i = 0; i < 8; i++) {
        vec_u8 perm, pixl, pixr, bytes;
        vec_s16 shorts1, shorts2;

        /* Read potentially unaligned pixels.
         * Only the first 8 bytes of each row are used, so the second
         * aligned load is issued at offset 7 (the last byte that is
         * needed) rather than 15. This matches get_pixels_altivec() and
         * never touches a 16-byte block that contributes nothing to the
         * result. */
        perm  = vec_lvsl(0, s1);
        pixl  = vec_ld(0, s1);
        pixr  = vec_ld(7, s1);
        bytes = vec_perm(pixl, pixr, perm);

        // Convert the bytes into shorts.
        shorts1 = (vec_s16)vec_mergeh(zero, bytes);

        // Do the same for the second block of pixels.
        perm  = vec_lvsl(0, s2);
        pixl  = vec_ld(0, s2);
        pixr  = vec_ld(7, s2);
        bytes = vec_perm(pixl, pixr, perm);

        // Convert the bytes into shorts.
        shorts2 = (vec_s16)vec_mergeh(zero, bytes);

        // Do the subtraction.
        shorts1 = vec_sub(shorts1, shorts2);

        // Save the data to the block, we assume the block is 16-byte aligned.
        vec_st(shorts1, 0, (vec_s16 *)block);

        s1    += stride;
        s2    += stride;
        block += 8;
    }
}
225 
226 #endif /* HAVE_VSX */
227 
228 #endif /* HAVE_ALTIVEC */
229 
230 #if HAVE_VSX
/**
 * Copy an 8x8 block of 8-bit pixels into 64 consecutive 16-bit values
 * using the VSX helpers.
 *
 * @param block  destination; 8 rows of 8 int16_t each, written back to back
 * @param pixels first source pixel of the first row
 * @param stride distance in bytes between the starts of two source rows
 */
static void get_pixels_vsx(int16_t *restrict block, const uint8_t *pixels,
                           ptrdiff_t stride)
{
    int16_t *dst = block;
    int row;

    for (row = 0; row < 8; row++, pixels += stride, dst += 8) {
        /* vsx_ld_u8_s16() is defined outside this file; judging by its
         * name it loads unsigned bytes and widens them to signed 16-bit
         * elements (to be confirmed against util_altivec.h). */
        vec_s16 widened = vsx_ld_u8_s16(0, pixels);

        /* One output row is 8 shorts = 16 bytes. */
        vec_vsx_st(widened, 0, dst);
    }
}
243 
/**
 * Store the element-wise difference s1 - s2 of two 8x8 blocks of 8-bit
 * pixels as 64 consecutive 16-bit values using the VSX helpers.
 *
 * @param block  destination; 8 rows of 8 int16_t each, written back to back
 * @param s1     first pixel of the minuend block
 * @param s2     first pixel of the subtrahend block
 * @param stride distance in bytes between the starts of two rows; the same
 *               value is applied to both s1 and s2
 */
static void diff_pixels_vsx(int16_t *restrict block, const uint8_t *s1,
                            const uint8_t *s2, ptrdiff_t stride)
{
    int row;

    for (row = 0; row < 8; row++) {
        /* vsx_ld_u8_s16() is defined outside this file; judging by its
         * name it loads unsigned bytes and widens them to signed 16-bit
         * elements (to be confirmed against util_altivec.h). */
        vec_s16 minuend    = vsx_ld_u8_s16(0, s1);
        vec_s16 subtrahend = vsx_ld_u8_s16(0, s2);

        /* One output row is 8 shorts = 16 bytes. */
        vec_vsx_st(vec_sub(minuend, subtrahend), 0, block);

        s1    += stride;
        s2    += stride;
        block += 8;
    }
}
262 #endif /* HAVE_VSX */
263 
265  unsigned high_bit_depth)
266 {
267 #if HAVE_ALTIVEC
269  return;
270 
271  c->diff_pixels = diff_pixels_altivec;
272 
273  if (!high_bit_depth) {
274  c->get_pixels = get_pixels_altivec;
275  }
276 #endif /* HAVE_ALTIVEC */
277 
278 #if HAVE_VSX
279  if (!PPC_VSX(av_get_cpu_flags()))
280  return;
281 
282  c->diff_pixels = diff_pixels_vsx;
283 
284  if (!high_bit_depth)
285  c->get_pixels = get_pixels_vsx;
286 #endif /* HAVE_VSX */
287 }
av_get_cpu_flags
int av_get_cpu_flags(void)
Return the flags which specify extensions supported by the CPU.
Definition: cpu.c:109
perm
perm
Definition: f_perms.c:75
vec_s16
#define vec_s16
Definition: util_altivec.h:37
av_cold
#define av_cold
Definition: attributes.h:90
PixblockDSPContext
Definition: pixblockdsp.h:28
c
Undefined Behavior In the C some operations are like signed integer dereferencing freed accessing outside allocated Undefined Behavior must not occur in a C it is not safe even if the output of undefined operations is unused The unsafety may seem nit picking but Optimizing compilers have in fact optimized code on the assumption that no undefined Behavior occurs Optimizing code based on wrong assumptions can and has in some cases lead to effects beyond the output of computations The signed integer overflow problem in speed critical code Code which is highly optimized and works with signed integers sometimes has the problem that often the output of the computation does not c
Definition: undefined.txt:32
PPC_ALTIVEC
#define PPC_ALTIVEC(flags)
Definition: cpu.h:25
cpu.h
vec_u8
#define vec_u8
Definition: util_altivec.h:34
attributes.h
zero
static int zero(InterplayACMContext *s, unsigned ind, unsigned col)
Definition: interplayacm.c:121
i
#define i(width, name, range_min, range_max)
Definition: cbs_h2645.c:256
stride
#define stride
Definition: h264pred_template.c:536
util_altivec.h
cpu.h
PPC_VSX
#define PPC_VSX(flags)
Definition: cpu.h:26
ff_pixblockdsp_init_ppc
av_cold void ff_pixblockdsp_init_ppc(PixblockDSPContext *c, unsigned high_bit_depth)
Definition: pixblockdsp.c:264
block
The exact code depends on how similar the blocks are and how related they are to the block
Definition: filter_design.txt:207
short
it s the only field you need to keep assuming you have a context There is some magic you don t need to care about around this just let it vf default minimum maximum flags name is the option keep it simple and lowercase description are short
Definition: writing_filters.txt:89
pixblockdsp.h