FFmpeg
idctdsp.c
/*
 * Copyright (c) 2001 Michel Lespinasse
 *
 * This file is part of FFmpeg.
 *
 * FFmpeg is free software; you can redistribute it and/or
 * modify it under the terms of the GNU Lesser General Public
 * License as published by the Free Software Foundation; either
 * version 2.1 of the License, or (at your option) any later version.
 *
 * FFmpeg is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
 * Lesser General Public License for more details.
 *
 * You should have received a copy of the GNU Lesser General Public
 * License along with FFmpeg; if not, write to the Free Software
 * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
 */

/* NOTE: This code is based on GPL code from the libmpeg2 project. The
 * author, Michel Lespinasse, has given explicit permission to release
 * under LGPL as part of FFmpeg.
 *
 * FFmpeg integration by Dieter Shirley
 *
 * This file is a direct copy of the AltiVec IDCT module from the libmpeg2
 * project. I've deleted all of the libmpeg2-specific code, renamed the
 * functions and reordered the function parameters. The only change to the
 * IDCT function itself was to factor out the partial transposition, and to
 * perform a full transpose at the end of the function. */

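/* Informal orientation note: vec_mradds(a, b, c) is AltiVec's "multiply high
 * round, add, saturate" operation, roughly ((a * b + 0x4000) >> 15) + c on
 * each signed 16-bit lane, so the data is treated as Q15 fixed point. The
 * IDCT macro below prescales each input row, runs the 1-D butterfly
 * (IDCT_HALF) once, transposes the 8x8 block with mergeh/mergel operations,
 * runs IDCT_HALF again on the columns, and shifts the results right by 6. */
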
#include "config.h"

#include <stdlib.h>
#include <string.h>

#include "libavutil/attributes.h"
#include "libavutil/cpu.h"
#include "libavutil/ppc/cpu.h"
#include "libavutil/ppc/util_altivec.h"

#include "libavcodec/idctdsp.h"

#if HAVE_ALTIVEC

#define IDCT_HALF \
    /* 1st stage */ \
    t1 = vec_mradds(a1, vx7, vx1); \
    t8 = vec_mradds(a1, vx1, vec_subs(zero, vx7)); \
    t7 = vec_mradds(a2, vx5, vx3); \
    t3 = vec_mradds(ma2, vx3, vx5); \
    \
    /* 2nd stage */ \
    t5 = vec_adds(vx0, vx4); \
    t0 = vec_subs(vx0, vx4); \
    t2 = vec_mradds(a0, vx6, vx2); \
    t4 = vec_mradds(a0, vx2, vec_subs(zero, vx6)); \
    t6 = vec_adds(t8, t3); \
    t3 = vec_subs(t8, t3); \
    t8 = vec_subs(t1, t7); \
    t1 = vec_adds(t1, t7); \
    \
    /* 3rd stage */ \
    t7 = vec_adds(t5, t2); \
    t2 = vec_subs(t5, t2); \
    t5 = vec_adds(t0, t4); \
    t0 = vec_subs(t0, t4); \
    t4 = vec_subs(t8, t3); \
    t3 = vec_adds(t8, t3); \
    \
    /* 4th stage */ \
    vy0 = vec_adds(t7, t1); \
    vy7 = vec_subs(t7, t1); \
    vy1 = vec_mradds(c4, t3, t5); \
    vy6 = vec_mradds(mc4, t3, t5); \
    vy2 = vec_mradds(c4, t4, t0); \
    vy5 = vec_mradds(mc4, t4, t0); \
    vy3 = vec_adds(t2, t6); \
    vy4 = vec_subs(t2, t6)

#define IDCT \
    vec_s16 vy0, vy1, vy2, vy3, vy4, vy5, vy6, vy7; \
    vec_s16 t0, t1, t2, t3, t4, t5, t6, t7, t8; \
    \
    vec_s16 c4   = vec_splat(constants[0], 0); \
    vec_s16 a0   = vec_splat(constants[0], 1); \
    vec_s16 a1   = vec_splat(constants[0], 2); \
    vec_s16 a2   = vec_splat(constants[0], 3); \
    vec_s16 mc4  = vec_splat(constants[0], 4); \
    vec_s16 ma2  = vec_splat(constants[0], 5); \
    vec_s16 bias = (vec_s16) vec_splat((vec_s32) constants[0], 3); \
    \
    vec_s16 zero  = vec_splat_s16(0); \
    vec_u16 shift = vec_splat_u16(4); \
    \
    vec_s16 vx0 = vec_mradds(vec_sl(block[0], shift), constants[1], zero); \
    vec_s16 vx1 = vec_mradds(vec_sl(block[1], shift), constants[2], zero); \
    vec_s16 vx2 = vec_mradds(vec_sl(block[2], shift), constants[3], zero); \
    vec_s16 vx3 = vec_mradds(vec_sl(block[3], shift), constants[4], zero); \
    vec_s16 vx4 = vec_mradds(vec_sl(block[4], shift), constants[1], zero); \
    vec_s16 vx5 = vec_mradds(vec_sl(block[5], shift), constants[4], zero); \
    vec_s16 vx6 = vec_mradds(vec_sl(block[6], shift), constants[3], zero); \
    vec_s16 vx7 = vec_mradds(vec_sl(block[7], shift), constants[2], zero); \
    \
    IDCT_HALF; \
    \
    vx0 = vec_mergeh(vy0, vy4); \
    vx1 = vec_mergel(vy0, vy4); \
    vx2 = vec_mergeh(vy1, vy5); \
    vx3 = vec_mergel(vy1, vy5); \
    vx4 = vec_mergeh(vy2, vy6); \
    vx5 = vec_mergel(vy2, vy6); \
    vx6 = vec_mergeh(vy3, vy7); \
    vx7 = vec_mergel(vy3, vy7); \
    \
    vy0 = vec_mergeh(vx0, vx4); \
    vy1 = vec_mergel(vx0, vx4); \
    vy2 = vec_mergeh(vx1, vx5); \
    vy3 = vec_mergel(vx1, vx5); \
    vy4 = vec_mergeh(vx2, vx6); \
    vy5 = vec_mergel(vx2, vx6); \
    vy6 = vec_mergeh(vx3, vx7); \
    vy7 = vec_mergel(vx3, vx7); \
    \
    vx0 = vec_adds(vec_mergeh(vy0, vy4), bias); \
    vx1 = vec_mergel(vy0, vy4); \
    vx2 = vec_mergeh(vy1, vy5); \
    vx3 = vec_mergel(vy1, vy5); \
    vx4 = vec_mergeh(vy2, vy6); \
    vx5 = vec_mergel(vy2, vy6); \
    vx6 = vec_mergeh(vy3, vy7); \
    vx7 = vec_mergel(vy3, vy7); \
    \
    IDCT_HALF; \
    \
    shift = vec_splat_u16(6); \
    vx0 = vec_sra(vy0, shift); \
    vx1 = vec_sra(vy1, shift); \
    vx2 = vec_sra(vy2, shift); \
    vx3 = vec_sra(vy3, shift); \
    vx4 = vec_sra(vy4, shift); \
    vx5 = vec_sra(vy5, shift); \
    vx6 = vec_sra(vy6, shift); \
    vx7 = vec_sra(vy7, shift)

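/* Informal note on the table below: the values appear to be Q15 fixed-point
 * coefficients. Row 0 holds the butterfly constants (23170 ~ 2^15 * cos(pi/4),
 * 13573 ~ 2^15 * tan(pi/8), 6518 ~ 2^15 * tan(pi/16), 21895 ~ 2^15 * tan(3*pi/16),
 * their negated counterparts, and the bias words added to the DC term), while
 * rows 1-4 hold the per-coefficient prescale factors multiplied into the
 * shifted input rows at the top of the IDCT macro. */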
static const vec_s16 constants[5] = {
    { 23170, 13573,  6518, 21895, -23170, -21895,    32,    31 },
    { 16384, 22725, 21407, 19266,  16384,  19266, 21407, 22725 },
    { 22725, 31521, 29692, 26722,  22725,  26722, 29692, 31521 },
    { 21407, 29692, 27969, 25172,  21407,  25172, 27969, 29692 },
    { 19266, 26722, 25172, 22654,  19266,  22654, 25172, 26722 }
};

static void idct_altivec(int16_t *blk)
{
    vec_s16 *block = (vec_s16 *) blk;

    IDCT;

    block[0] = vx0;
    block[1] = vx1;
    block[2] = vx2;
    block[3] = vx3;
    block[4] = vx4;
    block[5] = vx5;
    block[6] = vx6;
    block[7] = vx7;
}

static void idct_put_altivec(uint8_t *dest, ptrdiff_t stride, int16_t *blk)
{
    vec_s16 *block = (vec_s16 *) blk;
    vec_u8 tmp;

    IDCT;

#define COPY(dest, src)                                   \
    tmp = vec_packsu(src, src);                           \
    vec_ste((vec_u32) tmp, 0, (unsigned int *) dest);     \
    vec_ste((vec_u32) tmp, 4, (unsigned int *) dest)

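    /* Informal note: vec_packsu(src, src) clamps the 16-bit results to the
     * 0..255 pixel range and packs them into bytes; the two vec_ste element
     * stores then write one 8-pixel row out as two 32-bit words. */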
    COPY(dest, vx0);
    dest += stride;
    COPY(dest, vx1);
    dest += stride;
    COPY(dest, vx2);
    dest += stride;
    COPY(dest, vx3);
    dest += stride;
    COPY(dest, vx4);
    dest += stride;
    COPY(dest, vx5);
    dest += stride;
    COPY(dest, vx6);
    dest += stride;
    COPY(dest, vx7);
}

static void idct_add_altivec(uint8_t *dest, ptrdiff_t stride, int16_t *blk)
{
    vec_s16 *block = (vec_s16 *) blk;
    vec_u8 tmp;
    vec_s16 tmp2, tmp3;
    vec_u8 perm0;
    vec_u8 perm1;
    vec_u8 p0, p1, p;

    IDCT;

#if HAVE_BIGENDIAN
    p0    = vec_lvsl(0, dest);
    p1    = vec_lvsl(stride, dest);
    p     = vec_splat_u8(-1);
    perm0 = vec_mergeh(p, p0);
    perm1 = vec_mergeh(p, p1);
#endif

#if HAVE_BIGENDIAN
#define GET_TMP2(dest, prm)                               \
    tmp  = vec_ld(0, dest);                               \
    tmp2 = (vec_s16) vec_perm(tmp, (vec_u8) zero, prm);
#else
#define GET_TMP2(dest, prm)                               \
    tmp  = vec_vsx_ld(0, dest);                           \
    tmp2 = (vec_s16) vec_mergeh(tmp, (vec_u8) zero)
#endif

#define ADD(dest, src, perm)                              \
    GET_TMP2(dest, perm);                                 \
    tmp3 = vec_adds(tmp2, src);                           \
    tmp  = vec_packsu(tmp3, tmp3);                        \
    vec_ste((vec_u32) tmp, 0, (unsigned int *) dest);     \
    vec_ste((vec_u32) tmp, 4, (unsigned int *) dest)

    ADD(dest, vx0, perm0);
    dest += stride;
    ADD(dest, vx1, perm1);
    dest += stride;
    ADD(dest, vx2, perm0);
    dest += stride;
    ADD(dest, vx3, perm1);
    dest += stride;
    ADD(dest, vx4, perm0);
    dest += stride;
    ADD(dest, vx5, perm1);
    dest += stride;
    ADD(dest, vx6, perm0);
    dest += stride;
    ADD(dest, vx7, perm1);
}

#endif /* HAVE_ALTIVEC */

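/* PPC-specific entry point used by the generic IDCTDSP initialisation; when
 * AltiVec is not compiled in or not available at run time, the default C
 * function pointers are left untouched. */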
av_cold void ff_idctdsp_init_ppc(IDCTDSPContext *c, AVCodecContext *avctx,
                                 unsigned high_bit_depth)
{
#if HAVE_ALTIVEC
    if (!PPC_ALTIVEC(av_get_cpu_flags()))
        return;

    if (!high_bit_depth && avctx->lowres == 0) {
        if ((avctx->idct_algo == FF_IDCT_AUTO && !(avctx->flags & AV_CODEC_FLAG_BITEXACT)) ||
            (avctx->idct_algo == FF_IDCT_ALTIVEC)) {
            c->idct      = idct_altivec;
            c->idct_add  = idct_add_altivec;
            c->idct_put  = idct_put_altivec;
            c->perm_type = FF_IDCT_PERM_TRANSPOSE;
        }
    }
#endif /* HAVE_ALTIVEC */
}