FFmpeg: postprocess_altivec_template.c
1 /*
2  * AltiVec optimizations (C) 2004 Romain Dolbeau <romain@dolbeau.org>
3  *
4  * based on code by Copyright (C) 2001-2003 Michael Niedermayer (michaelni@gmx.at)
5  *
6  * This file is part of FFmpeg.
7  *
8  * FFmpeg is free software; you can redistribute it and/or modify
9  * it under the terms of the GNU General Public License as published by
10  * the Free Software Foundation; either version 2 of the License, or
11  * (at your option) any later version.
12  *
13  * FFmpeg is distributed in the hope that it will be useful,
14  * but WITHOUT ANY WARRANTY; without even the implied warranty of
15  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
16  * GNU General Public License for more details.
17  *
18  * You should have received a copy of the GNU General Public License
19  * along with FFmpeg; if not, write to the Free Software
20  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
21  */
22 
23 #include "libavutil/avutil.h"
24 
25 #define ALTIVEC_TRANSPOSE_8x8_SHORT(src_a,src_b,src_c,src_d,src_e,src_f,src_g,src_h) \
26  do { \
27  __typeof__(src_a) tempA1, tempB1, tempC1, tempD1; \
28  __typeof__(src_a) tempE1, tempF1, tempG1, tempH1; \
29  __typeof__(src_a) tempA2, tempB2, tempC2, tempD2; \
30  __typeof__(src_a) tempE2, tempF2, tempG2, tempH2; \
31  tempA1 = vec_mergeh (src_a, src_e); \
32  tempB1 = vec_mergel (src_a, src_e); \
33  tempC1 = vec_mergeh (src_b, src_f); \
34  tempD1 = vec_mergel (src_b, src_f); \
35  tempE1 = vec_mergeh (src_c, src_g); \
36  tempF1 = vec_mergel (src_c, src_g); \
37  tempG1 = vec_mergeh (src_d, src_h); \
38  tempH1 = vec_mergel (src_d, src_h); \
39  tempA2 = vec_mergeh (tempA1, tempE1); \
40  tempB2 = vec_mergel (tempA1, tempE1); \
41  tempC2 = vec_mergeh (tempB1, tempF1); \
42  tempD2 = vec_mergel (tempB1, tempF1); \
43  tempE2 = vec_mergeh (tempC1, tempG1); \
44  tempF2 = vec_mergel (tempC1, tempG1); \
45  tempG2 = vec_mergeh (tempD1, tempH1); \
46  tempH2 = vec_mergel (tempD1, tempH1); \
47  src_a = vec_mergeh (tempA2, tempE2); \
48  src_b = vec_mergel (tempA2, tempE2); \
49  src_c = vec_mergeh (tempB2, tempF2); \
50  src_d = vec_mergel (tempB2, tempF2); \
51  src_e = vec_mergeh (tempC2, tempG2); \
52  src_f = vec_mergel (tempC2, tempG2); \
53  src_g = vec_mergeh (tempD2, tempH2); \
54  src_h = vec_mergel (tempD2, tempH2); \
55  } while (0)
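/* The macro above transposes an 8x8 matrix of 16-bit elements held in eight
 * vectors: three rounds of vec_mergeh/vec_mergel interleaving are equivalent
 * to a full transpose, so src_a..src_h leave the macro holding as rows the
 * columns of the matrix they entered with. */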
56 
57 
58 static inline int vertClassify_altivec(uint8_t src[], int stride, PPContext *c) {
59  /*
60  This code makes no assumptions about src or stride.
61  One could avoid recomputing the perm vector by
62  assuming (stride % 16) == 0, but unfortunately
63  that is not always true.
64  */
65  short data_0 = ((c->nonBQP*c->ppMode.baseDcDiff)>>8) + 1;
66  DECLARE_ALIGNED(16, short, data)[8] =
67  {
68  data_0,
69  data_0 * 2 + 1,
70  c->QP * 2,
71  c->QP * 4
72  };
73  int numEq;
74  uint8_t *src2 = src;
75  vector signed short v_dcOffset;
76  vector signed short v2QP;
77  vector unsigned short v4QP;
78  vector unsigned short v_dcThreshold;
79  const int properStride = (stride % 16);
80  const int srcAlign = ((unsigned long)src2 % 16);
81  const int two_vectors = ((srcAlign > 8) || properStride) ? 1 : 0;
82  const vector signed int zero = vec_splat_s32(0);
83  const vector signed short mask = vec_splat_s16(1);
84  vector signed int v_numEq = vec_splat_s32(0);
85  vector signed short v_data = vec_ld(0, data);
86  vector signed short v_srcAss0, v_srcAss1, v_srcAss2, v_srcAss3,
87  v_srcAss4, v_srcAss5, v_srcAss6, v_srcAss7;
88 //FIXME avoid this mess if possible
89  register int j0 = 0,
90  j1 = stride,
91  j2 = 2 * stride,
92  j3 = 3 * stride,
93  j4 = 4 * stride,
94  j5 = 5 * stride,
95  j6 = 6 * stride,
96  j7 = 7 * stride;
97  vector unsigned char v_srcA0, v_srcA1, v_srcA2, v_srcA3,
98  v_srcA4, v_srcA5, v_srcA6, v_srcA7;
99 
100  v_dcOffset = vec_splat(v_data, 0);
101  v_dcThreshold = (vector unsigned short)vec_splat(v_data, 1);
102  v2QP = vec_splat(v_data, 2);
103  v4QP = (vector unsigned short)vec_splat(v_data, 3);
104 
105  src2 += stride * 4;
106 
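/* LOAD_LINE() performs a potentially misaligned load of one row: vec_lvsl()
 * builds a permute vector from the address, the row is fetched as one or two
 * aligned 16-byte loads, and vec_perm() extracts the 16 wanted bytes.  The
 * leftmost 8 pixels are then widened to 16 bits by interleaving with zero
 * bytes.  LOAD_LINE_ALIGNED() is the cheap variant used when src and stride
 * are known to be 16-byte aligned. */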
107 #define LOAD_LINE(i) \
108  { \
109  vector unsigned char perm##i = vec_lvsl(j##i, src2); \
110  vector unsigned char v_srcA2##i; \
111  vector unsigned char v_srcA1##i = vec_ld(j##i, src2); \
112  if (two_vectors) \
113  v_srcA2##i = vec_ld(j##i + 16, src2); \
114  v_srcA##i = \
115  vec_perm(v_srcA1##i, v_srcA2##i, perm##i); \
116  v_srcAss##i = \
117  (vector signed short)vec_mergeh((vector signed char)zero, \
118  (vector signed char)v_srcA##i); }
119 
120 #define LOAD_LINE_ALIGNED(i) \
121  v_srcA##i = vec_ld(j##i, src2); \
122  v_srcAss##i = \
123  (vector signed short)vec_mergeh((vector signed char)zero, \
124  (vector signed char)v_srcA##i)
125 
126  /* Special-casing the aligned case is worthwhile, as all calls from
127  * the (transposed) horizontal deblocks will be aligned, in addition
128  * to the naturally aligned vertical deblocks. */
129  if (properStride && srcAlign) {
130  LOAD_LINE_ALIGNED(0);
131  LOAD_LINE_ALIGNED(1);
132  LOAD_LINE_ALIGNED(2);
133  LOAD_LINE_ALIGNED(3);
134  LOAD_LINE_ALIGNED(4);
135  LOAD_LINE_ALIGNED(5);
136  LOAD_LINE_ALIGNED(6);
137  LOAD_LINE_ALIGNED(7);
138  } else {
139  LOAD_LINE(0);
140  LOAD_LINE(1);
141  LOAD_LINE(2);
142  LOAD_LINE(3);
143  LOAD_LINE(4);
144  LOAD_LINE(5);
145  LOAD_LINE(6);
146  LOAD_LINE(7);
147  }
148 #undef LOAD_LINE
149 #undef LOAD_LINE_ALIGNED
150 
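/* ITER(i, j) marks, per column, whether rows i and j are "equal enough":
 * adding v_dcOffset to the row difference and comparing as unsigned against
 * v_dcThreshold is the usual biased-compare trick for |diff| <= offset, and
 * the result is masked to 0/1 per lane so the matches can simply be summed
 * up below. */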
151 #define ITER(i, j) \
152  const vector signed short v_diff##i = \
153  vec_sub(v_srcAss##i, v_srcAss##j); \
154  const vector signed short v_sum##i = \
155  vec_add(v_diff##i, v_dcOffset); \
156  const vector signed short v_comp##i = \
157  (vector signed short)vec_cmplt((vector unsigned short)v_sum##i, \
158  v_dcThreshold); \
159  const vector signed short v_part##i = vec_and(mask, v_comp##i);
160 
161  {
162  ITER(0, 1)
163  ITER(1, 2)
164  ITER(2, 3)
165  ITER(3, 4)
166  ITER(4, 5)
167  ITER(5, 6)
168  ITER(6, 7)
169 
170  v_numEq = vec_sum4s(v_part0, v_numEq);
171  v_numEq = vec_sum4s(v_part1, v_numEq);
172  v_numEq = vec_sum4s(v_part2, v_numEq);
173  v_numEq = vec_sum4s(v_part3, v_numEq);
174  v_numEq = vec_sum4s(v_part4, v_numEq);
175  v_numEq = vec_sum4s(v_part5, v_numEq);
176  v_numEq = vec_sum4s(v_part6, v_numEq);
177  }
178 
179 #undef ITER
180 
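/* vec_sums() folds the four partial counts into element 3 of the vector;
 * the splat and vec_ste() then move that single 32-bit count into the
 * scalar numEq for the threshold test below. */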
181  v_numEq = vec_sums(v_numEq, zero);
182 
183  v_numEq = vec_splat(v_numEq, 3);
184  vec_ste(v_numEq, 0, &numEq);
185 
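/* numEq now counts how many neighbouring-row pairs were "equal enough".
 * For blocks that pass the flatness threshold, the permutes below pair up
 * pixels from different rows into mmoL/mmoR; adding 2*QP to their
 * difference before an unsigned compare against 4*QP tests
 * |mmoL - mmoR| > 2*QP without an explicit absolute value, and selects
 * between returning 0 and 1. */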
186  if (numEq > c->ppMode.flatnessThreshold){
187  const vector unsigned char mmoP1 = (const vector unsigned char)
188  {0x1f, 0x1f, 0x1f, 0x1f, 0x1f, 0x1f, 0x1f, 0x1f,
189  0x00, 0x01, 0x12, 0x13, 0x08, 0x09, 0x1A, 0x1B};
190  const vector unsigned char mmoP2 = (const vector unsigned char)
191  {0x04, 0x05, 0x16, 0x17, 0x0C, 0x0D, 0x1E, 0x1F,
192  0x1f, 0x1f, 0x1f, 0x1f, 0x1f, 0x1f, 0x1f, 0x1f};
193  const vector unsigned char mmoP = (const vector unsigned char)
194  vec_lvsl(8, (unsigned char*)0);
195 
196  vector signed short mmoL1 = vec_perm(v_srcAss0, v_srcAss2, mmoP1);
197  vector signed short mmoL2 = vec_perm(v_srcAss4, v_srcAss6, mmoP2);
198  vector signed short mmoL = vec_perm(mmoL1, mmoL2, mmoP);
199  vector signed short mmoR1 = vec_perm(v_srcAss5, v_srcAss7, mmoP1);
200  vector signed short mmoR2 = vec_perm(v_srcAss1, v_srcAss3, mmoP2);
201  vector signed short mmoR = vec_perm(mmoR1, mmoR2, mmoP);
202  vector signed short mmoDiff = vec_sub(mmoL, mmoR);
203  vector unsigned short mmoSum = (vector unsigned short)vec_add(mmoDiff, v2QP);
204 
205  if (vec_any_gt(mmoSum, v4QP))
206  return 0;
207  else
208  return 1;
209  }
210  else return 2;
211 }
212 
213 static inline void doVertLowPass_altivec(uint8_t *src, int stride, PPContext *c) {
214  /*
215  This code makes no assumptions about src or stride.
216  One could avoid recomputing the perm vector by
217  assuming (stride % 16) == 0, but unfortunately
218  that is not always true. Quite a lot of loads/stores
219  could be removed by assuming proper alignment of
220  src & stride :-(
221  */
222  uint8_t *src2 = src;
223  const vector signed int zero = vec_splat_s32(0);
224  const int properStride = (stride % 16);
225  const int srcAlign = ((unsigned long)src2 % 16);
226  DECLARE_ALIGNED(16, short, qp)[8] = {c->QP};
227  vector signed short vqp = vec_ld(0, qp);
228  vector signed short vb0, vb1, vb2, vb3, vb4, vb5, vb6, vb7, vb8, vb9;
229  vector unsigned char vbA0, av_uninit(vbA1), av_uninit(vbA2), av_uninit(vbA3), av_uninit(vbA4), av_uninit(vbA5), av_uninit(vbA6), av_uninit(vbA7), av_uninit(vbA8), vbA9;
230  vector unsigned char vbB0, av_uninit(vbB1), av_uninit(vbB2), av_uninit(vbB3), av_uninit(vbB4), av_uninit(vbB5), av_uninit(vbB6), av_uninit(vbB7), av_uninit(vbB8), vbB9;
231  vector unsigned char vbT0, vbT1, vbT2, vbT3, vbT4, vbT5, vbT6, vbT7, vbT8, vbT9;
232  vector unsigned char perml0, perml1, perml2, perml3, perml4,
233  perml5, perml6, perml7, perml8, perml9;
234  register int j0 = 0,
235  j1 = stride,
236  j2 = 2 * stride,
237  j3 = 3 * stride,
238  j4 = 4 * stride,
239  j5 = 5 * stride,
240  j6 = 6 * stride,
241  j7 = 7 * stride,
242  j8 = 8 * stride,
243  j9 = 9 * stride;
244 
245  vqp = vec_splat(vqp, 0);
246 
247  src2 += stride*3;
248 
249 #define LOAD_LINE(i) \
250  perml##i = vec_lvsl(i * stride, src2); \
251  vbA##i = vec_ld(i * stride, src2); \
252  vbB##i = vec_ld(i * stride + 16, src2); \
253  vbT##i = vec_perm(vbA##i, vbB##i, perml##i); \
254  vb##i = \
255  (vector signed short)vec_mergeh((vector unsigned char)zero, \
256  (vector unsigned char)vbT##i)
257 
258 #define LOAD_LINE_ALIGNED(i) \
259  vbT##i = vec_ld(j##i, src2); \
260  vb##i = \
261  (vector signed short)vec_mergeh((vector signed char)zero, \
262  (vector signed char)vbT##i)
263 
264  /* Special-casing the aligned case is worthwhile, as all calls from
265  * the (transposed) horizontal deblocks will be aligned, in addition
266  * to the naturally aligned vertical deblocks. */
267  if (properStride && srcAlign) {
268  LOAD_LINE_ALIGNED(0);
269  LOAD_LINE_ALIGNED(1);
270  LOAD_LINE_ALIGNED(2);
271  LOAD_LINE_ALIGNED(3);
272  LOAD_LINE_ALIGNED(4);
273  LOAD_LINE_ALIGNED(5);
274  LOAD_LINE_ALIGNED(6);
275  LOAD_LINE_ALIGNED(7);
276  LOAD_LINE_ALIGNED(8);
277  LOAD_LINE_ALIGNED(9);
278  } else {
279  LOAD_LINE(0);
280  LOAD_LINE(1);
281  LOAD_LINE(2);
282  LOAD_LINE(3);
283  LOAD_LINE(4);
284  LOAD_LINE(5);
285  LOAD_LINE(6);
286  LOAD_LINE(7);
287  LOAD_LINE(8);
288  LOAD_LINE(9);
289  }
290 #undef LOAD_LINE
291 #undef LOAD_LINE_ALIGNED
292  {
293  const vector unsigned short v_2 = vec_splat_u16(2);
294  const vector unsigned short v_4 = vec_splat_u16(4);
295 
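/* v_first is row 0 where |row0 - row1| < QP and row 1 otherwise (likewise
 * v_last for rows 8/9), which replicates the border rows only when the step
 * to them is small.  The v_sumsB* chain then builds a sliding window sum
 * over the rows, and COMPUTE_VR() turns each window into the low-passed row
 * vr_j = (sumsB_i + sumsB_k + 2*vb_j) >> 4. */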
296  const vector signed short v_diff01 = vec_sub(vb0, vb1);
297  const vector unsigned short v_cmp01 =
298  (const vector unsigned short) vec_cmplt(vec_abs(v_diff01), vqp);
299  const vector signed short v_first = vec_sel(vb1, vb0, v_cmp01);
300  const vector signed short v_diff89 = vec_sub(vb8, vb9);
301  const vector unsigned short v_cmp89 =
302  (const vector unsigned short) vec_cmplt(vec_abs(v_diff89), vqp);
303  const vector signed short v_last = vec_sel(vb8, vb9, v_cmp89);
304 
305  const vector signed short temp01 = vec_mladd(v_first, (vector signed short)v_4, vb1);
306  const vector signed short temp02 = vec_add(vb2, vb3);
307  const vector signed short temp03 = vec_add(temp01, (vector signed short)v_4);
308  const vector signed short v_sumsB0 = vec_add(temp02, temp03);
309 
310  const vector signed short temp11 = vec_sub(v_sumsB0, v_first);
311  const vector signed short v_sumsB1 = vec_add(temp11, vb4);
312 
313  const vector signed short temp21 = vec_sub(v_sumsB1, v_first);
314  const vector signed short v_sumsB2 = vec_add(temp21, vb5);
315 
316  const vector signed short temp31 = vec_sub(v_sumsB2, v_first);
317  const vector signed short v_sumsB3 = vec_add(temp31, vb6);
318 
319  const vector signed short temp41 = vec_sub(v_sumsB3, v_first);
320  const vector signed short v_sumsB4 = vec_add(temp41, vb7);
321 
322  const vector signed short temp51 = vec_sub(v_sumsB4, vb1);
323  const vector signed short v_sumsB5 = vec_add(temp51, vb8);
324 
325  const vector signed short temp61 = vec_sub(v_sumsB5, vb2);
326  const vector signed short v_sumsB6 = vec_add(temp61, v_last);
327 
328  const vector signed short temp71 = vec_sub(v_sumsB6, vb3);
329  const vector signed short v_sumsB7 = vec_add(temp71, v_last);
330 
331  const vector signed short temp81 = vec_sub(v_sumsB7, vb4);
332  const vector signed short v_sumsB8 = vec_add(temp81, v_last);
333 
334  const vector signed short temp91 = vec_sub(v_sumsB8, vb5);
335  const vector signed short v_sumsB9 = vec_add(temp91, v_last);
336 
337  #define COMPUTE_VR(i, j, k) \
338  const vector signed short temps1##i = \
339  vec_add(v_sumsB##i, v_sumsB##k); \
340  const vector signed short temps2##i = \
341  vec_mladd(vb##j, (vector signed short)v_2, temps1##i); \
342  const vector signed short vr##j = vec_sra(temps2##i, v_4)
343 
344  COMPUTE_VR(0, 1, 2);
345  COMPUTE_VR(1, 2, 3);
346  COMPUTE_VR(2, 3, 4);
347  COMPUTE_VR(3, 4, 5);
348  COMPUTE_VR(4, 5, 6);
349  COMPUTE_VR(5, 6, 7);
350  COMPUTE_VR(6, 7, 8);
351  COMPUTE_VR(7, 8, 9);
352 
353  const vector signed char neg1 = vec_splat_s8(-1);
354  const vector unsigned char permHH = (const vector unsigned char){0x00, 0x01, 0x02, 0x03, 0x04, 0x05, 0x06, 0x07,
355  0x18, 0x19, 0x1A, 0x1B, 0x1C, 0x1D, 0x1E, 0x1F};
356 
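/* PACK_AND_STORE() packs the filtered shorts back to unsigned bytes, keeps
 * the untouched right half of the row via permHH, and performs a misaligned
 * read-modify-write store: vec_lvsr() + vec_perm() rotate the data into
 * place and vec_sel() against the originally loaded vbA/vbB vectors leaves
 * the bytes outside the row unchanged.  PACK_AND_STORE_ALIGNED() is the
 * single-store variant for aligned rows. */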
357 #define PACK_AND_STORE(i) \
358 { const vector unsigned char perms##i = \
359  vec_lvsr(i * stride, src2); \
360  const vector unsigned char vf##i = \
361  vec_packsu(vr##i, (vector signed short)zero); \
362  const vector unsigned char vg##i = \
363  vec_perm(vf##i, vbT##i, permHH); \
364  const vector unsigned char mask##i = \
365  vec_perm((vector unsigned char)zero, (vector unsigned char)neg1, perms##i); \
366  const vector unsigned char vg2##i = \
367  vec_perm(vg##i, vg##i, perms##i); \
368  const vector unsigned char svA##i = \
369  vec_sel(vbA##i, vg2##i, mask##i); \
370  const vector unsigned char svB##i = \
371  vec_sel(vg2##i, vbB##i, mask##i); \
372  vec_st(svA##i, i * stride, src2); \
373  vec_st(svB##i, i * stride + 16, src2);}
374 
375 #define PACK_AND_STORE_ALIGNED(i) \
376 { const vector unsigned char vf##i = \
377  vec_packsu(vr##i, (vector signed short)zero); \
378  const vector unsigned char vg##i = \
379  vec_perm(vf##i, vbT##i, permHH); \
380  vec_st(vg##i, i * stride, src2);}
381 
382  /* Special-casing the aligned case is worthwhile, as all calls from
383  * the (transposed) horizontal deblocks will be aligned, in addition
384  * to the naturally aligned vertical deblocks. */
385  if (properStride && srcAlign) {
386  PACK_AND_STORE_ALIGNED(1)
387  PACK_AND_STORE_ALIGNED(2)
388  PACK_AND_STORE_ALIGNED(3)
389  PACK_AND_STORE_ALIGNED(4)
390  PACK_AND_STORE_ALIGNED(5)
391  PACK_AND_STORE_ALIGNED(6)
392  PACK_AND_STORE_ALIGNED(7)
393  PACK_AND_STORE_ALIGNED(8)
394  } else {
395  PACK_AND_STORE(1)
396  PACK_AND_STORE(2)
397  PACK_AND_STORE(3)
398  PACK_AND_STORE(4)
399  PACK_AND_STORE(5)
400  PACK_AND_STORE(6)
401  PACK_AND_STORE(7)
402  PACK_AND_STORE(8)
403  }
404  #undef PACK_AND_STORE
405  #undef PACK_AND_STORE_ALIGNED
406  }
407 }
408 
409 
410 
411 static inline void doVertDefFilter_altivec(uint8_t src[], int stride, PPContext *c) {
412  /*
413  This code makes no assumptions about src or stride.
414  One could avoid recomputing the perm vector by
415  assuming (stride % 16) == 0, but unfortunately
416  that is not always true. Quite a lot of loads/stores
417  could be removed by assuming proper alignment of
418  src & stride :-(
419  */
420  uint8_t *src2 = src + stride*3;
421  const vector signed int zero = vec_splat_s32(0);
422  DECLARE_ALIGNED(16, short, qp)[8] = {8*c->QP};
423  vector signed short vqp = vec_splat(
424  (vector signed short)vec_ld(0, qp), 0);
425 
426 #define LOAD_LINE(i) \
427  const vector unsigned char perm##i = \
428  vec_lvsl(i * stride, src2); \
429  const vector unsigned char vbA##i = \
430  vec_ld(i * stride, src2); \
431  const vector unsigned char vbB##i = \
432  vec_ld(i * stride + 16, src2); \
433  const vector unsigned char vbT##i = \
434  vec_perm(vbA##i, vbB##i, perm##i); \
435  const vector signed short vb##i = \
436  (vector signed short)vec_mergeh((vector unsigned char)zero, \
437  (vector unsigned char)vbT##i)
438 
439  LOAD_LINE(1);
440  LOAD_LINE(2);
441  LOAD_LINE(3);
442  LOAD_LINE(4);
443  LOAD_LINE(5);
444  LOAD_LINE(6);
445  LOAD_LINE(7);
446  LOAD_LINE(8);
447 #undef LOAD_LINE
448 
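/* This is the vertical default (non-flat) deblocking filter: mE is a
 * "middle energy" across the edge between vb4 and vb5, lE/rE the energies
 * on either side.  d = (5 * max(|mE| - min(|lE|, |rE|), 0) + 32) >> 6, with
 * the sign of -mE, is clamped towards q = (vb4 - vb5) / 2 and applied as
 * vb4 - d / vb5 + d, but only where |mE| < 8*QP (vqp). */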
449  const vector signed short v_1 = vec_splat_s16(1);
450  const vector signed short v_2 = vec_splat_s16(2);
451  const vector signed short v_5 = vec_splat_s16(5);
452  const vector signed short v_32 = vec_sl(v_1,
453  (vector unsigned short)v_5);
454  /* middle energy */
455  const vector signed short l3minusl6 = vec_sub(vb3, vb6);
456  const vector signed short l5minusl4 = vec_sub(vb5, vb4);
457  const vector signed short twotimes_l3minusl6 = vec_mladd(v_2, l3minusl6, (vector signed short)zero);
458  const vector signed short mE = vec_mladd(v_5, l5minusl4, twotimes_l3minusl6);
459  const vector signed short absmE = vec_abs(mE);
460  /* left & right energy */
461  const vector signed short l1minusl4 = vec_sub(vb1, vb4);
462  const vector signed short l3minusl2 = vec_sub(vb3, vb2);
463  const vector signed short l5minusl8 = vec_sub(vb5, vb8);
464  const vector signed short l7minusl6 = vec_sub(vb7, vb6);
465  const vector signed short twotimes_l1minusl4 = vec_mladd(v_2, l1minusl4, (vector signed short)zero);
466  const vector signed short twotimes_l5minusl8 = vec_mladd(v_2, l5minusl8, (vector signed short)zero);
467  const vector signed short lE = vec_mladd(v_5, l3minusl2, twotimes_l1minusl4);
468  const vector signed short rE = vec_mladd(v_5, l7minusl6, twotimes_l5minusl8);
469  /* d */
470  const vector signed short ddiff = vec_sub(absmE,
471  vec_min(vec_abs(lE),
472  vec_abs(rE)));
473  const vector signed short ddiffclamp = vec_max(ddiff, (vector signed short)zero);
474  const vector signed short dtimes64 = vec_mladd(v_5, ddiffclamp, v_32);
475  const vector signed short d = vec_sra(dtimes64, vec_splat_u16(6));
476  const vector signed short minusd = vec_sub((vector signed short)zero, d);
477  const vector signed short finald = vec_sel(minusd,
478  d,
479  vec_cmpgt(vec_sub((vector signed short)zero, mE),
480  (vector signed short)zero));
481  /* q */
482  const vector signed short qtimes2 = vec_sub(vb4, vb5);
483  /* for a right shift to behave like /2, we need to add one
484  to all negative integers */
485  const vector signed short rounddown = vec_sel((vector signed short)zero,
486  v_1,
487  vec_cmplt(qtimes2, (vector signed short)zero));
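/* e.g. -5 >> 1 == -3, but (-5 + 1) >> 1 == -2, which matches C's
 * truncating division -5 / 2 == -2. */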
488  const vector signed short q = vec_sra(vec_add(qtimes2, rounddown), vec_splat_u16(1));
489  /* clamp */
490  const vector signed short dclamp_P1 = vec_max((vector signed short)zero, finald);
491  const vector signed short dclamp_P = vec_min(dclamp_P1, q);
492  const vector signed short dclamp_N1 = vec_min((vector signed short)zero, finald);
493  const vector signed short dclamp_N = vec_max(dclamp_N1, q);
494 
495  const vector signed short dclampedfinal = vec_sel(dclamp_N,
496  dclamp_P,
497  vec_cmpgt(q, (vector signed short)zero));
498  const vector signed short dornotd = vec_sel((vector signed short)zero,
499  dclampedfinal,
500  vec_cmplt(absmE, vqp));
501  /* add/subtract to l4 and l5 */
502  const vector signed short vb4minusd = vec_sub(vb4, dornotd);
503  const vector signed short vb5plusd = vec_add(vb5, dornotd);
504  /* finally, stores */
505  const vector unsigned char st4 = vec_packsu(vb4minusd, (vector signed short)zero);
506  const vector unsigned char st5 = vec_packsu(vb5plusd, (vector signed short)zero);
507 
508  const vector signed char neg1 = vec_splat_s8(-1);
509  const vector unsigned char permHH = (const vector unsigned char){0x00, 0x01, 0x02, 0x03, 0x04, 0x05, 0x06, 0x07,
510  0x18, 0x19, 0x1A, 0x1B, 0x1C, 0x1D, 0x1E, 0x1F};
511 
512 #define STORE(i) \
513 { const vector unsigned char perms##i = \
514  vec_lvsr(i * stride, src2); \
515  const vector unsigned char vg##i = \
516  vec_perm(st##i, vbT##i, permHH); \
517  const vector unsigned char mask##i = \
518  vec_perm((vector unsigned char)zero, (vector unsigned char)neg1, perms##i); \
519  const vector unsigned char vg2##i = \
520  vec_perm(vg##i, vg##i, perms##i); \
521  const vector unsigned char svA##i = \
522  vec_sel(vbA##i, vg2##i, mask##i); \
523  const vector unsigned char svB##i = \
524  vec_sel(vg2##i, vbB##i, mask##i); \
525  vec_st(svA##i, i * stride, src2); \
526  vec_st(svB##i, i * stride + 16, src2);}
527 
528  STORE(4)
529  STORE(5)
530 }
531 
532 static inline void dering_altivec(uint8_t src[], int stride, PPContext *c) {
533  const vector signed int vsint32_8 = vec_splat_s32(8);
534  const vector unsigned int vuint32_4 = vec_splat_u32(4);
535  const vector signed char neg1 = vec_splat_s8(-1);
536 
537  const vector unsigned char permA1 = (vector unsigned char)
538  {0x00, 0x01, 0x02, 0x10, 0x11, 0x12, 0x1F, 0x1F,
539  0x1F, 0x1F, 0x1F, 0x1F, 0x1F, 0x1F, 0x1F, 0x1F};
540  const vector unsigned char permA2 = (vector unsigned char)
541  {0x00, 0x01, 0x02, 0x03, 0x04, 0x05, 0x10, 0x11,
542  0x12, 0x1F, 0x1F, 0x1F, 0x1F, 0x1F, 0x1F, 0x1F};
543  const vector unsigned char permA1inc = (vector unsigned char)
544  {0x01, 0x01, 0x01, 0x01, 0x01, 0x01, 0x00, 0x00,
545  0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00};
546  const vector unsigned char permA2inc = (vector unsigned char)
547  {0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x01, 0x01,
548  0x01, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00};
549  const vector unsigned char magic = (vector unsigned char)
550  {0x01, 0x02, 0x01, 0x02, 0x04, 0x02, 0x01, 0x02,
551  0x01, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00};
552  const vector unsigned char extractPerm = (vector unsigned char)
553  {0x10, 0x10, 0x10, 0x01, 0x10, 0x10, 0x10, 0x01,
554  0x10, 0x10, 0x10, 0x01, 0x10, 0x10, 0x10, 0x01};
555  const vector unsigned char extractPermInc = (vector unsigned char)
556  {0x00, 0x00, 0x00, 0x01, 0x00, 0x00, 0x00, 0x01,
557  0x00, 0x00, 0x00, 0x01, 0x00, 0x00, 0x00, 0x01};
558  const vector unsigned char identity = vec_lvsl(0,(unsigned char *)0);
559  const vector unsigned char tenRight = (vector unsigned char)
560  {0x00, 0x10, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
561  0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00};
562  const vector unsigned char eightLeft = (vector unsigned char)
563  {0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
564  0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x08};
565 
566  /*
567  This code makes no assumptions about src or stride.
568  One could avoid recomputing the perm vector by
569  assuming (stride % 16) == 0, but unfortunately
570  that is not always true. Quite a lot of loads/stores
571  could be removed by assuming proper alignment of
572  src & stride :-(
573  */
574  uint8_t *srcCopy = src;
575  DECLARE_ALIGNED(16, uint8_t, dt)[16] = { deringThreshold };
576  const vector signed int zero = vec_splat_s32(0);
577  vector unsigned char v_dt = vec_splat(vec_ld(0, dt), 0);
578 
579 #define LOAD_LINE(i) \
580  const vector unsigned char perm##i = \
581  vec_lvsl(i * stride, srcCopy); \
582  vector unsigned char sA##i = vec_ld(i * stride, srcCopy); \
583  vector unsigned char sB##i = vec_ld(i * stride + 16, srcCopy); \
584  vector unsigned char src##i = vec_perm(sA##i, sB##i, perm##i)
585 
586  LOAD_LINE(0);
587  LOAD_LINE(1);
588  LOAD_LINE(2);
589  LOAD_LINE(3);
590  LOAD_LINE(4);
591  LOAD_LINE(5);
592  LOAD_LINE(6);
593  LOAD_LINE(7);
594  LOAD_LINE(8);
595  LOAD_LINE(9);
596 #undef LOAD_LINE
597 
598  vector unsigned char v_avg;
599  DECLARE_ALIGNED(16, signed int, S)[8];
600  DECLARE_ALIGNED(16, int, tQP2)[4] = { c->QP/2 + 1 };
601  vector signed int vQP2 = vec_ld(0, tQP2);
602  vQP2 = vec_splat(vQP2, 0);
603 
604  {
605  const vector unsigned char trunc_perm = (vector unsigned char)
606  {0x01, 0x02, 0x03, 0x04, 0x05, 0x06, 0x07, 0x08,
607  0x11, 0x12, 0x13, 0x14, 0x15, 0x16, 0x17, 0x18};
608  const vector unsigned char trunc_src12 = vec_perm(src1, src2, trunc_perm);
609  const vector unsigned char trunc_src34 = vec_perm(src3, src4, trunc_perm);
610  const vector unsigned char trunc_src56 = vec_perm(src5, src6, trunc_perm);
611  const vector unsigned char trunc_src78 = vec_perm(src7, src8, trunc_perm);
612 
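/* EXTRACT(min) / EXTRACT(max) reduce the truncated 8x8 block
 * (trunc_src12..trunc_src78) to a single, splatted minimum / maximum byte
 * value via a tree of pairwise vec_min / vec_max and merge steps. */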
613 #define EXTRACT(op) do { \
614  const vector unsigned char s_1 = vec_##op(trunc_src12, trunc_src34); \
615  const vector unsigned char s_2 = vec_##op(trunc_src56, trunc_src78); \
616  const vector unsigned char s_6 = vec_##op(s_1, s_2); \
617  const vector unsigned char s_8h = vec_mergeh(s_6, s_6); \
618  const vector unsigned char s_8l = vec_mergel(s_6, s_6); \
619  const vector unsigned char s_9 = vec_##op(s_8h, s_8l); \
620  const vector unsigned char s_9h = vec_mergeh(s_9, s_9); \
621  const vector unsigned char s_9l = vec_mergel(s_9, s_9); \
622  const vector unsigned char s_10 = vec_##op(s_9h, s_9l); \
623  const vector unsigned char s_10h = vec_mergeh(s_10, s_10); \
624  const vector unsigned char s_10l = vec_mergel(s_10, s_10); \
625  const vector unsigned char s_11 = vec_##op(s_10h, s_10l); \
626  const vector unsigned char s_11h = vec_mergeh(s_11, s_11); \
627  const vector unsigned char s_11l = vec_mergel(s_11, s_11); \
628  v_##op = vec_##op(s_11h, s_11l); \
629 } while (0)
630 
631  vector unsigned char v_min;
632  vector unsigned char v_max;
633  EXTRACT(min);
634  EXTRACT(max);
635 #undef EXTRACT
636 
637  if (vec_all_lt(vec_sub(v_max, v_min), v_dt))
638  return;
639 
640  v_avg = vec_avg(v_min, v_max);
641  }
642 
643  {
644  const vector unsigned short mask1 = (vector unsigned short)
645  {0x0001, 0x0002, 0x0004, 0x0008,
646  0x0010, 0x0020, 0x0040, 0x0080};
647  const vector unsigned short mask2 = (vector unsigned short)
648  {0x0100, 0x0200, 0x0000, 0x0000,
649  0x0000, 0x0000, 0x0000, 0x0000};
650 
651  const vector unsigned int vuint32_16 = vec_sl(vec_splat_u32(1), vec_splat_u32(4));
652  const vector unsigned int vuint32_1 = vec_splat_u32(1);
653 
654  vector signed int sumA2;
655  vector signed int sumB2;
656  vector signed int sum0, sum1, sum2, sum3, sum4;
657  vector signed int sum5, sum6, sum7, sum8, sum9;
658 
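/* COMPARE(i) builds a bitmap for row i: bit l of sum##i is set when pixel l
 * is brighter than v_avg.  Since every lane of mask1/mask2 carries a
 * distinct power of two, summing the masked compare results with
 * vec_sum4s/vec_sums is equivalent to OR-ing the bits together. */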
659 #define COMPARE(i) \
660  do { \
661  const vector unsigned char cmp = \
662  (vector unsigned char)vec_cmpgt(src##i, v_avg); \
663  const vector unsigned short cmpHi = \
664  (vector unsigned short)vec_mergeh(cmp, cmp); \
665  const vector unsigned short cmpLi = \
666  (vector unsigned short)vec_mergel(cmp, cmp); \
667  const vector signed short cmpHf = \
668  (vector signed short)vec_and(cmpHi, mask1); \
669  const vector signed short cmpLf = \
670  (vector signed short)vec_and(cmpLi, mask2); \
671  const vector signed int sump = vec_sum4s(cmpHf, zero); \
672  const vector signed int sumq = vec_sum4s(cmpLf, sump); \
673  sum##i = vec_sums(sumq, zero); \
674  } while (0)
675 
676  COMPARE(0);
677  COMPARE(1);
678  COMPARE(2);
679  COMPARE(3);
680  COMPARE(4);
681  COMPARE(5);
682  COMPARE(6);
683  COMPARE(7);
684  COMPARE(8);
685  COMPARE(9);
686 #undef COMPARE
687 
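/* The block below folds the ten row bitmaps into S[]: t2 keeps the "> avg"
 * bits in its low half and their complement in the high half, ANDing t2
 * with its left/right shifts keeps only pixels whose horizontal neighbours
 * lie on the same side of the average, ANDing with the bitmaps of the two
 * following rows adds the vertical condition, and the final OR folds both
 * halves back together before storing to S[] for F2() below. */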
688  {
689  const vector signed int sump02 = vec_mergel(sum0, sum2);
690  const vector signed int sump13 = vec_mergel(sum1, sum3);
691  const vector signed int sumA = vec_mergel(sump02, sump13);
692 
693  const vector signed int sump46 = vec_mergel(sum4, sum6);
694  const vector signed int sump57 = vec_mergel(sum5, sum7);
695  const vector signed int sumB = vec_mergel(sump46, sump57);
696 
697  const vector signed int sump8A = vec_mergel(sum8, zero);
698  const vector signed int sump9B = vec_mergel(sum9, zero);
699  const vector signed int sumC = vec_mergel(sump8A, sump9B);
700 
701  const vector signed int tA = vec_sl(vec_nor(zero, sumA), vuint32_16);
702  const vector signed int tB = vec_sl(vec_nor(zero, sumB), vuint32_16);
703  const vector signed int tC = vec_sl(vec_nor(zero, sumC), vuint32_16);
704  const vector signed int t2A = vec_or(sumA, tA);
705  const vector signed int t2B = vec_or(sumB, tB);
706  const vector signed int t2C = vec_or(sumC, tC);
707  const vector signed int t3A = vec_and(vec_sra(t2A, vuint32_1),
708  vec_sl(t2A, vuint32_1));
709  const vector signed int t3B = vec_and(vec_sra(t2B, vuint32_1),
710  vec_sl(t2B, vuint32_1));
711  const vector signed int t3C = vec_and(vec_sra(t2C, vuint32_1),
712  vec_sl(t2C, vuint32_1));
713  const vector signed int yA = vec_and(t2A, t3A);
714  const vector signed int yB = vec_and(t2B, t3B);
715  const vector signed int yC = vec_and(t2C, t3C);
716 
717  const vector unsigned char strangeperm1 = vec_lvsl(4, (unsigned char*)0);
718  const vector unsigned char strangeperm2 = vec_lvsl(8, (unsigned char*)0);
719  const vector signed int sumAd4 = vec_perm(yA, yB, strangeperm1);
720  const vector signed int sumAd8 = vec_perm(yA, yB, strangeperm2);
721  const vector signed int sumBd4 = vec_perm(yB, yC, strangeperm1);
722  const vector signed int sumBd8 = vec_perm(yB, yC, strangeperm2);
723  const vector signed int sumAp = vec_and(yA,
724  vec_and(sumAd4,sumAd8));
725  const vector signed int sumBp = vec_and(yB,
726  vec_and(sumBd4,sumBd8));
727  sumA2 = vec_or(sumAp,
728  vec_sra(sumAp,
729  vuint32_16));
730  sumB2 = vec_or(sumBp,
731  vec_sra(sumBp,
732  vuint32_16));
733  }
734  vec_st(sumA2, 0, S);
735  vec_st(sumB2, 16, S);
736  }
737 
738  /* I'm not sure the following is actually faster
739  than straight, unvectorized C code :-( */
740 
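/* F2(i, j, k, l) filters one pixel of row j at a time, when the matching
 * bit of S[i] is set: the two permutes gather its 3x3 neighbourhood from
 * rows i, j, k, vec_msum() with the "magic" weights {1,2,1, 2,4,2, 1,2,1}
 * followed by the >>4 gives a rounded weighted average F, which is clamped
 * to the pixel value +/- (QP/2 + 1) (vQP2) and inserted back into row j via
 * the permute mask.  F_INIT() resets the permute vectors that F2() advances
 * after each pixel. */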
741 #define F_INIT() \
742  vector unsigned char tenRightM = tenRight; \
743  vector unsigned char permA1M = permA1; \
744  vector unsigned char permA2M = permA2; \
745  vector unsigned char extractPermM = extractPerm
746 
747 #define F2(i, j, k, l) \
748  if (S[i] & (1 << (l+1))) { \
749  const vector unsigned char a_A = vec_perm(src##i, src##j, permA1M); \
750  const vector unsigned char a_B = vec_perm(a_A, src##k, permA2M); \
751  const vector signed int a_sump = \
752  (vector signed int)vec_msum(a_B, magic, (vector unsigned int)zero);\
753  vector signed int F = vec_sr(vec_sums(a_sump, vsint32_8), vuint32_4); \
754  const vector signed int p = \
755  (vector signed int)vec_perm(src##j, (vector unsigned char)zero, \
756  extractPermM); \
757  const vector signed int sum = vec_add(p, vQP2); \
758  const vector signed int diff = vec_sub(p, vQP2); \
759  vector signed int newpm; \
760  vector unsigned char newpm2, mask; \
761  F = vec_splat(F, 3); \
762  if (vec_all_lt(sum, F)) \
763  newpm = sum; \
764  else if (vec_all_gt(diff, F)) \
765  newpm = diff; \
766  else newpm = F; \
767  newpm2 = vec_splat((vector unsigned char)newpm, 15); \
768  mask = vec_add(identity, tenRightM); \
769  src##j = vec_perm(src##j, newpm2, mask); \
770  } \
771  permA1M = vec_add(permA1M, permA1inc); \
772  permA2M = vec_add(permA2M, permA2inc); \
773  tenRightM = vec_sro(tenRightM, eightLeft); \
774  extractPermM = vec_add(extractPermM, extractPermInc)
775 
776 #define ITER(i, j, k) do { \
777  F_INIT(); \
778  F2(i, j, k, 0); \
779  F2(i, j, k, 1); \
780  F2(i, j, k, 2); \
781  F2(i, j, k, 3); \
782  F2(i, j, k, 4); \
783  F2(i, j, k, 5); \
784  F2(i, j, k, 6); \
785  F2(i, j, k, 7); \
786 } while (0)
787 
788  ITER(0, 1, 2);
789  ITER(1, 2, 3);
790  ITER(2, 3, 4);
791  ITER(3, 4, 5);
792  ITER(4, 5, 6);
793  ITER(5, 6, 7);
794  ITER(6, 7, 8);
795  ITER(7, 8, 9);
796 
797 #define STORE_LINE(i) do { \
798  const vector unsigned char permST = \
799  vec_lvsr(i * stride, srcCopy); \
800  const vector unsigned char maskST = \
801  vec_perm((vector unsigned char)zero, \
802  (vector unsigned char)neg1, permST); \
803  src##i = vec_perm(src##i ,src##i, permST); \
804  sA##i= vec_sel(sA##i, src##i, maskST); \
805  sB##i= vec_sel(src##i, sB##i, maskST); \
806  vec_st(sA##i, i * stride, srcCopy); \
807  vec_st(sB##i, i * stride + 16, srcCopy); \
808 } while (0)
809 
810  STORE_LINE(1);
811  STORE_LINE(2);
812  STORE_LINE(3);
813  STORE_LINE(4);
814  STORE_LINE(5);
815  STORE_LINE(6);
816  STORE_LINE(7);
817  STORE_LINE(8);
818 
819 #undef STORE_LINE
820 #undef ITER
821 #undef F2
822 }
823 
824 #define doHorizLowPass_altivec(a...) doHorizLowPass_C(a)
825 #define doHorizDefFilter_altivec(a...) doHorizDefFilter_C(a)
826 #define do_a_deblock_altivec(a...) do_a_deblock_C(a)
827 
828 static inline void tempNoiseReducer_altivec(uint8_t *src, int stride,
829  uint8_t *tempBlurred, uint32_t *tempBlurredPast, int *maxNoise)
830 {
831  const vector signed char neg1 = vec_splat_s8(-1);
832  const vector unsigned char permHH = (const vector unsigned char){0x00, 0x01, 0x02, 0x03, 0x04, 0x05, 0x06, 0x07,
833  0x18, 0x19, 0x1A, 0x1B, 0x1C, 0x1D, 0x1E, 0x1F};
834 
835  const vector signed int zero = vec_splat_s32(0);
836  const vector signed short vsint16_1 = vec_splat_s16(1);
837  vector signed int v_dp = zero;
838  vector signed int v_sysdp = zero;
839  int d, sysd, i;
840 
841 #define LOAD_LINE(src, i) \
842  register int j##src##i = i * stride; \
843  vector unsigned char perm##src##i = vec_lvsl(j##src##i, src); \
844  const vector unsigned char v_##src##A1##i = vec_ld(j##src##i, src); \
845  const vector unsigned char v_##src##A2##i = vec_ld(j##src##i + 16, src); \
846  const vector unsigned char v_##src##A##i = \
847  vec_perm(v_##src##A1##i, v_##src##A2##i, perm##src##i); \
848  vector signed short v_##src##Ass##i = \
849  (vector signed short)vec_mergeh((vector signed char)zero, \
850  (vector signed char)v_##src##A##i)
851 
852  LOAD_LINE(src, 0);
853  LOAD_LINE(src, 1);
854  LOAD_LINE(src, 2);
855  LOAD_LINE(src, 3);
856  LOAD_LINE(src, 4);
857  LOAD_LINE(src, 5);
858  LOAD_LINE(src, 6);
859  LOAD_LINE(src, 7);
860 
861  LOAD_LINE(tempBlurred, 0);
862  LOAD_LINE(tempBlurred, 1);
863  LOAD_LINE(tempBlurred, 2);
864  LOAD_LINE(tempBlurred, 3);
865  LOAD_LINE(tempBlurred, 4);
866  LOAD_LINE(tempBlurred, 5);
867  LOAD_LINE(tempBlurred, 6);
868  LOAD_LINE(tempBlurred, 7);
869 #undef LOAD_LINE
870 
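/* ACCUMULATE_DIFFS(i) accumulates, over the 8x8 block, the sum of squared
 * differences between src and tempBlurred into v_dp, and the sum of the
 * signed differences into v_sysdp (vec_msums with a vector of ones). */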
871 #define ACCUMULATE_DIFFS(i) do { \
872  vector signed short v_d = vec_sub(v_tempBlurredAss##i, \
873  v_srcAss##i); \
874  v_dp = vec_msums(v_d, v_d, v_dp); \
875  v_sysdp = vec_msums(v_d, vsint16_1, v_sysdp); \
876  } while (0)
877 
878  ACCUMULATE_DIFFS(0);
879  ACCUMULATE_DIFFS(1);
880  ACCUMULATE_DIFFS(2);
881  ACCUMULATE_DIFFS(3);
882  ACCUMULATE_DIFFS(4);
883  ACCUMULATE_DIFFS(5);
884  ACCUMULATE_DIFFS(6);
885  ACCUMULATE_DIFFS(7);
886 #undef ACCUMULATE_DIFFS
887 
888  tempBlurredPast[127]= maxNoise[0];
889  tempBlurredPast[128]= maxNoise[1];
890  tempBlurredPast[129]= maxNoise[2];
891 
892  v_dp = vec_sums(v_dp, zero);
893  v_sysdp = vec_sums(v_sysdp, zero);
894 
895  v_dp = vec_splat(v_dp, 3);
896  v_sysdp = vec_splat(v_sysdp, 3);
897 
898  vec_ste(v_dp, 0, &d);
899  vec_ste(v_sysdp, 0, &sysd);
900 
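/* d now holds the block's sum of squared differences; it is blended
 * (weights 4:1:1:1:1) with the values stored for the four neighbouring
 * blocks in tempBlurredPast (offsets +/-1 and +/-256), while the raw value
 * is written back for use by the neighbours in the next pass. */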
901  i = d;
902  d = (4*d
903  +(*(tempBlurredPast-256))
904  +(*(tempBlurredPast-1))+ (*(tempBlurredPast+1))
905  +(*(tempBlurredPast+256))
906  +4)>>3;
907 
908  *tempBlurredPast=i;
909 
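/* Blend strategy, driven by the noise thresholds: at or above maxNoise[2]
 * the block is considered completely different and tempBlurred is replaced
 * by src; between maxNoise[1] and maxNoise[2] the two are averaged 1:1;
 * below maxNoise[0] tempBlurred dominates 7:1; otherwise they are mixed
 * 3:1. */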
910  if (d > maxNoise[1]) {
911  if (d < maxNoise[2]) {
912 #define OP(i) v_tempBlurredAss##i = vec_avg(v_tempBlurredAss##i, v_srcAss##i);
913 
914  OP(0);
915  OP(1);
916  OP(2);
917  OP(3);
918  OP(4);
919  OP(5);
920  OP(6);
921  OP(7);
922 #undef OP
923  } else {
924 #define OP(i) v_tempBlurredAss##i = v_srcAss##i;
925 
926  OP(0);
927  OP(1);
928  OP(2);
929  OP(3);
930  OP(4);
931  OP(5);
932  OP(6);
933  OP(7);
934 #undef OP
935  }
936  } else {
937  if (d < maxNoise[0]) {
938  const vector signed short vsint16_7 = vec_splat_s16(7);
939  const vector signed short vsint16_4 = vec_splat_s16(4);
940  const vector unsigned short vuint16_3 = vec_splat_u16(3);
941 
942 #define OP(i) do { \
943  const vector signed short v_temp = \
944  vec_mladd(v_tempBlurredAss##i, vsint16_7, v_srcAss##i); \
945  const vector signed short v_temp2 = vec_add(v_temp, vsint16_4); \
946  v_tempBlurredAss##i = vec_sr(v_temp2, vuint16_3); \
947  } while (0)
948 
949  OP(0);
950  OP(1);
951  OP(2);
952  OP(3);
953  OP(4);
954  OP(5);
955  OP(6);
956  OP(7);
957 #undef OP
958  } else {
959  const vector signed short vsint16_3 = vec_splat_s16(3);
960  const vector signed short vsint16_2 = vec_splat_s16(2);
961 
962 #define OP(i) do { \
963  const vector signed short v_temp = \
964  vec_mladd(v_tempBlurredAss##i, vsint16_3, v_srcAss##i); \
965  const vector signed short v_temp2 = vec_add(v_temp, vsint16_2); \
966  v_tempBlurredAss##i = \
967  vec_sr(v_temp2, (vector unsigned short)vsint16_2); \
968  } while (0)
969 
970  OP(0);
971  OP(1);
972  OP(2);
973  OP(3);
974  OP(4);
975  OP(5);
976  OP(6);
977  OP(7);
978 #undef OP
979  }
980  }
981 
982 #define PACK_AND_STORE(src, i) do { \
983  const vector unsigned char perms = vec_lvsr(i * stride, src); \
984  const vector unsigned char vf = \
985  vec_packsu(v_tempBlurredAss##i, (vector signed short)zero); \
986  const vector unsigned char vg = vec_perm(vf, v_##src##A##i, permHH); \
987  const vector unsigned char mask = \
988  vec_perm((vector unsigned char)zero, (vector unsigned char)neg1, perms); \
989  const vector unsigned char vg2 = vec_perm(vg, vg, perms); \
990  const vector unsigned char svA = vec_sel(v_##src##A1##i, vg2, mask); \
991  const vector unsigned char svB = vec_sel(vg2, v_##src##A2##i, mask); \
992  vec_st(svA, i * stride, src); \
993  vec_st(svB, i * stride + 16, src); \
994 } while (0)
995 
996  PACK_AND_STORE(src, 0);
997  PACK_AND_STORE(src, 1);
998  PACK_AND_STORE(src, 2);
999  PACK_AND_STORE(src, 3);
1000  PACK_AND_STORE(src, 4);
1001  PACK_AND_STORE(src, 5);
1002  PACK_AND_STORE(src, 6);
1003  PACK_AND_STORE(src, 7);
1004  PACK_AND_STORE(tempBlurred, 0);
1005  PACK_AND_STORE(tempBlurred, 1);
1006  PACK_AND_STORE(tempBlurred, 2);
1007  PACK_AND_STORE(tempBlurred, 3);
1008  PACK_AND_STORE(tempBlurred, 4);
1009  PACK_AND_STORE(tempBlurred, 5);
1010  PACK_AND_STORE(tempBlurred, 6);
1011  PACK_AND_STORE(tempBlurred, 7);
1012 #undef PACK_AND_STORE
1013 }
1014 
1015 static inline void transpose_16x8_char_toPackedAlign_altivec(unsigned char* dst, unsigned char* src, int stride) {
1016  const vector unsigned char zero = vec_splat_u8(0);
1017 
1018 #define LOAD_DOUBLE_LINE(i, j) \
1019  vector unsigned char perm1##i = vec_lvsl(i * stride, src); \
1020  vector unsigned char perm2##i = vec_lvsl(j * stride, src); \
1021  vector unsigned char srcA##i = vec_ld(i * stride, src); \
1022  vector unsigned char srcB##i = vec_ld(i * stride + 16, src); \
1023  vector unsigned char srcC##i = vec_ld(j * stride, src); \
1024  vector unsigned char srcD##i = vec_ld(j * stride+ 16, src); \
1025  vector unsigned char src##i = vec_perm(srcA##i, srcB##i, perm1##i); \
1026  vector unsigned char src##j = vec_perm(srcC##i, srcD##i, perm2##i)
1027 
1028  LOAD_DOUBLE_LINE(0, 1);
1029  LOAD_DOUBLE_LINE(2, 3);
1030  LOAD_DOUBLE_LINE(4, 5);
1031  LOAD_DOUBLE_LINE(6, 7);
1032 #undef LOAD_DOUBLE_LINE
1033 
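/* Each of the eight source rows is first interleaved with zero bytes, and
 * three further rounds of vec_mergeh/vec_mergel complete the transpose;
 * the result is stored as sixteen consecutive, 16-byte aligned vectors in
 * dst, one per source column. */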
1034  vector unsigned char tempA = vec_mergeh(src0, zero);
1035  vector unsigned char tempB = vec_mergel(src0, zero);
1036  vector unsigned char tempC = vec_mergeh(src1, zero);
1037  vector unsigned char tempD = vec_mergel(src1, zero);
1038  vector unsigned char tempE = vec_mergeh(src2, zero);
1039  vector unsigned char tempF = vec_mergel(src2, zero);
1040  vector unsigned char tempG = vec_mergeh(src3, zero);
1041  vector unsigned char tempH = vec_mergel(src3, zero);
1042  vector unsigned char tempI = vec_mergeh(src4, zero);
1043  vector unsigned char tempJ = vec_mergel(src4, zero);
1044  vector unsigned char tempK = vec_mergeh(src5, zero);
1045  vector unsigned char tempL = vec_mergel(src5, zero);
1046  vector unsigned char tempM = vec_mergeh(src6, zero);
1047  vector unsigned char tempN = vec_mergel(src6, zero);
1048  vector unsigned char tempO = vec_mergeh(src7, zero);
1049  vector unsigned char tempP = vec_mergel(src7, zero);
1050 
1051  vector unsigned char temp0 = vec_mergeh(tempA, tempI);
1052  vector unsigned char temp1 = vec_mergel(tempA, tempI);
1053  vector unsigned char temp2 = vec_mergeh(tempB, tempJ);
1054  vector unsigned char temp3 = vec_mergel(tempB, tempJ);
1055  vector unsigned char temp4 = vec_mergeh(tempC, tempK);
1056  vector unsigned char temp5 = vec_mergel(tempC, tempK);
1057  vector unsigned char temp6 = vec_mergeh(tempD, tempL);
1058  vector unsigned char temp7 = vec_mergel(tempD, tempL);
1059  vector unsigned char temp8 = vec_mergeh(tempE, tempM);
1060  vector unsigned char temp9 = vec_mergel(tempE, tempM);
1061  vector unsigned char temp10 = vec_mergeh(tempF, tempN);
1062  vector unsigned char temp11 = vec_mergel(tempF, tempN);
1063  vector unsigned char temp12 = vec_mergeh(tempG, tempO);
1064  vector unsigned char temp13 = vec_mergel(tempG, tempO);
1065  vector unsigned char temp14 = vec_mergeh(tempH, tempP);
1066  vector unsigned char temp15 = vec_mergel(tempH, tempP);
1067 
1068  tempA = vec_mergeh(temp0, temp8);
1069  tempB = vec_mergel(temp0, temp8);
1070  tempC = vec_mergeh(temp1, temp9);
1071  tempD = vec_mergel(temp1, temp9);
1072  tempE = vec_mergeh(temp2, temp10);
1073  tempF = vec_mergel(temp2, temp10);
1074  tempG = vec_mergeh(temp3, temp11);
1075  tempH = vec_mergel(temp3, temp11);
1076  tempI = vec_mergeh(temp4, temp12);
1077  tempJ = vec_mergel(temp4, temp12);
1078  tempK = vec_mergeh(temp5, temp13);
1079  tempL = vec_mergel(temp5, temp13);
1080  tempM = vec_mergeh(temp6, temp14);
1081  tempN = vec_mergel(temp6, temp14);
1082  tempO = vec_mergeh(temp7, temp15);
1083  tempP = vec_mergel(temp7, temp15);
1084 
1085  temp0 = vec_mergeh(tempA, tempI);
1086  temp1 = vec_mergel(tempA, tempI);
1087  temp2 = vec_mergeh(tempB, tempJ);
1088  temp3 = vec_mergel(tempB, tempJ);
1089  temp4 = vec_mergeh(tempC, tempK);
1090  temp5 = vec_mergel(tempC, tempK);
1091  temp6 = vec_mergeh(tempD, tempL);
1092  temp7 = vec_mergel(tempD, tempL);
1093  temp8 = vec_mergeh(tempE, tempM);
1094  temp9 = vec_mergel(tempE, tempM);
1095  temp10 = vec_mergeh(tempF, tempN);
1096  temp11 = vec_mergel(tempF, tempN);
1097  temp12 = vec_mergeh(tempG, tempO);
1098  temp13 = vec_mergel(tempG, tempO);
1099  temp14 = vec_mergeh(tempH, tempP);
1100  temp15 = vec_mergel(tempH, tempP);
1101 
1102  vec_st(temp0, 0, dst);
1103  vec_st(temp1, 16, dst);
1104  vec_st(temp2, 32, dst);
1105  vec_st(temp3, 48, dst);
1106  vec_st(temp4, 64, dst);
1107  vec_st(temp5, 80, dst);
1108  vec_st(temp6, 96, dst);
1109  vec_st(temp7, 112, dst);
1110  vec_st(temp8, 128, dst);
1111  vec_st(temp9, 144, dst);
1112  vec_st(temp10, 160, dst);
1113  vec_st(temp11, 176, dst);
1114  vec_st(temp12, 192, dst);
1115  vec_st(temp13, 208, dst);
1116  vec_st(temp14, 224, dst);
1117  vec_st(temp15, 240, dst);
1118 }
1119 
1120 static inline void transpose_8x16_char_fromPackedAlign_altivec(unsigned char* dst, unsigned char* src, int stride) {
1121  const vector unsigned char zero = vec_splat_u8(0);
1122  const vector signed char neg1 = vec_splat_s8(-1);
1123 
1124 #define LOAD_DOUBLE_LINE(i, j) \
1125  vector unsigned char src##i = vec_ld(i * 16, src); \
1126  vector unsigned char src##j = vec_ld(j * 16, src)
1127 
1128  LOAD_DOUBLE_LINE(0, 1);
1129  LOAD_DOUBLE_LINE(2, 3);
1130  LOAD_DOUBLE_LINE(4, 5);
1131  LOAD_DOUBLE_LINE(6, 7);
1132  LOAD_DOUBLE_LINE(8, 9);
1133  LOAD_DOUBLE_LINE(10, 11);
1134  LOAD_DOUBLE_LINE(12, 13);
1135  LOAD_DOUBLE_LINE(14, 15);
1136 #undef LOAD_DOUBLE_LINE
1137 
1138  vector unsigned char tempA = vec_mergeh(src0, src8);
1139  vector unsigned char tempB;
1140  vector unsigned char tempC = vec_mergeh(src1, src9);
1141  vector unsigned char tempD;
1142  vector unsigned char tempE = vec_mergeh(src2, src10);
1143  vector unsigned char tempG = vec_mergeh(src3, src11);
1144  vector unsigned char tempI = vec_mergeh(src4, src12);
1145  vector unsigned char tempJ;
1146  vector unsigned char tempK = vec_mergeh(src5, src13);
1147  vector unsigned char tempL;
1148  vector unsigned char tempM = vec_mergeh(src6, src14);
1149  vector unsigned char tempO = vec_mergeh(src7, src15);
1150 
1151  vector unsigned char temp0 = vec_mergeh(tempA, tempI);
1152  vector unsigned char temp1 = vec_mergel(tempA, tempI);
1153  vector unsigned char temp2;
1154  vector unsigned char temp3;
1155  vector unsigned char temp4 = vec_mergeh(tempC, tempK);
1156  vector unsigned char temp5 = vec_mergel(tempC, tempK);
1157  vector unsigned char temp6;
1158  vector unsigned char temp7;
1159  vector unsigned char temp8 = vec_mergeh(tempE, tempM);
1160  vector unsigned char temp9 = vec_mergel(tempE, tempM);
1161  vector unsigned char temp12 = vec_mergeh(tempG, tempO);
1162  vector unsigned char temp13 = vec_mergel(tempG, tempO);
1163 
1164  tempA = vec_mergeh(temp0, temp8);
1165  tempB = vec_mergel(temp0, temp8);
1166  tempC = vec_mergeh(temp1, temp9);
1167  tempD = vec_mergel(temp1, temp9);
1168  tempI = vec_mergeh(temp4, temp12);
1169  tempJ = vec_mergel(temp4, temp12);
1170  tempK = vec_mergeh(temp5, temp13);
1171  tempL = vec_mergel(temp5, temp13);
1172 
1173  temp0 = vec_mergeh(tempA, tempI);
1174  temp1 = vec_mergel(tempA, tempI);
1175  temp2 = vec_mergeh(tempB, tempJ);
1176  temp3 = vec_mergel(tempB, tempJ);
1177  temp4 = vec_mergeh(tempC, tempK);
1178  temp5 = vec_mergel(tempC, tempK);
1179  temp6 = vec_mergeh(tempD, tempL);
1180  temp7 = vec_mergel(tempD, tempL);
1181 
1182 
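/* STORE_DOUBLE_LINE(i, j) writes two transposed rows back to the possibly
 * misaligned dst: vec_lvsr() rotates the data into place, and vec_sel()
 * against the previously loaded destination vectors (dstA/dstB) merges the
 * 16 new bytes without disturbing the bytes around them. */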
1183 #define STORE_DOUBLE_LINE(i, j) do { \
1184  vector unsigned char dstAi = vec_ld(i * stride, dst); \
1185  vector unsigned char dstBi = vec_ld(i * stride + 16, dst); \
1186  vector unsigned char dstAj = vec_ld(j * stride, dst); \
1187  vector unsigned char dstBj = vec_ld(j * stride+ 16, dst); \
1188  vector unsigned char aligni = vec_lvsr(i * stride, dst); \
1189  vector unsigned char alignj = vec_lvsr(j * stride, dst); \
1190  vector unsigned char maski = \
1191  vec_perm(zero, (vector unsigned char)neg1, aligni); \
1192  vector unsigned char maskj = \
1193  vec_perm(zero, (vector unsigned char)neg1, alignj); \
1194  vector unsigned char dstRi = vec_perm(temp##i, temp##i, aligni); \
1195  vector unsigned char dstRj = vec_perm(temp##j, temp##j, alignj); \
1196  vector unsigned char dstAFi = vec_sel(dstAi, dstRi, maski); \
1197  vector unsigned char dstBFi = vec_sel(dstRi, dstBi, maski); \
1198  vector unsigned char dstAFj = vec_sel(dstAj, dstRj, maskj); \
1199  vector unsigned char dstBFj = vec_sel(dstRj, dstBj, maskj); \
1200  vec_st(dstAFi, i * stride, dst); \
1201  vec_st(dstBFi, i * stride + 16, dst); \
1202  vec_st(dstAFj, j * stride, dst); \
1203  vec_st(dstBFj, j * stride + 16, dst); \
1204 } while (0)
1205 
1206  STORE_DOUBLE_LINE(0,1);
1207  STORE_DOUBLE_LINE(2,3);
1208  STORE_DOUBLE_LINE(4,5);
1209  STORE_DOUBLE_LINE(6,7);
1210 }