36 const vector
unsigned char zero = (
const vector
unsigned char)vec_splat_u8(0);
37 vector
unsigned char perm1 = vec_lvsl(0, pix2);
38 vector
unsigned char perm2 = vec_add(perm1, vec_splat_u8(1));
39 vector
unsigned char pix2l, pix2r;
40 vector
unsigned char pix1v, pix2v, pix2iv, avgv,
t5;
41 vector
unsigned int sad;
42 vector
signed int sumdiffs;
45 sad = (vector
unsigned int)vec_splat_u32(0);
46 for (i = 0; i < h; i++) {
50 pix1v = vec_ld( 0, pix1);
51 pix2l = vec_ld( 0, pix2);
52 pix2r = vec_ld(16, pix2);
53 pix2v = vec_perm(pix2l, pix2r, perm1);
54 pix2iv = vec_perm(pix2l, pix2r, perm2);
57 avgv = vec_avg(pix2v, pix2iv);
60 t5 = vec_sub(vec_max(pix1v, avgv), vec_min(pix1v, avgv));
63 sad = vec_sum4s(t5, sad);
69 sumdiffs = vec_sums((vector
signed int) sad, (vector
signed int) zero);
70 sumdiffs = vec_splat(sumdiffs, 3);
71 vec_ste(sumdiffs, 0, &s);
80 const vector
unsigned char zero = (
const vector
unsigned char)vec_splat_u8(0);
81 vector
unsigned char perm = vec_lvsl(0, pix2);
82 vector
unsigned char pix2l, pix2r;
83 vector
unsigned char pix1v, pix2v, pix3v, avgv,
t5;
84 vector
unsigned int sad;
85 vector
signed int sumdiffs;
86 uint8_t *pix3 = pix2 + line_size;
89 sad = (vector
unsigned int)vec_splat_u32(0);
98 pix2l = vec_ld( 0, pix2);
99 pix2r = vec_ld(15, pix2);
100 pix2v = vec_perm(pix2l, pix2r, perm);
102 for (i = 0; i < h; i++) {
106 pix1v = vec_ld(0, pix1);
108 pix2l = vec_ld( 0, pix3);
109 pix2r = vec_ld(15, pix3);
110 pix3v = vec_perm(pix2l, pix2r, perm);
113 avgv = vec_avg(pix2v, pix3v);
116 t5 = vec_sub(vec_max(pix1v, avgv), vec_min(pix1v, avgv));
119 sad = vec_sum4s(t5, sad);
128 sumdiffs = vec_sums((vector
signed int) sad, (vector
signed int) zero);
129 sumdiffs = vec_splat(sumdiffs, 3);
130 vec_ste(sumdiffs, 0, &s);
138 uint8_t *pix3 = pix2 + line_size;
139 const vector
unsigned char zero = (
const vector
unsigned char)vec_splat_u8(0);
140 const vector
unsigned short two = (
const vector
unsigned short)vec_splat_u16(2);
141 vector
unsigned char avgv,
t5;
142 vector
unsigned char perm1 = vec_lvsl(0, pix2);
143 vector
unsigned char perm2 = vec_add(perm1, vec_splat_u8(1));
144 vector
unsigned char pix2l, pix2r;
145 vector
unsigned char pix1v, pix2v, pix3v, pix2iv, pix3iv;
146 vector
unsigned short pix2lv, pix2hv, pix2ilv, pix2ihv;
147 vector
unsigned short pix3lv, pix3hv, pix3ilv, pix3ihv;
148 vector
unsigned short avghv, avglv;
149 vector
unsigned short t1,
t2,
t3,
t4;
150 vector
unsigned int sad;
151 vector
signed int sumdiffs;
153 sad = (vector
unsigned int)vec_splat_u32(0);
164 pix2l = vec_ld( 0, pix2);
165 pix2r = vec_ld(16, pix2);
166 pix2v = vec_perm(pix2l, pix2r, perm1);
167 pix2iv = vec_perm(pix2l, pix2r, perm2);
169 pix2hv = (vector
unsigned short) vec_mergeh(zero, pix2v);
170 pix2lv = (vector
unsigned short) vec_mergel(zero, pix2v);
171 pix2ihv = (vector
unsigned short) vec_mergeh(zero, pix2iv);
172 pix2ilv = (vector
unsigned short) vec_mergel(zero, pix2iv);
173 t1 = vec_add(pix2hv, pix2ihv);
174 t2 = vec_add(pix2lv, pix2ilv);
176 for (i = 0; i < h; i++) {
180 pix1v = vec_ld(0, pix1);
182 pix2l = vec_ld( 0, pix3);
183 pix2r = vec_ld(16, pix3);
184 pix3v = vec_perm(pix2l, pix2r, perm1);
185 pix3iv = vec_perm(pix2l, pix2r, perm2);
194 pix3hv = (vector
unsigned short) vec_mergeh(zero, pix3v);
195 pix3lv = (vector
unsigned short) vec_mergel(zero, pix3v);
196 pix3ihv = (vector
unsigned short) vec_mergeh(zero, pix3iv);
197 pix3ilv = (vector
unsigned short) vec_mergel(zero, pix3iv);
200 t3 = vec_add(pix3hv, pix3ihv);
201 t4 = vec_add(pix3lv, pix3ilv);
203 avghv = vec_sr(vec_add(vec_add(t1, t3), two), two);
204 avglv = vec_sr(vec_add(vec_add(t2, t4), two), two);
207 avgv = vec_pack(avghv, avglv);
210 t5 = vec_sub(vec_max(pix1v, avgv), vec_min(pix1v, avgv));
213 sad = vec_sum4s(t5, sad);
222 sumdiffs = vec_sums((vector
signed int) sad, (vector
signed int) zero);
223 sumdiffs = vec_splat(sumdiffs, 3);
224 vec_ste(sumdiffs, 0, &s);
233 const vector
unsigned int zero = (
const vector
unsigned int)vec_splat_u32(0);
234 vector
unsigned char perm = vec_lvsl(0, pix2);
236 vector
unsigned int sad;
237 vector
signed int sumdiffs;
239 sad = (vector
unsigned int)vec_splat_u32(0);
242 for (i = 0; i < h; i++) {
244 vector
unsigned char pix2l = vec_ld( 0, pix2);
245 vector
unsigned char pix2r = vec_ld(15, pix2);
246 t1 = vec_ld(0, pix1);
247 t2 = vec_perm(pix2l, pix2r, perm);
250 t3 = vec_max(t1, t2);
251 t4 = vec_min(t1, t2);
252 t5 = vec_sub(t3, t4);
255 sad = vec_sum4s(t5, sad);
262 sumdiffs = vec_sums((vector
signed int) sad, (vector
signed int) zero);
263 sumdiffs = vec_splat(sumdiffs, 3);
264 vec_ste(sumdiffs, 0, &s);
273 const vector
unsigned int zero = (
const vector
unsigned int)vec_splat_u32(0);
274 const vector
unsigned char permclear = (vector
unsigned char){255,255,255,255,255,255,255,255,0,0,0,0,0,0,0,0};
275 vector
unsigned char perm1 = vec_lvsl(0, pix1);
276 vector
unsigned char perm2 = vec_lvsl(0, pix2);
278 vector
unsigned int sad;
279 vector
signed int sumdiffs;
281 sad = (vector
unsigned int)vec_splat_u32(0);
283 for (i = 0; i < h; i++) {
287 vector
unsigned char pix1l = vec_ld( 0, pix1);
288 vector
unsigned char pix1r = vec_ld(15, pix1);
289 vector
unsigned char pix2l = vec_ld( 0, pix2);
290 vector
unsigned char pix2r = vec_ld(15, pix2);
291 t1 = vec_and(vec_perm(pix1l, pix1r, perm1), permclear);
292 t2 = vec_and(vec_perm(pix2l, pix2r, perm2), permclear);
295 t3 = vec_max(t1, t2);
296 t4 = vec_min(t1, t2);
297 t5 = vec_sub(t3, t4);
300 sad = vec_sum4s(t5, sad);
307 sumdiffs = vec_sums((vector
signed int) sad, (vector
signed int) zero);
308 sumdiffs = vec_splat(sumdiffs, 3);
309 vec_ste(sumdiffs, 0, &s);
318 const vector
unsigned int zero = (
const vector
unsigned int)vec_splat_u32(0);
319 vector
unsigned char perm = vec_lvsl(0, pix);
320 vector
unsigned char pixv;
321 vector
unsigned int sv;
322 vector
signed int sum;
324 sv = (vector
unsigned int)vec_splat_u32(0);
327 for (i = 0; i < 16; i++) {
329 vector
unsigned char pixl = vec_ld( 0, pix);
330 vector
unsigned char pixr = vec_ld(15, pix);
331 pixv = vec_perm(pixl, pixr, perm);
334 sv = vec_msum(pixv, pixv, sv);
339 sum = vec_sums((vector
signed int) sv, (vector
signed int) zero);
340 sum = vec_splat(sum, 3);
355 const vector
unsigned int zero = (
const vector
unsigned int)vec_splat_u32(0);
356 const vector
unsigned char permclear = (vector
unsigned char){255,255,255,255,255,255,255,255,0,0,0,0,0,0,0,0};
357 vector
unsigned char perm1 = vec_lvsl(0, pix1);
358 vector
unsigned char perm2 = vec_lvsl(0, pix2);
360 vector
unsigned int sum;
361 vector
signed int sumsqr;
363 sum = (vector
unsigned int)vec_splat_u32(0);
365 for (i = 0; i < h; i++) {
369 vector
unsigned char pix1l = vec_ld( 0, pix1);
370 vector
unsigned char pix1r = vec_ld(15, pix1);
371 vector
unsigned char pix2l = vec_ld( 0, pix2);
372 vector
unsigned char pix2r = vec_ld(15, pix2);
373 t1 = vec_and(vec_perm(pix1l, pix1r, perm1), permclear);
374 t2 = vec_and(vec_perm(pix2l, pix2r, perm2), permclear);
380 t3 = vec_max(t1, t2);
381 t4 = vec_min(t1, t2);
382 t5 = vec_sub(t3, t4);
385 sum = vec_msum(t5, t5, sum);
392 sumsqr = vec_sums((vector
signed int) sum, (vector
signed int) zero);
393 sumsqr = vec_splat(sumsqr, 3);
394 vec_ste(sumsqr, 0, &s);
408 const vector
unsigned int zero = (
const vector
unsigned int)vec_splat_u32(0);
409 vector
unsigned char perm = vec_lvsl(0, pix2);
411 vector
unsigned int sum;
412 vector
signed int sumsqr;
414 sum = (vector
unsigned int)vec_splat_u32(0);
416 for (i = 0; i < h; i++) {
418 vector
unsigned char pix2l = vec_ld( 0, pix2);
419 vector
unsigned char pix2r = vec_ld(15, pix2);
420 t1 = vec_ld(0, pix1);
421 t2 = vec_perm(pix2l, pix2r, perm);
427 t3 = vec_max(t1, t2);
428 t4 = vec_min(t1, t2);
429 t5 = vec_sub(t3, t4);
432 sum = vec_msum(t5, t5, sum);
439 sumsqr = vec_sums((vector
signed int) sum, (vector
signed int) zero);
440 sumsqr = vec_splat(sumsqr, 3);
441 vec_ste(sumsqr, 0, &s);
448 const vector
unsigned int zero = (
const vector
unsigned int)vec_splat_u32(0);
449 vector
unsigned char perm = vec_lvsl(0, pix);
450 vector
unsigned char t1;
451 vector
unsigned int sad;
452 vector
signed int sumdiffs;
457 sad = (vector
unsigned int)vec_splat_u32(0);
459 for (i = 0; i < 16; i++) {
461 vector
unsigned char pixl = vec_ld( 0, pix);
462 vector
unsigned char pixr = vec_ld(15, pix);
463 t1 = vec_perm(pixl, pixr, perm);
466 sad = vec_sum4s(t1, sad);
472 sumdiffs = vec_sums((vector
signed int) sad, (vector
signed int) zero);
473 sumdiffs = vec_splat(sumdiffs, 3);
474 vec_ste(sumdiffs, 0, &s);
482 vector
unsigned char perm = vec_lvsl(0, pixels);
483 vector
unsigned char bytes;
484 const vector
unsigned char zero = (
const vector
unsigned char)vec_splat_u8(0);
485 vector
signed short shorts;
487 for (i = 0; i < 8; i++) {
491 vector
unsigned char pixl = vec_ld( 0, pixels);
492 vector
unsigned char pixr = vec_ld(15, pixels);
493 bytes = vec_perm(pixl, pixr, perm);
496 shorts = (vector
signed short)vec_mergeh(zero, bytes);
499 vec_st(shorts, i*16, (vector
signed short*)block);
509 vector
unsigned char perm1 = vec_lvsl(0, s1);
510 vector
unsigned char perm2 = vec_lvsl(0, s2);
511 vector
unsigned char bytes, pixl, pixr;
512 const vector
unsigned char zero = (
const vector
unsigned char)vec_splat_u8(0);
513 vector
signed short shorts1, shorts2;
515 for (i = 0; i < 4; i++) {
519 pixl = vec_ld( 0, s1);
520 pixr = vec_ld(15, s1);
521 bytes = vec_perm(pixl, pixr, perm1);
524 shorts1 = (vector
signed short)vec_mergeh(zero, bytes);
527 pixl = vec_ld( 0, s2);
528 pixr = vec_ld(15, s2);
529 bytes = vec_perm(pixl, pixr, perm2);
532 shorts2 = (vector
signed short)vec_mergeh(zero, bytes);
535 shorts1 = vec_sub(shorts1, shorts2);
538 vec_st(shorts1, 0, (vector
signed short*)block);
551 pixl = vec_ld( 0, s1);
552 pixr = vec_ld(15, s1);
553 bytes = vec_perm(pixl, pixr, perm1);
556 shorts1 = (vector
signed short)vec_mergeh(zero, bytes);
559 pixl = vec_ld( 0, s2);
560 pixr = vec_ld(15, s2);
561 bytes = vec_perm(pixl, pixr, perm2);
564 shorts2 = (vector
signed short)vec_mergeh(zero, bytes);
567 shorts1 = vec_sub(shorts1, shorts2);
570 vec_st(shorts1, 0, (vector
signed short*)block);
594 register vector
unsigned char vdst, vsrc;
597 for (i = 0 ; (i + 15) < w ; i+=16) {
598 vdst = vec_ld(i, (
unsigned char*)dst);
599 vsrc = vec_ld(i, (
unsigned char*)src);
600 vdst = vec_add(vsrc, vdst);
601 vec_st(vdst, i, (
unsigned char*)dst);
604 for (; (i < w) ; i++) {
612 register vector
unsigned char pixelsv1, pixelsv2;
613 register vector
unsigned char pixelsv1B, pixelsv2B;
614 register vector
unsigned char pixelsv1C, pixelsv2C;
615 register vector
unsigned char pixelsv1D, pixelsv2D;
617 register vector
unsigned char perm = vec_lvsl(0, pixels);
619 register int line_size_2 = line_size << 1;
620 register int line_size_3 = line_size + line_size_2;
621 register int line_size_4 = line_size << 2;
628 for (i = 0; i < h; i += 4) {
629 pixelsv1 = vec_ld( 0, pixels);
630 pixelsv2 = vec_ld(15, pixels);
631 pixelsv1B = vec_ld(line_size, pixels);
632 pixelsv2B = vec_ld(15 + line_size, pixels);
633 pixelsv1C = vec_ld(line_size_2, pixels);
634 pixelsv2C = vec_ld(15 + line_size_2, pixels);
635 pixelsv1D = vec_ld(line_size_3, pixels);
636 pixelsv2D = vec_ld(15 + line_size_3, pixels);
637 vec_st(vec_perm(pixelsv1, pixelsv2, perm),
638 0, (
unsigned char*)block);
639 vec_st(vec_perm(pixelsv1B, pixelsv2B, perm),
640 line_size, (
unsigned char*)block);
641 vec_st(vec_perm(pixelsv1C, pixelsv2C, perm),
642 line_size_2, (
unsigned char*)block);
643 vec_st(vec_perm(pixelsv1D, pixelsv2D, perm),
644 line_size_3, (
unsigned char*)block);
651 #define op_avg(a,b) a = ( ((a)|(b)) - ((((a)^(b))&0xFEFEFEFEUL)>>1) )
654 register vector
unsigned char pixelsv1, pixelsv2, pixelsv, blockv;
655 register vector
unsigned char perm = vec_lvsl(0, pixels);
658 for (i = 0; i < h; i++) {
659 pixelsv1 = vec_ld( 0, pixels);
660 pixelsv2 = vec_ld(16,pixels);
661 blockv = vec_ld(0, block);
662 pixelsv = vec_perm(pixelsv1, pixelsv2, perm);
663 blockv = vec_avg(blockv,pixelsv);
664 vec_st(blockv, 0, (
unsigned char*)block);
673 register vector
unsigned char pixelsv1, pixelsv2, pixelsv, blockv;
676 for (i = 0; i < h; i++) {
679 int rightside = ((
unsigned long)block & 0x0000000F);
681 blockv = vec_ld(0, block);
682 pixelsv1 = vec_ld( 0, pixels);
683 pixelsv2 = vec_ld(16, pixels);
684 pixelsv = vec_perm(pixelsv1, pixelsv2, vec_lvsl(0, pixels));
687 pixelsv = vec_perm(blockv, pixelsv,
vcprm(0,1,
s0,
s1));
689 pixelsv = vec_perm(blockv, pixelsv,
vcprm(
s0,
s1,2,3));
692 blockv = vec_avg(blockv, pixelsv);
694 vec_st(blockv, 0, block);
705 register vector
unsigned char pixelsv1, pixelsv2, pixelsavg;
706 register vector
unsigned char blockv, temp1, temp2;
707 register vector
unsigned short pixelssum1, pixelssum2, temp3;
708 register const vector
unsigned char vczero = (
const vector
unsigned char)vec_splat_u8(0);
709 register const vector
unsigned short vctwo = (
const vector
unsigned short)vec_splat_u16(2);
711 temp1 = vec_ld(0, pixels);
712 temp2 = vec_ld(16, pixels);
713 pixelsv1 = vec_perm(temp1, temp2, vec_lvsl(0, pixels));
714 if ((((
unsigned long)pixels) & 0x0000000F) == 0x0000000F) {
717 pixelsv2 = vec_perm(temp1, temp2, vec_lvsl(1, pixels));
719 pixelsv1 = vec_mergeh(vczero, pixelsv1);
720 pixelsv2 = vec_mergeh(vczero, pixelsv2);
721 pixelssum1 = vec_add((vector
unsigned short)pixelsv1,
722 (vector
unsigned short)pixelsv2);
723 pixelssum1 = vec_add(pixelssum1, vctwo);
725 for (i = 0; i < h ; i++) {
726 int rightside = ((
unsigned long)block & 0x0000000F);
727 blockv = vec_ld(0, block);
729 temp1 = vec_ld(line_size, pixels);
730 temp2 = vec_ld(line_size + 16, pixels);
731 pixelsv1 = vec_perm(temp1, temp2, vec_lvsl(line_size, pixels));
732 if (((((
unsigned long)pixels) + line_size) & 0x0000000F) == 0x0000000F) {
735 pixelsv2 = vec_perm(temp1, temp2, vec_lvsl(line_size + 1, pixels));
738 pixelsv1 = vec_mergeh(vczero, pixelsv1);
739 pixelsv2 = vec_mergeh(vczero, pixelsv2);
740 pixelssum2 = vec_add((vector
unsigned short)pixelsv1,
741 (vector
unsigned short)pixelsv2);
742 temp3 = vec_add(pixelssum1, pixelssum2);
743 temp3 = vec_sra(temp3, vctwo);
744 pixelssum1 = vec_add(pixelssum2, vctwo);
745 pixelsavg = vec_packsu(temp3, (vector
unsigned short) vczero);
748 blockv = vec_perm(blockv, pixelsavg,
vcprm(0, 1,
s0,
s1));
750 blockv = vec_perm(blockv, pixelsavg,
vcprm(
s0,
s1, 2, 3));
753 vec_st(blockv, 0, block);
764 register vector
unsigned char pixelsv1, pixelsv2, pixelsavg;
765 register vector
unsigned char blockv, temp1, temp2;
766 register vector
unsigned short pixelssum1, pixelssum2, temp3;
767 register const vector
unsigned char vczero = (
const vector
unsigned char)vec_splat_u8(0);
768 register const vector
unsigned short vcone = (
const vector
unsigned short)vec_splat_u16(1);
769 register const vector
unsigned short vctwo = (
const vector
unsigned short)vec_splat_u16(2);
771 temp1 = vec_ld(0, pixels);
772 temp2 = vec_ld(16, pixels);
773 pixelsv1 = vec_perm(temp1, temp2, vec_lvsl(0, pixels));
774 if ((((
unsigned long)pixels) & 0x0000000F) == 0x0000000F) {
777 pixelsv2 = vec_perm(temp1, temp2, vec_lvsl(1, pixels));
779 pixelsv1 = vec_mergeh(vczero, pixelsv1);
780 pixelsv2 = vec_mergeh(vczero, pixelsv2);
781 pixelssum1 = vec_add((vector
unsigned short)pixelsv1,
782 (vector
unsigned short)pixelsv2);
783 pixelssum1 = vec_add(pixelssum1, vcone);
785 for (i = 0; i < h ; i++) {
786 int rightside = ((
unsigned long)block & 0x0000000F);
787 blockv = vec_ld(0, block);
789 temp1 = vec_ld(line_size, pixels);
790 temp2 = vec_ld(line_size + 16, pixels);
791 pixelsv1 = vec_perm(temp1, temp2, vec_lvsl(line_size, pixels));
792 if (((((
unsigned long)pixels) + line_size) & 0x0000000F) == 0x0000000F) {
795 pixelsv2 = vec_perm(temp1, temp2, vec_lvsl(line_size + 1, pixels));
798 pixelsv1 = vec_mergeh(vczero, pixelsv1);
799 pixelsv2 = vec_mergeh(vczero, pixelsv2);
800 pixelssum2 = vec_add((vector
unsigned short)pixelsv1,
801 (vector
unsigned short)pixelsv2);
802 temp3 = vec_add(pixelssum1, pixelssum2);
803 temp3 = vec_sra(temp3, vctwo);
804 pixelssum1 = vec_add(pixelssum2, vcone);
805 pixelsavg = vec_packsu(temp3, (vector
unsigned short) vczero);
808 blockv = vec_perm(blockv, pixelsavg,
vcprm(0, 1,
s0,
s1));
810 blockv = vec_perm(blockv, pixelsavg,
vcprm(
s0,
s1, 2, 3));
813 vec_st(blockv, 0, block);
824 register vector
unsigned char pixelsv1, pixelsv2, pixelsv3, pixelsv4;
825 register vector
unsigned char blockv, temp1, temp2;
826 register vector
unsigned short temp3, temp4,
827 pixelssum1, pixelssum2, pixelssum3, pixelssum4;
828 register const vector
unsigned char vczero = (
const vector
unsigned char)vec_splat_u8(0);
829 register const vector
unsigned short vctwo = (
const vector
unsigned short)vec_splat_u16(2);
831 temp1 = vec_ld(0, pixels);
832 temp2 = vec_ld(16, pixels);
833 pixelsv1 = vec_perm(temp1, temp2, vec_lvsl(0, pixels));
834 if ((((
unsigned long)pixels) & 0x0000000F) == 0x0000000F) {
837 pixelsv2 = vec_perm(temp1, temp2, vec_lvsl(1, pixels));
839 pixelsv3 = vec_mergel(vczero, pixelsv1);
840 pixelsv4 = vec_mergel(vczero, pixelsv2);
841 pixelsv1 = vec_mergeh(vczero, pixelsv1);
842 pixelsv2 = vec_mergeh(vczero, pixelsv2);
843 pixelssum3 = vec_add((vector
unsigned short)pixelsv3,
844 (vector
unsigned short)pixelsv4);
845 pixelssum3 = vec_add(pixelssum3, vctwo);
846 pixelssum1 = vec_add((vector
unsigned short)pixelsv1,
847 (vector
unsigned short)pixelsv2);
848 pixelssum1 = vec_add(pixelssum1, vctwo);
850 for (i = 0; i < h ; i++) {
851 blockv = vec_ld(0, block);
853 temp1 = vec_ld(line_size, pixels);
854 temp2 = vec_ld(line_size + 16, pixels);
855 pixelsv1 = vec_perm(temp1, temp2, vec_lvsl(line_size, pixels));
856 if (((((
unsigned long)pixels) + line_size) & 0x0000000F) == 0x0000000F) {
859 pixelsv2 = vec_perm(temp1, temp2, vec_lvsl(line_size + 1, pixels));
862 pixelsv3 = vec_mergel(vczero, pixelsv1);
863 pixelsv4 = vec_mergel(vczero, pixelsv2);
864 pixelsv1 = vec_mergeh(vczero, pixelsv1);
865 pixelsv2 = vec_mergeh(vczero, pixelsv2);
867 pixelssum4 = vec_add((vector
unsigned short)pixelsv3,
868 (vector
unsigned short)pixelsv4);
869 pixelssum2 = vec_add((vector
unsigned short)pixelsv1,
870 (vector
unsigned short)pixelsv2);
871 temp4 = vec_add(pixelssum3, pixelssum4);
872 temp4 = vec_sra(temp4, vctwo);
873 temp3 = vec_add(pixelssum1, pixelssum2);
874 temp3 = vec_sra(temp3, vctwo);
876 pixelssum3 = vec_add(pixelssum4, vctwo);
877 pixelssum1 = vec_add(pixelssum2, vctwo);
879 blockv = vec_packsu(temp3, temp4);
881 vec_st(blockv, 0, block);
892 register vector
unsigned char pixelsv1, pixelsv2, pixelsv3, pixelsv4;
893 register vector
unsigned char blockv, temp1, temp2;
894 register vector
unsigned short temp3, temp4,
895 pixelssum1, pixelssum2, pixelssum3, pixelssum4;
896 register const vector
unsigned char vczero = (
const vector
unsigned char)vec_splat_u8(0);
897 register const vector
unsigned short vcone = (
const vector
unsigned short)vec_splat_u16(1);
898 register const vector
unsigned short vctwo = (
const vector
unsigned short)vec_splat_u16(2);
900 temp1 = vec_ld(0, pixels);
901 temp2 = vec_ld(16, pixels);
902 pixelsv1 = vec_perm(temp1, temp2, vec_lvsl(0, pixels));
903 if ((((
unsigned long)pixels) & 0x0000000F) == 0x0000000F) {
906 pixelsv2 = vec_perm(temp1, temp2, vec_lvsl(1, pixels));
908 pixelsv3 = vec_mergel(vczero, pixelsv1);
909 pixelsv4 = vec_mergel(vczero, pixelsv2);
910 pixelsv1 = vec_mergeh(vczero, pixelsv1);
911 pixelsv2 = vec_mergeh(vczero, pixelsv2);
912 pixelssum3 = vec_add((vector
unsigned short)pixelsv3,
913 (vector
unsigned short)pixelsv4);
914 pixelssum3 = vec_add(pixelssum3, vcone);
915 pixelssum1 = vec_add((vector
unsigned short)pixelsv1,
916 (vector
unsigned short)pixelsv2);
917 pixelssum1 = vec_add(pixelssum1, vcone);
919 for (i = 0; i < h ; i++) {
920 blockv = vec_ld(0, block);
922 temp1 = vec_ld(line_size, pixels);
923 temp2 = vec_ld(line_size + 16, pixels);
924 pixelsv1 = vec_perm(temp1, temp2, vec_lvsl(line_size, pixels));
925 if (((((
unsigned long)pixels) + line_size) & 0x0000000F) == 0x0000000F) {
928 pixelsv2 = vec_perm(temp1, temp2, vec_lvsl(line_size + 1, pixels));
931 pixelsv3 = vec_mergel(vczero, pixelsv1);
932 pixelsv4 = vec_mergel(vczero, pixelsv2);
933 pixelsv1 = vec_mergeh(vczero, pixelsv1);
934 pixelsv2 = vec_mergeh(vczero, pixelsv2);
936 pixelssum4 = vec_add((vector
unsigned short)pixelsv3,
937 (vector
unsigned short)pixelsv4);
938 pixelssum2 = vec_add((vector
unsigned short)pixelsv1,
939 (vector
unsigned short)pixelsv2);
940 temp4 = vec_add(pixelssum3, pixelssum4);
941 temp4 = vec_sra(temp4, vctwo);
942 temp3 = vec_add(pixelssum1, pixelssum2);
943 temp3 = vec_sra(temp3, vctwo);
945 pixelssum3 = vec_add(pixelssum4, vcone);
946 pixelssum1 = vec_add(pixelssum2, vcone);
948 blockv = vec_packsu(temp3, temp4);
950 vec_st(blockv, 0, block);
959 register const vector
unsigned char vzero =
960 (
const vector
unsigned char)vec_splat_u8(0);
961 register vector
signed short temp0, temp1, temp2, temp3, temp4,
964 register const vector
signed short vprod1 =(
const vector
signed short)
965 { 1,-1, 1,-1, 1,-1, 1,-1 };
966 register const vector
signed short vprod2 =(
const vector
signed short)
967 { 1, 1,-1,-1, 1, 1,-1,-1 };
968 register const vector
signed short vprod3 =(
const vector
signed short)
969 { 1, 1, 1, 1,-1,-1,-1,-1 };
970 register const vector
unsigned char perm1 = (
const vector
unsigned char)
971 {0x02, 0x03, 0x00, 0x01, 0x06, 0x07, 0x04, 0x05,
972 0x0A, 0x0B, 0x08, 0x09, 0x0E, 0x0F, 0x0C, 0x0D};
973 register const vector
unsigned char perm2 = (
const vector
unsigned char)
974 {0x04, 0x05, 0x06, 0x07, 0x00, 0x01, 0x02, 0x03,
975 0x0C, 0x0D, 0x0E, 0x0F, 0x08, 0x09, 0x0A, 0x0B};
976 register const vector
unsigned char perm3 = (
const vector
unsigned char)
977 {0x08, 0x09, 0x0A, 0x0B, 0x0C, 0x0D, 0x0E, 0x0F,
978 0x00, 0x01, 0x02, 0x03, 0x04, 0x05, 0x06, 0x07};
980 #define ONEITERBUTTERFLY(i, res) \
982 register vector unsigned char src1, src2, srcO; \
983 register vector unsigned char dst1, dst2, dstO; \
984 register vector signed short srcV, dstV; \
985 register vector signed short but0, but1, but2, op1, op2, op3; \
986 src1 = vec_ld(stride * i, src); \
987 src2 = vec_ld((stride * i) + 15, src); \
988 srcO = vec_perm(src1, src2, vec_lvsl(stride * i, src)); \
989 dst1 = vec_ld(stride * i, dst); \
990 dst2 = vec_ld((stride * i) + 15, dst); \
991 dstO = vec_perm(dst1, dst2, vec_lvsl(stride * i, dst)); \
994 srcV = (vector signed short)vec_mergeh((vector signed char)vzero, \
995 (vector signed char)srcO); \
996 dstV = (vector signed short)vec_mergeh((vector signed char)vzero, \
997 (vector signed char)dstO); \
999 but0 = vec_sub(srcV, dstV); \
1000 op1 = vec_perm(but0, but0, perm1); \
1001 but1 = vec_mladd(but0, vprod1, op1); \
1002 op2 = vec_perm(but1, but1, perm2); \
1003 but2 = vec_mladd(but1, vprod2, op2); \
1004 op3 = vec_perm(but2, but2, perm3); \
1005 res = vec_mladd(but2, vprod3, op3); \
1016 #undef ONEITERBUTTERFLY
1018 register vector
signed int vsum;
1019 register vector
signed short line0 = vec_add(temp0, temp1);
1020 register vector
signed short line1 = vec_sub(temp0, temp1);
1021 register vector
signed short line2 = vec_add(temp2, temp3);
1022 register vector
signed short line3 = vec_sub(temp2, temp3);
1023 register vector
signed short line4 = vec_add(temp4, temp5);
1024 register vector
signed short line5 = vec_sub(temp4, temp5);
1025 register vector
signed short line6 = vec_add(temp6, temp7);
1026 register vector
signed short line7 = vec_sub(temp6, temp7);
1028 register vector
signed short line0B = vec_add(line0, line2);
1029 register vector
signed short line2B = vec_sub(line0, line2);
1030 register vector
signed short line1B = vec_add(line1, line3);
1031 register vector
signed short line3B = vec_sub(line1, line3);
1032 register vector
signed short line4B = vec_add(line4, line6);
1033 register vector
signed short line6B = vec_sub(line4, line6);
1034 register vector
signed short line5B = vec_add(line5, line7);
1035 register vector
signed short line7B = vec_sub(line5, line7);
1037 register vector
signed short line0C = vec_add(line0B, line4B);
1038 register vector
signed short line4C = vec_sub(line0B, line4B);
1039 register vector
signed short line1C = vec_add(line1B, line5B);
1040 register vector
signed short line5C = vec_sub(line1B, line5B);
1041 register vector
signed short line2C = vec_add(line2B, line6B);
1042 register vector
signed short line6C = vec_sub(line2B, line6B);
1043 register vector
signed short line3C = vec_add(line3B, line7B);
1044 register vector
signed short line7C = vec_sub(line3B, line7B);
1046 vsum = vec_sum4s(vec_abs(line0C), vec_splat_s32(0));
1047 vsum = vec_sum4s(vec_abs(line1C), vsum);
1048 vsum = vec_sum4s(vec_abs(line2C), vsum);
1049 vsum = vec_sum4s(vec_abs(line3C), vsum);
1050 vsum = vec_sum4s(vec_abs(line4C), vsum);
1051 vsum = vec_sum4s(vec_abs(line5C), vsum);
1052 vsum = vec_sum4s(vec_abs(line6C), vsum);
1053 vsum = vec_sum4s(vec_abs(line7C), vsum);
1054 vsum = vec_sums(vsum, (vector
signed int)vzero);
1055 vsum = vec_splat(vsum, 3);
1056 vec_ste(vsum, 0, &sum);
1082 register vector
signed short
1083 temp0 __asm__ (
"v0"),
1084 temp1 __asm__ (
"v1"),
1085 temp2 __asm__ (
"v2"),
1086 temp3 __asm__ (
"v3"),
1087 temp4 __asm__ (
"v4"),
1088 temp5 __asm__ (
"v5"),
1089 temp6 __asm__ (
"v6"),
1090 temp7 __asm__ (
"v7");
1091 register vector
signed short
1092 temp0S __asm__ (
"v8"),
1093 temp1S __asm__ (
"v9"),
1094 temp2S __asm__ (
"v10"),
1095 temp3S __asm__ (
"v11"),
1096 temp4S __asm__ (
"v12"),
1097 temp5S __asm__ (
"v13"),
1098 temp6S __asm__ (
"v14"),
1099 temp7S __asm__ (
"v15");
1100 register const vector
unsigned char vzero __asm__ (
"v31") =
1101 (
const vector
unsigned char)vec_splat_u8(0);
1103 register const vector
signed short vprod1 __asm__ (
"v16") =
1104 (
const vector
signed short){ 1,-1, 1,-1, 1,-1, 1,-1 };
1105 register const vector
signed short vprod2 __asm__ (
"v17") =
1106 (
const vector
signed short){ 1, 1,-1,-1, 1, 1,-1,-1 };
1107 register const vector
signed short vprod3 __asm__ (
"v18") =
1108 (
const vector
signed short){ 1, 1, 1, 1,-1,-1,-1,-1 };
1109 register const vector
unsigned char perm1 __asm__ (
"v19") =
1110 (
const vector
unsigned char)
1111 {0x02, 0x03, 0x00, 0x01, 0x06, 0x07, 0x04, 0x05,
1112 0x0A, 0x0B, 0x08, 0x09, 0x0E, 0x0F, 0x0C, 0x0D};
1113 register const vector
unsigned char perm2 __asm__ (
"v20") =
1114 (
const vector
unsigned char)
1115 {0x04, 0x05, 0x06, 0x07, 0x00, 0x01, 0x02, 0x03,
1116 0x0C, 0x0D, 0x0E, 0x0F, 0x08, 0x09, 0x0A, 0x0B};
1117 register const vector
unsigned char perm3 __asm__ (
"v21") =
1118 (
const vector
unsigned char)
1119 {0x08, 0x09, 0x0A, 0x0B, 0x0C, 0x0D, 0x0E, 0x0F,
1120 0x00, 0x01, 0x02, 0x03, 0x04, 0x05, 0x06, 0x07};
1122 #define ONEITERBUTTERFLY(i, res1, res2) \
1124 register vector unsigned char src1 __asm__ ("v22"), \
1125 src2 __asm__ ("v23"), \
1126 dst1 __asm__ ("v24"), \
1127 dst2 __asm__ ("v25"), \
1128 srcO __asm__ ("v22"), \
1129 dstO __asm__ ("v23"); \
1131 register vector signed short srcV __asm__ ("v24"), \
1132 dstV __asm__ ("v25"), \
1133 srcW __asm__ ("v26"), \
1134 dstW __asm__ ("v27"), \
1135 but0 __asm__ ("v28"), \
1136 but0S __asm__ ("v29"), \
1137 op1 __asm__ ("v30"), \
1138 but1 __asm__ ("v22"), \
1139 op1S __asm__ ("v23"), \
1140 but1S __asm__ ("v24"), \
1141 op2 __asm__ ("v25"), \
1142 but2 __asm__ ("v26"), \
1143 op2S __asm__ ("v27"), \
1144 but2S __asm__ ("v28"), \
1145 op3 __asm__ ("v29"), \
1146 op3S __asm__ ("v30"); \
1148 src1 = vec_ld(stride * i, src); \
1149 src2 = vec_ld((stride * i) + 16, src); \
1150 srcO = vec_perm(src1, src2, vec_lvsl(stride * i, src)); \
1151 dst1 = vec_ld(stride * i, dst); \
1152 dst2 = vec_ld((stride * i) + 16, dst); \
1153 dstO = vec_perm(dst1, dst2, vec_lvsl(stride * i, dst)); \
1155 srcV = (vector signed short)vec_mergeh((vector signed char)vzero, \
1156 (vector signed char)srcO); \
1157 dstV = (vector signed short)vec_mergeh((vector signed char)vzero, \
1158 (vector signed char)dstO); \
1159 srcW = (vector signed short)vec_mergel((vector signed char)vzero, \
1160 (vector signed char)srcO); \
1161 dstW = (vector signed short)vec_mergel((vector signed char)vzero, \
1162 (vector signed char)dstO); \
1164 but0 = vec_sub(srcV, dstV); \
1165 but0S = vec_sub(srcW, dstW); \
1166 op1 = vec_perm(but0, but0, perm1); \
1167 but1 = vec_mladd(but0, vprod1, op1); \
1168 op1S = vec_perm(but0S, but0S, perm1); \
1169 but1S = vec_mladd(but0S, vprod1, op1S); \
1170 op2 = vec_perm(but1, but1, perm2); \
1171 but2 = vec_mladd(but1, vprod2, op2); \
1172 op2S = vec_perm(but1S, but1S, perm2); \
1173 but2S = vec_mladd(but1S, vprod2, op2S); \
1174 op3 = vec_perm(but2, but2, perm3); \
1175 res1 = vec_mladd(but2, vprod3, op3); \
1176 op3S = vec_perm(but2S, but2S, perm3); \
1177 res2 = vec_mladd(but2S, vprod3, op3S); \
1188 #undef ONEITERBUTTERFLY
1190 register vector
signed int vsum;
1191 register vector
signed short line0S, line1S, line2S, line3S, line4S,
1192 line5S, line6S, line7S, line0BS,line2BS,
1193 line1BS,line3BS,line4BS,line6BS,line5BS,
1194 line7BS,line0CS,line4CS,line1CS,line5CS,
1195 line2CS,line6CS,line3CS,line7CS;
1197 register vector
signed short line0 = vec_add(temp0, temp1);
1198 register vector
signed short line1 = vec_sub(temp0, temp1);
1199 register vector
signed short line2 = vec_add(temp2, temp3);
1200 register vector
signed short line3 = vec_sub(temp2, temp3);
1201 register vector
signed short line4 = vec_add(temp4, temp5);
1202 register vector
signed short line5 = vec_sub(temp4, temp5);
1203 register vector
signed short line6 = vec_add(temp6, temp7);
1204 register vector
signed short line7 = vec_sub(temp6, temp7);
1206 register vector
signed short line0B = vec_add(line0, line2);
1207 register vector
signed short line2B = vec_sub(line0, line2);
1208 register vector
signed short line1B = vec_add(line1, line3);
1209 register vector
signed short line3B = vec_sub(line1, line3);
1210 register vector
signed short line4B = vec_add(line4, line6);
1211 register vector
signed short line6B = vec_sub(line4, line6);
1212 register vector
signed short line5B = vec_add(line5, line7);
1213 register vector
signed short line7B = vec_sub(line5, line7);
1215 register vector
signed short line0C = vec_add(line0B, line4B);
1216 register vector
signed short line4C = vec_sub(line0B, line4B);
1217 register vector
signed short line1C = vec_add(line1B, line5B);
1218 register vector
signed short line5C = vec_sub(line1B, line5B);
1219 register vector
signed short line2C = vec_add(line2B, line6B);
1220 register vector
signed short line6C = vec_sub(line2B, line6B);
1221 register vector
signed short line3C = vec_add(line3B, line7B);
1222 register vector
signed short line7C = vec_sub(line3B, line7B);
1224 vsum = vec_sum4s(vec_abs(line0C), vec_splat_s32(0));
1225 vsum = vec_sum4s(vec_abs(line1C), vsum);
1226 vsum = vec_sum4s(vec_abs(line2C), vsum);
1227 vsum = vec_sum4s(vec_abs(line3C), vsum);
1228 vsum = vec_sum4s(vec_abs(line4C), vsum);
1229 vsum = vec_sum4s(vec_abs(line5C), vsum);
1230 vsum = vec_sum4s(vec_abs(line6C), vsum);
1231 vsum = vec_sum4s(vec_abs(line7C), vsum);
1233 line0S = vec_add(temp0S, temp1S);
1234 line1S = vec_sub(temp0S, temp1S);
1235 line2S = vec_add(temp2S, temp3S);
1236 line3S = vec_sub(temp2S, temp3S);
1237 line4S = vec_add(temp4S, temp5S);
1238 line5S = vec_sub(temp4S, temp5S);
1239 line6S = vec_add(temp6S, temp7S);
1240 line7S = vec_sub(temp6S, temp7S);
1242 line0BS = vec_add(line0S, line2S);
1243 line2BS = vec_sub(line0S, line2S);
1244 line1BS = vec_add(line1S, line3S);
1245 line3BS = vec_sub(line1S, line3S);
1246 line4BS = vec_add(line4S, line6S);
1247 line6BS = vec_sub(line4S, line6S);
1248 line5BS = vec_add(line5S, line7S);
1249 line7BS = vec_sub(line5S, line7S);
1251 line0CS = vec_add(line0BS, line4BS);
1252 line4CS = vec_sub(line0BS, line4BS);
1253 line1CS = vec_add(line1BS, line5BS);
1254 line5CS = vec_sub(line1BS, line5BS);
1255 line2CS = vec_add(line2BS, line6BS);
1256 line6CS = vec_sub(line2BS, line6BS);
1257 line3CS = vec_add(line3BS, line7BS);
1258 line7CS = vec_sub(line3BS, line7BS);
1260 vsum = vec_sum4s(vec_abs(line0CS), vsum);
1261 vsum = vec_sum4s(vec_abs(line1CS), vsum);
1262 vsum = vec_sum4s(vec_abs(line2CS), vsum);
1263 vsum = vec_sum4s(vec_abs(line3CS), vsum);
1264 vsum = vec_sum4s(vec_abs(line4CS), vsum);
1265 vsum = vec_sum4s(vec_abs(line5CS), vsum);
1266 vsum = vec_sum4s(vec_abs(line6CS), vsum);
1267 vsum = vec_sum4s(vec_abs(line7CS), vsum);
1268 vsum = vec_sums(vsum, (vector
signed int)
vzero);
1269 vsum = vec_splat(vsum, 3);
1270 vec_ste(vsum, 0, &sum);
1291 vector
bool int t0,
t1;
1292 const vector
unsigned int v_31 =
1293 vec_add(vec_add(vec_splat_u32(15),vec_splat_u32(15)),vec_splat_u32(1));
1294 for (i = 0; i < blocksize; i += 4) {
1295 m = vec_ld(0, mag+i);
1296 a = vec_ld(0, ang+i);
1297 t0 = vec_cmple(m, (vector
float)vec_splat_u32(0));
1298 t1 = vec_cmple(a, (vector
float)vec_splat_u32(0));
1299 a = vec_xor(a, (vector
float) vec_sl((vector
unsigned int)t0, v_31));
1300 t0 = (vector
bool int)vec_and(a, t1);
1301 t1 = (vector
bool int)vec_andc(a, t1);
1302 a = vec_sub(m, (vector
float)t1);
1303 m = vec_add(m, (vector
float)t0);
1304 vec_stl(a, 0, ang+i);
1305 vec_stl(m, 0, mag+i);
1313 register vector
unsigned char pixelsv1, pixelsv2, pixelsavg;
1314 register vector
unsigned char blockv, temp1, temp2, blocktemp;
1315 register vector
unsigned short pixelssum1, pixelssum2, temp3;
1317 register const vector
unsigned char vczero = (
const vector
unsigned char)
1319 register const vector
unsigned short vctwo = (
const vector
unsigned short)
1322 temp1 = vec_ld(0, pixels);
1323 temp2 = vec_ld(16, pixels);
1324 pixelsv1 = vec_perm(temp1, temp2, vec_lvsl(0, pixels));
1325 if ((((
unsigned long)pixels) & 0x0000000F) == 0x0000000F) {
1328 pixelsv2 = vec_perm(temp1, temp2, vec_lvsl(1, pixels));
1330 pixelsv1 = vec_mergeh(vczero, pixelsv1);
1331 pixelsv2 = vec_mergeh(vczero, pixelsv2);
1332 pixelssum1 = vec_add((vector
unsigned short)pixelsv1,
1333 (vector
unsigned short)pixelsv2);
1334 pixelssum1 = vec_add(pixelssum1, vctwo);
1336 for (i = 0; i < h ; i++) {
1337 int rightside = ((
unsigned long)block & 0x0000000F);
1338 blockv = vec_ld(0, block);
1340 temp1 = vec_ld(line_size, pixels);
1341 temp2 = vec_ld(line_size + 16, pixels);
1342 pixelsv1 = vec_perm(temp1, temp2, vec_lvsl(line_size, pixels));
1343 if (((((
unsigned long)pixels) + line_size) & 0x0000000F) == 0x0000000F) {
1346 pixelsv2 = vec_perm(temp1, temp2, vec_lvsl(line_size + 1, pixels));
1349 pixelsv1 = vec_mergeh(vczero, pixelsv1);
1350 pixelsv2 = vec_mergeh(vczero, pixelsv2);
1351 pixelssum2 = vec_add((vector
unsigned short)pixelsv1,
1352 (vector
unsigned short)pixelsv2);
1353 temp3 = vec_add(pixelssum1, pixelssum2);
1354 temp3 = vec_sra(temp3, vctwo);
1355 pixelssum1 = vec_add(pixelssum2, vctwo);
1356 pixelsavg = vec_packsu(temp3, (vector
unsigned short) vczero);
1359 blocktemp = vec_perm(blockv, pixelsavg,
vcprm(0, 1,
s0,
s1));
1361 blocktemp = vec_perm(blockv, pixelsavg,
vcprm(
s0,
s1, 2, 3));
1364 blockv = vec_avg(blocktemp, blockv);
1365 vec_st(blockv, 0, block);
1368 pixels += line_size;
1389 if (!high_bit_depth) {
1406 if (CONFIG_VORBIS_DECODER)