00001
00002
00003
00004
00005
00006
00007
00008
00009
00010
00011
00012
00013
00014
00015
00016
00017
00018
00019
00020
00021
00022
00023
00024
00025
00026
00027 #include <stddef.h>
00028
/*
 * Per-template instruction selection.
 *
 * The macros are #undef'd first so that they can be redefined here
 * (presumably because this template is included more than once with
 * different COMPILE_TEMPLATE_* settings -- TODO confirm against the includer).
 */
00029 #undef PREFETCH
00030 #undef MOVNTQ
00031 #undef EMMS
00032 #undef SFENCE
00033 #undef PAVGB
00034
/*
 * PREFETCH / PAVGB: 3DNow! and MMXEXT each get their own mnemonics.
 * In the fallback branch PREFETCH expands to an assembler comment (" # nop")
 * and PAVGB is left undefined.
 */
00035 #if COMPILE_TEMPLATE_AMD3DNOW
00036 #define PREFETCH "prefetch"
00037 #define PAVGB "pavgusb"
00038 #elif COMPILE_TEMPLATE_MMXEXT
00039 #define PREFETCH "prefetchnta"
00040 #define PAVGB "pavgb"
00041 #else
00042 #define PREFETCH " # nop"
00043 #endif
00044
/* EMMS: instruction used to leave MMX state ("femms" on 3DNow!, "emms" otherwise). */
00045 #if COMPILE_TEMPLATE_AMD3DNOW
00046
00047 #define EMMS "femms"
00048 #else
00049 #define EMMS "emms"
00050 #endif
00051
/*
 * MOVNTQ / SFENCE: with MMXEXT the 64-bit stores are non-temporal ("movntq")
 * and are followed by a real "sfence"; otherwise a plain "movq" store is used
 * and SFENCE expands to an assembler comment.
 */
00052 #if COMPILE_TEMPLATE_MMXEXT
00053 #define MOVNTQ "movntq"
00054 #define SFENCE "sfence"
00055 #else
00056 #define MOVNTQ "movq"
00057 #define SFENCE " # nop"
00058 #endif
00059
/* The code that follows is compiled only for non-SSE2, non-3DNow! templates
 * (the matching #endif lines are outside this part of the file). */
00060 #if !COMPILE_TEMPLATE_SSE2
00061
00062 #if !COMPILE_TEMPLATE_AMD3DNOW
00063
/**
 * Expand packed 3-byte pixels to 4-byte pixels.
 *
 * The three source bytes are copied in the same order and a fourth byte is
 * appended (255 in the scalar tail; in the MMX loop the result is OR-ed with
 * mask32a, presumably the mask of that fourth byte -- TODO confirm).
 *
 * @param src      source buffer of 3-byte pixels
 * @param dst      destination buffer, receives 4 bytes per source pixel
 * @param src_size size of the source buffer in bytes
 */
00064 static inline void RENAME(rgb24tobgr32)(const uint8_t *src, uint8_t *dst, int src_size)
00065 {
00066 uint8_t *dest = dst;
00067 const uint8_t *s = src;
00068 const uint8_t *end;
00069 const uint8_t *mm_end;
00070 end = s + src_size;
00071 __asm__ volatile(PREFETCH" %0"::"m"(*s):"memory");
/* The MMX loop runs only while at least 24 source bytes (8 pixels) remain. */
00072 mm_end = end - 23;
00073 __asm__ volatile("movq %0, %%mm7"::"m"(mask32a):"memory");
00074 while (s < mm_end) {
/* Each register gets two pixels: a 4-byte load at offset 3*k into the low
 * half and a 4-byte load at offset 3*(k+1) into the high half.
 * NOTE(review): the load at 21(%1) covers bytes 21..24, i.e. one byte past
 * the 24 bytes consumed per iteration -- confirm buffers are padded. */
00075 __asm__ volatile(
00076 PREFETCH" 32(%1) \n\t"
00077 "movd (%1), %%mm0 \n\t"
00078 "punpckldq 3(%1), %%mm0 \n\t"
00079 "movd 6(%1), %%mm1 \n\t"
00080 "punpckldq 9(%1), %%mm1 \n\t"
00081 "movd 12(%1), %%mm2 \n\t"
00082 "punpckldq 15(%1), %%mm2 \n\t"
00083 "movd 18(%1), %%mm3 \n\t"
00084 "punpckldq 21(%1), %%mm3 \n\t"
00085 "por %%mm7, %%mm0 \n\t"
00086 "por %%mm7, %%mm1 \n\t"
00087 "por %%mm7, %%mm2 \n\t"
00088 "por %%mm7, %%mm3 \n\t"
00089 MOVNTQ" %%mm0, (%0) \n\t"
00090 MOVNTQ" %%mm1, 8(%0) \n\t"
00091 MOVNTQ" %%mm2, 16(%0) \n\t"
00092 MOVNTQ" %%mm3, 24(%0)"
00093 :: "r"(dest), "r"(s)
00094 :"memory");
00095 dest += 32;
00096 s += 24;
00097 }
/* Order the MOVNTQ stores and leave MMX state before running scalar code. */
00098 __asm__ volatile(SFENCE:::"memory");
00099 __asm__ volatile(EMMS:::"memory");
/* Scalar tail: copy three bytes, then append 255. */
00100 while (s < end) {
00101 *dest++ = *s++;
00102 *dest++ = *s++;
00103 *dest++ = *s++;
00104 *dest++ = 255;
00105 }
00106 }
00107
/*
 * STORE_BGR24_MMX: asm fragment that packs 4-byte pixels down to 3-byte
 * pixels and stores 24 bytes at (%0) with three MOVNTQ stores.
 *
 * On entry mm0, mm1, mm4, mm5 hold the pixel data and mm2, mm3, mm6, mm7 hold
 * copies of mm0, mm1, mm4, mm5 respectively. Each copy is shifted right by
 * 8 bits and masked with mask24h, the original is masked with mask24l, and the
 * two are OR-ed back together; the four results are then shifted and merged
 * into mm0, mm1, mm4 for the stores.
 * mm2 and mm3 are reused as scratch in the second half.
 * The fragment ends without a trailing "\n\t", so it must be the last piece
 * of the asm template it is pasted into.
 */
00108 #define STORE_BGR24_MMX \
00109 "psrlq $8, %%mm2 \n\t" \
00110 "psrlq $8, %%mm3 \n\t" \
00111 "psrlq $8, %%mm6 \n\t" \
00112 "psrlq $8, %%mm7 \n\t" \
00113 "pand "MANGLE(mask24l)", %%mm0\n\t" \
00114 "pand "MANGLE(mask24l)", %%mm1\n\t" \
00115 "pand "MANGLE(mask24l)", %%mm4\n\t" \
00116 "pand "MANGLE(mask24l)", %%mm5\n\t" \
00117 "pand "MANGLE(mask24h)", %%mm2\n\t" \
00118 "pand "MANGLE(mask24h)", %%mm3\n\t" \
00119 "pand "MANGLE(mask24h)", %%mm6\n\t" \
00120 "pand "MANGLE(mask24h)", %%mm7\n\t" \
00121 "por %%mm2, %%mm0 \n\t" \
00122 "por %%mm3, %%mm1 \n\t" \
00123 "por %%mm6, %%mm4 \n\t" \
00124 "por %%mm7, %%mm5 \n\t" \
00125 \
00126 "movq %%mm1, %%mm2 \n\t" \
00127 "movq %%mm4, %%mm3 \n\t" \
00128 "psllq $48, %%mm2 \n\t" \
00129 "psllq $32, %%mm3 \n\t" \
00130 "por %%mm2, %%mm0 \n\t" \
00131 "psrlq $16, %%mm1 \n\t" \
00132 "psrlq $32, %%mm4 \n\t" \
00133 "psllq $16, %%mm5 \n\t" \
00134 "por %%mm3, %%mm1 \n\t" \
00135 "por %%mm5, %%mm4 \n\t" \
00136 \
00137 MOVNTQ" %%mm0, (%0) \n\t" \
00138 MOVNTQ" %%mm1, 8(%0) \n\t" \
00139 MOVNTQ" %%mm4, 16(%0)"
00140
00141
/**
 * Pack 4-byte pixels down to 3-byte pixels by dropping every fourth byte.
 *
 * The first three bytes of each source pixel are copied in the same order.
 *
 * @param src      source buffer of 4-byte pixels
 * @param dst      destination buffer, receives 3 bytes per source pixel
 * @param src_size size of the source buffer in bytes
 */
00142 static inline void RENAME(rgb32tobgr24)(const uint8_t *src, uint8_t *dst, int src_size)
00143 {
00144 uint8_t *dest = dst;
00145 const uint8_t *s = src;
00146 const uint8_t *end;
00147 const uint8_t *mm_end;
00148 end = s + src_size;
00149 __asm__ volatile(PREFETCH" %0"::"m"(*s):"memory");
/* The MMX loop needs a full group of 32 source bytes (8 pixels). */
00150 mm_end = end - 31;
00151 while (s < mm_end) {
/* Load 32 bytes into mm0/mm1/mm4/mm5 and duplicate them into
 * mm2/mm3/mm6/mm7, which is the register layout STORE_BGR24_MMX expects. */
00152 __asm__ volatile(
00153 PREFETCH" 32(%1) \n\t"
00154 "movq (%1), %%mm0 \n\t"
00155 "movq 8(%1), %%mm1 \n\t"
00156 "movq 16(%1), %%mm4 \n\t"
00157 "movq 24(%1), %%mm5 \n\t"
00158 "movq %%mm0, %%mm2 \n\t"
00159 "movq %%mm1, %%mm3 \n\t"
00160 "movq %%mm4, %%mm6 \n\t"
00161 "movq %%mm5, %%mm7 \n\t"
00162 STORE_BGR24_MMX
00163 :: "r"(dest), "r"(s)
00164 :"memory");
00165 dest += 24;
00166 s += 32;
00167 }
00168 __asm__ volatile(SFENCE:::"memory");
00169 __asm__ volatile(EMMS:::"memory");
/* Scalar tail: copy three bytes and skip the fourth. */
00170 while (s < end) {
00171 *dest++ = *s++;
00172 *dest++ = *s++;
00173 *dest++ = *s++;
00174 s++;
00175 }
00176 }
00177
00178
00179
00180
00181
00182
00183
/**
 * Convert 16-bit pixels by moving bits 5..14 of every word up by one bit
 * while keeping bits 0..4 in place.
 *
 * The scalar tail computes (x & 0x7FFF) + (x & 0x7FE0) per 16-bit word:
 * adding the upper ten bits to themselves doubles them, i.e. shifts them left
 * by one, and the new bit 5 is zero. The MMX loop does x + (x & mask15s) on
 * 8 words at a time (mask15s is presumably 0x7FE0 per word -- TODO confirm).
 *
 * @param src      source buffer of 16-bit pixels
 * @param dst      destination buffer, same size as the source
 * @param src_size size of the source buffer in bytes
 */
00184 static inline void RENAME(rgb15to16)(const uint8_t *src, uint8_t *dst, int src_size)
00185 {
00186 register const uint8_t* s=src;
00187 register uint8_t* d=dst;
00188 register const uint8_t *end;
00189 const uint8_t *mm_end;
00190 end = s + src_size;
00191 __asm__ volatile(PREFETCH" %0"::"m"(*s));
00192 __asm__ volatile("movq %0, %%mm4"::"m"(mask15s));
/* The MMX loop handles 16 bytes (8 pixels) per iteration. */
00193 mm_end = end - 15;
00194 while (s<mm_end) {
00195 __asm__ volatile(
00196 PREFETCH" 32(%1) \n\t"
00197 "movq (%1), %%mm0 \n\t"
00198 "movq 8(%1), %%mm2 \n\t"
00199 "movq %%mm0, %%mm1 \n\t"
00200 "movq %%mm2, %%mm3 \n\t"
00201 "pand %%mm4, %%mm0 \n\t"
00202 "pand %%mm4, %%mm2 \n\t"
00203 "paddw %%mm1, %%mm0 \n\t"
00204 "paddw %%mm3, %%mm2 \n\t"
00205 MOVNTQ" %%mm0, (%0) \n\t"
00206 MOVNTQ" %%mm2, 8(%0)"
00207 :: "r"(d), "r"(s)
/* The asm loads from s and stores to d through plain register operands, so
 * the "memory" clobber is required to tell the compiler that memory is read
 * and written here. */
00208 : "memory");
00209 d+=16;
00210 s+=16;
00211 }
00212 __asm__ volatile(SFENCE:::"memory");
00213 __asm__ volatile(EMMS:::"memory");
/* Scalar tail, two pixels (one 32-bit word) at a time. */
00214 mm_end = end - 3;
00215 while (s < mm_end) {
00216 register unsigned x= *((const uint32_t *)s);
00217 *((uint32_t *)d) = (x&0x7FFF7FFF) + (x&0x7FE07FE0);
00218 d+=4;
00219 s+=4;
00220 }
/* At most one 16-bit pixel is left. */
00221 if (s < end) {
00222 register unsigned short x= *((const uint16_t *)s);
00223 *((uint16_t *)d) = (x&0x7FFF) + (x&0x7FE0);
00224 }
00225 }
00226
/**
 * Convert 16-bit pixels by moving bits 6..15 of every word down by one bit
 * while keeping bits 0..4 in place; bit 5 of the source is dropped.
 *
 * The scalar tail computes ((x >> 1) & 0x7FE0) | (x & 0x001F) per 16-bit
 * word. The MMX loop does the same shift/mask/or on 8 words at a time using
 * mask15rg and mask15b (presumably 0x7FE0 and 0x001F per word -- TODO confirm).
 *
 * @param src      source buffer of 16-bit pixels
 * @param dst      destination buffer, same size as the source
 * @param src_size size of the source buffer in bytes
 */
00227 static inline void RENAME(rgb16to15)(const uint8_t *src, uint8_t *dst, int src_size)
00228 {
00229 register const uint8_t* s=src;
00230 register uint8_t* d=dst;
00231 register const uint8_t *end;
00232 const uint8_t *mm_end;
00233 end = s + src_size;
00234 __asm__ volatile(PREFETCH" %0"::"m"(*s));
00235 __asm__ volatile("movq %0, %%mm7"::"m"(mask15rg));
00236 __asm__ volatile("movq %0, %%mm6"::"m"(mask15b));
/* The MMX loop handles 16 bytes (8 pixels) per iteration. */
00237 mm_end = end - 15;
00238 while (s<mm_end) {
00239 __asm__ volatile(
00240 PREFETCH" 32(%1) \n\t"
00241 "movq (%1), %%mm0 \n\t"
00242 "movq 8(%1), %%mm2 \n\t"
00243 "movq %%mm0, %%mm1 \n\t"
00244 "movq %%mm2, %%mm3 \n\t"
00245 "psrlq $1, %%mm0 \n\t"
00246 "psrlq $1, %%mm2 \n\t"
00247 "pand %%mm7, %%mm0 \n\t"
00248 "pand %%mm7, %%mm2 \n\t"
00249 "pand %%mm6, %%mm1 \n\t"
00250 "pand %%mm6, %%mm3 \n\t"
00251 "por %%mm1, %%mm0 \n\t"
00252 "por %%mm3, %%mm2 \n\t"
00253 MOVNTQ" %%mm0, (%0) \n\t"
00254 MOVNTQ" %%mm2, 8(%0)"
00255 :: "r"(d), "r"(s)
/* The asm loads from s and stores to d through plain register operands, so
 * the "memory" clobber is required to tell the compiler that memory is read
 * and written here. */
00256 : "memory");
00257 d+=16;
00258 s+=16;
00259 }
00260 __asm__ volatile(SFENCE:::"memory");
00261 __asm__ volatile(EMMS:::"memory");
/* Scalar tail, two pixels (one 32-bit word) at a time. */
00262 mm_end = end - 3;
00263 while (s < mm_end) {
00264 register uint32_t x= *((const uint32_t*)s);
00265 *((uint32_t *)d) = ((x>>1)&0x7FE07FE0) | (x&0x001F001F);
00266 s+=4;
00267 d+=4;
00268 }
/* At most one 16-bit pixel is left. */
00269 if (s < end) {
00270 register uint16_t x= *((const uint16_t*)s);
00271 *((uint16_t *)d) = ((x>>1)&0x7FE0) | (x&0x001F);
00272 }
00273 }
00274
/**
 * Convert 4-byte pixels to 16-bit pixels.
 *
 * Per the scalar tail, with the source read as a 32-bit word:
 * bits 3..7 go to output bits 0..4, bits 10..15 go to output bits 5..10 and
 * bits 19..23 go to output bits 11..15. Bits 24..31 are ignored.
 * The MMX loop converts 4 pixels (16 source bytes -> 8 output bytes) per
 * iteration using mask3216g, mask3216br and mul3216.
 *
 * @param src      source buffer of 4-byte pixels
 * @param dst      destination buffer, receives 2 bytes per source pixel
 * @param src_size size of the source buffer in bytes
 */
00275 static inline void RENAME(rgb32to16)(const uint8_t *src, uint8_t *dst, int src_size)
00276 {
00277 const uint8_t *s = src;
00278 const uint8_t *end;
00279 const uint8_t *mm_end;
00280 uint16_t *d = (uint16_t *)dst;
00281 end = s + src_size;
00282 mm_end = end - 15;
/* The whole MMX loop lives inside one asm statement: it jumps to label 2 to
 * test s against mm_end first, and label 1 is the loop body. d and s are
 * read-write operands, so they hold the advanced pointers afterwards. */
00283 __asm__ volatile(
00284 "movq %3, %%mm5 \n\t"
00285 "movq %4, %%mm6 \n\t"
00286 "movq %5, %%mm7 \n\t"
00287 "jmp 2f \n\t"
00288 ".p2align 4 \n\t"
00289 "1: \n\t"
00290 PREFETCH" 32(%1) \n\t"
00291 "movd (%1), %%mm0 \n\t"
00292 "movd 4(%1), %%mm3 \n\t"
00293 "punpckldq 8(%1), %%mm0 \n\t"
00294 "punpckldq 12(%1), %%mm3 \n\t"
00295 "movq %%mm0, %%mm1 \n\t"
00296 "movq %%mm3, %%mm4 \n\t"
00297 "pand %%mm6, %%mm0 \n\t"
00298 "pand %%mm6, %%mm3 \n\t"
00299 "pmaddwd %%mm7, %%mm0 \n\t"
00300 "pmaddwd %%mm7, %%mm3 \n\t"
00301 "pand %%mm5, %%mm1 \n\t"
00302 "pand %%mm5, %%mm4 \n\t"
00303 "por %%mm1, %%mm0 \n\t"
00304 "por %%mm4, %%mm3 \n\t"
00305 "psrld $5, %%mm0 \n\t"
00306 "pslld $11, %%mm3 \n\t"
00307 "por %%mm3, %%mm0 \n\t"
00308 MOVNTQ" %%mm0, (%0) \n\t"
00309 "add $16, %1 \n\t"
00310 "add $8, %0 \n\t"
00311 "2: \n\t"
00312 "cmp %2, %1 \n\t"
00313 " jb 1b \n\t"
00314 : "+r" (d), "+r"(s)
00315 : "r" (mm_end), "m" (mask3216g), "m" (mask3216br), "m" (mul3216)
/* The loop loads from s and stores to d through register operands, so the
 * "memory" clobber is required to tell the compiler that memory is read and
 * written here. */
00316 : "memory");
00317 __asm__ volatile(SFENCE:::"memory");
00318 __asm__ volatile(EMMS:::"memory");
/* Scalar tail, one pixel at a time. */
00319 while (s < end) {
00320 register int rgb = *(const uint32_t*)s; s += 4;
00321 *d++ = ((rgb&0xFF)>>3) + ((rgb&0xFC00)>>5) + ((rgb&0xF80000)>>8);
00322 }
00323 }
00324
/**
 * Convert 4-byte pixels to 16-bit pixels with the outer components swapped.
 *
 * Per the scalar tail, with the source read as a 32-bit word:
 * bits 3..7 go to output bits 11..15, bits 10..15 go to output bits 5..10 and
 * bits 19..23 go to output bits 0..4. Bits 24..31 are ignored.
 *
 * @param src      source buffer of 4-byte pixels
 * @param dst      destination buffer, receives 2 bytes per source pixel
 * @param src_size size of the source buffer in bytes
 */
00325 static inline void RENAME(rgb32tobgr16)(const uint8_t *src, uint8_t *dst, int src_size)
00326 {
00327 const uint8_t *s = src;
00328 const uint8_t *end;
00329 const uint8_t *mm_end;
00330 uint16_t *d = (uint16_t *)dst;
00331 end = s + src_size;
00332 __asm__ volatile(PREFETCH" %0"::"m"(*src):"memory");
/* mm7 and mm6 keep the two masks for the whole loop; the third mask
 * (blue_16mask) is used straight from memory as operand %2. */
00333 __asm__ volatile(
00334 "movq %0, %%mm7 \n\t"
00335 "movq %1, %%mm6 \n\t"
00336 ::"m"(red_16mask),"m"(green_16mask));
/* The MMX loop converts 4 pixels (16 source bytes -> 8 output bytes). */
00337 mm_end = end - 15;
00338 while (s < mm_end) {
00339 __asm__ volatile(
00340 PREFETCH" 32(%1) \n\t"
00341 "movd (%1), %%mm0 \n\t"
00342 "movd 4(%1), %%mm3 \n\t"
00343 "punpckldq 8(%1), %%mm0 \n\t"
00344 "punpckldq 12(%1), %%mm3 \n\t"
00345 "movq %%mm0, %%mm1 \n\t"
00346 "movq %%mm0, %%mm2 \n\t"
00347 "movq %%mm3, %%mm4 \n\t"
00348 "movq %%mm3, %%mm5 \n\t"
00349 "psllq $8, %%mm0 \n\t"
00350 "psllq $8, %%mm3 \n\t"
00351 "pand %%mm7, %%mm0 \n\t"
00352 "pand %%mm7, %%mm3 \n\t"
00353 "psrlq $5, %%mm1 \n\t"
00354 "psrlq $5, %%mm4 \n\t"
00355 "pand %%mm6, %%mm1 \n\t"
00356 "pand %%mm6, %%mm4 \n\t"
00357 "psrlq $19, %%mm2 \n\t"
00358 "psrlq $19, %%mm5 \n\t"
00359 "pand %2, %%mm2 \n\t"
00360 "pand %2, %%mm5 \n\t"
00361 "por %%mm1, %%mm0 \n\t"
00362 "por %%mm4, %%mm3 \n\t"
00363 "por %%mm2, %%mm0 \n\t"
00364 "por %%mm5, %%mm3 \n\t"
00365 "psllq $16, %%mm3 \n\t"
00366 "por %%mm3, %%mm0 \n\t"
00367 MOVNTQ" %%mm0, (%0) \n\t"
00368 :: "r"(d),"r"(s),"m"(blue_16mask):"memory");
00369 d += 4;
00370 s += 16;
00371 }
00372 __asm__ volatile(SFENCE:::"memory");
00373 __asm__ volatile(EMMS:::"memory");
/* Scalar tail, one pixel at a time. */
00374 while (s < end) {
00375 register int rgb = *(const uint32_t*)s; s += 4;
00376 *d++ = ((rgb&0xF8)<<8) + ((rgb&0xFC00)>>5) + ((rgb&0xF80000)>>19);
00377 }
00378 }
00379
/**
 * Convert 4-byte pixels to 15-bit pixels stored in 16-bit words.
 *
 * Per the scalar tail, with the source read as a 32-bit word:
 * bits 3..7 go to output bits 0..4, bits 11..15 go to output bits 5..9 and
 * bits 19..23 go to output bits 10..14. Bits 24..31 are ignored.
 * The MMX loop converts 4 pixels (16 source bytes -> 8 output bytes) per
 * iteration using mask3215g, mask3216br and mul3215.
 *
 * @param src      source buffer of 4-byte pixels
 * @param dst      destination buffer, receives 2 bytes per source pixel
 * @param src_size size of the source buffer in bytes
 */
00380 static inline void RENAME(rgb32to15)(const uint8_t *src, uint8_t *dst, int src_size)
00381 {
00382 const uint8_t *s = src;
00383 const uint8_t *end;
00384 const uint8_t *mm_end;
00385 uint16_t *d = (uint16_t *)dst;
00386 end = s + src_size;
00387 mm_end = end - 15;
/* The whole MMX loop lives inside one asm statement: it jumps to label 2 to
 * test s against mm_end first, and label 1 is the loop body. d and s are
 * read-write operands, so they hold the advanced pointers afterwards. */
00388 __asm__ volatile(
00389 "movq %3, %%mm5 \n\t"
00390 "movq %4, %%mm6 \n\t"
00391 "movq %5, %%mm7 \n\t"
00392 "jmp 2f \n\t"
00393 ".p2align 4 \n\t"
00394 "1: \n\t"
00395 PREFETCH" 32(%1) \n\t"
00396 "movd (%1), %%mm0 \n\t"
00397 "movd 4(%1), %%mm3 \n\t"
00398 "punpckldq 8(%1), %%mm0 \n\t"
00399 "punpckldq 12(%1), %%mm3 \n\t"
00400 "movq %%mm0, %%mm1 \n\t"
00401 "movq %%mm3, %%mm4 \n\t"
00402 "pand %%mm6, %%mm0 \n\t"
00403 "pand %%mm6, %%mm3 \n\t"
00404 "pmaddwd %%mm7, %%mm0 \n\t"
00405 "pmaddwd %%mm7, %%mm3 \n\t"
00406 "pand %%mm5, %%mm1 \n\t"
00407 "pand %%mm5, %%mm4 \n\t"
00408 "por %%mm1, %%mm0 \n\t"
00409 "por %%mm4, %%mm3 \n\t"
00410 "psrld $6, %%mm0 \n\t"
00411 "pslld $10, %%mm3 \n\t"
00412 "por %%mm3, %%mm0 \n\t"
00413 MOVNTQ" %%mm0, (%0) \n\t"
00414 "add $16, %1 \n\t"
00415 "add $8, %0 \n\t"
00416 "2: \n\t"
00417 "cmp %2, %1 \n\t"
00418 " jb 1b \n\t"
00419 : "+r" (d), "+r"(s)
00420 : "r" (mm_end), "m" (mask3215g), "m" (mask3216br), "m" (mul3215)
/* The loop loads from s and stores to d through register operands, so the
 * "memory" clobber is required to tell the compiler that memory is read and
 * written here. */
00421 : "memory");
00422 __asm__ volatile(SFENCE:::"memory");
00423 __asm__ volatile(EMMS:::"memory");
/* Scalar tail, one pixel at a time. */
00424 while (s < end) {
00425 register int rgb = *(const uint32_t*)s; s += 4;
00426 *d++ = ((rgb&0xFF)>>3) + ((rgb&0xF800)>>6) + ((rgb&0xF80000)>>9);
00427 }
00428 }
00429
/**
 * Convert 4-byte pixels to 15-bit pixels with the outer components swapped.
 *
 * Per the scalar tail, with the source read as a 32-bit word:
 * bits 3..7 go to output bits 10..14, bits 11..15 go to output bits 5..9 and
 * bits 19..23 go to output bits 0..4. Bits 24..31 are ignored.
 *
 * @param src      source buffer of 4-byte pixels
 * @param dst      destination buffer, receives 2 bytes per source pixel
 * @param src_size size of the source buffer in bytes
 */
00430 static inline void RENAME(rgb32tobgr15)(const uint8_t *src, uint8_t *dst, int src_size)
00431 {
00432 const uint8_t *s = src;
00433 const uint8_t *end;
00434 const uint8_t *mm_end;
00435 uint16_t *d = (uint16_t *)dst;
00436 end = s + src_size;
00437 __asm__ volatile(PREFETCH" %0"::"m"(*src):"memory");
/* mm7 and mm6 keep the two masks for the whole loop; the third mask
 * (blue_15mask) is used straight from memory as operand %2. */
00438 __asm__ volatile(
00439 "movq %0, %%mm7 \n\t"
00440 "movq %1, %%mm6 \n\t"
00441 ::"m"(red_15mask),"m"(green_15mask));
/* The MMX loop converts 4 pixels (16 source bytes -> 8 output bytes). */
00442 mm_end = end - 15;
00443 while (s < mm_end) {
00444 __asm__ volatile(
00445 PREFETCH" 32(%1) \n\t"
00446 "movd (%1), %%mm0 \n\t"
00447 "movd 4(%1), %%mm3 \n\t"
00448 "punpckldq 8(%1), %%mm0 \n\t"
00449 "punpckldq 12(%1), %%mm3 \n\t"
00450 "movq %%mm0, %%mm1 \n\t"
00451 "movq %%mm0, %%mm2 \n\t"
00452 "movq %%mm3, %%mm4 \n\t"
00453 "movq %%mm3, %%mm5 \n\t"
00454 "psllq $7, %%mm0 \n\t"
00455 "psllq $7, %%mm3 \n\t"
00456 "pand %%mm7, %%mm0 \n\t"
00457 "pand %%mm7, %%mm3 \n\t"
00458 "psrlq $6, %%mm1 \n\t"
00459 "psrlq $6, %%mm4 \n\t"
00460 "pand %%mm6, %%mm1 \n\t"
00461 "pand %%mm6, %%mm4 \n\t"
00462 "psrlq $19, %%mm2 \n\t"
00463 "psrlq $19, %%mm5 \n\t"
00464 "pand %2, %%mm2 \n\t"
00465 "pand %2, %%mm5 \n\t"
00466 "por %%mm1, %%mm0 \n\t"
00467 "por %%mm4, %%mm3 \n\t"
00468 "por %%mm2, %%mm0 \n\t"
00469 "por %%mm5, %%mm3 \n\t"
00470 "psllq $16, %%mm3 \n\t"
00471 "por %%mm3, %%mm0 \n\t"
00472 MOVNTQ" %%mm0, (%0) \n\t"
00473 ::"r"(d),"r"(s),"m"(blue_15mask):"memory");
00474 d += 4;
00475 s += 16;
00476 }
00477 __asm__ volatile(SFENCE:::"memory");
00478 __asm__ volatile(EMMS:::"memory");
/* Scalar tail, one pixel at a time. */
00479 while (s < end) {
00480 register int rgb = *(const uint32_t*)s; s += 4;
00481 *d++ = ((rgb&0xF8)<<7) + ((rgb&0xF800)>>6) + ((rgb&0xF80000)>>19);
00482 }
00483 }
00484
/**
 * Convert 3-byte pixels to 16-bit pixels.
 *
 * Per the scalar tail: the top 5 bits of the first source byte go to output
 * bits 0..4, the top 6 bits of the second byte to bits 5..10 and the top
 * 5 bits of the third byte to bits 11..15.
 *
 * @param src      source buffer of 3-byte pixels
 * @param dst      destination buffer, receives 2 bytes per source pixel
 * @param src_size size of the source buffer in bytes
 */
00485 static inline void RENAME(rgb24tobgr16)(const uint8_t *src, uint8_t *dst, int src_size)
00486 {
00487 const uint8_t *s = src;
00488 const uint8_t *end;
00489 const uint8_t *mm_end;
00490 uint16_t *d = (uint16_t *)dst;
00491 end = s + src_size;
00492 __asm__ volatile(PREFETCH" %0"::"m"(*src):"memory");
00493 __asm__ volatile(
00494 "movq %0, %%mm7 \n\t"
00495 "movq %1, %%mm6 \n\t"
00496 ::"m"(red_16mask),"m"(green_16mask));
/* The MMX loop runs while at least 12 source bytes (4 pixels) remain.
 * NOTE(review): the 4-byte load at 9(%1) covers bytes 9..12, one byte past
 * the 12 consumed per iteration -- confirm buffers are padded. */
00497 mm_end = end - 11;
00498 while (s < mm_end) {
00499 __asm__ volatile(
00500 PREFETCH" 32(%1) \n\t"
00501 "movd (%1), %%mm0 \n\t"
00502 "movd 3(%1), %%mm3 \n\t"
00503 "punpckldq 6(%1), %%mm0 \n\t"
00504 "punpckldq 9(%1), %%mm3 \n\t"
00505 "movq %%mm0, %%mm1 \n\t"
00506 "movq %%mm0, %%mm2 \n\t"
00507 "movq %%mm3, %%mm4 \n\t"
00508 "movq %%mm3, %%mm5 \n\t"
00509 "psrlq $3, %%mm0 \n\t"
00510 "psrlq $3, %%mm3 \n\t"
00511 "pand %2, %%mm0 \n\t"
00512 "pand %2, %%mm3 \n\t"
00513 "psrlq $5, %%mm1 \n\t"
00514 "psrlq $5, %%mm4 \n\t"
00515 "pand %%mm6, %%mm1 \n\t"
00516 "pand %%mm6, %%mm4 \n\t"
00517 "psrlq $8, %%mm2 \n\t"
00518 "psrlq $8, %%mm5 \n\t"
00519 "pand %%mm7, %%mm2 \n\t"
00520 "pand %%mm7, %%mm5 \n\t"
00521 "por %%mm1, %%mm0 \n\t"
00522 "por %%mm4, %%mm3 \n\t"
00523 "por %%mm2, %%mm0 \n\t"
00524 "por %%mm5, %%mm3 \n\t"
00525 "psllq $16, %%mm3 \n\t"
00526 "por %%mm3, %%mm0 \n\t"
00527 MOVNTQ" %%mm0, (%0) \n\t"
00528 ::"r"(d),"r"(s),"m"(blue_16mask):"memory");
00529 d += 4;
00530 s += 12;
00531 }
00532 __asm__ volatile(SFENCE:::"memory");
00533 __asm__ volatile(EMMS:::"memory");
/* Scalar tail, one pixel at a time. */
00534 while (s < end) {
00535 const int b = *s++;
00536 const int g = *s++;
00537 const int r = *s++;
00538 *d++ = (b>>3) | ((g&0xFC)<<3) | ((r&0xF8)<<8);
00539 }
00540 }
00541
/**
 * Convert 3-byte pixels to 16-bit pixels.
 *
 * Per the scalar tail: the top 5 bits of the first source byte go to output
 * bits 11..15, the top 6 bits of the second byte to bits 5..10 and the top
 * 5 bits of the third byte to bits 0..4.
 *
 * @param src      source buffer of 3-byte pixels
 * @param dst      destination buffer, receives 2 bytes per source pixel
 * @param src_size size of the source buffer in bytes
 */
00542 static inline void RENAME(rgb24to16)(const uint8_t *src, uint8_t *dst, int src_size)
00543 {
00544 const uint8_t *s = src;
00545 const uint8_t *end;
00546 const uint8_t *mm_end;
00547 uint16_t *d = (uint16_t *)dst;
00548 end = s + src_size;
00549 __asm__ volatile(PREFETCH" %0"::"m"(*src):"memory");
00550 __asm__ volatile(
00551 "movq %0, %%mm7 \n\t"
00552 "movq %1, %%mm6 \n\t"
00553 ::"m"(red_16mask),"m"(green_16mask));
/* The loop consumes 12 bytes (4 pixels) per iteration but only runs while at
 * least 16 bytes remain, so its 4-byte loads at offsets 0, 3, 6 and 9 stay
 * inside the buffer. */
00554 mm_end = end - 15;
00555 while (s < mm_end) {
00556 __asm__ volatile(
00557 PREFETCH" 32(%1) \n\t"
00558 "movd (%1), %%mm0 \n\t"
00559 "movd 3(%1), %%mm3 \n\t"
00560 "punpckldq 6(%1), %%mm0 \n\t"
00561 "punpckldq 9(%1), %%mm3 \n\t"
00562 "movq %%mm0, %%mm1 \n\t"
00563 "movq %%mm0, %%mm2 \n\t"
00564 "movq %%mm3, %%mm4 \n\t"
00565 "movq %%mm3, %%mm5 \n\t"
00566 "psllq $8, %%mm0 \n\t"
00567 "psllq $8, %%mm3 \n\t"
00568 "pand %%mm7, %%mm0 \n\t"
00569 "pand %%mm7, %%mm3 \n\t"
00570 "psrlq $5, %%mm1 \n\t"
00571 "psrlq $5, %%mm4 \n\t"
00572 "pand %%mm6, %%mm1 \n\t"
00573 "pand %%mm6, %%mm4 \n\t"
00574 "psrlq $19, %%mm2 \n\t"
00575 "psrlq $19, %%mm5 \n\t"
00576 "pand %2, %%mm2 \n\t"
00577 "pand %2, %%mm5 \n\t"
00578 "por %%mm1, %%mm0 \n\t"
00579 "por %%mm4, %%mm3 \n\t"
00580 "por %%mm2, %%mm0 \n\t"
00581 "por %%mm5, %%mm3 \n\t"
00582 "psllq $16, %%mm3 \n\t"
00583 "por %%mm3, %%mm0 \n\t"
00584 MOVNTQ" %%mm0, (%0) \n\t"
00585 ::"r"(d),"r"(s),"m"(blue_16mask):"memory");
00586 d += 4;
00587 s += 12;
00588 }
00589 __asm__ volatile(SFENCE:::"memory");
00590 __asm__ volatile(EMMS:::"memory");
/* Scalar tail, one pixel at a time. */
00591 while (s < end) {
00592 const int r = *s++;
00593 const int g = *s++;
00594 const int b = *s++;
00595 *d++ = (b>>3) | ((g&0xFC)<<3) | ((r&0xF8)<<8);
00596 }
00597 }
00598
/**
 * Convert 3-byte pixels to 15-bit pixels stored in 16-bit words.
 *
 * Per the scalar tail: the top 5 bits of the first source byte go to output
 * bits 0..4, the top 5 bits of the second byte to bits 5..9 and the top
 * 5 bits of the third byte to bits 10..14.
 *
 * @param src      source buffer of 3-byte pixels
 * @param dst      destination buffer, receives 2 bytes per source pixel
 * @param src_size size of the source buffer in bytes
 */
00599 static inline void RENAME(rgb24tobgr15)(const uint8_t *src, uint8_t *dst, int src_size)
00600 {
00601 const uint8_t *s = src;
00602 const uint8_t *end;
00603 const uint8_t *mm_end;
00604 uint16_t *d = (uint16_t *)dst;
00605 end = s + src_size;
00606 __asm__ volatile(PREFETCH" %0"::"m"(*src):"memory");
00607 __asm__ volatile(
00608 "movq %0, %%mm7 \n\t"
00609 "movq %1, %%mm6 \n\t"
00610 ::"m"(red_15mask),"m"(green_15mask));
/* The MMX loop runs while at least 12 source bytes (4 pixels) remain.
 * NOTE(review): the 4-byte load at 9(%1) covers bytes 9..12, one byte past
 * the 12 consumed per iteration -- confirm buffers are padded. */
00611 mm_end = end - 11;
00612 while (s < mm_end) {
00613 __asm__ volatile(
00614 PREFETCH" 32(%1) \n\t"
00615 "movd (%1), %%mm0 \n\t"
00616 "movd 3(%1), %%mm3 \n\t"
00617 "punpckldq 6(%1), %%mm0 \n\t"
00618 "punpckldq 9(%1), %%mm3 \n\t"
00619 "movq %%mm0, %%mm1 \n\t"
00620 "movq %%mm0, %%mm2 \n\t"
00621 "movq %%mm3, %%mm4 \n\t"
00622 "movq %%mm3, %%mm5 \n\t"
00623 "psrlq $3, %%mm0 \n\t"
00624 "psrlq $3, %%mm3 \n\t"
00625 "pand %2, %%mm0 \n\t"
00626 "pand %2, %%mm3 \n\t"
00627 "psrlq $6, %%mm1 \n\t"
00628 "psrlq $6, %%mm4 \n\t"
00629 "pand %%mm6, %%mm1 \n\t"
00630 "pand %%mm6, %%mm4 \n\t"
00631 "psrlq $9, %%mm2 \n\t"
00632 "psrlq $9, %%mm5 \n\t"
00633 "pand %%mm7, %%mm2 \n\t"
00634 "pand %%mm7, %%mm5 \n\t"
00635 "por %%mm1, %%mm0 \n\t"
00636 "por %%mm4, %%mm3 \n\t"
00637 "por %%mm2, %%mm0 \n\t"
00638 "por %%mm5, %%mm3 \n\t"
00639 "psllq $16, %%mm3 \n\t"
00640 "por %%mm3, %%mm0 \n\t"
00641 MOVNTQ" %%mm0, (%0) \n\t"
00642 ::"r"(d),"r"(s),"m"(blue_15mask):"memory");
00643 d += 4;
00644 s += 12;
00645 }
00646 __asm__ volatile(SFENCE:::"memory");
00647 __asm__ volatile(EMMS:::"memory");
/* Scalar tail, one pixel at a time. */
00648 while (s < end) {
00649 const int b = *s++;
00650 const int g = *s++;
00651 const int r = *s++;
00652 *d++ = (b>>3) | ((g&0xF8)<<2) | ((r&0xF8)<<7);
00653 }
00654 }
00655
/**
 * Convert 3-byte pixels to 15-bit pixels stored in 16-bit words.
 *
 * Per the scalar tail: the top 5 bits of the first source byte go to output
 * bits 10..14, the top 5 bits of the second byte to bits 5..9 and the top
 * 5 bits of the third byte to bits 0..4.
 *
 * @param src      source buffer of 3-byte pixels
 * @param dst      destination buffer, receives 2 bytes per source pixel
 * @param src_size size of the source buffer in bytes
 */
00656 static inline void RENAME(rgb24to15)(const uint8_t *src, uint8_t *dst, int src_size)
00657 {
00658 const uint8_t *s = src;
00659 const uint8_t *end;
00660 const uint8_t *mm_end;
00661 uint16_t *d = (uint16_t *)dst;
00662 end = s + src_size;
00663 __asm__ volatile(PREFETCH" %0"::"m"(*src):"memory");
00664 __asm__ volatile(
00665 "movq %0, %%mm7 \n\t"
00666 "movq %1, %%mm6 \n\t"
00667 ::"m"(red_15mask),"m"(green_15mask));
/* The loop consumes 12 bytes (4 pixels) per iteration but only runs while at
 * least 16 bytes remain, so its 4-byte loads at offsets 0, 3, 6 and 9 stay
 * inside the buffer. */
00668 mm_end = end - 15;
00669 while (s < mm_end) {
00670 __asm__ volatile(
00671 PREFETCH" 32(%1) \n\t"
00672 "movd (%1), %%mm0 \n\t"
00673 "movd 3(%1), %%mm3 \n\t"
00674 "punpckldq 6(%1), %%mm0 \n\t"
00675 "punpckldq 9(%1), %%mm3 \n\t"
00676 "movq %%mm0, %%mm1 \n\t"
00677 "movq %%mm0, %%mm2 \n\t"
00678 "movq %%mm3, %%mm4 \n\t"
00679 "movq %%mm3, %%mm5 \n\t"
00680 "psllq $7, %%mm0 \n\t"
00681 "psllq $7, %%mm3 \n\t"
00682 "pand %%mm7, %%mm0 \n\t"
00683 "pand %%mm7, %%mm3 \n\t"
00684 "psrlq $6, %%mm1 \n\t"
00685 "psrlq $6, %%mm4 \n\t"
00686 "pand %%mm6, %%mm1 \n\t"
00687 "pand %%mm6, %%mm4 \n\t"
00688 "psrlq $19, %%mm2 \n\t"
00689 "psrlq $19, %%mm5 \n\t"
00690 "pand %2, %%mm2 \n\t"
00691 "pand %2, %%mm5 \n\t"
00692 "por %%mm1, %%mm0 \n\t"
00693 "por %%mm4, %%mm3 \n\t"
00694 "por %%mm2, %%mm0 \n\t"
00695 "por %%mm5, %%mm3 \n\t"
00696 "psllq $16, %%mm3 \n\t"
00697 "por %%mm3, %%mm0 \n\t"
00698 MOVNTQ" %%mm0, (%0) \n\t"
00699 ::"r"(d),"r"(s),"m"(blue_15mask):"memory");
00700 d += 4;
00701 s += 12;
00702 }
00703 __asm__ volatile(SFENCE:::"memory");
00704 __asm__ volatile(EMMS:::"memory");
/* Scalar tail, one pixel at a time. */
00705 while (s < end) {
00706 const int r = *s++;
00707 const int g = *s++;
00708 const int b = *s++;
00709 *d++ = (b>>3) | ((g&0xF8)<<2) | ((r&0xF8)<<7);
00710 }
00711 }
00712
/**
 * Expand 15-bit pixels (three 5-bit fields in a 16-bit word) to 3-byte pixels.
 *
 * Per the scalar tail: bits 0..4 become the first output byte, bits 5..9 the
 * second and bits 10..14 the third. Each 5-bit field is widened to 8 bits by
 * placing it in the top bits and repeating its top 3 bits in the low bits.
 *
 * @param src      source buffer of 16-bit pixels
 * @param dst      destination buffer, receives 3 bytes per source pixel
 * @param src_size size of the source buffer in bytes
 */
00713 static inline void RENAME(rgb15tobgr24)(const uint8_t *src, uint8_t *dst, int src_size)
00714 {
00715 const uint16_t *end;
00716 const uint16_t *mm_end;
00717 uint8_t *d = dst;
00718 const uint16_t *s = (const uint16_t*)src;
/* end and mm_end are in units of 16-bit pixels, not bytes. */
00719 end = s + src_size/2;
00720 __asm__ volatile(PREFETCH" %0"::"m"(*s):"memory");
/* The MMX loop converts 8 pixels (16 source bytes -> 24 output bytes). */
00721 mm_end = end - 7;
00722 while (s < mm_end) {
/* First asm statement: split pixels 0..3 into their three fields, scale
 * them with pmulhw and interleave them with zero words (mmx_null), then
 * park the result in mm6/mm7 and repeat for pixels 4..7, leaving those in
 * mm0/mm3. The second asm statement relies on these register contents. */
00723 __asm__ volatile(
00724 PREFETCH" 32(%1) \n\t"
00725 "movq (%1), %%mm0 \n\t"
00726 "movq (%1), %%mm1 \n\t"
00727 "movq (%1), %%mm2 \n\t"
00728 "pand %2, %%mm0 \n\t"
00729 "pand %3, %%mm1 \n\t"
00730 "pand %4, %%mm2 \n\t"
00731 "psllq $5, %%mm0 \n\t"
00732 "pmulhw "MANGLE(mul15_mid)", %%mm0 \n\t"
00733 "pmulhw "MANGLE(mul15_mid)", %%mm1 \n\t"
00734 "pmulhw "MANGLE(mul15_hi)", %%mm2 \n\t"
00735 "movq %%mm0, %%mm3 \n\t"
00736 "movq %%mm1, %%mm4 \n\t"
00737 "movq %%mm2, %%mm5 \n\t"
00738 "punpcklwd %5, %%mm0 \n\t"
00739 "punpcklwd %5, %%mm1 \n\t"
00740 "punpcklwd %5, %%mm2 \n\t"
00741 "punpckhwd %5, %%mm3 \n\t"
00742 "punpckhwd %5, %%mm4 \n\t"
00743 "punpckhwd %5, %%mm5 \n\t"
00744 "psllq $8, %%mm1 \n\t"
00745 "psllq $16, %%mm2 \n\t"
00746 "por %%mm1, %%mm0 \n\t"
00747 "por %%mm2, %%mm0 \n\t"
00748 "psllq $8, %%mm4 \n\t"
00749 "psllq $16, %%mm5 \n\t"
00750 "por %%mm4, %%mm3 \n\t"
00751 "por %%mm5, %%mm3 \n\t"
00752
00753 "movq %%mm0, %%mm6 \n\t"
00754 "movq %%mm3, %%mm7 \n\t"
00755
00756 "movq 8(%1), %%mm0 \n\t"
00757 "movq 8(%1), %%mm1 \n\t"
00758 "movq 8(%1), %%mm2 \n\t"
00759 "pand %2, %%mm0 \n\t"
00760 "pand %3, %%mm1 \n\t"
00761 "pand %4, %%mm2 \n\t"
00762 "psllq $5, %%mm0 \n\t"
00763 "pmulhw "MANGLE(mul15_mid)", %%mm0 \n\t"
00764 "pmulhw "MANGLE(mul15_mid)", %%mm1 \n\t"
00765 "pmulhw "MANGLE(mul15_hi)", %%mm2 \n\t"
00766 "movq %%mm0, %%mm3 \n\t"
00767 "movq %%mm1, %%mm4 \n\t"
00768 "movq %%mm2, %%mm5 \n\t"
00769 "punpcklwd %5, %%mm0 \n\t"
00770 "punpcklwd %5, %%mm1 \n\t"
00771 "punpcklwd %5, %%mm2 \n\t"
00772 "punpckhwd %5, %%mm3 \n\t"
00773 "punpckhwd %5, %%mm4 \n\t"
00774 "punpckhwd %5, %%mm5 \n\t"
00775 "psllq $8, %%mm1 \n\t"
00776 "psllq $16, %%mm2 \n\t"
00777 "por %%mm1, %%mm0 \n\t"
00778 "por %%mm2, %%mm0 \n\t"
00779 "psllq $8, %%mm4 \n\t"
00780 "psllq $16, %%mm5 \n\t"
00781 "por %%mm4, %%mm3 \n\t"
00782 "por %%mm5, %%mm3 \n\t"
00783
00784 :"=m"(*d)
00785 :"r"(s),"m"(mask15b),"m"(mask15g),"m"(mask15r), "m"(mmx_null)
00786 :"memory");
00787
/* Second asm statement: rearrange the four intermediate registers into the
 * layout STORE_BGR24_MMX expects (data in mm0/mm1/mm4/mm5, copies in
 * mm2/mm3/mm6/mm7) and store 24 bytes at d. */
00788 __asm__ volatile(
00789 "movq %%mm0, %%mm4 \n\t"
00790 "movq %%mm3, %%mm5 \n\t"
00791 "movq %%mm6, %%mm0 \n\t"
00792 "movq %%mm7, %%mm1 \n\t"
00793
00794 "movq %%mm4, %%mm6 \n\t"
00795 "movq %%mm5, %%mm7 \n\t"
00796 "movq %%mm0, %%mm2 \n\t"
00797 "movq %%mm1, %%mm3 \n\t"
00798
00799 STORE_BGR24_MMX
00800
00801 :: "r"(d), "m"(*s)
00802 :"memory");
00803 d += 24;
00804 s += 8;
00805 }
00806 __asm__ volatile(SFENCE:::"memory");
00807 __asm__ volatile(EMMS:::"memory");
/* Scalar tail, one pixel at a time. */
00808 while (s < end) {
00809 register uint16_t bgr;
00810 bgr = *s++;
00811 *d++ = ((bgr&0x1F)<<3) | ((bgr&0x1F)>>2);
00812 *d++ = ((bgr&0x3E0)>>2) | ((bgr&0x3E0)>>7);
00813 *d++ = ((bgr&0x7C00)>>7) | ((bgr&0x7C00)>>12);
00814 }
00815 }
00816
/**
 * Expand 16-bit pixels (5-, 6- and 5-bit fields) to 3-byte pixels.
 *
 * Per the scalar tail: bits 0..4 become the first output byte, bits 5..10 the
 * second and bits 11..15 the third. Each field is widened to 8 bits by
 * placing it in the top bits and repeating its own top bits in the low bits.
 *
 * @param src      source buffer of 16-bit pixels
 * @param dst      destination buffer, receives 3 bytes per source pixel
 * @param src_size size of the source buffer in bytes
 */
00817 static inline void RENAME(rgb16tobgr24)(const uint8_t *src, uint8_t *dst, int src_size)
00818 {
00819 const uint16_t *end;
00820 const uint16_t *mm_end;
00821 uint8_t *d = (uint8_t *)dst;
00822 const uint16_t *s = (const uint16_t *)src;
/* end and mm_end are in units of 16-bit pixels, not bytes. */
00823 end = s + src_size/2;
00824 __asm__ volatile(PREFETCH" %0"::"m"(*s):"memory");
/* The MMX loop converts 8 pixels (16 source bytes -> 24 output bytes). */
00825 mm_end = end - 7;
00826 while (s < mm_end) {
/* First asm statement: split pixels 0..3 into their three fields, scale
 * them with pmulhw and interleave them with zero words (mmx_null), then
 * park the result in mm6/mm7 and repeat for pixels 4..7, leaving those in
 * mm0/mm3. The second asm statement relies on these register contents. */
00827 __asm__ volatile(
00828 PREFETCH" 32(%1) \n\t"
00829 "movq (%1), %%mm0 \n\t"
00830 "movq (%1), %%mm1 \n\t"
00831 "movq (%1), %%mm2 \n\t"
00832 "pand %2, %%mm0 \n\t"
00833 "pand %3, %%mm1 \n\t"
00834 "pand %4, %%mm2 \n\t"
00835 "psllq $5, %%mm0 \n\t"
00836 "psrlq $1, %%mm2 \n\t"
00837 "pmulhw "MANGLE(mul15_mid)", %%mm0 \n\t"
00838 "pmulhw "MANGLE(mul16_mid)", %%mm1 \n\t"
00839 "pmulhw "MANGLE(mul15_hi)", %%mm2 \n\t"
00840 "movq %%mm0, %%mm3 \n\t"
00841 "movq %%mm1, %%mm4 \n\t"
00842 "movq %%mm2, %%mm5 \n\t"
00843 "punpcklwd %5, %%mm0 \n\t"
00844 "punpcklwd %5, %%mm1 \n\t"
00845 "punpcklwd %5, %%mm2 \n\t"
00846 "punpckhwd %5, %%mm3 \n\t"
00847 "punpckhwd %5, %%mm4 \n\t"
00848 "punpckhwd %5, %%mm5 \n\t"
00849 "psllq $8, %%mm1 \n\t"
00850 "psllq $16, %%mm2 \n\t"
00851 "por %%mm1, %%mm0 \n\t"
00852 "por %%mm2, %%mm0 \n\t"
00853 "psllq $8, %%mm4 \n\t"
00854 "psllq $16, %%mm5 \n\t"
00855 "por %%mm4, %%mm3 \n\t"
00856 "por %%mm5, %%mm3 \n\t"
00857
00858 "movq %%mm0, %%mm6 \n\t"
00859 "movq %%mm3, %%mm7 \n\t"
00860
00861 "movq 8(%1), %%mm0 \n\t"
00862 "movq 8(%1), %%mm1 \n\t"
00863 "movq 8(%1), %%mm2 \n\t"
00864 "pand %2, %%mm0 \n\t"
00865 "pand %3, %%mm1 \n\t"
00866 "pand %4, %%mm2 \n\t"
00867 "psllq $5, %%mm0 \n\t"
00868 "psrlq $1, %%mm2 \n\t"
00869 "pmulhw "MANGLE(mul15_mid)", %%mm0 \n\t"
00870 "pmulhw "MANGLE(mul16_mid)", %%mm1 \n\t"
00871 "pmulhw "MANGLE(mul15_hi)", %%mm2 \n\t"
00872 "movq %%mm0, %%mm3 \n\t"
00873 "movq %%mm1, %%mm4 \n\t"
00874 "movq %%mm2, %%mm5 \n\t"
00875 "punpcklwd %5, %%mm0 \n\t"
00876 "punpcklwd %5, %%mm1 \n\t"
00877 "punpcklwd %5, %%mm2 \n\t"
00878 "punpckhwd %5, %%mm3 \n\t"
00879 "punpckhwd %5, %%mm4 \n\t"
00880 "punpckhwd %5, %%mm5 \n\t"
00881 "psllq $8, %%mm1 \n\t"
00882 "psllq $16, %%mm2 \n\t"
00883 "por %%mm1, %%mm0 \n\t"
00884 "por %%mm2, %%mm0 \n\t"
00885 "psllq $8, %%mm4 \n\t"
00886 "psllq $16, %%mm5 \n\t"
00887 "por %%mm4, %%mm3 \n\t"
00888 "por %%mm5, %%mm3 \n\t"
00889 :"=m"(*d)
00890 :"r"(s),"m"(mask16b),"m"(mask16g),"m"(mask16r),"m"(mmx_null)
00891 :"memory");
00892
/* Second asm statement: rearrange the four intermediate registers into the
 * layout STORE_BGR24_MMX expects (data in mm0/mm1/mm4/mm5, copies in
 * mm2/mm3/mm6/mm7) and store 24 bytes at d. */
00893 __asm__ volatile(
00894 "movq %%mm0, %%mm4 \n\t"
00895 "movq %%mm3, %%mm5 \n\t"
00896 "movq %%mm6, %%mm0 \n\t"
00897 "movq %%mm7, %%mm1 \n\t"
00898
00899 "movq %%mm4, %%mm6 \n\t"
00900 "movq %%mm5, %%mm7 \n\t"
00901 "movq %%mm0, %%mm2 \n\t"
00902 "movq %%mm1, %%mm3 \n\t"
00903
00904 STORE_BGR24_MMX
00905
00906 :: "r"(d), "m"(*s)
00907 :"memory");
00908 d += 24;
00909 s += 8;
00910 }
00911 __asm__ volatile(SFENCE:::"memory");
00912 __asm__ volatile(EMMS:::"memory");
/* Scalar tail, one pixel at a time. */
00913 while (s < end) {
00914 register uint16_t bgr;
00915 bgr = *s++;
00916 *d++ = ((bgr&0x1F)<<3) | ((bgr&0x1F)>>2);
00917 *d++ = ((bgr&0x7E0)>>3) | ((bgr&0x7E0)>>9);
00918 *d++ = ((bgr&0xF800)>>8) | ((bgr&0xF800)>>13);
00919 }
00920 }
00921
00922
00923
00924
00925
00926
00927
00928
/*
 * PACK_RGB32: asm fragment that interleaves three per-component word vectors
 * (mm0, mm1, mm2) plus the bytes of mm6 into four 4-byte pixels and stores
 * 16 bytes at (%0) with two MOVNTQ stores.
 *
 * Expects mm7 to be all zeros and mm6 to supply the fourth byte of each pixel
 * (the functions in this file that use it load mm7 with pxor and mm6 with
 * pcmpeqd, i.e. all ones). mm3 is used as scratch.
 */
00929 #define PACK_RGB32 \
00930 "packuswb %%mm7, %%mm0 \n\t" \
00931 "packuswb %%mm7, %%mm1 \n\t" \
00932 "packuswb %%mm7, %%mm2 \n\t" \
00933 "punpcklbw %%mm1, %%mm0 \n\t" \
00934 "punpcklbw %%mm6, %%mm2 \n\t" \
00935 "movq %%mm0, %%mm3 \n\t" \
00936 "punpcklwd %%mm2, %%mm0 \n\t" \
00937 "punpckhwd %%mm2, %%mm3 \n\t" \
00938 MOVNTQ" %%mm0, (%0) \n\t" \
00939 MOVNTQ" %%mm3, 8(%0) \n\t" \
00940
/**
 * Expand 15-bit pixels (three 5-bit fields in a 16-bit word) to 4-byte pixels.
 *
 * Per the scalar tail: bits 0..4 become the first output byte, bits 5..9 the
 * second, bits 10..14 the third, and the fourth byte is set to 255. Each
 * 5-bit field is widened to 8 bits by placing it in the top bits and
 * repeating its top 3 bits in the low bits.
 *
 * @param src      source buffer of 16-bit pixels
 * @param dst      destination buffer, receives 4 bytes per source pixel
 * @param src_size size of the source buffer in bytes
 */
00941 static inline void RENAME(rgb15to32)(const uint8_t *src, uint8_t *dst, int src_size)
00942 {
00943 const uint16_t *end;
00944 const uint16_t *mm_end;
00945 uint8_t *d = dst;
00946 const uint16_t *s = (const uint16_t *)src;
/* end and mm_end are in units of 16-bit pixels, not bytes. */
00947 end = s + src_size/2;
00948 __asm__ volatile(PREFETCH" %0"::"m"(*s):"memory");
/* PACK_RGB32 needs mm7 = all zeros and mm6 = all ones. */
00949 __asm__ volatile("pxor %%mm7,%%mm7 \n\t":::"memory");
00950 __asm__ volatile("pcmpeqd %%mm6,%%mm6 \n\t":::"memory");
/* The MMX loop converts 4 pixels (8 source bytes -> 16 output bytes). */
00951 mm_end = end - 3;
00952 while (s < mm_end) {
00953 __asm__ volatile(
00954 PREFETCH" 32(%1) \n\t"
00955 "movq (%1), %%mm0 \n\t"
00956 "movq (%1), %%mm1 \n\t"
00957 "movq (%1), %%mm2 \n\t"
00958 "pand %2, %%mm0 \n\t"
00959 "pand %3, %%mm1 \n\t"
00960 "pand %4, %%mm2 \n\t"
00961 "psllq $5, %%mm0 \n\t"
00962 "pmulhw %5, %%mm0 \n\t"
00963 "pmulhw %5, %%mm1 \n\t"
00964 "pmulhw "MANGLE(mul15_hi)", %%mm2 \n\t"
00965 PACK_RGB32
00966 ::"r"(d),"r"(s),"m"(mask15b),"m"(mask15g),"m"(mask15r) ,"m"(mul15_mid)
00967 :"memory");
00968 d += 16;
00969 s += 4;
00970 }
00971 __asm__ volatile(SFENCE:::"memory");
00972 __asm__ volatile(EMMS:::"memory");
/* Scalar tail, one pixel at a time. */
00973 while (s < end) {
00974 register uint16_t bgr;
00975 bgr = *s++;
00976 *d++ = ((bgr&0x1F)<<3) | ((bgr&0x1F)>>2);
00977 *d++ = ((bgr&0x3E0)>>2) | ((bgr&0x3E0)>>7);
00978 *d++ = ((bgr&0x7C00)>>7) | ((bgr&0x7C00)>>12);
00979 *d++ = 255;
00980 }
00981 }
00982
/**
 * Expand 16-bit pixels (5-, 6- and 5-bit fields) to 4-byte pixels.
 *
 * Per the scalar tail: bits 0..4 become the first output byte, bits 5..10 the
 * second, bits 11..15 the third, and the fourth byte is set to 255. Each
 * field is widened to 8 bits by placing it in the top bits and repeating its
 * own top bits in the low bits.
 *
 * @param src      source buffer of 16-bit pixels
 * @param dst      destination buffer, receives 4 bytes per source pixel
 * @param src_size size of the source buffer in bytes
 */
00983 static inline void RENAME(rgb16to32)(const uint8_t *src, uint8_t *dst, int src_size)
00984 {
00985 const uint16_t *end;
00986 const uint16_t *mm_end;
00987 uint8_t *d = dst;
00988 const uint16_t *s = (const uint16_t*)src;
/* end and mm_end are in units of 16-bit pixels, not bytes. */
00989 end = s + src_size/2;
00990 __asm__ volatile(PREFETCH" %0"::"m"(*s):"memory");
/* PACK_RGB32 needs mm7 = all zeros and mm6 = all ones. */
00991 __asm__ volatile("pxor %%mm7,%%mm7 \n\t":::"memory");
00992 __asm__ volatile("pcmpeqd %%mm6,%%mm6 \n\t":::"memory");
/* The MMX loop converts 4 pixels (8 source bytes -> 16 output bytes). */
00993 mm_end = end - 3;
00994 while (s < mm_end) {
00995 __asm__ volatile(
00996 PREFETCH" 32(%1) \n\t"
00997 "movq (%1), %%mm0 \n\t"
00998 "movq (%1), %%mm1 \n\t"
00999 "movq (%1), %%mm2 \n\t"
01000 "pand %2, %%mm0 \n\t"
01001 "pand %3, %%mm1 \n\t"
01002 "pand %4, %%mm2 \n\t"
01003 "psllq $5, %%mm0 \n\t"
01004 "psrlq $1, %%mm2 \n\t"
01005 "pmulhw %5, %%mm0 \n\t"
01006 "pmulhw "MANGLE(mul16_mid)", %%mm1 \n\t"
01007 "pmulhw "MANGLE(mul15_hi)", %%mm2 \n\t"
01008 PACK_RGB32
01009 ::"r"(d),"r"(s),"m"(mask16b),"m"(mask16g),"m"(mask16r),"m"(mul15_mid)
01010 :"memory");
01011 d += 16;
01012 s += 4;
01013 }
01014 __asm__ volatile(SFENCE:::"memory");
01015 __asm__ volatile(EMMS:::"memory");
/* Scalar tail, one pixel at a time. */
01016 while (s < end) {
01017 register uint16_t bgr;
01018 bgr = *s++;
01019 *d++ = ((bgr&0x1F)<<3) | ((bgr&0x1F)>>2);
01020 *d++ = ((bgr&0x7E0)>>3) | ((bgr&0x7E0)>>9);
01021 *d++ = ((bgr&0xF800)>>8) | ((bgr&0xF800)>>13);
01022 *d++ = 255;
01023 }
01024 }
01025
/**
 * Swap bytes 0 and 2 of every 4-byte group, leaving bytes 1 and 3 in place
 * (byte order 0,1,2,3 becomes 2,1,0,3).
 *
 * @param src      source buffer of 4-byte groups
 * @param dst      destination buffer, same size as the source
 * @param src_size size of the source buffer in bytes
 */
01026 static inline void RENAME(shuffle_bytes_2103)(const uint8_t *src, uint8_t *dst, int src_size)
01027 {
/* idx runs from 15 - src_size upwards; s and d are biased by -idx so that
 * s[idx] and d[idx] initially address src[0] and dst[0]. The MMX loop runs
 * while idx is negative, i.e. while at least 16 bytes remain. */
01028 x86_reg idx = 15 - src_size;
01029 const uint8_t *s = src-idx;
01030 uint8_t *d = dst-idx;
01031 __asm__ volatile(
/* Skip the whole MMX part (including SFENCE/EMMS) when idx is not negative. */
01032 "test %0, %0 \n\t"
01033 "jns 2f \n\t"
01034 PREFETCH" (%1, %0) \n\t"
/* Build the two selection masks in mm6 and mm7 from mask32b, mask32r and
 * mmx_one. */
01035 "movq %3, %%mm7 \n\t"
01036 "pxor %4, %%mm7 \n\t"
01037 "movq %%mm7, %%mm6 \n\t"
01038 "pxor %5, %%mm7 \n\t"
01039 ".p2align 4 \n\t"
01040 "1: \n\t"
01041 PREFETCH" 32(%1, %0) \n\t"
01042 "movq (%1, %0), %%mm0 \n\t"
01043 "movq 8(%1, %0), %%mm1 \n\t"
/* MMXEXT swaps the 16-bit halves of each dword with pshufw; the plain MMX
 * variant does the same with a pair of 16-bit dword shifts. */
01044 # if COMPILE_TEMPLATE_MMXEXT
01045 "pshufw $177, %%mm0, %%mm3 \n\t"
01046 "pshufw $177, %%mm1, %%mm5 \n\t"
01047 "pand %%mm7, %%mm0 \n\t"
01048 "pand %%mm6, %%mm3 \n\t"
01049 "pand %%mm7, %%mm1 \n\t"
01050 "pand %%mm6, %%mm5 \n\t"
01051 "por %%mm3, %%mm0 \n\t"
01052 "por %%mm5, %%mm1 \n\t"
01053 # else
01054 "movq %%mm0, %%mm2 \n\t"
01055 "movq %%mm1, %%mm4 \n\t"
01056 "pand %%mm7, %%mm0 \n\t"
01057 "pand %%mm6, %%mm2 \n\t"
01058 "pand %%mm7, %%mm1 \n\t"
01059 "pand %%mm6, %%mm4 \n\t"
01060 "movq %%mm2, %%mm3 \n\t"
01061 "movq %%mm4, %%mm5 \n\t"
01062 "pslld $16, %%mm2 \n\t"
01063 "psrld $16, %%mm3 \n\t"
01064 "pslld $16, %%mm4 \n\t"
01065 "psrld $16, %%mm5 \n\t"
01066 "por %%mm2, %%mm0 \n\t"
01067 "por %%mm4, %%mm1 \n\t"
01068 "por %%mm3, %%mm0 \n\t"
01069 "por %%mm5, %%mm1 \n\t"
01070 # endif
01071 MOVNTQ" %%mm0, (%2, %0) \n\t"
01072 MOVNTQ" %%mm1, 8(%2, %0) \n\t"
01073 "add $16, %0 \n\t"
01074 "js 1b \n\t"
01075 SFENCE" \n\t"
01076 EMMS" \n\t"
01077 "2: \n\t"
01078 : "+&r"(idx)
01079 : "r" (s), "r" (d), "m" (mask32b), "m" (mask32r), "m" (mmx_one)
01080 : "memory");
/* Scalar tail, one 4-byte group at a time: g keeps bytes 1 and 3, v keeps
 * bytes 0 and 2, which are exchanged by the two 16-bit shifts. */
01081 for (; idx<15; idx+=4) {
01082 register int v = *(const uint32_t *)&s[idx], g = v & 0xff00ff00;
01083 v &= 0xff00ff;
01084 *(uint32_t *)&d[idx] = (v>>16) + g + (v<<16);
01085 }
01086 }
01087
/**
 * Swap the first and third byte of every 3-byte pixel; the middle byte stays
 * in place.
 *
 * @param src      source buffer of 3-byte pixels
 * @param dst      destination buffer, same size as the source
 * @param src_size size of the source buffer in bytes
 */
01088 static inline void RENAME(rgb24tobgr24)(const uint8_t *src, uint8_t *dst, int src_size)
01089 {
01090 unsigned i;
/* mmx_size is a negative-going index: the asm loop runs while it is below
 * zero, i.e. while at least 24 bytes remain, and the pointer operands are
 * biased by -mmx_size so that index mmx_size addresses src[0] / dst[0]. */
01091 x86_reg mmx_size= 23 - src_size;
01092 __asm__ volatile (
01093 "test %%"REG_a", %%"REG_a" \n\t"
01094 "jns 2f \n\t"
01095 "movq "MANGLE(mask24r)", %%mm5 \n\t"
01096 "movq "MANGLE(mask24g)", %%mm6 \n\t"
01097 "movq "MANGLE(mask24b)", %%mm7 \n\t"
01098 ".p2align 4 \n\t"
01099 "1: \n\t"
/* Each 8-byte output word is assembled from three overlapping 8-byte loads
 * (at -2, 0 and +2 relative to the output position, the first one done as a
 * load at 0 shifted left by 16 for the very first word), each masked with a
 * different one of the three masks.
 * NOTE(review): the load at 18(%1) covers bytes 18..25 of a 24-byte group,
 * i.e. up to two bytes past it -- confirm buffers are padded. */
01100 PREFETCH" 32(%1, %%"REG_a") \n\t"
01101 "movq (%1, %%"REG_a"), %%mm0 \n\t"
01102 "movq (%1, %%"REG_a"), %%mm1 \n\t"
01103 "movq 2(%1, %%"REG_a"), %%mm2 \n\t"
01104 "psllq $16, %%mm0 \n\t"
01105 "pand %%mm5, %%mm0 \n\t"
01106 "pand %%mm6, %%mm1 \n\t"
01107 "pand %%mm7, %%mm2 \n\t"
01108 "por %%mm0, %%mm1 \n\t"
01109 "por %%mm2, %%mm1 \n\t"
01110 "movq 6(%1, %%"REG_a"), %%mm0 \n\t"
01111 MOVNTQ" %%mm1, (%2, %%"REG_a") \n\t"
01112 "movq 8(%1, %%"REG_a"), %%mm1 \n\t"
01113 "movq 10(%1, %%"REG_a"), %%mm2 \n\t"
01114 "pand %%mm7, %%mm0 \n\t"
01115 "pand %%mm5, %%mm1 \n\t"
01116 "pand %%mm6, %%mm2 \n\t"
01117 "por %%mm0, %%mm1 \n\t"
01118 "por %%mm2, %%mm1 \n\t"
01119 "movq 14(%1, %%"REG_a"), %%mm0 \n\t"
01120 MOVNTQ" %%mm1, 8(%2, %%"REG_a") \n\t"
01121 "movq 16(%1, %%"REG_a"), %%mm1 \n\t"
01122 "movq 18(%1, %%"REG_a"), %%mm2 \n\t"
01123 "pand %%mm6, %%mm0 \n\t"
01124 "pand %%mm7, %%mm1 \n\t"
01125 "pand %%mm5, %%mm2 \n\t"
01126 "por %%mm0, %%mm1 \n\t"
01127 "por %%mm2, %%mm1 \n\t"
01128 MOVNTQ" %%mm1, 16(%2, %%"REG_a") \n\t"
01129 "add $24, %%"REG_a" \n\t"
01130 " js 1b \n\t"
01131 "2: \n\t"
01132 : "+a" (mmx_size)
01133 : "r" (src-mmx_size), "r"(dst-mmx_size)
/* The loop loads and stores through the pointer operands, so the "memory"
 * clobber is required to tell the compiler that memory is read and written
 * here. */
01134 : "memory");
01135
01136 __asm__ volatile(SFENCE:::"memory");
01137 __asm__ volatile(EMMS:::"memory");
01138
/* mmx_size == 23 means nothing is left for the scalar loop. */
01139 if (mmx_size==23) return;
01140
/* Re-point src/dst at the unprocessed tail: 23 - mmx_size bytes remain. */
01141 src+= src_size;
01142 dst+= src_size;
01143 src_size= 23-mmx_size;
01144 src-= src_size;
01145 dst-= src_size;
/* Scalar tail, one pixel at a time; x buffers src[i + 2] so the swap also
 * works when src and dst are the same buffer. */
01146 for (i=0; i<src_size; i+=3) {
01147 register uint8_t x;
01148 x = src[i + 2];
01149 dst[i + 1] = src[i + 1];
01150 dst[i + 2] = src[i + 0];
01151 dst[i + 0] = x;
01152 }
01153 }
01154
01155 static inline void RENAME(yuvPlanartoyuy2)(const uint8_t *ysrc, const uint8_t *usrc, const uint8_t *vsrc, uint8_t *dst,
01156 int width, int height,
01157 int lumStride, int chromStride, int dstStride, int vertLumPerChroma)
01158 {
01159 int y;
01160 const x86_reg chromWidth= width>>1;
01161 for (y=0; y<height; y++) {
01162
01163 __asm__ volatile(
01164 "xor %%"REG_a", %%"REG_a" \n\t"
01165 ".p2align 4 \n\t"
01166 "1: \n\t"
01167 PREFETCH" 32(%1, %%"REG_a", 2) \n\t"
01168 PREFETCH" 32(%2, %%"REG_a") \n\t"
01169 PREFETCH" 32(%3, %%"REG_a") \n\t"
01170 "movq (%2, %%"REG_a"), %%mm0 \n\t"
01171 "movq %%mm0, %%mm2 \n\t"
01172 "movq (%3, %%"REG_a"), %%mm1 \n\t"
01173 "punpcklbw %%mm1, %%mm0 \n\t"
01174 "punpckhbw %%mm1, %%mm2 \n\t"
01175
01176 "movq (%1, %%"REG_a",2), %%mm3 \n\t"
01177 "movq 8(%1, %%"REG_a",2), %%mm5 \n\t"
01178 "movq %%mm3, %%mm4 \n\t"
01179 "movq %%mm5, %%mm6 \n\t"
01180 "punpcklbw %%mm0, %%mm3 \n\t"
01181 "punpckhbw %%mm0, %%mm4 \n\t"
01182 "punpcklbw %%mm2, %%mm5 \n\t"
01183 "punpckhbw %%mm2, %%mm6 \n\t"
01184
01185 MOVNTQ" %%mm3, (%0, %%"REG_a", 4) \n\t"
01186 MOVNTQ" %%mm4, 8(%0, %%"REG_a", 4) \n\t"
01187 MOVNTQ" %%mm5, 16(%0, %%"REG_a", 4) \n\t"
01188 MOVNTQ" %%mm6, 24(%0, %%"REG_a", 4) \n\t"
01189
01190 "add $8, %%"REG_a" \n\t"
01191 "cmp %4, %%"REG_a" \n\t"
01192 " jb 1b \n\t"
01193 ::"r"(dst), "r"(ysrc), "r"(usrc), "r"(vsrc), "g" (chromWidth)
01194 : "%"REG_a
01195 );
01196 if ((y&(vertLumPerChroma-1)) == vertLumPerChroma-1) {
01197 usrc += chromStride;
01198 vsrc += chromStride;
01199 }
01200 ysrc += lumStride;
01201 dst += dstStride;
01202 }
01203 __asm__(EMMS" \n\t"
01204 SFENCE" \n\t"
01205 :::"memory");
01206 }
01207
/**
 * Planar-to-packed conversion in which one chroma row is shared by two luma
 * rows: forwards all arguments to yuvPlanartoyuy2() with
 * vertLumPerChroma = 2.
 */
01212 static inline void RENAME(yv12toyuy2)(const uint8_t *ysrc, const uint8_t *usrc, const uint8_t *vsrc, uint8_t *dst,
01213 int width, int height,
01214 int lumStride, int chromStride, int dstStride)
01215 {
01216
01217 RENAME(yuvPlanartoyuy2)(ysrc, usrc, vsrc, dst, width, height, lumStride, chromStride, dstStride, 2);
01218 }
01219
01220 static inline void RENAME(yuvPlanartouyvy)(const uint8_t *ysrc, const uint8_t *usrc, const uint8_t *vsrc, uint8_t *dst,
01221 int width, int height,
01222 int lumStride, int chromStride, int dstStride, int vertLumPerChroma)
01223 {
01224 int y;
01225 const x86_reg chromWidth= width>>1;
01226 for (y=0; y<height; y++) {
01227
01228 __asm__ volatile(
01229 "xor %%"REG_a", %%"REG_a" \n\t"
01230 ".p2align 4 \n\t"
01231 "1: \n\t"
01232 PREFETCH" 32(%1, %%"REG_a", 2) \n\t"
01233 PREFETCH" 32(%2, %%"REG_a") \n\t"
01234 PREFETCH" 32(%3, %%"REG_a") \n\t"
01235 "movq (%2, %%"REG_a"), %%mm0 \n\t"
01236 "movq %%mm0, %%mm2 \n\t"
01237 "movq (%3, %%"REG_a"), %%mm1 \n\t"
01238 "punpcklbw %%mm1, %%mm0 \n\t"
01239 "punpckhbw %%mm1, %%mm2 \n\t"
01240
01241 "movq (%1, %%"REG_a",2), %%mm3 \n\t"
01242 "movq 8(%1, %%"REG_a",2), %%mm5 \n\t"
01243 "movq %%mm0, %%mm4 \n\t"
01244 "movq %%mm2, %%mm6 \n\t"
01245 "punpcklbw %%mm3, %%mm0 \n\t"
01246 "punpckhbw %%mm3, %%mm4 \n\t"
01247 "punpcklbw %%mm5, %%mm2 \n\t"
01248 "punpckhbw %%mm5, %%mm6 \n\t"
01249
01250 MOVNTQ" %%mm0, (%0, %%"REG_a", 4) \n\t"
01251 MOVNTQ" %%mm4, 8(%0, %%"REG_a", 4) \n\t"
01252 MOVNTQ" %%mm2, 16(%0, %%"REG_a", 4) \n\t"
01253 MOVNTQ" %%mm6, 24(%0, %%"REG_a", 4) \n\t"
01254
01255 "add $8, %%"REG_a" \n\t"
01256 "cmp %4, %%"REG_a" \n\t"
01257 " jb 1b \n\t"
01258 ::"r"(dst), "r"(ysrc), "r"(usrc), "r"(vsrc), "g" (chromWidth)
01259 : "%"REG_a
01260 );
01261 if ((y&(vertLumPerChroma-1)) == vertLumPerChroma-1) {
01262 usrc += chromStride;
01263 vsrc += chromStride;
01264 }
01265 ysrc += lumStride;
01266 dst += dstStride;
01267 }
01268 __asm__(EMMS" \n\t"
01269 SFENCE" \n\t"
01270 :::"memory");
01271 }
01272
/**
 * Planar-to-packed conversion in which one chroma row is shared by two luma
 * rows: forwards all arguments to yuvPlanartouyvy() with
 * vertLumPerChroma = 2.
 */
01277 static inline void RENAME(yv12touyvy)(const uint8_t *ysrc, const uint8_t *usrc, const uint8_t *vsrc, uint8_t *dst,
01278 int width, int height,
01279 int lumStride, int chromStride, int dstStride)
01280 {
01281
01282 RENAME(yuvPlanartouyvy)(ysrc, usrc, vsrc, dst, width, height, lumStride, chromStride, dstStride, 2);
01283 }
01284
/**
 * Planar-to-packed conversion in which every luma row has its own chroma
 * row: forwards all arguments to yuvPlanartouyvy() with
 * vertLumPerChroma = 1.
 */
01288 static inline void RENAME(yuv422ptouyvy)(const uint8_t *ysrc, const uint8_t *usrc, const uint8_t *vsrc, uint8_t *dst,
01289 int width, int height,
01290 int lumStride, int chromStride, int dstStride)
01291 {
01292 RENAME(yuvPlanartouyvy)(ysrc, usrc, vsrc, dst, width, height, lumStride, chromStride, dstStride, 1);
01293 }
01294
/**
 * Planar-to-packed conversion in which every luma row has its own chroma
 * row: forwards all arguments to yuvPlanartoyuy2() with
 * vertLumPerChroma = 1.
 */
01298 static inline void RENAME(yuv422ptoyuy2)(const uint8_t *ysrc, const uint8_t *usrc, const uint8_t *vsrc, uint8_t *dst,
01299 int width, int height,
01300 int lumStride, int chromStride, int dstStride)
01301 {
01302 RENAME(yuvPlanartoyuy2)(ysrc, usrc, vsrc, dst, width, height, lumStride, chromStride, dstStride, 1);
01303 }
01304
/**
 * Split a packed source with byte order Y0 U Y1 V into separate Y, U and V
 * planes, producing one chroma row for every two luma rows.
 *
 * Two source rows are consumed per outer iteration: luma is extracted from
 * both, chroma only from the first (the chroma bytes of the second row are
 * dropped).
 *
 * Each inner iteration handles 32 source bytes (16 luma, 8 U, 8 V) and the
 * loop condition is tested after the body, so one iteration always runs.
 * Width is presumably expected to be a multiple of 16 and height a multiple
 * of 2 -- TODO confirm with callers.
 *
 * @param src         packed source
 * @param ydst        luma output plane
 * @param udst        output plane for the first chroma byte of each pair
 * @param vdst        output plane for the second chroma byte of each pair
 * @param width       width in luma samples; width>>1 chroma samples per row
 * @param height      number of source rows
 * @param lumStride   bytes added to ydst after each row
 * @param chromStride bytes added to udst/vdst after each row pair
 * @param srcStride   bytes added to src after each row
 */
01309 static inline void RENAME(yuy2toyv12)(const uint8_t *src, uint8_t *ydst, uint8_t *udst, uint8_t *vdst,
01310 int width, int height,
01311 int lumStride, int chromStride, int srcStride)
01312 {
01313 int y;
01314 const x86_reg chromWidth= width>>1;
01315 for (y=0; y<height; y+=2) {
/* First row of the pair: extract luma and chroma. */
01316 __asm__ volatile(
01317 "xor %%"REG_a", %%"REG_a" \n\t"
/* mm7 = 0x00FF in every 16-bit word (selects the even bytes) */
01318 "pcmpeqw %%mm7, %%mm7 \n\t"
01319 "psrlw $8, %%mm7 \n\t"
01320 ".p2align 4 \n\t"
01321 "1: \n\t"
01322 PREFETCH" 64(%0, %%"REG_a", 4) \n\t"
01323 "movq (%0, %%"REG_a", 4), %%mm0 \n\t"
01324 "movq 8(%0, %%"REG_a", 4), %%mm1 \n\t"
01325 "movq %%mm0, %%mm2 \n\t"
01326 "movq %%mm1, %%mm3 \n\t"
/* odd bytes (chroma) -> mm0, even bytes (luma) -> mm2 */
01327 "psrlw $8, %%mm0 \n\t"
01328 "psrlw $8, %%mm1 \n\t"
01329 "pand %%mm7, %%mm2 \n\t"
01330 "pand %%mm7, %%mm3 \n\t"
01331 "packuswb %%mm1, %%mm0 \n\t"
01332 "packuswb %%mm3, %%mm2 \n\t"
01333
01334 MOVNTQ" %%mm2, (%1, %%"REG_a", 2) \n\t"
01335
/* same split for source bytes 16..31: chroma -> mm1, luma -> mm3 */
01336 "movq 16(%0, %%"REG_a", 4), %%mm1 \n\t"
01337 "movq 24(%0, %%"REG_a", 4), %%mm2 \n\t"
01338 "movq %%mm1, %%mm3 \n\t"
01339 "movq %%mm2, %%mm4 \n\t"
01340 "psrlw $8, %%mm1 \n\t"
01341 "psrlw $8, %%mm2 \n\t"
01342 "pand %%mm7, %%mm3 \n\t"
01343 "pand %%mm7, %%mm4 \n\t"
01344 "packuswb %%mm2, %%mm1 \n\t"
01345 "packuswb %%mm4, %%mm3 \n\t"
01346
01347 MOVNTQ" %%mm3, 8(%1, %%"REG_a", 2) \n\t"
01348
/* split the 16 chroma bytes: odd ones -> mm0 (vdst), even ones -> mm2 (udst) */
01349 "movq %%mm0, %%mm2 \n\t"
01350 "movq %%mm1, %%mm3 \n\t"
01351 "psrlw $8, %%mm0 \n\t"
01352 "psrlw $8, %%mm1 \n\t"
01353 "pand %%mm7, %%mm2 \n\t"
01354 "pand %%mm7, %%mm3 \n\t"
01355 "packuswb %%mm1, %%mm0 \n\t"
01356 "packuswb %%mm3, %%mm2 \n\t"
01357
01358 MOVNTQ" %%mm0, (%3, %%"REG_a") \n\t"
01359 MOVNTQ" %%mm2, (%2, %%"REG_a") \n\t"
01360
01361 "add $8, %%"REG_a" \n\t"
01362 "cmp %4, %%"REG_a" \n\t"
01363 " jb 1b \n\t"
01364 ::"r"(src), "r"(ydst), "r"(udst), "r"(vdst), "g" (chromWidth)
01365 : "memory", "%"REG_a
01366 );
01367
01368 ydst += lumStride;
01369 src += srcStride;
01370
/* Second row of the pair: luma only. mm7 is not reloaded here, so this
 * statement relies on the mask left in mm7 by the previous asm statement. */
01371 __asm__ volatile(
01372 "xor %%"REG_a", %%"REG_a" \n\t"
01373 ".p2align 4 \n\t"
01374 "1: \n\t"
01375 PREFETCH" 64(%0, %%"REG_a", 4) \n\t"
01376 "movq (%0, %%"REG_a", 4), %%mm0 \n\t"
01377 "movq 8(%0, %%"REG_a", 4), %%mm1 \n\t"
01378 "movq 16(%0, %%"REG_a", 4), %%mm2 \n\t"
01379 "movq 24(%0, %%"REG_a", 4), %%mm3 \n\t"
01380 "pand %%mm7, %%mm0 \n\t"
01381 "pand %%mm7, %%mm1 \n\t"
01382 "pand %%mm7, %%mm2 \n\t"
01383 "pand %%mm7, %%mm3 \n\t"
01384 "packuswb %%mm1, %%mm0 \n\t"
01385 "packuswb %%mm3, %%mm2 \n\t"
01386
01387 MOVNTQ" %%mm0, (%1, %%"REG_a", 2) \n\t"
01388 MOVNTQ" %%mm2, 8(%1, %%"REG_a", 2) \n\t"
01389
01390 "add $8, %%"REG_a" \n\t"
01391 "cmp %4, %%"REG_a" \n\t"
01392 " jb 1b \n\t"
01393
01394 ::"r"(src), "r"(ydst), "r"(udst), "r"(vdst), "g" (chromWidth)
01395 : "memory", "%"REG_a
01396 );
01397 udst += chromStride;
01398 vdst += chromStride;
01399 ydst += lumStride;
01400 src += srcStride;
01401 }
01402 __asm__ volatile(EMMS" \n\t"
01403 SFENCE" \n\t"
01404 :::"memory");
01405 }
01406 #endif
01407
01408 #if COMPILE_TEMPLATE_MMXEXT || COMPILE_TEMPLATE_AMD3DNOW
01409 static inline void RENAME(planar2x)(const uint8_t *src, uint8_t *dst, int srcWidth, int srcHeight, int srcStride, int dstStride)
01410 {
01411 int x,y;
01412
01413 dst[0]= src[0];
01414
01415
01416 for (x=0; x<srcWidth-1; x++) {
01417 dst[2*x+1]= (3*src[x] + src[x+1])>>2;
01418 dst[2*x+2]= ( src[x] + 3*src[x+1])>>2;
01419 }
01420 dst[2*srcWidth-1]= src[srcWidth-1];
01421
01422 dst+= dstStride;
01423
01424 for (y=1; y<srcHeight; y++) {
01425 const x86_reg mmxSize= srcWidth&~15;
01426 __asm__ volatile(
01427 "mov %4, %%"REG_a" \n\t"
01428 "movq "MANGLE(mmx_ff)", %%mm0 \n\t"
01429 "movq (%0, %%"REG_a"), %%mm4 \n\t"
01430 "movq %%mm4, %%mm2 \n\t"
01431 "psllq $8, %%mm4 \n\t"
01432 "pand %%mm0, %%mm2 \n\t"
01433 "por %%mm2, %%mm4 \n\t"
01434 "movq (%1, %%"REG_a"), %%mm5 \n\t"
01435 "movq %%mm5, %%mm3 \n\t"
01436 "psllq $8, %%mm5 \n\t"
01437 "pand %%mm0, %%mm3 \n\t"
01438 "por %%mm3, %%mm5 \n\t"
01439 "1: \n\t"
01440 "movq (%0, %%"REG_a"), %%mm0 \n\t"
01441 "movq (%1, %%"REG_a"), %%mm1 \n\t"
01442 "movq 1(%0, %%"REG_a"), %%mm2 \n\t"
01443 "movq 1(%1, %%"REG_a"), %%mm3 \n\t"
01444 PAVGB" %%mm0, %%mm5 \n\t"
01445 PAVGB" %%mm0, %%mm3 \n\t"
01446 PAVGB" %%mm0, %%mm5 \n\t"
01447 PAVGB" %%mm0, %%mm3 \n\t"
01448 PAVGB" %%mm1, %%mm4 \n\t"
01449 PAVGB" %%mm1, %%mm2 \n\t"
01450 PAVGB" %%mm1, %%mm4 \n\t"
01451 PAVGB" %%mm1, %%mm2 \n\t"
01452 "movq %%mm5, %%mm7 \n\t"
01453 "movq %%mm4, %%mm6 \n\t"
01454 "punpcklbw %%mm3, %%mm5 \n\t"
01455 "punpckhbw %%mm3, %%mm7 \n\t"
01456 "punpcklbw %%mm2, %%mm4 \n\t"
01457 "punpckhbw %%mm2, %%mm6 \n\t"
01458 MOVNTQ" %%mm5, (%2, %%"REG_a", 2) \n\t"
01459 MOVNTQ" %%mm7, 8(%2, %%"REG_a", 2) \n\t"
01460 MOVNTQ" %%mm4, (%3, %%"REG_a", 2) \n\t"
01461 MOVNTQ" %%mm6, 8(%3, %%"REG_a", 2) \n\t"
01462 "add $8, %%"REG_a" \n\t"
01463 "movq -1(%0, %%"REG_a"), %%mm4 \n\t"
01464 "movq -1(%1, %%"REG_a"), %%mm5 \n\t"
01465 " js 1b \n\t"
01466 :: "r" (src + mmxSize ), "r" (src + srcStride + mmxSize ),
01467 "r" (dst + mmxSize*2), "r" (dst + dstStride + mmxSize*2),
01468 "g" (-mmxSize)
01469 : "%"REG_a
01470 );
01471
01472 for (x=mmxSize-1; x<srcWidth-1; x++) {
01473 dst[2*x +1]= (3*src[x+0] + src[x+srcStride+1])>>2;
01474 dst[2*x+dstStride+2]= ( src[x+0] + 3*src[x+srcStride+1])>>2;
01475 dst[2*x+dstStride+1]= ( src[x+1] + 3*src[x+srcStride ])>>2;
01476 dst[2*x +2]= (3*src[x+1] + src[x+srcStride ])>>2;
01477 }
01478 dst[srcWidth*2 -1 ]= (3*src[srcWidth-1] + src[srcWidth-1 + srcStride])>>2;
01479 dst[srcWidth*2 -1 + dstStride]= ( src[srcWidth-1] + 3*src[srcWidth-1 + srcStride])>>2;
01480
01481 dst+=dstStride*2;
01482 src+=srcStride;
01483 }
01484
01485
01486 dst[0]= src[0];
01487
01488 for (x=0; x<srcWidth-1; x++) {
01489 dst[2*x+1]= (3*src[x] + src[x+1])>>2;
01490 dst[2*x+2]= ( src[x] + 3*src[x+1])>>2;
01491 }
01492 dst[2*srcWidth-1]= src[srcWidth-1];
01493
01494 __asm__ volatile(EMMS" \n\t"
01495 SFENCE" \n\t"
01496 :::"memory");
01497 }
01498 #endif
01499
01500 #if !COMPILE_TEMPLATE_AMD3DNOW
01501
/**
 * Split a packed source with byte order U Y0 V Y1 into separate Y, U and V
 * planes, producing one chroma row for every two luma rows.
 *
 * Two source rows are consumed per outer iteration: luma is extracted from
 * both, chroma only from the first (the chroma bytes of the second row are
 * dropped).
 *
 * Each inner iteration handles 32 source bytes (16 luma, 8 U, 8 V) and the
 * loop condition is tested after the body, so one iteration always runs.
 * Width is presumably expected to be a multiple of 16 and height a multiple
 * of 2 -- TODO confirm with callers.
 *
 * @param src         packed source
 * @param ydst        luma output plane
 * @param udst        output plane for the first chroma byte of each pair
 * @param vdst        output plane for the second chroma byte of each pair
 * @param width       width in luma samples; width>>1 chroma samples per row
 * @param height      number of source rows
 * @param lumStride   bytes added to ydst after each row
 * @param chromStride bytes added to udst/vdst after each row pair
 * @param srcStride   bytes added to src after each row
 */
01507 static inline void RENAME(uyvytoyv12)(const uint8_t *src, uint8_t *ydst, uint8_t *udst, uint8_t *vdst,
01508 int width, int height,
01509 int lumStride, int chromStride, int srcStride)
01510 {
01511 int y;
01512 const x86_reg chromWidth= width>>1;
01513 for (y=0; y<height; y+=2) {
/* First row of the pair: extract luma and chroma. */
01514 __asm__ volatile(
01515 "xor %%"REG_a", %%"REG_a" \n\t"
/* mm7 = 0x00FF in every 16-bit word (selects the even bytes) */
01516 "pcmpeqw %%mm7, %%mm7 \n\t"
01517 "psrlw $8, %%mm7 \n\t"
01518 ".p2align 4 \n\t"
01519 "1: \n\t"
01520 PREFETCH" 64(%0, %%"REG_a", 4) \n\t"
01521 "movq (%0, %%"REG_a", 4), %%mm0 \n\t"
01522 "movq 8(%0, %%"REG_a", 4), %%mm1 \n\t"
01523 "movq %%mm0, %%mm2 \n\t"
01524 "movq %%mm1, %%mm3 \n\t"
/* even bytes (chroma) -> mm0, odd bytes (luma) -> mm2 */
01525 "pand %%mm7, %%mm0 \n\t"
01526 "pand %%mm7, %%mm1 \n\t"
01527 "psrlw $8, %%mm2 \n\t"
01528 "psrlw $8, %%mm3 \n\t"
01529 "packuswb %%mm1, %%mm0 \n\t"
01530 "packuswb %%mm3, %%mm2 \n\t"
01531
01532 MOVNTQ" %%mm2, (%1, %%"REG_a", 2) \n\t"
01533
/* same split for source bytes 16..31: chroma -> mm1, luma -> mm3 */
01534 "movq 16(%0, %%"REG_a", 4), %%mm1 \n\t"
01535 "movq 24(%0, %%"REG_a", 4), %%mm2 \n\t"
01536 "movq %%mm1, %%mm3 \n\t"
01537 "movq %%mm2, %%mm4 \n\t"
01538 "pand %%mm7, %%mm1 \n\t"
01539 "pand %%mm7, %%mm2 \n\t"
01540 "psrlw $8, %%mm3 \n\t"
01541 "psrlw $8, %%mm4 \n\t"
01542 "packuswb %%mm2, %%mm1 \n\t"
01543 "packuswb %%mm4, %%mm3 \n\t"
01544
01545 MOVNTQ" %%mm3, 8(%1, %%"REG_a", 2) \n\t"
01546
/* split the 16 chroma bytes: odd ones -> mm0 (vdst), even ones -> mm2 (udst) */
01547 "movq %%mm0, %%mm2 \n\t"
01548 "movq %%mm1, %%mm3 \n\t"
01549 "psrlw $8, %%mm0 \n\t"
01550 "psrlw $8, %%mm1 \n\t"
01551 "pand %%mm7, %%mm2 \n\t"
01552 "pand %%mm7, %%mm3 \n\t"
01553 "packuswb %%mm1, %%mm0 \n\t"
01554 "packuswb %%mm3, %%mm2 \n\t"
01555
01556 MOVNTQ" %%mm0, (%3, %%"REG_a") \n\t"
01557 MOVNTQ" %%mm2, (%2, %%"REG_a") \n\t"
01558
01559 "add $8, %%"REG_a" \n\t"
01560 "cmp %4, %%"REG_a" \n\t"
01561 " jb 1b \n\t"
01562 ::"r"(src), "r"(ydst), "r"(udst), "r"(vdst), "g" (chromWidth)
01563 : "memory", "%"REG_a
01564 );
01565
01566 ydst += lumStride;
01567 src += srcStride;
01568
/* Second row of the pair: luma only (odd bytes, obtained by shifting). */
01569 __asm__ volatile(
01570 "xor %%"REG_a", %%"REG_a" \n\t"
01571 ".p2align 4 \n\t"
01572 "1: \n\t"
01573 PREFETCH" 64(%0, %%"REG_a", 4) \n\t"
01574 "movq (%0, %%"REG_a", 4), %%mm0 \n\t"
01575 "movq 8(%0, %%"REG_a", 4), %%mm1 \n\t"
01576 "movq 16(%0, %%"REG_a", 4), %%mm2 \n\t"
01577 "movq 24(%0, %%"REG_a", 4), %%mm3 \n\t"
01578 "psrlw $8, %%mm0 \n\t"
01579 "psrlw $8, %%mm1 \n\t"
01580 "psrlw $8, %%mm2 \n\t"
01581 "psrlw $8, %%mm3 \n\t"
01582 "packuswb %%mm1, %%mm0 \n\t"
01583 "packuswb %%mm3, %%mm2 \n\t"
01584
01585 MOVNTQ" %%mm0, (%1, %%"REG_a", 2) \n\t"
01586 MOVNTQ" %%mm2, 8(%1, %%"REG_a", 2) \n\t"
01587
01588 "add $8, %%"REG_a" \n\t"
01589 "cmp %4, %%"REG_a" \n\t"
01590 " jb 1b \n\t"
01591
01592 ::"r"(src), "r"(ydst), "r"(udst), "r"(vdst), "g" (chromWidth)
01593 : "memory", "%"REG_a
01594 );
01595 udst += chromStride;
01596 vdst += chromStride;
01597 ydst += lumStride;
01598 src += srcStride;
01599 }
01600 __asm__ volatile(EMMS" \n\t"
01601 SFENCE" \n\t"
01602 :::"memory");
01603 }
01604 #endif
01605
01613 static inline void RENAME(rgb24toyv12)(const uint8_t *src, uint8_t *ydst, uint8_t *udst, uint8_t *vdst,
01614 int width, int height,
01615 int lumStride, int chromStride, int srcStride)
01616 {
01617 int y;
01618 const x86_reg chromWidth= width>>1;
01619 for (y=0; y<height-2; y+=2) {
01620 int i;
01621 for (i=0; i<2; i++) {
01622 __asm__ volatile(
01623 "mov %2, %%"REG_a" \n\t"
01624 "movq "MANGLE(ff_bgr2YCoeff)", %%mm6 \n\t"
01625 "movq "MANGLE(ff_w1111)", %%mm5 \n\t"
01626 "pxor %%mm7, %%mm7 \n\t"
01627 "lea (%%"REG_a", %%"REG_a", 2), %%"REG_d" \n\t"
01628 ".p2align 4 \n\t"
01629 "1: \n\t"
01630 PREFETCH" 64(%0, %%"REG_d") \n\t"
01631 "movd (%0, %%"REG_d"), %%mm0 \n\t"
01632 "movd 3(%0, %%"REG_d"), %%mm1 \n\t"
01633 "punpcklbw %%mm7, %%mm0 \n\t"
01634 "punpcklbw %%mm7, %%mm1 \n\t"
01635 "movd 6(%0, %%"REG_d"), %%mm2 \n\t"
01636 "movd 9(%0, %%"REG_d"), %%mm3 \n\t"
01637 "punpcklbw %%mm7, %%mm2 \n\t"
01638 "punpcklbw %%mm7, %%mm3 \n\t"
01639 "pmaddwd %%mm6, %%mm0 \n\t"
01640 "pmaddwd %%mm6, %%mm1 \n\t"
01641 "pmaddwd %%mm6, %%mm2 \n\t"
01642 "pmaddwd %%mm6, %%mm3 \n\t"
01643 #ifndef FAST_BGR2YV12
01644 "psrad $8, %%mm0 \n\t"
01645 "psrad $8, %%mm1 \n\t"
01646 "psrad $8, %%mm2 \n\t"
01647 "psrad $8, %%mm3 \n\t"
01648 #endif
01649 "packssdw %%mm1, %%mm0 \n\t"
01650 "packssdw %%mm3, %%mm2 \n\t"
01651 "pmaddwd %%mm5, %%mm0 \n\t"
01652 "pmaddwd %%mm5, %%mm2 \n\t"
01653 "packssdw %%mm2, %%mm0 \n\t"
01654 "psraw $7, %%mm0 \n\t"
01655
01656 "movd 12(%0, %%"REG_d"), %%mm4 \n\t"
01657 "movd 15(%0, %%"REG_d"), %%mm1 \n\t"
01658 "punpcklbw %%mm7, %%mm4 \n\t"
01659 "punpcklbw %%mm7, %%mm1 \n\t"
01660 "movd 18(%0, %%"REG_d"), %%mm2 \n\t"
01661 "movd 21(%0, %%"REG_d"), %%mm3 \n\t"
01662 "punpcklbw %%mm7, %%mm2 \n\t"
01663 "punpcklbw %%mm7, %%mm3 \n\t"
01664 "pmaddwd %%mm6, %%mm4 \n\t"
01665 "pmaddwd %%mm6, %%mm1 \n\t"
01666 "pmaddwd %%mm6, %%mm2 \n\t"
01667 "pmaddwd %%mm6, %%mm3 \n\t"
01668 #ifndef FAST_BGR2YV12
01669 "psrad $8, %%mm4 \n\t"
01670 "psrad $8, %%mm1 \n\t"
01671 "psrad $8, %%mm2 \n\t"
01672 "psrad $8, %%mm3 \n\t"
01673 #endif
01674 "packssdw %%mm1, %%mm4 \n\t"
01675 "packssdw %%mm3, %%mm2 \n\t"
01676 "pmaddwd %%mm5, %%mm4 \n\t"
01677 "pmaddwd %%mm5, %%mm2 \n\t"
01678 "add $24, %%"REG_d" \n\t"
01679 "packssdw %%mm2, %%mm4 \n\t"
01680 "psraw $7, %%mm4 \n\t"
01681
01682 "packuswb %%mm4, %%mm0 \n\t"
01683 "paddusb "MANGLE(ff_bgr2YOffset)", %%mm0 \n\t"
01684
01685 MOVNTQ" %%mm0, (%1, %%"REG_a") \n\t"
01686 "add $8, %%"REG_a" \n\t"
01687 " js 1b \n\t"
01688 : : "r" (src+width*3), "r" (ydst+width), "g" ((x86_reg)-width)
01689 : "%"REG_a, "%"REG_d
01690 );
01691 ydst += lumStride;
01692 src += srcStride;
01693 }
01694 src -= srcStride*2;
01695 __asm__ volatile(
01696 "mov %4, %%"REG_a" \n\t"
01697 "movq "MANGLE(ff_w1111)", %%mm5 \n\t"
01698 "movq "MANGLE(ff_bgr2UCoeff)", %%mm6 \n\t"
01699 "pxor %%mm7, %%mm7 \n\t"
01700 "lea (%%"REG_a", %%"REG_a", 2), %%"REG_d" \n\t"
01701 "add %%"REG_d", %%"REG_d" \n\t"
01702 ".p2align 4 \n\t"
01703 "1: \n\t"
01704 PREFETCH" 64(%0, %%"REG_d") \n\t"
01705 PREFETCH" 64(%1, %%"REG_d") \n\t"
01706 #if COMPILE_TEMPLATE_MMXEXT || COMPILE_TEMPLATE_AMD3DNOW
01707 "movq (%0, %%"REG_d"), %%mm0 \n\t"
01708 "movq (%1, %%"REG_d"), %%mm1 \n\t"
01709 "movq 6(%0, %%"REG_d"), %%mm2 \n\t"
01710 "movq 6(%1, %%"REG_d"), %%mm3 \n\t"
01711 PAVGB" %%mm1, %%mm0 \n\t"
01712 PAVGB" %%mm3, %%mm2 \n\t"
01713 "movq %%mm0, %%mm1 \n\t"
01714 "movq %%mm2, %%mm3 \n\t"
01715 "psrlq $24, %%mm0 \n\t"
01716 "psrlq $24, %%mm2 \n\t"
01717 PAVGB" %%mm1, %%mm0 \n\t"
01718 PAVGB" %%mm3, %%mm2 \n\t"
01719 "punpcklbw %%mm7, %%mm0 \n\t"
01720 "punpcklbw %%mm7, %%mm2 \n\t"
01721 #else
01722 "movd (%0, %%"REG_d"), %%mm0 \n\t"
01723 "movd (%1, %%"REG_d"), %%mm1 \n\t"
01724 "movd 3(%0, %%"REG_d"), %%mm2 \n\t"
01725 "movd 3(%1, %%"REG_d"), %%mm3 \n\t"
01726 "punpcklbw %%mm7, %%mm0 \n\t"
01727 "punpcklbw %%mm7, %%mm1 \n\t"
01728 "punpcklbw %%mm7, %%mm2 \n\t"
01729 "punpcklbw %%mm7, %%mm3 \n\t"
01730 "paddw %%mm1, %%mm0 \n\t"
01731 "paddw %%mm3, %%mm2 \n\t"
01732 "paddw %%mm2, %%mm0 \n\t"
01733 "movd 6(%0, %%"REG_d"), %%mm4 \n\t"
01734 "movd 6(%1, %%"REG_d"), %%mm1 \n\t"
01735 "movd 9(%0, %%"REG_d"), %%mm2 \n\t"
01736 "movd 9(%1, %%"REG_d"), %%mm3 \n\t"
01737 "punpcklbw %%mm7, %%mm4 \n\t"
01738 "punpcklbw %%mm7, %%mm1 \n\t"
01739 "punpcklbw %%mm7, %%mm2 \n\t"
01740 "punpcklbw %%mm7, %%mm3 \n\t"
01741 "paddw %%mm1, %%mm4 \n\t"
01742 "paddw %%mm3, %%mm2 \n\t"
01743 "paddw %%mm4, %%mm2 \n\t"
01744 "psrlw $2, %%mm0 \n\t"
01745 "psrlw $2, %%mm2 \n\t"
01746 #endif
01747 "movq "MANGLE(ff_bgr2VCoeff)", %%mm1 \n\t"
01748 "movq "MANGLE(ff_bgr2VCoeff)", %%mm3 \n\t"
01749
01750 "pmaddwd %%mm0, %%mm1 \n\t"
01751 "pmaddwd %%mm2, %%mm3 \n\t"
01752 "pmaddwd %%mm6, %%mm0 \n\t"
01753 "pmaddwd %%mm6, %%mm2 \n\t"
01754 #ifndef FAST_BGR2YV12
01755 "psrad $8, %%mm0 \n\t"
01756 "psrad $8, %%mm1 \n\t"
01757 "psrad $8, %%mm2 \n\t"
01758 "psrad $8, %%mm3 \n\t"
01759 #endif
01760 "packssdw %%mm2, %%mm0 \n\t"
01761 "packssdw %%mm3, %%mm1 \n\t"
01762 "pmaddwd %%mm5, %%mm0 \n\t"
01763 "pmaddwd %%mm5, %%mm1 \n\t"
01764 "packssdw %%mm1, %%mm0 \n\t"
01765 "psraw $7, %%mm0 \n\t"
01766
01767 #if COMPILE_TEMPLATE_MMXEXT || COMPILE_TEMPLATE_AMD3DNOW
01768 "movq 12(%0, %%"REG_d"), %%mm4 \n\t"
01769 "movq 12(%1, %%"REG_d"), %%mm1 \n\t"
01770 "movq 18(%0, %%"REG_d"), %%mm2 \n\t"
01771 "movq 18(%1, %%"REG_d"), %%mm3 \n\t"
01772 PAVGB" %%mm1, %%mm4 \n\t"
01773 PAVGB" %%mm3, %%mm2 \n\t"
01774 "movq %%mm4, %%mm1 \n\t"
01775 "movq %%mm2, %%mm3 \n\t"
01776 "psrlq $24, %%mm4 \n\t"
01777 "psrlq $24, %%mm2 \n\t"
01778 PAVGB" %%mm1, %%mm4 \n\t"
01779 PAVGB" %%mm3, %%mm2 \n\t"
01780 "punpcklbw %%mm7, %%mm4 \n\t"
01781 "punpcklbw %%mm7, %%mm2 \n\t"
01782 #else
01783 "movd 12(%0, %%"REG_d"), %%mm4 \n\t"
01784 "movd 12(%1, %%"REG_d"), %%mm1 \n\t"
01785 "movd 15(%0, %%"REG_d"), %%mm2 \n\t"
01786 "movd 15(%1, %%"REG_d"), %%mm3 \n\t"
01787 "punpcklbw %%mm7, %%mm4 \n\t"
01788 "punpcklbw %%mm7, %%mm1 \n\t"
01789 "punpcklbw %%mm7, %%mm2 \n\t"
01790 "punpcklbw %%mm7, %%mm3 \n\t"
01791 "paddw %%mm1, %%mm4 \n\t"
01792 "paddw %%mm3, %%mm2 \n\t"
01793 "paddw %%mm2, %%mm4 \n\t"
01794 "movd 18(%0, %%"REG_d"), %%mm5 \n\t"
01795 "movd 18(%1, %%"REG_d"), %%mm1 \n\t"
01796 "movd 21(%0, %%"REG_d"), %%mm2 \n\t"
01797 "movd 21(%1, %%"REG_d"), %%mm3 \n\t"
01798 "punpcklbw %%mm7, %%mm5 \n\t"
01799 "punpcklbw %%mm7, %%mm1 \n\t"
01800 "punpcklbw %%mm7, %%mm2 \n\t"
01801 "punpcklbw %%mm7, %%mm3 \n\t"
01802 "paddw %%mm1, %%mm5 \n\t"
01803 "paddw %%mm3, %%mm2 \n\t"
01804 "paddw %%mm5, %%mm2 \n\t"
01805 "movq "MANGLE(ff_w1111)", %%mm5 \n\t"
01806 "psrlw $2, %%mm4 \n\t"
01807 "psrlw $2, %%mm2 \n\t"
01808 #endif
01809 "movq "MANGLE(ff_bgr2VCoeff)", %%mm1 \n\t"
01810 "movq "MANGLE(ff_bgr2VCoeff)", %%mm3 \n\t"
01811
01812 "pmaddwd %%mm4, %%mm1 \n\t"
01813 "pmaddwd %%mm2, %%mm3 \n\t"
01814 "pmaddwd %%mm6, %%mm4 \n\t"
01815 "pmaddwd %%mm6, %%mm2 \n\t"
01816 #ifndef FAST_BGR2YV12
01817 "psrad $8, %%mm4 \n\t"
01818 "psrad $8, %%mm1 \n\t"
01819 "psrad $8, %%mm2 \n\t"
01820 "psrad $8, %%mm3 \n\t"
01821 #endif
01822 "packssdw %%mm2, %%mm4 \n\t"
01823 "packssdw %%mm3, %%mm1 \n\t"
01824 "pmaddwd %%mm5, %%mm4 \n\t"
01825 "pmaddwd %%mm5, %%mm1 \n\t"
01826 "add $24, %%"REG_d" \n\t"
01827 "packssdw %%mm1, %%mm4 \n\t"
01828 "psraw $7, %%mm4 \n\t"
01829
01830 "movq %%mm0, %%mm1 \n\t"
01831 "punpckldq %%mm4, %%mm0 \n\t"
01832 "punpckhdq %%mm4, %%mm1 \n\t"
01833 "packsswb %%mm1, %%mm0 \n\t"
01834 "paddb "MANGLE(ff_bgr2UVOffset)", %%mm0 \n\t"
01835 "movd %%mm0, (%2, %%"REG_a") \n\t"
01836 "punpckhdq %%mm0, %%mm0 \n\t"
01837 "movd %%mm0, (%3, %%"REG_a") \n\t"
01838 "add $4, %%"REG_a" \n\t"
01839 " js 1b \n\t"
01840 : : "r" (src+chromWidth*6), "r" (src+srcStride+chromWidth*6), "r" (udst+chromWidth), "r" (vdst+chromWidth), "g" (-chromWidth)
01841 : "%"REG_a, "%"REG_d
01842 );
01843
01844 udst += chromStride;
01845 vdst += chromStride;
01846 src += srcStride*2;
01847 }
01848
01849 __asm__ volatile(EMMS" \n\t"
01850 SFENCE" \n\t"
01851 :::"memory");
01852
01853 rgb24toyv12_c(src, ydst, udst, vdst, width, height-y, lumStride, chromStride, srcStride);
01854 }
01855 #endif
01856
01857 #if !COMPILE_TEMPLATE_AMD3DNOW
01858 static void RENAME(interleaveBytes)(const uint8_t *src1, const uint8_t *src2, uint8_t *dest,
01859 int width, int height, int src1Stride,
01860 int src2Stride, int dstStride)
01861 {
01862 int h;
01863
01864 for (h=0; h < height; h++) {
01865 int w;
01866
01867 #if COMPILE_TEMPLATE_SSE2
01868 __asm__(
01869 "xor %%"REG_a", %%"REG_a" \n\t"
01870 "1: \n\t"
01871 PREFETCH" 64(%1, %%"REG_a") \n\t"
01872 PREFETCH" 64(%2, %%"REG_a") \n\t"
01873 "movdqa (%1, %%"REG_a"), %%xmm0 \n\t"
01874 "movdqa (%1, %%"REG_a"), %%xmm1 \n\t"
01875 "movdqa (%2, %%"REG_a"), %%xmm2 \n\t"
01876 "punpcklbw %%xmm2, %%xmm0 \n\t"
01877 "punpckhbw %%xmm2, %%xmm1 \n\t"
01878 "movntdq %%xmm0, (%0, %%"REG_a", 2) \n\t"
01879 "movntdq %%xmm1, 16(%0, %%"REG_a", 2) \n\t"
01880 "add $16, %%"REG_a" \n\t"
01881 "cmp %3, %%"REG_a" \n\t"
01882 " jb 1b \n\t"
01883 ::"r"(dest), "r"(src1), "r"(src2), "r" ((x86_reg)width-15)
01884 : "memory", "%"REG_a""
01885 );
01886 #else
01887 __asm__(
01888 "xor %%"REG_a", %%"REG_a" \n\t"
01889 "1: \n\t"
01890 PREFETCH" 64(%1, %%"REG_a") \n\t"
01891 PREFETCH" 64(%2, %%"REG_a") \n\t"
01892 "movq (%1, %%"REG_a"), %%mm0 \n\t"
01893 "movq 8(%1, %%"REG_a"), %%mm2 \n\t"
01894 "movq %%mm0, %%mm1 \n\t"
01895 "movq %%mm2, %%mm3 \n\t"
01896 "movq (%2, %%"REG_a"), %%mm4 \n\t"
01897 "movq 8(%2, %%"REG_a"), %%mm5 \n\t"
01898 "punpcklbw %%mm4, %%mm0 \n\t"
01899 "punpckhbw %%mm4, %%mm1 \n\t"
01900 "punpcklbw %%mm5, %%mm2 \n\t"
01901 "punpckhbw %%mm5, %%mm3 \n\t"
01902 MOVNTQ" %%mm0, (%0, %%"REG_a", 2) \n\t"
01903 MOVNTQ" %%mm1, 8(%0, %%"REG_a", 2) \n\t"
01904 MOVNTQ" %%mm2, 16(%0, %%"REG_a", 2) \n\t"
01905 MOVNTQ" %%mm3, 24(%0, %%"REG_a", 2) \n\t"
01906 "add $16, %%"REG_a" \n\t"
01907 "cmp %3, %%"REG_a" \n\t"
01908 " jb 1b \n\t"
01909 ::"r"(dest), "r"(src1), "r"(src2), "r" ((x86_reg)width-15)
01910 : "memory", "%"REG_a
01911 );
01912 #endif
01913 for (w= (width&(~15)); w < width; w++) {
01914 dest[2*w+0] = src1[w];
01915 dest[2*w+1] = src2[w];
01916 }
01917 dest += dstStride;
01918 src1 += src1Stride;
01919 src2 += src2Stride;
01920 }
01921 __asm__(
01922 EMMS" \n\t"
01923 SFENCE" \n\t"
01924 ::: "memory"
01925 );
01926 }
01927 #endif
01928
01929 #if !COMPILE_TEMPLATE_SSE2
01930 #if !COMPILE_TEMPLATE_AMD3DNOW
/**
 * Enlarge two byte planes by a factor of two in both directions by sample
 * duplication.
 *
 * For each plane, output row y is built from source row y>>1 and every source
 * byte is written twice. h = height/2 output rows of 2*w = 2*(width/2) bytes
 * are produced per plane.
 *
 * @param src1       first source plane
 * @param src2       second source plane
 * @param dst1       destination for the enlarged first plane
 * @param dst2       destination for the enlarged second plane
 * @param width      width/2 source bytes are read per row
 * @param height     height/2 destination rows are written per plane
 * @param srcStride1 distance in bytes between rows of src1
 * @param srcStride2 distance in bytes between rows of src2
 * @param dstStride1 distance in bytes between rows of dst1
 * @param dstStride2 distance in bytes between rows of dst2
 */
01931 static inline void RENAME(vu9_to_vu12)(const uint8_t *src1, const uint8_t *src2,
01932 uint8_t *dst1, uint8_t *dst2,
01933 int width, int height,
01934 int srcStride1, int srcStride2,
01935 int dstStride1, int dstStride2)
01936 {
01937 x86_reg x, y;
01938 int w,h;
01939 w=width/2; h=height/2;
/* Prefetch the second row of both source planes. */
01940 __asm__ volatile(
01941 PREFETCH" %0 \n\t"
01942 PREFETCH" %1 \n\t"
01943 ::"m"(*(src1+srcStride1)),"m"(*(src2+srcStride2)):"memory");
/* First plane. */
01944 for (y=0;y<h;y++) {
01945 const uint8_t* s1=src1+srcStride1*(y>>1);
01946 uint8_t* d=dst1+dstStride1*y;
01947 x=0;
/* 32 source bytes -> 64 destination bytes per iteration */
01948 for (;x<w-31;x+=32) {
01949 __asm__ volatile(
01950 PREFETCH" 32(%1,%2) \n\t"
01951 "movq (%1,%2), %%mm0 \n\t"
01952 "movq 8(%1,%2), %%mm2 \n\t"
01953 "movq 16(%1,%2), %%mm4 \n\t"
01954 "movq 24(%1,%2), %%mm6 \n\t"
01955 "movq %%mm0, %%mm1 \n\t"
01956 "movq %%mm2, %%mm3 \n\t"
01957 "movq %%mm4, %%mm5 \n\t"
01958 "movq %%mm6, %%mm7 \n\t"
/* unpacking a register with itself duplicates every byte */
01959 "punpcklbw %%mm0, %%mm0 \n\t"
01960 "punpckhbw %%mm1, %%mm1 \n\t"
01961 "punpcklbw %%mm2, %%mm2 \n\t"
01962 "punpckhbw %%mm3, %%mm3 \n\t"
01963 "punpcklbw %%mm4, %%mm4 \n\t"
01964 "punpckhbw %%mm5, %%mm5 \n\t"
01965 "punpcklbw %%mm6, %%mm6 \n\t"
01966 "punpckhbw %%mm7, %%mm7 \n\t"
01967 MOVNTQ" %%mm0, (%0,%2,2) \n\t"
01968 MOVNTQ" %%mm1, 8(%0,%2,2) \n\t"
01969 MOVNTQ" %%mm2, 16(%0,%2,2) \n\t"
01970 MOVNTQ" %%mm3, 24(%0,%2,2) \n\t"
01971 MOVNTQ" %%mm4, 32(%0,%2,2) \n\t"
01972 MOVNTQ" %%mm5, 40(%0,%2,2) \n\t"
01973 MOVNTQ" %%mm6, 48(%0,%2,2) \n\t"
01974 MOVNTQ" %%mm7, 56(%0,%2,2)"
01975 :: "r"(d), "r"(s1), "r"(x)
01976 :"memory");
01977 }
/* scalar tail of the row */
01978 for (;x<w;x++) d[2*x]=d[2*x+1]=s1[x];
01979 }
/* Second plane, same procedure. */
01980 for (y=0;y<h;y++) {
01981 const uint8_t* s2=src2+srcStride2*(y>>1);
01982 uint8_t* d=dst2+dstStride2*y;
01983 x=0;
01984 for (;x<w-31;x+=32) {
01985 __asm__ volatile(
01986 PREFETCH" 32(%1,%2) \n\t"
01987 "movq (%1,%2), %%mm0 \n\t"
01988 "movq 8(%1,%2), %%mm2 \n\t"
01989 "movq 16(%1,%2), %%mm4 \n\t"
01990 "movq 24(%1,%2), %%mm6 \n\t"
01991 "movq %%mm0, %%mm1 \n\t"
01992 "movq %%mm2, %%mm3 \n\t"
01993 "movq %%mm4, %%mm5 \n\t"
01994 "movq %%mm6, %%mm7 \n\t"
01995 "punpcklbw %%mm0, %%mm0 \n\t"
01996 "punpckhbw %%mm1, %%mm1 \n\t"
01997 "punpcklbw %%mm2, %%mm2 \n\t"
01998 "punpckhbw %%mm3, %%mm3 \n\t"
01999 "punpcklbw %%mm4, %%mm4 \n\t"
02000 "punpckhbw %%mm5, %%mm5 \n\t"
02001 "punpcklbw %%mm6, %%mm6 \n\t"
02002 "punpckhbw %%mm7, %%mm7 \n\t"
02003 MOVNTQ" %%mm0, (%0,%2,2) \n\t"
02004 MOVNTQ" %%mm1, 8(%0,%2,2) \n\t"
02005 MOVNTQ" %%mm2, 16(%0,%2,2) \n\t"
02006 MOVNTQ" %%mm3, 24(%0,%2,2) \n\t"
02007 MOVNTQ" %%mm4, 32(%0,%2,2) \n\t"
02008 MOVNTQ" %%mm5, 40(%0,%2,2) \n\t"
02009 MOVNTQ" %%mm6, 48(%0,%2,2) \n\t"
02010 MOVNTQ" %%mm7, 56(%0,%2,2)"
02011 :: "r"(d), "r"(s2), "r"(x)
02012 :"memory");
02013 }
02014 for (;x<w;x++) d[2*x]=d[2*x+1]=s2[x];
02015 }
02016 __asm__(
02017 EMMS" \n\t"
02018 SFENCE" \n\t"
02019 ::: "memory"
02020 );
02021 }
02022
/**
 * Build packed output from three planes, emitting 8 bytes per index x:
 * four bytes from src1 interleaved with the bytes src2[x] and src3[x], each
 * of which is used twice (src1, src2, src1, src3, src1, src2, src1, src3).
 *
 * Output row y reads row y of src1 and row y>>2 of src2 and src3.
 *
 * NOTE(review): x runs up to w = width/2 and each x consumes 4 bytes of src1,
 * so 2*width bytes of src1 and 4*width bytes of dst are touched per row --
 * confirm that this matches the caller's meaning of width.
 *
 * @param src1       plane supplying 4 bytes per x (luma by the local naming)
 * @param src2       plane supplying bytes 1 and 5 of each group
 * @param src3       plane supplying bytes 3 and 7 of each group
 * @param dst        packed output
 * @param width      width/2 indices are processed per row
 * @param height     number of output rows
 * @param srcStride1 distance in bytes between rows of src1
 * @param srcStride2 distance in bytes between rows of src2
 * @param srcStride3 distance in bytes between rows of src3
 * @param dstStride  distance in bytes between rows of dst
 */
02023 static inline void RENAME(yvu9_to_yuy2)(const uint8_t *src1, const uint8_t *src2, const uint8_t *src3,
02024 uint8_t *dst,
02025 int width, int height,
02026 int srcStride1, int srcStride2,
02027 int srcStride3, int dstStride)
02028 {
02029 x86_reg x;
02030 int y,w,h;
02031 w=width/2; h=height;
02032 for (y=0;y<h;y++) {
02033 const uint8_t* yp=src1+srcStride1*y;
02034 const uint8_t* up=src2+srcStride2*(y>>2);
02035 const uint8_t* vp=src3+srcStride3*(y>>2);
02036 uint8_t* d=dst+dstStride*y;
02037 x=0;
/* 8 indices per iteration: 32 bytes of yp, 8 of up, 8 of vp -> 64 bytes of d */
02038 for (;x<w-7;x+=8) {
02039 __asm__ volatile(
02040 PREFETCH" 32(%1, %0) \n\t"
02041 PREFETCH" 32(%2, %0) \n\t"
02042 PREFETCH" 32(%3, %0) \n\t"
02043 "movq (%1, %0, 4), %%mm0 \n\t"
02044 "movq (%2, %0), %%mm1 \n\t"
02045 "movq (%3, %0), %%mm2 \n\t"
02046 "movq %%mm0, %%mm3 \n\t"
02047 "movq %%mm1, %%mm4 \n\t"
02048 "movq %%mm2, %%mm5 \n\t"
/* duplicate every up/vp byte: low halves in mm1/mm2, high halves in mm4/mm5 */
02049 "punpcklbw %%mm1, %%mm1 \n\t"
02050 "punpcklbw %%mm2, %%mm2 \n\t"
02051 "punpckhbw %%mm4, %%mm4 \n\t"
02052 "punpckhbw %%mm5, %%mm5 \n\t"
02053
/* output bytes 0..15 */
02054 "movq %%mm1, %%mm6 \n\t"
02055 "punpcklbw %%mm2, %%mm1 \n\t"
02056 "punpcklbw %%mm1, %%mm0 \n\t"
02057 "punpckhbw %%mm1, %%mm3 \n\t"
02058 MOVNTQ" %%mm0, (%4, %0, 8) \n\t"
02059 MOVNTQ" %%mm3, 8(%4, %0, 8) \n\t"
02060
/* output bytes 16..31 */
02061 "punpckhbw %%mm2, %%mm6 \n\t"
02062 "movq 8(%1, %0, 4), %%mm0 \n\t"
02063 "movq %%mm0, %%mm3 \n\t"
02064 "punpcklbw %%mm6, %%mm0 \n\t"
02065 "punpckhbw %%mm6, %%mm3 \n\t"
02066 MOVNTQ" %%mm0, 16(%4, %0, 8) \n\t"
02067 MOVNTQ" %%mm3, 24(%4, %0, 8) \n\t"
02068
/* output bytes 32..47 */
02069 "movq %%mm4, %%mm6 \n\t"
02070 "movq 16(%1, %0, 4), %%mm0 \n\t"
02071 "movq %%mm0, %%mm3 \n\t"
02072 "punpcklbw %%mm5, %%mm4 \n\t"
02073 "punpcklbw %%mm4, %%mm0 \n\t"
02074 "punpckhbw %%mm4, %%mm3 \n\t"
02075 MOVNTQ" %%mm0, 32(%4, %0, 8) \n\t"
02076 MOVNTQ" %%mm3, 40(%4, %0, 8) \n\t"
02077
/* output bytes 48..63 */
02078 "punpckhbw %%mm5, %%mm6 \n\t"
02079 "movq 24(%1, %0, 4), %%mm0 \n\t"
02080 "movq %%mm0, %%mm3 \n\t"
02081 "punpcklbw %%mm6, %%mm0 \n\t"
02082 "punpckhbw %%mm6, %%mm3 \n\t"
02083 MOVNTQ" %%mm0, 48(%4, %0, 8) \n\t"
02084 MOVNTQ" %%mm3, 56(%4, %0, 8) \n\t"
02085
/* x is declared read-write although no instruction above modifies it */
02086 : "+r" (x)
02087 : "r"(yp), "r" (up), "r"(vp), "r"(d)
02088 :"memory");
02089 }
/* scalar tail of the row */
02090 for (; x<w; x++) {
02091 const int x2 = x<<2;
02092 d[8*x+0] = yp[x2];
02093 d[8*x+1] = up[x];
02094 d[8*x+2] = yp[x2+1];
02095 d[8*x+3] = vp[x];
02096 d[8*x+4] = yp[x2+2];
02097 d[8*x+5] = up[x];
02098 d[8*x+6] = yp[x2+3];
02099 d[8*x+7] = vp[x];
02100 }
02101 }
02102 __asm__(
02103 EMMS" \n\t"
02104 SFENCE" \n\t"
02105 ::: "memory"
02106 );
02107 }
02108 #endif
02109
02110 static void RENAME(extract_even)(const uint8_t *src, uint8_t *dst, x86_reg count)
02111 {
02112 dst += count;
02113 src += 2*count;
02114 count= - count;
02115
02116 if(count <= -16) {
02117 count += 15;
02118 __asm__ volatile(
02119 "pcmpeqw %%mm7, %%mm7 \n\t"
02120 "psrlw $8, %%mm7 \n\t"
02121 "1: \n\t"
02122 "movq -30(%1, %0, 2), %%mm0 \n\t"
02123 "movq -22(%1, %0, 2), %%mm1 \n\t"
02124 "movq -14(%1, %0, 2), %%mm2 \n\t"
02125 "movq -6(%1, %0, 2), %%mm3 \n\t"
02126 "pand %%mm7, %%mm0 \n\t"
02127 "pand %%mm7, %%mm1 \n\t"
02128 "pand %%mm7, %%mm2 \n\t"
02129 "pand %%mm7, %%mm3 \n\t"
02130 "packuswb %%mm1, %%mm0 \n\t"
02131 "packuswb %%mm3, %%mm2 \n\t"
02132 MOVNTQ" %%mm0,-15(%2, %0) \n\t"
02133 MOVNTQ" %%mm2,- 7(%2, %0) \n\t"
02134 "add $16, %0 \n\t"
02135 " js 1b \n\t"
02136 : "+r"(count)
02137 : "r"(src), "r"(dst)
02138 );
02139 count -= 15;
02140 }
02141 while(count<0) {
02142 dst[count]= src[2*count];
02143 count++;
02144 }
02145 }
02146
02147 #if !COMPILE_TEMPLATE_AMD3DNOW
02148 static void RENAME(extract_even2)(const uint8_t *src, uint8_t *dst0, uint8_t *dst1, x86_reg count)
02149 {
02150 dst0+= count;
02151 dst1+= count;
02152 src += 4*count;
02153 count= - count;
02154 if(count <= -8) {
02155 count += 7;
02156 __asm__ volatile(
02157 "pcmpeqw %%mm7, %%mm7 \n\t"
02158 "psrlw $8, %%mm7 \n\t"
02159 "1: \n\t"
02160 "movq -28(%1, %0, 4), %%mm0 \n\t"
02161 "movq -20(%1, %0, 4), %%mm1 \n\t"
02162 "movq -12(%1, %0, 4), %%mm2 \n\t"
02163 "movq -4(%1, %0, 4), %%mm3 \n\t"
02164 "pand %%mm7, %%mm0 \n\t"
02165 "pand %%mm7, %%mm1 \n\t"
02166 "pand %%mm7, %%mm2 \n\t"
02167 "pand %%mm7, %%mm3 \n\t"
02168 "packuswb %%mm1, %%mm0 \n\t"
02169 "packuswb %%mm3, %%mm2 \n\t"
02170 "movq %%mm0, %%mm1 \n\t"
02171 "movq %%mm2, %%mm3 \n\t"
02172 "psrlw $8, %%mm0 \n\t"
02173 "psrlw $8, %%mm2 \n\t"
02174 "pand %%mm7, %%mm1 \n\t"
02175 "pand %%mm7, %%mm3 \n\t"
02176 "packuswb %%mm2, %%mm0 \n\t"
02177 "packuswb %%mm3, %%mm1 \n\t"
02178 MOVNTQ" %%mm0,- 7(%3, %0) \n\t"
02179 MOVNTQ" %%mm1,- 7(%2, %0) \n\t"
02180 "add $8, %0 \n\t"
02181 " js 1b \n\t"
02182 : "+r"(count)
02183 : "r"(src), "r"(dst0), "r"(dst1)
02184 );
02185 count -= 7;
02186 }
02187 while(count<0) {
02188 dst0[count]= src[4*count+0];
02189 dst1[count]= src[4*count+2];
02190 count++;
02191 }
02192 }
02193 #endif
02194
02195 static void RENAME(extract_even2avg)(const uint8_t *src0, const uint8_t *src1, uint8_t *dst0, uint8_t *dst1, x86_reg count)
02196 {
02197 dst0 += count;
02198 dst1 += count;
02199 src0 += 4*count;
02200 src1 += 4*count;
02201 count= - count;
02202 #ifdef PAVGB
02203 if(count <= -8) {
02204 count += 7;
02205 __asm__ volatile(
02206 "pcmpeqw %%mm7, %%mm7 \n\t"
02207 "psrlw $8, %%mm7 \n\t"
02208 "1: \n\t"
02209 "movq -28(%1, %0, 4), %%mm0 \n\t"
02210 "movq -20(%1, %0, 4), %%mm1 \n\t"
02211 "movq -12(%1, %0, 4), %%mm2 \n\t"
02212 "movq -4(%1, %0, 4), %%mm3 \n\t"
02213 PAVGB" -28(%2, %0, 4), %%mm0 \n\t"
02214 PAVGB" -20(%2, %0, 4), %%mm1 \n\t"
02215 PAVGB" -12(%2, %0, 4), %%mm2 \n\t"
02216 PAVGB" - 4(%2, %0, 4), %%mm3 \n\t"
02217 "pand %%mm7, %%mm0 \n\t"
02218 "pand %%mm7, %%mm1 \n\t"
02219 "pand %%mm7, %%mm2 \n\t"
02220 "pand %%mm7, %%mm3 \n\t"
02221 "packuswb %%mm1, %%mm0 \n\t"
02222 "packuswb %%mm3, %%mm2 \n\t"
02223 "movq %%mm0, %%mm1 \n\t"
02224 "movq %%mm2, %%mm3 \n\t"
02225 "psrlw $8, %%mm0 \n\t"
02226 "psrlw $8, %%mm2 \n\t"
02227 "pand %%mm7, %%mm1 \n\t"
02228 "pand %%mm7, %%mm3 \n\t"
02229 "packuswb %%mm2, %%mm0 \n\t"
02230 "packuswb %%mm3, %%mm1 \n\t"
02231 MOVNTQ" %%mm0,- 7(%4, %0) \n\t"
02232 MOVNTQ" %%mm1,- 7(%3, %0) \n\t"
02233 "add $8, %0 \n\t"
02234 " js 1b \n\t"
02235 : "+r"(count)
02236 : "r"(src0), "r"(src1), "r"(dst0), "r"(dst1)
02237 );
02238 count -= 7;
02239 }
02240 #endif
02241 while(count<0) {
02242 dst0[count]= (src0[4*count+0]+src1[4*count+0])>>1;
02243 dst1[count]= (src0[4*count+2]+src1[4*count+2])>>1;
02244 count++;
02245 }
02246 }
02247
02248 #if !COMPILE_TEMPLATE_AMD3DNOW
02249 static void RENAME(extract_odd2)(const uint8_t *src, uint8_t *dst0, uint8_t *dst1, x86_reg count)
02250 {
02251 dst0+= count;
02252 dst1+= count;
02253 src += 4*count;
02254 count= - count;
02255 if(count <= -8) {
02256 count += 7;
02257 __asm__ volatile(
02258 "pcmpeqw %%mm7, %%mm7 \n\t"
02259 "psrlw $8, %%mm7 \n\t"
02260 "1: \n\t"
02261 "movq -28(%1, %0, 4), %%mm0 \n\t"
02262 "movq -20(%1, %0, 4), %%mm1 \n\t"
02263 "movq -12(%1, %0, 4), %%mm2 \n\t"
02264 "movq -4(%1, %0, 4), %%mm3 \n\t"
02265 "psrlw $8, %%mm0 \n\t"
02266 "psrlw $8, %%mm1 \n\t"
02267 "psrlw $8, %%mm2 \n\t"
02268 "psrlw $8, %%mm3 \n\t"
02269 "packuswb %%mm1, %%mm0 \n\t"
02270 "packuswb %%mm3, %%mm2 \n\t"
02271 "movq %%mm0, %%mm1 \n\t"
02272 "movq %%mm2, %%mm3 \n\t"
02273 "psrlw $8, %%mm0 \n\t"
02274 "psrlw $8, %%mm2 \n\t"
02275 "pand %%mm7, %%mm1 \n\t"
02276 "pand %%mm7, %%mm3 \n\t"
02277 "packuswb %%mm2, %%mm0 \n\t"
02278 "packuswb %%mm3, %%mm1 \n\t"
02279 MOVNTQ" %%mm0,- 7(%3, %0) \n\t"
02280 MOVNTQ" %%mm1,- 7(%2, %0) \n\t"
02281 "add $8, %0 \n\t"
02282 " js 1b \n\t"
02283 : "+r"(count)
02284 : "r"(src), "r"(dst0), "r"(dst1)
02285 );
02286 count -= 7;
02287 }
02288 src++;
02289 while(count<0) {
02290 dst0[count]= src[4*count+0];
02291 dst1[count]= src[4*count+2];
02292 count++;
02293 }
02294 }
02295 #endif
02296
02297 static void RENAME(extract_odd2avg)(const uint8_t *src0, const uint8_t *src1, uint8_t *dst0, uint8_t *dst1, x86_reg count)
02298 {
02299 dst0 += count;
02300 dst1 += count;
02301 src0 += 4*count;
02302 src1 += 4*count;
02303 count= - count;
02304 #ifdef PAVGB
02305 if(count <= -8) {
02306 count += 7;
02307 __asm__ volatile(
02308 "pcmpeqw %%mm7, %%mm7 \n\t"
02309 "psrlw $8, %%mm7 \n\t"
02310 "1: \n\t"
02311 "movq -28(%1, %0, 4), %%mm0 \n\t"
02312 "movq -20(%1, %0, 4), %%mm1 \n\t"
02313 "movq -12(%1, %0, 4), %%mm2 \n\t"
02314 "movq -4(%1, %0, 4), %%mm3 \n\t"
02315 PAVGB" -28(%2, %0, 4), %%mm0 \n\t"
02316 PAVGB" -20(%2, %0, 4), %%mm1 \n\t"
02317 PAVGB" -12(%2, %0, 4), %%mm2 \n\t"
02318 PAVGB" - 4(%2, %0, 4), %%mm3 \n\t"
02319 "psrlw $8, %%mm0 \n\t"
02320 "psrlw $8, %%mm1 \n\t"
02321 "psrlw $8, %%mm2 \n\t"
02322 "psrlw $8, %%mm3 \n\t"
02323 "packuswb %%mm1, %%mm0 \n\t"
02324 "packuswb %%mm3, %%mm2 \n\t"
02325 "movq %%mm0, %%mm1 \n\t"
02326 "movq %%mm2, %%mm3 \n\t"
02327 "psrlw $8, %%mm0 \n\t"
02328 "psrlw $8, %%mm2 \n\t"
02329 "pand %%mm7, %%mm1 \n\t"
02330 "pand %%mm7, %%mm3 \n\t"
02331 "packuswb %%mm2, %%mm0 \n\t"
02332 "packuswb %%mm3, %%mm1 \n\t"
02333 MOVNTQ" %%mm0,- 7(%4, %0) \n\t"
02334 MOVNTQ" %%mm1,- 7(%3, %0) \n\t"
02335 "add $8, %0 \n\t"
02336 " js 1b \n\t"
02337 : "+r"(count)
02338 : "r"(src0), "r"(src1), "r"(dst0), "r"(dst1)
02339 );
02340 count -= 7;
02341 }
02342 #endif
02343 src0++;
02344 src1++;
02345 while(count<0) {
02346 dst0[count]= (src0[4*count+0]+src1[4*count+0])>>1;
02347 dst1[count]= (src0[4*count+2]+src1[4*count+2])>>1;
02348 count++;
02349 }
02350 }
02351
02352 static void RENAME(yuyvtoyuv420)(uint8_t *ydst, uint8_t *udst, uint8_t *vdst, const uint8_t *src,
02353 int width, int height,
02354 int lumStride, int chromStride, int srcStride)
02355 {
02356 int y;
02357 const int chromWidth= -((-width)>>1);
02358
02359 for (y=0; y<height; y++) {
02360 RENAME(extract_even)(src, ydst, width);
02361 if(y&1) {
02362 RENAME(extract_odd2avg)(src-srcStride, src, udst, vdst, chromWidth);
02363 udst+= chromStride;
02364 vdst+= chromStride;
02365 }
02366
02367 src += srcStride;
02368 ydst+= lumStride;
02369 }
02370 __asm__(
02371 EMMS" \n\t"
02372 SFENCE" \n\t"
02373 ::: "memory"
02374 );
02375 }
02376
02377 #if !COMPILE_TEMPLATE_AMD3DNOW
02378 static void RENAME(yuyvtoyuv422)(uint8_t *ydst, uint8_t *udst, uint8_t *vdst, const uint8_t *src,
02379 int width, int height,
02380 int lumStride, int chromStride, int srcStride)
02381 {
02382 int y;
02383 const int chromWidth= -((-width)>>1);
02384
02385 for (y=0; y<height; y++) {
02386 RENAME(extract_even)(src, ydst, width);
02387 RENAME(extract_odd2)(src, udst, vdst, chromWidth);
02388
02389 src += srcStride;
02390 ydst+= lumStride;
02391 udst+= chromStride;
02392 vdst+= chromStride;
02393 }
02394 __asm__(
02395 EMMS" \n\t"
02396 SFENCE" \n\t"
02397 ::: "memory"
02398 );
02399 }
02400 #endif
02401
02402 static void RENAME(uyvytoyuv420)(uint8_t *ydst, uint8_t *udst, uint8_t *vdst, const uint8_t *src,
02403 int width, int height,
02404 int lumStride, int chromStride, int srcStride)
02405 {
02406 int y;
02407 const int chromWidth= -((-width)>>1);
02408
02409 for (y=0; y<height; y++) {
02410 RENAME(extract_even)(src+1, ydst, width);
02411 if(y&1) {
02412 RENAME(extract_even2avg)(src-srcStride, src, udst, vdst, chromWidth);
02413 udst+= chromStride;
02414 vdst+= chromStride;
02415 }
02416
02417 src += srcStride;
02418 ydst+= lumStride;
02419 }
02420 __asm__(
02421 EMMS" \n\t"
02422 SFENCE" \n\t"
02423 ::: "memory"
02424 );
02425 }
02426
02427 #if !COMPILE_TEMPLATE_AMD3DNOW
02428 static void RENAME(uyvytoyuv422)(uint8_t *ydst, uint8_t *udst, uint8_t *vdst, const uint8_t *src,
02429 int width, int height,
02430 int lumStride, int chromStride, int srcStride)
02431 {
02432 int y;
02433 const int chromWidth= -((-width)>>1);
02434
02435 for (y=0; y<height; y++) {
02436 RENAME(extract_even)(src+1, ydst, width);
02437 RENAME(extract_even2)(src, udst, vdst, chromWidth);
02438
02439 src += srcStride;
02440 ydst+= lumStride;
02441 udst+= chromStride;
02442 vdst+= chromStride;
02443 }
02444 __asm__(
02445 EMMS" \n\t"
02446 SFENCE" \n\t"
02447 ::: "memory"
02448 );
02449 }
02450 #endif
02451 #endif
02452
02453 static inline void RENAME(rgb2rgb_init)(void)
02454 {
02455 #if !COMPILE_TEMPLATE_SSE2
02456 #if !COMPILE_TEMPLATE_AMD3DNOW
02457 rgb15to16 = RENAME(rgb15to16);
02458 rgb15tobgr24 = RENAME(rgb15tobgr24);
02459 rgb15to32 = RENAME(rgb15to32);
02460 rgb16tobgr24 = RENAME(rgb16tobgr24);
02461 rgb16to32 = RENAME(rgb16to32);
02462 rgb16to15 = RENAME(rgb16to15);
02463 rgb24tobgr16 = RENAME(rgb24tobgr16);
02464 rgb24tobgr15 = RENAME(rgb24tobgr15);
02465 rgb24tobgr32 = RENAME(rgb24tobgr32);
02466 rgb32to16 = RENAME(rgb32to16);
02467 rgb32to15 = RENAME(rgb32to15);
02468 rgb32tobgr24 = RENAME(rgb32tobgr24);
02469 rgb24to15 = RENAME(rgb24to15);
02470 rgb24to16 = RENAME(rgb24to16);
02471 rgb24tobgr24 = RENAME(rgb24tobgr24);
02472 shuffle_bytes_2103 = RENAME(shuffle_bytes_2103);
02473 rgb32tobgr16 = RENAME(rgb32tobgr16);
02474 rgb32tobgr15 = RENAME(rgb32tobgr15);
02475 yv12toyuy2 = RENAME(yv12toyuy2);
02476 yv12touyvy = RENAME(yv12touyvy);
02477 yuv422ptoyuy2 = RENAME(yuv422ptoyuy2);
02478 yuv422ptouyvy = RENAME(yuv422ptouyvy);
02479 yuy2toyv12 = RENAME(yuy2toyv12);
02480 vu9_to_vu12 = RENAME(vu9_to_vu12);
02481 yvu9_to_yuy2 = RENAME(yvu9_to_yuy2);
02482 uyvytoyuv422 = RENAME(uyvytoyuv422);
02483 yuyvtoyuv422 = RENAME(yuyvtoyuv422);
02484 #endif
02485
02486 #if COMPILE_TEMPLATE_MMXEXT || COMPILE_TEMPLATE_AMD3DNOW
02487 planar2x = RENAME(planar2x);
02488 #endif
02489 rgb24toyv12 = RENAME(rgb24toyv12);
02490
02491 yuyvtoyuv420 = RENAME(yuyvtoyuv420);
02492 uyvytoyuv420 = RENAME(uyvytoyuv420);
02493 #endif
02494
02495 #if !COMPILE_TEMPLATE_AMD3DNOW
02496 interleaveBytes = RENAME(interleaveBytes);
02497 #endif
02498 }