/*
 * Xvid MPEG-4 inverse DCT - MMX and MMXEXT inline-assembly implementations.
 *
 * Based on the Xvid adaptation of Intel Application Note AP-922: the row
 * pass accumulates in 32 bits with pmaddwd, the column pass uses 16-bit
 * saturating arithmetic with pmulhw, and the combination is intended to
 * meet the IEEE 1180-1990 accuracy requirements.
 *
 * This file is part of FFmpeg; see the FFmpeg license (GNU Lesser General
 * Public License, version 2.1 or later) for redistribution terms.
 */
#include <inttypes.h>

#include "config.h"
#include "libavcodec/avcodec.h"
#include "libavutil/mem.h"
#include "dsputil_mmx.h"
#include "idct_xvid.h"

#if HAVE_INLINE_ASM

// Fixed-point accuracy and rounding parameters (AP-922 derivation); the two
// shift amounts appear as the hard-coded $11 and $6 in the macros below.
#define BITS_INV_ACC    5                              // 4 or 5 for IEEE
#define SHIFT_INV_ROW   (16 - BITS_INV_ACC)            // 11
#define SHIFT_INV_COL   (1 + BITS_INV_ACC)             // 6
#define RND_INV_ROW     (1024 * (6 - BITS_INV_ACC))    // 1 << (SHIFT_INV_ROW - 1)
#define RND_INV_COL     (16 * (BITS_INV_ACC - 3))      // 1 << (SHIFT_INV_COL - 1)
#define RND_INV_CORR    (RND_INV_COL - 1)

// Forward-transform counterparts, kept for reference.
#define BITS_FRW_ACC    3                              // 2 or 3 for accuracy
#define SHIFT_FRW_COL   BITS_FRW_ACC
#define SHIFT_FRW_ROW   (BITS_FRW_ACC + 17)
#define RND_FRW_ROW     (262144 * (BITS_FRW_ACC - 1))  // 1 << (SHIFT_FRW_ROW - 1)
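
// Constants for the column pass, replicated across the four 16-bit lanes of
// an MMX register: approximately tan(pi/16), tan(2*pi/16) and tan(3*pi/16)-1
// scaled by 2^16, plus cos(pi/4) scaled by 2^15 (AP-922 fixed-point layout).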
DECLARE_ALIGNED(8, static const int16_t, tg_1_16)[4*4] = {
  13036,13036,13036,13036,
  27146,27146,27146,27146,
  -21746,-21746,-21746,-21746,
  23170,23170,23170,23170};

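// Per-row rounding constants for the row pass: each pair of int32 values is
// added (paddd) to both 32-bit accumulators of a row before the final
// descaling shift; row n uses entry n (see the 8*n(%1) operands in the IDCT
// functions below).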
DECLARE_ALIGNED(8, static const int32_t, rounder_0)[2*8] = {
  65536,65536,
  3597,3597,
  2260,2260,
  1203,1203,
  0,0,
  120,120,
  512,512,
  512,512};
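
// Row-pass coefficient tables for the plain MMX path, laid out for the
// punpck*/pmaddwd sequence in DCT_8_INV_ROW_MMX.  There are four blocks of
// 32 coefficients (64 bytes each); thanks to the cosine symmetry of the
// 8-point transform, block 0 serves rows 0 and 4, block 1 rows 1 and 7,
// block 2 rows 2 and 6, and block 3 rows 3 and 5 (see the 64*n(%2) operands
// in ff_idct_xvid_mmx() below).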
DECLARE_ALIGNED(8, static const int16_t, tab_i_04_mmx)[32*4] = {
  // rows 0 and 4
  16384,16384,16384,-16384,
  21407,8867,8867,-21407,
  16384,-16384,16384,16384,
  -8867,21407,-21407,-8867,
  22725,12873,19266,-22725,
  19266,4520,-4520,-12873,
  12873,4520,4520,19266,
  -22725,19266,-12873,-22725,

  // rows 1 and 7
  22725,22725,22725,-22725,
  29692,12299,12299,-29692,
  22725,-22725,22725,22725,
  -12299,29692,-29692,-12299,
  31521,17855,26722,-31521,
  26722,6270,-6270,-17855,
  17855,6270,6270,26722,
  -31521,26722,-17855,-31521,

  // rows 2 and 6
  21407,21407,21407,-21407,
  27969,11585,11585,-27969,
  21407,-21407,21407,21407,
  -11585,27969,-27969,-11585,
  29692,16819,25172,-29692,
  25172,5906,-5906,-16819,
  16819,5906,5906,25172,
  -29692,25172,-16819,-29692,

  // rows 3 and 5
  19266,19266,19266,-19266,
  25172,10426,10426,-25172,
  19266,-19266,19266,19266,
  -10426,25172,-25172,-10426,
  26722,15137,22654,-26722,
  22654,5315,-5315,-15137,
  15137,5315,5315,22654,
  -26722,22654,-15137,-26722,
};
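
// The same coefficients reordered for the MMXEXT row pass, which replaces
// the punpck* unpacking with pshufw (see DCT_8_INV_ROW_XMM); the block/row
// mapping is identical to tab_i_04_mmx.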
DECLARE_ALIGNED(8, static const int16_t, tab_i_04_xmm)[32*4] = {
  // rows 0 and 4
  16384,21407,16384,8867,
  16384,8867,-16384,-21407,
  16384,-8867,16384,-21407,
  -16384,21407,16384,-8867,
  22725,19266,19266,-4520,
  12873,4520,-22725,-12873,
  12873,-22725,4520,-12873,
  4520,19266,19266,-22725,

  // rows 1 and 7
  22725,29692,22725,12299,
  22725,12299,-22725,-29692,
  22725,-12299,22725,-29692,
  -22725,29692,22725,-12299,
  31521,26722,26722,-6270,
  17855,6270,-31521,-17855,
  17855,-31521,6270,-17855,
  6270,26722,26722,-31521,

  // rows 2 and 6
  21407,27969,21407,11585,
  21407,11585,-21407,-27969,
  21407,-11585,21407,-27969,
  -21407,27969,21407,-11585,
  29692,25172,25172,-5906,
  16819,5906,-29692,-16819,
  16819,-29692,5906,-16819,
  5906,25172,25172,-29692,

  // rows 3 and 5
  19266,25172,19266,10426,
  19266,10426,-19266,-25172,
  19266,-10426,19266,-25172,
  -19266,25172,19266,-10426,
  26722,22654,22654,-5315,
  15137,5315,-26722,-15137,
  15137,-26722,5315,-15137,
  5315,22654,22654,-26722,
};
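
// One 8-point inverse DCT over a single row (plain MMX).
//   A1: source row (8 x int16)      A2: destination row (may alias A1)
//   A3: 64-byte coefficient block   A4: 8-byte rounder pair
// Products are accumulated in 32 bits with pmaddwd against the interleaved
// coefficient table, the rounder is added, and the results are shifted right
// by 11 (SHIFT_INV_ROW) and packed back to 16 bits.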
#define DCT_8_INV_ROW_MMX(A1,A2,A3,A4)\
"movq " #A1 ",%%mm0 \n\t"\
"movq 8+" #A1 ",%%mm1 \n\t"\
"movq %%mm0,%%mm2 \n\t"\
"movq " #A3 ",%%mm3 \n\t"\
"punpcklwd %%mm1,%%mm0 \n\t"\
"movq %%mm0,%%mm5 \n\t"\
"punpckldq %%mm0,%%mm0 \n\t"\
"movq 8+" #A3 ",%%mm4 \n\t"\
"punpckhwd %%mm1,%%mm2 \n\t"\
"pmaddwd %%mm0,%%mm3 \n\t"\
"movq %%mm2,%%mm6 \n\t"\
"movq 32+" #A3 ",%%mm1 \n\t"\
"punpckldq %%mm2,%%mm2 \n\t"\
"pmaddwd %%mm2,%%mm4 \n\t"\
"punpckhdq %%mm5,%%mm5 \n\t"\
"pmaddwd 16+" #A3 ",%%mm0 \n\t"\
"punpckhdq %%mm6,%%mm6 \n\t"\
"movq 40+" #A3 ",%%mm7 \n\t"\
"pmaddwd %%mm5,%%mm1 \n\t"\
"paddd " #A4 ",%%mm3 \n\t"\
"pmaddwd %%mm6,%%mm7 \n\t"\
"pmaddwd 24+" #A3 ",%%mm2 \n\t"\
"paddd %%mm4,%%mm3 \n\t"\
"pmaddwd 48+" #A3 ",%%mm5 \n\t"\
"movq %%mm3,%%mm4 \n\t"\
"pmaddwd 56+" #A3 ",%%mm6 \n\t"\
"paddd %%mm7,%%mm1 \n\t"\
"paddd " #A4 ",%%mm0 \n\t"\
"psubd %%mm1,%%mm3 \n\t"\
"psrad $11,%%mm3 \n\t"\
"paddd %%mm4,%%mm1 \n\t"\
"paddd %%mm2,%%mm0 \n\t"\
"psrad $11,%%mm1 \n\t"\
"paddd %%mm6,%%mm5 \n\t"\
"movq %%mm0,%%mm4 \n\t"\
"paddd %%mm5,%%mm0 \n\t"\
"psubd %%mm5,%%mm4 \n\t"\
"psrad $11,%%mm0 \n\t"\
"psrad $11,%%mm4 \n\t"\
"packssdw %%mm0,%%mm1 \n\t"\
"packssdw %%mm3,%%mm4 \n\t"\
"movq %%mm4,%%mm7 \n\t"\
"psrld $16,%%mm4 \n\t"\
"pslld $16,%%mm7 \n\t"\
"movq %%mm1," #A2 " \n\t"\
"por %%mm4,%%mm7 \n\t"\
"movq %%mm7,8 +" #A2 "\n\t"
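
// MMXEXT (pshufw) variant of the row pass: same arithmetic, but the operand
// shuffling uses pshufw (hence the tab_i_04_xmm coefficient order) instead
// of the punpckl/punpckh sequence above.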
#define DCT_8_INV_ROW_XMM(A1,A2,A3,A4)\
"movq " #A1 ",%%mm0 \n\t"\
"movq 8+" #A1 ",%%mm1 \n\t"\
"movq %%mm0,%%mm2 \n\t"\
"movq " #A3 ",%%mm3 \n\t"\
"pshufw $0x88,%%mm0,%%mm0 \n\t"\
"movq 8+" #A3 ",%%mm4 \n\t"\
"movq %%mm1,%%mm5 \n\t"\
"pmaddwd %%mm0,%%mm3 \n\t"\
"movq 32+" #A3 ",%%mm6 \n\t"\
"pshufw $0x88,%%mm1,%%mm1 \n\t"\
"pmaddwd %%mm1,%%mm4 \n\t"\
"movq 40+" #A3 ",%%mm7 \n\t"\
"pshufw $0xdd,%%mm2,%%mm2 \n\t"\
"pmaddwd %%mm2,%%mm6 \n\t"\
"pshufw $0xdd,%%mm5,%%mm5 \n\t"\
"pmaddwd %%mm5,%%mm7 \n\t"\
"paddd " #A4 ",%%mm3 \n\t"\
"pmaddwd 16+" #A3 ",%%mm0 \n\t"\
"paddd %%mm4,%%mm3 \n\t"\
"pmaddwd 24+" #A3 ",%%mm1 \n\t"\
"movq %%mm3,%%mm4 \n\t"\
"pmaddwd 48+" #A3 ",%%mm2 \n\t"\
"paddd %%mm7,%%mm6 \n\t"\
"pmaddwd 56+" #A3 ",%%mm5 \n\t"\
"paddd %%mm6,%%mm3 \n\t"\
"paddd " #A4 ",%%mm0 \n\t"\
"psrad $11,%%mm3 \n\t"\
"paddd %%mm1,%%mm0 \n\t"\
"psubd %%mm6,%%mm4 \n\t"\
"movq %%mm0,%%mm7 \n\t"\
"paddd %%mm5,%%mm2 \n\t"\
"paddd %%mm2,%%mm0 \n\t"\
"psrad $11,%%mm4 \n\t"\
"psubd %%mm2,%%mm7 \n\t"\
"psrad $11,%%mm0 \n\t"\
"psrad $11,%%mm7 \n\t"\
"packssdw %%mm0,%%mm3 \n\t"\
"packssdw %%mm4,%%mm7 \n\t"\
"movq %%mm3, " #A2 " \n\t"\
"pshufw $0xb1,%%mm7,%%mm7 \n\t"\
"movq %%mm7,8 +" #A2 "\n\t"
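
// Column pass: an 8-point inverse DCT applied to four columns in parallel.
//   A1: source (16-byte row stride)   A2: destination (may alias A1)
// Multiplications use pmulhw against the tg_1_16 constants (operand %3),
// arithmetic is 16-bit saturating, and every output is shifted right by 6
// (SHIFT_INV_COL).  The IDCT functions below invoke it twice, once for
// columns 0-3 and once for columns 4-7.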
#define DCT_8_INV_COL(A1,A2)\
"movq 2*8(%3),%%mm0\n\t"\
"movq 16*3+" #A1 ",%%mm3\n\t"\
"movq %%mm0,%%mm1 \n\t"\
"movq 16*5+" #A1 ",%%mm5\n\t"\
"pmulhw %%mm3,%%mm0 \n\t"\
"movq (%3),%%mm4\n\t"\
"pmulhw %%mm5,%%mm1 \n\t"\
"movq 16*7+" #A1 ",%%mm7\n\t"\
"movq %%mm4,%%mm2 \n\t"\
"movq 16*1+" #A1 ",%%mm6\n\t"\
"pmulhw %%mm7,%%mm4 \n\t"\
"paddsw %%mm3,%%mm0 \n\t"\
"pmulhw %%mm6,%%mm2 \n\t"\
"paddsw %%mm3,%%mm1 \n\t"\
"psubsw %%mm5,%%mm0 \n\t"\
"movq 3*8(%3),%%mm3\n\t"\
"paddsw %%mm5,%%mm1 \n\t"\
"paddsw %%mm6,%%mm4 \n\t"\
"psubsw %%mm7,%%mm2 \n\t"\
"movq %%mm4,%%mm5 \n\t"\
"movq %%mm2,%%mm6 \n\t"\
"paddsw %%mm1,%%mm5 \n\t"\
"psubsw %%mm0,%%mm6 \n\t"\
"psubsw %%mm1,%%mm4 \n\t"\
"paddsw %%mm0,%%mm2 \n\t"\
"movq 1*8(%3),%%mm7\n\t"\
"movq %%mm4,%%mm1 \n\t"\
"movq %%mm5,3*16 +" #A2 "\n\t"\
"paddsw %%mm2,%%mm1 \n\t"\
"movq %%mm6,5*16 +" #A2 "\n\t"\
"psubsw %%mm2,%%mm4 \n\t"\
"movq 2*16+" #A1 ",%%mm5\n\t"\
"movq %%mm7,%%mm0 \n\t"\
"movq 6*16+" #A1 ",%%mm6\n\t"\
"pmulhw %%mm5,%%mm0 \n\t"\
"pmulhw %%mm6,%%mm7 \n\t"\
"pmulhw %%mm3,%%mm1 \n\t"\
"movq 0*16+" #A1 ",%%mm2\n\t"\
"pmulhw %%mm3,%%mm4 \n\t"\
"psubsw %%mm6,%%mm0 \n\t"\
"movq %%mm2,%%mm3 \n\t"\
"movq 4*16+" #A1 ",%%mm6\n\t"\
"paddsw %%mm5,%%mm7 \n\t"\
"paddsw %%mm6,%%mm2 \n\t"\
"psubsw %%mm6,%%mm3 \n\t"\
"movq %%mm2,%%mm5 \n\t"\
"movq %%mm3,%%mm6 \n\t"\
"psubsw %%mm7,%%mm2 \n\t"\
"paddsw %%mm0,%%mm3 \n\t"\
"paddsw %%mm1,%%mm1 \n\t"\
"paddsw %%mm4,%%mm4 \n\t"\
"paddsw %%mm7,%%mm5 \n\t"\
"psubsw %%mm0,%%mm6 \n\t"\
"movq %%mm3,%%mm7 \n\t"\
"movq %%mm6,%%mm0 \n\t"\
"paddsw %%mm1,%%mm3 \n\t"\
"paddsw %%mm4,%%mm6 \n\t"\
"psraw $6,%%mm3 \n\t"\
"psubsw %%mm1,%%mm7 \n\t"\
"psraw $6,%%mm6 \n\t"\
"psubsw %%mm4,%%mm0 \n\t"\
"movq 3*16+" #A2 ",%%mm1 \n\t"\
"psraw $6,%%mm7 \n\t"\
"movq %%mm5,%%mm4 \n\t"\
"psraw $6,%%mm0 \n\t"\
"movq %%mm3,1*16+" #A2 "\n\t"\
"paddsw %%mm1,%%mm5 \n\t"\
"movq %%mm6,2*16+" #A2 "\n\t"\
"psubsw %%mm1,%%mm4 \n\t"\
"movq 5*16+" #A2 ",%%mm3 \n\t"\
"psraw $6,%%mm5 \n\t"\
"movq %%mm2,%%mm6 \n\t"\
"psraw $6,%%mm4 \n\t"\
"movq %%mm0,5*16+" #A2 "\n\t"\
"paddsw %%mm3,%%mm2 \n\t"\
"movq %%mm7,6*16+" #A2 "\n\t"\
"psubsw %%mm3,%%mm6 \n\t"\
"movq %%mm5,0*16+" #A2 "\n\t"\
"psraw $6,%%mm2 \n\t"\
"movq %%mm4,7*16+" #A2 "\n\t"\
"psraw $6,%%mm6 \n\t"\
"movq %%mm2,3*16+" #A2 "\n\t"\
"movq %%mm6,4*16+" #A2 "\n\t"
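
// In-place 2-D inverse DCT of an 8x8 block of 16-bit coefficients (MMX):
// eight row transforms followed by two column-pass invocations.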
void ff_idct_xvid_mmx(short *block){
    __asm__ volatile(
        // rows
        DCT_8_INV_ROW_MMX(0*16(%0), 0*16(%0), 64*0(%2), 8*0(%1))
        DCT_8_INV_ROW_MMX(1*16(%0), 1*16(%0), 64*1(%2), 8*1(%1))
        DCT_8_INV_ROW_MMX(2*16(%0), 2*16(%0), 64*2(%2), 8*2(%1))
        DCT_8_INV_ROW_MMX(3*16(%0), 3*16(%0), 64*3(%2), 8*3(%1))
        DCT_8_INV_ROW_MMX(4*16(%0), 4*16(%0), 64*0(%2), 8*4(%1))
        DCT_8_INV_ROW_MMX(5*16(%0), 5*16(%0), 64*3(%2), 8*5(%1))
        DCT_8_INV_ROW_MMX(6*16(%0), 6*16(%0), 64*2(%2), 8*6(%1))
        DCT_8_INV_ROW_MMX(7*16(%0), 7*16(%0), 64*1(%2), 8*7(%1))

        // columns
        DCT_8_INV_COL(0(%0), 0(%0))
        DCT_8_INV_COL(8(%0), 8(%0))
        :: "r"(block), "r"(rounder_0), "r"(tab_i_04_mmx), "r"(tg_1_16));
}
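
// Same in-place 2-D inverse DCT, using the pshufw-based row macro (MMXEXT).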
void ff_idct_xvid_mmx2(short *block){
    __asm__ volatile(
        // rows
        DCT_8_INV_ROW_XMM(0*16(%0), 0*16(%0), 64*0(%2), 8*0(%1))
        DCT_8_INV_ROW_XMM(1*16(%0), 1*16(%0), 64*1(%2), 8*1(%1))
        DCT_8_INV_ROW_XMM(2*16(%0), 2*16(%0), 64*2(%2), 8*2(%1))
        DCT_8_INV_ROW_XMM(3*16(%0), 3*16(%0), 64*3(%2), 8*3(%1))
        DCT_8_INV_ROW_XMM(4*16(%0), 4*16(%0), 64*0(%2), 8*4(%1))
        DCT_8_INV_ROW_XMM(5*16(%0), 5*16(%0), 64*3(%2), 8*5(%1))
        DCT_8_INV_ROW_XMM(6*16(%0), 6*16(%0), 64*2(%2), 8*6(%1))
        DCT_8_INV_ROW_XMM(7*16(%0), 7*16(%0), 64*1(%2), 8*7(%1))

        // columns
        DCT_8_INV_COL(0(%0), 0(%0))
        DCT_8_INV_COL(8(%0), 8(%0))
        :: "r"(block), "r"(rounder_0), "r"(tab_i_04_xmm), "r"(tg_1_16));
}

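// Wrappers combining the IDCT with the MMX clamped put/add helpers from
// dsputil: the "put" variants overwrite the destination with the
// reconstructed samples, the "add" variants add them to the existing
// prediction, both clamping to the 0-255 pixel range.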
void ff_idct_xvid_mmx_put(uint8_t *dest, int line_size, DCTELEM *block)
{
    ff_idct_xvid_mmx(block);
    ff_put_pixels_clamped_mmx(block, dest, line_size);
}

void ff_idct_xvid_mmx_add(uint8_t *dest, int line_size, DCTELEM *block)
{
    ff_idct_xvid_mmx(block);
    ff_add_pixels_clamped_mmx(block, dest, line_size);
}

void ff_idct_xvid_mmx2_put(uint8_t *dest, int line_size, DCTELEM *block)
{
    ff_idct_xvid_mmx2(block);
    ff_put_pixels_clamped_mmx(block, dest, line_size);
}

void ff_idct_xvid_mmx2_add(uint8_t *dest, int line_size, DCTELEM *block)
{
    ff_idct_xvid_mmx2(block);
    ff_add_pixels_clamped_mmx(block, dest, line_size);
}

#endif /* HAVE_INLINE_ASM */
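
/*
 * Usage sketch (illustrative only; the DSPContext wiring lives elsewhere in
 * libavcodec and the surrounding code here is assumed, not defined in this
 * file).  The Xvid IDCT entry points are typically selected roughly like
 * this when avctx->idct_algo == FF_IDCT_XVIDMMX:
 *
 *     c->idct_put = ff_idct_xvid_mmx_put;
 *     c->idct_add = ff_idct_xvid_mmx_add;
 *     c->idct     = ff_idct_xvid_mmx;
 *
 * with the mmx2 variants substituted when MMXEXT is available.
 */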