36 #if HAVE_6REGS && HAVE_INLINE_ASM 
   /* OP_AVG(S, D): emit "pavgb S, D" — packed-byte average of S into D,
    * used to build the avg_ (read-modify-write averaging) function variants. */
   39 #define OP_AVG(S,D) "pavgb " #S ", " #D " \n\t"
   /* NOTE(review): the integer at the start of each line in this extract looks
    * like an original-file line number fused into the text by extraction; gaps
    * in that numbering indicate source lines missing from this view. */
   /* NORMALIZE_MMX(SHIFT): add the rounder broadcast in %%mm7 to %%mm3/%%mm4
    * and arithmetic-shift both right by SHIFT (an immediate or operand string). */
   42 #define NORMALIZE_MMX(SHIFT)                                    \
   43      "paddw     %%mm7, %%mm3           \n\t"       \
   44      "paddw     %%mm7, %%mm4           \n\t"       \
   45      "psraw     "SHIFT", %%mm3         \n\t"                    \
   46      "psraw     "SHIFT", %%mm4         \n\t"
   /* TRANSFER_DO_PACK(OP): saturate-pack the 16-bit results in %%mm3/%%mm4
    * down to 8 bytes and store them at (%2).
    * NOTE(review): numbering jumps 49->51 — the line applying OP (presumably
    * the OP((%2), %%mm3) averaging hook) is missing from this extract. */
   48 #define TRANSFER_DO_PACK(OP)                    \
   49      "packuswb  %%mm4, %%mm3           \n\t"    \
   51      "movq      %%mm3, (%2)            \n\t"
   /* TRANSFER_DONT_PACK(OP): store %%mm3/%%mm4 as raw 16-bit intermediates at
    * 0(%2)/8(%2) without packing (used by the 8->16-bit vertical passes).
    * NOTE(review): numbering jumps 53->56 — intervening line(s), likely the
    * OP application, are missing from this extract. */
   53 #define TRANSFER_DONT_PACK(OP)                  \
   56      "movq      %%mm3, 0(%2)           \n\t"    \
   57      "movq      %%mm4, 8(%2)           \n\t"
   /* DO_UNPACK(reg): zero-extend the low 4 bytes of reg to 16-bit words
    * (%%mm0 is assumed to hold zero).  DONT_UNPACK is the no-op counterpart,
    * letting one macro body handle both 8-bit and pre-widened 16-bit input. */
   60 #define DO_UNPACK(reg)  "punpcklbw %%mm0, " reg "\n\t"
   61 #define DONT_UNPACK(reg)
   /* LOAD_ROUNDER_MMX(ROUND): load the 16-bit rounding constant ROUND and
    * broadcast it to all four words of %%mm7 (consumed by NORMALIZE_MMX). */
   64 #define LOAD_ROUNDER_MMX(ROUND)                 \
   65      "movd      "ROUND", %%mm7         \n\t"    \
   66      "punpcklwd %%mm7, %%mm7           \n\t"    \
   67      "punpckldq %%mm7, %%mm7           \n\t"
   /* SHIFT2_LINE(OFF, R0,R1,R2,R3): one output row of the vertical shift2
    * filter with register rotation R0..R3.  Computes
    *   mm[R1] = ((mm[R1] + mm[R2]) * mm6 - new_row_a - new_row_b + mm7) >> %4
    * and stores it at OFF(%1), while loading the two next source rows from
    * (%0,%3) and (%0,%2) into mm[R0]/mm[R3] for the following iteration.
    * NOTE(review): numbering jumps 80->84 — the macro's trailing line(s)
    * (presumably the source-pointer advance) are missing from this extract. */
   69 #define SHIFT2_LINE(OFF, R0,R1,R2,R3)           \
   70     "paddw     %%mm"#R2", %%mm"#R1"    \n\t"    \
   71     "movd      (%0,%3), %%mm"#R0"      \n\t"    \
   72     "pmullw    %%mm6, %%mm"#R1"        \n\t"    \
   73     "punpcklbw %%mm0, %%mm"#R0"        \n\t"    \
   74     "movd      (%0,%2), %%mm"#R3"      \n\t"    \
   75     "psubw     %%mm"#R0", %%mm"#R1"    \n\t"    \
   76     "punpcklbw %%mm0, %%mm"#R3"        \n\t"    \
   77     "paddw     %%mm7, %%mm"#R1"        \n\t"    \
   78     "psubw     %%mm"#R3", %%mm"#R1"    \n\t"    \
   79     "psraw     %4, %%mm"#R1"           \n\t"    \
   80     "movq      %%mm"#R1", "#OFF"(%1)   \n\t"    \
   /*
    * vc1_put_ver_16b_shift2_mmx(): vertical "shift2" (half-pel) filter pass.
    * Widens 8-bit source rows and writes 8 rows of 16-bit intermediates to
    * dst, spaced 24 bytes apart (see the SHIFT2_LINE offsets 0..168), using
    * rounder rnd and shift amount `shift`; constraint list shows %2=stride,
    * %3=-2*stride and a 9*stride-4 rewind operand for the column loop.
    * NOTE(review): numbering jumps (84->86->89, 90->93, 105->110) — missing
    * from this extract: remaining parameters, the __asm__ opener, the column
    * loop control, and the clobber list.  Do not assume the visible text is
    * the complete function.
    */
   84 static void vc1_put_ver_16b_shift2_mmx(int16_t *dst,

   86                                        int rnd, int64_t
shift)

   89         "mov       $3, %%"REG_c
"           \n\t"
   90         LOAD_ROUNDER_MMX(
"%5")

   93         "movd      (%0), %%mm2             \n\t"
   95         "movd      (%0), %%mm3             \n\t"
   96         "punpcklbw %%mm0, %%mm2            \n\t"
   97         "punpcklbw %%mm0, %%mm3            \n\t"
   98         SHIFT2_LINE(  0, 1, 2, 3, 4)

   99         SHIFT2_LINE( 24, 2, 3, 4, 1)

  100         SHIFT2_LINE( 48, 3, 4, 1, 2)

  101         SHIFT2_LINE( 72, 4, 1, 2, 3)

  102         SHIFT2_LINE( 96, 1, 2, 3, 4)

  103         SHIFT2_LINE(120, 2, 3, 4, 1)

  104         SHIFT2_LINE(144, 3, 4, 1, 2)

  105         SHIFT2_LINE(168, 4, 1, 2, 3)

  110         : "+
r"(src), "+
r"(dst)

  111         : "
r"(stride), "
r"(-2*stride),

  112           "
m"(shift), "
m"(rnd), "
r"(9*stride-4)

  /*
   * VC1_HOR_16b_SHIFT2(OP, OPNAME): horizontal half-pel filter applied to the
   * 16-bit intermediates of the vertical pass.  Per 8 output pixels:
   *   out = OP(pack((9*(s[1]+s[2]) - (s[0]+s[3]) + ff_pw_128 bias + rounder)
   *                 >> shift))
   * with ff_pw_9 in %%mm5.  rnd is pre-adjusted by (-1+9+9-1)*1024 to fold
   * the two-pass rounding into one constant.
   * NOTE(review): numbering gaps (124->129, 129->131, 146->149, 151->156,
   * 158->163) — the function-body opener, loop control, shift, and clobber
   * list are missing from this extract.
   */
  122 #define VC1_HOR_16b_SHIFT2(OP, OPNAME)\
  123 static void OPNAME ## vc1_hor_16b_shift2_mmx(uint8_t *dst, x86_reg stride,\
  124                                              const int16_t *src, int rnd)\
  129     rnd -= (-1+9+9-1)*1024; \
  131         LOAD_ROUNDER_MMX("%4")\
  132         "movq      "MANGLE(ff_pw_128)", %%mm6\n\t"\
  133         "movq      "MANGLE(ff_pw_9)", %%mm5 \n\t"\
  135         "movq      2*0+0(%1), %%mm1        \n\t"\
  136         "movq      2*0+8(%1), %%mm2        \n\t"\
  137         "movq      2*1+0(%1), %%mm3        \n\t"\
  138         "movq      2*1+8(%1), %%mm4        \n\t"\
  139         "paddw     2*3+0(%1), %%mm1        \n\t"\
  140         "paddw     2*3+8(%1), %%mm2        \n\t"\
  141         "paddw     2*2+0(%1), %%mm3        \n\t"\
  142         "paddw     2*2+8(%1), %%mm4        \n\t"\
  143         "pmullw    %%mm5, %%mm3            \n\t"\
  144         "pmullw    %%mm5, %%mm4            \n\t"\
  145         "psubw     %%mm1, %%mm3            \n\t"\
  146         "psubw     %%mm2, %%mm4            \n\t"\
  149         "paddw     %%mm6, %%mm3            \n\t"\
  150         "paddw     %%mm6, %%mm4            \n\t"\
  151         TRANSFER_DO_PACK(OP)\
  156         : "+r"(h), "+r" (src),  "+r" (dst)\
  157         : "r"(stride), "m"(rnd)\
  158           NAMED_CONSTRAINTS_ADD(ff_pw_128,ff_pw_9)\
  /* Instantiate the put_ and avg_ horizontal shift2 variants. */
  163 VC1_HOR_16b_SHIFT2(
OP_PUT, put_)

  164 VC1_HOR_16b_SHIFT2(
OP_AVG, avg_)
 
  /*
   * VC1_SHIFT2(OP, OPNAME): direct 8-bit -> 8-bit half-pel filter (no 16-bit
   * intermediate).  For each group of 8 pixels:
   *   out = OP(pack((9*(s[0]+s[%2]) - s[%3] - s[%2 tap 2] + rounder) >> shift))
   * where %2 = offset and %3 = -2*offset per the constraint list; iterates
   * 8 rows via the REG_c counter.
   * NOTE(review): numbering gaps (173->177, 179->181, 205->207, 207->209,
   * 209->212, 212->214, 218->223) — the body opener, the OP(...) application
   * line, pointer advances, the loop close and the put_ instantiation are
   * missing from this extract.
   */
  171 #define VC1_SHIFT2(OP, OPNAME)\
  172 static void OPNAME ## vc1_shift2_mmx(uint8_t *dst, const uint8_t *src,\
  173                                      x86_reg stride, int rnd, x86_reg offset)\
  177         "mov       $8, %%"REG_c"           \n\t"\
  178         LOAD_ROUNDER_MMX("%5")\
  179         "movq      "MANGLE(ff_pw_9)", %%mm6\n\t"\
  181         "movd      0(%0   ), %%mm3         \n\t"\
  182         "movd      4(%0   ), %%mm4         \n\t"\
  183         "movd      0(%0,%2), %%mm1         \n\t"\
  184         "movd      4(%0,%2), %%mm2         \n\t"\
  186         "punpcklbw %%mm0, %%mm3            \n\t"\
  187         "punpcklbw %%mm0, %%mm4            \n\t"\
  188         "punpcklbw %%mm0, %%mm1            \n\t"\
  189         "punpcklbw %%mm0, %%mm2            \n\t"\
  190         "paddw     %%mm1, %%mm3            \n\t"\
  191         "paddw     %%mm2, %%mm4            \n\t"\
  192         "movd      0(%0,%3), %%mm1         \n\t"\
  193         "movd      4(%0,%3), %%mm2         \n\t"\
  194         "pmullw    %%mm6, %%mm3            \n\t" \
  195         "pmullw    %%mm6, %%mm4            \n\t" \
  196         "punpcklbw %%mm0, %%mm1            \n\t"\
  197         "punpcklbw %%mm0, %%mm2            \n\t"\
  198         "psubw     %%mm1, %%mm3            \n\t" \
  199         "psubw     %%mm2, %%mm4            \n\t" \
  200         "movd      0(%0,%2), %%mm1         \n\t"\
  201         "movd      4(%0,%2), %%mm2         \n\t"\
  202         "punpcklbw %%mm0, %%mm1            \n\t"\
  203         "punpcklbw %%mm0, %%mm2            \n\t"\
  204         "psubw     %%mm1, %%mm3            \n\t" \
  205         "psubw     %%mm2, %%mm4            \n\t" \
  207         "packuswb  %%mm4, %%mm3            \n\t"\
  209         "movq      %%mm3, (%1)             \n\t"\
  212         "dec       %%"REG_c"               \n\t"\
  214         : "+r"(src),  "+r"(dst)\
  215         : "r"(offset), "r"(-2*offset), "g"(stride), "m"(rnd),\
  217           NAMED_CONSTRAINTS_ADD(ff_pw_9)\
  218         : "%"REG_c, "memory"\
  /* Averaging instantiation (the put_ counterpart is not visible in this
   * extract; presumably emitted nearby in the full file — verify). */
  223 VC1_SHIFT2(OP_AVG, avg_)
 
  /*
   * MSPEL_FILTER13_CORE(UNPACK, MOVQ, A1..A4): shared core of the quarter-pel
   * (shift1/shift3) filters.  Accumulates into %%mm3/%%mm4:
   *   mm3/mm4 = A2*mm6 - 3*A1 - 4*A4 + A3*mm5
   * where mm5/mm6 hold the large taps loaded by callers (ff_pw_53/ff_pw_18),
   * ff_pw_3 supplies the -3 tap and "psllw $2" the -4 tap.  MOVQ is the load
   * mnemonic+scale prefix string; UNPACK widens 8-bit input (or is a no-op).
   * NOTE(review): numbering gaps (237->240, 243->246, 251->254, 259->262) —
   * the UNPACK(...) invocations between load pairs are missing from this
   * extract.
   */
  235 #define MSPEL_FILTER13_CORE(UNPACK, MOVQ, A1, A2, A3, A4)       \
  236      MOVQ "*0+"A1", %%mm1       \n\t"                           \
  237      MOVQ "*4+"A1", %%mm2       \n\t"                           \
  240      "pmullw    "MANGLE(ff_pw_3)", %%mm1\n\t"                   \
  241      "pmullw    "MANGLE(ff_pw_3)", %%mm2\n\t"                   \
  242      MOVQ "*0+"A2", %%mm3       \n\t"                           \
  243      MOVQ "*4+"A2", %%mm4       \n\t"                           \
  246      "pmullw    %%mm6, %%mm3    \n\t"                  \
  247      "pmullw    %%mm6, %%mm4    \n\t"                  \
  248      "psubw     %%mm1, %%mm3    \n\t"                \
  249      "psubw     %%mm2, %%mm4    \n\t"                \
  250      MOVQ "*0+"A4", %%mm1       \n\t"                           \
  251      MOVQ "*4+"A4", %%mm2       \n\t"                           \
  254      "psllw     $2, %%mm1       \n\t"                   \
  255      "psllw     $2, %%mm2       \n\t"                   \
  256      "psubw     %%mm1, %%mm3    \n\t"             \
  257      "psubw     %%mm2, %%mm4    \n\t"             \
  258      MOVQ "*0+"A3", %%mm1       \n\t"                           \
  259      MOVQ "*4+"A3", %%mm2       \n\t"                           \
  262      "pmullw    %%mm5, %%mm1    \n\t"                  \
  263      "pmullw    %%mm5, %%mm2    \n\t"                  \
  264      "paddw     %%mm1, %%mm3    \n\t"           \
  265      "paddw     %%mm2, %%mm4    \n\t"
  /*
   * MSPEL_FILTER13_VER_16B(NAME, A1..A4): vertical shift1/shift3 filter
   * producing 16-bit intermediates.  Runs MSPEL_FILTER13_CORE over 8 columns,
   * normalizes by the %6 (shift) operand and stores unpacked via
   * TRANSFER_DONT_PACK; then filters a 9th column by hand — 3*x is built
   * with two paddw, the 18 and 53 taps come from %%mm6/%%mm5, the -4 tap
   * from "psllw $2" — and stores it at 16(%2).
   * NOTE(review): numbering gaps (279->284, 286->289, 293->295, 312->317,
   * 320->332) — the body opener, loop control, clobbers and closer are
   * missing from this extract.
   */
  275 #define MSPEL_FILTER13_VER_16B(NAME, A1, A2, A3, A4)                    \
  277 vc1_put_ver_16b_ ## NAME ## _mmx(int16_t *dst, const uint8_t *src,      \
  278                                  x86_reg src_stride,                   \
  279                                  int rnd, int64_t shift)                \
  284         LOAD_ROUNDER_MMX("%5")                                          \
  285         "movq      "MANGLE(ff_pw_53)", %%mm5\n\t"                       \
  286         "movq      "MANGLE(ff_pw_18)", %%mm6\n\t"                       \
  289         MSPEL_FILTER13_CORE(DO_UNPACK, "movd  1", A1, A2, A3, A4)       \
  290         NORMALIZE_MMX("%6")                                             \
  291         TRANSFER_DONT_PACK(OP_PUT)                                      \
  293         "movd      8+"A1", %%mm1   \n\t"                                \
  295         "movq      %%mm1, %%mm3    \n\t"                                \
  296         "paddw     %%mm1, %%mm1    \n\t"                                \
  297         "paddw     %%mm3, %%mm1    \n\t"                        \
  298         "movd      8+"A2", %%mm3   \n\t"                                \
  300         "pmullw    %%mm6, %%mm3    \n\t"                       \
  301         "psubw     %%mm1, %%mm3    \n\t"                      \
  302         "movd      8+"A3", %%mm1   \n\t"                                \
  304         "pmullw    %%mm5, %%mm1    \n\t"                       \
  305         "paddw     %%mm1, %%mm3    \n\t"                   \
  306         "movd      8+"A4", %%mm1   \n\t"                                \
  308         "psllw     $2, %%mm1       \n\t"                        \
  309         "psubw     %%mm1, %%mm3    \n\t"                                \
  310         "paddw     %%mm7, %%mm3    \n\t"                                \
  311         "psraw     %6, %%mm3       \n\t"                                \
  312         "movq      %%mm3, 16(%2)   \n\t"                                \
  317         : "+r"(h), "+r" (src),  "+r" (dst)                              \
  318         : "r"(src_stride), "r"(3*src_stride),                           \
  319           "m"(rnd), "m"(shift)                                          \
  320           NAMED_CONSTRAINTS_ADD(ff_pw_3,ff_pw_53,ff_pw_18)              \
  /*
   * MSPEL_FILTER13_HOR_16B(NAME, A1..A4, OP, OPNAME): horizontal shift1/shift3
   * filter over 16-bit intermediates.  rnd is pre-adjusted by
   * (-4+58+13-3)*256 to fold the two-pass rounding; after the core the result
   * is shifted by 7, re-biased with ff_pw_128 and packed/stored via OP.
   * NOTE(review): numbering gaps (335->339, 343->346, 347->349, 351->356,
   * 358->371) — body opener, loop control, clobbers and closer are missing
   * from this extract.
   */
  332 #define MSPEL_FILTER13_HOR_16B(NAME, A1, A2, A3, A4, OP, OPNAME)        \
  334 OPNAME ## vc1_hor_16b_ ## NAME ## _mmx(uint8_t *dst, x86_reg stride,    \
  335                                  const int16_t *src, int rnd)           \
  339     rnd -= (-4+58+13-3)*256;                         \
  341         LOAD_ROUNDER_MMX("%4")                                          \
  342         "movq      "MANGLE(ff_pw_18)", %%mm6   \n\t"                    \
  343         "movq      "MANGLE(ff_pw_53)", %%mm5   \n\t"                    \
  346         MSPEL_FILTER13_CORE(DONT_UNPACK, "movq 2", A1, A2, A3, A4)      \
  347         NORMALIZE_MMX("$7")                                             \
  349         "paddw     "MANGLE(ff_pw_128)", %%mm3  \n\t"                    \
  350         "paddw     "MANGLE(ff_pw_128)", %%mm4  \n\t"                    \
  351         TRANSFER_DO_PACK(OP)                                            \
  356         : "+r"(h), "+r" (src),  "+r" (dst)                              \
  357         : "r"(stride), "m"(rnd)                                         \
  358           NAMED_CONSTRAINTS_ADD(ff_pw_3,ff_pw_18,ff_pw_53,ff_pw_128)    \
  /*
   * MSPEL_FILTER13_8B(NAME, A1..A4, OP, OPNAME): direct 8-bit -> 8-bit
   * shift1/shift3 filter (single pass, no 16-bit intermediate); core result
   * is normalized by 6 and packed/stored via OP.  %3 = offset and
   * %4 = 3*offset feed the A1..A4 addressing strings at the call sites.
   * NOTE(review): numbering gaps (374->380, 382->385, 387->392, 394->400) —
   * body opener, loop control, clobbers and closer are missing from this
   * extract.
   */
  371 #define MSPEL_FILTER13_8B(NAME, A1, A2, A3, A4, OP, OPNAME)             \
  373 OPNAME ## vc1_## NAME ## _mmx(uint8_t *dst, const uint8_t *src,         \
  374                         x86_reg stride, int rnd, x86_reg offset)      \
  380         LOAD_ROUNDER_MMX("%6")                                          \
  381         "movq      "MANGLE(ff_pw_53)", %%mm5       \n\t"                \
  382         "movq      "MANGLE(ff_pw_18)", %%mm6       \n\t"                \
  385         MSPEL_FILTER13_CORE(DO_UNPACK, "movd   1", A1, A2, A3, A4)      \
  386         NORMALIZE_MMX("$6")                                             \
  387         TRANSFER_DO_PACK(OP)                                            \
  392         : "+r"(h), "+r" (src),  "+r" (dst)                              \
  393         : "r"(offset), "r"(3*offset), "g"(stride), "m"(rnd)             \
  394           NAMED_CONSTRAINTS_ADD(ff_pw_53,ff_pw_18,ff_pw_3)              \
  /*
   * Instantiations: shift1 and shift3 use the same filter core with the
   * A1..A4 source-operand order reversed, for put_ and avg_ and for the
   * 8-bit, vertical-16-bit and horizontal-16-bit variants.
   * NOTE(review): the first invocation is split across lines by extraction;
   * the argument text itself appears intact.
   */
  400 MSPEL_FILTER13_8B     (
shift1,
"0(%1,%4  )",
"0(%1,%3,2)",
"0(%1,%3  )",
"0(%1     )",
OP_PUT, put_)

  401 MSPEL_FILTER13_8B     (
shift1, "0(%1,%4  )", "0(%1,%3,2)", "0(%1,%3  )", "0(%1     )", OP_AVG, avg_)

  402 MSPEL_FILTER13_VER_16B(shift1, "0(%1,%4  )", "0(%1,%3,2)", "0(%1,%3  )", "0(%1     )")

  403 MSPEL_FILTER13_HOR_16B(shift1, "2*3(%1)", "2*2(%1)", "2*1(%1)", "2*0(%1)",
OP_PUT, put_)

  404 MSPEL_FILTER13_HOR_16B(shift1, "2*3(%1)", "2*2(%1)", "2*1(%1)", "2*0(%1)", OP_AVG, avg_)

  407 MSPEL_FILTER13_8B     (shift3, "0(%1     )", "0(%1,%3  )", "0(%1,%3,2)", "0(%1,%4  )", OP_PUT, put_)

  408 MSPEL_FILTER13_8B     (shift3, "0(%1     )", "0(%1,%3  )", "0(%1,%3,2)", "0(%1,%4  )", OP_AVG, avg_)

  409 MSPEL_FILTER13_VER_16B(shift3, "0(%1     )", "0(%1,%3  )", "0(%1,%3,2)", "0(%1,%4  )")

  410 MSPEL_FILTER13_HOR_16B(shift3, "2*0(%1)", "2*1(%1)", "2*2(%1)", "2*3(%1)", OP_PUT, put_)

  411 MSPEL_FILTER13_HOR_16B(shift3, "2*0(%1)", "2*1(%1)", "2*2(%1)", "2*3(%1)", OP_AVG, avg_)
 
  /* Function-pointer types for the two mspel filter passes: vertical
   * (8-bit src -> 16-bit dst) and horizontal (16-bit src -> 8-bit dst),
   * used to build the dispatch tables in VC1_MSPEL_MC below. */
  413 typedef
void (*vc1_mspel_mc_filter_ver_16bits)(int16_t *dst, const
uint8_t *src,
x86_reg src_stride,
int rnd, int64_t shift);

  414 typedef
void (*vc1_mspel_mc_filter_hor_16bits)(
uint8_t *dst,
x86_reg dst_stride, const int16_t *src,
int rnd);
 
  /*
   * VC1_MSPEL_MC(OP): 8x8 motion-compensation dispatcher.  Selects per-axis
   * filters from the shift tables indexed by hmode/vmode:
   *  - both nonzero: vertical pass into an aligned 12*8 int16 tmp buffer
   *    (rounder r = (1<<(shift-1)) + rnd - 1), then horizontal pass with
   *    64-rnd; shift is averaged from shift_value[] = {0, 5, 1, 5};
   *  - only vmode: direct 8-bit vertical filter with rounder 1-rnd;
   *  - only hmode: direct 8-bit horizontal filter (offset 1).
   * The _16 variant applies the 8x8 dispatcher to the four quadrants of a
   * 16x16 block.
   * NOTE(review): numbering gaps (430->432, 437->440, 440->446, 447->449,
   * 449->451, 452->454, 454->458, 458->464, 464->466) — the if/else
   * structure, the asm wrapper around the pxor, and the closers are missing
   * from this extract.
   */
  428 #define VC1_MSPEL_MC(OP)\
  429 static void OP ## vc1_mspel_mc(uint8_t *dst, const uint8_t *src, int stride,\
  430                                int hmode, int vmode, int rnd)\
  432     static const vc1_mspel_mc_filter_ver_16bits vc1_put_shift_ver_16bits[] =\
  433          { NULL, vc1_put_ver_16b_shift1_mmx, vc1_put_ver_16b_shift2_mmx, vc1_put_ver_16b_shift3_mmx };\
  434     static const vc1_mspel_mc_filter_hor_16bits vc1_put_shift_hor_16bits[] =\
  435          { NULL, OP ## vc1_hor_16b_shift1_mmx, OP ## vc1_hor_16b_shift2_mmx, OP ## vc1_hor_16b_shift3_mmx };\
  436     static const vc1_mspel_mc_filter_8bits vc1_put_shift_8bits[] =\
  437          { NULL, OP ## vc1_shift1_mmx, OP ## vc1_shift2_mmx, OP ## vc1_shift3_mmx };\
  440         "pxor %%mm0, %%mm0         \n\t"\
  446             static const int shift_value[] = { 0, 5, 1, 5 };\
  447             int              shift = (shift_value[hmode]+shift_value[vmode])>>1;\
  449             DECLARE_ALIGNED(16, int16_t, tmp)[12*8];\
  451             r = (1<<(shift-1)) + rnd-1;\
  452             vc1_put_shift_ver_16bits[vmode](tmp, src-1, stride, r, shift);\
  454             vc1_put_shift_hor_16bits[hmode](dst, stride, tmp+1, 64-rnd);\
  458             vc1_put_shift_8bits[vmode](dst, src, stride, 1-rnd, stride);\
  464     vc1_put_shift_8bits[hmode](dst, src, stride, rnd, 1);\
  466 static void OP ## vc1_mspel_mc_16(uint8_t *dst, const uint8_t *src, \
  467                                   int stride, int hmode, int vmode, int rnd)\
  469     OP ## vc1_mspel_mc(dst + 0, src + 0, stride, hmode, vmode, rnd); \
  470     OP ## vc1_mspel_mc(dst + 8, src + 8, stride, hmode, vmode, rnd); \
  471     dst += 8*stride; src += 8*stride; \
  472     OP ## vc1_mspel_mc(dst + 0, src + 0, stride, hmode, vmode, rnd); \
  473     OP ## vc1_mspel_mc(dst + 8, src + 8, stride, hmode, vmode, rnd); \
  /*
   * DECLARE_FUNCTION(a, b): stamps out the four public wrappers for motion
   * mode (a, b) — put/avg at 8x8 (_mmx/_mmxext) and 16x16 (_16_mmx/
   * _16_mmxext) — each forwarding to the generic dispatcher with hmode=a,
   * vmode=b.
   * NOTE(review): the remaining parameters and bodies are truncated by
   * extraction (gaps 482->486, 489->493, 496->500, 502->507).
   */
  480 #define DECLARE_FUNCTION(a, b)                                          \
  481 static void put_vc1_mspel_mc ## a ## b ## _mmx(uint8_t *dst,            \
  482                                                const uint8_t *src,      \
  486      put_vc1_mspel_mc(dst, src, stride, a, b, rnd);                     \
  488 static void avg_vc1_mspel_mc ## a ## b ## _mmxext(uint8_t *dst,         \
  489                                                   const uint8_t *src,   \
  493      avg_vc1_mspel_mc(dst, src, stride, a, b, rnd);                     \
  495 static void put_vc1_mspel_mc ## a ## b ## _16_mmx(uint8_t *dst,         \
  496                                                   const uint8_t *src,   \
  500      put_vc1_mspel_mc_16(dst, src, stride, a, b, rnd);                  \
  502 static void avg_vc1_mspel_mc ## a ## b ## _16_mmxext(uint8_t *dst,      \
  507      avg_vc1_mspel_mc_16(dst, src, stride, a, b, rnd);                  \
  /* Every fractional-pel mode except (0,0), which is served by the dedicated
   * mc00 copy/average fast paths further below. */
  510 DECLARE_FUNCTION(0, 1)

  511 DECLARE_FUNCTION(0, 2)

  512 DECLARE_FUNCTION(0, 3)

  514 DECLARE_FUNCTION(1, 0)

  515 DECLARE_FUNCTION(1, 1)

  516 DECLARE_FUNCTION(1, 2)

  517 DECLARE_FUNCTION(1, 3)

  519 DECLARE_FUNCTION(2, 0)

  520 DECLARE_FUNCTION(2, 1)

  521 DECLARE_FUNCTION(2, 2)

  522 DECLARE_FUNCTION(2, 3)

  524 DECLARE_FUNCTION(3, 0)

  525 DECLARE_FUNCTION(3, 1)

  526 DECLARE_FUNCTION(3, 2)

  527 DECLARE_FUNCTION(3, 3)
 
  /*
   * vc1_inv_trans_4x4_dc_mmxext(): add the DC-only inverse-transform result
   * to a 4x4 block.  dc is scaled twice ((17*dc+4)>>3 then (17*dc+64)>>7),
   * broadcast across %%mm0 with its negation in %%mm1, then applied to four
   * 4-pixel rows using unsigned-saturating paddusb/psubusb so both positive
   * and negative dc clamp to [0,255].
   * NOTE(review): the dc parameter, __asm__ openers/closers and function
   * braces are not visible (gaps 529->533, 534->536, 541->545, 564->568).
   */
  529 static
void vc1_inv_trans_4x4_dc_mmxext(
uint8_t *dest,
int linesize,

  533     dc = (17 * dc +  4) >> 3;

  534     dc = (17 * dc + 64) >> 7;

  536         "movd          %0, %%mm0 \n\t"
  537         "pshufw $0, %%mm0, %%mm0 \n\t"
  538         "pxor       %%mm1, %%mm1 \n\t"
  539         "psubw      %%mm0, %%mm1 \n\t"
  540         "packuswb   %%mm0, %%mm0 \n\t"
  541         "packuswb   %%mm1, %%mm1 \n\t"
  545         "movd          %0, %%mm2 \n\t"
  546         "movd          %1, %%mm3 \n\t"
  547         "movd          %2, %%mm4 \n\t"
  548         "movd          %3, %%mm5 \n\t"
  549         "paddusb    %%mm0, %%mm2 \n\t"
  550         "paddusb    %%mm0, %%mm3 \n\t"
  551         "paddusb    %%mm0, %%mm4 \n\t"
  552         "paddusb    %%mm0, %%mm5 \n\t"
  553         "psubusb    %%mm1, %%mm2 \n\t"
  554         "psubusb    %%mm1, %%mm3 \n\t"
  555         "psubusb    %%mm1, %%mm4 \n\t"
  556         "psubusb    %%mm1, %%mm5 \n\t"
  557         "movd       %%mm2, %0    \n\t"
  558         "movd       %%mm3, %1    \n\t"
  559         "movd       %%mm4, %2    \n\t"
  560         "movd       %%mm5, %3    \n\t"
  561         :
"+m"(*(uint32_t*)(dest+0*linesize)),

  562          "+m"(*(uint32_t*)(dest+1*linesize)),

  563          "+m"(*(uint32_t*)(dest+2*linesize)),

  564          "+m"(*(uint32_t*)(dest+3*linesize))
 
  /*
   * vc1_inv_trans_4x8_dc_mmxext(): DC-only inverse transform for a 4x8
   * block.  dc is scaled by (17*dc+4)>>3 then (12*dc+64)>>7, broadcast to
   * %%mm0 (negation in %%mm1), and applied with saturating add/sub to two
   * groups of four 4-pixel rows.
   * NOTE(review): the dc parameter, asm openers/closers and function braces
   * are not visible (gaps 568->572, 580->584, 603->607, 626->630).
   */
  568 static void vc1_inv_trans_4x8_dc_mmxext(
uint8_t *dest,
int linesize,

  572     dc = (17 * dc +  4) >> 3;

  573     dc = (12 * dc + 64) >> 7;

  575         "movd          %0, %%mm0 \n\t"
  576         "pshufw $0, %%mm0, %%mm0 \n\t"
  577         "pxor       %%mm1, %%mm1 \n\t"
  578         "psubw      %%mm0, %%mm1 \n\t"
  579         "packuswb   %%mm0, %%mm0 \n\t"
  580         "packuswb   %%mm1, %%mm1 \n\t"
  584         "movd          %0, %%mm2 \n\t"
  585         "movd          %1, %%mm3 \n\t"
  586         "movd          %2, %%mm4 \n\t"
  587         "movd          %3, %%mm5 \n\t"
  588         "paddusb    %%mm0, %%mm2 \n\t"
  589         "paddusb    %%mm0, %%mm3 \n\t"
  590         "paddusb    %%mm0, %%mm4 \n\t"
  591         "paddusb    %%mm0, %%mm5 \n\t"
  592         "psubusb    %%mm1, %%mm2 \n\t"
  593         "psubusb    %%mm1, %%mm3 \n\t"
  594         "psubusb    %%mm1, %%mm4 \n\t"
  595         "psubusb    %%mm1, %%mm5 \n\t"
  596         "movd       %%mm2, %0    \n\t"
  597         "movd       %%mm3, %1    \n\t"
  598         "movd       %%mm4, %2    \n\t"
  599         "movd       %%mm5, %3    \n\t"
  600         :
"+m"(*(uint32_t*)(dest+0*linesize)),

  601          "+m"(*(uint32_t*)(dest+1*linesize)),

  602          "+m"(*(uint32_t*)(dest+2*linesize)),

  603          "+m"(*(uint32_t*)(dest+3*linesize))

        /* Second group of four rows.  NOTE(review): the operands again use
         * dest+0..3*linesize — presumably dest is advanced by 4*linesize in
         * code not visible in this extract; verify against the full file. */
  607         "movd          %0, %%mm2 \n\t"
  608         "movd          %1, %%mm3 \n\t"
  609         "movd          %2, %%mm4 \n\t"
  610         "movd          %3, %%mm5 \n\t"
  611         "paddusb    %%mm0, %%mm2 \n\t"
  612         "paddusb    %%mm0, %%mm3 \n\t"
  613         "paddusb    %%mm0, %%mm4 \n\t"
  614         "paddusb    %%mm0, %%mm5 \n\t"
  615         "psubusb    %%mm1, %%mm2 \n\t"
  616         "psubusb    %%mm1, %%mm3 \n\t"
  617         "psubusb    %%mm1, %%mm4 \n\t"
  618         "psubusb    %%mm1, %%mm5 \n\t"
  619         "movd       %%mm2, %0    \n\t"
  620         "movd       %%mm3, %1    \n\t"
  621         "movd       %%mm4, %2    \n\t"
  622         "movd       %%mm5, %3    \n\t"
  623         :
"+m"(*(uint32_t*)(dest+0*linesize)),

  624          "+m"(*(uint32_t*)(dest+1*linesize)),

  625          "+m"(*(uint32_t*)(dest+2*linesize)),

  626          "+m"(*(uint32_t*)(dest+3*linesize))
 
  /*
   * vc1_inv_trans_8x4_dc_mmxext(): DC-only inverse transform for an 8x4
   * block.  dc is scaled by (3*dc+1)>>1 then (17*dc+64)>>7, broadcast to
   * %%mm0 (negation in %%mm1), and applied with saturating add/sub to four
   * full 8-pixel rows (movq loads/stores).
   * NOTE(review): the dc parameter, asm openers/closers and function braces
   * are not visible (gaps 630->634, 642->646, 665->669).  The "+m" operand
   * casts cover only 4 of the 8 bytes each movq touches — presumably a
   * "memory" clobber elsewhere makes this safe; confirm in the full file.
   */
  630 static void vc1_inv_trans_8x4_dc_mmxext(
uint8_t *dest,
int linesize,

  634     dc = ( 3 * dc +  1) >> 1;

  635     dc = (17 * dc + 64) >> 7;

  637         "movd          %0, %%mm0 \n\t"
  638         "pshufw $0, %%mm0, %%mm0 \n\t"
  639         "pxor       %%mm1, %%mm1 \n\t"
  640         "psubw      %%mm0, %%mm1 \n\t"
  641         "packuswb   %%mm0, %%mm0 \n\t"
  642         "packuswb   %%mm1, %%mm1 \n\t"
  646         "movq          %0, %%mm2 \n\t"
  647         "movq          %1, %%mm3 \n\t"
  648         "movq          %2, %%mm4 \n\t"
  649         "movq          %3, %%mm5 \n\t"
  650         "paddusb    %%mm0, %%mm2 \n\t"
  651         "paddusb    %%mm0, %%mm3 \n\t"
  652         "paddusb    %%mm0, %%mm4 \n\t"
  653         "paddusb    %%mm0, %%mm5 \n\t"
  654         "psubusb    %%mm1, %%mm2 \n\t"
  655         "psubusb    %%mm1, %%mm3 \n\t"
  656         "psubusb    %%mm1, %%mm4 \n\t"
  657         "psubusb    %%mm1, %%mm5 \n\t"
  658         "movq       %%mm2, %0    \n\t"
  659         "movq       %%mm3, %1    \n\t"
  660         "movq       %%mm4, %2    \n\t"
  661         "movq       %%mm5, %3    \n\t"
  662         :
"+m"(*(uint32_t*)(dest+0*linesize)),

  663          "+m"(*(uint32_t*)(dest+1*linesize)),

  664          "+m"(*(uint32_t*)(dest+2*linesize)),

  665          "+m"(*(uint32_t*)(dest+3*linesize))
 
  /*
   * vc1_inv_trans_8x8_dc_mmxext(): DC-only inverse transform for an 8x8
   * block.  dc is scaled by (3*dc+1)>>1 then (3*dc+16)>>5, broadcast to
   * %%mm0 (negation in %%mm1), and applied with saturating add/sub to two
   * groups of four 8-pixel rows.
   * NOTE(review): the dc parameter, asm openers/closers and function braces
   * are not visible (gaps 669->673, 681->685, 704->708, 727->731).
   */
  669 static void vc1_inv_trans_8x8_dc_mmxext(
uint8_t *dest,
int linesize,

  673     dc = (3 * dc +  1) >> 1;

  674     dc = (3 * dc + 16) >> 5;

  676         "movd          %0, %%mm0 \n\t"
  677         "pshufw $0, %%mm0, %%mm0 \n\t"
  678         "pxor       %%mm1, %%mm1 \n\t"
  679         "psubw      %%mm0, %%mm1 \n\t"
  680         "packuswb   %%mm0, %%mm0 \n\t"
  681         "packuswb   %%mm1, %%mm1 \n\t"
  685         "movq          %0, %%mm2 \n\t"
  686         "movq          %1, %%mm3 \n\t"
  687         "movq          %2, %%mm4 \n\t"
  688         "movq          %3, %%mm5 \n\t"
  689         "paddusb    %%mm0, %%mm2 \n\t"
  690         "paddusb    %%mm0, %%mm3 \n\t"
  691         "paddusb    %%mm0, %%mm4 \n\t"
  692         "paddusb    %%mm0, %%mm5 \n\t"
  693         "psubusb    %%mm1, %%mm2 \n\t"
  694         "psubusb    %%mm1, %%mm3 \n\t"
  695         "psubusb    %%mm1, %%mm4 \n\t"
  696         "psubusb    %%mm1, %%mm5 \n\t"
  697         "movq       %%mm2, %0    \n\t"
  698         "movq       %%mm3, %1    \n\t"
  699         "movq       %%mm4, %2    \n\t"
  700         "movq       %%mm5, %3    \n\t"
  701         :
"+m"(*(uint32_t*)(dest+0*linesize)),

  702          "+m"(*(uint32_t*)(dest+1*linesize)),

  703          "+m"(*(uint32_t*)(dest+2*linesize)),

  704          "+m"(*(uint32_t*)(dest+3*linesize))

        /* Second group of four rows.  NOTE(review): operands again reference
         * dest+0..3*linesize — presumably dest is advanced by 4*linesize in
         * code not visible in this extract; verify against the full file. */
  708         "movq          %0, %%mm2 \n\t"
  709         "movq          %1, %%mm3 \n\t"
  710         "movq          %2, %%mm4 \n\t"
  711         "movq          %3, %%mm5 \n\t"
  712         "paddusb    %%mm0, %%mm2 \n\t"
  713         "paddusb    %%mm0, %%mm3 \n\t"
  714         "paddusb    %%mm0, %%mm4 \n\t"
  715         "paddusb    %%mm0, %%mm5 \n\t"
  716         "psubusb    %%mm1, %%mm2 \n\t"
  717         "psubusb    %%mm1, %%mm3 \n\t"
  718         "psubusb    %%mm1, %%mm4 \n\t"
  719         "psubusb    %%mm1, %%mm5 \n\t"
  720         "movq       %%mm2, %0    \n\t"
  721         "movq       %%mm3, %1    \n\t"
  722         "movq       %%mm4, %2    \n\t"
  723         "movq       %%mm5, %3    \n\t"
  724         :
"+m"(*(uint32_t*)(dest+0*linesize)),

  725          "+m"(*(uint32_t*)(dest+1*linesize)),

  726          "+m"(*(uint32_t*)(dest+2*linesize)),

  727          "+m"(*(uint32_t*)(dest+3*linesize))
 
  731 #if HAVE_MMX_EXTERNAL 
  /*
   * mc00 fast paths for the integer-pel (hmode == vmode == 0) case, in
   * put/avg and 8x8/16x16 flavours.
   * NOTE(review): the function bodies are not visible in this extract —
   * under the HAVE_MMX_EXTERNAL guard they presumably forward to external
   * put/avg pixel copy routines; confirm against the full file.
   */
  732 static void put_vc1_mspel_mc00_mmx(
uint8_t *dst,
const uint8_t *src,

  733                                    ptrdiff_t stride,
int rnd)

  737 static void put_vc1_mspel_mc00_16_mmx(
uint8_t *dst,
const uint8_t *src,

  738                                       ptrdiff_t stride,
int rnd)

  742 static void avg_vc1_mspel_mc00_mmx(
uint8_t *dst,
const uint8_t *src,

  743                                    ptrdiff_t stride,
int rnd)

  747 static void avg_vc1_mspel_mc00_16_mmx(
uint8_t *dst,
const uint8_t *src,

  748                                       ptrdiff_t stride,
int rnd)

 
  /* FN_ASSIGN(OP, X, Y, INSN): install the (X, Y) mspel function pair into
   * the dsp dispatch table — index [1] holds the 8x8 variant, index [0] the
   * 16x16 (_16) variant. */
  754 #define FN_ASSIGN(OP, X, Y, INSN) \
  755     dsp->OP##vc1_mspel_pixels_tab[1][X+4*Y] = OP##vc1_mspel_mc##X##Y##INSN; \
  756     dsp->OP##vc1_mspel_pixels_tab[0][X+4*Y] = OP##vc1_mspel_mc##X##Y##_16##INSN
  760 #if HAVE_MMX_EXTERNAL