/*
 * Helpers that emit one "movntq" (MMX non-temporal quadword store) line of
 * inline assembly.
 *
 * REAL_MOVNTQ(a,b) stringifies both arguments into "movntq a, b \n\t".
 * MOVNTQ(a,b) only forwards to REAL_MOVNTQ, so that arguments which are
 * themselves macros get expanded before they are stringified.
 * MOVNTQ2 is just the bare mnemonic, for callers that append the operands
 * themselves.
 */
32 #define REAL_MOVNTQ(a,b) "movntq " #a ", " #b " \n\t" 
   33 #define MOVNTQ2 "movntq " 
   34 #define MOVNTQ(a,b)  REAL_MOVNTQ(a,b) 
   /*
    * Chroma half of the (fast, pmulhw-based) vertical filter.
    *
    * Register use in the lines shown:
    *   FF_REG_a  pixel byte index, cleared here
    *   FF_REG_d  cursor into the filter list at CHR_MMX_FILTER_OFFSET(%0);
    *             each entry is 16 bytes: source pointer at +0, coefficient
    *             quadword at +8
    *   FF_REG_S  current source pointer taken from the list
    *   mm3, mm4  accumulators, both seeded from VROUNDER_OFFSET(%0)
    *
    * Per filter tap: four words are loaded at (FF_REG_S, FF_REG_a) and four
    * more after FF_REG_S has been advanced by operand %6; both are multiplied
    * by the tap coefficient with pmulhw and added to mm3 / mm4 respectively.
    * The next list entry's pointer is then loaded and "test"ed, so the flags
    * tell whether the list terminator (a zero pointer) was reached.
    *
    * NOTE(review): the loop labels, the branch consuming those flags and the
    * opening of the asm statement are not visible in this excerpt.
    */
   36 #define YSCALEYUV2PACKEDX_UV \ 
   38         "xor                %%"FF_REG_a", %%"FF_REG_a"  \n\t"\ 
   42         "lea "CHR_MMX_FILTER_OFFSET"(%0), %%"FF_REG_d"  \n\t"\ 
   43         "mov              (%%"FF_REG_d"), %%"FF_REG_S"  \n\t"\ 
   44         "movq      "VROUNDER_OFFSET"(%0), %%mm3         \n\t"\ 
   45         "movq                      %%mm3, %%mm4         \n\t"\ 
   48         "movq            8(%%"FF_REG_d"), %%mm0         \n\t" \ 
   49         "movq  (%%"FF_REG_S", %%"FF_REG_a"), %%mm2      \n\t" \ 
   50         "add                          %6, %%"FF_REG_S"  \n\t" \ 
   51         "movq  (%%"FF_REG_S", %%"FF_REG_a"), %%mm5      \n\t" \ 
   52         "add                         $16, %%"FF_REG_d"  \n\t"\ 
   53         "mov              (%%"FF_REG_d"), %%"FF_REG_S"  \n\t"\ 
   54         "pmulhw                    %%mm0, %%mm2         \n\t"\ 
   55         "pmulhw                    %%mm0, %%mm5         \n\t"\ 
   56         "paddw                     %%mm2, %%mm3         \n\t"\ 
   57         "paddw                     %%mm5, %%mm4         \n\t"\ 
   58         "test               %%"FF_REG_S", %%"FF_REG_S"  \n\t"\ 
   /*
    * Luma/alpha half of the (fast, pmulhw-based) vertical filter.
    *
    * Parameters:
    *   offset      byte offset (relative to %0) of the filter list to walk
    *   coeff       MMX register receiving the per-tap coefficient
    *   src1, src2  MMX scratch registers for the two 4-word source loads
    *   dst1, dst2  MMX accumulators; both start from VROUNDER_OFFSET(%0)
    *
    * For every tap, eight consecutive words are read at
    * (FF_REG_S + 2*FF_REG_a) and (FF_REG_S + 2*FF_REG_a + 8), multiplied by
    * the coefficient with pmulhw and accumulated into dst1 / dst2.  FF_REG_d
    * steps through 16-byte list entries (pointer at +0, coefficient at +8) and
    * the final "test" sets the flags on the next source pointer.
    *
    * NOTE(review): the loop label and the branch that uses those flags are not
    * visible in this excerpt.
    */
   61 #define YSCALEYUV2PACKEDX_YA(offset,coeff,src1,src2,dst1,dst2) \ 
   62     "lea                "offset"(%0), %%"FF_REG_d"  \n\t"\ 
   63     "mov              (%%"FF_REG_d"), %%"FF_REG_S"  \n\t"\ 
   64     "movq      "VROUNDER_OFFSET"(%0), "#dst1"       \n\t"\ 
   65     "movq                    "#dst1", "#dst2"       \n\t"\ 
   68     "movq            8(%%"FF_REG_d"), "#coeff"      \n\t" \ 
   69     "movq  (%%"FF_REG_S", %%"FF_REG_a", 2), "#src1" \n\t" \ 
   70     "movq 8(%%"FF_REG_S", %%"FF_REG_a", 2), "#src2" \n\t" \ 
   71     "add                         $16, %%"FF_REG_d"  \n\t"\ 
   72     "mov              (%%"FF_REG_d"), %%"FF_REG_S"  \n\t"\ 
   73     "pmulhw                 "#coeff", "#src1"       \n\t"\ 
   74     "pmulhw                 "#coeff", "#src2"       \n\t"\ 
   75     "paddw                   "#src1", "#dst1"       \n\t"\ 
   76     "paddw                   "#src2", "#dst2"       \n\t"\ 
   77     "test               %%"FF_REG_S", %%"FF_REG_S"  \n\t"\ 
   /*
    * Complete fast vertical filter: the chroma pass followed by the luma pass
    * over the list at LUM_MMX_FILTER_OFFSET.  The luma pass is instantiated
    * with mm0 as coefficient, mm2/mm5 as scratch and mm1/mm7 as accumulators,
    * so afterwards mm3/mm4 hold the chroma sums and mm1/mm7 the luma sums.
    */
   80 #define YSCALEYUV2PACKEDX \ 
   81     YSCALEYUV2PACKEDX_UV \ 
   82     YSCALEYUV2PACKEDX_YA(LUM_MMX_FILTER_OFFSET,%%mm0,%%mm2,%%mm5,%%mm1,%%mm7) \ 
   /*
    * Operand and clobber lists that close an asm statement built from the
    * YSCALEYUV2PACKEDX* macros.  There are no outputs.  Inputs, by number:
    *   %0        &c->redDither  (base address for all *_OFFSET accesses)
    *   %1 .. %3  dummy          (placeholders that keep the numbering fixed)
    *   %4        dest
    *   %5        dstW_reg
    *   %6        uv_off
    * NAMED_CONSTRAINTS_ADD(bF8,bFC) appends the named constants to the input
    * list.  FF_REG_a, FF_REG_d and FF_REG_S are declared clobbered.
    *
    * NOTE(review): the text that terminates the asm statement is not visible
    * in this excerpt.
    */
   84 #define YSCALEYUV2PACKEDX_END                     \ 
   85         :: "r" (&c->redDither),                   \ 
   86             "m" (dummy), "m" (dummy), "m" (dummy),\ 
   87             "r" (dest), "m" (dstW_reg), "m"(uv_off) \ 
   88             NAMED_CONSTRAINTS_ADD(bF8,bFC) \ 
   89         : "%"FF_REG_a, "%"FF_REG_d, "%"FF_REG_S            \ 
   /*
    * Chroma half of the accurate (pmaddwd-based, 32-bit accumulating) vertical
    * filter.
    *
    * FF_REG_a is cleared and FF_REG_d points at the filter list at
    * CHR_MMX_FILTER_OFFSET(%0).  mm4/mm5 and mm6/mm7 are zeroed and used as
    * dword accumulators for the two chroma planes (the second plane is read at
    * the source pointer plus operand %6).
    *
    * Each list entry covers two taps: the first source pointer is at +0, the
    * second at APCK_PTR2 and the coefficient pair at APCK_COEF; entries are
    * APCK_SIZE bytes apart.  Samples of the two taps are interleaved with
    * punpcklwd/punpckhwd so that one pmaddwd yields
    * sample0*coeff0 + sample1*coeff1 per output dword.
    *
    * After the taps: sums are shifted right by 16, packed to signed words,
    * the rounder at VROUNDER_OFFSET(%0) is added, and the two results are
    * parked in U_TEMP(%0) and V_TEMP(%0).
    *
    * NOTE(review): the loop labels and the branch using the flags set by
    * "test" are not visible in this excerpt.
    */
   92 #define YSCALEYUV2PACKEDX_ACCURATE_UV \ 
   94         "xor %%"FF_REG_a", %%"FF_REG_a"                 \n\t"\ 
   98         "lea "CHR_MMX_FILTER_OFFSET"(%0), %%"FF_REG_d"  \n\t"\ 
   99         "mov              (%%"FF_REG_d"), %%"FF_REG_S"  \n\t"\ 
  100         "pxor                      %%mm4, %%mm4         \n\t"\ 
  101         "pxor                      %%mm5, %%mm5         \n\t"\ 
  102         "pxor                      %%mm6, %%mm6         \n\t"\ 
  103         "pxor                      %%mm7, %%mm7         \n\t"\ 
  106         "movq  (%%"FF_REG_S", %%"FF_REG_a"), %%mm0      \n\t" \ 
  107         "add                          %6, %%"FF_REG_S"  \n\t" \ 
  108         "movq  (%%"FF_REG_S", %%"FF_REG_a"), %%mm2      \n\t" \ 
  109         "mov "STR(APCK_PTR2)"(%%"FF_REG_d"), %%"FF_REG_S" \n\t"\ 
  110         "movq  (%%"FF_REG_S", %%"FF_REG_a"), %%mm1      \n\t" \ 
  111         "movq                      %%mm0, %%mm3         \n\t"\ 
  112         "punpcklwd                 %%mm1, %%mm0         \n\t"\ 
  113         "punpckhwd                 %%mm1, %%mm3         \n\t"\ 
  114         "movq "STR(APCK_COEF)"(%%"FF_REG_d"),%%mm1      \n\t" \ 
  115         "pmaddwd                   %%mm1, %%mm0         \n\t"\ 
  116         "pmaddwd                   %%mm1, %%mm3         \n\t"\ 
  117         "paddd                     %%mm0, %%mm4         \n\t"\ 
  118         "paddd                     %%mm3, %%mm5         \n\t"\ 
  119         "add                          %6, %%"FF_REG_S"  \n\t" \ 
  120         "movq  (%%"FF_REG_S", %%"FF_REG_a"), %%mm3      \n\t" \ 
  121         "mov "STR(APCK_SIZE)"(%%"FF_REG_d"), %%"FF_REG_S" \n\t"\ 
  122         "add           $"STR(APCK_SIZE)", %%"FF_REG_d"  \n\t"\ 
  123         "test               %%"FF_REG_S", %%"FF_REG_S"  \n\t"\ 
  124         "movq                      %%mm2, %%mm0         \n\t"\ 
  125         "punpcklwd                 %%mm3, %%mm2         \n\t"\ 
  126         "punpckhwd                 %%mm3, %%mm0         \n\t"\ 
  127         "pmaddwd                   %%mm1, %%mm2         \n\t"\ 
  128         "pmaddwd                   %%mm1, %%mm0         \n\t"\ 
  129         "paddd                     %%mm2, %%mm6         \n\t"\ 
  130         "paddd                     %%mm0, %%mm7         \n\t"\ 
  132         "psrad                       $16, %%mm4         \n\t"\ 
  133         "psrad                       $16, %%mm5         \n\t"\ 
  134         "psrad                       $16, %%mm6         \n\t"\ 
  135         "psrad                       $16, %%mm7         \n\t"\ 
  136         "movq      "VROUNDER_OFFSET"(%0), %%mm0         \n\t"\ 
  137         "packssdw                  %%mm5, %%mm4         \n\t"\ 
  138         "packssdw                  %%mm7, %%mm6         \n\t"\ 
  139         "paddw                     %%mm0, %%mm4         \n\t"\ 
  140         "paddw                     %%mm0, %%mm6         \n\t"\ 
  141         "movq                      %%mm4, "U_TEMP"(%0)  \n\t"\ 
  142         "movq                      %%mm6, "V_TEMP"(%0)  \n\t"\ 
  /*
   * Luma/alpha half of the accurate (pmaddwd-based) vertical filter.
   *
   * offset selects the filter list (relative to %0) to walk.  Eight
   * consecutive words are processed per pass: the low four are accumulated in
   * mm1/mm5 and the high four (loaded from +8) in mm7/mm6, all as dwords.
   * As in the chroma variant, each list entry supplies two source pointers
   * (+0 and APCK_PTR2) and a coefficient pair (APCK_COEF), and entries are
   * APCK_SIZE bytes apart; samples of both taps are interleaved so pmaddwd
   * computes both products at once.
   *
   * On exit: mm1 and mm7 hold the packed, rounded 16-bit results, and mm3 /
   * mm4 are reloaded from U_TEMP(%0) / V_TEMP(%0).  mm0, mm2, mm5 and mm6 are
   * overwritten.
   *
   * NOTE(review): the loop label and the branch using the flags set by "test"
   * are not visible in this excerpt.
   */
  144 #define YSCALEYUV2PACKEDX_ACCURATE_YA(offset) \ 
  145     "lea                "offset"(%0), %%"FF_REG_d"      \n\t"\ 
  146     "mov                 (%%"FF_REG_d"), %%"FF_REG_S"   \n\t"\ 
  147     "pxor                      %%mm1, %%mm1         \n\t"\ 
  148     "pxor                      %%mm5, %%mm5         \n\t"\ 
  149     "pxor                      %%mm7, %%mm7         \n\t"\ 
  150     "pxor                      %%mm6, %%mm6         \n\t"\ 
  153     "movq  (%%"FF_REG_S", %%"FF_REG_a", 2), %%mm0       \n\t" \ 
  154     "movq 8(%%"FF_REG_S", %%"FF_REG_a", 2), %%mm2       \n\t" \ 
  155     "mov "STR(APCK_PTR2)"(%%"FF_REG_d"), %%"FF_REG_S"   \n\t"\ 
  156     "movq  (%%"FF_REG_S", %%"FF_REG_a", 2), %%mm4       \n\t" \ 
  157     "movq                      %%mm0, %%mm3         \n\t"\ 
  158     "punpcklwd                 %%mm4, %%mm0         \n\t"\ 
  159     "punpckhwd                 %%mm4, %%mm3         \n\t"\ 
  160     "movq "STR(APCK_COEF)"(%%"FF_REG_d"), %%mm4     \n\t" \ 
  161     "pmaddwd                   %%mm4, %%mm0         \n\t"\ 
  162     "pmaddwd                   %%mm4, %%mm3         \n\t"\ 
  163     "paddd                     %%mm0, %%mm1         \n\t"\ 
  164     "paddd                     %%mm3, %%mm5         \n\t"\ 
  165     "movq 8(%%"FF_REG_S", %%"FF_REG_a", 2), %%mm3   \n\t" \ 
  166     "mov "STR(APCK_SIZE)"(%%"FF_REG_d"), %%"FF_REG_S" \n\t"\ 
  167     "add           $"STR(APCK_SIZE)", %%"FF_REG_d"  \n\t"\ 
  168     "test               %%"FF_REG_S", %%"FF_REG_S"  \n\t"\ 
  169     "movq                      %%mm2, %%mm0         \n\t"\ 
  170     "punpcklwd                 %%mm3, %%mm2         \n\t"\ 
  171     "punpckhwd                 %%mm3, %%mm0         \n\t"\ 
  172     "pmaddwd                   %%mm4, %%mm2         \n\t"\ 
  173     "pmaddwd                   %%mm4, %%mm0         \n\t"\ 
  174     "paddd                     %%mm2, %%mm7         \n\t"\ 
  175     "paddd                     %%mm0, %%mm6         \n\t"\ 
  177     "psrad                       $16, %%mm1         \n\t"\ 
  178     "psrad                       $16, %%mm5         \n\t"\ 
  179     "psrad                       $16, %%mm7         \n\t"\ 
  180     "psrad                       $16, %%mm6         \n\t"\ 
  181     "movq      "VROUNDER_OFFSET"(%0), %%mm0         \n\t"\ 
  182     "packssdw                  %%mm5, %%mm1         \n\t"\ 
  183     "packssdw                  %%mm6, %%mm7         \n\t"\ 
  184     "paddw                     %%mm0, %%mm1         \n\t"\ 
  185     "paddw                     %%mm0, %%mm7         \n\t"\ 
  186     "movq               "U_TEMP"(%0), %%mm3         \n\t"\ 
  187     "movq               "V_TEMP"(%0), %%mm4         \n\t"\ 
  /*
   * Complete accurate vertical filter: the chroma pass (results parked in
   * U_TEMP/V_TEMP) followed by the luma pass over LUM_MMX_FILTER_OFFSET, which
   * ends by reloading the chroma results into mm3/mm4.  Afterwards mm3/mm4
   * hold chroma and mm1/mm7 hold luma, the same layout the fast
   * YSCALEYUV2PACKEDX variant produces.
   */
  189 #define YSCALEYUV2PACKEDX_ACCURATE \ 
  190     YSCALEYUV2PACKEDX_ACCURATE_UV \ 
  191     YSCALEYUV2PACKEDX_ACCURATE_YA(LUM_MMX_FILTER_OFFSET) 
  /*
   * Colour-space conversion stage applied after the vertical filter.
   *
   * Input : mm3 = U, mm4 = V (4 words each), mm1 / mm7 = Y (8 words).
   * All offsets and coefficients are read relative to operand %0.
   *
   * Steps:
   *   - subtract U_OFFSET / V_OFFSET, keep copies in mm2 / mm5;
   *   - mm3 = U*UG_COEFF, mm4 = V*VG_COEFF, mm2 = U*UB_COEFF,
   *     mm5 = V*VR_COEFF (pmulhw keeps the high 16 bits of each product);
   *   - subtract Y_OFFSET from both luma registers and scale by Y_COEFF;
   *   - mm4 = sum of the two green contributions;
   *   - every chroma term is duplicated word-wise (punpcklwd/punpckhwd of a
   *     register with itself) so one chroma sample serves two luma samples,
   *     then the luma is added;
   *   - packuswb saturates to unsigned bytes.
   *
   * Output: mm2 = B, mm4 = G, mm5 = R, eight bytes each.
   * mm0, mm3, mm6 are overwritten as scratch.
   */
  193 #define YSCALEYUV2RGBX \ 
  194     "psubw  "U_OFFSET"(%0), %%mm3       \n\t" \ 
  195     "psubw  "V_OFFSET"(%0), %%mm4       \n\t" \ 
  196     "movq            %%mm3, %%mm2       \n\t" \ 
  197     "movq            %%mm4, %%mm5       \n\t" \ 
  198     "pmulhw "UG_COEFF"(%0), %%mm3       \n\t"\ 
  199     "pmulhw "VG_COEFF"(%0), %%mm4       \n\t"\ 
  201     "pmulhw "UB_COEFF"(%0), %%mm2       \n\t"\ 
  202     "pmulhw "VR_COEFF"(%0), %%mm5       \n\t"\ 
  203     "psubw  "Y_OFFSET"(%0), %%mm1       \n\t" \ 
  204     "psubw  "Y_OFFSET"(%0), %%mm7       \n\t" \ 
  205     "pmulhw  "Y_COEFF"(%0), %%mm1       \n\t"\ 
  206     "pmulhw  "Y_COEFF"(%0), %%mm7       \n\t"\ 
  208     "paddw           %%mm3, %%mm4       \n\t"\ 
  209     "movq            %%mm2, %%mm0       \n\t"\ 
  210     "movq            %%mm5, %%mm6       \n\t"\ 
  211     "movq            %%mm4, %%mm3       \n\t"\ 
  212     "punpcklwd       %%mm2, %%mm2       \n\t"\ 
  213     "punpcklwd       %%mm5, %%mm5       \n\t"\ 
  214     "punpcklwd       %%mm4, %%mm4       \n\t"\ 
  215     "paddw           %%mm1, %%mm2       \n\t"\ 
  216     "paddw           %%mm1, %%mm5       \n\t"\ 
  217     "paddw           %%mm1, %%mm4       \n\t"\ 
  218     "punpckhwd       %%mm0, %%mm0       \n\t"\ 
  219     "punpckhwd       %%mm6, %%mm6       \n\t"\ 
  220     "punpckhwd       %%mm3, %%mm3       \n\t"\ 
  221     "paddw           %%mm7, %%mm0       \n\t"\ 
  222     "paddw           %%mm7, %%mm6       \n\t"\ 
  223     "paddw           %%mm7, %%mm3       \n\t"\ 
  225     "packuswb        %%mm0, %%mm2       \n\t"\ 
  226     "packuswb        %%mm6, %%mm5       \n\t"\ 
  227     "packuswb        %%mm3, %%mm4       \n\t"\ 
  /*
   * Interleave four 8-byte planes into eight 32-bit pixels and store them.
   *
   * Parameters:
   *   dst, index   destination base register and pixel index register; the
   *                stores go to dst + index*4 (+0, +8, +16, +24)
   *   dstw         operand string compared against index at the end
   *   b, g, r, a   MMX registers holding the four byte planes; in memory each
   *                pixel ends up as the bytes b, g, r, a in that order
   *   q0, q2, q3, t  MMX scratch registers (b and r are overwritten too)
   *
   * The byte planes are first merged pairwise (b with g, r with a) and then
   * word-wise, giving four quadwords that are written with MOVNTQ.  index is
   * advanced by 8 pixels and compared with dstw.
   *
   * NOTE(review): the branch that consumes the comparison is not visible in
   * this excerpt.
   *
   * WRITEBGR32 forwards to REAL_WRITEBGR32 so that macro arguments are
   * expanded before being stringified.
   */
  229 #define REAL_WRITEBGR32(dst, dstw, index, b, g, r, a, q0, q2, q3, t) \ 
  230     "movq       "#b", "#q2"     \n\t" \ 
  231     "movq       "#r", "#t"      \n\t" \ 
  232     "punpcklbw  "#g", "#b"      \n\t" \ 
  233     "punpcklbw  "#a", "#r"      \n\t" \ 
  234     "punpckhbw  "#g", "#q2"     \n\t" \ 
  235     "punpckhbw  "#a", "#t"      \n\t" \ 
  236     "movq       "#b", "#q0"     \n\t" \ 
  237     "movq      "#q2", "#q3"     \n\t" \ 
  238     "punpcklwd  "#r", "#q0"     \n\t" \ 
  239     "punpckhwd  "#r", "#b"      \n\t" \ 
  240     "punpcklwd  "#t", "#q2"     \n\t" \ 
  241     "punpckhwd  "#t", "#q3"     \n\t" \ 
  243     MOVNTQ(   q0,   (dst, index, 4))\ 
  244     MOVNTQ(    b,  8(dst, index, 4))\ 
  245     MOVNTQ(   q2, 16(dst, index, 4))\ 
  246     MOVNTQ(   q3, 24(dst, index, 4))\ 
  248     "add      $8, "#index"      \n\t"\ 
  249     "cmp  "dstw", "#index"      \n\t"\ 
  251 #define WRITEBGR32(dst, dstw, index, b, g, r, a, q0, q2, q3, t)  REAL_WRITEBGR32(dst, dstw, index, b, g, r, a, q0, q2, q3, t) 
  254                                    const int16_t **lumSrc, 
int lumFilterSize,
 
  255                                    const int16_t *chrFilter, 
const int16_t **chrUSrc,
 
  256                                    const int16_t **chrVSrc,
 
  257                                    int chrFilterSize, 
const int16_t **alpSrc,
 
  258                                    uint8_t *dest, 
int dstW, 
int dstY)
 
  264     if (CONFIG_SWSCALE_ALPHA && 
c->needAlpha) {
 
  267         "movq                      %%mm2, "U_TEMP"(%0)  \n\t" 
  268         "movq                      %%mm4, "V_TEMP"(%0)  \n\t" 
  269         "movq                      %%mm5, "Y_TEMP"(%0)  \n\t" 
  271         "movq               "Y_TEMP"(%0), %%mm5         \n\t" 
  272         "psraw                        $3, %%mm1         \n\t" 
  273         "psraw                        $3, %%mm7         \n\t" 
  274         "packuswb                  %%mm7, %%mm1         \n\t" 
  275         WRITEBGR32(%4, 
"%5", %%FF_REGa, %%mm3, %%mm4, %%mm5, %%mm1, %%mm0, %%mm7, %%mm2, %%mm6)
 
  280         "pcmpeqd %%mm7, %%mm7 \n\t" 
  281         WRITEBGR32(%4, 
"%5", %%FF_REGa, %%mm2, %%mm4, %%mm5, %%mm7, %%mm0, %%mm1, %%mm3, %%mm6)
 
  287                                 const int16_t **lumSrc, 
int lumFilterSize,
 
  288                                 const int16_t *chrFilter, 
const int16_t **chrUSrc,
 
  289                                 const int16_t **chrVSrc,
 
  290                                 int chrFilterSize, 
const int16_t **alpSrc,
 
  291                                 uint8_t *dest, 
int dstW, 
int dstY)
 
  297     if (CONFIG_SWSCALE_ALPHA && 
c->needAlpha) {
 
  301         "psraw                        $3, %%mm1         \n\t" 
  302         "psraw                        $3, %%mm7         \n\t" 
  303         "packuswb                  %%mm7, %%mm1         \n\t" 
  304         WRITEBGR32(%4, 
"%5", %%FF_REGa, %%mm2, %%mm4, %%mm5, %%mm1, %%mm0, %%mm7, %%mm3, %%mm6)
 
  309         "pcmpeqd %%mm7, %%mm7 \n\t" 
  310         WRITEBGR32(%4, 
"%5", %%FF_REGa, %%mm2, %%mm4, %%mm5, %%mm7, %%mm0, %%mm1, %%mm3, %%mm6)
 
  316                                 const int16_t **lumSrc, 
int lumFilterSize,
 
  317                                 const int16_t *chrFilter, 
const int16_t **chrUSrc,
 
  318                                 const int16_t **chrVSrc,
 
  319                                 int chrFilterSize, 
const int16_t **alpSrc,
 
  320                                 uint8_t *dest, 
int dstW, 
int dstY)
 
  326     if (CONFIG_SWSCALE_ALPHA && 
c->needAlpha) {
 
  330         "psraw                        $3, %%mm1         \n\t" 
  331         "psraw                        $3, %%mm7         \n\t" 
  332         "packuswb                  %%mm7, %%mm1         \n\t" 
  333         WRITEBGR32(%4, 
"%5", %%FF_REGa, %%mm5, %%mm4, %%mm2, %%mm1, %%mm0, %%mm7, %%mm3, %%mm6)
 
  338         "pcmpeqd %%mm7, %%mm7 \n\t" 
  339         WRITEBGR32(%4, 
"%5", %%FF_REGa, %%mm5, %%mm4, %%mm2, %%mm7, %%mm0, %%mm1, %%mm3, %%mm6)
 
  /*
   * Pack eight pixels into 16 bits each and store them at dst + index*2.
   *
   * Input : mm2, mm4, mm5 = three byte planes (B, G, R as produced by
   *         YSCALEYUV2RGBX); mm7 is used as the zero high byte when widening
   *         mm4, so it must be zero on entry.
   * Steps : mm2 and mm5 are masked with bF8 and mm4 with bFC; mm2 is shifted
   *         right by 3; mm2 is interleaved with mm5 (low byte / high byte of
   *         each pixel); mm4 is widened to words, shifted left by 3 and OR-ed
   *         in.
   * The mask constants are defined elsewhere; presumably they keep the top
   * 5 / 6 bits of each byte, which would give a 5-6-5 layout -- TODO confirm.
   *
   * mm1 and mm3 are scratch.  index is advanced by 8 and compared with dstw.
   *
   * NOTE(review): the branch that consumes the comparison is not visible in
   * this excerpt.
   *
   * WRITERGB16 forwards to REAL_WRITERGB16 so arguments are macro-expanded
   * before stringification.
   */
  344 #define REAL_WRITERGB16(dst, dstw, index) \ 
  345     "pand "MANGLE(bF8)", %%mm2  \n\t" \ 
  346     "pand "MANGLE(bFC)", %%mm4  \n\t" \ 
  347     "pand "MANGLE(bF8)", %%mm5  \n\t" \ 
  348     "psrlq           $3, %%mm2  \n\t"\ 
  350     "movq         %%mm2, %%mm1  \n\t"\ 
  351     "movq         %%mm4, %%mm3  \n\t"\ 
  353     "punpcklbw    %%mm7, %%mm3  \n\t"\ 
  354     "punpcklbw    %%mm5, %%mm2  \n\t"\ 
  355     "punpckhbw    %%mm7, %%mm4  \n\t"\ 
  356     "punpckhbw    %%mm5, %%mm1  \n\t"\ 
  358     "psllq           $3, %%mm3  \n\t"\ 
  359     "psllq           $3, %%mm4  \n\t"\ 
  361     "por          %%mm3, %%mm2  \n\t"\ 
  362     "por          %%mm4, %%mm1  \n\t"\ 
  364     MOVNTQ(%%mm2,  (dst, index, 2))\ 
  365     MOVNTQ(%%mm1, 8(dst, index, 2))\ 
  367     "add             $8, "#index"   \n\t"\ 
  368     "cmp         "dstw", "#index"   \n\t"\ 
  370 #define WRITERGB16(dst, dstw, index)  REAL_WRITERGB16(dst, dstw, index) 
  373                                     const int16_t **lumSrc, 
int lumFilterSize,
 
  374                                     const int16_t *chrFilter, 
const int16_t **chrUSrc,
 
  375                                     const int16_t **chrVSrc,
 
  376                                     int chrFilterSize, 
const int16_t **alpSrc,
 
  377                                     uint8_t *dest, 
int dstW, 
int dstY)
 
  385     "pxor %%mm7, %%mm7 \n\t" 
  397                                  const int16_t **lumSrc, 
int lumFilterSize,
 
  398                                  const int16_t *chrFilter, 
const int16_t **chrUSrc,
 
  399                                  const int16_t **chrVSrc,
 
  400                                  int chrFilterSize, 
const int16_t **alpSrc,
 
  401                                  uint8_t *dest, 
int dstW, 
int dstY)
 
  409     "pxor %%mm7, %%mm7 \n\t" 
  /*
   * Pack eight pixels into 16 bits each (15 significant bits) and store them
   * at dst + index*2.
   *
   * Input : mm2, mm4, mm5 = three byte planes (B, G, R as produced by
   *         YSCALEYUV2RGBX); mm7 must be zero on entry because it supplies the
   *         high byte when mm4 is widened.
   * Steps : all three planes are masked with bF8; mm2 is shifted right by 3
   *         and mm5 right by 1; mm2 is interleaved with mm5 (low / high byte
   *         of each pixel); mm4 is widened to words, shifted left by 2 and
   *         OR-ed in.
   * The mask constant is defined elsewhere; presumably it keeps the top five
   * bits of each byte, giving a 5-5-5 layout -- TODO confirm.
   *
   * mm1 and mm3 are scratch.  index is advanced by 8 and compared with dstw.
   *
   * NOTE(review): the branch that consumes the comparison is not visible in
   * this excerpt.
   *
   * WRITERGB15 forwards to REAL_WRITERGB15 so arguments are macro-expanded
   * before stringification.
   */
  420 #define REAL_WRITERGB15(dst, dstw, index) \ 
  421     "pand "MANGLE(bF8)", %%mm2  \n\t" \ 
  422     "pand "MANGLE(bF8)", %%mm4  \n\t" \ 
  423     "pand "MANGLE(bF8)", %%mm5  \n\t" \ 
  424     "psrlq           $3, %%mm2  \n\t"\ 
  425     "psrlq           $1, %%mm5  \n\t"\ 
  427     "movq         %%mm2, %%mm1  \n\t"\ 
  428     "movq         %%mm4, %%mm3  \n\t"\ 
  430     "punpcklbw    %%mm7, %%mm3  \n\t"\ 
  431     "punpcklbw    %%mm5, %%mm2  \n\t"\ 
  432     "punpckhbw    %%mm7, %%mm4  \n\t"\ 
  433     "punpckhbw    %%mm5, %%mm1  \n\t"\ 
  435     "psllq           $2, %%mm3  \n\t"\ 
  436     "psllq           $2, %%mm4  \n\t"\ 
  438     "por          %%mm3, %%mm2  \n\t"\ 
  439     "por          %%mm4, %%mm1  \n\t"\ 
  441     MOVNTQ(%%mm2,  (dst, index, 2))\ 
  442     MOVNTQ(%%mm1, 8(dst, index, 2))\ 
  444     "add             $8, "#index"   \n\t"\ 
  445     "cmp         "dstw", "#index"   \n\t"\ 
  447 #define WRITERGB15(dst, dstw, index)  REAL_WRITERGB15(dst, dstw, index) 
  450                                     const int16_t **lumSrc, 
int lumFilterSize,
 
  451                                     const int16_t *chrFilter, 
const int16_t **chrUSrc,
 
  452                                     const int16_t **chrVSrc,
 
  453                                     int chrFilterSize, 
const int16_t **alpSrc,
 
  454                                     uint8_t *dest, 
int dstW, 
int dstY)
 
  462     "pxor %%mm7, %%mm7 \n\t" 
  474                                  const int16_t **lumSrc, 
int lumFilterSize,
 
  475                                  const int16_t *chrFilter, 
const int16_t **chrUSrc,
 
  476                                  const int16_t **chrVSrc,
 
  477                                  int chrFilterSize, 
const int16_t **alpSrc,
 
  478                                  uint8_t *dest, 
int dstW, 
int dstY)
 
  486     "pxor %%mm7, %%mm7 \n\t" 
  /*
   * Plain-MMX writer for eight 24-bit pixels (24 output bytes).
   *
   * Input : mm2, mm4, mm5 = three byte planes (B, G, R as produced by
   *         YSCALEYUV2RGBX); mm7 is interleaved in as the fourth byte and is
   *         shifted out again later, callers clear it beforehand.
   * Steps : the planes are first merged into four quadwords of two 4-byte
   *         pixels each (mm0, mm2, mm1, mm3).  The shift-by-40 / punpckhdq
   *         pairs then squeeze the padding byte out from between the two
   *         pixels of every quadword, and the remaining shifts and ORs splice
   *         the four 6-byte groups into three contiguous quadwords.
   * Output: three MOVNTQ stores at dst, dst+8 and dst+16.  dst itself is
   *         advanced by 24 bytes, index by 8 pixels, and index is compared
   *         with dstw.  All of mm0..mm7 are overwritten.
   *
   * NOTE(review): the branch that consumes the comparison is not visible in
   * this excerpt.
   */
  497 #define WRITEBGR24MMX(dst, dstw, index) \ 
  499     "movq      %%mm2, %%mm1     \n\t" \ 
  500     "movq      %%mm5, %%mm6     \n\t" \ 
  501     "punpcklbw %%mm4, %%mm2     \n\t" \ 
  502     "punpcklbw %%mm7, %%mm5     \n\t" \ 
  503     "punpckhbw %%mm4, %%mm1     \n\t" \ 
  504     "punpckhbw %%mm7, %%mm6     \n\t" \ 
  505     "movq      %%mm2, %%mm0     \n\t" \ 
  506     "movq      %%mm1, %%mm3     \n\t" \ 
  507     "punpcklwd %%mm5, %%mm0     \n\t" \ 
  508     "punpckhwd %%mm5, %%mm2     \n\t" \ 
  509     "punpcklwd %%mm6, %%mm1     \n\t" \ 
  510     "punpckhwd %%mm6, %%mm3     \n\t" \ 
  512     "movq      %%mm0, %%mm4     \n\t" \ 
  513     "movq      %%mm2, %%mm6     \n\t" \ 
  514     "movq      %%mm1, %%mm5     \n\t" \ 
  515     "movq      %%mm3, %%mm7     \n\t" \ 
  517     "psllq       $40, %%mm0     \n\t" \ 
  518     "psllq       $40, %%mm2     \n\t" \ 
  519     "psllq       $40, %%mm1     \n\t" \ 
  520     "psllq       $40, %%mm3     \n\t" \ 
  522     "punpckhdq %%mm4, %%mm0     \n\t" \ 
  523     "punpckhdq %%mm6, %%mm2     \n\t" \ 
  524     "punpckhdq %%mm5, %%mm1     \n\t" \ 
  525     "punpckhdq %%mm7, %%mm3     \n\t" \ 
  527     "psrlq        $8, %%mm0     \n\t" \ 
  528     "movq      %%mm2, %%mm6     \n\t" \ 
  529     "psllq       $40, %%mm2     \n\t" \ 
  530     "por       %%mm2, %%mm0     \n\t" \ 
  531     MOVNTQ(%%mm0, (dst))\ 
  533     "psrlq       $24, %%mm6     \n\t" \ 
  534     "movq      %%mm1, %%mm5     \n\t" \ 
  535     "psllq       $24, %%mm1     \n\t" \ 
  536     "por       %%mm1, %%mm6     \n\t" \ 
  537     MOVNTQ(%%mm6, 8(dst))\ 
  539     "psrlq       $40, %%mm5     \n\t" \ 
  540     "psllq        $8, %%mm3     \n\t" \ 
  541     "por       %%mm3, %%mm5     \n\t" \ 
  542     MOVNTQ(%%mm5, 16(dst))\ 
  544     "add         $24, "#dst"    \n\t"\ 
  546     "add          $8, "#index"  \n\t"\ 
  547     "cmp      "dstw", "#index"  \n\t"\ 
  /*
   * MMXEXT writer for eight 24-bit pixels (24 output bytes), built on pshufw.
   *
   * Input : mm2, mm4, mm5 = three byte planes (B, G, R as produced by
   *         YSCALEYUV2RGBX).
   * Method: each of the three output quadwords is assembled by shuffling the
   *         needed words of every plane into place with pshufw, masking them
   *         with ff_M24A / ff_M24B / ff_M24C so that only the byte lanes
   *         belonging to that plane survive, and OR-ing the three masked
   *         values together.  mm4 is shifted (left by 8 in a copy for the
   *         first quadword, right by 8 in place afterwards) to line its bytes
   *         up with the middle lane.
   * Output: three MOVNTQ stores at dst, dst+8 and dst+16.  dst is advanced by
   *         24 bytes, index by 8 pixels, and index is compared with dstw.
   *         mm0, mm1, mm3, mm6, mm7 are scratch and mm4 is modified.
   *
   * NOTE(review): the branch that consumes the comparison is not visible in
   * this excerpt.
   *
   * WRITEBGR24 selects this MMXEXT implementation.
   */
  550 #define WRITEBGR24MMXEXT(dst, dstw, index) \ 
  552     "movq "MANGLE(ff_M24A)", %%mm0 \n\t"\ 
  553     "movq "MANGLE(ff_M24C)", %%mm7 \n\t"\ 
  554     "pshufw $0x50, %%mm2, %%mm1 \n\t" \ 
  555     "pshufw $0x50, %%mm4, %%mm3 \n\t" \ 
  556     "pshufw $0x00, %%mm5, %%mm6 \n\t" \ 
  558     "pand   %%mm0, %%mm1        \n\t" \ 
  559     "pand   %%mm0, %%mm3        \n\t" \ 
  560     "pand   %%mm7, %%mm6        \n\t" \ 
  562     "psllq     $8, %%mm3        \n\t" \ 
  563     "por    %%mm1, %%mm6        \n\t"\ 
  564     "por    %%mm3, %%mm6        \n\t"\ 
  565     MOVNTQ(%%mm6, (dst))\ 
  567     "psrlq     $8, %%mm4        \n\t" \ 
  568     "pshufw $0xA5, %%mm2, %%mm1 \n\t" \ 
  569     "pshufw $0x55, %%mm4, %%mm3 \n\t" \ 
  570     "pshufw $0xA5, %%mm5, %%mm6 \n\t" \ 
  572     "pand "MANGLE(ff_M24B)", %%mm1 \n\t" \ 
  573     "pand   %%mm7, %%mm3        \n\t" \ 
  574     "pand   %%mm0, %%mm6        \n\t" \ 
  576     "por    %%mm1, %%mm3        \n\t" \ 
  577     "por    %%mm3, %%mm6        \n\t"\ 
  578     MOVNTQ(%%mm6, 8(dst))\ 
  580     "pshufw $0xFF, %%mm2, %%mm1 \n\t" \ 
  581     "pshufw $0xFA, %%mm4, %%mm3 \n\t" \ 
  582     "pshufw $0xFA, %%mm5, %%mm6 \n\t" \ 
  584     "pand   %%mm7, %%mm1        \n\t" \ 
  585     "pand   %%mm0, %%mm3        \n\t" \ 
  586     "pand "MANGLE(ff_M24B)", %%mm6 \n\t" \ 
  588     "por    %%mm1, %%mm3        \n\t"\ 
  589     "por    %%mm3, %%mm6        \n\t"\ 
  590     MOVNTQ(%%mm6, 16(dst))\ 
  592     "add      $24, "#dst"       \n\t"\ 
  594     "add       $8, "#index"     \n\t"\ 
  595     "cmp   "dstw", "#index"     \n\t"\ 
  599 #define WRITEBGR24(dst, dstw, index)  WRITEBGR24MMXEXT(dst, dstw, index) 
  603                                    const int16_t **lumSrc, 
int lumFilterSize,
 
  604                                    const int16_t *chrFilter, 
const int16_t **chrUSrc,
 
  605                                    const int16_t **chrVSrc,
 
  606                                    int chrFilterSize, 
const int16_t **alpSrc,
 
  607                                    uint8_t *dest, 
int dstW, 
int dstY)
 
  615     "pxor %%mm7, %%mm7 \n\t" 
  616     "lea (%%"FF_REG_a
", %%"FF_REG_a
", 2), %%"FF_REG_c
"\n\t"  
  617     "add %4, %%"FF_REG_c
"                        \n\t" 
  619     :: 
"r" (&
c->redDither),
 
  621        "r" (dest), 
"m" (dstW_reg), 
"m"(uv_off)
 
  623     : 
"%"FF_REG_a, 
"%"FF_REG_c, 
"%"FF_REG_d, 
"%"FF_REG_S
 
  628                                 const int16_t **lumSrc, 
int lumFilterSize,
 
  629                                 const int16_t *chrFilter, 
const int16_t **chrUSrc,
 
  630                                 const int16_t **chrVSrc,
 
  631                                 int chrFilterSize, 
const int16_t **alpSrc,
 
  632                                 uint8_t *dest, 
int dstW, 
int dstY)
 
  640     "pxor                    %%mm7, %%mm7              \n\t" 
  641     "lea (%%"FF_REG_a
", %%"FF_REG_a
", 2), %%"FF_REG_c
" \n\t"  
  642     "add                        %4, %%"FF_REG_c
"       \n\t" 
  644     :: 
"r" (&
c->redDither),
 
  646        "r" (dest),  
"m" (dstW_reg), 
"m"(uv_off)
 
  648     : 
"%"FF_REG_a, 
"%"FF_REG_c, 
"%"FF_REG_d, 
"%"FF_REG_S
 
  /*
   * Pack eight pixels as interleaved luma/chroma bytes and store 16 bytes at
   * dst + index*2.
   *
   * Input : mm3 = U, mm4 = V (4 words each), mm1 / mm7 = Y (8 words), already
   *         scaled to the 8-bit range.
   * Steps : all four registers are saturated to unsigned bytes; U and V are
   *         interleaved byte-wise into mm3 (U V U V ...); the eight luma bytes
   *         are then interleaved with that chroma stream, giving the byte
   *         order Y U Y V in memory.
   * mm7 is reused as scratch.  index is advanced by 8 and compared with dstw.
   *
   * NOTE(review): the branch that consumes the comparison is not visible in
   * this excerpt.
   *
   * WRITEYUY2 forwards to REAL_WRITEYUY2 so arguments are macro-expanded
   * before stringification.
   */
  653 #define REAL_WRITEYUY2(dst, dstw, index) \ 
  654     "packuswb  %%mm3, %%mm3     \n\t"\ 
  655     "packuswb  %%mm4, %%mm4     \n\t"\ 
  656     "packuswb  %%mm7, %%mm1     \n\t"\ 
  657     "punpcklbw %%mm4, %%mm3     \n\t"\ 
  658     "movq      %%mm1, %%mm7     \n\t"\ 
  659     "punpcklbw %%mm3, %%mm1     \n\t"\ 
  660     "punpckhbw %%mm3, %%mm7     \n\t"\ 
  662     MOVNTQ(%%mm1, (dst, index, 2))\ 
  663     MOVNTQ(%%mm7, 8(dst, index, 2))\ 
  665     "add          $8, "#index"  \n\t"\ 
  666     "cmp      "dstw", "#index"  \n\t"\ 
  668 #define WRITEYUY2(dst, dstw, index)  REAL_WRITEYUY2(dst, dstw, index) 
  671                                      const int16_t **lumSrc, 
int lumFilterSize,
 
  672                                      const int16_t *chrFilter, 
const int16_t **chrUSrc,
 
  673                                      const int16_t **chrVSrc,
 
  674                                      int chrFilterSize, 
const int16_t **alpSrc,
 
  675                                      uint8_t *dest, 
int dstW, 
int dstY)
 
  683     "psraw $3, %%mm3    \n\t" 
  684     "psraw $3, %%mm4    \n\t" 
  685     "psraw $3, %%mm1    \n\t" 
  686     "psraw $3, %%mm7    \n\t" 
  692                                   const int16_t **lumSrc, 
int lumFilterSize,
 
  693                                   const int16_t *chrFilter, 
const int16_t **chrUSrc,
 
  694                                   const int16_t **chrVSrc,
 
  695                                   int chrFilterSize, 
const int16_t **alpSrc,
 
  696                                   uint8_t *dest, 
int dstW, 
int dstY)
 
  704     "psraw $3, %%mm3    \n\t" 
  705     "psraw $3, %%mm4    \n\t" 
  706     "psraw $3, %%mm1    \n\t" 
  707     "psraw $3, %%mm7    \n\t" 
  /*
   * Chroma stage of the two-line (bilinear) vertical blend followed by the
   * first half of the colour conversion.
   *
   * Parameters:
   *   index  register used as byte index into the chroma buffers; cleared here
   *   c      register holding the context base used for all *_OFFSET accesses
   * Asm operands %2 and %3 are the two chroma line buffers.
   *
   * Four words are loaded from each buffer at index (mm2, mm3) and again at
   * index + UV_OFF_BYTE(c) (mm5, mm4); index is restored afterwards.  For both
   * planes the blend is computed as
   *     ((buf0 - buf1) * coeff >> 16) + (buf1 >> 4)
   * with coeff read from CHR_MMX_FILTER_OFFSET+8(c).  U_OFFSET / V_OFFSET are
   * then subtracted.
   *
   * On exit: mm2 = U - offset, mm5 = V - offset, mm3 = U*UG_COEFF,
   * mm4 = V*VG_COEFF; mm0 is overwritten.
   *
   * NOTE(review): any loop label following the "xor" is not visible in this
   * excerpt.
   */
  712 #define REAL_YSCALEYUV2RGB_UV(index, c) \ 
  713     "xor            "#index", "#index"  \n\t"\ 
  716     "movq     (%2, "#index"), %%mm2     \n\t" \ 
  717     "movq     (%3, "#index"), %%mm3     \n\t" \ 
  718     "add "UV_OFF_BYTE"("#c"), "#index"  \n\t" \ 
  719     "movq     (%2, "#index"), %%mm5     \n\t" \ 
  720     "movq     (%3, "#index"), %%mm4     \n\t" \ 
  721     "sub "UV_OFF_BYTE"("#c"), "#index"  \n\t" \ 
  722     "psubw             %%mm3, %%mm2     \n\t" \ 
  723     "psubw             %%mm4, %%mm5     \n\t" \ 
  724     "movq "CHR_MMX_FILTER_OFFSET"+8("#c"), %%mm0    \n\t"\ 
  725     "pmulhw            %%mm0, %%mm2     \n\t" \ 
  726     "pmulhw            %%mm0, %%mm5     \n\t" \ 
  727     "psraw                $4, %%mm3     \n\t" \ 
  728     "psraw                $4, %%mm4     \n\t" \ 
  729     "paddw             %%mm2, %%mm3     \n\t" \ 
  730     "paddw             %%mm5, %%mm4     \n\t" \ 
  731     "psubw  "U_OFFSET"("#c"), %%mm3     \n\t" \ 
  732     "psubw  "V_OFFSET"("#c"), %%mm4     \n\t" \ 
  733     "movq              %%mm3, %%mm2     \n\t" \ 
  734     "movq              %%mm4, %%mm5     \n\t" \ 
  735     "pmulhw "UG_COEFF"("#c"), %%mm3     \n\t"\ 
  736     "pmulhw "VG_COEFF"("#c"), %%mm4     \n\t"\ 
  /*
   * Luma (or alpha) stage of the two-line vertical blend.
   *
   * Parameters:
   *   index   register used as sample index (scaled by 2 for 16-bit samples)
   *   c       register holding the context base
   *   b1, b2  registers/operands pointing at the two source lines
   *
   * Eight consecutive words are read from each line.  For both halves the
   * blend is ((b1 - b2) * coeff >> 16) + (b2 >> 4), with coeff taken from
   * LUM_MMX_FILTER_OFFSET+8(c).
   *
   * On exit: mm1 = blended samples 0..3, mm7 = blended samples 4..7;
   * mm0 and mm6 are overwritten.
   */
  739 #define REAL_YSCALEYUV2RGB_YA(index, c, b1, b2) \ 
  740     "movq  ("#b1", "#index", 2), %%mm0     \n\t" \ 
  741     "movq  ("#b2", "#index", 2), %%mm1     \n\t" \ 
  742     "movq 8("#b1", "#index", 2), %%mm6     \n\t" \ 
  743     "movq 8("#b2", "#index", 2), %%mm7     \n\t" \ 
  744     "psubw             %%mm1, %%mm0     \n\t" \ 
  745     "psubw             %%mm7, %%mm6     \n\t" \ 
  746     "pmulhw "LUM_MMX_FILTER_OFFSET"+8("#c"), %%mm0  \n\t" \ 
  747     "pmulhw "LUM_MMX_FILTER_OFFSET"+8("#c"), %%mm6  \n\t" \ 
  748     "psraw                $4, %%mm1     \n\t" \ 
  749     "psraw                $4, %%mm7     \n\t" \ 
  750     "paddw             %%mm0, %%mm1     \n\t" \ 
  751     "paddw             %%mm6, %%mm7     \n\t" \ 
  /*
   * Second half of the colour conversion; c is the register holding the
   * context base for all coefficient and offset accesses.
   *
   * Input : mm2 = U - offset, mm5 = V - offset, mm3 / mm4 = the two green
   *         contributions, mm1 / mm7 = luma samples 0..3 / 4..7.
   * Steps : mm2 *= UB_COEFF and mm5 *= VR_COEFF (pmulhw, high 16 bits);
   *         Y_OFFSET is subtracted from the luma and the result scaled by
   *         Y_COEFF; mm4 becomes the summed green term; each chroma term is
   *         duplicated word-wise so one chroma sample serves two luma samples
   *         and the luma is added; packuswb saturates to unsigned bytes.
   * Output: mm2 = B, mm4 = G, mm5 = R, eight bytes each.
   *         mm0, mm3, mm6 are overwritten as scratch.
   */
  753 #define REAL_YSCALEYUV2RGB_COEFF(c) \ 
  754     "pmulhw "UB_COEFF"("#c"), %%mm2     \n\t"\ 
  755     "pmulhw "VR_COEFF"("#c"), %%mm5     \n\t"\ 
  756     "psubw  "Y_OFFSET"("#c"), %%mm1     \n\t" \ 
  757     "psubw  "Y_OFFSET"("#c"), %%mm7     \n\t" \ 
  758     "pmulhw  "Y_COEFF"("#c"), %%mm1     \n\t"\ 
  759     "pmulhw  "Y_COEFF"("#c"), %%mm7     \n\t"\ 
  761     "paddw             %%mm3, %%mm4     \n\t"\ 
  762     "movq              %%mm2, %%mm0     \n\t"\ 
  763     "movq              %%mm5, %%mm6     \n\t"\ 
  764     "movq              %%mm4, %%mm3     \n\t"\ 
  765     "punpcklwd         %%mm2, %%mm2     \n\t"\ 
  766     "punpcklwd         %%mm5, %%mm5     \n\t"\ 
  767     "punpcklwd         %%mm4, %%mm4     \n\t"\ 
  768     "paddw             %%mm1, %%mm2     \n\t"\ 
  769     "paddw             %%mm1, %%mm5     \n\t"\ 
  770     "paddw             %%mm1, %%mm4     \n\t"\ 
  771     "punpckhwd         %%mm0, %%mm0     \n\t"\ 
  772     "punpckhwd         %%mm6, %%mm6     \n\t"\ 
  773     "punpckhwd         %%mm3, %%mm3     \n\t"\ 
  774     "paddw             %%mm7, %%mm0     \n\t"\ 
  775     "paddw             %%mm7, %%mm6     \n\t"\ 
  776     "paddw             %%mm7, %%mm3     \n\t"\ 
  778     "packuswb          %%mm0, %%mm2     \n\t"\ 
  779     "packuswb          %%mm6, %%mm5     \n\t"\ 
  780     "packuswb          %%mm3, %%mm4     \n\t"\ 
  /*
   * YSCALEYUV2RGB_YA forwards to REAL_YSCALEYUV2RGB_YA so that its arguments
   * are macro-expanded before being stringified.
   *
   * YSCALEYUV2RGB(index, c) is the full two-line blend plus colour conversion:
   * chroma blend, luma blend of asm operands %0 and %1, then the coefficient
   * stage.  It leaves B, G, R bytes in mm2, mm4, mm5.
   */
  782 #define YSCALEYUV2RGB_YA(index, c, b1, b2) REAL_YSCALEYUV2RGB_YA(index, c, b1, b2) 
  784 #define YSCALEYUV2RGB(index, c) \ 
  785     REAL_YSCALEYUV2RGB_UV(index, c) \ 
  786     REAL_YSCALEYUV2RGB_YA(index, c, %0, %1) \ 
  787     REAL_YSCALEYUV2RGB_COEFF(c) 
  793                                 const int16_t *ubuf[2], 
const int16_t *vbuf[2],
 
  794                                 const int16_t *abuf[2], uint8_t *dest,
 
  795                                 int dstW, 
int yalpha, 
int uvalpha, 
int y)
 
  797     const int16_t *buf0  = buf[0],  *buf1  = buf[1],
 
  798                   *ubuf0 = ubuf[0], *ubuf1 = ubuf[1];
 
  800     if (CONFIG_SWSCALE_ALPHA && 
c->needAlpha) {
 
  801         const int16_t *abuf0 = abuf[0], *abuf1 = abuf[1];
 
  806             "psraw                  $3, %%mm1       \n\t"  
  807             "psraw                  $3, %%mm7       \n\t"  
  808             "packuswb            %%mm7, %%mm1       \n\t" 
  809             WRITEBGR32(%4, 
DSTW_OFFSET"(%5)", %%r8, %%mm2, %%mm4, %%mm5, %%mm1, %%mm0, %%mm7, %%mm3, %%mm6)
 
  810             :: 
"c" (buf0), 
"d" (buf1), 
"S" (ubuf0), 
"D" (ubuf1), 
"r" (dest),
 
  812                "r" (abuf0), 
"r" (abuf1)
 
  816         c->u_temp=(intptr_t)abuf0;
 
  817         c->v_temp=(intptr_t)abuf1;
 
  820             "mov        %4, %%"FF_REG_b
"            \n\t" 
  821             "push %%"FF_REG_BP
"                     \n\t" 
  825             "mov          "U_TEMP"(%5), %0          \n\t" 
  826             "mov          "V_TEMP"(%5), %1          \n\t" 
  828             "psraw                  $3, %%mm1       \n\t"  
  829             "psraw                  $3, %%mm7       \n\t"  
  830             "packuswb            %%mm7, %%mm1       \n\t" 
  833             WRITEBGR32(%%FF_REGb, 
DSTW_OFFSET"(%5)", %%FF_REGBP, %%mm2, %%mm4, %%mm5, %%mm1, %%mm0, %%mm7, %%mm3, %%mm6)
 
  834             "pop %%"FF_REG_BP
"                      \n\t" 
  836             :: 
"c" (buf0), 
"d" (buf1), 
"S" (ubuf0), 
"D" (ubuf1), 
"m" (dest),
 
  843             "mov        %4, %%"FF_REG_b
"            \n\t" 
  844             "push %%"FF_REG_BP
"                     \n\t" 
  846             "pcmpeqd %%mm7, %%mm7                   \n\t" 
  847             WRITEBGR32(%%FF_REGb, 
DSTW_OFFSET"(%5)", %%FF_REGBP, %%mm2, %%mm4, %%mm5, %%mm7, %%mm0, %%mm1, %%mm3, %%mm6)
 
  848             "pop %%"FF_REG_BP
"                      \n\t" 
  850             :: 
"c" (buf0), 
"d" (buf1), 
"S" (ubuf0), 
"D" (ubuf1), 
"m" (dest),
 
  857                                 const int16_t *ubuf[2], 
const int16_t *vbuf[2],
 
  858                                 const int16_t *abuf[2], uint8_t *dest,
 
  859                                 int dstW, 
int yalpha, 
int uvalpha, 
int y)
 
  861     const int16_t *buf0  = buf[0],  *buf1  = buf[1],
 
  862                   *ubuf0 = ubuf[0], *ubuf1 = ubuf[1];
 
  866         "mov           %4, %%"FF_REG_b
"         \n\t" 
  867         "push %%"FF_REG_BP
"                     \n\t" 
  869         "pxor    %%mm7, %%mm7                   \n\t" 
  871         "pop %%"FF_REG_BP
"                      \n\t" 
  873         :: 
"c" (buf0), 
"d" (buf1), 
"S" (ubuf0), 
"D" (ubuf1), 
"m" (dest),
 
  880                                  const int16_t *ubuf[2], 
const int16_t *vbuf[2],
 
  881                                  const int16_t *abuf[2], uint8_t *dest,
 
  882                                  int dstW, 
int yalpha, 
int uvalpha, 
int y)
 
  884     const int16_t *buf0  = buf[0],  *buf1  = buf[1],
 
  885                   *ubuf0 = ubuf[0], *ubuf1 = ubuf[1];
 
  889         "mov        %4, %%"FF_REG_b
"            \n\t" 
  890         "push %%"FF_REG_BP
"                     \n\t" 
  892         "pxor    %%mm7, %%mm7                   \n\t" 
  900         "pop %%"FF_REG_BP
"                      \n\t" 
  902         :: 
"c" (buf0), 
"d" (buf1), 
"S" (ubuf0), 
"D" (ubuf1), 
"m" (dest),
 
  909                                  const int16_t *ubuf[2], 
const int16_t *vbuf[2],
 
  910                                  const int16_t *abuf[2], uint8_t *dest,
 
  911                                  int dstW, 
int yalpha, 
int uvalpha, 
int y)
 
  913     const int16_t *buf0  = buf[0],  *buf1  = buf[1],
 
  914                   *ubuf0 = ubuf[0], *ubuf1 = ubuf[1];
 
  918         "mov           %4, %%"FF_REG_b
"         \n\t" 
  919         "push %%"FF_REG_BP
"                     \n\t" 
  921         "pxor    %%mm7, %%mm7                   \n\t" 
  929         "pop %%"FF_REG_BP
"                      \n\t" 
  931         :: 
"c" (buf0), 
"d" (buf1), 
"S" (ubuf0), 
"D" (ubuf1), 
"m" (dest),
 
  /*
   * REAL_YSCALEYUV2PACKED(index, c): MMX inline-asm fragment that blends two
   * input rows and leaves the blended words in MMX registers.
   *
   *   index - register operand used as the offset into the rows; it is zeroed
   *           by the "xor" at the top of the fragment.
   *   c     - register operand used as the base for the *_OFFSET displacements.
   *
   * Steps, in order:
   *   1. The quadwords at CHR_MMX_FILTER_OFFSET+8(c) and
   *      LUM_MMX_FILTER_OFFSET+8(c) are loaded, shifted right arithmetically
   *      by 3 and stored back, so the memory addressed through c is modified.
   *   2. Four words are read from (%2, index) and (%3, index), then four more
   *      from each at index + UV_OFF_BYTE(c); index is restored afterwards.
   *      With a taken from %2 and b from %3, each pair is combined as
   *      (b >> 7) + pmulhw(a - b, coeff), coeff being the quadword at
   *      CHR_MMX_FILTER_OFFSET+8(c). Results: mm3 (offset 0) and mm4
   *      (offset UV_OFF_BYTE).
   *   3. Eight words are read from (%0, index, 2) and from (%1, index, 2) and
   *      combined the same way with LUM_MMX_FILTER_OFFSET+8(c). Results: mm1
   *      (first four words) and mm7 (next four words).
   *
   * mm0, mm2, mm5 and mm6 are overwritten as scratch.
   *
   * NOTE(review): presumably %0/%1 are the two luma rows and %2/%3 the two
   * chroma rows (U at offset 0, V at UV_OFF_BYTE) -- confirm against the
   * operand list of the asm statement that expands this macro.
   * NOTE(review): the embedded numbering skips lines right after the "xor",
   * and the last body line ends in a continuation backslash directly before
   * the wrapper #define -- confirm against the complete file.
   *
   * YSCALEYUV2PACKED(index, c) forwards to the REAL_ variant so that macro
   * arguments are expanded before they are stringified with '#'.
   */
  937 #define REAL_YSCALEYUV2PACKED(index, c) \ 
  938     "movq "CHR_MMX_FILTER_OFFSET"+8("#c"), %%mm0              \n\t"\ 
  939     "movq "LUM_MMX_FILTER_OFFSET"+8("#c"), %%mm1              \n\t"\ 
  940     "psraw                $3, %%mm0                           \n\t"\ 
  941     "psraw                $3, %%mm1                           \n\t"\ 
  942     "movq              %%mm0, "CHR_MMX_FILTER_OFFSET"+8("#c") \n\t"\ 
  943     "movq              %%mm1, "LUM_MMX_FILTER_OFFSET"+8("#c") \n\t"\ 
  944     "xor            "#index", "#index"                        \n\t"\ 
  947     "movq     (%2, "#index"), %%mm2     \n\t" \ 
  948     "movq     (%3, "#index"), %%mm3     \n\t" \ 
  949     "add "UV_OFF_BYTE"("#c"), "#index"  \n\t" \ 
  950     "movq     (%2, "#index"), %%mm5     \n\t" \ 
  951     "movq     (%3, "#index"), %%mm4     \n\t" \ 
  952     "sub "UV_OFF_BYTE"("#c"), "#index"  \n\t" \ 
  953     "psubw             %%mm3, %%mm2     \n\t" \ 
  954     "psubw             %%mm4, %%mm5     \n\t" \ 
  955     "movq "CHR_MMX_FILTER_OFFSET"+8("#c"), %%mm0    \n\t"\ 
  956     "pmulhw            %%mm0, %%mm2     \n\t" \ 
  957     "pmulhw            %%mm0, %%mm5     \n\t" \ 
  958     "psraw                $7, %%mm3     \n\t" \ 
  959     "psraw                $7, %%mm4     \n\t" \ 
  960     "paddw             %%mm2, %%mm3     \n\t" \ 
  961     "paddw             %%mm5, %%mm4     \n\t" \ 
  962     "movq  (%0, "#index", 2), %%mm0     \n\t" \ 
  963     "movq  (%1, "#index", 2), %%mm1     \n\t" \ 
  964     "movq 8(%0, "#index", 2), %%mm6     \n\t" \ 
  965     "movq 8(%1, "#index", 2), %%mm7     \n\t" \ 
  966     "psubw             %%mm1, %%mm0     \n\t" \ 
  967     "psubw             %%mm7, %%mm6     \n\t" \ 
  968     "pmulhw "LUM_MMX_FILTER_OFFSET"+8("#c"), %%mm0  \n\t" \ 
  969     "pmulhw "LUM_MMX_FILTER_OFFSET"+8("#c"), %%mm6  \n\t" \ 
  970     "psraw                $7, %%mm1     \n\t" \ 
  971     "psraw                $7, %%mm7     \n\t" \ 
  972     "paddw             %%mm0, %%mm1     \n\t" \ 
  973     "paddw             %%mm6, %%mm7     \n\t" \ 
  975 #define YSCALEYUV2PACKED(index, c)  REAL_YSCALEYUV2PACKED(index, c) 
  978                                   const int16_t *ubuf[2], 
const int16_t *vbuf[2],
 
  979                                   const int16_t *abuf[2], uint8_t *dest,
 
  980                                   int dstW, 
int yalpha, 
int uvalpha, 
int y)
 
  982     const int16_t *buf0  = buf[0],  *buf1  = buf[1],
 
  983                   *ubuf0 = ubuf[0], *ubuf1 = ubuf[1];
 
  987         "mov           %4, %%"FF_REG_b
"         \n\t" 
  988         "push %%"FF_REG_BP
"                     \n\t" 
  991         "pop %%"FF_REG_BP
"                      \n\t" 
  993         :: 
"c" (buf0), 
"d" (buf1), 
"S" (ubuf0), 
"D" (ubuf1), 
"m" (dest),
 
  /*
   * REAL_YSCALEYUV2RGB1(index, c): MMX inline-asm fragment that turns words
   * from one row addressed by %0 and one row addressed by %2 into three
   * registers of packed unsigned bytes. %1 and %3 are not referenced.
   *
   *   index - register operand used as the offset into the rows; zeroed by
   *           the leading "xor".
   *   c     - register operand used as the base for the offset/coefficient
   *           displacements (U_OFFSET, V_OFFSET, Y_OFFSET, *_COEFF).
   *
   * Steps, in order:
   *   1. Four words from (%2, index) go to mm3 and four words from
   *      (%2, index + UV_OFF_BYTE(c)) go to mm4; index is restored. Both are
   *      shifted right arithmetically by 4 and reduced by U_OFFSET(c) and
   *      V_OFFSET(c) respectively.
   *   2. Copies in mm2/mm5 are multiplied (pmulhw) by UB_COEFF(c) and
   *      VR_COEFF(c); mm3/mm4 are multiplied by UG_COEFF(c) and VG_COEFF(c)
   *      and summed into mm4.
   *   3. Eight words from (%0, index, 2) go to mm1/mm7, are shifted right by
   *      4, reduced by Y_OFFSET(c) and multiplied (pmulhw) by Y_COEFF(c).
   *   4. punpcklwd/punpckhwd of a register with itself duplicates every word
   *      from step 2, so each of those values is added to two adjacent words
   *      from step 3; packuswb then saturates the sums to unsigned bytes.
   *
   * Output (eight bytes each): mm2 from the UB_COEFF term, mm4 from the
   * UG_COEFF + VG_COEFF terms, mm5 from the VR_COEFF term.
   * mm0, mm1, mm3, mm6 and mm7 are overwritten as scratch.
   *
   * NOTE(review): going by the coefficient names the outputs are presumably
   * blue (mm2), green (mm4) and red (mm5) -- confirm against the store macro.
   * NOTE(review): the embedded numbering skips lines after the "xor" and the
   * last body line ends in a continuation backslash directly before the
   * wrapper #define -- confirm against the complete file.
   *
   * YSCALEYUV2RGB1(index, c) forwards to the REAL_ variant so that macro
   * arguments are expanded before they are stringified with '#'.
   */
  998 #define REAL_YSCALEYUV2RGB1(index, c) \ 
  999     "xor            "#index", "#index"  \n\t"\ 
 1002     "movq     (%2, "#index"), %%mm3     \n\t" \ 
 1003     "add "UV_OFF_BYTE"("#c"), "#index"  \n\t" \ 
 1004     "movq     (%2, "#index"), %%mm4     \n\t" \ 
 1005     "sub "UV_OFF_BYTE"("#c"), "#index"  \n\t" \ 
 1006     "psraw                $4, %%mm3     \n\t" \ 
 1007     "psraw                $4, %%mm4     \n\t" \ 
 1008     "psubw  "U_OFFSET"("#c"), %%mm3     \n\t" \ 
 1009     "psubw  "V_OFFSET"("#c"), %%mm4     \n\t" \ 
 1010     "movq              %%mm3, %%mm2     \n\t" \ 
 1011     "movq              %%mm4, %%mm5     \n\t" \ 
 1012     "pmulhw "UG_COEFF"("#c"), %%mm3     \n\t"\ 
 1013     "pmulhw "VG_COEFF"("#c"), %%mm4     \n\t"\ 
 1015     "movq  (%0, "#index", 2), %%mm1     \n\t" \ 
 1016     "movq 8(%0, "#index", 2), %%mm7     \n\t" \ 
 1017     "psraw                $4, %%mm1     \n\t" \ 
 1018     "psraw                $4, %%mm7     \n\t" \ 
 1019     "pmulhw "UB_COEFF"("#c"), %%mm2     \n\t"\ 
 1020     "pmulhw "VR_COEFF"("#c"), %%mm5     \n\t"\ 
 1021     "psubw  "Y_OFFSET"("#c"), %%mm1     \n\t" \ 
 1022     "psubw  "Y_OFFSET"("#c"), %%mm7     \n\t" \ 
 1023     "pmulhw  "Y_COEFF"("#c"), %%mm1     \n\t"\ 
 1024     "pmulhw  "Y_COEFF"("#c"), %%mm7     \n\t"\ 
 1026     "paddw             %%mm3, %%mm4     \n\t"\ 
 1027     "movq              %%mm2, %%mm0     \n\t"\ 
 1028     "movq              %%mm5, %%mm6     \n\t"\ 
 1029     "movq              %%mm4, %%mm3     \n\t"\ 
 1030     "punpcklwd         %%mm2, %%mm2     \n\t"\ 
 1031     "punpcklwd         %%mm5, %%mm5     \n\t"\ 
 1032     "punpcklwd         %%mm4, %%mm4     \n\t"\ 
 1033     "paddw             %%mm1, %%mm2     \n\t"\ 
 1034     "paddw             %%mm1, %%mm5     \n\t"\ 
 1035     "paddw             %%mm1, %%mm4     \n\t"\ 
 1036     "punpckhwd         %%mm0, %%mm0     \n\t"\ 
 1037     "punpckhwd         %%mm6, %%mm6     \n\t"\ 
 1038     "punpckhwd         %%mm3, %%mm3     \n\t"\ 
 1039     "paddw             %%mm7, %%mm0     \n\t"\ 
 1040     "paddw             %%mm7, %%mm6     \n\t"\ 
 1041     "paddw             %%mm7, %%mm3     \n\t"\ 
 1043     "packuswb          %%mm0, %%mm2     \n\t"\ 
 1044     "packuswb          %%mm6, %%mm5     \n\t"\ 
 1045     "packuswb          %%mm3, %%mm4     \n\t"\ 
 1047 #define YSCALEYUV2RGB1(index, c)  REAL_YSCALEYUV2RGB1(index, c) 
 /*
  * REAL_YSCALEYUV2RGB1b(index, c): MMX inline-asm fragment that turns words
  * from one row addressed by %0 and from two rows addressed by %2 and %3
  * into three registers of packed unsigned bytes. %1 is not referenced.
  *
  *   index - register operand used as the offset into the rows; zeroed by
  *           the leading "xor".
  *   c     - register operand used as the base for the offset/coefficient
  *           displacements (U_OFFSET, V_OFFSET, Y_OFFSET, *_COEFF).
  *
  * Steps, in order:
  *   1. Four words from (%2, index) and (%3, index) are added into mm3; four
  *      words from each at index + UV_OFF_BYTE(c) are added into mm4; index
  *      is restored. Both sums are shifted right by 5 with psrlw (a logical
  *      shift) and reduced by U_OFFSET(c) and V_OFFSET(c) respectively.
  *   2. Copies in mm2/mm5 are multiplied (pmulhw) by UB_COEFF(c) and
  *      VR_COEFF(c); mm3/mm4 are multiplied by UG_COEFF(c) and VG_COEFF(c)
  *      and summed into mm4.
  *   3. Eight words from (%0, index, 2) go to mm1/mm7, are shifted right
  *      arithmetically by 4, reduced by Y_OFFSET(c) and multiplied (pmulhw)
  *      by Y_COEFF(c).
  *   4. punpcklwd/punpckhwd of a register with itself duplicates every word
  *      from step 2, so each of those values is added to two adjacent words
  *      from step 3; packuswb then saturates the sums to unsigned bytes.
  *
  * Output (eight bytes each): mm2 from the UB_COEFF term, mm4 from the
  * UG_COEFF + VG_COEFF terms, mm5 from the VR_COEFF term.
  * mm0, mm1, mm3, mm6 and mm7 are overwritten as scratch.
  *
  * NOTE(review): the logical shift in step 1 treats the 16-bit sum as
  * unsigned; this assumes the sum of the two rows fits -- confirm the input
  * range.
  * NOTE(review): going by the coefficient names the outputs are presumably
  * blue (mm2), green (mm4) and red (mm5) -- confirm against the store macro.
  * NOTE(review): the embedded numbering skips lines after the "xor" and the
  * last body line ends in a continuation backslash directly before the
  * wrapper #define -- confirm against the complete file.
  *
  * YSCALEYUV2RGB1b(index, c) forwards to the REAL_ variant so that macro
  * arguments are expanded before they are stringified with '#'.
  */
 1050 #define REAL_YSCALEYUV2RGB1b(index, c) \ 
 1051     "xor            "#index", "#index"  \n\t"\ 
 1054     "movq     (%2, "#index"), %%mm2     \n\t" \ 
 1055     "movq     (%3, "#index"), %%mm3     \n\t" \ 
 1056     "add "UV_OFF_BYTE"("#c"), "#index"  \n\t" \ 
 1057     "movq     (%2, "#index"), %%mm5     \n\t" \ 
 1058     "movq     (%3, "#index"), %%mm4     \n\t" \ 
 1059     "sub "UV_OFF_BYTE"("#c"), "#index"  \n\t" \ 
 1060     "paddw             %%mm2, %%mm3     \n\t" \ 
 1061     "paddw             %%mm5, %%mm4     \n\t" \ 
 1062     "psrlw                $5, %%mm3     \n\t" \ 
 1063     "psrlw                $5, %%mm4     \n\t" \ 
 1064     "psubw  "U_OFFSET"("#c"), %%mm3     \n\t" \ 
 1065     "psubw  "V_OFFSET"("#c"), %%mm4     \n\t" \ 
 1066     "movq              %%mm3, %%mm2     \n\t" \ 
 1067     "movq              %%mm4, %%mm5     \n\t" \ 
 1068     "pmulhw "UG_COEFF"("#c"), %%mm3     \n\t"\ 
 1069     "pmulhw "VG_COEFF"("#c"), %%mm4     \n\t"\ 
 1071     "movq  (%0, "#index", 2), %%mm1     \n\t" \ 
 1072     "movq 8(%0, "#index", 2), %%mm7     \n\t" \ 
 1073     "psraw                $4, %%mm1     \n\t" \ 
 1074     "psraw                $4, %%mm7     \n\t" \ 
 1075     "pmulhw "UB_COEFF"("#c"), %%mm2     \n\t"\ 
 1076     "pmulhw "VR_COEFF"("#c"), %%mm5     \n\t"\ 
 1077     "psubw  "Y_OFFSET"("#c"), %%mm1     \n\t" \ 
 1078     "psubw  "Y_OFFSET"("#c"), %%mm7     \n\t" \ 
 1079     "pmulhw  "Y_COEFF"("#c"), %%mm1     \n\t"\ 
 1080     "pmulhw  "Y_COEFF"("#c"), %%mm7     \n\t"\ 
 1082     "paddw             %%mm3, %%mm4     \n\t"\ 
 1083     "movq              %%mm2, %%mm0     \n\t"\ 
 1084     "movq              %%mm5, %%mm6     \n\t"\ 
 1085     "movq              %%mm4, %%mm3     \n\t"\ 
 1086     "punpcklwd         %%mm2, %%mm2     \n\t"\ 
 1087     "punpcklwd         %%mm5, %%mm5     \n\t"\ 
 1088     "punpcklwd         %%mm4, %%mm4     \n\t"\ 
 1089     "paddw             %%mm1, %%mm2     \n\t"\ 
 1090     "paddw             %%mm1, %%mm5     \n\t"\ 
 1091     "paddw             %%mm1, %%mm4     \n\t"\ 
 1092     "punpckhwd         %%mm0, %%mm0     \n\t"\ 
 1093     "punpckhwd         %%mm6, %%mm6     \n\t"\ 
 1094     "punpckhwd         %%mm3, %%mm3     \n\t"\ 
 1095     "paddw             %%mm7, %%mm0     \n\t"\ 
 1096     "paddw             %%mm7, %%mm6     \n\t"\ 
 1097     "paddw             %%mm7, %%mm3     \n\t"\ 
 1099     "packuswb          %%mm0, %%mm2     \n\t"\ 
 1100     "packuswb          %%mm6, %%mm5     \n\t"\ 
 1101     "packuswb          %%mm3, %%mm4     \n\t"\ 
 1103 #define YSCALEYUV2RGB1b(index, c)  REAL_YSCALEYUV2RGB1b(index, c) 
 /*
  * REAL_YSCALEYUV2RGB1_ALPHA(index): MMX inline-asm fragment that reads eight
  * 16-bit words starting at (%1, index, 2), shifts each right arithmetically
  * by 7 and packs them with unsigned saturation into the eight bytes of mm7.
  *
  *   index - register operand used as the scaled offset into the row
  *           addressed by %1; it is read but not modified here.
  *
  * mm1 holds the upper four words temporarily and is overwritten.
  *
  * NOTE(review): presumably %1 is the alpha row (asm statements in this file
  * that bind "d"(abuf0) as the second operand) -- confirm at each expansion.
  *
  * YSCALEYUV2RGB1_ALPHA(index) forwards to the REAL_ variant so that the
  * macro argument is expanded before it is stringified with '#'.
  */
 1105 #define REAL_YSCALEYUV2RGB1_ALPHA(index) \ 
 1106     "movq  (%1, "#index", 2), %%mm7     \n\t" \ 
 1107     "movq 8(%1, "#index", 2), %%mm1     \n\t" \ 
 1108     "psraw                $7, %%mm7     \n\t" \ 
 1109     "psraw                $7, %%mm1     \n\t" \ 
 1110     "packuswb          %%mm1, %%mm7     \n\t" 
 1111 #define YSCALEYUV2RGB1_ALPHA(index) REAL_YSCALEYUV2RGB1_ALPHA(index) 
 1117                                 const int16_t *ubuf[2], 
const int16_t *vbuf[2],
 
 1118                                 const int16_t *abuf0, uint8_t *dest,
 
 1119                                 int dstW, 
int uvalpha, 
int y)
 
 1121     const int16_t *ubuf0 = ubuf[0];
 
 1122     const int16_t *buf1= buf0; 
 
 1124     if (uvalpha < 2048) { 
 
 1125         const int16_t *ubuf1 = ubuf[0];
 
 1126         if (CONFIG_SWSCALE_ALPHA && 
c->needAlpha) {
 
 1129                 "mov           %4, %%"FF_REG_b
"         \n\t" 
 1130                 "push %%"FF_REG_BP
"                     \n\t" 
 1133                 WRITEBGR32(%%FF_REGb, 
DSTW_OFFSET"(%5)", %%FF_REGBP, %%mm2, %%mm4, %%mm5, %%mm7, %%mm0, %%mm1, %%mm3, %%mm6)
 
 1134                 "pop %%"FF_REG_BP
"                      \n\t" 
 1136                 :: 
"c" (buf0), 
"d" (abuf0), 
"S" (ubuf0), 
"D" (ubuf1), 
"m" (dest),
 
 1142                 "mov           %4, %%"FF_REG_b
"         \n\t" 
 1143                 "push %%"FF_REG_BP
"                     \n\t" 
 1145                 "pcmpeqd %%mm7, %%mm7                   \n\t" 
 1146                 WRITEBGR32(%%FF_REGb, 
DSTW_OFFSET"(%5)", %%FF_REGBP, %%mm2, %%mm4, %%mm5, %%mm7, %%mm0, %%mm1, %%mm3, %%mm6)
 
 1147                 "pop %%"FF_REG_BP
"                      \n\t" 
 1149                 :: 
"c" (buf0), 
"d" (buf1), 
"S" (ubuf0), 
"D" (ubuf1), 
"m" (dest),
 
 1154         const int16_t *ubuf1 = ubuf[1];
 
 1155         if (CONFIG_SWSCALE_ALPHA && 
c->needAlpha) {
 
 1158                 "mov           %4, %%"FF_REG_b
"         \n\t" 
 1159                 "push %%"FF_REG_BP
"                     \n\t" 
 1162                 WRITEBGR32(%%FF_REGb, 
DSTW_OFFSET"(%5)", %%FF_REGBP, %%mm2, %%mm4, %%mm5, %%mm7, %%mm0, %%mm1, %%mm3, %%mm6)
 
 1163                 "pop %%"FF_REG_BP
"                      \n\t" 
 1165                 :: 
"c" (buf0), 
"d" (abuf0), 
"S" (ubuf0), 
"D" (ubuf1), 
"m" (dest),
 
 1171                 "mov           %4, %%"FF_REG_b
"         \n\t" 
 1172                 "push %%"FF_REG_BP
"                     \n\t" 
 1174                 "pcmpeqd %%mm7, %%mm7                   \n\t" 
 1175                 WRITEBGR32(%%FF_REGb, 
DSTW_OFFSET"(%5)", %%FF_REGBP, %%mm2, %%mm4, %%mm5, %%mm7, %%mm0, %%mm1, %%mm3, %%mm6)
 
 1176                 "pop %%"FF_REG_BP
"                      \n\t" 
 1178                 :: 
"c" (buf0), 
"d" (buf1), 
"S" (ubuf0), 
"D" (ubuf1), 
"m" (dest),
 
 1186                                 const int16_t *ubuf[2], 
const int16_t *vbuf[2],
 
 1187                                 const int16_t *abuf0, uint8_t *dest,
 
 1188                                 int dstW, 
int uvalpha, 
int y)
 
 1190     const int16_t *ubuf0 = ubuf[0];
 
 1191     const int16_t *buf1= buf0; 
 
 1193     if (uvalpha < 2048) { 
 
 1194         const int16_t *ubuf1 = ubuf[0];
 
 1197             "mov           %4, %%"FF_REG_b
"         \n\t" 
 1198             "push %%"FF_REG_BP
"                     \n\t" 
 1200             "pxor    %%mm7, %%mm7                   \n\t" 
 1202             "pop %%"FF_REG_BP
"                      \n\t" 
 1204             :: 
"c" (buf0), 
"d" (buf1), 
"S" (ubuf0), 
"D" (ubuf1), 
"m" (dest),
 
 1209         const int16_t *ubuf1 = ubuf[1];
 
 1212             "mov           %4, %%"FF_REG_b
"         \n\t" 
 1213             "push %%"FF_REG_BP
"                     \n\t" 
 1215             "pxor    %%mm7, %%mm7                   \n\t" 
 1217             "pop %%"FF_REG_BP
"                      \n\t" 
 1219             :: 
"c" (buf0), 
"d" (buf1), 
"S" (ubuf0), 
"D" (ubuf1), 
"m" (dest),
 
 1227                                  const int16_t *ubuf[2], 
const int16_t *vbuf[2],
 
 1228                                  const int16_t *abuf0, uint8_t *dest,
 
 1229                                  int dstW, 
int uvalpha, 
int y)
 
 1231     const int16_t *ubuf0 = ubuf[0];
 
 1232     const int16_t *buf1= buf0; 
 
 1234     if (uvalpha < 2048) { 
 
 1235         const int16_t *ubuf1 = ubuf[0];
 
 1238             "mov           %4, %%"FF_REG_b
"         \n\t" 
 1239             "push %%"FF_REG_BP
"                     \n\t" 
 1241             "pxor    %%mm7, %%mm7                   \n\t" 
 1249             "pop %%"FF_REG_BP
"                      \n\t" 
 1251             :: 
"c" (buf0), 
"d" (buf1), 
"S" (ubuf0), 
"D" (ubuf1), 
"m" (dest),
 
 1256         const int16_t *ubuf1 = ubuf[1];
 
 1259             "mov           %4, %%"FF_REG_b
"         \n\t" 
 1260             "push %%"FF_REG_BP
"                     \n\t" 
 1262             "pxor    %%mm7, %%mm7                   \n\t" 
 1270             "pop %%"FF_REG_BP
"                      \n\t" 
 1272             :: 
"c" (buf0), 
"d" (buf1), 
"S" (ubuf0), 
"D" (ubuf1), 
"m" (dest),
 
 1280                                  const int16_t *ubuf[2], 
const int16_t *vbuf[2],
 
 1281                                  const int16_t *abuf0, uint8_t *dest,
 
 1282                                  int dstW, 
int uvalpha, 
int y)
 
 1284     const int16_t *ubuf0 = ubuf[0];
 
 1285     const int16_t *buf1= buf0; 
 
 1287     if (uvalpha < 2048) { 
 
 1288         const int16_t *ubuf1 = ubuf[0];
 
 1291             "mov           %4, %%"FF_REG_b
"         \n\t" 
 1292             "push %%"FF_REG_BP
"                     \n\t" 
 1294             "pxor    %%mm7, %%mm7                   \n\t" 
 1302             "pop %%"FF_REG_BP
"                      \n\t" 
 1304             :: 
"c" (buf0), 
"d" (buf1), 
"S" (ubuf0), 
"D" (ubuf1), 
"m" (dest),
 
 1309         const int16_t *ubuf1 = ubuf[1];
 
 1312             "mov           %4, %%"FF_REG_b
"         \n\t" 
 1313             "push %%"FF_REG_BP
"                     \n\t" 
 1315             "pxor    %%mm7, %%mm7                   \n\t" 
 1323             "pop %%"FF_REG_BP
"                      \n\t" 
 1325             :: 
"c" (buf0), 
"d" (buf1), 
"S" (ubuf0), 
"D" (ubuf1), 
"m" (dest),
 
 /*
  * REAL_YSCALEYUV2PACKED1(index, c): MMX inline-asm fragment that loads words
  * from one row addressed by %0 and one row addressed by %2 and scales them
  * down with an arithmetic right shift by 7. %1 and %3 are not referenced.
  *
  *   index - register operand used as the offset into the rows; zeroed by
  *           the leading "xor".
  *   c     - register operand used as the base for the UV_OFF_BYTE
  *           displacement.
  *
  * Results:
  *   mm3 - four words from (%2, index), >> 7
  *   mm4 - four words from (%2, index + UV_OFF_BYTE(c)), >> 7
  *         (index is restored after this load)
  *   mm1 - four words from (%0, index, 2), >> 7
  *   mm7 - the next four words, from 8(%0, index, 2), >> 7
  *
  * NOTE(review): presumably %0 is the luma row and %2 the chroma row (U at
  * offset 0, V at UV_OFF_BYTE) -- confirm against the operand list of the
  * asm statement that expands this macro.
  * NOTE(review): the embedded numbering skips lines after the "xor" and the
  * last body line ends in a continuation backslash directly before the
  * wrapper #define -- confirm against the complete file.
  *
  * YSCALEYUV2PACKED1(index, c) forwards to the REAL_ variant so that macro
  * arguments are expanded before they are stringified with '#'.
  */
 1332 #define REAL_YSCALEYUV2PACKED1(index, c) \ 
 1333     "xor            "#index", "#index"  \n\t"\ 
 1336     "movq     (%2, "#index"), %%mm3     \n\t" \ 
 1337     "add "UV_OFF_BYTE"("#c"), "#index"  \n\t" \ 
 1338     "movq     (%2, "#index"), %%mm4     \n\t" \ 
 1339     "sub "UV_OFF_BYTE"("#c"), "#index"  \n\t" \ 
 1340     "psraw                $7, %%mm3     \n\t" \ 
 1341     "psraw                $7, %%mm4     \n\t" \ 
 1342     "movq  (%0, "#index", 2), %%mm1     \n\t" \ 
 1343     "movq 8(%0, "#index", 2), %%mm7     \n\t" \ 
 1344     "psraw                $7, %%mm1     \n\t" \ 
 1345     "psraw                $7, %%mm7     \n\t" \ 
 1347 #define YSCALEYUV2PACKED1(index, c)  REAL_YSCALEYUV2PACKED1(index, c) 
 /*
  * REAL_YSCALEYUV2PACKED1b(index, c): MMX inline-asm fragment that loads
  * words from one row addressed by %0 and from two rows addressed by %2 and
  * %3, summing the latter two before scaling. %1 is not referenced.
  *
  *   index - register operand used as the offset into the rows; zeroed by
  *           the leading "xor".
  *   c     - register operand used as the base for the UV_OFF_BYTE
  *           displacement.
  *
  * Results:
  *   mm3 - (%2, index) + (%3, index), four words, shifted right by 8 with
  *         psrlw (a logical shift)
  *   mm4 - the same sum taken at index + UV_OFF_BYTE(c); index is restored
  *         after these loads
  *   mm1 - four words from (%0, index, 2), arithmetic >> 7
  *   mm7 - the next four words, from 8(%0, index, 2), arithmetic >> 7
  *
  * mm2 and mm5 are overwritten as scratch.
  *
  * NOTE(review): the logical shift treats the 16-bit sum as unsigned; this
  * assumes the sum of the two rows fits -- confirm the input range.
  * NOTE(review): presumably %0 is the luma row and %2/%3 the two chroma rows
  * -- confirm against the operand list of the asm statement that expands
  * this macro.
  *
  * YSCALEYUV2PACKED1b(index, c) forwards to the REAL_ variant so that macro
  * arguments are expanded before they are stringified with '#'.
  */
 1349 #define REAL_YSCALEYUV2PACKED1b(index, c) \ 
 1350     "xor "#index", "#index"             \n\t"\ 
 1353     "movq     (%2, "#index"), %%mm2     \n\t" \ 
 1354     "movq     (%3, "#index"), %%mm3     \n\t" \ 
 1355     "add "UV_OFF_BYTE"("#c"), "#index"  \n\t" \ 
 1356     "movq     (%2, "#index"), %%mm5     \n\t" \ 
 1357     "movq     (%3, "#index"), %%mm4     \n\t" \ 
 1358     "sub "UV_OFF_BYTE"("#c"), "#index"  \n\t" \ 
 1359     "paddw             %%mm2, %%mm3     \n\t" \ 
 1360     "paddw             %%mm5, %%mm4     \n\t" \ 
 1361     "psrlw                $8, %%mm3     \n\t" \ 
 1362     "psrlw                $8, %%mm4     \n\t" \ 
 1363     "movq  (%0, "#index", 2), %%mm1     \n\t" \ 
 1364     "movq 8(%0, "#index", 2), %%mm7     \n\t" \ 
 1365     "psraw                $7, %%mm1     \n\t" \ 
 1366     "psraw                $7, %%mm7     \n\t" 
 1367 #define YSCALEYUV2PACKED1b(index, c)  REAL_YSCALEYUV2PACKED1b(index, c) 
 1370                                   const int16_t *ubuf[2], 
const int16_t *vbuf[2],
 
 1371                                   const int16_t *abuf0, uint8_t *dest,
 
 1372                                   int dstW, 
int uvalpha, 
int y)
 
 1374     const int16_t *ubuf0 = ubuf[0];
 
 1375     const int16_t *buf1= buf0; 
 
 1377     if (uvalpha < 2048) { 
 
 1378         const int16_t *ubuf1 = ubuf[0];
 
 1381             "mov           %4, %%"FF_REG_b
"         \n\t" 
 1382             "push %%"FF_REG_BP
"                     \n\t" 
 1385             "pop %%"FF_REG_BP
"                      \n\t" 
 1387             :: 
"c" (buf0), 
"d" (buf1), 
"S" (ubuf0), 
"D" (ubuf1), 
"m" (dest),
 
 1391         const int16_t *ubuf1 = ubuf[1];
 
 1394             "mov           %4, %%"FF_REG_b
"         \n\t" 
 1395             "push %%"FF_REG_BP
"                     \n\t" 
 1398             "pop %%"FF_REG_BP
"                      \n\t" 
 1400             :: 
"c" (buf0), 
"d" (buf1), 
"S" (ubuf0), 
"D" (ubuf1), 
"m" (dest),
 
 1409     c->use_mmx_vfilter= 0;
 
 1415                     switch (
c->dstFormat) {
 
 1427                 c->use_mmx_vfilter= 1;
 
 1429                     switch (
c->dstFormat) {
 
 1443             switch (
c->dstFormat) {
 
 1445                 c->yuv2packed1 = 
RENAME(yuv2rgb32_1);
 
 1446                 c->yuv2packed2 = 
RENAME(yuv2rgb32_2);
 
 1449                 c->yuv2packed1 = 
RENAME(yuv2bgr24_1);
 
 1450                 c->yuv2packed2 = 
RENAME(yuv2bgr24_2);
 
 1453                 c->yuv2packed1 = 
RENAME(yuv2rgb555_1);
 
 1454                 c->yuv2packed2 = 
RENAME(yuv2rgb555_2);
 
 1457                 c->yuv2packed1 = 
RENAME(yuv2rgb565_1);
 
 1458                 c->yuv2packed2 = 
RENAME(yuv2rgb565_2);
 
 1461                 c->yuv2packed1 = 
RENAME(yuv2yuyv422_1);
 
 1462                 c->yuv2packed2 = 
RENAME(yuv2yuyv422_2);
 
 1470     if (
c->srcBpc == 8 && 
c->dstBpc <= 14) {
 
 1476             c->hyscale_fast = 
NULL;
 
 1477             c->hcscale_fast = 
NULL;