FFmpeg
swscale_template.c
Go to the documentation of this file.
1 /*
2  * Copyright (C) 2001-2011 Michael Niedermayer <michaelni@gmx.at>
3  *
4  * This file is part of FFmpeg.
5  *
6  * FFmpeg is free software; you can redistribute it and/or
7  * modify it under the terms of the GNU Lesser General Public
8  * License as published by the Free Software Foundation; either
9  * version 2.1 of the License, or (at your option) any later version.
10  *
11  * FFmpeg is distributed in the hope that it will be useful,
12  * but WITHOUT ANY WARRANTY; without even the implied warranty of
13  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
14  * Lesser General Public License for more details.
15  *
16  * You should have received a copy of the GNU Lesser General Public
17  * License along with FFmpeg; if not, write to the Free Software
18  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
19  */
20 
21 #include <stdint.h>
22 
23 #include "libavutil/x86/asm.h"
25 
#undef REAL_MOVNTQ
#undef MOVNTQ


/* Non-temporal quadword store (bypasses the cache on writeout).
 * The REAL_/plain macro pair forces full expansion of the operand
 * macros before they are stringified into the instruction text. */
#define REAL_MOVNTQ(a,b) "movntq " #a ", " #b " \n\t"
#define MOVNTQ(a,b) REAL_MOVNTQ(a,b)
32 
/*
 * Opens the per-output-pixel loop (label "1:") of the multi-tap vertical
 * scaler and accumulates the chroma filter taps: walks the coefficient/
 * pointer list at CHR_MMX_FILTER_OFFSET(%0) (terminated by a NULL source
 * pointer), multiply-accumulating U into %%mm3 and V into %%mm4, both
 * pre-seeded with the rounder at VROUNDER_OFFSET(%0).
 * %%FF_REG_a is the output pixel index; %6 is the U->V plane byte offset.
 * NOTE: this opens the __asm__ volatile( statement — it must be closed by
 * YSCALEYUV2PACKEDX_END (which supplies operands %0..%6 and clobbers).
 */
#define YSCALEYUV2PACKEDX_UV \
    __asm__ volatile(\
    "xor %%"FF_REG_a", %%"FF_REG_a" \n\t"\
    ".p2align 4 \n\t"\
    "nop \n\t"\
    "1: \n\t"\
    "lea "CHR_MMX_FILTER_OFFSET"(%0), %%"FF_REG_d" \n\t"\
    "mov (%%"FF_REG_d"), %%"FF_REG_S" \n\t"\
    "movq "VROUNDER_OFFSET"(%0), %%mm3 \n\t"\
    "movq %%mm3, %%mm4 \n\t"\
    ".p2align 4 \n\t"\
    "2: \n\t"\
    "movq 8(%%"FF_REG_d"), %%mm0 \n\t" /* filterCoeff */\
    "movq (%%"FF_REG_S", %%"FF_REG_a"), %%mm2 \n\t" /* UsrcData */\
    "add %6, %%"FF_REG_S" \n\t" /* step from U plane to V plane */ \
    "movq (%%"FF_REG_S", %%"FF_REG_a"), %%mm5 \n\t" /* VsrcData */\
    "add $16, %%"FF_REG_d" \n\t"\
    "mov (%%"FF_REG_d"), %%"FF_REG_S" \n\t" /* next tap's source pointer */\
    "pmulhw %%mm0, %%mm2 \n\t"\
    "pmulhw %%mm0, %%mm5 \n\t"\
    "paddw %%mm2, %%mm3 \n\t"\
    "paddw %%mm5, %%mm4 \n\t"\
    "test %%"FF_REG_S", %%"FF_REG_S" \n\t" /* NULL pointer ends the tap list */\
    " jnz 2b \n\t"\

57 
/*
 * Accumulates the luma (or, via another `offset`, alpha) filter taps for
 * 8 output pixels: two quadwords of 4 words each. The coefficient/pointer
 * list at offset(%0) is walked until a NULL source pointer; results land
 * in dst1/dst2, seeded with the VROUNDER_OFFSET rounder. `coeff`, `src1`,
 * `src2` are scratch MMX registers chosen by the caller.
 * Reuses local label "2:" — must be emitted after YSCALEYUV2PACKEDX_UV
 * inside the same asm statement.
 */
#define YSCALEYUV2PACKEDX_YA(offset,coeff,src1,src2,dst1,dst2) \
    "lea "offset"(%0), %%"FF_REG_d" \n\t"\
    "mov (%%"FF_REG_d"), %%"FF_REG_S" \n\t"\
    "movq "VROUNDER_OFFSET"(%0), "#dst1" \n\t"\
    "movq "#dst1", "#dst2" \n\t"\
    ".p2align 4 \n\t"\
    "2: \n\t"\
    "movq 8(%%"FF_REG_d"), "#coeff" \n\t" /* filterCoeff */\
    "movq (%%"FF_REG_S", %%"FF_REG_a", 2), "#src1" \n\t" /* Y1srcData */\
    "movq 8(%%"FF_REG_S", %%"FF_REG_a", 2), "#src2" \n\t" /* Y2srcData */\
    "add $16, %%"FF_REG_d" \n\t"\
    "mov (%%"FF_REG_d"), %%"FF_REG_S" \n\t"\
    "pmulhw "#coeff", "#src1" \n\t"\
    "pmulhw "#coeff", "#src2" \n\t"\
    "paddw "#src1", "#dst1" \n\t"\
    "paddw "#src2", "#dst2" \n\t"\
    "test %%"FF_REG_S", %%"FF_REG_S" \n\t"\
    " jnz 2b \n\t"\

76 
/* Full multi-tap vertical scale: chroma into mm3/mm4, luma into mm1/mm7. */
#define YSCALEYUV2PACKEDX \
    YSCALEYUV2PACKEDX_UV \
    YSCALEYUV2PACKEDX_YA(LUM_MMX_FILTER_OFFSET,%%mm0,%%mm2,%%mm5,%%mm1,%%mm7) \

80 
/*
 * Closes the asm statement opened by YSCALEYUV2PACKEDX(_ACCURATE)_UV.
 * Operand map: %0=&c->redDither (base for all the *_OFFSET loads),
 * %1..%3=dummy, %4=dest, %5=dstW_reg, %6=uv_off. Relies on local
 * variables `c`, `dummy`, `dest`, `dstW_reg`, `uv_off` in the caller.
 */
#define YSCALEYUV2PACKEDX_END \
    :: "r" (&c->redDither), \
       "m" (dummy), "m" (dummy), "m" (dummy),\
       "r" (dest), "m" (dstW_reg), "m"(uv_off) \
       NAMED_CONSTRAINTS_ADD(bF8,bFC) \
    : "%"FF_REG_a, "%"FF_REG_d, "%"FF_REG_S \
    );
88 
/*
 * Higher-precision chroma accumulation: processes the tap list two taps at
 * a time (APCK_* layout), interleaving the two taps' samples with
 * punpck{l,h}wd and using pmaddwd to get 32-bit partial sums, so no
 * intermediate 16-bit overflow. Sums are shifted back (psrad $16), packed
 * to words, rounded, and spilled to U_TEMP/V_TEMP in the context.
 * Opens the __asm__ volatile( and label "1:" — close with
 * YSCALEYUV2PACKEDX_END. %6 = U->V plane byte offset.
 */
#define YSCALEYUV2PACKEDX_ACCURATE_UV \
    __asm__ volatile(\
    "xor %%"FF_REG_a", %%"FF_REG_a" \n\t"\
    ".p2align 4 \n\t"\
    "nop \n\t"\
    "1: \n\t"\
    "lea "CHR_MMX_FILTER_OFFSET"(%0), %%"FF_REG_d" \n\t"\
    "mov (%%"FF_REG_d"), %%"FF_REG_S" \n\t"\
    "pxor %%mm4, %%mm4 \n\t"\
    "pxor %%mm5, %%mm5 \n\t"\
    "pxor %%mm6, %%mm6 \n\t"\
    "pxor %%mm7, %%mm7 \n\t"\
    ".p2align 4 \n\t"\
    "2: \n\t"\
    "movq (%%"FF_REG_S", %%"FF_REG_a"), %%mm0 \n\t" /* UsrcData */\
    "add %6, %%"FF_REG_S" \n\t" \
    "movq (%%"FF_REG_S", %%"FF_REG_a"), %%mm2 \n\t" /* VsrcData */\
    "mov "STR(APCK_PTR2)"(%%"FF_REG_d"), %%"FF_REG_S" \n\t"\
    "movq (%%"FF_REG_S", %%"FF_REG_a"), %%mm1 \n\t" /* UsrcData */\
    "movq %%mm0, %%mm3 \n\t"\
    "punpcklwd %%mm1, %%mm0 \n\t"\
    "punpckhwd %%mm1, %%mm3 \n\t"\
    "movq "STR(APCK_COEF)"(%%"FF_REG_d"),%%mm1 \n\t" /* filterCoeff */\
    "pmaddwd %%mm1, %%mm0 \n\t"\
    "pmaddwd %%mm1, %%mm3 \n\t"\
    "paddd %%mm0, %%mm4 \n\t"\
    "paddd %%mm3, %%mm5 \n\t"\
    "add %6, %%"FF_REG_S" \n\t" \
    "movq (%%"FF_REG_S", %%"FF_REG_a"), %%mm3 \n\t" /* VsrcData */\
    "mov "STR(APCK_SIZE)"(%%"FF_REG_d"), %%"FF_REG_S" \n\t"\
    "add $"STR(APCK_SIZE)", %%"FF_REG_d" \n\t"\
    "test %%"FF_REG_S", %%"FF_REG_S" \n\t" /* flags consumed by jnz below */\
    "movq %%mm2, %%mm0 \n\t"\
    "punpcklwd %%mm3, %%mm2 \n\t"\
    "punpckhwd %%mm3, %%mm0 \n\t"\
    "pmaddwd %%mm1, %%mm2 \n\t"\
    "pmaddwd %%mm1, %%mm0 \n\t"\
    "paddd %%mm2, %%mm6 \n\t"\
    "paddd %%mm0, %%mm7 \n\t"\
    " jnz 2b \n\t"\
    "psrad $16, %%mm4 \n\t"\
    "psrad $16, %%mm5 \n\t"\
    "psrad $16, %%mm6 \n\t"\
    "psrad $16, %%mm7 \n\t"\
    "movq "VROUNDER_OFFSET"(%0), %%mm0 \n\t"\
    "packssdw %%mm5, %%mm4 \n\t"\
    "packssdw %%mm7, %%mm6 \n\t"\
    "paddw %%mm0, %%mm4 \n\t"\
    "paddw %%mm0, %%mm6 \n\t"\
    "movq %%mm4, "U_TEMP"(%0) \n\t"\
    "movq %%mm6, "V_TEMP"(%0) \n\t"\

140 
/*
 * Higher-precision luma/alpha accumulation, paired-tap (APCK_*) walk like
 * the _UV variant above. Produces Y1 in %%mm1 and Y2 in %%mm7 (rounded),
 * then reloads the chroma results from U_TEMP/V_TEMP into %%mm3/%%mm4 so
 * the register layout matches what YSCALEYUV2RGBX expects.
 */
#define YSCALEYUV2PACKEDX_ACCURATE_YA(offset) \
    "lea "offset"(%0), %%"FF_REG_d" \n\t"\
    "mov (%%"FF_REG_d"), %%"FF_REG_S" \n\t"\
    "pxor %%mm1, %%mm1 \n\t"\
    "pxor %%mm5, %%mm5 \n\t"\
    "pxor %%mm7, %%mm7 \n\t"\
    "pxor %%mm6, %%mm6 \n\t"\
    ".p2align 4 \n\t"\
    "2: \n\t"\
    "movq (%%"FF_REG_S", %%"FF_REG_a", 2), %%mm0 \n\t" /* Y1srcData */\
    "movq 8(%%"FF_REG_S", %%"FF_REG_a", 2), %%mm2 \n\t" /* Y2srcData */\
    "mov "STR(APCK_PTR2)"(%%"FF_REG_d"), %%"FF_REG_S" \n\t"\
    "movq (%%"FF_REG_S", %%"FF_REG_a", 2), %%mm4 \n\t" /* Y1srcData */\
    "movq %%mm0, %%mm3 \n\t"\
    "punpcklwd %%mm4, %%mm0 \n\t"\
    "punpckhwd %%mm4, %%mm3 \n\t"\
    "movq "STR(APCK_COEF)"(%%"FF_REG_d"), %%mm4 \n\t" /* filterCoeff */\
    "pmaddwd %%mm4, %%mm0 \n\t"\
    "pmaddwd %%mm4, %%mm3 \n\t"\
    "paddd %%mm0, %%mm1 \n\t"\
    "paddd %%mm3, %%mm5 \n\t"\
    "movq 8(%%"FF_REG_S", %%"FF_REG_a", 2), %%mm3 \n\t" /* Y2srcData */\
    "mov "STR(APCK_SIZE)"(%%"FF_REG_d"), %%"FF_REG_S" \n\t"\
    "add $"STR(APCK_SIZE)", %%"FF_REG_d" \n\t"\
    "test %%"FF_REG_S", %%"FF_REG_S" \n\t" /* flags consumed by jnz below */\
    "movq %%mm2, %%mm0 \n\t"\
    "punpcklwd %%mm3, %%mm2 \n\t"\
    "punpckhwd %%mm3, %%mm0 \n\t"\
    "pmaddwd %%mm4, %%mm2 \n\t"\
    "pmaddwd %%mm4, %%mm0 \n\t"\
    "paddd %%mm2, %%mm7 \n\t"\
    "paddd %%mm0, %%mm6 \n\t"\
    " jnz 2b \n\t"\
    "psrad $16, %%mm1 \n\t"\
    "psrad $16, %%mm5 \n\t"\
    "psrad $16, %%mm7 \n\t"\
    "psrad $16, %%mm6 \n\t"\
    "movq "VROUNDER_OFFSET"(%0), %%mm0 \n\t"\
    "packssdw %%mm5, %%mm1 \n\t"\
    "packssdw %%mm6, %%mm7 \n\t"\
    "paddw %%mm0, %%mm1 \n\t"\
    "paddw %%mm0, %%mm7 \n\t"\
    "movq "U_TEMP"(%0), %%mm3 \n\t"\
    "movq "V_TEMP"(%0), %%mm4 \n\t"\

185 
/* Full accurate vertical scale: chroma to U_TEMP/V_TEMP then mm3/mm4,
 * luma to mm1/mm7. Close with YSCALEYUV2PACKEDX_END. */
#define YSCALEYUV2PACKEDX_ACCURATE \
    YSCALEYUV2PACKEDX_ACCURATE_UV \
    YSCALEYUV2PACKEDX_ACCURATE_YA(LUM_MMX_FILTER_OFFSET)
189 
/*
 * YUV -> RGB conversion of the scaled samples.
 * In:  mm1 = Y1 (pixels 0-3), mm7 = Y2 (pixels 4-7), mm3 = U, mm4 = V.
 * Applies the per-context offsets/coefficients at *_OFFSET/*_COEFF(%0),
 * duplicates each chroma word to both luma pixels (punpck{l,h}wd with
 * itself), and packs the result to unsigned bytes:
 * Out: mm2 = B (8 px), mm4 = G (8 px), mm5 = R (8 px).
 */
#define YSCALEYUV2RGBX \
    "psubw "U_OFFSET"(%0), %%mm3 \n\t" /* (U-128)8*/\
    "psubw "V_OFFSET"(%0), %%mm4 \n\t" /* (V-128)8*/\
    "movq %%mm3, %%mm2 \n\t" /* (U-128)8*/\
    "movq %%mm4, %%mm5 \n\t" /* (V-128)8*/\
    "pmulhw "UG_COEFF"(%0), %%mm3 \n\t"\
    "pmulhw "VG_COEFF"(%0), %%mm4 \n\t"\
    /* mm2=(U-128)8, mm3=ug, mm4=vg mm5=(V-128)8 */\
    "pmulhw "UB_COEFF"(%0), %%mm2 \n\t"\
    "pmulhw "VR_COEFF"(%0), %%mm5 \n\t"\
    "psubw "Y_OFFSET"(%0), %%mm1 \n\t" /* 8(Y-16)*/\
    "psubw "Y_OFFSET"(%0), %%mm7 \n\t" /* 8(Y-16)*/\
    "pmulhw "Y_COEFF"(%0), %%mm1 \n\t"\
    "pmulhw "Y_COEFF"(%0), %%mm7 \n\t"\
    /* mm1= Y1, mm2=ub, mm3=ug, mm4=vg mm5=vr, mm7=Y2 */\
    "paddw %%mm3, %%mm4 \n\t" /* green = ug + vg */\
    "movq %%mm2, %%mm0 \n\t"\
    "movq %%mm5, %%mm6 \n\t"\
    "movq %%mm4, %%mm3 \n\t"\
    "punpcklwd %%mm2, %%mm2 \n\t" /* duplicate chroma for pixel pairs */\
    "punpcklwd %%mm5, %%mm5 \n\t"\
    "punpcklwd %%mm4, %%mm4 \n\t"\
    "paddw %%mm1, %%mm2 \n\t"\
    "paddw %%mm1, %%mm5 \n\t"\
    "paddw %%mm1, %%mm4 \n\t"\
    "punpckhwd %%mm0, %%mm0 \n\t"\
    "punpckhwd %%mm6, %%mm6 \n\t"\
    "punpckhwd %%mm3, %%mm3 \n\t"\
    "paddw %%mm7, %%mm0 \n\t"\
    "paddw %%mm7, %%mm6 \n\t"\
    "paddw %%mm7, %%mm3 \n\t"\
    /* mm0=B1, mm2=B2, mm3=G2, mm4=G1, mm5=R1, mm6=R2 */\
    "packuswb %%mm0, %%mm2 \n\t"\
    "packuswb %%mm6, %%mm5 \n\t"\
    "packuswb %%mm3, %%mm4 \n\t"\

225 
/*
 * Interleaves 8 pixels of packed B/G/R/A bytes into 32-bit BGRA words and
 * stores them non-temporally at dst + index*4, then advances `index` by 8
 * pixels and loops back to label "1:" while index < dstw.
 * q0/q2/q3/t are caller-chosen scratch MMX registers; b and r are
 * clobbered in the process.
 */
#define REAL_WRITEBGR32(dst, dstw, index, b, g, r, a, q0, q2, q3, t) \
    "movq "#b", "#q2" \n\t" /* B */\
    "movq "#r", "#t" \n\t" /* R */\
    "punpcklbw "#g", "#b" \n\t" /* GBGBGBGB 0 */\
    "punpcklbw "#a", "#r" \n\t" /* ARARARAR 0 */\
    "punpckhbw "#g", "#q2" \n\t" /* GBGBGBGB 2 */\
    "punpckhbw "#a", "#t" \n\t" /* ARARARAR 2 */\
    "movq "#b", "#q0" \n\t" /* GBGBGBGB 0 */\
    "movq "#q2", "#q3" \n\t" /* GBGBGBGB 2 */\
    "punpcklwd "#r", "#q0" \n\t" /* ARGBARGB 0 */\
    "punpckhwd "#r", "#b" \n\t" /* ARGBARGB 1 */\
    "punpcklwd "#t", "#q2" \n\t" /* ARGBARGB 2 */\
    "punpckhwd "#t", "#q3" \n\t" /* ARGBARGB 3 */\
\
    MOVNTQ( q0, (dst, index, 4))\
    MOVNTQ( b, 8(dst, index, 4))\
    MOVNTQ( q2, 16(dst, index, 4))\
    MOVNTQ( q3, 24(dst, index, 4))\
\
    "add $8, "#index" \n\t"\
    "cmp "dstw", "#index" \n\t"\
    " jb 1b \n\t"
#define WRITEBGR32(dst, dstw, index, b, g, r, a, q0, q2, q3, t) REAL_WRITEBGR32(dst, dstw, index, b, g, r, a, q0, q2, q3, t)
249 
/*
 * Multi-tap ("X") vertical scale + YUV->RGB32 output, accurate-rounding
 * variant, with optional alpha channel from alpSrc.
 * NOTE(review): the Doxygen extraction this chunk came from dropped
 * several source lines inside this function (numbering gaps 261->264,
 * 266->268, 272->274 in the original listing) — presumably the
 * YSCALEYUV2PACKEDX_ACCURATE / YSCALEYUV2RGBX invocations and the
 * YSCALEYUV2PACKEDX_END epilogues. Verify against upstream FFmpeg before
 * relying on this body.
 */
static void RENAME(yuv2rgb32_X_ar)(SwsInternal *c, const int16_t *lumFilter,
                                   const int16_t **lumSrc, int lumFilterSize,
                                   const int16_t *chrFilter, const int16_t **chrUSrc,
                                   const int16_t **chrVSrc,
                                   int chrFilterSize, const int16_t **alpSrc,
                                   uint8_t *dest, int dstW, int dstY)
{
    x86_reg dummy=0;             /* placeholder for unused "m" asm operands */
    x86_reg dstW_reg = dstW;     /* width widened to register size for asm */
    x86_reg uv_off = c->uv_offx2; /* byte offset between U and V planes */

    if (CONFIG_SWSCALE_ALPHA && c->needAlpha) {
        /* spill B/G/R so the alpha pass can reuse the registers */
        "movq %%mm2, "U_TEMP"(%0) \n\t"
        "movq %%mm4, "V_TEMP"(%0) \n\t"
        "movq %%mm5, "Y_TEMP"(%0) \n\t"
        "movq "Y_TEMP"(%0), %%mm5 \n\t"
        "psraw $3, %%mm1 \n\t"
        "psraw $3, %%mm7 \n\t"
        "packuswb %%mm7, %%mm1 \n\t" /* mm1 = 8 alpha bytes */
        WRITEBGR32(%4, "%5", %%FF_REGa, %%mm3, %%mm4, %%mm5, %%mm1, %%mm0, %%mm7, %%mm2, %%mm6)
    } else {
        "pcmpeqd %%mm7, %%mm7 \n\t" /* opaque alpha: all bits set */
        WRITEBGR32(%4, "%5", %%FF_REGa, %%mm2, %%mm4, %%mm5, %%mm7, %%mm0, %%mm1, %%mm3, %%mm6)
    }
}
282 
/*
 * Multi-tap ("X") vertical scale + YUV->RGB32 output (fast rounding),
 * with optional alpha channel accumulated via the ALP filter list.
 * NOTE(review): extraction gaps (orig lines 295-296, 302, 304-305, 308)
 * — presumably YSCALEYUV2PACKEDX / YSCALEYUV2RGBX and the
 * YSCALEYUV2PACKEDX_END epilogues; verify against upstream.
 */
static void RENAME(yuv2rgb32_X)(SwsInternal *c, const int16_t *lumFilter,
                                const int16_t **lumSrc, int lumFilterSize,
                                const int16_t *chrFilter, const int16_t **chrUSrc,
                                const int16_t **chrVSrc,
                                int chrFilterSize, const int16_t **alpSrc,
                                uint8_t *dest, int dstW, int dstY)
{
    x86_reg dummy=0;              /* placeholder for unused "m" asm operands */
    x86_reg dstW_reg = dstW;      /* width widened to register size for asm */
    x86_reg uv_off = c->uv_offx2; /* byte offset between U and V planes */

    if (CONFIG_SWSCALE_ALPHA && c->needAlpha) {
        /* alpha taps into mm1/mm7, then pack to 8 bytes */
        YSCALEYUV2PACKEDX_YA(ALP_MMX_FILTER_OFFSET, %%mm0, %%mm3, %%mm6, %%mm1, %%mm7)
        "psraw $3, %%mm1 \n\t"
        "psraw $3, %%mm7 \n\t"
        "packuswb %%mm7, %%mm1 \n\t"
        WRITEBGR32(%4, "%5", %%FF_REGa, %%mm2, %%mm4, %%mm5, %%mm1, %%mm0, %%mm7, %%mm3, %%mm6)
    } else {
        "pcmpeqd %%mm7, %%mm7 \n\t" /* opaque alpha: all bits set */
        WRITEBGR32(%4, "%5", %%FF_REGa, %%mm2, %%mm4, %%mm5, %%mm7, %%mm0, %%mm1, %%mm3, %%mm6)
    }
}
311 
/*
 * Same as yuv2rgb32_X but with B and R register roles swapped in the
 * WRITEBGR32 call (mm5/mm2 exchanged), producing the opposite byte order.
 * NOTE(review): extraction gaps (orig lines 324-325, 331, 333-334, 337)
 * — presumably YSCALEYUV2PACKEDX / YSCALEYUV2RGBX and the
 * YSCALEYUV2PACKEDX_END epilogues; verify against upstream.
 */
static void RENAME(yuv2bgr32_X)(SwsInternal *c, const int16_t *lumFilter,
                                const int16_t **lumSrc, int lumFilterSize,
                                const int16_t *chrFilter, const int16_t **chrUSrc,
                                const int16_t **chrVSrc,
                                int chrFilterSize, const int16_t **alpSrc,
                                uint8_t *dest, int dstW, int dstY)
{
    x86_reg dummy=0;              /* placeholder for unused "m" asm operands */
    x86_reg dstW_reg = dstW;      /* width widened to register size for asm */
    x86_reg uv_off = c->uv_offx2; /* byte offset between U and V planes */

    if (CONFIG_SWSCALE_ALPHA && c->needAlpha) {
        YSCALEYUV2PACKEDX_YA(ALP_MMX_FILTER_OFFSET, %%mm0, %%mm3, %%mm6, %%mm1, %%mm7)
        "psraw $3, %%mm1 \n\t"
        "psraw $3, %%mm7 \n\t"
        "packuswb %%mm7, %%mm1 \n\t"
        WRITEBGR32(%4, "%5", %%FF_REGa, %%mm5, %%mm4, %%mm2, %%mm1, %%mm0, %%mm7, %%mm3, %%mm6)
    } else {
        "pcmpeqd %%mm7, %%mm7 \n\t" /* opaque alpha: all bits set */
        WRITEBGR32(%4, "%5", %%FF_REGa, %%mm5, %%mm4, %%mm2, %%mm7, %%mm0, %%mm1, %%mm3, %%mm6)
    }
}
340 
/*
 * Packs 8 pixels of byte B (mm2) / G (mm4) / R (mm5) into RGB565 and
 * stores 2x8 bytes at dst + index*2, then advances `index` by 8 and loops
 * to "1:" while index < dstw. bF8/bFC mask to 5/6 significant bits.
 * Expects %%mm7 == 0 (used as the zero half in the unpacks).
 */
#define REAL_WRITERGB16(dst, dstw, index) \
    "pand "MANGLE(bF8)", %%mm2 \n\t" /* B: keep 5 bits */\
    "pand "MANGLE(bFC)", %%mm4 \n\t" /* G: keep 6 bits */\
    "pand "MANGLE(bF8)", %%mm5 \n\t" /* R: keep 5 bits */\
    "psrlq $3, %%mm2 \n\t"\
\
    "movq %%mm2, %%mm1 \n\t"\
    "movq %%mm4, %%mm3 \n\t"\
\
    "punpcklbw %%mm7, %%mm3 \n\t"\
    "punpcklbw %%mm5, %%mm2 \n\t"\
    "punpckhbw %%mm7, %%mm4 \n\t"\
    "punpckhbw %%mm5, %%mm1 \n\t"\
\
    "psllq $3, %%mm3 \n\t"\
    "psllq $3, %%mm4 \n\t"\
\
    "por %%mm3, %%mm2 \n\t"\
    "por %%mm4, %%mm1 \n\t"\
\
    MOVNTQ(%%mm2, (dst, index, 2))\
    MOVNTQ(%%mm1, 8(dst, index, 2))\
\
    "add $8, "#index" \n\t"\
    "cmp "dstw", "#index" \n\t"\
    " jb 1b \n\t"
#define WRITERGB16(dst, dstw, index) REAL_WRITERGB16(dst, dstw, index)
368 
/*
 * Multi-tap vertical scale + dithered RGB565 output, accurate rounding.
 * NOTE(review): extraction gaps (orig lines 380-381, 388) — presumably
 * YSCALEYUV2PACKEDX_ACCURATE / YSCALEYUV2RGBX before the pxor, and
 * YSCALEYUV2PACKEDX_END at the end; verify against upstream.
 */
static void RENAME(yuv2rgb565_X_ar)(SwsInternal *c, const int16_t *lumFilter,
                                    const int16_t **lumSrc, int lumFilterSize,
                                    const int16_t *chrFilter, const int16_t **chrUSrc,
                                    const int16_t **chrVSrc,
                                    int chrFilterSize, const int16_t **alpSrc,
                                    uint8_t *dest, int dstW, int dstY)
{
    x86_reg dummy=0;              /* placeholder for unused "m" asm operands */
    x86_reg dstW_reg = dstW;      /* width widened to register size for asm */
    x86_reg uv_off = c->uv_offx2; /* byte offset between U and V planes */

    "pxor %%mm7, %%mm7 \n\t"
    /* mm2=B, %%mm4=G, %%mm5=R, %%mm7=0 */
    "paddusb "BLUE_DITHER"(%0), %%mm2\n\t"
    "paddusb "GREEN_DITHER"(%0), %%mm4\n\t"
    "paddusb "RED_DITHER"(%0), %%mm5\n\t"
    WRITERGB16(%4, "%5", %%FF_REGa)
}
390 
/*
 * Multi-tap vertical scale + dithered RGB565 output (fast rounding).
 * NOTE(review): extraction gaps (orig lines 402-403, 410) — presumably
 * YSCALEYUV2PACKEDX / YSCALEYUV2RGBX and YSCALEYUV2PACKEDX_END; verify
 * against upstream.
 */
static void RENAME(yuv2rgb565_X)(SwsInternal *c, const int16_t *lumFilter,
                                 const int16_t **lumSrc, int lumFilterSize,
                                 const int16_t *chrFilter, const int16_t **chrUSrc,
                                 const int16_t **chrVSrc,
                                 int chrFilterSize, const int16_t **alpSrc,
                                 uint8_t *dest, int dstW, int dstY)
{
    x86_reg dummy=0;              /* placeholder for unused "m" asm operands */
    x86_reg dstW_reg = dstW;      /* width widened to register size for asm */
    x86_reg uv_off = c->uv_offx2; /* byte offset between U and V planes */

    "pxor %%mm7, %%mm7 \n\t"
    /* mm2=B, %%mm4=G, %%mm5=R, %%mm7=0 */
    "paddusb "BLUE_DITHER"(%0), %%mm2 \n\t"
    "paddusb "GREEN_DITHER"(%0), %%mm4 \n\t"
    "paddusb "RED_DITHER"(%0), %%mm5 \n\t"
    WRITERGB16(%4, "%5", %%FF_REGa)
}
412 
/*
 * Packs 8 pixels of byte B (mm2) / G (mm4) / R (mm5) into RGB555 and
 * stores 2x8 bytes at dst + index*2, then advances `index` by 8 and loops
 * to "1:" while index < dstw. All three channels keep 5 bits (bF8 mask);
 * R additionally shifts right one to leave the top bit clear.
 * Expects %%mm7 == 0.
 */
#define REAL_WRITERGB15(dst, dstw, index) \
    "pand "MANGLE(bF8)", %%mm2 \n\t" /* B */\
    "pand "MANGLE(bF8)", %%mm4 \n\t" /* G */\
    "pand "MANGLE(bF8)", %%mm5 \n\t" /* R */\
    "psrlq $3, %%mm2 \n\t"\
    "psrlq $1, %%mm5 \n\t"\
\
    "movq %%mm2, %%mm1 \n\t"\
    "movq %%mm4, %%mm3 \n\t"\
\
    "punpcklbw %%mm7, %%mm3 \n\t"\
    "punpcklbw %%mm5, %%mm2 \n\t"\
    "punpckhbw %%mm7, %%mm4 \n\t"\
    "punpckhbw %%mm5, %%mm1 \n\t"\
\
    "psllq $2, %%mm3 \n\t"\
    "psllq $2, %%mm4 \n\t"\
\
    "por %%mm3, %%mm2 \n\t"\
    "por %%mm4, %%mm1 \n\t"\
\
    MOVNTQ(%%mm2, (dst, index, 2))\
    MOVNTQ(%%mm1, 8(dst, index, 2))\
\
    "add $8, "#index" \n\t"\
    "cmp "dstw", "#index" \n\t"\
    " jb 1b \n\t"
#define WRITERGB15(dst, dstw, index) REAL_WRITERGB15(dst, dstw, index)
441 
/*
 * Multi-tap vertical scale + dithered RGB555 output, accurate rounding.
 * NOTE(review): extraction gaps (orig lines 453-454, 461) — presumably
 * YSCALEYUV2PACKEDX_ACCURATE / YSCALEYUV2RGBX and YSCALEYUV2PACKEDX_END;
 * verify against upstream.
 */
static void RENAME(yuv2rgb555_X_ar)(SwsInternal *c, const int16_t *lumFilter,
                                    const int16_t **lumSrc, int lumFilterSize,
                                    const int16_t *chrFilter, const int16_t **chrUSrc,
                                    const int16_t **chrVSrc,
                                    int chrFilterSize, const int16_t **alpSrc,
                                    uint8_t *dest, int dstW, int dstY)
{
    x86_reg dummy=0;              /* placeholder for unused "m" asm operands */
    x86_reg dstW_reg = dstW;      /* width widened to register size for asm */
    x86_reg uv_off = c->uv_offx2; /* byte offset between U and V planes */

    "pxor %%mm7, %%mm7 \n\t"
    /* mm2=B, %%mm4=G, %%mm5=R, %%mm7=0 */
    "paddusb "BLUE_DITHER"(%0), %%mm2\n\t"
    "paddusb "GREEN_DITHER"(%0), %%mm4\n\t"
    "paddusb "RED_DITHER"(%0), %%mm5\n\t"
    WRITERGB15(%4, "%5", %%FF_REGa)
}
463 
/*
 * Multi-tap vertical scale + dithered RGB555 output (fast rounding).
 * NOTE(review): extraction gaps (orig lines 475-476, 483) — presumably
 * YSCALEYUV2PACKEDX / YSCALEYUV2RGBX and YSCALEYUV2PACKEDX_END; verify
 * against upstream.
 */
static void RENAME(yuv2rgb555_X)(SwsInternal *c, const int16_t *lumFilter,
                                 const int16_t **lumSrc, int lumFilterSize,
                                 const int16_t *chrFilter, const int16_t **chrUSrc,
                                 const int16_t **chrVSrc,
                                 int chrFilterSize, const int16_t **alpSrc,
                                 uint8_t *dest, int dstW, int dstY)
{
    x86_reg dummy=0;              /* placeholder for unused "m" asm operands */
    x86_reg dstW_reg = dstW;      /* width widened to register size for asm */
    x86_reg uv_off = c->uv_offx2; /* byte offset between U and V planes */

    "pxor %%mm7, %%mm7 \n\t"
    /* mm2=B, %%mm4=G, %%mm5=R, %%mm7=0 */
    "paddusb "BLUE_DITHER"(%0), %%mm2 \n\t"
    "paddusb "GREEN_DITHER"(%0), %%mm4 \n\t"
    "paddusb "RED_DITHER"(%0), %%mm5 \n\t"
    WRITERGB15(%4, "%5", %%FF_REGa)
}
485 
/*
 * Packs 8 pixels of byte B (mm2) / G (mm4) / R (mm5) into 24 bytes of
 * packed BGR24 using MMXEXT pshufw shuffles and the M24A/M24B/M24C
 * selection masks, storing three quadwords at dst. Advances dst by 24
 * and `index` by 8, looping to "1:" while index < dstw.
 * Note: unlike the other WRITE* macros, `dst` must be a register that
 * can be add-incremented (not a plain memory operand).
 */
#define WRITEBGR24MMXEXT(dst, dstw, index) \
    /* mm2=B, %%mm4=G, %%mm5=R, %%mm7=0 */\
    "movq "MANGLE(M24A)", %%mm0 \n\t"\
    "movq "MANGLE(M24C)", %%mm7 \n\t"\
    "pshufw $0x50, %%mm2, %%mm1 \n\t" /* B3 B2 B3 B2 B1 B0 B1 B0 */\
    "pshufw $0x50, %%mm4, %%mm3 \n\t" /* G3 G2 G3 G2 G1 G0 G1 G0 */\
    "pshufw $0x00, %%mm5, %%mm6 \n\t" /* R1 R0 R1 R0 R1 R0 R1 R0 */\
\
    "pand %%mm0, %%mm1 \n\t" /* B2 B1 B0 */\
    "pand %%mm0, %%mm3 \n\t" /* G2 G1 G0 */\
    "pand %%mm7, %%mm6 \n\t" /* R1 R0 */\
\
    "psllq $8, %%mm3 \n\t" /* G2 G1 G0 */\
    "por %%mm1, %%mm6 \n\t"\
    "por %%mm3, %%mm6 \n\t"\
    MOVNTQ(%%mm6, (dst))\
\
    "psrlq $8, %%mm4 \n\t" /* 00 G7 G6 G5 G4 G3 G2 G1 */\
    "pshufw $0xA5, %%mm2, %%mm1 \n\t" /* B5 B4 B5 B4 B3 B2 B3 B2 */\
    "pshufw $0x55, %%mm4, %%mm3 \n\t" /* G4 G3 G4 G3 G4 G3 G4 G3 */\
    "pshufw $0xA5, %%mm5, %%mm6 \n\t" /* R5 R4 R5 R4 R3 R2 R3 R2 */\
\
    "pand "MANGLE(M24B)", %%mm1 \n\t" /* B5 B4 B3 */\
    "pand %%mm7, %%mm3 \n\t" /* G4 G3 */\
    "pand %%mm0, %%mm6 \n\t" /* R4 R3 R2 */\
\
    "por %%mm1, %%mm3 \n\t" /* B5 G4 B4 G3 B3 */\
    "por %%mm3, %%mm6 \n\t"\
    MOVNTQ(%%mm6, 8(dst))\
\
    "pshufw $0xFF, %%mm2, %%mm1 \n\t" /* B7 B6 B7 B6 B7 B6 B7 B6 */\
    "pshufw $0xFA, %%mm4, %%mm3 \n\t" /* 00 G7 00 G7 G6 G5 G6 G5 */\
    "pshufw $0xFA, %%mm5, %%mm6 \n\t" /* R7 R6 R7 R6 R5 R4 R5 R4 */\
\
    "pand %%mm7, %%mm1 \n\t" /* B7 B6 */\
    "pand %%mm0, %%mm3 \n\t" /* G7 G6 G5 */\
    "pand "MANGLE(M24B)", %%mm6 \n\t" /* R7 R6 R5 */\
\
    "por %%mm1, %%mm3 \n\t"\
    "por %%mm3, %%mm6 \n\t"\
    MOVNTQ(%%mm6, 16(dst))\
\
    "add $24, "#dst" \n\t"\
\
    "add $8, "#index" \n\t"\
    "cmp "dstw", "#index" \n\t"\
    " jb 1b \n\t"

#undef WRITEBGR24
#define WRITEBGR24(dst, dstw, index) WRITEBGR24MMXEXT(dst, dstw, index)
536 
#if HAVE_6REGS
/*
 * Multi-tap vertical scale + BGR24 output, accurate rounding. Needs a
 * sixth GP register (%%FF_REG_c) as an incrementable destination pointer
 * for WRITEBGR24 (dest + index*3), hence the HAVE_6REGS guard.
 * NOTE(review): extraction gap (orig lines 549-550) — presumably
 * YSCALEYUV2PACKEDX_ACCURATE / YSCALEYUV2RGBX before the pxor; verify
 * against upstream.
 */
static void RENAME(yuv2bgr24_X_ar)(SwsInternal *c, const int16_t *lumFilter,
                                   const int16_t **lumSrc, int lumFilterSize,
                                   const int16_t *chrFilter, const int16_t **chrUSrc,
                                   const int16_t **chrVSrc,
                                   int chrFilterSize, const int16_t **alpSrc,
                                   uint8_t *dest, int dstW, int dstY)
{
    x86_reg dummy=0;              /* placeholder for unused "m" asm operands */
    x86_reg dstW_reg = dstW;      /* width widened to register size for asm */
    x86_reg uv_off = c->uv_offx2; /* byte offset between U and V planes */

    "pxor %%mm7, %%mm7 \n\t"
    "lea (%%"FF_REG_a", %%"FF_REG_a", 2), %%"FF_REG_c"\n\t" /* index*3 */ //FIXME optimize
    "add %4, %%"FF_REG_c" \n\t" /* REG_c = dest + index*3 */
    WRITEBGR24(%%FF_REGc, "%5", %%FF_REGa)
    :: "r" (&c->redDither),
       "m" (dummy), "m" (dummy), "m" (dummy),
       "r" (dest), "m" (dstW_reg), "m"(uv_off)
       NAMED_CONSTRAINTS_ADD(M24A,M24C,M24B)
    : "%"FF_REG_a, "%"FF_REG_c, "%"FF_REG_d, "%"FF_REG_S
    );
}
562 
/*
 * Multi-tap vertical scale + BGR24 output (fast rounding); see the
 * accurate variant above for the register layout.
 * NOTE(review): extraction gap (orig lines 574-575) — presumably
 * YSCALEYUV2PACKEDX / YSCALEYUV2RGBX before the pxor; verify upstream.
 */
static void RENAME(yuv2bgr24_X)(SwsInternal *c, const int16_t *lumFilter,
                                const int16_t **lumSrc, int lumFilterSize,
                                const int16_t *chrFilter, const int16_t **chrUSrc,
                                const int16_t **chrVSrc,
                                int chrFilterSize, const int16_t **alpSrc,
                                uint8_t *dest, int dstW, int dstY)
{
    x86_reg dummy=0;              /* placeholder for unused "m" asm operands */
    x86_reg dstW_reg = dstW;      /* width widened to register size for asm */
    x86_reg uv_off = c->uv_offx2; /* byte offset between U and V planes */

    "pxor %%mm7, %%mm7 \n\t"
    "lea (%%"FF_REG_a", %%"FF_REG_a", 2), %%"FF_REG_c" \n\t" /* index*3 */ //FIXME optimize
    "add %4, %%"FF_REG_c" \n\t" /* REG_c = dest + index*3 */
    WRITEBGR24(%%FF_REGc, "%5", %%FF_REGa)
    :: "r" (&c->redDither),
       "m" (dummy), "m" (dummy), "m" (dummy),
       "r" (dest), "m" (dstW_reg), "m"(uv_off)
       NAMED_CONSTRAINTS_ADD(M24A,M24C,M24B)
    : "%"FF_REG_a, "%"FF_REG_c, "%"FF_REG_d, "%"FF_REG_S
    );
}
#endif /* HAVE_6REGS */
588 
/*
 * Packs 8 luma words (mm1/mm7) and 4 U (mm3) / 4 V (mm4) words into 16
 * bytes of YUYV (Y0 U0 Y1 V0 ...), stores them at dst + index*2, then
 * advances `index` by 8 and loops to "1:" while index < dstw.
 */
#define REAL_WRITEYUY2(dst, dstw, index) \
    "packuswb %%mm3, %%mm3 \n\t" /* U bytes */\
    "packuswb %%mm4, %%mm4 \n\t" /* V bytes */\
    "packuswb %%mm7, %%mm1 \n\t" /* 8 Y bytes */\
    "punpcklbw %%mm4, %%mm3 \n\t" /* UVUV... */\
    "movq %%mm1, %%mm7 \n\t"\
    "punpcklbw %%mm3, %%mm1 \n\t" /* YUYV 0-3 */\
    "punpckhbw %%mm3, %%mm7 \n\t" /* YUYV 4-7 */\
\
    MOVNTQ(%%mm1, (dst, index, 2))\
    MOVNTQ(%%mm7, 8(dst, index, 2))\
\
    "add $8, "#index" \n\t"\
    "cmp "dstw", "#index" \n\t"\
    " jb 1b \n\t"
#define WRITEYUY2(dst, dstw, index) REAL_WRITEYUY2(dst, dstw, index)
605 
/*
 * Multi-tap vertical scale straight to packed YUYV422 (no RGB convert),
 * accurate rounding. The psraw $3 drops the filter headroom before the
 * pack in WRITEYUY2.
 * NOTE(review): extraction gaps (orig line 617, 624) — presumably
 * YSCALEYUV2PACKEDX_ACCURATE and YSCALEYUV2PACKEDX_END; verify upstream.
 */
static void RENAME(yuv2yuyv422_X_ar)(SwsInternal *c, const int16_t *lumFilter,
                                     const int16_t **lumSrc, int lumFilterSize,
                                     const int16_t *chrFilter, const int16_t **chrUSrc,
                                     const int16_t **chrVSrc,
                                     int chrFilterSize, const int16_t **alpSrc,
                                     uint8_t *dest, int dstW, int dstY)
{
    x86_reg dummy=0;              /* placeholder for unused "m" asm operands */
    x86_reg dstW_reg = dstW;      /* width widened to register size for asm */
    x86_reg uv_off = c->uv_offx2; /* byte offset between U and V planes */

    /* mm2=B, %%mm4=G, %%mm5=R, %%mm7=0 */
    "psraw $3, %%mm3 \n\t"
    "psraw $3, %%mm4 \n\t"
    "psraw $3, %%mm1 \n\t"
    "psraw $3, %%mm7 \n\t"
    WRITEYUY2(%4, "%5", %%FF_REGa)
}
626 
/*
 * Multi-tap vertical scale straight to packed YUYV422 (fast rounding).
 * NOTE(review): extraction gaps (orig line 638, 645) — presumably
 * YSCALEYUV2PACKEDX and YSCALEYUV2PACKEDX_END; verify upstream.
 */
static void RENAME(yuv2yuyv422_X)(SwsInternal *c, const int16_t *lumFilter,
                                  const int16_t **lumSrc, int lumFilterSize,
                                  const int16_t *chrFilter, const int16_t **chrUSrc,
                                  const int16_t **chrVSrc,
                                  int chrFilterSize, const int16_t **alpSrc,
                                  uint8_t *dest, int dstW, int dstY)
{
    x86_reg dummy=0;              /* placeholder for unused "m" asm operands */
    x86_reg dstW_reg = dstW;      /* width widened to register size for asm */
    x86_reg uv_off = c->uv_offx2; /* byte offset between U and V planes */

    /* mm2=B, %%mm4=G, %%mm5=R, %%mm7=0 */
    "psraw $3, %%mm3 \n\t"
    "psraw $3, %%mm4 \n\t"
    "psraw $3, %%mm1 \n\t"
    "psraw $3, %%mm7 \n\t"
    WRITEYUY2(%4, "%5", %%FF_REGa)
}
647 
/*
 * Two-tap (bilinear) vertical chroma interpolation for the *_2 output
 * functions: blends uvbuf0 (%2) and uvbuf1 (%3) with the chroma alpha at
 * CHR_MMX_FILTER_OFFSET+8, subtracts the 128 bias and premultiplies the
 * green coefficients. Opens the pixel loop (label "1:", index reset).
 * Out: mm2=(U-128)8, mm3=ug, mm4=vg, mm5=(V-128)8.
 * UV_OFF_BYTE(c) is the byte distance from the U to the V samples.
 */
#define REAL_YSCALEYUV2RGB_UV(index, c) \
    "xor "#index", "#index" \n\t"\
    ".p2align 4 \n\t"\
    "1: \n\t"\
    "movq (%2, "#index"), %%mm2 \n\t" /* uvbuf0[eax]*/\
    "movq (%3, "#index"), %%mm3 \n\t" /* uvbuf1[eax]*/\
    "add "UV_OFF_BYTE"("#c"), "#index" \n\t" \
    "movq (%2, "#index"), %%mm5 \n\t" /* uvbuf0[eax+2048]*/\
    "movq (%3, "#index"), %%mm4 \n\t" /* uvbuf1[eax+2048]*/\
    "sub "UV_OFF_BYTE"("#c"), "#index" \n\t" \
    "psubw %%mm3, %%mm2 \n\t" /* uvbuf0[eax] - uvbuf1[eax]*/\
    "psubw %%mm4, %%mm5 \n\t" /* uvbuf0[eax+2048] - uvbuf1[eax+2048]*/\
    "movq "CHR_MMX_FILTER_OFFSET"+8("#c"), %%mm0 \n\t"\
    "pmulhw %%mm0, %%mm2 \n\t" /* (uvbuf0[eax] - uvbuf1[eax])uvalpha1>>16*/\
    "pmulhw %%mm0, %%mm5 \n\t" /* (uvbuf0[eax+2048] - uvbuf1[eax+2048])uvalpha1>>16*/\
    "psraw $4, %%mm3 \n\t" /* uvbuf0[eax] - uvbuf1[eax] >>4*/\
    "psraw $4, %%mm4 \n\t" /* uvbuf0[eax+2048] - uvbuf1[eax+2048] >>4*/\
    "paddw %%mm2, %%mm3 \n\t" /* uvbuf0[eax]uvalpha1 - uvbuf1[eax](1-uvalpha1)*/\
    "paddw %%mm5, %%mm4 \n\t" /* uvbuf0[eax+2048]uvalpha1 - uvbuf1[eax+2048](1-uvalpha1)*/\
    "psubw "U_OFFSET"("#c"), %%mm3 \n\t" /* (U-128)8*/\
    "psubw "V_OFFSET"("#c"), %%mm4 \n\t" /* (V-128)8*/\
    "movq %%mm3, %%mm2 \n\t" /* (U-128)8*/\
    "movq %%mm4, %%mm5 \n\t" /* (V-128)8*/\
    "pmulhw "UG_COEFF"("#c"), %%mm3 \n\t"\
    "pmulhw "VG_COEFF"("#c"), %%mm4 \n\t"\
    /* mm2=(U-128)8, mm3=ug, mm4=vg mm5=(V-128)8 */\

/*
 * Two-tap (bilinear) vertical luma interpolation: blends b1/b2 (two line
 * buffers of 16-bit samples) with the luma alpha at LUM_MMX_FILTER+8.
 * Out: mm1 = Y for pixels 0-3, mm7 = Y for pixels 4-7.
 */
#define REAL_YSCALEYUV2RGB_YA(index, c, b1, b2) \
    "movq ("#b1", "#index", 2), %%mm0 \n\t" /*buf0[eax]*/\
    "movq ("#b2", "#index", 2), %%mm1 \n\t" /*buf1[eax]*/\
    "movq 8("#b1", "#index", 2), %%mm6 \n\t" /*buf0[eax]*/\
    "movq 8("#b2", "#index", 2), %%mm7 \n\t" /*buf1[eax]*/\
    "psubw %%mm1, %%mm0 \n\t" /* buf0[eax] - buf1[eax]*/\
    "psubw %%mm7, %%mm6 \n\t" /* buf0[eax] - buf1[eax]*/\
    "pmulhw "LUM_MMX_FILTER_OFFSET"+8("#c"), %%mm0 \n\t" /* (buf0[eax] - buf1[eax])yalpha1>>16*/\
    "pmulhw "LUM_MMX_FILTER_OFFSET"+8("#c"), %%mm6 \n\t" /* (buf0[eax] - buf1[eax])yalpha1>>16*/\
    "psraw $4, %%mm1 \n\t" /* buf0[eax] - buf1[eax] >>4*/\
    "psraw $4, %%mm7 \n\t" /* buf0[eax] - buf1[eax] >>4*/\
    "paddw %%mm0, %%mm1 \n\t" /* buf0[eax]yalpha1 + buf1[eax](1-yalpha1) >>16*/\
    "paddw %%mm6, %%mm7 \n\t" /* buf0[eax]yalpha1 + buf1[eax](1-yalpha1) >>16*/\

/*
 * Finishes the two-tap path's YUV->RGB conversion: same math as the
 * second half of YSCALEYUV2RGBX but with the context base passed as `c`
 * instead of %0. In: mm1/mm7 = Y1/Y2, mm2/mm5 = biased U/V, mm3/mm4 =
 * ug/vg. Out: mm2 = B, mm4 = G, mm5 = R (8 bytes each).
 */
#define REAL_YSCALEYUV2RGB_COEFF(c) \
    "pmulhw "UB_COEFF"("#c"), %%mm2 \n\t"\
    "pmulhw "VR_COEFF"("#c"), %%mm5 \n\t"\
    "psubw "Y_OFFSET"("#c"), %%mm1 \n\t" /* 8(Y-16)*/\
    "psubw "Y_OFFSET"("#c"), %%mm7 \n\t" /* 8(Y-16)*/\
    "pmulhw "Y_COEFF"("#c"), %%mm1 \n\t"\
    "pmulhw "Y_COEFF"("#c"), %%mm7 \n\t"\
    /* mm1= Y1, mm2=ub, mm3=ug, mm4=vg mm5=vr, mm7=Y2 */\
    "paddw %%mm3, %%mm4 \n\t" /* green = ug + vg */\
    "movq %%mm2, %%mm0 \n\t"\
    "movq %%mm5, %%mm6 \n\t"\
    "movq %%mm4, %%mm3 \n\t"\
    "punpcklwd %%mm2, %%mm2 \n\t" /* duplicate chroma for pixel pairs */\
    "punpcklwd %%mm5, %%mm5 \n\t"\
    "punpcklwd %%mm4, %%mm4 \n\t"\
    "paddw %%mm1, %%mm2 \n\t"\
    "paddw %%mm1, %%mm5 \n\t"\
    "paddw %%mm1, %%mm4 \n\t"\
    "punpckhwd %%mm0, %%mm0 \n\t"\
    "punpckhwd %%mm6, %%mm6 \n\t"\
    "punpckhwd %%mm3, %%mm3 \n\t"\
    "paddw %%mm7, %%mm0 \n\t"\
    "paddw %%mm7, %%mm6 \n\t"\
    "paddw %%mm7, %%mm3 \n\t"\
    /* mm0=B1, mm2=B2, mm3=G2, mm4=G1, mm5=R1, mm6=R2 */\
    "packuswb %%mm0, %%mm2 \n\t"\
    "packuswb %%mm6, %%mm5 \n\t"\
    "packuswb %%mm3, %%mm4 \n\t"\

#define YSCALEYUV2RGB_YA(index, c, b1, b2) REAL_YSCALEYUV2RGB_YA(index, c, b1, b2)

/* Complete two-tap vertical scale + YUV->RGB: luma buffers are asm
 * operands %0/%1, chroma buffers %2/%3, context base `c`. */
#define YSCALEYUV2RGB(index, c) \
    REAL_YSCALEYUV2RGB_UV(index, c) \
    REAL_YSCALEYUV2RGB_YA(index, c, %0, %1) \
    REAL_YSCALEYUV2RGB_COEFF(c)
724 
/**
 * vertical bilinear scale YV12 to RGB
 *
 * Two-tap blend of buf[0]/buf[1] (luma), ubuf[0]/ubuf[1] (chroma) and,
 * when alpha is enabled, abuf[0]/abuf[1], written out as 32-bit BGRA.
 * On x86-64 the extra registers hold the alpha buffers directly; on
 * x86-32 they are spilled to c->u_temp/c->v_temp and pushed/popped around
 * the alpha blend because no free registers remain.
 */
static void RENAME(yuv2rgb32_2)(SwsInternal *c, const int16_t *buf[2],
                                const int16_t *ubuf[2], const int16_t *vbuf[2],
                                const int16_t *abuf[2], uint8_t *dest,
                                int dstW, int yalpha, int uvalpha, int y)
{
    const int16_t *buf0 = buf[0], *buf1 = buf[1],
                  *ubuf0 = ubuf[0], *ubuf1 = ubuf[1];

    if (CONFIG_SWSCALE_ALPHA && c->needAlpha) {
        const int16_t *abuf0 = abuf[0], *abuf1 = abuf[1];
#if ARCH_X86_64
        __asm__ volatile(
            YSCALEYUV2RGB(%%r8, %5)               /* B/G/R in mm2/mm4/mm5 */
            YSCALEYUV2RGB_YA(%%r8, %5, %6, %7)    /* alpha blend into mm1/mm7 */
            "psraw $3, %%mm1 \n\t" /* abuf0[eax] - abuf1[eax] >>7*/
            "psraw $3, %%mm7 \n\t" /* abuf0[eax] - abuf1[eax] >>7*/
            "packuswb %%mm7, %%mm1 \n\t"
            WRITEBGR32(%4, DSTW_OFFSET"(%5)", %%r8, %%mm2, %%mm4, %%mm5, %%mm1, %%mm0, %%mm7, %%mm3, %%mm6)
            :: "c" (buf0), "d" (buf1), "S" (ubuf0), "D" (ubuf1), "r" (dest),
               "a" (&c->redDither),
               "r" (abuf0), "r" (abuf1)
            : "%r8"
        );
#else
        /* x86-32: stash the alpha pointers in the context; they are
         * reloaded inside the asm via U_TEMP/V_TEMP. */
        c->u_temp=(intptr_t)abuf0;
        c->v_temp=(intptr_t)abuf1;
        __asm__ volatile(
            "mov %%"FF_REG_b", "ESP_OFFSET"(%5) \n\t" /* save ebx (PIC reg) */
            "mov %4, %%"FF_REG_b" \n\t"               /* ebx = dest */
            "push %%"FF_REG_BP" \n\t"                 /* ebp becomes the index */
            YSCALEYUV2RGB(%%FF_REGBP, %5)
            "push %0 \n\t"
            "push %1 \n\t"
            "mov "U_TEMP"(%5), %0 \n\t"               /* %0/%1 now alpha bufs */
            "mov "V_TEMP"(%5), %1 \n\t"
            YSCALEYUV2RGB_YA(%%FF_REGBP, %5, %0, %1)
            "psraw $3, %%mm1 \n\t" /* abuf0[eax] - abuf1[eax] >>7*/
            "psraw $3, %%mm7 \n\t" /* abuf0[eax] - abuf1[eax] >>7*/
            "packuswb %%mm7, %%mm1 \n\t"
            "pop %1 \n\t"
            "pop %0 \n\t"
            WRITEBGR32(%%FF_REGb, DSTW_OFFSET"(%5)", %%FF_REGBP, %%mm2, %%mm4, %%mm5, %%mm1, %%mm0, %%mm7, %%mm3, %%mm6)
            "pop %%"FF_REG_BP" \n\t"
            "mov "ESP_OFFSET"(%5), %%"FF_REG_b" \n\t"
            :: "c" (buf0), "d" (buf1), "S" (ubuf0), "D" (ubuf1), "m" (dest),
               "a" (&c->redDither)
        );
#endif
    } else {
        __asm__ volatile(
            "mov %%"FF_REG_b", "ESP_OFFSET"(%5) \n\t"
            "mov %4, %%"FF_REG_b" \n\t"
            "push %%"FF_REG_BP" \n\t"
            YSCALEYUV2RGB(%%FF_REGBP, %5)
            "pcmpeqd %%mm7, %%mm7 \n\t" /* opaque alpha */
            WRITEBGR32(%%FF_REGb, DSTW_OFFSET"(%5)", %%FF_REGBP, %%mm2, %%mm4, %%mm5, %%mm7, %%mm0, %%mm1, %%mm3, %%mm6)
            "pop %%"FF_REG_BP" \n\t"
            "mov "ESP_OFFSET"(%5), %%"FF_REG_b" \n\t"
            :: "c" (buf0), "d" (buf1), "S" (ubuf0), "D" (ubuf1), "m" (dest),
               "a" (&c->redDither)
        );
    }
}
791 
/*
 * Vertical bilinear scale YV12 to packed BGR24. ebx/rbx is saved to the
 * context (ESP_OFFSET) and repurposed as the destination pointer because
 * WRITEBGR24 needs an incrementable register; ebp is pushed and used as
 * the pixel index.
 */
static void RENAME(yuv2bgr24_2)(SwsInternal *c, const int16_t *buf[2],
                                const int16_t *ubuf[2], const int16_t *vbuf[2],
                                const int16_t *abuf[2], uint8_t *dest,
                                int dstW, int yalpha, int uvalpha, int y)
{
    const int16_t *buf0 = buf[0], *buf1 = buf[1],
                  *ubuf0 = ubuf[0], *ubuf1 = ubuf[1];

    __asm__ volatile(
        "mov %%"FF_REG_b", "ESP_OFFSET"(%5) \n\t"
        "mov %4, %%"FF_REG_b" \n\t"
        "push %%"FF_REG_BP" \n\t"
        YSCALEYUV2RGB(%%FF_REGBP, %5)
        "pxor %%mm7, %%mm7 \n\t"
        WRITEBGR24(%%FF_REGb, DSTW_OFFSET"(%5)", %%FF_REGBP)
        "pop %%"FF_REG_BP" \n\t"
        "mov "ESP_OFFSET"(%5), %%"FF_REG_b" \n\t"
        :: "c" (buf0), "d" (buf1), "S" (ubuf0), "D" (ubuf1), "m" (dest),
           "a" (&c->redDither)
           NAMED_CONSTRAINTS_ADD(M24A,M24C,M24B)
    );
}
814 
/*
 * Vertical bilinear scale YV12 to dithered RGB555.
 * NOTE(review): the extraction dropped one original line (838) before the
 * closing paren — presumably NAMED_CONSTRAINTS_ADD(bF8), needed for the
 * bF8 mask used by WRITERGB15; verify against upstream.
 */
static void RENAME(yuv2rgb555_2)(SwsInternal *c, const int16_t *buf[2],
                                 const int16_t *ubuf[2], const int16_t *vbuf[2],
                                 const int16_t *abuf[2], uint8_t *dest,
                                 int dstW, int yalpha, int uvalpha, int y)
{
    const int16_t *buf0 = buf[0], *buf1 = buf[1],
                  *ubuf0 = ubuf[0], *ubuf1 = ubuf[1];

    __asm__ volatile(
        "mov %%"FF_REG_b", "ESP_OFFSET"(%5) \n\t"
        "mov %4, %%"FF_REG_b" \n\t"
        "push %%"FF_REG_BP" \n\t"
        YSCALEYUV2RGB(%%FF_REGBP, %5)
        "pxor %%mm7, %%mm7 \n\t"
        /* mm2=B, %%mm4=G, %%mm5=R, %%mm7=0 */
        "paddusb "BLUE_DITHER"(%5), %%mm2 \n\t"
        "paddusb "GREEN_DITHER"(%5), %%mm4 \n\t"
        "paddusb "RED_DITHER"(%5), %%mm5 \n\t"
        WRITERGB15(%%FF_REGb, DSTW_OFFSET"(%5)", %%FF_REGBP)
        "pop %%"FF_REG_BP" \n\t"
        "mov "ESP_OFFSET"(%5), %%"FF_REG_b" \n\t"
        :: "c" (buf0), "d" (buf1), "S" (ubuf0), "D" (ubuf1), "m" (dest),
           "a" (&c->redDither)
    );
}
841 
/*
 * Vertical bilinear scale YV12 to dithered RGB565. Same register dance
 * as yuv2bgr24_2 (ebx saved to context, ebp as index).
 */
static void RENAME(yuv2rgb565_2)(SwsInternal *c, const int16_t *buf[2],
                                 const int16_t *ubuf[2], const int16_t *vbuf[2],
                                 const int16_t *abuf[2], uint8_t *dest,
                                 int dstW, int yalpha, int uvalpha, int y)
{
    const int16_t *buf0 = buf[0], *buf1 = buf[1],
                  *ubuf0 = ubuf[0], *ubuf1 = ubuf[1];

    __asm__ volatile(
        "mov %%"FF_REG_b", "ESP_OFFSET"(%5) \n\t"
        "mov %4, %%"FF_REG_b" \n\t"
        "push %%"FF_REG_BP" \n\t"
        YSCALEYUV2RGB(%%FF_REGBP, %5)
        "pxor %%mm7, %%mm7 \n\t"
        /* mm2=B, %%mm4=G, %%mm5=R, %%mm7=0 */
        "paddusb "BLUE_DITHER"(%5), %%mm2 \n\t"
        "paddusb "GREEN_DITHER"(%5), %%mm4 \n\t"
        "paddusb "RED_DITHER"(%5), %%mm5 \n\t"
        WRITERGB16(%%FF_REGb, DSTW_OFFSET"(%5)", %%FF_REGBP)
        "pop %%"FF_REG_BP" \n\t"
        "mov "ESP_OFFSET"(%5), %%"FF_REG_b" \n\t"
        :: "c" (buf0), "d" (buf1), "S" (ubuf0), "D" (ubuf1), "m" (dest),
           "a" (&c->redDither)
           NAMED_CONSTRAINTS_ADD(bF8,bFC)
    );
}
868 
/*
 * Two-tap vertical blend that keeps Y/U/V (no RGB conversion), for the
 * packed-YUV output path. First pre-shifts both blend coefficients right
 * by 3 in place (the result is packed straight to bytes, so less headroom
 * is needed), then blends chroma into mm3/mm4 and luma into mm1/mm7 with
 * >>7 instead of the RGB path's >>4.
 */
#define REAL_YSCALEYUV2PACKED(index, c) \
    "movq "CHR_MMX_FILTER_OFFSET"+8("#c"), %%mm0 \n\t"\
    "movq "LUM_MMX_FILTER_OFFSET"+8("#c"), %%mm1 \n\t"\
    "psraw $3, %%mm0 \n\t"\
    "psraw $3, %%mm1 \n\t"\
    "movq %%mm0, "CHR_MMX_FILTER_OFFSET"+8("#c") \n\t"\
    "movq %%mm1, "LUM_MMX_FILTER_OFFSET"+8("#c") \n\t"\
    "xor "#index", "#index" \n\t"\
    ".p2align 4 \n\t"\
    "1: \n\t"\
    "movq (%2, "#index"), %%mm2 \n\t" /* uvbuf0[eax]*/\
    "movq (%3, "#index"), %%mm3 \n\t" /* uvbuf1[eax]*/\
    "add "UV_OFF_BYTE"("#c"), "#index" \n\t" \
    "movq (%2, "#index"), %%mm5 \n\t" /* uvbuf0[eax+2048]*/\
    "movq (%3, "#index"), %%mm4 \n\t" /* uvbuf1[eax+2048]*/\
    "sub "UV_OFF_BYTE"("#c"), "#index" \n\t" \
    "psubw %%mm3, %%mm2 \n\t" /* uvbuf0[eax] - uvbuf1[eax]*/\
    "psubw %%mm4, %%mm5 \n\t" /* uvbuf0[eax+2048] - uvbuf1[eax+2048]*/\
    "movq "CHR_MMX_FILTER_OFFSET"+8("#c"), %%mm0 \n\t"\
    "pmulhw %%mm0, %%mm2 \n\t" /* (uvbuf0[eax] - uvbuf1[eax])uvalpha1>>16*/\
    "pmulhw %%mm0, %%mm5 \n\t" /* (uvbuf0[eax+2048] - uvbuf1[eax+2048])uvalpha1>>16*/\
    "psraw $7, %%mm3 \n\t" /* uvbuf0[eax] - uvbuf1[eax] >>7*/\
    "psraw $7, %%mm4 \n\t" /* uvbuf0[eax+2048] - uvbuf1[eax+2048] >>7*/\
    "paddw %%mm2, %%mm3 \n\t" /* uvbuf0[eax]uvalpha1 - uvbuf1[eax](1-uvalpha1)*/\
    "paddw %%mm5, %%mm4 \n\t" /* uvbuf0[eax+2048]uvalpha1 - uvbuf1[eax+2048](1-uvalpha1)*/\
    "movq (%0, "#index", 2), %%mm0 \n\t" /*buf0[eax]*/\
    "movq (%1, "#index", 2), %%mm1 \n\t" /*buf1[eax]*/\
    "movq 8(%0, "#index", 2), %%mm6 \n\t" /*buf0[eax]*/\
    "movq 8(%1, "#index", 2), %%mm7 \n\t" /*buf1[eax]*/\
    "psubw %%mm1, %%mm0 \n\t" /* buf0[eax] - buf1[eax]*/\
    "psubw %%mm7, %%mm6 \n\t" /* buf0[eax] - buf1[eax]*/\
    "pmulhw "LUM_MMX_FILTER_OFFSET"+8("#c"), %%mm0 \n\t" /* (buf0[eax] - buf1[eax])yalpha1>>16*/\
    "pmulhw "LUM_MMX_FILTER_OFFSET"+8("#c"), %%mm6 \n\t" /* (buf0[eax] - buf1[eax])yalpha1>>16*/\
    "psraw $7, %%mm1 \n\t" /* buf0[eax] - buf1[eax] >>7*/\
    "psraw $7, %%mm7 \n\t" /* buf0[eax] - buf1[eax] >>7*/\
    "paddw %%mm0, %%mm1 \n\t" /* buf0[eax]yalpha1 + buf1[eax](1-yalpha1) >>16*/\
    "paddw %%mm6, %%mm7 \n\t" /* buf0[eax]yalpha1 + buf1[eax](1-yalpha1) >>16*/\

#define YSCALEYUV2PACKED(index, c) REAL_YSCALEYUV2PACKED(index, c)
908 
/**
 * Output one line of packed YUYV422, vertically blending two luma rows
 * (buf[0]/buf[1]) and two chroma rows (ubuf[0]/ubuf[1]); V is reached
 * from the U pointer via UV_OFF_BYTE inside YSCALEYUV2PACKED.
 */
909 static void RENAME(yuv2yuyv422_2)(SwsInternal *c, const int16_t *buf[2],
910  const int16_t *ubuf[2], const int16_t *vbuf[2],
911  const int16_t *abuf[2], uint8_t *dest,
912  int dstW, int yalpha, int uvalpha, int y)
913 {
914  const int16_t *buf0 = buf[0], *buf1 = buf[1],
915  *ubuf0 = ubuf[0], *ubuf1 = ubuf[1];
916 
917  __asm__ volatile(
918  "mov %%"FF_REG_b", "ESP_OFFSET"(%5) \n\t" /* spill FF_REG_b so it can hold dest */
919  "mov %4, %%"FF_REG_b" \n\t"
920  "push %%"FF_REG_BP" \n\t" /* FF_REG_BP is used as the loop index */
921  YSCALEYUV2PACKED(%%FF_REGBP, %5)
922  WRITEYUY2(%%FF_REGb, DSTW_OFFSET"(%5)", %%FF_REGBP)
923  "pop %%"FF_REG_BP" \n\t"
924  "mov "ESP_OFFSET"(%5), %%"FF_REG_b" \n\t" /* restore spilled FF_REG_b */
925  :: "c" (buf0), "d" (buf1), "S" (ubuf0), "D" (ubuf1), "m" (dest),
926  "a" (&c->redDither)
927  );
928 }
929 
/* Convert one YV12 row to RGB without vertical interpolation: chroma and
 * luma each come from a single source row (uvbuf0 = %2, buf0 = %0).
 * Leaves 8 pixels of packed bytes: B in mm2, G in mm4, R in mm5. */
930 #define REAL_YSCALEYUV2RGB1(index, c) \
931  "xor "#index", "#index" \n\t"\
932  ".p2align 4 \n\t"\
933  "1: \n\t"\
934  "movq (%2, "#index"), %%mm3 \n\t" /* uvbuf0[eax]*/\
935  "add "UV_OFF_BYTE"("#c"), "#index" \n\t" \
936  "movq (%2, "#index"), %%mm4 \n\t" /* uvbuf0[eax+2048]*/\
937  "sub "UV_OFF_BYTE"("#c"), "#index" \n\t" \
938  "psraw $4, %%mm3 \n\t" /* uvbuf0[eax] >>4*/\
939  "psraw $4, %%mm4 \n\t" /* uvbuf0[eax+2048] >>4*/\
940  "psubw "U_OFFSET"("#c"), %%mm3 \n\t" /* (U-128)8*/\
941  "psubw "V_OFFSET"("#c"), %%mm4 \n\t" /* (V-128)8*/\
942  "movq %%mm3, %%mm2 \n\t" /* (U-128)8*/\
943  "movq %%mm4, %%mm5 \n\t" /* (V-128)8*/\
944  "pmulhw "UG_COEFF"("#c"), %%mm3 \n\t"\
945  "pmulhw "VG_COEFF"("#c"), %%mm4 \n\t"\
946  /* mm2=(U-128)8, mm3=ug, mm4=vg mm5=(V-128)8 */\
947  "movq (%0, "#index", 2), %%mm1 \n\t" /*buf0[eax]*/\
948  "movq 8(%0, "#index", 2), %%mm7 \n\t" /*buf0[eax+4]*/\
949  "psraw $4, %%mm1 \n\t" /* buf0[eax] >>4*/\
950  "psraw $4, %%mm7 \n\t" /* buf0[eax+4] >>4*/\
951  "pmulhw "UB_COEFF"("#c"), %%mm2 \n\t"\
952  "pmulhw "VR_COEFF"("#c"), %%mm5 \n\t"\
953  "psubw "Y_OFFSET"("#c"), %%mm1 \n\t" /* 8(Y-16)*/\
954  "psubw "Y_OFFSET"("#c"), %%mm7 \n\t" /* 8(Y-16)*/\
955  "pmulhw "Y_COEFF"("#c"), %%mm1 \n\t"\
956  "pmulhw "Y_COEFF"("#c"), %%mm7 \n\t"\
957  /* mm1= Y1, mm2=ub, mm3=ug, mm4=vg mm5=vr, mm7=Y2 */\
958  "paddw %%mm3, %%mm4 \n\t"\
959  "movq %%mm2, %%mm0 \n\t"\
960  "movq %%mm5, %%mm6 \n\t"\
961  "movq %%mm4, %%mm3 \n\t"\
962  "punpcklwd %%mm2, %%mm2 \n\t"\
963  "punpcklwd %%mm5, %%mm5 \n\t"\
964  "punpcklwd %%mm4, %%mm4 \n\t"\
965  "paddw %%mm1, %%mm2 \n\t"\
966  "paddw %%mm1, %%mm5 \n\t"\
967  "paddw %%mm1, %%mm4 \n\t"\
968  "punpckhwd %%mm0, %%mm0 \n\t"\
969  "punpckhwd %%mm6, %%mm6 \n\t"\
970  "punpckhwd %%mm3, %%mm3 \n\t"\
971  "paddw %%mm7, %%mm0 \n\t"\
972  "paddw %%mm7, %%mm6 \n\t"\
973  "paddw %%mm7, %%mm3 \n\t"\
974  /* mm0=B1, mm2=B2, mm3=G2, mm4=G1, mm5=R1, mm6=R2 */\
975  "packuswb %%mm0, %%mm2 \n\t"\
976  "packuswb %%mm6, %%mm5 \n\t"\
977  "packuswb %%mm3, %%mm4 \n\t"\
978 
979 #define YSCALEYUV2RGB1(index, c) REAL_YSCALEYUV2RGB1(index, c)
980 
981 // do vertical chrominance interpolation
/* Same as YSCALEYUV2RGB1 but averages the two chroma rows (%2/%3):
 * (uvbuf0 + uvbuf1) >> 5 combines the /2 average with the >>4 scaling. */
982 #define REAL_YSCALEYUV2RGB1b(index, c) \
983  "xor "#index", "#index" \n\t"\
984  ".p2align 4 \n\t"\
985  "1: \n\t"\
986  "movq (%2, "#index"), %%mm2 \n\t" /* uvbuf0[eax]*/\
987  "movq (%3, "#index"), %%mm3 \n\t" /* uvbuf1[eax]*/\
988  "add "UV_OFF_BYTE"("#c"), "#index" \n\t" \
989  "movq (%2, "#index"), %%mm5 \n\t" /* uvbuf0[eax+2048]*/\
990  "movq (%3, "#index"), %%mm4 \n\t" /* uvbuf1[eax+2048]*/\
991  "sub "UV_OFF_BYTE"("#c"), "#index" \n\t" \
992  "paddw %%mm2, %%mm3 \n\t" /* uvbuf0[eax] + uvbuf1[eax]*/\
993  "paddw %%mm5, %%mm4 \n\t" /* uvbuf0[eax+2048] + uvbuf1[eax+2048]*/\
994  "psrlw $5, %%mm3 \n\t" /*FIXME might overflow*/\
995  "psrlw $5, %%mm4 \n\t" /*FIXME might overflow*/\
996  "psubw "U_OFFSET"("#c"), %%mm3 \n\t" /* (U-128)8*/\
997  "psubw "V_OFFSET"("#c"), %%mm4 \n\t" /* (V-128)8*/\
998  "movq %%mm3, %%mm2 \n\t" /* (U-128)8*/\
999  "movq %%mm4, %%mm5 \n\t" /* (V-128)8*/\
1000  "pmulhw "UG_COEFF"("#c"), %%mm3 \n\t"\
1001  "pmulhw "VG_COEFF"("#c"), %%mm4 \n\t"\
1002  /* mm2=(U-128)8, mm3=ug, mm4=vg mm5=(V-128)8 */\
1003  "movq (%0, "#index", 2), %%mm1 \n\t" /*buf0[eax]*/\
1004  "movq 8(%0, "#index", 2), %%mm7 \n\t" /*buf0[eax+4]*/\
1005  "psraw $4, %%mm1 \n\t" /* buf0[eax] >>4*/\
1006  "psraw $4, %%mm7 \n\t" /* buf0[eax+4] >>4*/\
1007  "pmulhw "UB_COEFF"("#c"), %%mm2 \n\t"\
1008  "pmulhw "VR_COEFF"("#c"), %%mm5 \n\t"\
1009  "psubw "Y_OFFSET"("#c"), %%mm1 \n\t" /* 8(Y-16)*/\
1010  "psubw "Y_OFFSET"("#c"), %%mm7 \n\t" /* 8(Y-16)*/\
1011  "pmulhw "Y_COEFF"("#c"), %%mm1 \n\t"\
1012  "pmulhw "Y_COEFF"("#c"), %%mm7 \n\t"\
1013  /* mm1= Y1, mm2=ub, mm3=ug, mm4=vg mm5=vr, mm7=Y2 */\
1014  "paddw %%mm3, %%mm4 \n\t"\
1015  "movq %%mm2, %%mm0 \n\t"\
1016  "movq %%mm5, %%mm6 \n\t"\
1017  "movq %%mm4, %%mm3 \n\t"\
1018  "punpcklwd %%mm2, %%mm2 \n\t"\
1019  "punpcklwd %%mm5, %%mm5 \n\t"\
1020  "punpcklwd %%mm4, %%mm4 \n\t"\
1021  "paddw %%mm1, %%mm2 \n\t"\
1022  "paddw %%mm1, %%mm5 \n\t"\
1023  "paddw %%mm1, %%mm4 \n\t"\
1024  "punpckhwd %%mm0, %%mm0 \n\t"\
1025  "punpckhwd %%mm6, %%mm6 \n\t"\
1026  "punpckhwd %%mm3, %%mm3 \n\t"\
1027  "paddw %%mm7, %%mm0 \n\t"\
1028  "paddw %%mm7, %%mm6 \n\t"\
1029  "paddw %%mm7, %%mm3 \n\t"\
1030  /* mm0=B1, mm2=B2, mm3=G2, mm4=G1, mm5=R1, mm6=R2 */\
1031  "packuswb %%mm0, %%mm2 \n\t"\
1032  "packuswb %%mm6, %%mm5 \n\t"\
1033  "packuswb %%mm3, %%mm4 \n\t"\
1034 
1035 #define YSCALEYUV2RGB1b(index, c) REAL_YSCALEYUV2RGB1b(index, c)
1036 
/* Load 8 alpha samples from abuf0 (operand %1), scale them >>7 and pack
 * the result to unsigned bytes in mm7 (consumed by WRITEBGR32). */
1037 #define REAL_YSCALEYUV2RGB1_ALPHA(index) \
1038  "movq (%1, "#index", 2), %%mm7 \n\t" /* abuf0[index ] */\
1039  "movq 8(%1, "#index", 2), %%mm1 \n\t" /* abuf0[index+4] */\
1040  "psraw $7, %%mm7 \n\t" /* abuf0[index ] >>7 */\
1041  "psraw $7, %%mm1 \n\t" /* abuf0[index+4] >>7 */\
1042  "packuswb %%mm1, %%mm7 \n\t"
1043 #define YSCALEYUV2RGB1_ALPHA(index) REAL_YSCALEYUV2RGB1_ALPHA(index)
1044 
1045 /**
1046  * YV12 to RGB without scaling or interpolating.
 * Converts one output line from a single luma row (buf0). Chroma is taken
 * from one row when uvalpha < 2048, otherwise the two chroma rows are
 * averaged (YSCALEYUV2RGB1b). When the context carries an alpha plane it
 * is packed from abuf0; otherwise alpha is forced to all-ones (pcmpeqd).
1047  */
1048 static void RENAME(yuv2rgb32_1)(SwsInternal *c, const int16_t *buf0,
1049  const int16_t *ubuf[2], const int16_t *vbuf[2],
1050  const int16_t *abuf0, uint8_t *dest,
1051  int dstW, int uvalpha, int y)
1052 {
1053  const int16_t *ubuf0 = ubuf[0];
1054  const int16_t *buf1= buf0; //FIXME needed for RGB1/BGR1
1055 
1056  if (uvalpha < 2048) { // note this is not correct (shifts chrominance by 0.5 pixels) but it is a bit faster
1057  const int16_t *ubuf1 = ubuf[0];
1058  if (CONFIG_SWSCALE_ALPHA && c->needAlpha) {
1059  __asm__ volatile(
1060  "mov %%"FF_REG_b", "ESP_OFFSET"(%5) \n\t" /* spill FF_REG_b so it can hold dest */
1061  "mov %4, %%"FF_REG_b" \n\t"
1062  "push %%"FF_REG_BP" \n\t"
1063  YSCALEYUV2RGB1(%%FF_REGBP, %5)
1064  YSCALEYUV2RGB1_ALPHA(%%FF_REGBP)
1065  WRITEBGR32(%%FF_REGb, DSTW_OFFSET"(%5)", %%FF_REGBP, %%mm2, %%mm4, %%mm5, %%mm7, %%mm0, %%mm1, %%mm3, %%mm6)
1066  "pop %%"FF_REG_BP" \n\t"
1067  "mov "ESP_OFFSET"(%5), %%"FF_REG_b" \n\t"
1068  :: "c" (buf0), "d" (abuf0), "S" (ubuf0), "D" (ubuf1), "m" (dest),
1069  "a" (&c->redDither)
1070  );
1071  } else {
1072  __asm__ volatile(
1073  "mov %%"FF_REG_b", "ESP_OFFSET"(%5) \n\t"
1074  "mov %4, %%"FF_REG_b" \n\t"
1075  "push %%"FF_REG_BP" \n\t"
1076  YSCALEYUV2RGB1(%%FF_REGBP, %5)
1077  "pcmpeqd %%mm7, %%mm7 \n\t" /* mm7 = all ones -> opaque alpha */
1078  WRITEBGR32(%%FF_REGb, DSTW_OFFSET"(%5)", %%FF_REGBP, %%mm2, %%mm4, %%mm5, %%mm7, %%mm0, %%mm1, %%mm3, %%mm6)
1079  "pop %%"FF_REG_BP" \n\t"
1080  "mov "ESP_OFFSET"(%5), %%"FF_REG_b" \n\t"
1081  :: "c" (buf0), "d" (buf1), "S" (ubuf0), "D" (ubuf1), "m" (dest),
1082  "a" (&c->redDither)
1083  );
1084  }
1085  } else {
1086  const int16_t *ubuf1 = ubuf[1];
1087  if (CONFIG_SWSCALE_ALPHA && c->needAlpha) {
1088  __asm__ volatile(
1089  "mov %%"FF_REG_b", "ESP_OFFSET"(%5) \n\t"
1090  "mov %4, %%"FF_REG_b" \n\t"
1091  "push %%"FF_REG_BP" \n\t"
1092  YSCALEYUV2RGB1b(%%FF_REGBP, %5)
1093  YSCALEYUV2RGB1_ALPHA(%%FF_REGBP)
1094  WRITEBGR32(%%FF_REGb, DSTW_OFFSET"(%5)", %%FF_REGBP, %%mm2, %%mm4, %%mm5, %%mm7, %%mm0, %%mm1, %%mm3, %%mm6)
1095  "pop %%"FF_REG_BP" \n\t"
1096  "mov "ESP_OFFSET"(%5), %%"FF_REG_b" \n\t"
1097  :: "c" (buf0), "d" (abuf0), "S" (ubuf0), "D" (ubuf1), "m" (dest),
1098  "a" (&c->redDither)
1099  );
1100  } else {
1101  __asm__ volatile(
1102  "mov %%"FF_REG_b", "ESP_OFFSET"(%5) \n\t"
1103  "mov %4, %%"FF_REG_b" \n\t"
1104  "push %%"FF_REG_BP" \n\t"
1105  YSCALEYUV2RGB1b(%%FF_REGBP, %5)
1106  "pcmpeqd %%mm7, %%mm7 \n\t" /* mm7 = all ones -> opaque alpha */
1107  WRITEBGR32(%%FF_REGb, DSTW_OFFSET"(%5)", %%FF_REGBP, %%mm2, %%mm4, %%mm5, %%mm7, %%mm0, %%mm1, %%mm3, %%mm6)
1108  "pop %%"FF_REG_BP" \n\t"
1109  "mov "ESP_OFFSET"(%5), %%"FF_REG_b" \n\t"
1110  :: "c" (buf0), "d" (buf1), "S" (ubuf0), "D" (ubuf1), "m" (dest),
1111  "a" (&c->redDither)
1112  );
1113  }
1114  }
1115 }
1116 
/**
 * YV12 to BGR24 from a single luma row, without scaling or interpolating;
 * chroma uses one row (uvalpha < 2048) or the average of two rows.
 */
1117 static void RENAME(yuv2bgr24_1)(SwsInternal *c, const int16_t *buf0,
1118  const int16_t *ubuf[2], const int16_t *vbuf[2],
1119  const int16_t *abuf0, uint8_t *dest,
1120  int dstW, int uvalpha, int y)
1121 {
1122  const int16_t *ubuf0 = ubuf[0];
1123  const int16_t *buf1= buf0; //FIXME needed for RGB1/BGR1
1124 
1125  if (uvalpha < 2048) { // note this is not correct (shifts chrominance by 0.5 pixels) but it is a bit faster
1126  const int16_t *ubuf1 = ubuf[0];
1127  __asm__ volatile(
1128  "mov %%"FF_REG_b", "ESP_OFFSET"(%5) \n\t" /* spill FF_REG_b so it can hold dest */
1129  "mov %4, %%"FF_REG_b" \n\t"
1130  "push %%"FF_REG_BP" \n\t"
1131  YSCALEYUV2RGB1(%%FF_REGBP, %5)
1132  "pxor %%mm7, %%mm7 \n\t" /* mm7 = 0, scratch for the writer */
1133  WRITEBGR24(%%FF_REGb, DSTW_OFFSET"(%5)", %%FF_REGBP)
1134  "pop %%"FF_REG_BP" \n\t"
1135  "mov "ESP_OFFSET"(%5), %%"FF_REG_b" \n\t"
1136  :: "c" (buf0), "d" (buf1), "S" (ubuf0), "D" (ubuf1), "m" (dest),
1137  "a" (&c->redDither)
1138  NAMED_CONSTRAINTS_ADD(M24A,M24C,M24B)
1139  );
1140  } else {
1141  const int16_t *ubuf1 = ubuf[1];
1142  __asm__ volatile(
1143  "mov %%"FF_REG_b", "ESP_OFFSET"(%5) \n\t"
1144  "mov %4, %%"FF_REG_b" \n\t"
1145  "push %%"FF_REG_BP" \n\t"
1146  YSCALEYUV2RGB1b(%%FF_REGBP, %5)
1147  "pxor %%mm7, %%mm7 \n\t" /* mm7 = 0, scratch for the writer */
1148  WRITEBGR24(%%FF_REGb, DSTW_OFFSET"(%5)", %%FF_REGBP)
1149  "pop %%"FF_REG_BP" \n\t"
1150  "mov "ESP_OFFSET"(%5), %%"FF_REG_b" \n\t"
1151  :: "c" (buf0), "d" (buf1), "S" (ubuf0), "D" (ubuf1), "m" (dest),
1152  "a" (&c->redDither)
1153  NAMED_CONSTRAINTS_ADD(M24A,M24C,M24B)
1154  );
1155  }
1156 }
1157 
/**
 * YV12 to RGB555 with dithering, from a single luma row without scaling
 * or interpolating; chroma uses one row (uvalpha < 2048) or the average
 * of two rows. The bF8 mask used by WRITERGB15 is declared via
 * NAMED_CONSTRAINTS_ADD, matching the other RGB15/16 writers in this file.
 */
1158 static void RENAME(yuv2rgb555_1)(SwsInternal *c, const int16_t *buf0,
1159  const int16_t *ubuf[2], const int16_t *vbuf[2],
1160  const int16_t *abuf0, uint8_t *dest,
1161  int dstW, int uvalpha, int y)
1162 {
1163  const int16_t *ubuf0 = ubuf[0];
1164  const int16_t *buf1= buf0; //FIXME needed for RGB1/BGR1
1165 
1166  if (uvalpha < 2048) { // note this is not correct (shifts chrominance by 0.5 pixels) but it is a bit faster
1167  const int16_t *ubuf1 = ubuf[0];
1168  __asm__ volatile(
1169  "mov %%"FF_REG_b", "ESP_OFFSET"(%5) \n\t"
1170  "mov %4, %%"FF_REG_b" \n\t"
1171  "push %%"FF_REG_BP" \n\t"
1172  YSCALEYUV2RGB1(%%FF_REGBP, %5)
1173  "pxor %%mm7, %%mm7 \n\t"
1174  /* mm2=B, %%mm4=G, %%mm5=R, %%mm7=0 */
1175  "paddusb "BLUE_DITHER"(%5), %%mm2 \n\t"
1176  "paddusb "GREEN_DITHER"(%5), %%mm4 \n\t"
1177  "paddusb "RED_DITHER"(%5), %%mm5 \n\t"
1178  WRITERGB15(%%FF_REGb, DSTW_OFFSET"(%5)", %%FF_REGBP)
1179  "pop %%"FF_REG_BP" \n\t"
1180  "mov "ESP_OFFSET"(%5), %%"FF_REG_b" \n\t"
1181  :: "c" (buf0), "d" (buf1), "S" (ubuf0), "D" (ubuf1), "m" (dest),
1182  "a" (&c->redDither)
1183  NAMED_CONSTRAINTS_ADD(bF8)
1184  );
1185  } else {
1186  const int16_t *ubuf1 = ubuf[1];
1187  __asm__ volatile(
1188  "mov %%"FF_REG_b", "ESP_OFFSET"(%5) \n\t"
1189  "mov %4, %%"FF_REG_b" \n\t"
1190  "push %%"FF_REG_BP" \n\t"
1191  YSCALEYUV2RGB1b(%%FF_REGBP, %5)
1192  "pxor %%mm7, %%mm7 \n\t"
1193  /* mm2=B, %%mm4=G, %%mm5=R, %%mm7=0 */
1194  "paddusb "BLUE_DITHER"(%5), %%mm2 \n\t"
1195  "paddusb "GREEN_DITHER"(%5), %%mm4 \n\t"
1196  "paddusb "RED_DITHER"(%5), %%mm5 \n\t"
1197  WRITERGB15(%%FF_REGb, DSTW_OFFSET"(%5)", %%FF_REGBP)
1198  "pop %%"FF_REG_BP" \n\t"
1199  "mov "ESP_OFFSET"(%5), %%"FF_REG_b" \n\t"
1200  :: "c" (buf0), "d" (buf1), "S" (ubuf0), "D" (ubuf1), "m" (dest),
1201  "a" (&c->redDither)
1202  NAMED_CONSTRAINTS_ADD(bF8)
1203  );
1204  }
1205 }
1206 
/**
 * YV12 to RGB565 with dithering, from a single luma row without scaling
 * or interpolating; chroma uses one row (uvalpha < 2048) or the average
 * of two rows.
 */
1207 static void RENAME(yuv2rgb565_1)(SwsInternal *c, const int16_t *buf0,
1208  const int16_t *ubuf[2], const int16_t *vbuf[2],
1209  const int16_t *abuf0, uint8_t *dest,
1210  int dstW, int uvalpha, int y)
1211 {
1212  const int16_t *ubuf0 = ubuf[0];
1213  const int16_t *buf1= buf0; //FIXME needed for RGB1/BGR1
1214 
1215  if (uvalpha < 2048) { // note this is not correct (shifts chrominance by 0.5 pixels) but it is a bit faster
1216  const int16_t *ubuf1 = ubuf[0];
1217  __asm__ volatile(
1218  "mov %%"FF_REG_b", "ESP_OFFSET"(%5) \n\t"
1219  "mov %4, %%"FF_REG_b" \n\t"
1220  "push %%"FF_REG_BP" \n\t"
1221  YSCALEYUV2RGB1(%%FF_REGBP, %5)
1222  "pxor %%mm7, %%mm7 \n\t"
1223  /* mm2=B, %%mm4=G, %%mm5=R, %%mm7=0 */
1224  "paddusb "BLUE_DITHER"(%5), %%mm2 \n\t"
1225  "paddusb "GREEN_DITHER"(%5), %%mm4 \n\t"
1226  "paddusb "RED_DITHER"(%5), %%mm5 \n\t"
1227  WRITERGB16(%%FF_REGb, DSTW_OFFSET"(%5)", %%FF_REGBP)
1228  "pop %%"FF_REG_BP" \n\t"
1229  "mov "ESP_OFFSET"(%5), %%"FF_REG_b" \n\t"
1230  :: "c" (buf0), "d" (buf1), "S" (ubuf0), "D" (ubuf1), "m" (dest),
1231  "a" (&c->redDither)
1232  NAMED_CONSTRAINTS_ADD(bF8,bFC)
1233  );
1234  } else {
1235  const int16_t *ubuf1 = ubuf[1];
1236  __asm__ volatile(
1237  "mov %%"FF_REG_b", "ESP_OFFSET"(%5) \n\t"
1238  "mov %4, %%"FF_REG_b" \n\t"
1239  "push %%"FF_REG_BP" \n\t"
1240  YSCALEYUV2RGB1b(%%FF_REGBP, %5)
1241  "pxor %%mm7, %%mm7 \n\t"
1242  /* mm2=B, %%mm4=G, %%mm5=R, %%mm7=0 */
1243  "paddusb "BLUE_DITHER"(%5), %%mm2 \n\t"
1244  "paddusb "GREEN_DITHER"(%5), %%mm4 \n\t"
1245  "paddusb "RED_DITHER"(%5), %%mm5 \n\t"
1246  WRITERGB16(%%FF_REGb, DSTW_OFFSET"(%5)", %%FF_REGBP)
1247  "pop %%"FF_REG_BP" \n\t"
1248  "mov "ESP_OFFSET"(%5), %%"FF_REG_b" \n\t"
1249  :: "c" (buf0), "d" (buf1), "S" (ubuf0), "D" (ubuf1), "m" (dest),
1250  "a" (&c->redDither)
1251  NAMED_CONSTRAINTS_ADD(bF8,bFC)
1252  );
1253  }
1254 }
1255 
/* Single-row load for the packed output path: chroma (mm3/mm4) and luma
 * (mm1/mm7) each come from one source row, scaled down by >>7. */
1256 #define REAL_YSCALEYUV2PACKED1(index, c) \
1257  "xor "#index", "#index" \n\t"\
1258  ".p2align 4 \n\t"\
1259  "1: \n\t"\
1260  "movq (%2, "#index"), %%mm3 \n\t" /* uvbuf0[eax]*/\
1261  "add "UV_OFF_BYTE"("#c"), "#index" \n\t" \
1262  "movq (%2, "#index"), %%mm4 \n\t" /* uvbuf0[eax+2048]*/\
1263  "sub "UV_OFF_BYTE"("#c"), "#index" \n\t" \
1264  "psraw $7, %%mm3 \n\t" /* uvbuf0[eax] >>7 */ \
1265  "psraw $7, %%mm4 \n\t" /* uvbuf0[eax+2048] >>7 */ \
1266  "movq (%0, "#index", 2), %%mm1 \n\t" /*buf0[eax]*/\
1267  "movq 8(%0, "#index", 2), %%mm7 \n\t" /*buf0[eax+4]*/\
1268  "psraw $7, %%mm1 \n\t" /* buf0[eax] >>7 */ \
1269  "psraw $7, %%mm7 \n\t" /* buf0[eax+4] >>7 */ \
1270 
1271 #define YSCALEYUV2PACKED1(index, c) REAL_YSCALEYUV2PACKED1(index, c)
1272 
/* Same as YSCALEYUV2PACKED1 but averages the two chroma rows:
 * (uvbuf0 + uvbuf1) >> 8 folds the /2 average into the >>7 scaling. */
1273 #define REAL_YSCALEYUV2PACKED1b(index, c) \
1274  "xor "#index", "#index" \n\t"\
1275  ".p2align 4 \n\t"\
1276  "1: \n\t"\
1277  "movq (%2, "#index"), %%mm2 \n\t" /* uvbuf0[eax]*/\
1278  "movq (%3, "#index"), %%mm3 \n\t" /* uvbuf1[eax]*/\
1279  "add "UV_OFF_BYTE"("#c"), "#index" \n\t" \
1280  "movq (%2, "#index"), %%mm5 \n\t" /* uvbuf0[eax+2048]*/\
1281  "movq (%3, "#index"), %%mm4 \n\t" /* uvbuf1[eax+2048]*/\
1282  "sub "UV_OFF_BYTE"("#c"), "#index" \n\t" \
1283  "paddw %%mm2, %%mm3 \n\t" /* uvbuf0[eax] + uvbuf1[eax]*/\
1284  "paddw %%mm5, %%mm4 \n\t" /* uvbuf0[eax+2048] + uvbuf1[eax+2048]*/\
1285  "psrlw $8, %%mm3 \n\t" /* (U0+U1)/2 >>7 */ \
1286  "psrlw $8, %%mm4 \n\t" /* (V0+V1)/2 >>7 */ \
1287  "movq (%0, "#index", 2), %%mm1 \n\t" /*buf0[eax]*/\
1288  "movq 8(%0, "#index", 2), %%mm7 \n\t" /*buf0[eax+4]*/\
1289  "psraw $7, %%mm1 \n\t" /* buf0[eax] >>7 */ \
1290  "psraw $7, %%mm7 \n\t" /* buf0[eax+4] >>7 */
1291 #define YSCALEYUV2PACKED1b(index, c) REAL_YSCALEYUV2PACKED1b(index, c)
1292 
/**
 * YV12 to YUYV422 from a single luma row, without vertical interpolation;
 * chroma uses one row (uvalpha < 2048) or the average of two rows.
 */
1293 static void RENAME(yuv2yuyv422_1)(SwsInternal *c, const int16_t *buf0,
1294  const int16_t *ubuf[2], const int16_t *vbuf[2],
1295  const int16_t *abuf0, uint8_t *dest,
1296  int dstW, int uvalpha, int y)
1297 {
1298  const int16_t *ubuf0 = ubuf[0];
1299  const int16_t *buf1= buf0; //FIXME needed for RGB1/BGR1
1300 
1301  if (uvalpha < 2048) { // note this is not correct (shifts chrominance by 0.5 pixels) but it is a bit faster
1302  const int16_t *ubuf1 = ubuf[0];
1303  __asm__ volatile(
1304  "mov %%"FF_REG_b", "ESP_OFFSET"(%5) \n\t" /* spill FF_REG_b so it can hold dest */
1305  "mov %4, %%"FF_REG_b" \n\t"
1306  "push %%"FF_REG_BP" \n\t"
1307  YSCALEYUV2PACKED1(%%FF_REGBP, %5)
1308  WRITEYUY2(%%FF_REGb, DSTW_OFFSET"(%5)", %%FF_REGBP)
1309  "pop %%"FF_REG_BP" \n\t"
1310  "mov "ESP_OFFSET"(%5), %%"FF_REG_b" \n\t"
1311  :: "c" (buf0), "d" (buf1), "S" (ubuf0), "D" (ubuf1), "m" (dest),
1312  "a" (&c->redDither)
1313  );
1314  } else {
1315  const int16_t *ubuf1 = ubuf[1];
1316  __asm__ volatile(
1317  "mov %%"FF_REG_b", "ESP_OFFSET"(%5) \n\t"
1318  "mov %4, %%"FF_REG_b" \n\t"
1319  "push %%"FF_REG_BP" \n\t"
1320  YSCALEYUV2PACKED1b(%%FF_REGBP, %5)
1321  WRITEYUY2(%%FF_REGb, DSTW_OFFSET"(%5)", %%FF_REGBP)
1322  "pop %%"FF_REG_BP" \n\t"
1323  "mov "ESP_OFFSET"(%5), %%"FF_REG_b" \n\t"
1324  :: "c" (buf0), "d" (buf1), "S" (ubuf0), "D" (ubuf1), "m" (dest),
1325  "a" (&c->redDither)
1326  );
1327  }
1328 }
/**
 * Plug the MMX/MMXEXT packed-output writers and fast horizontal scalers
 * into the SwsInternal function-pointer table, keyed on destination
 * pixel format and scaler flags.
 */
1329 static av_cold void RENAME(sws_init_swscale)(SwsInternal *c)
1330 {
1331  enum AVPixelFormat dstFormat = c->opts.dst_format;
1332 
1333  c->use_mmx_vfilter= 0;
1334  if (!is16BPS(dstFormat) && !isNBPS(dstFormat) && !isSemiPlanarYUV(dstFormat)
1335  && dstFormat != AV_PIX_FMT_GRAYF32BE && dstFormat != AV_PIX_FMT_GRAYF32LE
1336  && !(c->opts.flags & SWS_BITEXACT)) {
1337  if (c->opts.flags & SWS_ACCURATE_RND) { // accurate-rounding (_ar) writers
1338  if (!(c->opts.flags & SWS_FULL_CHR_H_INT)) {
1339  switch (c->opts.dst_format) {
1340  case AV_PIX_FMT_RGB32: c->yuv2packedX = RENAME(yuv2rgb32_X_ar); break;
1341 #if HAVE_6REGS
1342  case AV_PIX_FMT_BGR24: c->yuv2packedX = RENAME(yuv2bgr24_X_ar); break;
1343 #endif
1344  case AV_PIX_FMT_RGB555: c->yuv2packedX = RENAME(yuv2rgb555_X_ar); break;
1345  case AV_PIX_FMT_RGB565: c->yuv2packedX = RENAME(yuv2rgb565_X_ar); break;
1346  case AV_PIX_FMT_YUYV422: c->yuv2packedX = RENAME(yuv2yuyv422_X_ar); break;
1347  default: break;
1348  }
1349  }
1350  } else {
1351  c->use_mmx_vfilter= 1;
1352  if (!(c->opts.flags & SWS_FULL_CHR_H_INT)) {
1353  switch (c->opts.dst_format) {
1354  case AV_PIX_FMT_RGB32: c->yuv2packedX = RENAME(yuv2rgb32_X); break;
1355  case AV_PIX_FMT_BGR32: c->yuv2packedX = RENAME(yuv2bgr32_X); break;
1356 #if HAVE_6REGS
1357  case AV_PIX_FMT_BGR24: c->yuv2packedX = RENAME(yuv2bgr24_X); break;
1358 #endif
1359  case AV_PIX_FMT_RGB555: c->yuv2packedX = RENAME(yuv2rgb555_X); break;
1360  case AV_PIX_FMT_RGB565: c->yuv2packedX = RENAME(yuv2rgb565_X); break;
1361  case AV_PIX_FMT_YUYV422: c->yuv2packedX = RENAME(yuv2yuyv422_X); break;
1362  default: break;
1363  }
1364  }
1365  }
  /* 1-row (_1) and 2-row (_2) fast paths, shared by both branches above */
1366  if (!(c->opts.flags & SWS_FULL_CHR_H_INT)) {
1367  switch (c->opts.dst_format) {
1368  case AV_PIX_FMT_RGB32:
1369  c->yuv2packed1 = RENAME(yuv2rgb32_1);
1370  c->yuv2packed2 = RENAME(yuv2rgb32_2);
1371  break;
1372  case AV_PIX_FMT_BGR24:
1373  c->yuv2packed1 = RENAME(yuv2bgr24_1);
1374  c->yuv2packed2 = RENAME(yuv2bgr24_2);
1375  break;
1376  case AV_PIX_FMT_RGB555:
1377  c->yuv2packed1 = RENAME(yuv2rgb555_1);
1378  c->yuv2packed2 = RENAME(yuv2rgb555_2);
1379  break;
1380  case AV_PIX_FMT_RGB565:
1381  c->yuv2packed1 = RENAME(yuv2rgb565_1);
1382  c->yuv2packed2 = RENAME(yuv2rgb565_2);
1383  break;
1384  case AV_PIX_FMT_YUYV422:
1385  c->yuv2packed1 = RENAME(yuv2yuyv422_1);
1386  c->yuv2packed2 = RENAME(yuv2yuyv422_2);
1387  break;
1388  default:
1389  break;
1390  }
1391  }
1392  }
1393 
1394  if (c->srcBpc == 8 && c->dstBpc <= 14) {
1395  // Use the new MMX scaler if the MMXEXT one can't be used (it is faster than the x86 ASM one).
1396  if (c->opts.flags & SWS_FAST_BILINEAR && c->canMMXEXTBeUsed) {
1397  c->hyscale_fast = ff_hyscale_fast_mmxext;
1398  c->hcscale_fast = ff_hcscale_fast_mmxext;
1399  } else {
1400  c->hyscale_fast = NULL;
1401  c->hcscale_fast = NULL;
1402  }
1403  }
1404 }
ff_hyscale_fast_mmxext
void ff_hyscale_fast_mmxext(SwsInternal *c, int16_t *dst, int dstWidth, const uint8_t *src, int srcW, int xInc)
Definition: hscale_fast_bilinear_simd.c:192
WRITEBGR32
#define WRITEBGR32(dst, dstw, index, b, g, r, a, q0, q2, q3, t)
Definition: swscale_template.c:248
sws_init_swscale
static av_cold void sws_init_swscale(SwsInternal *c)
Definition: swscale.c:663
AVPixelFormat
AVPixelFormat
Pixel format.
Definition: pixfmt.h:71
YSCALEYUV2PACKEDX_ACCURATE
#define YSCALEYUV2PACKEDX_ACCURATE
Definition: swscale_template.c:186
ALP_MMX_FILTER_OFFSET
#define ALP_MMX_FILTER_OFFSET
Definition: swscale_internal.h:513
YSCALEYUV2RGB1
#define YSCALEYUV2RGB1(index, c)
Definition: swscale_template.c:979
YSCALEYUV2PACKEDX_YA
#define YSCALEYUV2PACKEDX_YA(offset, coeff, src1, src2, dst1, dst2)
Definition: swscale_template.c:58
AV_PIX_FMT_BGR32
#define AV_PIX_FMT_BGR32
Definition: pixfmt.h:513
x86_reg
int x86_reg
Definition: asm.h:71
SWS_BITEXACT
@ SWS_BITEXACT
Definition: swscale.h:158
YSCALEYUV2RGB
#define YSCALEYUV2RGB(index, c)
Definition: swscale_template.c:720
AV_PIX_FMT_BGR24
@ AV_PIX_FMT_BGR24
packed RGB 8:8:8, 24bpp, BGRBGR...
Definition: pixfmt.h:76
AV_PIX_FMT_GRAYF32LE
@ AV_PIX_FMT_GRAYF32LE
IEEE-754 single precision Y, 32bpp, little-endian.
Definition: pixfmt.h:364
SWS_FAST_BILINEAR
@ SWS_FAST_BILINEAR
Scaler selection options.
Definition: swscale.h:100
is16BPS
static av_always_inline int is16BPS(enum AVPixelFormat pix_fmt)
Definition: swscale_internal.h:742
DSTW_OFFSET
#define DSTW_OFFSET
Definition: swscale_internal.h:507
dummy
int dummy
Definition: motion.c:64
isNBPS
static av_always_inline int isNBPS(enum AVPixelFormat pix_fmt)
Definition: swscale_internal.h:756
av_cold
#define av_cold
Definition: attributes.h:106
NAMED_CONSTRAINTS_ADD
#define NAMED_CONSTRAINTS_ADD(...)
Definition: asm.h:144
BLUE_DITHER
#define BLUE_DITHER
Definition: swscale_internal.h:496
YSCALEYUV2RGB1b
#define YSCALEYUV2RGB1b(index, c)
Definition: swscale_template.c:1035
WRITERGB15
#define WRITERGB15(dst, dstw, index)
Definition: swscale_template.c:440
WRITEBGR24
#define WRITEBGR24(dst, dstw, index)
Definition: swscale_template.c:535
isSemiPlanarYUV
static av_always_inline int isSemiPlanarYUV(enum AVPixelFormat pix_fmt)
Definition: swscale_internal.h:788
ff_hcscale_fast_mmxext
void ff_hcscale_fast_mmxext(SwsInternal *c, int16_t *dst1, int16_t *dst2, int dstWidth, const uint8_t *src1, const uint8_t *src2, int srcW, int xInc)
Definition: hscale_fast_bilinear_simd.c:282
NULL
#define NULL
Definition: coverity.c:32
YSCALEYUV2PACKEDX
#define YSCALEYUV2PACKEDX
Definition: swscale_template.c:77
asm.h
AV_PIX_FMT_YUYV422
@ AV_PIX_FMT_YUYV422
packed YUV 4:2:2, 16bpp, Y0 Cb Y1 Cr
Definition: pixfmt.h:74
U_TEMP
#define U_TEMP
Definition: swscale_internal.h:510
GREEN_DITHER
#define GREEN_DITHER
Definition: swscale_internal.h:495
YSCALEYUV2RGB1_ALPHA
#define YSCALEYUV2RGB1_ALPHA(index)
Definition: swscale_template.c:1043
c
Undefined Behavior In the C some operations are like signed integer dereferencing freed accessing outside allocated Undefined Behavior must not occur in a C it is not safe even if the output of undefined operations is unused The unsafety may seem nit picking but Optimizing compilers have in fact optimized code on the assumption that no undefined Behavior occurs Optimizing code based on wrong assumptions can and has in some cases lead to effects beyond the output of computations The signed integer overflow problem in speed critical code Code which is highly optimized and works with signed integers sometimes has the problem that often the output of the computation does not c
Definition: undefined.txt:32
RED_DITHER
#define RED_DITHER
Definition: swscale_internal.h:494
AV_PIX_FMT_RGB32
#define AV_PIX_FMT_RGB32
Definition: pixfmt.h:511
YSCALEYUV2RGB_YA
#define YSCALEYUV2RGB_YA(index, c, b1, b2)
Definition: swscale_template.c:718
AV_PIX_FMT_RGB555
#define AV_PIX_FMT_RGB555
Definition: pixfmt.h:527
swscale_internal.h
YSCALEYUV2PACKED
#define YSCALEYUV2PACKED(index, c)
Definition: swscale_template.c:907
AV_PIX_FMT_RGB565
#define AV_PIX_FMT_RGB565
Definition: pixfmt.h:526
V_TEMP
#define V_TEMP
Definition: swscale_internal.h:511
SwsInternal
Definition: swscale_internal.h:334
__asm__
__asm__(".macro parse_r var r\n\t" "\\var = -1\n\t" _IFC_REG(0) _IFC_REG(1) _IFC_REG(2) _IFC_REG(3) _IFC_REG(4) _IFC_REG(5) _IFC_REG(6) _IFC_REG(7) _IFC_REG(8) _IFC_REG(9) _IFC_REG(10) _IFC_REG(11) _IFC_REG(12) _IFC_REG(13) _IFC_REG(14) _IFC_REG(15) _IFC_REG(16) _IFC_REG(17) _IFC_REG(18) _IFC_REG(19) _IFC_REG(20) _IFC_REG(21) _IFC_REG(22) _IFC_REG(23) _IFC_REG(24) _IFC_REG(25) _IFC_REG(26) _IFC_REG(27) _IFC_REG(28) _IFC_REG(29) _IFC_REG(30) _IFC_REG(31) ".iflt \\var\n\t" ".error \"Unable to parse register name \\r\"\n\t" ".endif\n\t" ".endm")
AV_PIX_FMT_GRAYF32BE
@ AV_PIX_FMT_GRAYF32BE
IEEE-754 single precision Y, 32bpp, big-endian.
Definition: pixfmt.h:363
SWS_FULL_CHR_H_INT
@ SWS_FULL_CHR_H_INT
Perform full chroma upsampling when upscaling to RGB.
Definition: swscale.h:134
WRITEYUY2
#define WRITEYUY2(dst, dstw, index)
Definition: swscale_template.c:604
RENAME
#define RENAME(element)
Definition: ac3enc_template.c:44
Y_TEMP
#define Y_TEMP
Definition: swscale_internal.h:512
YSCALEYUV2PACKEDX_ACCURATE_YA
#define YSCALEYUV2PACKEDX_ACCURATE_YA(offset)
Definition: swscale_template.c:141
YSCALEYUV2RGBX
#define YSCALEYUV2RGBX
Definition: swscale_template.c:190
ESP_OFFSET
#define ESP_OFFSET
Definition: swscale_internal.h:508
YSCALEYUV2PACKEDX_END
#define YSCALEYUV2PACKEDX_END
Definition: swscale_template.c:81
SWS_ACCURATE_RND
@ SWS_ACCURATE_RND
Force bit-exact output.
Definition: swscale.h:157
WRITERGB16
#define WRITERGB16(dst, dstw, index)
Definition: swscale_template.c:367
YSCALEYUV2PACKED1
#define YSCALEYUV2PACKED1(index, c)
Definition: swscale_template.c:1271
YSCALEYUV2PACKED1b
#define YSCALEYUV2PACKED1b(index, c)
Definition: swscale_template.c:1291