1 /*
2 * Copyright (C) 2001-2011 Michael Niedermayer <michaelni@gmx.at>
3 *
4 * This file is part of FFmpeg.
5 *
6 * FFmpeg is free software; you can redistribute it and/or
7 * modify it under the terms of the GNU Lesser General Public
8 * License as published by the Free Software Foundation; either
9 * version 2.1 of the License, or (at your option) any later version.
10 *
11 * FFmpeg is distributed in the hope that it will be useful,
12 * but WITHOUT ANY WARRANTY; without even the implied warranty of
13 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
14 * Lesser General Public License for more details.
15 *
16 * You should have received a copy of the GNU Lesser General Public
17 * License along with FFmpeg; if not, write to the Free Software
18 * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
19 */
20
21 #include <stdint.h>
22
23 #include "libavutil/x86/asm.h"
24 #include "libswscale/swscale_internal.h"
25
/*
 * Store-instruction selection for the pixel writers below.
 *
 * When COMPILE_TEMPLATE_MMXEXT is set the 64-bit stores are emitted as the
 * non-temporal "movntq", otherwise as a plain "movq".
 *   REAL_MOVNTQ(a,b) - stringifies both operands into one instruction line.
 *   MOVNTQ(a,b)      - forwards to REAL_MOVNTQ so that macro arguments are
 *                      expanded before '#' stringifies them.
 *   MOVNTQ2          - just the mnemonic (with a trailing space).
 * The names are #undef'ed first so the definitions can be replaced; PREFETCH
 * is undefined here but not redefined in the visible part of the file.
 */
26 #undef REAL_MOVNTQ
27 #undef MOVNTQ
28 #undef MOVNTQ2
29 #undef PREFETCH
30
31
32 #if COMPILE_TEMPLATE_MMXEXT
33 #define REAL_MOVNTQ(a,b) "movntq " #a ", " #b " \n\t"
34 #define MOVNTQ2 "movntq "
35 #else
36 #define REAL_MOVNTQ(a,b) "movq " #a ", " #b " \n\t"
37 #define MOVNTQ2 "movq "
38 #endif
39 #define MOVNTQ(a,b) REAL_MOVNTQ(a,b)
40
/*
 * YSCALEYUV2PACKEDX_UV - opening part of the vertical N-tap scaler (chroma).
 *
 * Opens an "__asm__ volatile(" statement that the user must close, either
 * with YSCALEYUV2PACKEDX_END or with an explicit operand list.
 *
 * - FF_REG_a is zeroed and used as the running index; label "1:" is the
 *   head of the outer loop that the WRITE* macros branch back to ("jb 1b").
 * - FF_REG_d walks a filter list located at CHR_MMX_FILTER_OFFSET(%0).
 *   As read here, each entry is 16 bytes: a source pointer at offset 0 and
 *   a coefficient quadword at offset 8. A null source pointer ends the list.
 * - For every tap, U is loaded from (ptr + index) and V from
 *   (ptr + %6 + index); both are multiplied with the coefficient (pmulhw,
 *   keeping the high 16 bits) and accumulated.
 * - Accumulators start from the value at VROUNDER_OFFSET(%0).
 *
 * Result: mm3 = U sum, mm4 = V sum. Scratch: mm0, mm2, mm5, FF_REG_d,
 * FF_REG_S.
 */
41 #define YSCALEYUV2PACKEDX_UV \
42 __asm__ volatile(\
43 "xor %%"FF_REG_a", %%"FF_REG_a" \n\t"\
44 ".p2align 4 \n\t"\
45 "nop \n\t"\
46 "1: \n\t"\
47 "lea "CHR_MMX_FILTER_OFFSET"(%0), %%"FF_REG_d" \n\t"\
48 "mov (%%"FF_REG_d"), %%"FF_REG_S" \n\t"\
49 "movq "VROUNDER_OFFSET"(%0), %%mm3 \n\t"\
50 "movq %%mm3, %%mm4 \n\t"\
51 ".p2align 4 \n\t"\
52 "2: \n\t"\
53 "movq 8(%%"FF_REG_d"), %%mm0 \n\t" /* filterCoeff */\
54 "movq (%%"FF_REG_S", %%"FF_REG_a"), %%mm2 \n\t" /* UsrcData */\
55 "add %6, %%"FF_REG_S" \n\t" \
56 "movq (%%"FF_REG_S", %%"FF_REG_a"), %%mm5 \n\t" /* VsrcData */\
57 "add $16, %%"FF_REG_d" \n\t"\
58 "mov (%%"FF_REG_d"), %%"FF_REG_S" \n\t"\
59 "pmulhw %%mm0, %%mm2 \n\t"\
60 "pmulhw %%mm0, %%mm5 \n\t"\
61 "paddw %%mm2, %%mm3 \n\t"\
62 "paddw %%mm5, %%mm4 \n\t"\
63 "test %%"FF_REG_S", %%"FF_REG_S" \n\t"\
64 " jnz 2b \n\t"\
65
/*
 * YSCALEYUV2PACKEDX_YA - vertical N-tap filter for a full-resolution plane
 * (used for luma and for alpha).
 *
 *   offset      displacement (string) of the filter list relative to %0
 *   coeff       MMX register used as scratch for the current coefficient
 *   src1, src2  MMX scratch registers for the two source quadwords
 *   dst1, dst2  MMX accumulators receiving the result
 *
 * The filter list has the same layout as in YSCALEYUV2PACKEDX_UV (pointer at
 * offset 0, coefficient at offset 8, 16-byte stride, null pointer ends the
 * list). Two consecutive quadwords are read per tap at (ptr + 2*index) and
 * (ptr + 2*index + 8). Both accumulators start from VROUNDER_OFFSET(%0).
 *
 * Reuses local label "2:" and clobbers FF_REG_d and FF_REG_S.
 */
66 #define YSCALEYUV2PACKEDX_YA(offset,coeff,src1,src2,dst1,dst2) \
67 "lea "offset"(%0), %%"FF_REG_d" \n\t"\
68 "mov (%%"FF_REG_d"), %%"FF_REG_S" \n\t"\
69 "movq "VROUNDER_OFFSET"(%0), "#dst1" \n\t"\
70 "movq "#dst1", "#dst2" \n\t"\
71 ".p2align 4 \n\t"\
72 "2: \n\t"\
73 "movq 8(%%"FF_REG_d"), "#coeff" \n\t" /* filterCoeff */\
74 "movq (%%"FF_REG_S", %%"FF_REG_a", 2), "#src1" \n\t" /* Y1srcData */\
75 "movq 8(%%"FF_REG_S", %%"FF_REG_a", 2), "#src2" \n\t" /* Y2srcData */\
76 "add $16, %%"FF_REG_d" \n\t"\
77 "mov (%%"FF_REG_d"), %%"FF_REG_S" \n\t"\
78 "pmulhw "#coeff", "#src1" \n\t"\
79 "pmulhw "#coeff", "#src2" \n\t"\
80 "paddw "#src1", "#dst1" \n\t"\
81 "paddw "#src2", "#dst2" \n\t"\
82 "test %%"FF_REG_S", %%"FF_REG_S" \n\t"\
83 " jnz 2b \n\t"\
84
/*
 * YSCALEYUV2PACKEDX - chroma pass followed by the luma pass of the fast
 * vertical scaler. After expansion: mm3 = U, mm4 = V, mm1 = Y1 (first four
 * luma words), mm7 = Y2 (next four luma words). mm0, mm2 and mm5 are used as
 * scratch. The asm statement opened by YSCALEYUV2PACKEDX_UV is still open.
 */
85 #define YSCALEYUV2PACKEDX \
86 YSCALEYUV2PACKEDX_UV \
87 YSCALEYUV2PACKEDX_YA(LUM_MMX_FILTER_OFFSET,%%mm0,%%mm2,%%mm5,%%mm1,%%mm7) \
88
/*
 * YSCALEYUV2PACKEDX_END - closes the asm statement opened by
 * YSCALEYUV2PACKEDX / YSCALEYUV2PACKEDX_ACCURATE.
 *
 * Operand numbering (inputs only, no outputs):
 *   %0      &c->redDither, base address for all *_OFFSET / *_TEMP /
 *           *_COEFF displacements used by the macros above
 *   %1..%3  "dummy" memory operands; they only keep the numbering of the
 *           following operands stable
 *   %4      dest      (register)
 *   %5      dstW_reg  (memory)
 *   %6      uv_off    (memory), byte distance from U to V samples
 * Clobbers FF_REG_a, FF_REG_d and FF_REG_S.
 *
 * The expanding function must have locals/parameters named c, dummy, dest,
 * dstW_reg and uv_off in scope.
 * NAMED_CONSTRAINTS_ADD is defined elsewhere; presumably it makes the bF8 and
 * bFC constants referenced through MANGLE() available to the asm - TODO
 * confirm against libavutil/x86/asm.h.
 */
89 #define YSCALEYUV2PACKEDX_END \
90 :: "r" (&c->redDither), \
91 "m" (dummy), "m" (dummy), "m" (dummy),\
92 "r" (dest), "m" (dstW_reg), "m"(uv_off) \
93 NAMED_CONSTRAINTS_ADD(bF8,bFC) \
94 : "%"FF_REG_a, "%"FF_REG_d, "%"FF_REG_S \
95 );
96
/*
 * YSCALEYUV2PACKEDX_ACCURATE_UV - higher-precision variant of
 * YSCALEYUV2PACKEDX_UV (opens the asm statement, defines label "1:").
 *
 * Two filter taps are handled per iteration. As read here, a filter-list
 * entry holds a first source pointer at offset 0, a second source pointer at
 * APCK_PTR2 and a packed coefficient quadword at APCK_COEF; entries are
 * APCK_SIZE bytes apart. Samples of the two source lines are interleaved
 * (punpcklwd/punpckhwd) and multiplied-and-added with the coefficient pair
 * (pmaddwd), giving 32-bit partial sums:
 *   mm4/mm5 = U (low/high pair of dwords), mm6/mm7 = V.
 * V samples are read %6 bytes after the corresponding U samples.
 *
 * The loop ends when the first pointer of the next entry is null. The "test"
 * is issued before the last MMX operations and the "jnz" after them; the MMX
 * instructions in between do not modify the flags.
 *
 * After the loop the sums are shifted right by 16, packed to signed words,
 * offset by the value at VROUNDER_OFFSET(%0) and stored to U_TEMP(%0) and
 * V_TEMP(%0). All eight MMX registers are used.
 */
97 #define YSCALEYUV2PACKEDX_ACCURATE_UV \
98 __asm__ volatile(\
99 "xor %%"FF_REG_a", %%"FF_REG_a" \n\t"\
100 ".p2align 4 \n\t"\
101 "nop \n\t"\
102 "1: \n\t"\
103 "lea "CHR_MMX_FILTER_OFFSET"(%0), %%"FF_REG_d" \n\t"\
104 "mov (%%"FF_REG_d"), %%"FF_REG_S" \n\t"\
105 "pxor %%mm4, %%mm4 \n\t"\
106 "pxor %%mm5, %%mm5 \n\t"\
107 "pxor %%mm6, %%mm6 \n\t"\
108 "pxor %%mm7, %%mm7 \n\t"\
109 ".p2align 4 \n\t"\
110 "2: \n\t"\
111 "movq (%%"FF_REG_S", %%"FF_REG_a"), %%mm0 \n\t" /* UsrcData */\
112 "add %6, %%"FF_REG_S" \n\t" \
113 "movq (%%"FF_REG_S", %%"FF_REG_a"), %%mm2 \n\t" /* VsrcData */\
114 "mov "STR(APCK_PTR2)"(%%"FF_REG_d"), %%"FF_REG_S" \n\t"\
115 "movq (%%"FF_REG_S", %%"FF_REG_a"), %%mm1 \n\t" /* UsrcData */\
116 "movq %%mm0, %%mm3 \n\t"\
117 "punpcklwd %%mm1, %%mm0 \n\t"\
118 "punpckhwd %%mm1, %%mm3 \n\t"\
119 "movq "STR(APCK_COEF)"(%%"FF_REG_d"),%%mm1 \n\t" /* filterCoeff */\
120 "pmaddwd %%mm1, %%mm0 \n\t"\
121 "pmaddwd %%mm1, %%mm3 \n\t"\
122 "paddd %%mm0, %%mm4 \n\t"\
123 "paddd %%mm3, %%mm5 \n\t"\
124 "add %6, %%"FF_REG_S" \n\t" \
125 "movq (%%"FF_REG_S", %%"FF_REG_a"), %%mm3 \n\t" /* VsrcData */\
126 "mov "STR(APCK_SIZE)"(%%"FF_REG_d"), %%"FF_REG_S" \n\t"\
127 "add $"STR(APCK_SIZE)", %%"FF_REG_d" \n\t"\
128 "test %%"FF_REG_S", %%"FF_REG_S" \n\t"\
129 "movq %%mm2, %%mm0 \n\t"\
130 "punpcklwd %%mm3, %%mm2 \n\t"\
131 "punpckhwd %%mm3, %%mm0 \n\t"\
132 "pmaddwd %%mm1, %%mm2 \n\t"\
133 "pmaddwd %%mm1, %%mm0 \n\t"\
134 "paddd %%mm2, %%mm6 \n\t"\
135 "paddd %%mm0, %%mm7 \n\t"\
136 " jnz 2b \n\t"\
137 "psrad $16, %%mm4 \n\t"\
138 "psrad $16, %%mm5 \n\t"\
139 "psrad $16, %%mm6 \n\t"\
140 "psrad $16, %%mm7 \n\t"\
141 "movq "VROUNDER_OFFSET"(%0), %%mm0 \n\t"\
142 "packssdw %%mm5, %%mm4 \n\t"\
143 "packssdw %%mm7, %%mm6 \n\t"\
144 "paddw %%mm0, %%mm4 \n\t"\
145 "paddw %%mm0, %%mm6 \n\t"\
146 "movq %%mm4, "U_TEMP"(%0) \n\t"\
147 "movq %%mm6, "V_TEMP"(%0) \n\t"\
148
/*
 * YSCALEYUV2PACKEDX_ACCURATE_YA(offset) - higher-precision vertical filter
 * for a full-resolution plane (luma or alpha); "offset" is the displacement
 * (string) of the filter list relative to %0.
 *
 * Same two-taps-per-iteration scheme as YSCALEYUV2PACKEDX_ACCURATE_UV, reading
 * two quadwords per source line at (ptr + 2*index) and (ptr + 2*index + 8).
 * 32-bit sums are kept in mm1/mm5 (first four samples) and mm7/mm6 (next
 * four), then shifted right by 16, packed to words and offset by the value at
 * VROUNDER_OFFSET(%0).
 *
 * Result: mm1 = first four words, mm7 = next four words. As the last step,
 * mm3 and mm4 are reloaded from U_TEMP(%0) and V_TEMP(%0). Every MMX register
 * is overwritten, so callers must park any live values in memory first.
 */
149 #define YSCALEYUV2PACKEDX_ACCURATE_YA(offset) \
150 "lea "offset"(%0), %%"FF_REG_d" \n\t"\
151 "mov (%%"FF_REG_d"), %%"FF_REG_S" \n\t"\
152 "pxor %%mm1, %%mm1 \n\t"\
153 "pxor %%mm5, %%mm5 \n\t"\
154 "pxor %%mm7, %%mm7 \n\t"\
155 "pxor %%mm6, %%mm6 \n\t"\
156 ".p2align 4 \n\t"\
157 "2: \n\t"\
158 "movq (%%"FF_REG_S", %%"FF_REG_a", 2), %%mm0 \n\t" /* Y1srcData */\
159 "movq 8(%%"FF_REG_S", %%"FF_REG_a", 2), %%mm2 \n\t" /* Y2srcData */\
160 "mov "STR(APCK_PTR2)"(%%"FF_REG_d"), %%"FF_REG_S" \n\t"\
161 "movq (%%"FF_REG_S", %%"FF_REG_a", 2), %%mm4 \n\t" /* Y1srcData */\
162 "movq %%mm0, %%mm3 \n\t"\
163 "punpcklwd %%mm4, %%mm0 \n\t"\
164 "punpckhwd %%mm4, %%mm3 \n\t"\
165 "movq "STR(APCK_COEF)"(%%"FF_REG_d"), %%mm4 \n\t" /* filterCoeff */\
166 "pmaddwd %%mm4, %%mm0 \n\t"\
167 "pmaddwd %%mm4, %%mm3 \n\t"\
168 "paddd %%mm0, %%mm1 \n\t"\
169 "paddd %%mm3, %%mm5 \n\t"\
170 "movq 8(%%"FF_REG_S", %%"FF_REG_a", 2), %%mm3 \n\t" /* Y2srcData */\
171 "mov "STR(APCK_SIZE)"(%%"FF_REG_d"), %%"FF_REG_S" \n\t"\
172 "add $"STR(APCK_SIZE)", %%"FF_REG_d" \n\t"\
173 "test %%"FF_REG_S", %%"FF_REG_S" \n\t"\
174 "movq %%mm2, %%mm0 \n\t"\
175 "punpcklwd %%mm3, %%mm2 \n\t"\
176 "punpckhwd %%mm3, %%mm0 \n\t"\
177 "pmaddwd %%mm4, %%mm2 \n\t"\
178 "pmaddwd %%mm4, %%mm0 \n\t"\
179 "paddd %%mm2, %%mm7 \n\t"\
180 "paddd %%mm0, %%mm6 \n\t"\
181 " jnz 2b \n\t"\
182 "psrad $16, %%mm1 \n\t"\
183 "psrad $16, %%mm5 \n\t"\
184 "psrad $16, %%mm7 \n\t"\
185 "psrad $16, %%mm6 \n\t"\
186 "movq "VROUNDER_OFFSET"(%0), %%mm0 \n\t"\
187 "packssdw %%mm5, %%mm1 \n\t"\
188 "packssdw %%mm6, %%mm7 \n\t"\
189 "paddw %%mm0, %%mm1 \n\t"\
190 "paddw %%mm0, %%mm7 \n\t"\
191 "movq "U_TEMP"(%0), %%mm3 \n\t"\
192 "movq "V_TEMP"(%0), %%mm4 \n\t"\
193
/*
 * YSCALEYUV2PACKEDX_ACCURATE - accurate chroma pass followed by the accurate
 * luma pass. Leaves the same register layout as YSCALEYUV2PACKEDX:
 * mm3 = U, mm4 = V, mm1 = Y1, mm7 = Y2. The asm statement stays open.
 */
194 #define YSCALEYUV2PACKEDX_ACCURATE \
195 YSCALEYUV2PACKEDX_ACCURATE_UV \
196 YSCALEYUV2PACKEDX_ACCURATE_YA(LUM_MMX_FILTER_OFFSET)
197
/*
 * YSCALEYUV2RGBX - colour-space conversion step for the *_X scalers.
 *
 * Input:  mm1 = Y1, mm7 = Y2, mm3 = U, mm4 = V (16-bit words).
 * All offsets and coefficients are read relative to %0 (U_OFFSET, V_OFFSET,
 * Y_OFFSET, UG/VG/UB/VR_COEFF, Y_COEFF).
 *
 * Each chroma word is duplicated (punpck?wd reg, reg) so that one chroma
 * sample is applied to two neighbouring luma samples, then the colour
 * contributions are added to luma and packed to unsigned bytes with
 * saturation.
 *
 * Output: mm2 = B, mm4 = G, mm5 = R, eight bytes each.
 * mm0, mm1, mm3, mm6 and mm7 are overwritten.
 */
198 #define YSCALEYUV2RGBX \
199 "psubw "U_OFFSET"(%0), %%mm3 \n\t" /* (U-128)8*/\
200 "psubw "V_OFFSET"(%0), %%mm4 \n\t" /* (V-128)8*/\
201 "movq %%mm3, %%mm2 \n\t" /* (U-128)8*/\
202 "movq %%mm4, %%mm5 \n\t" /* (V-128)8*/\
203 "pmulhw "UG_COEFF"(%0), %%mm3 \n\t"\
204 "pmulhw "VG_COEFF"(%0), %%mm4 \n\t"\
205 /* mm2=(U-128)8, mm3=ug, mm4=vg mm5=(V-128)8 */\
206 "pmulhw "UB_COEFF"(%0), %%mm2 \n\t"\
207 "pmulhw "VR_COEFF"(%0), %%mm5 \n\t"\
208 "psubw "Y_OFFSET"(%0), %%mm1 \n\t" /* 8(Y-16)*/\
209 "psubw "Y_OFFSET"(%0), %%mm7 \n\t" /* 8(Y-16)*/\
210 "pmulhw "Y_COEFF"(%0), %%mm1 \n\t"\
211 "pmulhw "Y_COEFF"(%0), %%mm7 \n\t"\
212 /* mm1= Y1, mm2=ub, mm3=ug, mm4=vg mm5=vr, mm7=Y2 */\
213 "paddw %%mm3, %%mm4 \n\t"\
214 "movq %%mm2, %%mm0 \n\t"\
215 "movq %%mm5, %%mm6 \n\t"\
216 "movq %%mm4, %%mm3 \n\t"\
217 "punpcklwd %%mm2, %%mm2 \n\t"\
218 "punpcklwd %%mm5, %%mm5 \n\t"\
219 "punpcklwd %%mm4, %%mm4 \n\t"\
220 "paddw %%mm1, %%mm2 \n\t"\
221 "paddw %%mm1, %%mm5 \n\t"\
222 "paddw %%mm1, %%mm4 \n\t"\
223 "punpckhwd %%mm0, %%mm0 \n\t"\
224 "punpckhwd %%mm6, %%mm6 \n\t"\
225 "punpckhwd %%mm3, %%mm3 \n\t"\
226 "paddw %%mm7, %%mm0 \n\t"\
227 "paddw %%mm7, %%mm6 \n\t"\
228 "paddw %%mm7, %%mm3 \n\t"\
229 /* mm0=B1, mm2=B2, mm3=G2, mm4=G1, mm5=R1, mm6=R2 */\
230 "packuswb %%mm0, %%mm2 \n\t"\
231 "packuswb %%mm6, %%mm5 \n\t"\
232 "packuswb %%mm3, %%mm4 \n\t"\
233
/*
 * WRITEBGR32 - interleave four byte planes into eight 32-bit pixels, store
 * them and close the outer loop.
 *
 *   dst    register holding the destination base address
 *   dstw   operand (string) compared against the index to end the loop
 *   index  register counting output pixels; pixels go to dst + index*4
 *   b,g,r,a  MMX registers with the four byte planes; in memory the bytes
 *            of each pixel appear in the order b, g, r, a
 *   q0,q2,q3,t  MMX scratch registers (b is also reused as an output)
 *
 * Stores 32 bytes through MOVNTQ, adds 8 to index and jumps back to label
 * "1:" while index is (unsigned) below dstw. The inline comments list bytes
 * from most to least significant.
 * WRITEBGR32 forwards to REAL_WRITEBGR32 so arguments are macro-expanded
 * before being stringified.
 */
234 #define REAL_WRITEBGR32(dst, dstw, index, b, g, r, a, q0, q2, q3, t) \
235 "movq "#b", "#q2" \n\t" /* B */\
236 "movq "#r", "#t" \n\t" /* R */\
237 "punpcklbw "#g", "#b" \n\t" /* GBGBGBGB 0 */\
238 "punpcklbw "#a", "#r" \n\t" /* ARARARAR 0 */\
239 "punpckhbw "#g", "#q2" \n\t" /* GBGBGBGB 2 */\
240 "punpckhbw "#a", "#t" \n\t" /* ARARARAR 2 */\
241 "movq "#b", "#q0" \n\t" /* GBGBGBGB 0 */\
242 "movq "#q2", "#q3" \n\t" /* GBGBGBGB 2 */\
243 "punpcklwd "#r", "#q0" \n\t" /* ARGBARGB 0 */\
244 "punpckhwd "#r", "#b" \n\t" /* ARGBARGB 1 */\
245 "punpcklwd "#t", "#q2" \n\t" /* ARGBARGB 2 */\
246 "punpckhwd "#t", "#q3" \n\t" /* ARGBARGB 3 */\
247 \
248 MOVNTQ( q0, (dst, index, 4))\
249 MOVNTQ( b, 8(dst, index, 4))\
250 MOVNTQ( q2, 16(dst, index, 4))\
251 MOVNTQ( q3, 24(dst, index, 4))\
252 \
253 "add $8, "#index" \n\t"\
254 "cmp "dstw", "#index" \n\t"\
255 " jb 1b \n\t"
256 #define WRITEBGR32(dst, dstw, index, b, g, r, a, q0, q2, q3, t) REAL_WRITEBGR32(dst, dstw, index, b, g, r, a, q0, q2, q3, t)
257
/*
 * Vertical N-tap scale and conversion to 32-bit packed output, using the
 * higher-precision ("accurate rounding") filter macros.
 *
 * Only c, dest and dstW are referenced by the body. The filter parameters
 * (lumFilter, lumSrc, lumFilterSize, chrFilter, chrUSrc, chrVSrc,
 * chrFilterSize, alpSrc) and dstY are unused here: the asm reads the filter
 * lists and coefficients through offsets from &c->redDither, so they are
 * presumably prepared in the context by the caller - TODO confirm.
 *
 * With CONFIG_SWSCALE_ALPHA and c->needAlpha the alpha plane is filtered too
 * and written as the fourth byte; otherwise the fourth byte is 0xFF.
 */
RENAME(yuv2rgb32_X_ar)258 static void RENAME(yuv2rgb32_X_ar)(SwsContext *c, const int16_t *lumFilter,
259 const int16_t **lumSrc, int lumFilterSize,
260 const int16_t *chrFilter, const int16_t **chrUSrc,
261 const int16_t **chrVSrc,
262 int chrFilterSize, const int16_t **alpSrc,
263 uint8_t *dest, int dstW, int dstY)
264 {
265 x86_reg dummy=0;
266 x86_reg dstW_reg = dstW;
267 x86_reg uv_off = c->uv_offx2;
268
269 if (CONFIG_SWSCALE_ALPHA && c->needAlpha) {
270 YSCALEYUV2PACKEDX_ACCURATE
271 YSCALEYUV2RGBX
/* The alpha filter below overwrites every MMX register, so B, G and R
 * are parked in the context scratch slots first. */
272 "movq %%mm2, "U_TEMP"(%0) \n\t"
273 "movq %%mm4, "V_TEMP"(%0) \n\t"
274 "movq %%mm5, "Y_TEMP"(%0) \n\t"
275 YSCALEYUV2PACKEDX_ACCURATE_YA(ALP_MMX_FILTER_OFFSET)
/* The macro above already reloaded U_TEMP into mm3 (= B) and V_TEMP into
 * mm4 (= G); only R has to be fetched back by hand. */
276 "movq "Y_TEMP"(%0), %%mm5 \n\t"
277 "psraw $3, %%mm1 \n\t"
278 "psraw $3, %%mm7 \n\t"
279 "packuswb %%mm7, %%mm1 \n\t"
280 WRITEBGR32(%4, "%5", %%FF_REGa, %%mm3, %%mm4, %%mm5, %%mm1, %%mm0, %%mm7, %%mm2, %%mm6)
281 YSCALEYUV2PACKEDX_END
282 } else {
283 YSCALEYUV2PACKEDX_ACCURATE
284 YSCALEYUV2RGBX
/* all bits set: constant 0xFF for the fourth byte of every pixel */
285 "pcmpeqd %%mm7, %%mm7 \n\t"
286 WRITEBGR32(%4, "%5", %%FF_REGa, %%mm2, %%mm4, %%mm5, %%mm7, %%mm0, %%mm1, %%mm3, %%mm6)
287 YSCALEYUV2PACKEDX_END
288 }
289 }
290
/*
 * Vertical N-tap scale and conversion to 32-bit packed output, fast
 * (pmulhw-based) variant of yuv2rgb32_X_ar.
 *
 * Only c, dest and dstW are referenced by the body; the filter parameters
 * and dstY are unused here (the asm reads its data through offsets from
 * &c->redDither).
 *
 * In the alpha branch the alpha filter uses mm0, mm3 and mm6 as scratch so
 * that B (mm2), G (mm4) and R (mm5) produced by YSCALEYUV2RGBX stay live.
 */
RENAME(yuv2rgb32_X)291 static void RENAME(yuv2rgb32_X)(SwsContext *c, const int16_t *lumFilter,
292 const int16_t **lumSrc, int lumFilterSize,
293 const int16_t *chrFilter, const int16_t **chrUSrc,
294 const int16_t **chrVSrc,
295 int chrFilterSize, const int16_t **alpSrc,
296 uint8_t *dest, int dstW, int dstY)
297 {
298 x86_reg dummy=0;
299 x86_reg dstW_reg = dstW;
300 x86_reg uv_off = c->uv_offx2;
301
302 if (CONFIG_SWSCALE_ALPHA && c->needAlpha) {
303 YSCALEYUV2PACKEDX
304 YSCALEYUV2RGBX
305 YSCALEYUV2PACKEDX_YA(ALP_MMX_FILTER_OFFSET, %%mm0, %%mm3, %%mm6, %%mm1, %%mm7)
/* scale the filtered alpha words down and pack them to eight bytes */
306 "psraw $3, %%mm1 \n\t"
307 "psraw $3, %%mm7 \n\t"
308 "packuswb %%mm7, %%mm1 \n\t"
309 WRITEBGR32(%4, "%5", %%FF_REGa, %%mm2, %%mm4, %%mm5, %%mm1, %%mm0, %%mm7, %%mm3, %%mm6)
310 YSCALEYUV2PACKEDX_END
311 } else {
312 YSCALEYUV2PACKEDX
313 YSCALEYUV2RGBX
/* all bits set: constant 0xFF for the fourth byte of every pixel */
314 "pcmpeqd %%mm7, %%mm7 \n\t"
315 WRITEBGR32(%4, "%5", %%FF_REGa, %%mm2, %%mm4, %%mm5, %%mm7, %%mm0, %%mm1, %%mm3, %%mm6)
316 YSCALEYUV2PACKEDX_END
317 }
318 }
319
/*
 * Same as yuv2rgb32_X, except that the first and third byte planes handed to
 * WRITEBGR32 are swapped (mm5 = R is written first, mm2 = B third).
 *
 * Only c, dest and dstW are referenced by the body; the filter parameters
 * and dstY are unused here.
 */
RENAME(yuv2bgr32_X)320 static void RENAME(yuv2bgr32_X)(SwsContext *c, const int16_t *lumFilter,
321 const int16_t **lumSrc, int lumFilterSize,
322 const int16_t *chrFilter, const int16_t **chrUSrc,
323 const int16_t **chrVSrc,
324 int chrFilterSize, const int16_t **alpSrc,
325 uint8_t *dest, int dstW, int dstY)
326 {
327 x86_reg dummy=0;
328 x86_reg dstW_reg = dstW;
329 x86_reg uv_off = c->uv_offx2;
330
331 if (CONFIG_SWSCALE_ALPHA && c->needAlpha) {
332 YSCALEYUV2PACKEDX
333 YSCALEYUV2RGBX
334 YSCALEYUV2PACKEDX_YA(ALP_MMX_FILTER_OFFSET, %%mm0, %%mm3, %%mm6, %%mm1, %%mm7)
/* scale the filtered alpha words down and pack them to eight bytes */
335 "psraw $3, %%mm1 \n\t"
336 "psraw $3, %%mm7 \n\t"
337 "packuswb %%mm7, %%mm1 \n\t"
338 WRITEBGR32(%4, "%5", %%FF_REGa, %%mm5, %%mm4, %%mm2, %%mm1, %%mm0, %%mm7, %%mm3, %%mm6)
339 YSCALEYUV2PACKEDX_END
340 } else {
341 YSCALEYUV2PACKEDX
342 YSCALEYUV2RGBX
/* all bits set: constant 0xFF for the fourth byte of every pixel */
343 "pcmpeqd %%mm7, %%mm7 \n\t"
344 WRITEBGR32(%4, "%5", %%FF_REGa, %%mm5, %%mm4, %%mm2, %%mm7, %%mm0, %%mm1, %%mm3, %%mm6)
345 YSCALEYUV2PACKEDX_END
346 }
347 }
348
/*
 * WRITERGB16 - pack eight pixels to 16 bits each (5 bits from mm5, 6 bits
 * from mm4, 5 bits from mm2, from most to least significant) and close the
 * outer loop.
 *
 * Expects mm2 = B, mm4 = G, mm5 = R (eight bytes each) and mm7 = 0.
 * B and R are masked with bF8 and G with bFC (presumably 0xF8 / 0xFC in
 * every byte, going by the names - TODO confirm), B is shifted down by 3,
 * G is widened to words and shifted up by 3, and the parts are OR-ed
 * together. 16 bytes are stored at dst + index*2, index is advanced by 8 and
 * control returns to label "1:" while index is (unsigned) below dstw.
 * Clobbers mm1..mm5.
 */
349 #define REAL_WRITERGB16(dst, dstw, index) \
350 "pand "MANGLE(bF8)", %%mm2 \n\t" /* B */\
351 "pand "MANGLE(bFC)", %%mm4 \n\t" /* G */\
352 "pand "MANGLE(bF8)", %%mm5 \n\t" /* R */\
353 "psrlq $3, %%mm2 \n\t"\
354 \
355 "movq %%mm2, %%mm1 \n\t"\
356 "movq %%mm4, %%mm3 \n\t"\
357 \
358 "punpcklbw %%mm7, %%mm3 \n\t"\
359 "punpcklbw %%mm5, %%mm2 \n\t"\
360 "punpckhbw %%mm7, %%mm4 \n\t"\
361 "punpckhbw %%mm5, %%mm1 \n\t"\
362 \
363 "psllq $3, %%mm3 \n\t"\
364 "psllq $3, %%mm4 \n\t"\
365 \
366 "por %%mm3, %%mm2 \n\t"\
367 "por %%mm4, %%mm1 \n\t"\
368 \
369 MOVNTQ(%%mm2, (dst, index, 2))\
370 MOVNTQ(%%mm1, 8(dst, index, 2))\
371 \
372 "add $8, "#index" \n\t"\
373 "cmp "dstw", "#index" \n\t"\
374 " jb 1b \n\t"
375 #define WRITERGB16(dst, dstw, index) REAL_WRITERGB16(dst, dstw, index)
376
/*
 * Vertical N-tap scale and conversion to 16-bit (5-6-5) output, using the
 * higher-precision filter macros.
 *
 * Only c, dest and dstW are referenced by the body; the filter parameters,
 * alpSrc and dstY are unused here (the asm reads its data through offsets
 * from &c->redDither). When DITHER1XBPP is defined, per-channel dither
 * values from the context are added with unsigned saturation before packing.
 */
RENAME(yuv2rgb565_X_ar)377 static void RENAME(yuv2rgb565_X_ar)(SwsContext *c, const int16_t *lumFilter,
378 const int16_t **lumSrc, int lumFilterSize,
379 const int16_t *chrFilter, const int16_t **chrUSrc,
380 const int16_t **chrVSrc,
381 int chrFilterSize, const int16_t **alpSrc,
382 uint8_t *dest, int dstW, int dstY)
383 {
384 x86_reg dummy=0;
385 x86_reg dstW_reg = dstW;
386 x86_reg uv_off = c->uv_offx2;
387
388 YSCALEYUV2PACKEDX_ACCURATE
389 YSCALEYUV2RGBX
/* WRITERGB16 needs a zero register for widening bytes to words */
390 "pxor %%mm7, %%mm7 \n\t"
391 /* mm2=B, %%mm4=G, %%mm5=R, %%mm7=0 */
392 #ifdef DITHER1XBPP
393 "paddusb "BLUE_DITHER"(%0), %%mm2\n\t"
394 "paddusb "GREEN_DITHER"(%0), %%mm4\n\t"
395 "paddusb "RED_DITHER"(%0), %%mm5\n\t"
396 #endif
397 WRITERGB16(%4, "%5", %%FF_REGa)
398 YSCALEYUV2PACKEDX_END
399 }
400
/*
 * Vertical N-tap scale and conversion to 16-bit (5-6-5) output, fast
 * (pmulhw-based) variant of yuv2rgb565_X_ar.
 *
 * Only c, dest and dstW are referenced by the body; the filter parameters,
 * alpSrc and dstY are unused here. When DITHER1XBPP is defined, per-channel
 * dither values from the context are added with unsigned saturation before
 * packing.
 */
RENAME(yuv2rgb565_X)401 static void RENAME(yuv2rgb565_X)(SwsContext *c, const int16_t *lumFilter,
402 const int16_t **lumSrc, int lumFilterSize,
403 const int16_t *chrFilter, const int16_t **chrUSrc,
404 const int16_t **chrVSrc,
405 int chrFilterSize, const int16_t **alpSrc,
406 uint8_t *dest, int dstW, int dstY)
407 {
408 x86_reg dummy=0;
409 x86_reg dstW_reg = dstW;
410 x86_reg uv_off = c->uv_offx2;
411
412 YSCALEYUV2PACKEDX
413 YSCALEYUV2RGBX
/* WRITERGB16 needs a zero register for widening bytes to words */
414 "pxor %%mm7, %%mm7 \n\t"
415 /* mm2=B, %%mm4=G, %%mm5=R, %%mm7=0 */
416 #ifdef DITHER1XBPP
417 "paddusb "BLUE_DITHER"(%0), %%mm2 \n\t"
418 "paddusb "GREEN_DITHER"(%0), %%mm4 \n\t"
419 "paddusb "RED_DITHER"(%0), %%mm5 \n\t"
420 #endif
421 WRITERGB16(%4, "%5", %%FF_REGa)
422 YSCALEYUV2PACKEDX_END
423 }
424
/*
 * WRITERGB15 - pack eight pixels to 15 bits each (5 bits from mm5, 5 bits
 * from mm4, 5 bits from mm2, from most to least significant; the top bit of
 * every 16-bit word is 0) and close the outer loop.
 *
 * Expects mm2 = B, mm4 = G, mm5 = R (eight bytes each) and mm7 = 0.
 * Differs from WRITERGB16 in that all three channels are masked with bF8,
 * R is additionally shifted down by 1 and G is shifted up by 2 instead of 3.
 * 16 bytes are stored at dst + index*2, index is advanced by 8 and control
 * returns to label "1:" while index is (unsigned) below dstw.
 * Clobbers mm1..mm5.
 */
425 #define REAL_WRITERGB15(dst, dstw, index) \
426 "pand "MANGLE(bF8)", %%mm2 \n\t" /* B */\
427 "pand "MANGLE(bF8)", %%mm4 \n\t" /* G */\
428 "pand "MANGLE(bF8)", %%mm5 \n\t" /* R */\
429 "psrlq $3, %%mm2 \n\t"\
430 "psrlq $1, %%mm5 \n\t"\
431 \
432 "movq %%mm2, %%mm1 \n\t"\
433 "movq %%mm4, %%mm3 \n\t"\
434 \
435 "punpcklbw %%mm7, %%mm3 \n\t"\
436 "punpcklbw %%mm5, %%mm2 \n\t"\
437 "punpckhbw %%mm7, %%mm4 \n\t"\
438 "punpckhbw %%mm5, %%mm1 \n\t"\
439 \
440 "psllq $2, %%mm3 \n\t"\
441 "psllq $2, %%mm4 \n\t"\
442 \
443 "por %%mm3, %%mm2 \n\t"\
444 "por %%mm4, %%mm1 \n\t"\
445 \
446 MOVNTQ(%%mm2, (dst, index, 2))\
447 MOVNTQ(%%mm1, 8(dst, index, 2))\
448 \
449 "add $8, "#index" \n\t"\
450 "cmp "dstw", "#index" \n\t"\
451 " jb 1b \n\t"
452 #define WRITERGB15(dst, dstw, index) REAL_WRITERGB15(dst, dstw, index)
453
/*
 * Vertical N-tap scale and conversion to 15-bit (5-5-5) output, using the
 * higher-precision filter macros.
 *
 * Only c, dest and dstW are referenced by the body; the filter parameters,
 * alpSrc and dstY are unused here (the asm reads its data through offsets
 * from &c->redDither). When DITHER1XBPP is defined, per-channel dither
 * values from the context are added with unsigned saturation before packing.
 */
RENAME(yuv2rgb555_X_ar)454 static void RENAME(yuv2rgb555_X_ar)(SwsContext *c, const int16_t *lumFilter,
455 const int16_t **lumSrc, int lumFilterSize,
456 const int16_t *chrFilter, const int16_t **chrUSrc,
457 const int16_t **chrVSrc,
458 int chrFilterSize, const int16_t **alpSrc,
459 uint8_t *dest, int dstW, int dstY)
460 {
461 x86_reg dummy=0;
462 x86_reg dstW_reg = dstW;
463 x86_reg uv_off = c->uv_offx2;
464
465 YSCALEYUV2PACKEDX_ACCURATE
466 YSCALEYUV2RGBX
/* WRITERGB15 needs a zero register for widening bytes to words */
467 "pxor %%mm7, %%mm7 \n\t"
468 /* mm2=B, %%mm4=G, %%mm5=R, %%mm7=0 */
469 #ifdef DITHER1XBPP
470 "paddusb "BLUE_DITHER"(%0), %%mm2\n\t"
471 "paddusb "GREEN_DITHER"(%0), %%mm4\n\t"
472 "paddusb "RED_DITHER"(%0), %%mm5\n\t"
473 #endif
474 WRITERGB15(%4, "%5", %%FF_REGa)
475 YSCALEYUV2PACKEDX_END
476 }
477
/*
 * Vertical N-tap scale and conversion to 15-bit (5-5-5) output, fast
 * (pmulhw-based) variant of yuv2rgb555_X_ar.
 *
 * Only c, dest and dstW are referenced by the body; the filter parameters,
 * alpSrc and dstY are unused here. When DITHER1XBPP is defined, per-channel
 * dither values from the context are added with unsigned saturation before
 * packing.
 */
RENAME(yuv2rgb555_X)478 static void RENAME(yuv2rgb555_X)(SwsContext *c, const int16_t *lumFilter,
479 const int16_t **lumSrc, int lumFilterSize,
480 const int16_t *chrFilter, const int16_t **chrUSrc,
481 const int16_t **chrVSrc,
482 int chrFilterSize, const int16_t **alpSrc,
483 uint8_t *dest, int dstW, int dstY)
484 {
485 x86_reg dummy=0;
486 x86_reg dstW_reg = dstW;
487 x86_reg uv_off = c->uv_offx2;
488
489 YSCALEYUV2PACKEDX
490 YSCALEYUV2RGBX
/* WRITERGB15 needs a zero register for widening bytes to words */
491 "pxor %%mm7, %%mm7 \n\t"
492 /* mm2=B, %%mm4=G, %%mm5=R, %%mm7=0 */
493 #ifdef DITHER1XBPP
494 "paddusb "BLUE_DITHER"(%0), %%mm2 \n\t"
495 "paddusb "GREEN_DITHER"(%0), %%mm4 \n\t"
496 "paddusb "RED_DITHER"(%0), %%mm5 \n\t"
497 #endif
498 WRITERGB15(%4, "%5", %%FF_REGa)
499 YSCALEYUV2PACKEDX_END
500 }
501
/*
 * WRITEBGR24MMX - pack eight pixels to 24 bits each using only shift, unpack
 * and OR operations, store 24 bytes and close the outer loop.
 *
 * Expects mm2 = B, mm4 = G, mm5 = R (eight bytes each) and mm7 = 0.
 * The planes are first interleaved into four quadwords of two "0RGB" pixels
 * each; the zero bytes are then squeezed out with shifts so that three
 * quadwords remain.
 *
 * Unlike the 16/32-bit writers, dst is a pointer register that is not
 * indexed: the three stores go to (dst), 8(dst) and 16(dst), after which dst
 * itself is advanced by 24. index is advanced by 8 and control returns to
 * label "1:" while index is (unsigned) below dstw. Clobbers mm0..mm7.
 */
502 #define WRITEBGR24MMX(dst, dstw, index) \
503 /* mm2=B, %%mm4=G, %%mm5=R, %%mm7=0 */\
504 "movq %%mm2, %%mm1 \n\t" /* B */\
505 "movq %%mm5, %%mm6 \n\t" /* R */\
506 "punpcklbw %%mm4, %%mm2 \n\t" /* GBGBGBGB 0 */\
507 "punpcklbw %%mm7, %%mm5 \n\t" /* 0R0R0R0R 0 */\
508 "punpckhbw %%mm4, %%mm1 \n\t" /* GBGBGBGB 2 */\
509 "punpckhbw %%mm7, %%mm6 \n\t" /* 0R0R0R0R 2 */\
510 "movq %%mm2, %%mm0 \n\t" /* GBGBGBGB 0 */\
511 "movq %%mm1, %%mm3 \n\t" /* GBGBGBGB 2 */\
512 "punpcklwd %%mm5, %%mm0 \n\t" /* 0RGB0RGB 0 */\
513 "punpckhwd %%mm5, %%mm2 \n\t" /* 0RGB0RGB 1 */\
514 "punpcklwd %%mm6, %%mm1 \n\t" /* 0RGB0RGB 2 */\
515 "punpckhwd %%mm6, %%mm3 \n\t" /* 0RGB0RGB 3 */\
516 \
517 "movq %%mm0, %%mm4 \n\t" /* 0RGB0RGB 0 */\
518 "movq %%mm2, %%mm6 \n\t" /* 0RGB0RGB 1 */\
519 "movq %%mm1, %%mm5 \n\t" /* 0RGB0RGB 2 */\
520 "movq %%mm3, %%mm7 \n\t" /* 0RGB0RGB 3 */\
521 \
522 "psllq $40, %%mm0 \n\t" /* RGB00000 0 */\
523 "psllq $40, %%mm2 \n\t" /* RGB00000 1 */\
524 "psllq $40, %%mm1 \n\t" /* RGB00000 2 */\
525 "psllq $40, %%mm3 \n\t" /* RGB00000 3 */\
526 \
527 "punpckhdq %%mm4, %%mm0 \n\t" /* 0RGBRGB0 0 */\
528 "punpckhdq %%mm6, %%mm2 \n\t" /* 0RGBRGB0 1 */\
529 "punpckhdq %%mm5, %%mm1 \n\t" /* 0RGBRGB0 2 */\
530 "punpckhdq %%mm7, %%mm3 \n\t" /* 0RGBRGB0 3 */\
531 \
532 "psrlq $8, %%mm0 \n\t" /* 00RGBRGB 0 */\
533 "movq %%mm2, %%mm6 \n\t" /* 0RGBRGB0 1 */\
534 "psllq $40, %%mm2 \n\t" /* GB000000 1 */\
535 "por %%mm2, %%mm0 \n\t" /* GBRGBRGB 0 */\
536 MOVNTQ(%%mm0, (dst))\
537 \
538 "psrlq $24, %%mm6 \n\t" /* 0000RGBR 1 */\
539 "movq %%mm1, %%mm5 \n\t" /* 0RGBRGB0 2 */\
540 "psllq $24, %%mm1 \n\t" /* BRGB0000 2 */\
541 "por %%mm1, %%mm6 \n\t" /* BRGBRGBR 1 */\
542 MOVNTQ(%%mm6, 8(dst))\
543 \
544 "psrlq $40, %%mm5 \n\t" /* 000000RG 2 */\
545 "psllq $8, %%mm3 \n\t" /* RGBRGB00 3 */\
546 "por %%mm3, %%mm5 \n\t" /* RGBRGBRG 2 */\
547 MOVNTQ(%%mm5, 16(dst))\
548 \
549 "add $24, "#dst" \n\t"\
550 \
551 "add $8, "#index" \n\t"\
552 "cmp "dstw", "#index" \n\t"\
553 " jb 1b \n\t"
554
/*
 * WRITEBGR24MMXEXT - same job as WRITEBGR24MMX (eight 24-bit pixels, 24 bytes
 * stored at (dst), 8(dst), 16(dst); dst += 24; index += 8; loop to "1:"),
 * implemented with pshufw and the byte masks ff_M24A, ff_M24B and ff_M24C.
 *
 * Expects mm2 = B, mm4 = G, mm5 = R (eight bytes each). mm7 is overwritten
 * with the ff_M24C mask right away, so its incoming value is not used.
 * For each of the three output quadwords the needed bytes of every plane are
 * replicated into position with pshufw, masked, and OR-ed together. mm4 is
 * shifted right by one byte after the first quadword, so later shuffles of
 * G operate on the shifted copy.
 * Clobbers mm0, mm1, mm3, mm4, mm6, mm7.
 */
555 #define WRITEBGR24MMXEXT(dst, dstw, index) \
556 /* mm2=B, %%mm4=G, %%mm5=R, %%mm7=0 */\
557 "movq "MANGLE(ff_M24A)", %%mm0 \n\t"\
558 "movq "MANGLE(ff_M24C)", %%mm7 \n\t"\
559 "pshufw $0x50, %%mm2, %%mm1 \n\t" /* B3 B2 B3 B2 B1 B0 B1 B0 */\
560 "pshufw $0x50, %%mm4, %%mm3 \n\t" /* G3 G2 G3 G2 G1 G0 G1 G0 */\
561 "pshufw $0x00, %%mm5, %%mm6 \n\t" /* R1 R0 R1 R0 R1 R0 R1 R0 */\
562 \
563 "pand %%mm0, %%mm1 \n\t" /* B2 B1 B0 */\
564 "pand %%mm0, %%mm3 \n\t" /* G2 G1 G0 */\
565 "pand %%mm7, %%mm6 \n\t" /* R1 R0 */\
566 \
567 "psllq $8, %%mm3 \n\t" /* G2 G1 G0 */\
568 "por %%mm1, %%mm6 \n\t"\
569 "por %%mm3, %%mm6 \n\t"\
570 MOVNTQ(%%mm6, (dst))\
571 \
572 "psrlq $8, %%mm4 \n\t" /* 00 G7 G6 G5 G4 G3 G2 G1 */\
573 "pshufw $0xA5, %%mm2, %%mm1 \n\t" /* B5 B4 B5 B4 B3 B2 B3 B2 */\
574 "pshufw $0x55, %%mm4, %%mm3 \n\t" /* G4 G3 G4 G3 G4 G3 G4 G3 */\
575 "pshufw $0xA5, %%mm5, %%mm6 \n\t" /* R5 R4 R5 R4 R3 R2 R3 R2 */\
576 \
577 "pand "MANGLE(ff_M24B)", %%mm1 \n\t" /* B5 B4 B3 */\
578 "pand %%mm7, %%mm3 \n\t" /* G4 G3 */\
579 "pand %%mm0, %%mm6 \n\t" /* R4 R3 R2 */\
580 \
581 "por %%mm1, %%mm3 \n\t" /* B5 G4 B4 G3 B3 */\
582 "por %%mm3, %%mm6 \n\t"\
583 MOVNTQ(%%mm6, 8(dst))\
584 \
585 "pshufw $0xFF, %%mm2, %%mm1 \n\t" /* B7 B6 B7 B6 B7 B6 B7 B6 */\
586 "pshufw $0xFA, %%mm4, %%mm3 \n\t" /* 00 G7 00 G7 G6 G5 G6 G5 */\
587 "pshufw $0xFA, %%mm5, %%mm6 \n\t" /* R7 R6 R7 R6 R5 R4 R5 R4 */\
588 \
589 "pand %%mm7, %%mm1 \n\t" /* B7 B6 */\
590 "pand %%mm0, %%mm3 \n\t" /* G7 G6 G5 */\
591 "pand "MANGLE(ff_M24B)", %%mm6 \n\t" /* R7 R6 R5 */\
592 \
593 "por %%mm1, %%mm3 \n\t"\
594 "por %%mm3, %%mm6 \n\t"\
595 MOVNTQ(%%mm6, 16(dst))\
596 \
597 "add $24, "#dst" \n\t"\
598 \
599 "add $8, "#index" \n\t"\
600 "cmp "dstw", "#index" \n\t"\
601 " jb 1b \n\t"
602
/*
 * WRITEBGR24 - pick the 24-bit writer matching the instruction set this
 * template is compiled for: the pshufw-based version when
 * COMPILE_TEMPLATE_MMXEXT is set, the plain MMX version otherwise.
 */
603 #if COMPILE_TEMPLATE_MMXEXT
604 #undef WRITEBGR24
605 #define WRITEBGR24(dst, dstw, index) WRITEBGR24MMXEXT(dst, dstw, index)
606 #else
607 #undef WRITEBGR24
608 #define WRITEBGR24(dst, dstw, index) WRITEBGR24MMX(dst, dstw, index)
609 #endif
610
611 #if HAVE_6REGS
/*
 * Vertical N-tap scale and conversion to 24-bit packed output, using the
 * higher-precision filter macros. Only built when HAVE_6REGS is set, since
 * FF_REG_c is needed as an additional pointer register.
 *
 * Only c, dest and dstW are referenced by the body; the filter parameters,
 * alpSrc and dstY are unused here (the asm reads its data through offsets
 * from &c->redDither).
 *
 * The operand list is written out instead of using YSCALEYUV2PACKEDX_END
 * because FF_REG_c must be added to the clobbers and different named
 * constants (ff_M24A/B/C) are referenced. Operand numbering is the same:
 * %0 = &c->redDither, %4 = dest, %5 = dstW_reg, %6 = uv_off.
 */
RENAME(yuv2bgr24_X_ar)612 static void RENAME(yuv2bgr24_X_ar)(SwsContext *c, const int16_t *lumFilter,
613 const int16_t **lumSrc, int lumFilterSize,
614 const int16_t *chrFilter, const int16_t **chrUSrc,
615 const int16_t **chrVSrc,
616 int chrFilterSize, const int16_t **alpSrc,
617 uint8_t *dest, int dstW, int dstY)
618 {
619 x86_reg dummy=0;
620 x86_reg dstW_reg = dstW;
621 x86_reg uv_off = c->uv_offx2;
622
623 YSCALEYUV2PACKEDX_ACCURATE
624 YSCALEYUV2RGBX
625 "pxor %%mm7, %%mm7 \n\t"
/* FF_REG_c = dest + 3*index, recomputed on every pass through the outer
 * loop (label "1:" lies above this point). */
626 "lea (%%"FF_REG_a", %%"FF_REG_a", 2), %%"FF_REG_c"\n\t" //FIXME optimize
627 "add %4, %%"FF_REG_c" \n\t"
628 WRITEBGR24(%%FF_REGc, "%5", %%FF_REGa)
629 :: "r" (&c->redDither),
630 "m" (dummy), "m" (dummy), "m" (dummy),
631 "r" (dest), "m" (dstW_reg), "m"(uv_off)
632 NAMED_CONSTRAINTS_ADD(ff_M24A,ff_M24C,ff_M24B)
633 : "%"FF_REG_a, "%"FF_REG_c, "%"FF_REG_d, "%"FF_REG_S
634 );
635 }
636
/*
 * Vertical N-tap scale and conversion to 24-bit packed output, fast
 * (pmulhw-based) variant of yuv2bgr24_X_ar. Only built when HAVE_6REGS is
 * set, since FF_REG_c is needed as an additional pointer register.
 *
 * Only c, dest and dstW are referenced by the body; the filter parameters,
 * alpSrc and dstY are unused here.
 *
 * Operand numbering matches YSCALEYUV2PACKEDX_END: %0 = &c->redDither,
 * %4 = dest, %5 = dstW_reg, %6 = uv_off; FF_REG_c is added to the clobbers.
 */
RENAME(yuv2bgr24_X)637 static void RENAME(yuv2bgr24_X)(SwsContext *c, const int16_t *lumFilter,
638 const int16_t **lumSrc, int lumFilterSize,
639 const int16_t *chrFilter, const int16_t **chrUSrc,
640 const int16_t **chrVSrc,
641 int chrFilterSize, const int16_t **alpSrc,
642 uint8_t *dest, int dstW, int dstY)
643 {
644 x86_reg dummy=0;
645 x86_reg dstW_reg = dstW;
646 x86_reg uv_off = c->uv_offx2;
647
648 YSCALEYUV2PACKEDX
649 YSCALEYUV2RGBX
650 "pxor %%mm7, %%mm7 \n\t"
/* FF_REG_c = dest + 3*index, recomputed on every pass through the outer
 * loop (label "1:" lies above this point). */
651 "lea (%%"FF_REG_a", %%"FF_REG_a", 2), %%"FF_REG_c" \n\t" //FIXME optimize
652 "add %4, %%"FF_REG_c" \n\t"
653 WRITEBGR24(%%FF_REGc, "%5", %%FF_REGa)
654 :: "r" (&c->redDither),
655 "m" (dummy), "m" (dummy), "m" (dummy),
656 "r" (dest), "m" (dstW_reg), "m"(uv_off)
657 NAMED_CONSTRAINTS_ADD(ff_M24A,ff_M24C,ff_M24B)
658 : "%"FF_REG_a, "%"FF_REG_c, "%"FF_REG_d, "%"FF_REG_S
659 );
660 }
661 #endif /* HAVE_6REGS */
662
/*
 * WRITEYUY2 - pack eight luma and four U/V samples into 16 bytes of
 * interleaved output and close the outer loop.
 *
 * Expects 16-bit words already scaled to byte range: mm1 = first four Y,
 * mm7 = next four Y, mm3 = four U, mm4 = four V.
 * The words are packed to unsigned bytes with saturation, U and V are
 * interleaved, and the result is interleaved with Y, so the bytes in memory
 * are Y0 U0 Y1 V0 Y2 U1 Y3 V1 ...
 * 16 bytes are stored at dst + index*2, index is advanced by 8 and control
 * returns to label "1:" while index is (unsigned) below dstw.
 */
663 #define REAL_WRITEYUY2(dst, dstw, index) \
664 "packuswb %%mm3, %%mm3 \n\t"\
665 "packuswb %%mm4, %%mm4 \n\t"\
666 "packuswb %%mm7, %%mm1 \n\t"\
667 "punpcklbw %%mm4, %%mm3 \n\t"\
668 "movq %%mm1, %%mm7 \n\t"\
669 "punpcklbw %%mm3, %%mm1 \n\t"\
670 "punpckhbw %%mm3, %%mm7 \n\t"\
671 \
672 MOVNTQ(%%mm1, (dst, index, 2))\
673 MOVNTQ(%%mm7, 8(dst, index, 2))\
674 \
675 "add $8, "#index" \n\t"\
676 "cmp "dstw", "#index" \n\t"\
677 " jb 1b \n\t"
678 #define WRITEYUY2(dst, dstw, index) REAL_WRITEYUY2(dst, dstw, index)
679
/*
 * Vertical N-tap scale to packed YUYV 4:2:2 output, using the
 * higher-precision filter macros. No colour conversion is performed: the
 * filtered Y, U and V words are shifted right by 3 and interleaved.
 *
 * Only c, dest and dstW are referenced by the body; the filter parameters,
 * alpSrc and dstY are unused here (the asm reads its data through offsets
 * from &c->redDither).
 */
RENAME(yuv2yuyv422_X_ar)680 static void RENAME(yuv2yuyv422_X_ar)(SwsContext *c, const int16_t *lumFilter,
681 const int16_t **lumSrc, int lumFilterSize,
682 const int16_t *chrFilter, const int16_t **chrUSrc,
683 const int16_t **chrVSrc,
684 int chrFilterSize, const int16_t **alpSrc,
685 uint8_t *dest, int dstW, int dstY)
686 {
687 x86_reg dummy=0;
688 x86_reg dstW_reg = dstW;
689 x86_reg uv_off = c->uv_offx2;
690
691 YSCALEYUV2PACKEDX_ACCURATE
692 /* mm1=Y1, %%mm7=Y2, %%mm3=U, %%mm4=V (16-bit words, not yet scaled down) */
693 "psraw $3, %%mm3 \n\t"
694 "psraw $3, %%mm4 \n\t"
695 "psraw $3, %%mm1 \n\t"
696 "psraw $3, %%mm7 \n\t"
697 WRITEYUY2(%4, "%5", %%FF_REGa)
698 YSCALEYUV2PACKEDX_END
699 }
700
/*
 * Vertical N-tap scale to packed YUYV 4:2:2 output, fast (pmulhw-based)
 * variant of yuv2yuyv422_X_ar. No colour conversion is performed: the
 * filtered Y, U and V words are shifted right by 3 and interleaved.
 *
 * Only c, dest and dstW are referenced by the body; the filter parameters,
 * alpSrc and dstY are unused here.
 */
RENAME(yuv2yuyv422_X)701 static void RENAME(yuv2yuyv422_X)(SwsContext *c, const int16_t *lumFilter,
702 const int16_t **lumSrc, int lumFilterSize,
703 const int16_t *chrFilter, const int16_t **chrUSrc,
704 const int16_t **chrVSrc,
705 int chrFilterSize, const int16_t **alpSrc,
706 uint8_t *dest, int dstW, int dstY)
707 {
708 x86_reg dummy=0;
709 x86_reg dstW_reg = dstW;
710 x86_reg uv_off = c->uv_offx2;
711
712 YSCALEYUV2PACKEDX
713 /* mm1=Y1, %%mm7=Y2, %%mm3=U, %%mm4=V (16-bit words, not yet scaled down) */
714 "psraw $3, %%mm3 \n\t"
715 "psraw $3, %%mm4 \n\t"
716 "psraw $3, %%mm1 \n\t"
717 "psraw $3, %%mm7 \n\t"
718 WRITEYUY2(%4, "%5", %%FF_REGa)
719 YSCALEYUV2PACKEDX_END
720 }
721
/*
 * REAL_YSCALEYUV2RGB_UV(index, c) - chroma part of the two-line (bilinear)
 * vertical blend used by the *_2 functions.
 *
 *   index  register used as loop index; it is zeroed here and label "1:"
 *          (the head of the outer loop) is defined right after
 *   c      register holding the base address for the *_OFFSET / *_COEFF
 *          displacements
 * Asm operands %2 and %3 must be the two chroma source lines. V samples are
 * read UV_OFF_BYTE(c) bytes after the U samples; index is temporarily
 * advanced by that amount and restored afterwards.
 *
 * Blend as computed: (line1 >> 4) + (((line0 - line1) * coeff) >> 16), with
 * coeff read from CHR_MMX_FILTER_OFFSET+8(c). The U/V offsets are then
 * subtracted and the green contributions computed.
 *
 * Result: mm2 = U - offset, mm5 = V - offset, mm3 = ug, mm4 = vg.
 * The "eax" / "+2048" wording in the inline comments is historical; the
 * actual index register and V displacement are the macro arguments described
 * above.
 */
722 #define REAL_YSCALEYUV2RGB_UV(index, c) \
723 "xor "#index", "#index" \n\t"\
724 ".p2align 4 \n\t"\
725 "1: \n\t"\
726 "movq (%2, "#index"), %%mm2 \n\t" /* uvbuf0[eax]*/\
727 "movq (%3, "#index"), %%mm3 \n\t" /* uvbuf1[eax]*/\
728 "add "UV_OFF_BYTE"("#c"), "#index" \n\t" \
729 "movq (%2, "#index"), %%mm5 \n\t" /* uvbuf0[eax+2048]*/\
730 "movq (%3, "#index"), %%mm4 \n\t" /* uvbuf1[eax+2048]*/\
731 "sub "UV_OFF_BYTE"("#c"), "#index" \n\t" \
732 "psubw %%mm3, %%mm2 \n\t" /* uvbuf0[eax] - uvbuf1[eax]*/\
733 "psubw %%mm4, %%mm5 \n\t" /* uvbuf0[eax+2048] - uvbuf1[eax+2048]*/\
734 "movq "CHR_MMX_FILTER_OFFSET"+8("#c"), %%mm0 \n\t"\
735 "pmulhw %%mm0, %%mm2 \n\t" /* (uvbuf0[eax] - uvbuf1[eax])uvalpha1>>16*/\
736 "pmulhw %%mm0, %%mm5 \n\t" /* (uvbuf0[eax+2048] - uvbuf1[eax+2048])uvalpha1>>16*/\
737 "psraw $4, %%mm3 \n\t" /* uvbuf1[eax] >>4*/\
738 "psraw $4, %%mm4 \n\t" /* uvbuf1[eax+2048] >>4*/\
739 "paddw %%mm2, %%mm3 \n\t" /* uvbuf0[eax]uvalpha1 + uvbuf1[eax](1-uvalpha1)*/\
740 "paddw %%mm5, %%mm4 \n\t" /* uvbuf0[eax+2048]uvalpha1 + uvbuf1[eax+2048](1-uvalpha1)*/\
741 "psubw "U_OFFSET"("#c"), %%mm3 \n\t" /* (U-128)8*/\
742 "psubw "V_OFFSET"("#c"), %%mm4 \n\t" /* (V-128)8*/\
743 "movq %%mm3, %%mm2 \n\t" /* (U-128)8*/\
744 "movq %%mm4, %%mm5 \n\t" /* (V-128)8*/\
745 "pmulhw "UG_COEFF"("#c"), %%mm3 \n\t"\
746 "pmulhw "VG_COEFF"("#c"), %%mm4 \n\t"\
747 /* mm2=(U-128)8, mm3=ug, mm4=vg mm5=(V-128)8 */\
748
/*
 * REAL_YSCALEYUV2RGB_YA(index, c, b1, b2) - two-line (bilinear) vertical
 * blend of a full-resolution plane (luma or alpha).
 *
 *   index   loop index register; samples are read at offset 2*index
 *   c       base register for LUM_MMX_FILTER_OFFSET
 *   b1, b2  registers/operands holding the two source line pointers
 *
 * Blend as computed: (b2 >> 4) + (((b1 - b2) * coeff) >> 16), with coeff
 * read from LUM_MMX_FILTER_OFFSET+8(c) (also when the macro is used for
 * alpha). Two quadwords are processed per line.
 *
 * Result: mm1 = first four words, mm7 = next four words.
 * Scratch: mm0, mm6.
 */
749 #define REAL_YSCALEYUV2RGB_YA(index, c, b1, b2) \
750 "movq ("#b1", "#index", 2), %%mm0 \n\t" /*buf0[eax]*/\
751 "movq ("#b2", "#index", 2), %%mm1 \n\t" /*buf1[eax]*/\
752 "movq 8("#b1", "#index", 2), %%mm6 \n\t" /*buf0[eax]*/\
753 "movq 8("#b2", "#index", 2), %%mm7 \n\t" /*buf1[eax]*/\
754 "psubw %%mm1, %%mm0 \n\t" /* buf0[eax] - buf1[eax]*/\
755 "psubw %%mm7, %%mm6 \n\t" /* buf0[eax] - buf1[eax]*/\
756 "pmulhw "LUM_MMX_FILTER_OFFSET"+8("#c"), %%mm0 \n\t" /* (buf0[eax] - buf1[eax])yalpha1>>16*/\
757 "pmulhw "LUM_MMX_FILTER_OFFSET"+8("#c"), %%mm6 \n\t" /* (buf0[eax] - buf1[eax])yalpha1>>16*/\
758 "psraw $4, %%mm1 \n\t" /* buf1[eax] >>4*/\
759 "psraw $4, %%mm7 \n\t" /* buf1[eax] >>4*/\
760 "paddw %%mm0, %%mm1 \n\t" /* buf0[eax]yalpha1 + buf1[eax](1-yalpha1) >>16*/\
761 "paddw %%mm6, %%mm7 \n\t" /* buf0[eax]yalpha1 + buf1[eax](1-yalpha1) >>16*/\
762
/*
 * REAL_YSCALEYUV2RGB_COEFF(c) - final colour-conversion step for the *_2
 * functions; "c" is the base register for the coefficient displacements.
 *
 * Input:  mm1 = Y1, mm7 = Y2, mm2 = U - offset, mm5 = V - offset,
 *         mm3 = ug, mm4 = vg (as left by the _UV and _YA macros).
 * Output: mm2 = B, mm4 = G, mm5 = R, eight unsigned bytes each.
 * mm0, mm1, mm3, mm6 and mm7 are overwritten.
 *
 * The instruction sequence matches the second half of YSCALEYUV2RGBX, with
 * the base register given by the argument instead of being fixed to %0.
 */
763 #define REAL_YSCALEYUV2RGB_COEFF(c) \
764 "pmulhw "UB_COEFF"("#c"), %%mm2 \n\t"\
765 "pmulhw "VR_COEFF"("#c"), %%mm5 \n\t"\
766 "psubw "Y_OFFSET"("#c"), %%mm1 \n\t" /* 8(Y-16)*/\
767 "psubw "Y_OFFSET"("#c"), %%mm7 \n\t" /* 8(Y-16)*/\
768 "pmulhw "Y_COEFF"("#c"), %%mm1 \n\t"\
769 "pmulhw "Y_COEFF"("#c"), %%mm7 \n\t"\
770 /* mm1= Y1, mm2=ub, mm3=ug, mm4=vg mm5=vr, mm7=Y2 */\
771 "paddw %%mm3, %%mm4 \n\t"\
772 "movq %%mm2, %%mm0 \n\t"\
773 "movq %%mm5, %%mm6 \n\t"\
774 "movq %%mm4, %%mm3 \n\t"\
775 "punpcklwd %%mm2, %%mm2 \n\t"\
776 "punpcklwd %%mm5, %%mm5 \n\t"\
777 "punpcklwd %%mm4, %%mm4 \n\t"\
778 "paddw %%mm1, %%mm2 \n\t"\
779 "paddw %%mm1, %%mm5 \n\t"\
780 "paddw %%mm1, %%mm4 \n\t"\
781 "punpckhwd %%mm0, %%mm0 \n\t"\
782 "punpckhwd %%mm6, %%mm6 \n\t"\
783 "punpckhwd %%mm3, %%mm3 \n\t"\
784 "paddw %%mm7, %%mm0 \n\t"\
785 "paddw %%mm7, %%mm6 \n\t"\
786 "paddw %%mm7, %%mm3 \n\t"\
787 /* mm0=B1, mm2=B2, mm3=G2, mm4=G1, mm5=R1, mm6=R2 */\
788 "packuswb %%mm0, %%mm2 \n\t"\
789 "packuswb %%mm6, %%mm5 \n\t"\
790 "packuswb %%mm3, %%mm4 \n\t"\
791
/*
 * YSCALEYUV2RGB_YA - expanding wrapper around REAL_YSCALEYUV2RGB_YA so that
 * macro arguments are expanded before being stringified.
 *
 * YSCALEYUV2RGB(index, c) - complete two-line blend and colour conversion:
 * chroma blend, luma blend from asm operands %0 and %1, then the coefficient
 * step. Leaves mm2 = B, mm4 = G, mm5 = R and defines label "1:".
 */
792 #define YSCALEYUV2RGB_YA(index, c, b1, b2) REAL_YSCALEYUV2RGB_YA(index, c, b1, b2)
793
794 #define YSCALEYUV2RGB(index, c) \
795 REAL_YSCALEYUV2RGB_UV(index, c) \
796 REAL_YSCALEYUV2RGB_YA(index, c, %0, %1) \
797 REAL_YSCALEYUV2RGB_COEFF(c)
798
799 /**
800 * vertical bilinear scale YV12 to RGB
 *
 * Blends two source lines per plane and writes 32-bit packed pixels.
 *
 * Only c, buf, ubuf, abuf and dest are referenced by the body. vbuf, dstW,
 * yalpha, uvalpha and y are unused here: V samples are read at a fixed byte
 * distance from the U lines, and the width and blend coefficients are read
 * through offsets from &c->redDither (DSTW_OFFSET, *_MMX_FILTER_OFFSET+8),
 * so they are presumably stored in the context by the caller - TODO confirm.
 *
 * Asm operands: %0 = buf0, %1 = buf1, %2 = ubuf0, %3 = ubuf1, %4 = dest,
 * %5 = &c->redDither (plus %6/%7 = abuf0/abuf1 on x86-64).
 *
 * Three code paths:
 *  - alpha, x86-64: r8 is the loop index, the alpha lines are passed in
 *    registers.
 *  - alpha, otherwise: not enough registers, so the alpha line pointers are
 *    passed through c->u_temp / c->v_temp and temporarily swapped into %0/%1
 *    (saved and restored with push/pop); the base-pointer register serves as
 *    loop index and FF_REG_b holds dest, with its old value kept in the
 *    ESP_OFFSET slot of the context.
 *  - no alpha: same register juggling, fourth byte forced to 0xFF.
 *
 * NOTE(review): the asm statements store to dest and (in two paths) to the
 * context but list no "memory" clobber; the non-x86-64 paths list no
 * clobbers at all.
801 */
RENAME(yuv2rgb32_2)802 static void RENAME(yuv2rgb32_2)(SwsContext *c, const int16_t *buf[2],
803 const int16_t *ubuf[2], const int16_t *vbuf[2],
804 const int16_t *abuf[2], uint8_t *dest,
805 int dstW, int yalpha, int uvalpha, int y)
806 {
807 const int16_t *buf0 = buf[0], *buf1 = buf[1],
808 *ubuf0 = ubuf[0], *ubuf1 = ubuf[1];
809
810 if (CONFIG_SWSCALE_ALPHA && c->needAlpha) {
811 const int16_t *abuf0 = abuf[0], *abuf1 = abuf[1];
812 #if ARCH_X86_64
813 __asm__ volatile(
814 YSCALEYUV2RGB(%%r8, %5)
/* blend the two alpha lines; result in mm1/mm7 */
815 YSCALEYUV2RGB_YA(%%r8, %5, %6, %7)
816 "psraw $3, %%mm1 \n\t" /* abuf0[eax] - abuf1[eax] >>7*/
817 "psraw $3, %%mm7 \n\t" /* abuf0[eax] - abuf1[eax] >>7*/
818 "packuswb %%mm7, %%mm1 \n\t"
819 WRITEBGR32(%4, DSTW_OFFSET"(%5)", %%r8, %%mm2, %%mm4, %%mm5, %%mm1, %%mm0, %%mm7, %%mm3, %%mm6)
820 :: "c" (buf0), "d" (buf1), "S" (ubuf0), "D" (ubuf1), "r" (dest),
821 "a" (&c->redDither),
822 "r" (abuf0), "r" (abuf1)
823 : "%r8"
824 );
825 #else
/* hand the alpha line pointers to the asm through context scratch slots */
826 c->u_temp=(intptr_t)abuf0;
827 c->v_temp=(intptr_t)abuf1;
828 __asm__ volatile(
/* save FF_REG_b in the context, then use it for dest */
829 "mov %%"FF_REG_b", "ESP_OFFSET"(%5) \n\t"
830 "mov %4, %%"FF_REG_b" \n\t"
831 "push %%"FF_REG_BP" \n\t"
832 YSCALEYUV2RGB(%%FF_REGBP, %5)
/* borrow %0/%1 for the alpha lines, restoring the luma pointers below */
833 "push %0 \n\t"
834 "push %1 \n\t"
835 "mov "U_TEMP"(%5), %0 \n\t"
836 "mov "V_TEMP"(%5), %1 \n\t"
837 YSCALEYUV2RGB_YA(%%FF_REGBP, %5, %0, %1)
838 "psraw $3, %%mm1 \n\t" /* abuf0[eax] - abuf1[eax] >>7*/
839 "psraw $3, %%mm7 \n\t" /* abuf0[eax] - abuf1[eax] >>7*/
840 "packuswb %%mm7, %%mm1 \n\t"
841 "pop %1 \n\t"
842 "pop %0 \n\t"
843 WRITEBGR32(%%FF_REGb, DSTW_OFFSET"(%5)", %%FF_REGBP, %%mm2, %%mm4, %%mm5, %%mm1, %%mm0, %%mm7, %%mm3, %%mm6)
844 "pop %%"FF_REG_BP" \n\t"
845 "mov "ESP_OFFSET"(%5), %%"FF_REG_b" \n\t"
846 :: "c" (buf0), "d" (buf1), "S" (ubuf0), "D" (ubuf1), "m" (dest),
847 "a" (&c->redDither)
848 );
849 #endif
850 } else {
851 __asm__ volatile(
/* save FF_REG_b in the context, then use it for dest */
852 "mov %%"FF_REG_b", "ESP_OFFSET"(%5) \n\t"
853 "mov %4, %%"FF_REG_b" \n\t"
854 "push %%"FF_REG_BP" \n\t"
855 YSCALEYUV2RGB(%%FF_REGBP, %5)
/* all bits set: constant 0xFF for the fourth byte of every pixel */
856 "pcmpeqd %%mm7, %%mm7 \n\t"
857 WRITEBGR32(%%FF_REGb, DSTW_OFFSET"(%5)", %%FF_REGBP, %%mm2, %%mm4, %%mm5, %%mm7, %%mm0, %%mm1, %%mm3, %%mm6)
858 "pop %%"FF_REG_BP" \n\t"
859 "mov "ESP_OFFSET"(%5), %%"FF_REG_b" \n\t"
860 :: "c" (buf0), "d" (buf1), "S" (ubuf0), "D" (ubuf1), "m" (dest),
861 "a" (&c->redDither)
862 );
863 }
864 }
865
/*
 * Two-line (bilinear) vertical blend and conversion to 24-bit packed output.
 *
 * Only c, buf, ubuf and dest are referenced by the body; vbuf, abuf, dstW,
 * yalpha, uvalpha and y are unused here (width and blend coefficients are
 * read through offsets from &c->redDither).
 *
 * Asm operands: %0 = buf0, %1 = buf1, %2 = ubuf0, %3 = ubuf1, %4 = dest,
 * %5 = &c->redDither. FF_REG_b is saved in the ESP_OFFSET slot of the
 * context and used as the output pointer (advanced by WRITEBGR24); the
 * base-pointer register is pushed and used as loop index.
 */
RENAME(yuv2bgr24_2)866 static void RENAME(yuv2bgr24_2)(SwsContext *c, const int16_t *buf[2],
867 const int16_t *ubuf[2], const int16_t *vbuf[2],
868 const int16_t *abuf[2], uint8_t *dest,
869 int dstW, int yalpha, int uvalpha, int y)
870 {
871 const int16_t *buf0 = buf[0], *buf1 = buf[1],
872 *ubuf0 = ubuf[0], *ubuf1 = ubuf[1];
873
874 __asm__ volatile(
875 "mov %%"FF_REG_b", "ESP_OFFSET"(%5) \n\t"
876 "mov %4, %%"FF_REG_b" \n\t"
877 "push %%"FF_REG_BP" \n\t"
878 YSCALEYUV2RGB(%%FF_REGBP, %5)
879 "pxor %%mm7, %%mm7 \n\t"
880 WRITEBGR24(%%FF_REGb, DSTW_OFFSET"(%5)", %%FF_REGBP)
881 "pop %%"FF_REG_BP" \n\t"
882 "mov "ESP_OFFSET"(%5), %%"FF_REG_b" \n\t"
883 :: "c" (buf0), "d" (buf1), "S" (ubuf0), "D" (ubuf1), "m" (dest),
884 "a" (&c->redDither)
885 NAMED_CONSTRAINTS_ADD(ff_M24A,ff_M24C,ff_M24B)
886 );
887 }
888
/*
 * Two-line (bilinear) vertical blend and conversion to 15-bit (5-5-5)
 * output.
 *
 * Only c, buf, ubuf and dest are referenced by the body; vbuf, abuf, dstW,
 * yalpha, uvalpha and y are unused here (width and blend coefficients are
 * read through offsets from &c->redDither).
 *
 * Asm operands: %0 = buf0, %1 = buf1, %2 = ubuf0, %3 = ubuf1, %4 = dest,
 * %5 = &c->redDither. FF_REG_b is saved in the ESP_OFFSET slot of the
 * context and holds dest; the base-pointer register is pushed and used as
 * loop index. When DITHER1XBPP is defined, per-channel dither values from
 * the context are added with unsigned saturation before packing.
 */
RENAME(yuv2rgb555_2)889 static void RENAME(yuv2rgb555_2)(SwsContext *c, const int16_t *buf[2],
890 const int16_t *ubuf[2], const int16_t *vbuf[2],
891 const int16_t *abuf[2], uint8_t *dest,
892 int dstW, int yalpha, int uvalpha, int y)
893 {
894 const int16_t *buf0 = buf[0], *buf1 = buf[1],
895 *ubuf0 = ubuf[0], *ubuf1 = ubuf[1];
896
897 __asm__ volatile(
898 "mov %%"FF_REG_b", "ESP_OFFSET"(%5) \n\t"
899 "mov %4, %%"FF_REG_b" \n\t"
900 "push %%"FF_REG_BP" \n\t"
901 YSCALEYUV2RGB(%%FF_REGBP, %5)
902 "pxor %%mm7, %%mm7 \n\t"
903 /* mm2=B, %%mm4=G, %%mm5=R, %%mm7=0 */
904 #ifdef DITHER1XBPP
905 "paddusb "BLUE_DITHER"(%5), %%mm2 \n\t"
906 "paddusb "GREEN_DITHER"(%5), %%mm4 \n\t"
907 "paddusb "RED_DITHER"(%5), %%mm5 \n\t"
908 #endif
909 WRITERGB15(%%FF_REGb, DSTW_OFFSET"(%5)", %%FF_REGBP)
910 "pop %%"FF_REG_BP" \n\t"
911 "mov "ESP_OFFSET"(%5), %%"FF_REG_b" \n\t"
912 :: "c" (buf0), "d" (buf1), "S" (ubuf0), "D" (ubuf1), "m" (dest),
913 "a" (&c->redDither)
914 NAMED_CONSTRAINTS_ADD(bF8)
915 );
916 }
917
/**
 * yuv2packed2 writer for AV_PIX_FMT_RGB565 (installed by sws_init_swscale).
 *
 * Asm operands: %0 = buf[0], %1 = buf[1], %2 = ubuf[0], %3 = ubuf[1],
 * %4 = dest (memory operand), %5 = &c->redDither, used as the base address
 * for ESP_OFFSET, DSTW_OFFSET and the dither offsets.
 * vbuf, abuf, dstW, yalpha, uvalpha and y are not referenced by the C code
 * of this function.
 * NOTE(review): YSCALEYUV2RGB and WRITERGB16 are defined outside this
 * section; presumably they pick up the blend weights and dstW through the
 * %5 base -- confirm against their definitions.
 */
RENAME(yuv2rgb565_2)918 static void RENAME(yuv2rgb565_2)(SwsContext *c, const int16_t *buf[2],
919 const int16_t *ubuf[2], const int16_t *vbuf[2],
920 const int16_t *abuf[2], uint8_t *dest,
921 int dstW, int yalpha, int uvalpha, int y)
922 {
923 const int16_t *buf0 = buf[0], *buf1 = buf[1],
924 *ubuf0 = ubuf[0], *ubuf1 = ubuf[1];
925
926 __asm__ volatile(
/* park the b register in the context (ESP_OFFSET from %5), then reuse it for dest */
927 "mov %%"FF_REG_b", "ESP_OFFSET"(%5) \n\t"
928 "mov %4, %%"FF_REG_b" \n\t"
/* the BP register is handed to the macros below, so it is saved around them */
929 "push %%"FF_REG_BP" \n\t"
930 YSCALEYUV2RGB(%%FF_REGBP, %5)
931 "pxor %%mm7, %%mm7 \n\t"
932 /* mm2=B, %%mm4=G, %%mm5=R, %%mm7=0 */
/* optional dithering: saturating byte add of the per-channel dither values */
933 #ifdef DITHER1XBPP
934 "paddusb "BLUE_DITHER"(%5), %%mm2 \n\t"
935 "paddusb "GREEN_DITHER"(%5), %%mm4 \n\t"
936 "paddusb "RED_DITHER"(%5), %%mm5 \n\t"
937 #endif
938 WRITERGB16(%%FF_REGb, DSTW_OFFSET"(%5)", %%FF_REGBP)
/* restore the BP and b registers */
939 "pop %%"FF_REG_BP" \n\t"
940 "mov "ESP_OFFSET"(%5), %%"FF_REG_b" \n\t"
941 :: "c" (buf0), "d" (buf1), "S" (ubuf0), "D" (ubuf1), "m" (dest),
942 "a" (&c->redDither)
943 NAMED_CONSTRAINTS_ADD(bF8,bFC)
944 );
945 }
946
/*
 * Loop head for the two-line packed (non-RGB) output path.
 *
 * index: register used as the loop counter (zeroed here; label "1:" is the
 *        loop entry, the matching branch is expected in the WRITE* macro
 *        that follows the expansion).
 * c:     base operand for the CHR_MMX_FILTER_OFFSET / LUM_MMX_FILTER_OFFSET /
 *        UV_OFF_BYTE context offsets.
 * Asm operands used: %0/%1 = the two luma lines, %2/%3 = the two chroma lines.
 *
 * Before the loop, the coefficients stored at CHR_MMX_FILTER_OFFSET+8 and
 * LUM_MMX_FILTER_OFFSET+8 are shifted right by 3 and written back in place,
 * i.e. the context memory is modified by every expansion.
 * Inside the loop the second chroma plane is read at byte offset
 * UV_OFF_BYTE(c) from the first one; "[eax+2048]" in the per-line comments
 * denotes that access and "eax" stands for the index register.
 * On exit from the visible part: mm3/mm4 hold the blended chroma words and
 * mm1/mm7 the blended luma words for index..index+3 and index+4..index+7.
 */
947 #define REAL_YSCALEYUV2PACKED(index, c) \
948 "movq "CHR_MMX_FILTER_OFFSET"+8("#c"), %%mm0 \n\t"\
949 "movq "LUM_MMX_FILTER_OFFSET"+8("#c"), %%mm1 \n\t"\
950 "psraw $3, %%mm0 \n\t"\
951 "psraw $3, %%mm1 \n\t"\
952 "movq %%mm0, "CHR_MMX_FILTER_OFFSET"+8("#c") \n\t"\
953 "movq %%mm1, "LUM_MMX_FILTER_OFFSET"+8("#c") \n\t"\
954 "xor "#index", "#index" \n\t"\
955 ".p2align 4 \n\t"\
956 "1: \n\t"\
957 "movq (%2, "#index"), %%mm2 \n\t" /* uvbuf0[eax]*/\
958 "movq (%3, "#index"), %%mm3 \n\t" /* uvbuf1[eax]*/\
959 "add "UV_OFF_BYTE"("#c"), "#index" \n\t" \
960 "movq (%2, "#index"), %%mm5 \n\t" /* uvbuf0[eax+2048]*/\
961 "movq (%3, "#index"), %%mm4 \n\t" /* uvbuf1[eax+2048]*/\
962 "sub "UV_OFF_BYTE"("#c"), "#index" \n\t" \
963 "psubw %%mm3, %%mm2 \n\t" /* uvbuf0[eax] - uvbuf1[eax]*/\
964 "psubw %%mm4, %%mm5 \n\t" /* uvbuf0[eax+2048] - uvbuf1[eax+2048]*/\
965 "movq "CHR_MMX_FILTER_OFFSET"+8("#c"), %%mm0 \n\t"\
966 "pmulhw %%mm0, %%mm2 \n\t" /* (uvbuf0[eax] - uvbuf1[eax])uvalpha1>>16*/\
967 "pmulhw %%mm0, %%mm5 \n\t" /* (uvbuf0[eax+2048] - uvbuf1[eax+2048])uvalpha1>>16*/\
968 "psraw $7, %%mm3 \n\t" /* uvbuf1[eax] >>7*/\
969 "psraw $7, %%mm4 \n\t" /* uvbuf1[eax+2048] >>7*/\
970 "paddw %%mm2, %%mm3 \n\t" /* uvbuf0[eax]uvalpha1 - uvbuf1[eax](1-uvalpha1)*/\
971 "paddw %%mm5, %%mm4 \n\t" /* uvbuf0[eax+2048]uvalpha1 - uvbuf1[eax+2048](1-uvalpha1)*/\
972 "movq (%0, "#index", 2), %%mm0 \n\t" /*buf0[eax]*/\
973 "movq (%1, "#index", 2), %%mm1 \n\t" /*buf1[eax]*/\
974 "movq 8(%0, "#index", 2), %%mm6 \n\t" /*buf0[eax+4]*/\
975 "movq 8(%1, "#index", 2), %%mm7 \n\t" /*buf1[eax+4]*/\
976 "psubw %%mm1, %%mm0 \n\t" /* buf0[eax] - buf1[eax]*/\
977 "psubw %%mm7, %%mm6 \n\t" /* buf0[eax+4] - buf1[eax+4]*/\
978 "pmulhw "LUM_MMX_FILTER_OFFSET"+8("#c"), %%mm0 \n\t" /* (buf0[eax] - buf1[eax])yalpha1>>16*/\
979 "pmulhw "LUM_MMX_FILTER_OFFSET"+8("#c"), %%mm6 \n\t" /* (buf0[eax+4] - buf1[eax+4])yalpha1>>16*/\
980 "psraw $7, %%mm1 \n\t" /* buf1[eax] >>7*/\
981 "psraw $7, %%mm7 \n\t" /* buf1[eax+4] >>7*/\
982 "paddw %%mm0, %%mm1 \n\t" /* buf0[eax]yalpha1 + buf1[eax](1-yalpha1) >>16*/\
983 "paddw %%mm6, %%mm7 \n\t" /* buf0[eax]yalpha1 + buf1[eax](1-yalpha1) >>16*/\
984
/* Extra expansion level so that macro arguments are expanded before being stringified. */
985 #define YSCALEYUV2PACKED(index, c) REAL_YSCALEYUV2PACKED(index, c)
986
/**
 * yuv2packed2 writer for AV_PIX_FMT_YUYV422 (installed by sws_init_swscale).
 *
 * Asm operands: %0 = buf[0], %1 = buf[1], %2 = ubuf[0], %3 = ubuf[1],
 * %4 = dest (memory operand), %5 = &c->redDither, used as the base address
 * for ESP_OFFSET, DSTW_OFFSET and the filter offsets read by
 * YSCALEYUV2PACKED.
 * vbuf, abuf, dstW, yalpha, uvalpha and y are not referenced by the C code
 * of this function; the second chroma plane is reached from ubuf via
 * UV_OFF_BYTE inside YSCALEYUV2PACKED.
 * Note that YSCALEYUV2PACKED rewrites the coefficients stored at
 * CHR_MMX_FILTER_OFFSET+8 and LUM_MMX_FILTER_OFFSET+8 (shifted right by 3).
 * NOTE(review): WRITEYUY2 is defined outside this section.
 */
RENAME(yuv2yuyv422_2)987 static void RENAME(yuv2yuyv422_2)(SwsContext *c, const int16_t *buf[2],
988 const int16_t *ubuf[2], const int16_t *vbuf[2],
989 const int16_t *abuf[2], uint8_t *dest,
990 int dstW, int yalpha, int uvalpha, int y)
991 {
992 const int16_t *buf0 = buf[0], *buf1 = buf[1],
993 *ubuf0 = ubuf[0], *ubuf1 = ubuf[1];
994
995 __asm__ volatile(
/* park the b register in the context (ESP_OFFSET from %5), then reuse it for dest */
996 "mov %%"FF_REG_b", "ESP_OFFSET"(%5) \n\t"
997 "mov %4, %%"FF_REG_b" \n\t"
/* the BP register is the loop index of YSCALEYUV2PACKED, so it is saved around it */
998 "push %%"FF_REG_BP" \n\t"
999 YSCALEYUV2PACKED(%%FF_REGBP, %5)
1000 WRITEYUY2(%%FF_REGb, DSTW_OFFSET"(%5)", %%FF_REGBP)
/* restore the BP and b registers */
1001 "pop %%"FF_REG_BP" \n\t"
1002 "mov "ESP_OFFSET"(%5), %%"FF_REG_b" \n\t"
1003 :: "c" (buf0), "d" (buf1), "S" (ubuf0), "D" (ubuf1), "m" (dest),
1004 "a" (&c->redDither)
1005 );
1006 }
1007
/*
 * Loop head for single-line YUV -> RGB conversion using one chroma line only.
 *
 * index: register used as the loop counter (zeroed here; label "1:" is the
 *        loop entry, the matching branch is expected in the WRITE* macro
 *        that follows the expansion).
 * c:     base operand for the U_OFFSET / V_OFFSET / Y_OFFSET and *_COEFF
 *        context offsets.
 * Asm operands used: %0 = luma line, %2 = chroma line. %1 and %3 are not
 * read by this macro.
 *
 * The second chroma plane is read at byte offset UV_OFF_BYTE(c) from the
 * first one; "[eax+2048]" in the per-line comments denotes that access and
 * "eax" stands for the index register.
 * Result: mm2 = packed B, mm4 = packed G, mm5 = packed R (8 unsigned bytes
 * each, saturated by packuswb).
 */
1008 #define REAL_YSCALEYUV2RGB1(index, c) \
1009 "xor "#index", "#index" \n\t"\
1010 ".p2align 4 \n\t"\
1011 "1: \n\t"\
1012 "movq (%2, "#index"), %%mm3 \n\t" /* uvbuf0[eax]*/\
1013 "add "UV_OFF_BYTE"("#c"), "#index" \n\t" \
1014 "movq (%2, "#index"), %%mm4 \n\t" /* uvbuf0[eax+2048]*/\
1015 "sub "UV_OFF_BYTE"("#c"), "#index" \n\t" \
1016 "psraw $4, %%mm3 \n\t" /* uvbuf0[eax] >>4*/\
1017 "psraw $4, %%mm4 \n\t" /* uvbuf0[eax+2048] >>4*/\
1018 "psubw "U_OFFSET"("#c"), %%mm3 \n\t" /* (U-128)8*/\
1019 "psubw "V_OFFSET"("#c"), %%mm4 \n\t" /* (V-128)8*/\
1020 "movq %%mm3, %%mm2 \n\t" /* (U-128)8*/\
1021 "movq %%mm4, %%mm5 \n\t" /* (V-128)8*/\
1022 "pmulhw "UG_COEFF"("#c"), %%mm3 \n\t"\
1023 "pmulhw "VG_COEFF"("#c"), %%mm4 \n\t"\
1024 /* mm2=(U-128)8, mm3=ug, mm4=vg mm5=(V-128)8 */\
1025 "movq (%0, "#index", 2), %%mm1 \n\t" /*buf0[eax]*/\
1026 "movq 8(%0, "#index", 2), %%mm7 \n\t" /*buf0[eax+4]*/\
1027 "psraw $4, %%mm1 \n\t" /* buf0[eax] >>4*/\
1028 "psraw $4, %%mm7 \n\t" /* buf0[eax+4] >>4*/\
1029 "pmulhw "UB_COEFF"("#c"), %%mm2 \n\t"\
1030 "pmulhw "VR_COEFF"("#c"), %%mm5 \n\t"\
1031 "psubw "Y_OFFSET"("#c"), %%mm1 \n\t" /* 8(Y-16)*/\
1032 "psubw "Y_OFFSET"("#c"), %%mm7 \n\t" /* 8(Y-16)*/\
1033 "pmulhw "Y_COEFF"("#c"), %%mm1 \n\t"\
1034 "pmulhw "Y_COEFF"("#c"), %%mm7 \n\t"\
1035 /* mm1= Y1, mm2=ub, mm3=ug, mm4=vg mm5=vr, mm7=Y2 */\
1036 "paddw %%mm3, %%mm4 \n\t"\
1037 "movq %%mm2, %%mm0 \n\t"\
1038 "movq %%mm5, %%mm6 \n\t"\
1039 "movq %%mm4, %%mm3 \n\t"\
1040 "punpcklwd %%mm2, %%mm2 \n\t"\
1041 "punpcklwd %%mm5, %%mm5 \n\t"\
1042 "punpcklwd %%mm4, %%mm4 \n\t"\
1043 "paddw %%mm1, %%mm2 \n\t"\
1044 "paddw %%mm1, %%mm5 \n\t"\
1045 "paddw %%mm1, %%mm4 \n\t"\
1046 "punpckhwd %%mm0, %%mm0 \n\t"\
1047 "punpckhwd %%mm6, %%mm6 \n\t"\
1048 "punpckhwd %%mm3, %%mm3 \n\t"\
1049 "paddw %%mm7, %%mm0 \n\t"\
1050 "paddw %%mm7, %%mm6 \n\t"\
1051 "paddw %%mm7, %%mm3 \n\t"\
1052 /* mm0=B1, mm2=B2, mm3=G2, mm4=G1, mm5=R1, mm6=R2 */\
1053 "packuswb %%mm0, %%mm2 \n\t"\
1054 "packuswb %%mm6, %%mm5 \n\t"\
1055 "packuswb %%mm3, %%mm4 \n\t"\
1056
/* Extra expansion level so that macro arguments are expanded before being stringified. */
1057 #define YSCALEYUV2RGB1(index, c) REAL_YSCALEYUV2RGB1(index, c)
1058
1059 // do vertical chrominance interpolation
/*
 * Same as REAL_YSCALEYUV2RGB1, except that the chroma input is the sum of
 * the two chroma lines (%2 and %3) shifted right by 5, i.e. their average
 * shifted right by 4. The shift is a logical one (psrlw) applied after a
 * wrapping 16-bit add, hence the "might overflow" FIXME below.
 *
 * index: register used as the loop counter (zeroed here; label "1:" is the
 *        loop entry, the matching branch is expected in the WRITE* macro
 *        that follows the expansion).
 * c:     base operand for the U_OFFSET / V_OFFSET / Y_OFFSET and *_COEFF
 *        context offsets.
 * Asm operands used: %0 = luma line, %2/%3 = the two chroma lines.
 * "[eax+2048]" in the per-line comments denotes the access at byte offset
 * UV_OFF_BYTE(c); "eax" stands for the index register.
 * Result: mm2 = packed B, mm4 = packed G, mm5 = packed R.
 */
1060 #define REAL_YSCALEYUV2RGB1b(index, c) \
1061 "xor "#index", "#index" \n\t"\
1062 ".p2align 4 \n\t"\
1063 "1: \n\t"\
1064 "movq (%2, "#index"), %%mm2 \n\t" /* uvbuf0[eax]*/\
1065 "movq (%3, "#index"), %%mm3 \n\t" /* uvbuf1[eax]*/\
1066 "add "UV_OFF_BYTE"("#c"), "#index" \n\t" \
1067 "movq (%2, "#index"), %%mm5 \n\t" /* uvbuf0[eax+2048]*/\
1068 "movq (%3, "#index"), %%mm4 \n\t" /* uvbuf1[eax+2048]*/\
1069 "sub "UV_OFF_BYTE"("#c"), "#index" \n\t" \
1070 "paddw %%mm2, %%mm3 \n\t" /* uvbuf0[eax] + uvbuf1[eax]*/\
1071 "paddw %%mm5, %%mm4 \n\t" /* uvbuf0[eax+2048] + uvbuf1[eax+2048]*/\
1072 "psrlw $5, %%mm3 \n\t" /*FIXME might overflow*/\
1073 "psrlw $5, %%mm4 \n\t" /*FIXME might overflow*/\
1074 "psubw "U_OFFSET"("#c"), %%mm3 \n\t" /* (U-128)8*/\
1075 "psubw "V_OFFSET"("#c"), %%mm4 \n\t" /* (V-128)8*/\
1076 "movq %%mm3, %%mm2 \n\t" /* (U-128)8*/\
1077 "movq %%mm4, %%mm5 \n\t" /* (V-128)8*/\
1078 "pmulhw "UG_COEFF"("#c"), %%mm3 \n\t"\
1079 "pmulhw "VG_COEFF"("#c"), %%mm4 \n\t"\
1080 /* mm2=(U-128)8, mm3=ug, mm4=vg mm5=(V-128)8 */\
1081 "movq (%0, "#index", 2), %%mm1 \n\t" /*buf0[eax]*/\
1082 "movq 8(%0, "#index", 2), %%mm7 \n\t" /*buf0[eax+4]*/\
1083 "psraw $4, %%mm1 \n\t" /* buf0[eax] >>4*/\
1084 "psraw $4, %%mm7 \n\t" /* buf0[eax+4] >>4*/\
1085 "pmulhw "UB_COEFF"("#c"), %%mm2 \n\t"\
1086 "pmulhw "VR_COEFF"("#c"), %%mm5 \n\t"\
1087 "psubw "Y_OFFSET"("#c"), %%mm1 \n\t" /* 8(Y-16)*/\
1088 "psubw "Y_OFFSET"("#c"), %%mm7 \n\t" /* 8(Y-16)*/\
1089 "pmulhw "Y_COEFF"("#c"), %%mm1 \n\t"\
1090 "pmulhw "Y_COEFF"("#c"), %%mm7 \n\t"\
1091 /* mm1= Y1, mm2=ub, mm3=ug, mm4=vg mm5=vr, mm7=Y2 */\
1092 "paddw %%mm3, %%mm4 \n\t"\
1093 "movq %%mm2, %%mm0 \n\t"\
1094 "movq %%mm5, %%mm6 \n\t"\
1095 "movq %%mm4, %%mm3 \n\t"\
1096 "punpcklwd %%mm2, %%mm2 \n\t"\
1097 "punpcklwd %%mm5, %%mm5 \n\t"\
1098 "punpcklwd %%mm4, %%mm4 \n\t"\
1099 "paddw %%mm1, %%mm2 \n\t"\
1100 "paddw %%mm1, %%mm5 \n\t"\
1101 "paddw %%mm1, %%mm4 \n\t"\
1102 "punpckhwd %%mm0, %%mm0 \n\t"\
1103 "punpckhwd %%mm6, %%mm6 \n\t"\
1104 "punpckhwd %%mm3, %%mm3 \n\t"\
1105 "paddw %%mm7, %%mm0 \n\t"\
1106 "paddw %%mm7, %%mm6 \n\t"\
1107 "paddw %%mm7, %%mm3 \n\t"\
1108 /* mm0=B1, mm2=B2, mm3=G2, mm4=G1, mm5=R1, mm6=R2 */\
1109 "packuswb %%mm0, %%mm2 \n\t"\
1110 "packuswb %%mm6, %%mm5 \n\t"\
1111 "packuswb %%mm3, %%mm4 \n\t"\
1112
/* Extra expansion level so that macro arguments are expanded before being stringified. */
1113 #define YSCALEYUV2RGB1b(index, c) REAL_YSCALEYUV2RGB1b(index, c)
1114
/*
 * Load 8 alpha samples for the single-line RGB32 path.
 *
 * Reads two quadwords of 16-bit samples from asm operand %1 at element
 * offsets index and index+4, shifts each word right by 7 (arithmetic) and
 * packs them with unsigned saturation into 8 bytes in mm7. mm1 is used as
 * scratch. The caller must therefore bind the alpha line to operand %1.
 */
1115 #define REAL_YSCALEYUV2RGB1_ALPHA(index) \
1116 "movq (%1, "#index", 2), %%mm7 \n\t" /* abuf0[index ] */\
1117 "movq 8(%1, "#index", 2), %%mm1 \n\t" /* abuf0[index+4] */\
1118 "psraw $7, %%mm7 \n\t" /* abuf0[index ] >>7 */\
1119 "psraw $7, %%mm1 \n\t" /* abuf0[index+4] >>7 */\
1120 "packuswb %%mm1, %%mm7 \n\t"
/* Extra expansion level so that macro arguments are expanded before being stringified. */
1121 #define YSCALEYUV2RGB1_ALPHA(index) REAL_YSCALEYUV2RGB1_ALPHA(index)
1122
1123 /**
1124 * YV12 to RGB without scaling or interpolating
 *
 * yuv2packed1 writer for AV_PIX_FMT_RGB32 (installed by sws_init_swscale).
 *
 * Four asm variants are selected at run time:
 *  - uvalpha < 2048: YSCALEYUV2RGB1, which reads only the first chroma
 *    line (operand %2); otherwise YSCALEYUV2RGB1b, which averages ubuf[0]
 *    and ubuf[1] (operands %2 and %3).
 *  - CONFIG_SWSCALE_ALPHA && c->needAlpha: abuf0 is bound to operand %1 and
 *    YSCALEYUV2RGB1_ALPHA loads the alpha bytes into mm7; otherwise mm7 is
 *    set to all ones by pcmpeqd.
 * In every variant %0 = buf0, %4 = dest (memory operand) and
 * %5 = &c->redDither, the base for ESP_OFFSET, DSTW_OFFSET and the
 * coefficient offsets. vbuf, dstW and y are not referenced by the C code
 * of this function.
 * NOTE(review): WRITEBGR32 is defined outside this section.
1125 */
RENAME(yuv2rgb32_1)1126 static void RENAME(yuv2rgb32_1)(SwsContext *c, const int16_t *buf0,
1127 const int16_t *ubuf[2], const int16_t *vbuf[2],
1128 const int16_t *abuf0, uint8_t *dest,
1129 int dstW, int uvalpha, int y)
1130 {
1131 const int16_t *ubuf0 = ubuf[0];
1132 const int16_t *buf1= buf0; //FIXME needed for RGB1/BGR1
1133
1134 if (uvalpha < 2048) { // note this is not correct (shifts chrominance by 0.5 pixels) but it is a bit faster
1135 const int16_t *ubuf1 = ubuf[0];
1136 if (CONFIG_SWSCALE_ALPHA && c->needAlpha) {
1137 __asm__ volatile(
/* park the b register in the context, reuse it for dest; BP is the loop index */
1138 "mov %%"FF_REG_b", "ESP_OFFSET"(%5) \n\t"
1139 "mov %4, %%"FF_REG_b" \n\t"
1140 "push %%"FF_REG_BP" \n\t"
1141 YSCALEYUV2RGB1(%%FF_REGBP, %5)
1142 YSCALEYUV2RGB1_ALPHA(%%FF_REGBP)
1143 WRITEBGR32(%%FF_REGb, DSTW_OFFSET"(%5)", %%FF_REGBP, %%mm2, %%mm4, %%mm5, %%mm7, %%mm0, %%mm1, %%mm3, %%mm6)
1144 "pop %%"FF_REG_BP" \n\t"
1145 "mov "ESP_OFFSET"(%5), %%"FF_REG_b" \n\t"
/* operand %1 is the alpha line here, as required by YSCALEYUV2RGB1_ALPHA */
1146 :: "c" (buf0), "d" (abuf0), "S" (ubuf0), "D" (ubuf1), "m" (dest),
1147 "a" (&c->redDither)
1148 );
1149 } else {
1150 __asm__ volatile(
1151 "mov %%"FF_REG_b", "ESP_OFFSET"(%5) \n\t"
1152 "mov %4, %%"FF_REG_b" \n\t"
1153 "push %%"FF_REG_BP" \n\t"
1154 YSCALEYUV2RGB1(%%FF_REGBP, %5)
/* no alpha plane: mm7 = all ones */
1155 "pcmpeqd %%mm7, %%mm7 \n\t"
1156 WRITEBGR32(%%FF_REGb, DSTW_OFFSET"(%5)", %%FF_REGBP, %%mm2, %%mm4, %%mm5, %%mm7, %%mm0, %%mm1, %%mm3, %%mm6)
1157 "pop %%"FF_REG_BP" \n\t"
1158 "mov "ESP_OFFSET"(%5), %%"FF_REG_b" \n\t"
1159 :: "c" (buf0), "d" (buf1), "S" (ubuf0), "D" (ubuf1), "m" (dest),
1160 "a" (&c->redDither)
1161 );
1162 }
1163 } else {
1164 const int16_t *ubuf1 = ubuf[1];
1165 if (CONFIG_SWSCALE_ALPHA && c->needAlpha) {
1166 __asm__ volatile(
1167 "mov %%"FF_REG_b", "ESP_OFFSET"(%5) \n\t"
1168 "mov %4, %%"FF_REG_b" \n\t"
1169 "push %%"FF_REG_BP" \n\t"
1170 YSCALEYUV2RGB1b(%%FF_REGBP, %5)
1171 YSCALEYUV2RGB1_ALPHA(%%FF_REGBP)
1172 WRITEBGR32(%%FF_REGb, DSTW_OFFSET"(%5)", %%FF_REGBP, %%mm2, %%mm4, %%mm5, %%mm7, %%mm0, %%mm1, %%mm3, %%mm6)
1173 "pop %%"FF_REG_BP" \n\t"
1174 "mov "ESP_OFFSET"(%5), %%"FF_REG_b" \n\t"
/* operand %1 is the alpha line here, as required by YSCALEYUV2RGB1_ALPHA */
1175 :: "c" (buf0), "d" (abuf0), "S" (ubuf0), "D" (ubuf1), "m" (dest),
1176 "a" (&c->redDither)
1177 );
1178 } else {
1179 __asm__ volatile(
1180 "mov %%"FF_REG_b", "ESP_OFFSET"(%5) \n\t"
1181 "mov %4, %%"FF_REG_b" \n\t"
1182 "push %%"FF_REG_BP" \n\t"
1183 YSCALEYUV2RGB1b(%%FF_REGBP, %5)
/* no alpha plane: mm7 = all ones */
1184 "pcmpeqd %%mm7, %%mm7 \n\t"
1185 WRITEBGR32(%%FF_REGb, DSTW_OFFSET"(%5)", %%FF_REGBP, %%mm2, %%mm4, %%mm5, %%mm7, %%mm0, %%mm1, %%mm3, %%mm6)
1186 "pop %%"FF_REG_BP" \n\t"
1187 "mov "ESP_OFFSET"(%5), %%"FF_REG_b" \n\t"
1188 :: "c" (buf0), "d" (buf1), "S" (ubuf0), "D" (ubuf1), "m" (dest),
1189 "a" (&c->redDither)
1190 );
1191 }
1192 }
1193 }
1194
/**
 * yuv2packed1 writer for AV_PIX_FMT_BGR24 (installed by sws_init_swscale).
 *
 * uvalpha < 2048 selects YSCALEYUV2RGB1, which reads only the first chroma
 * line (operand %2); otherwise YSCALEYUV2RGB1b averages ubuf[0] and ubuf[1]
 * (operands %2 and %3).
 * Asm operands: %0 = buf0, %1 = buf1 (an alias of buf0), %2 = ubuf0,
 * %3 = ubuf1, %4 = dest (memory operand), %5 = &c->redDither, the base for
 * ESP_OFFSET, DSTW_OFFSET and the coefficient offsets.
 * vbuf, abuf0, dstW and y are not referenced by the C code of this function.
 * NOTE(review): WRITEBGR24 is defined outside this section.
 */
RENAME(yuv2bgr24_1)1195 static void RENAME(yuv2bgr24_1)(SwsContext *c, const int16_t *buf0,
1196 const int16_t *ubuf[2], const int16_t *vbuf[2],
1197 const int16_t *abuf0, uint8_t *dest,
1198 int dstW, int uvalpha, int y)
1199 {
1200 const int16_t *ubuf0 = ubuf[0];
1201 const int16_t *buf1= buf0; //FIXME needed for RGB1/BGR1
1202
1203 if (uvalpha < 2048) { // note this is not correct (shifts chrominance by 0.5 pixels) but it is a bit faster
1204 const int16_t *ubuf1 = ubuf[0];
1205 __asm__ volatile(
/* park the b register in the context, reuse it for dest; BP is the loop index */
1206 "mov %%"FF_REG_b", "ESP_OFFSET"(%5) \n\t"
1207 "mov %4, %%"FF_REG_b" \n\t"
1208 "push %%"FF_REG_BP" \n\t"
1209 YSCALEYUV2RGB1(%%FF_REGBP, %5)
1210 "pxor %%mm7, %%mm7 \n\t"
1211 WRITEBGR24(%%FF_REGb, DSTW_OFFSET"(%5)", %%FF_REGBP)
1212 "pop %%"FF_REG_BP" \n\t"
1213 "mov "ESP_OFFSET"(%5), %%"FF_REG_b" \n\t"
1214 :: "c" (buf0), "d" (buf1), "S" (ubuf0), "D" (ubuf1), "m" (dest),
1215 "a" (&c->redDither)
1216 NAMED_CONSTRAINTS_ADD(ff_M24A,ff_M24C,ff_M24B)
1217 );
1218 } else {
1219 const int16_t *ubuf1 = ubuf[1];
1220 __asm__ volatile(
1221 "mov %%"FF_REG_b", "ESP_OFFSET"(%5) \n\t"
1222 "mov %4, %%"FF_REG_b" \n\t"
1223 "push %%"FF_REG_BP" \n\t"
1224 YSCALEYUV2RGB1b(%%FF_REGBP, %5)
1225 "pxor %%mm7, %%mm7 \n\t"
1226 WRITEBGR24(%%FF_REGb, DSTW_OFFSET"(%5)", %%FF_REGBP)
1227 "pop %%"FF_REG_BP" \n\t"
1228 "mov "ESP_OFFSET"(%5), %%"FF_REG_b" \n\t"
1229 :: "c" (buf0), "d" (buf1), "S" (ubuf0), "D" (ubuf1), "m" (dest),
1230 "a" (&c->redDither)
1231 NAMED_CONSTRAINTS_ADD(ff_M24A,ff_M24C,ff_M24B)
1232 );
1233 }
1234 }
1235
/**
 * yuv2packed1 writer for AV_PIX_FMT_RGB555 (installed by sws_init_swscale).
 *
 * uvalpha < 2048 selects YSCALEYUV2RGB1, which reads only the first chroma
 * line (operand %2); otherwise YSCALEYUV2RGB1b averages ubuf[0] and ubuf[1]
 * (operands %2 and %3). When DITHER1XBPP is defined, the per-channel dither
 * values are added with unsigned saturation before the store.
 * Asm operands: %0 = buf0, %1 = buf1 (an alias of buf0), %2 = ubuf0,
 * %3 = ubuf1, %4 = dest (memory operand), %5 = &c->redDither, the base for
 * ESP_OFFSET, DSTW_OFFSET and the dither/coefficient offsets.
 * vbuf, abuf0, dstW and y are not referenced by the C code of this function.
 * NOTE(review): WRITERGB15 is defined outside this section.
 */
RENAME(yuv2rgb555_1)1236 static void RENAME(yuv2rgb555_1)(SwsContext *c, const int16_t *buf0,
1237 const int16_t *ubuf[2], const int16_t *vbuf[2],
1238 const int16_t *abuf0, uint8_t *dest,
1239 int dstW, int uvalpha, int y)
1240 {
1241 const int16_t *ubuf0 = ubuf[0];
1242 const int16_t *buf1= buf0; //FIXME needed for RGB1/BGR1
1243
1244 if (uvalpha < 2048) { // note this is not correct (shifts chrominance by 0.5 pixels) but it is a bit faster
1245 const int16_t *ubuf1 = ubuf[0];
1246 __asm__ volatile(
/* park the b register in the context, reuse it for dest; BP is the loop index */
1247 "mov %%"FF_REG_b", "ESP_OFFSET"(%5) \n\t"
1248 "mov %4, %%"FF_REG_b" \n\t"
1249 "push %%"FF_REG_BP" \n\t"
1250 YSCALEYUV2RGB1(%%FF_REGBP, %5)
1251 "pxor %%mm7, %%mm7 \n\t"
1252 /* mm2=B, %%mm4=G, %%mm5=R, %%mm7=0 */
1253 #ifdef DITHER1XBPP
1254 "paddusb "BLUE_DITHER"(%5), %%mm2 \n\t"
1255 "paddusb "GREEN_DITHER"(%5), %%mm4 \n\t"
1256 "paddusb "RED_DITHER"(%5), %%mm5 \n\t"
1257 #endif
1258 WRITERGB15(%%FF_REGb, DSTW_OFFSET"(%5)", %%FF_REGBP)
1259 "pop %%"FF_REG_BP" \n\t"
1260 "mov "ESP_OFFSET"(%5), %%"FF_REG_b" \n\t"
1261 :: "c" (buf0), "d" (buf1), "S" (ubuf0), "D" (ubuf1), "m" (dest),
1262 "a" (&c->redDither)
1263 NAMED_CONSTRAINTS_ADD(bF8)
1264 );
1265 } else {
1266 const int16_t *ubuf1 = ubuf[1];
1267 __asm__ volatile(
1268 "mov %%"FF_REG_b", "ESP_OFFSET"(%5) \n\t"
1269 "mov %4, %%"FF_REG_b" \n\t"
1270 "push %%"FF_REG_BP" \n\t"
1271 YSCALEYUV2RGB1b(%%FF_REGBP, %5)
1272 "pxor %%mm7, %%mm7 \n\t"
1273 /* mm2=B, %%mm4=G, %%mm5=R, %%mm7=0 */
1274 #ifdef DITHER1XBPP
1275 "paddusb "BLUE_DITHER"(%5), %%mm2 \n\t"
1276 "paddusb "GREEN_DITHER"(%5), %%mm4 \n\t"
1277 "paddusb "RED_DITHER"(%5), %%mm5 \n\t"
1278 #endif
1279 WRITERGB15(%%FF_REGb, DSTW_OFFSET"(%5)", %%FF_REGBP)
1280 "pop %%"FF_REG_BP" \n\t"
1281 "mov "ESP_OFFSET"(%5), %%"FF_REG_b" \n\t"
1282 :: "c" (buf0), "d" (buf1), "S" (ubuf0), "D" (ubuf1), "m" (dest),
1283 "a" (&c->redDither)
1284 NAMED_CONSTRAINTS_ADD(bF8)
1285 );
1286 }
1287 }
1288
/**
 * yuv2packed1 writer for AV_PIX_FMT_RGB565 (installed by sws_init_swscale).
 *
 * uvalpha < 2048 selects YSCALEYUV2RGB1, which reads only the first chroma
 * line (operand %2); otherwise YSCALEYUV2RGB1b averages ubuf[0] and ubuf[1]
 * (operands %2 and %3). When DITHER1XBPP is defined, the per-channel dither
 * values are added with unsigned saturation before the store.
 * Asm operands: %0 = buf0, %1 = buf1 (an alias of buf0), %2 = ubuf0,
 * %3 = ubuf1, %4 = dest (memory operand), %5 = &c->redDither, the base for
 * ESP_OFFSET, DSTW_OFFSET and the dither/coefficient offsets.
 * vbuf, abuf0, dstW and y are not referenced by the C code of this function.
 * NOTE(review): WRITERGB16 is defined outside this section.
 */
RENAME(yuv2rgb565_1)1289 static void RENAME(yuv2rgb565_1)(SwsContext *c, const int16_t *buf0,
1290 const int16_t *ubuf[2], const int16_t *vbuf[2],
1291 const int16_t *abuf0, uint8_t *dest,
1292 int dstW, int uvalpha, int y)
1293 {
1294 const int16_t *ubuf0 = ubuf[0];
1295 const int16_t *buf1= buf0; //FIXME needed for RGB1/BGR1
1296
1297 if (uvalpha < 2048) { // note this is not correct (shifts chrominance by 0.5 pixels) but it is a bit faster
1298 const int16_t *ubuf1 = ubuf[0];
1299 __asm__ volatile(
/* park the b register in the context, reuse it for dest; BP is the loop index */
1300 "mov %%"FF_REG_b", "ESP_OFFSET"(%5) \n\t"
1301 "mov %4, %%"FF_REG_b" \n\t"
1302 "push %%"FF_REG_BP" \n\t"
1303 YSCALEYUV2RGB1(%%FF_REGBP, %5)
1304 "pxor %%mm7, %%mm7 \n\t"
1305 /* mm2=B, %%mm4=G, %%mm5=R, %%mm7=0 */
1306 #ifdef DITHER1XBPP
1307 "paddusb "BLUE_DITHER"(%5), %%mm2 \n\t"
1308 "paddusb "GREEN_DITHER"(%5), %%mm4 \n\t"
1309 "paddusb "RED_DITHER"(%5), %%mm5 \n\t"
1310 #endif
1311 WRITERGB16(%%FF_REGb, DSTW_OFFSET"(%5)", %%FF_REGBP)
1312 "pop %%"FF_REG_BP" \n\t"
1313 "mov "ESP_OFFSET"(%5), %%"FF_REG_b" \n\t"
1314 :: "c" (buf0), "d" (buf1), "S" (ubuf0), "D" (ubuf1), "m" (dest),
1315 "a" (&c->redDither)
1316 NAMED_CONSTRAINTS_ADD(bF8,bFC)
1317 );
1318 } else {
1319 const int16_t *ubuf1 = ubuf[1];
1320 __asm__ volatile(
1321 "mov %%"FF_REG_b", "ESP_OFFSET"(%5) \n\t"
1322 "mov %4, %%"FF_REG_b" \n\t"
1323 "push %%"FF_REG_BP" \n\t"
1324 YSCALEYUV2RGB1b(%%FF_REGBP, %5)
1325 "pxor %%mm7, %%mm7 \n\t"
1326 /* mm2=B, %%mm4=G, %%mm5=R, %%mm7=0 */
1327 #ifdef DITHER1XBPP
1328 "paddusb "BLUE_DITHER"(%5), %%mm2 \n\t"
1329 "paddusb "GREEN_DITHER"(%5), %%mm4 \n\t"
1330 "paddusb "RED_DITHER"(%5), %%mm5 \n\t"
1331 #endif
1332 WRITERGB16(%%FF_REGb, DSTW_OFFSET"(%5)", %%FF_REGBP)
1333 "pop %%"FF_REG_BP" \n\t"
1334 "mov "ESP_OFFSET"(%5), %%"FF_REG_b" \n\t"
1335 :: "c" (buf0), "d" (buf1), "S" (ubuf0), "D" (ubuf1), "m" (dest),
1336 "a" (&c->redDither)
1337 NAMED_CONSTRAINTS_ADD(bF8,bFC)
1338 );
1339 }
1340 }
1341
/*
 * Loop head for single-line packed (non-RGB) output using one chroma line.
 *
 * index: register used as the loop counter (zeroed here; label "1:" is the
 *        loop entry, the matching branch is expected in the WRITE* macro
 *        that follows the expansion).
 * c:     base operand for the UV_OFF_BYTE context offset.
 * Asm operands used: %0 = luma line, %2 = chroma line. %1 and %3 are not
 * read by this macro.
 *
 * All samples are shifted right by 7 (arithmetic). The second chroma plane
 * is read at byte offset UV_OFF_BYTE(c) from the first one; "[eax+2048]"
 * in the per-line comments denotes that access and "eax" stands for the
 * index register.
 * Result: mm3/mm4 = the two chroma planes, mm1/mm7 = luma for
 * index..index+3 and index+4..index+7.
 */
1342 #define REAL_YSCALEYUV2PACKED1(index, c) \
1343 "xor "#index", "#index" \n\t"\
1344 ".p2align 4 \n\t"\
1345 "1: \n\t"\
1346 "movq (%2, "#index"), %%mm3 \n\t" /* uvbuf0[eax]*/\
1347 "add "UV_OFF_BYTE"("#c"), "#index" \n\t" \
1348 "movq (%2, "#index"), %%mm4 \n\t" /* uvbuf0[eax+2048]*/\
1349 "sub "UV_OFF_BYTE"("#c"), "#index" \n\t" \
1350 "psraw $7, %%mm3 \n\t" \
1351 "psraw $7, %%mm4 \n\t" \
1352 "movq (%0, "#index", 2), %%mm1 \n\t" /*buf0[eax]*/\
1353 "movq 8(%0, "#index", 2), %%mm7 \n\t" /*buf0[eax+4]*/\
1354 "psraw $7, %%mm1 \n\t" \
1355 "psraw $7, %%mm7 \n\t" \
1356
/* Extra expansion level so that macro arguments are expanded before being stringified. */
1357 #define YSCALEYUV2PACKED1(index, c) REAL_YSCALEYUV2PACKED1(index, c)
1358
/*
 * Same as REAL_YSCALEYUV2PACKED1, except that the chroma input is the sum
 * of the two chroma lines (%2 and %3) shifted right by 8 with a logical
 * shift (psrlw), i.e. their average shifted right by 7.
 *
 * index: register used as the loop counter (zeroed here; label "1:" is the
 *        loop entry, the matching branch is expected in the WRITE* macro
 *        that follows the expansion).
 * c:     base operand for the UV_OFF_BYTE context offset.
 * Asm operands used: %0 = luma line, %2/%3 = the two chroma lines.
 * "[eax+2048]" in the per-line comments denotes the access at byte offset
 * UV_OFF_BYTE(c); "eax" stands for the index register.
 * Result: mm3/mm4 = the two chroma planes, mm1/mm7 = luma for
 * index..index+3 and index+4..index+7.
 */
1359 #define REAL_YSCALEYUV2PACKED1b(index, c) \
1360 "xor "#index", "#index" \n\t"\
1361 ".p2align 4 \n\t"\
1362 "1: \n\t"\
1363 "movq (%2, "#index"), %%mm2 \n\t" /* uvbuf0[eax]*/\
1364 "movq (%3, "#index"), %%mm3 \n\t" /* uvbuf1[eax]*/\
1365 "add "UV_OFF_BYTE"("#c"), "#index" \n\t" \
1366 "movq (%2, "#index"), %%mm5 \n\t" /* uvbuf0[eax+2048]*/\
1367 "movq (%3, "#index"), %%mm4 \n\t" /* uvbuf1[eax+2048]*/\
1368 "sub "UV_OFF_BYTE"("#c"), "#index" \n\t" \
1369 "paddw %%mm2, %%mm3 \n\t" /* uvbuf0[eax] + uvbuf1[eax]*/\
1370 "paddw %%mm5, %%mm4 \n\t" /* uvbuf0[eax+2048] + uvbuf1[eax+2048]*/\
1371 "psrlw $8, %%mm3 \n\t" \
1372 "psrlw $8, %%mm4 \n\t" \
1373 "movq (%0, "#index", 2), %%mm1 \n\t" /*buf0[eax]*/\
1374 "movq 8(%0, "#index", 2), %%mm7 \n\t" /*buf0[eax+4]*/\
1375 "psraw $7, %%mm1 \n\t" \
1376 "psraw $7, %%mm7 \n\t"
/* Extra expansion level so that macro arguments are expanded before being stringified. */
1377 #define YSCALEYUV2PACKED1b(index, c) REAL_YSCALEYUV2PACKED1b(index, c)
1378
/**
 * yuv2packed1 writer for AV_PIX_FMT_YUYV422 (installed by sws_init_swscale).
 *
 * uvalpha < 2048 selects YSCALEYUV2PACKED1, which reads only the first
 * chroma line (operand %2); otherwise YSCALEYUV2PACKED1b averages ubuf[0]
 * and ubuf[1] (operands %2 and %3).
 * Asm operands: %0 = buf0, %1 = buf1 (an alias of buf0), %2 = ubuf0,
 * %3 = ubuf1, %4 = dest (memory operand), %5 = &c->redDither, the base for
 * ESP_OFFSET, DSTW_OFFSET and UV_OFF_BYTE.
 * vbuf, abuf0, dstW and y are not referenced by the C code of this function.
 * NOTE(review): WRITEYUY2 is defined outside this section.
 */
RENAME(yuv2yuyv422_1)1379 static void RENAME(yuv2yuyv422_1)(SwsContext *c, const int16_t *buf0,
1380 const int16_t *ubuf[2], const int16_t *vbuf[2],
1381 const int16_t *abuf0, uint8_t *dest,
1382 int dstW, int uvalpha, int y)
1383 {
1384 const int16_t *ubuf0 = ubuf[0];
1385 const int16_t *buf1= buf0; //FIXME needed for RGB1/BGR1
1386
1387 if (uvalpha < 2048) { // note this is not correct (shifts chrominance by 0.5 pixels) but it is a bit faster
1388 const int16_t *ubuf1 = ubuf[0];
1389 __asm__ volatile(
/* park the b register in the context, reuse it for dest; BP is the loop index */
1390 "mov %%"FF_REG_b", "ESP_OFFSET"(%5) \n\t"
1391 "mov %4, %%"FF_REG_b" \n\t"
1392 "push %%"FF_REG_BP" \n\t"
1393 YSCALEYUV2PACKED1(%%FF_REGBP, %5)
1394 WRITEYUY2(%%FF_REGb, DSTW_OFFSET"(%5)", %%FF_REGBP)
1395 "pop %%"FF_REG_BP" \n\t"
1396 "mov "ESP_OFFSET"(%5), %%"FF_REG_b" \n\t"
1397 :: "c" (buf0), "d" (buf1), "S" (ubuf0), "D" (ubuf1), "m" (dest),
1398 "a" (&c->redDither)
1399 );
1400 } else {
1401 const int16_t *ubuf1 = ubuf[1];
1402 __asm__ volatile(
1403 "mov %%"FF_REG_b", "ESP_OFFSET"(%5) \n\t"
1404 "mov %4, %%"FF_REG_b" \n\t"
1405 "push %%"FF_REG_BP" \n\t"
1406 YSCALEYUV2PACKED1b(%%FF_REGBP, %5)
1407 WRITEYUY2(%%FF_REGb, DSTW_OFFSET"(%5)", %%FF_REGBP)
1408 "pop %%"FF_REG_BP" \n\t"
1409 "mov "ESP_OFFSET"(%5), %%"FF_REG_b" \n\t"
1410 :: "c" (buf0), "d" (buf1), "S" (ubuf0), "D" (ubuf1), "m" (dest),
1411 "a" (&c->redDither)
1412 );
1413 }
1414 }
RENAME(sws_init_swscale)1415 static av_cold void RENAME(sws_init_swscale)(SwsContext *c)
1416 {
1417 enum AVPixelFormat dstFormat = c->dstFormat;
1418
1419 c->use_mmx_vfilter= 0;
1420 if (!is16BPS(dstFormat) && !isNBPS(dstFormat) && !isSemiPlanarYUV(dstFormat)
1421 && dstFormat != AV_PIX_FMT_GRAYF32BE && dstFormat != AV_PIX_FMT_GRAYF32LE
1422 && !(c->flags & SWS_BITEXACT)) {
1423 if (c->flags & SWS_ACCURATE_RND) {
1424 if (!(c->flags & SWS_FULL_CHR_H_INT)) {
1425 switch (c->dstFormat) {
1426 case AV_PIX_FMT_RGB32: c->yuv2packedX = RENAME(yuv2rgb32_X_ar); break;
1427 #if HAVE_6REGS
1428 case AV_PIX_FMT_BGR24: c->yuv2packedX = RENAME(yuv2bgr24_X_ar); break;
1429 #endif
1430 case AV_PIX_FMT_RGB555: c->yuv2packedX = RENAME(yuv2rgb555_X_ar); break;
1431 case AV_PIX_FMT_RGB565: c->yuv2packedX = RENAME(yuv2rgb565_X_ar); break;
1432 case AV_PIX_FMT_YUYV422: c->yuv2packedX = RENAME(yuv2yuyv422_X_ar); break;
1433 default: break;
1434 }
1435 }
1436 } else {
1437 c->use_mmx_vfilter= 1;
1438 if (!(c->flags & SWS_FULL_CHR_H_INT)) {
1439 switch (c->dstFormat) {
1440 case AV_PIX_FMT_RGB32: c->yuv2packedX = RENAME(yuv2rgb32_X); break;
1441 case AV_PIX_FMT_BGR32: c->yuv2packedX = RENAME(yuv2bgr32_X); break;
1442 #if HAVE_6REGS
1443 case AV_PIX_FMT_BGR24: c->yuv2packedX = RENAME(yuv2bgr24_X); break;
1444 #endif
1445 case AV_PIX_FMT_RGB555: c->yuv2packedX = RENAME(yuv2rgb555_X); break;
1446 case AV_PIX_FMT_RGB565: c->yuv2packedX = RENAME(yuv2rgb565_X); break;
1447 case AV_PIX_FMT_YUYV422: c->yuv2packedX = RENAME(yuv2yuyv422_X); break;
1448 default: break;
1449 }
1450 }
1451 }
1452 if (!(c->flags & SWS_FULL_CHR_H_INT)) {
1453 switch (c->dstFormat) {
1454 case AV_PIX_FMT_RGB32:
1455 c->yuv2packed1 = RENAME(yuv2rgb32_1);
1456 c->yuv2packed2 = RENAME(yuv2rgb32_2);
1457 break;
1458 case AV_PIX_FMT_BGR24:
1459 c->yuv2packed1 = RENAME(yuv2bgr24_1);
1460 c->yuv2packed2 = RENAME(yuv2bgr24_2);
1461 break;
1462 case AV_PIX_FMT_RGB555:
1463 c->yuv2packed1 = RENAME(yuv2rgb555_1);
1464 c->yuv2packed2 = RENAME(yuv2rgb555_2);
1465 break;
1466 case AV_PIX_FMT_RGB565:
1467 c->yuv2packed1 = RENAME(yuv2rgb565_1);
1468 c->yuv2packed2 = RENAME(yuv2rgb565_2);
1469 break;
1470 case AV_PIX_FMT_YUYV422:
1471 c->yuv2packed1 = RENAME(yuv2yuyv422_1);
1472 c->yuv2packed2 = RENAME(yuv2yuyv422_2);
1473 break;
1474 default:
1475 break;
1476 }
1477 }
1478 }
1479
1480 if (c->srcBpc == 8 && c->dstBpc <= 14) {
1481 // Use the new MMX scaler if the MMXEXT one can't be used (it is faster than the x86 ASM one).
1482 #if COMPILE_TEMPLATE_MMXEXT
1483 if (c->flags & SWS_FAST_BILINEAR && c->canMMXEXTBeUsed) {
1484 c->hyscale_fast = ff_hyscale_fast_mmxext;
1485 c->hcscale_fast = ff_hcscale_fast_mmxext;
1486 } else {
1487 #endif /* COMPILE_TEMPLATE_MMXEXT */
1488 c->hyscale_fast = NULL;
1489 c->hcscale_fast = NULL;
1490 #if COMPILE_TEMPLATE_MMXEXT
1491 }
1492 #endif /* COMPILE_TEMPLATE_MMXEXT */
1493 }
1494 }
1495