1 /*
2 Simple DirectMedia Layer
3 Copyright (C) 1997-2016 Sam Lantinga <slouken@libsdl.org>
4
5 This software is provided 'as-is', without any express or implied
6 warranty. In no event will the authors be held liable for any damages
7 arising from the use of this software.
8
9 Permission is granted to anyone to use this software for any purpose,
10 including commercial applications, and to alter it and redistribute it
11 freely, subject to the following restrictions:
12
13 1. The origin of this software must not be misrepresented; you must not
14 claim that you wrote the original software. If you use this software
15 in a product, an acknowledgment in the product documentation would be
16 appreciated but is not required.
17 2. Altered source versions must be plainly marked as such, and must not be
18 misrepresented as being the original software.
19 3. This notice may not be removed or altered from any source distribution.
20 */
21 #include "../SDL_internal.h"
22
23 #include "SDL_video.h"
24 #include "SDL_blit.h"
25
26 /* Functions to perform alpha blended blitting */
27
28 /* N->1 blending with per-surface alpha */
29 static void
BlitNto1SurfaceAlpha(SDL_BlitInfo * info)30 BlitNto1SurfaceAlpha(SDL_BlitInfo * info)
31 {
32 int width = info->dst_w;
33 int height = info->dst_h;
34 Uint8 *src = info->src;
35 int srcskip = info->src_skip;
36 Uint8 *dst = info->dst;
37 int dstskip = info->dst_skip;
38 Uint8 *palmap = info->table;
39 SDL_PixelFormat *srcfmt = info->src_fmt;
40 SDL_PixelFormat *dstfmt = info->dst_fmt;
41 int srcbpp = srcfmt->BytesPerPixel;
42 Uint32 Pixel;
43 unsigned sR, sG, sB;
44 unsigned dR, dG, dB;
45 const unsigned A = info->a;
46
47 while (height--) {
48 /* *INDENT-OFF* */
49 DUFFS_LOOP4(
50 {
51 DISEMBLE_RGB(src, srcbpp, srcfmt, Pixel, sR, sG, sB);
52 dR = dstfmt->palette->colors[*dst].r;
53 dG = dstfmt->palette->colors[*dst].g;
54 dB = dstfmt->palette->colors[*dst].b;
55 ALPHA_BLEND_RGB(sR, sG, sB, A, dR, dG, dB);
56 dR &= 0xff;
57 dG &= 0xff;
58 dB &= 0xff;
59 /* Pack RGB into 8bit pixel */
60 if ( palmap == NULL ) {
61 *dst =((dR>>5)<<(3+2))|((dG>>5)<<(2))|((dB>>6)<<(0));
62 } else {
63 *dst = palmap[((dR>>5)<<(3+2))|((dG>>5)<<(2))|((dB>>6)<<(0))];
64 }
65 dst++;
66 src += srcbpp;
67 },
68 width);
69 /* *INDENT-ON* */
70 src += srcskip;
71 dst += dstskip;
72 }
73 }
74
75 /* N->1 blending with pixel alpha */
76 static void
BlitNto1PixelAlpha(SDL_BlitInfo * info)77 BlitNto1PixelAlpha(SDL_BlitInfo * info)
78 {
79 int width = info->dst_w;
80 int height = info->dst_h;
81 Uint8 *src = info->src;
82 int srcskip = info->src_skip;
83 Uint8 *dst = info->dst;
84 int dstskip = info->dst_skip;
85 Uint8 *palmap = info->table;
86 SDL_PixelFormat *srcfmt = info->src_fmt;
87 SDL_PixelFormat *dstfmt = info->dst_fmt;
88 int srcbpp = srcfmt->BytesPerPixel;
89 Uint32 Pixel;
90 unsigned sR, sG, sB, sA;
91 unsigned dR, dG, dB;
92
93 while (height--) {
94 /* *INDENT-OFF* */
95 DUFFS_LOOP4(
96 {
97 DISEMBLE_RGBA(src,srcbpp,srcfmt,Pixel,sR,sG,sB,sA);
98 dR = dstfmt->palette->colors[*dst].r;
99 dG = dstfmt->palette->colors[*dst].g;
100 dB = dstfmt->palette->colors[*dst].b;
101 ALPHA_BLEND_RGB(sR, sG, sB, sA, dR, dG, dB);
102 dR &= 0xff;
103 dG &= 0xff;
104 dB &= 0xff;
105 /* Pack RGB into 8bit pixel */
106 if ( palmap == NULL ) {
107 *dst =((dR>>5)<<(3+2))|((dG>>5)<<(2))|((dB>>6)<<(0));
108 } else {
109 *dst = palmap[((dR>>5)<<(3+2))|((dG>>5)<<(2))|((dB>>6)<<(0))];
110 }
111 dst++;
112 src += srcbpp;
113 },
114 width);
115 /* *INDENT-ON* */
116 src += srcskip;
117 dst += dstskip;
118 }
119 }
120
121 /* colorkeyed N->1 blending with per-surface alpha */
122 static void
BlitNto1SurfaceAlphaKey(SDL_BlitInfo * info)123 BlitNto1SurfaceAlphaKey(SDL_BlitInfo * info)
124 {
125 int width = info->dst_w;
126 int height = info->dst_h;
127 Uint8 *src = info->src;
128 int srcskip = info->src_skip;
129 Uint8 *dst = info->dst;
130 int dstskip = info->dst_skip;
131 Uint8 *palmap = info->table;
132 SDL_PixelFormat *srcfmt = info->src_fmt;
133 SDL_PixelFormat *dstfmt = info->dst_fmt;
134 int srcbpp = srcfmt->BytesPerPixel;
135 Uint32 ckey = info->colorkey;
136 Uint32 Pixel;
137 unsigned sR, sG, sB;
138 unsigned dR, dG, dB;
139 const unsigned A = info->a;
140
141 while (height--) {
142 /* *INDENT-OFF* */
143 DUFFS_LOOP(
144 {
145 DISEMBLE_RGB(src, srcbpp, srcfmt, Pixel, sR, sG, sB);
146 if ( Pixel != ckey ) {
147 dR = dstfmt->palette->colors[*dst].r;
148 dG = dstfmt->palette->colors[*dst].g;
149 dB = dstfmt->palette->colors[*dst].b;
150 ALPHA_BLEND_RGB(sR, sG, sB, A, dR, dG, dB);
151 dR &= 0xff;
152 dG &= 0xff;
153 dB &= 0xff;
154 /* Pack RGB into 8bit pixel */
155 if ( palmap == NULL ) {
156 *dst =((dR>>5)<<(3+2))|((dG>>5)<<(2))|((dB>>6)<<(0));
157 } else {
158 *dst = palmap[((dR>>5)<<(3+2))|((dG>>5)<<(2))|((dB>>6)<<(0))];
159 }
160 }
161 dst++;
162 src += srcbpp;
163 },
164 width);
165 /* *INDENT-ON* */
166 src += srcskip;
167 dst += dstskip;
168 }
169 }
170
171 #ifdef __MMX__
172
173 /* fast RGB888->(A)RGB888 blending with surface alpha=128 special case */
174 static void
BlitRGBtoRGBSurfaceAlpha128MMX(SDL_BlitInfo * info)175 BlitRGBtoRGBSurfaceAlpha128MMX(SDL_BlitInfo * info)
176 {
177 int width = info->dst_w;
178 int height = info->dst_h;
179 Uint32 *srcp = (Uint32 *) info->src;
180 int srcskip = info->src_skip >> 2;
181 Uint32 *dstp = (Uint32 *) info->dst;
182 int dstskip = info->dst_skip >> 2;
183 Uint32 dalpha = info->dst_fmt->Amask;
184
185 __m64 src1, src2, dst1, dst2, lmask, hmask, dsta;
186
187 hmask = _mm_set_pi32(0x00fefefe, 0x00fefefe); /* alpha128 mask -> hmask */
188 lmask = _mm_set_pi32(0x00010101, 0x00010101); /* !alpha128 mask -> lmask */
189 dsta = _mm_set_pi32(dalpha, dalpha); /* dst alpha mask -> dsta */
190
191 while (height--) {
192 int n = width;
193 if (n & 1) {
194 Uint32 s = *srcp++;
195 Uint32 d = *dstp;
196 *dstp++ = ((((s & 0x00fefefe) + (d & 0x00fefefe)) >> 1)
197 + (s & d & 0x00010101)) | dalpha;
198 n--;
199 }
200
201 for (n >>= 1; n > 0; --n) {
202 dst1 = *(__m64 *) dstp; /* 2 x dst -> dst1(ARGBARGB) */
203 dst2 = dst1; /* 2 x dst -> dst2(ARGBARGB) */
204
205 src1 = *(__m64 *) srcp; /* 2 x src -> src1(ARGBARGB) */
206 src2 = src1; /* 2 x src -> src2(ARGBARGB) */
207
208 dst2 = _mm_and_si64(dst2, hmask); /* dst & mask -> dst2 */
209 src2 = _mm_and_si64(src2, hmask); /* src & mask -> src2 */
210 src2 = _mm_add_pi32(src2, dst2); /* dst2 + src2 -> src2 */
211 src2 = _mm_srli_pi32(src2, 1); /* src2 >> 1 -> src2 */
212
213 dst1 = _mm_and_si64(dst1, src1); /* src & dst -> dst1 */
214 dst1 = _mm_and_si64(dst1, lmask); /* dst1 & !mask -> dst1 */
215 dst1 = _mm_add_pi32(dst1, src2); /* src2 + dst1 -> dst1 */
216 dst1 = _mm_or_si64(dst1, dsta); /* dsta(full alpha) | dst1 -> dst1 */
217
218 *(__m64 *) dstp = dst1; /* dst1 -> 2 x dst pixels */
219 dstp += 2;
220 srcp += 2;
221 }
222
223 srcp += srcskip;
224 dstp += dstskip;
225 }
226 _mm_empty();
227 }
228
229 /* fast RGB888->(A)RGB888 blending with surface alpha */
230 static void
BlitRGBtoRGBSurfaceAlphaMMX(SDL_BlitInfo * info)231 BlitRGBtoRGBSurfaceAlphaMMX(SDL_BlitInfo * info)
232 {
233 SDL_PixelFormat *df = info->dst_fmt;
234 Uint32 chanmask;
235 unsigned alpha = info->a;
236
237 if (alpha == 128 && (df->Rmask | df->Gmask | df->Bmask) == 0x00FFFFFF) {
238 /* only call a128 version when R,G,B occupy lower bits */
239 BlitRGBtoRGBSurfaceAlpha128MMX(info);
240 } else {
241 int width = info->dst_w;
242 int height = info->dst_h;
243 Uint32 *srcp = (Uint32 *) info->src;
244 int srcskip = info->src_skip >> 2;
245 Uint32 *dstp = (Uint32 *) info->dst;
246 int dstskip = info->dst_skip >> 2;
247 Uint32 dalpha = df->Amask;
248 Uint32 amult;
249
250 __m64 src1, src2, dst1, dst2, mm_alpha, mm_zero, dsta;
251
252 mm_zero = _mm_setzero_si64(); /* 0 -> mm_zero */
253 /* form the alpha mult */
254 amult = alpha | (alpha << 8);
255 amult = amult | (amult << 16);
256 chanmask =
257 (0xff << df->Rshift) | (0xff << df->
258 Gshift) | (0xff << df->Bshift);
259 mm_alpha = _mm_set_pi32(0, amult & chanmask); /* 0000AAAA -> mm_alpha, minus 1 chan */
260 mm_alpha = _mm_unpacklo_pi8(mm_alpha, mm_zero); /* 0A0A0A0A -> mm_alpha, minus 1 chan */
261 /* at this point mm_alpha can be 000A0A0A or 0A0A0A00 or another combo */
262 dsta = _mm_set_pi32(dalpha, dalpha); /* dst alpha mask -> dsta */
263
264 while (height--) {
265 int n = width;
266 if (n & 1) {
267 /* One Pixel Blend */
268 src2 = _mm_cvtsi32_si64(*srcp); /* src(ARGB) -> src2 (0000ARGB) */
269 src2 = _mm_unpacklo_pi8(src2, mm_zero); /* 0A0R0G0B -> src2 */
270
271 dst1 = _mm_cvtsi32_si64(*dstp); /* dst(ARGB) -> dst1 (0000ARGB) */
272 dst1 = _mm_unpacklo_pi8(dst1, mm_zero); /* 0A0R0G0B -> dst1 */
273
274 src2 = _mm_sub_pi16(src2, dst1); /* src2 - dst2 -> src2 */
275 src2 = _mm_mullo_pi16(src2, mm_alpha); /* src2 * alpha -> src2 */
276 src2 = _mm_srli_pi16(src2, 8); /* src2 >> 8 -> src2 */
277 dst1 = _mm_add_pi8(src2, dst1); /* src2 + dst1 -> dst1 */
278
279 dst1 = _mm_packs_pu16(dst1, mm_zero); /* 0000ARGB -> dst1 */
280 dst1 = _mm_or_si64(dst1, dsta); /* dsta | dst1 -> dst1 */
281 *dstp = _mm_cvtsi64_si32(dst1); /* dst1 -> pixel */
282
283 ++srcp;
284 ++dstp;
285
286 n--;
287 }
288
289 for (n >>= 1; n > 0; --n) {
290 /* Two Pixels Blend */
291 src1 = *(__m64 *) srcp; /* 2 x src -> src1(ARGBARGB) */
292 src2 = src1; /* 2 x src -> src2(ARGBARGB) */
293 src1 = _mm_unpacklo_pi8(src1, mm_zero); /* low - 0A0R0G0B -> src1 */
294 src2 = _mm_unpackhi_pi8(src2, mm_zero); /* high - 0A0R0G0B -> src2 */
295
296 dst1 = *(__m64 *) dstp; /* 2 x dst -> dst1(ARGBARGB) */
297 dst2 = dst1; /* 2 x dst -> dst2(ARGBARGB) */
298 dst1 = _mm_unpacklo_pi8(dst1, mm_zero); /* low - 0A0R0G0B -> dst1 */
299 dst2 = _mm_unpackhi_pi8(dst2, mm_zero); /* high - 0A0R0G0B -> dst2 */
300
301 src1 = _mm_sub_pi16(src1, dst1); /* src1 - dst1 -> src1 */
302 src1 = _mm_mullo_pi16(src1, mm_alpha); /* src1 * alpha -> src1 */
303 src1 = _mm_srli_pi16(src1, 8); /* src1 >> 8 -> src1 */
304 dst1 = _mm_add_pi8(src1, dst1); /* src1 + dst1(dst1) -> dst1 */
305
306 src2 = _mm_sub_pi16(src2, dst2); /* src2 - dst2 -> src2 */
307 src2 = _mm_mullo_pi16(src2, mm_alpha); /* src2 * alpha -> src2 */
308 src2 = _mm_srli_pi16(src2, 8); /* src2 >> 8 -> src2 */
309 dst2 = _mm_add_pi8(src2, dst2); /* src2 + dst2(dst2) -> dst2 */
310
311 dst1 = _mm_packs_pu16(dst1, dst2); /* 0A0R0G0B(res1), 0A0R0G0B(res2) -> dst1(ARGBARGB) */
312 dst1 = _mm_or_si64(dst1, dsta); /* dsta | dst1 -> dst1 */
313
314 *(__m64 *) dstp = dst1; /* dst1 -> 2 x pixel */
315
316 srcp += 2;
317 dstp += 2;
318 }
319 srcp += srcskip;
320 dstp += dstskip;
321 }
322 _mm_empty();
323 }
324 }
325
326 /* fast ARGB888->(A)RGB888 blending with pixel alpha */
327 static void
BlitRGBtoRGBPixelAlphaMMX(SDL_BlitInfo * info)328 BlitRGBtoRGBPixelAlphaMMX(SDL_BlitInfo * info)
329 {
330 int width = info->dst_w;
331 int height = info->dst_h;
332 Uint32 *srcp = (Uint32 *) info->src;
333 int srcskip = info->src_skip >> 2;
334 Uint32 *dstp = (Uint32 *) info->dst;
335 int dstskip = info->dst_skip >> 2;
336 SDL_PixelFormat *sf = info->src_fmt;
337 Uint32 amask = sf->Amask;
338 Uint32 ashift = sf->Ashift;
339 Uint64 multmask, multmask2;
340
341 __m64 src1, dst1, mm_alpha, mm_zero, mm_alpha2;
342
343 mm_zero = _mm_setzero_si64(); /* 0 -> mm_zero */
344 multmask = 0x00FF;
345 multmask <<= (ashift * 2);
346 multmask2 = 0x00FF00FF00FF00FFULL;
347
348 while (height--) {
349 /* *INDENT-OFF* */
350 DUFFS_LOOP4({
351 Uint32 alpha = *srcp & amask;
352 if (alpha == 0) {
353 /* do nothing */
354 } else if (alpha == amask) {
355 *dstp = *srcp;
356 } else {
357 src1 = _mm_cvtsi32_si64(*srcp); /* src(ARGB) -> src1 (0000ARGB) */
358 src1 = _mm_unpacklo_pi8(src1, mm_zero); /* 0A0R0G0B -> src1 */
359
360 dst1 = _mm_cvtsi32_si64(*dstp); /* dst(ARGB) -> dst1 (0000ARGB) */
361 dst1 = _mm_unpacklo_pi8(dst1, mm_zero); /* 0A0R0G0B -> dst1 */
362
363 mm_alpha = _mm_cvtsi32_si64(alpha); /* alpha -> mm_alpha (0000000A) */
364 mm_alpha = _mm_srli_si64(mm_alpha, ashift); /* mm_alpha >> ashift -> mm_alpha(0000000A) */
365 mm_alpha = _mm_unpacklo_pi16(mm_alpha, mm_alpha); /* 00000A0A -> mm_alpha */
366 mm_alpha2 = _mm_unpacklo_pi32(mm_alpha, mm_alpha); /* 0A0A0A0A -> mm_alpha2 */
367 mm_alpha = _mm_or_si64(mm_alpha2, *(__m64 *) & multmask); /* 0F0A0A0A -> mm_alpha */
368 mm_alpha2 = _mm_xor_si64(mm_alpha2, *(__m64 *) & multmask2); /* 255 - mm_alpha -> mm_alpha */
369
370 /* blend */
371 src1 = _mm_mullo_pi16(src1, mm_alpha);
372 src1 = _mm_srli_pi16(src1, 8);
373 dst1 = _mm_mullo_pi16(dst1, mm_alpha2);
374 dst1 = _mm_srli_pi16(dst1, 8);
375 dst1 = _mm_add_pi16(src1, dst1);
376 dst1 = _mm_packs_pu16(dst1, mm_zero);
377
378 *dstp = _mm_cvtsi64_si32(dst1); /* dst1 -> pixel */
379 }
380 ++srcp;
381 ++dstp;
382 }, width);
383 /* *INDENT-ON* */
384 srcp += srcskip;
385 dstp += dstskip;
386 }
387 _mm_empty();
388 }
389
390 #endif /* __MMX__ */
391
392 /* fast RGB888->(A)RGB888 blending with surface alpha=128 special case */
393 static void
BlitRGBtoRGBSurfaceAlpha128(SDL_BlitInfo * info)394 BlitRGBtoRGBSurfaceAlpha128(SDL_BlitInfo * info)
395 {
396 int width = info->dst_w;
397 int height = info->dst_h;
398 Uint32 *srcp = (Uint32 *) info->src;
399 int srcskip = info->src_skip >> 2;
400 Uint32 *dstp = (Uint32 *) info->dst;
401 int dstskip = info->dst_skip >> 2;
402
403 while (height--) {
404 /* *INDENT-OFF* */
405 DUFFS_LOOP4({
406 Uint32 s = *srcp++;
407 Uint32 d = *dstp;
408 *dstp++ = ((((s & 0x00fefefe) + (d & 0x00fefefe)) >> 1)
409 + (s & d & 0x00010101)) | 0xff000000;
410 }, width);
411 /* *INDENT-ON* */
412 srcp += srcskip;
413 dstp += dstskip;
414 }
415 }
416
417 /* fast RGB888->(A)RGB888 blending with surface alpha */
418 static void
BlitRGBtoRGBSurfaceAlpha(SDL_BlitInfo * info)419 BlitRGBtoRGBSurfaceAlpha(SDL_BlitInfo * info)
420 {
421 unsigned alpha = info->a;
422 if (alpha == 128) {
423 BlitRGBtoRGBSurfaceAlpha128(info);
424 } else {
425 int width = info->dst_w;
426 int height = info->dst_h;
427 Uint32 *srcp = (Uint32 *) info->src;
428 int srcskip = info->src_skip >> 2;
429 Uint32 *dstp = (Uint32 *) info->dst;
430 int dstskip = info->dst_skip >> 2;
431 Uint32 s;
432 Uint32 d;
433 Uint32 s1;
434 Uint32 d1;
435
436 while (height--) {
437 /* *INDENT-OFF* */
438 DUFFS_LOOP4({
439 s = *srcp;
440 d = *dstp;
441 s1 = s & 0xff00ff;
442 d1 = d & 0xff00ff;
443 d1 = (d1 + ((s1 - d1) * alpha >> 8))
444 & 0xff00ff;
445 s &= 0xff00;
446 d &= 0xff00;
447 d = (d + ((s - d) * alpha >> 8)) & 0xff00;
448 *dstp = d1 | d | 0xff000000;
449 ++srcp;
450 ++dstp;
451 }, width);
452 /* *INDENT-ON* */
453 srcp += srcskip;
454 dstp += dstskip;
455 }
456 }
457 }
458
459 /* fast ARGB888->(A)RGB888 blending with pixel alpha */
460 static void
BlitRGBtoRGBPixelAlpha(SDL_BlitInfo * info)461 BlitRGBtoRGBPixelAlpha(SDL_BlitInfo * info)
462 {
463 int width = info->dst_w;
464 int height = info->dst_h;
465 Uint32 *srcp = (Uint32 *) info->src;
466 int srcskip = info->src_skip >> 2;
467 Uint32 *dstp = (Uint32 *) info->dst;
468 int dstskip = info->dst_skip >> 2;
469
470 while (height--) {
471 /* *INDENT-OFF* */
472 DUFFS_LOOP4({
473 Uint32 dalpha;
474 Uint32 d;
475 Uint32 s1;
476 Uint32 d1;
477 Uint32 s = *srcp;
478 Uint32 alpha = s >> 24;
479 /* FIXME: Here we special-case opaque alpha since the
480 compositioning used (>>8 instead of /255) doesn't handle
481 it correctly. Also special-case alpha=0 for speed?
482 Benchmark this! */
483 if (alpha) {
484 if (alpha == SDL_ALPHA_OPAQUE) {
485 *dstp = *srcp;
486 } else {
487 /*
488 * take out the middle component (green), and process
489 * the other two in parallel. One multiply less.
490 */
491 d = *dstp;
492 dalpha = d >> 24;
493 s1 = s & 0xff00ff;
494 d1 = d & 0xff00ff;
495 d1 = (d1 + ((s1 - d1) * alpha >> 8)) & 0xff00ff;
496 s &= 0xff00;
497 d &= 0xff00;
498 d = (d + ((s - d) * alpha >> 8)) & 0xff00;
499 dalpha = alpha + (dalpha * (alpha ^ 0xFF) >> 8);
500 *dstp = d1 | d | (dalpha << 24);
501 }
502 }
503 ++srcp;
504 ++dstp;
505 }, width);
506 /* *INDENT-ON* */
507 srcp += srcskip;
508 dstp += dstskip;
509 }
510 }
511
512 #ifdef __3dNOW__
513 /* fast (as in MMX with prefetch) ARGB888->(A)RGB888 blending with pixel alpha */
514 static void
BlitRGBtoRGBPixelAlphaMMX3DNOW(SDL_BlitInfo * info)515 BlitRGBtoRGBPixelAlphaMMX3DNOW(SDL_BlitInfo * info)
516 {
517 int width = info->dst_w;
518 int height = info->dst_h;
519 Uint32 *srcp = (Uint32 *) info->src;
520 int srcskip = info->src_skip >> 2;
521 Uint32 *dstp = (Uint32 *) info->dst;
522 int dstskip = info->dst_skip >> 2;
523 SDL_PixelFormat *sf = info->src_fmt;
524 Uint32 amask = sf->Amask;
525 Uint32 ashift = sf->Ashift;
526 Uint64 multmask, multmask2;
527
528 __m64 src1, dst1, mm_alpha, mm_zero, mm_alpha2;
529
530 mm_zero = _mm_setzero_si64(); /* 0 -> mm_zero */
531 multmask = 0x00FF;
532 multmask <<= (ashift * 2);
533 multmask2 = 0x00FF00FF00FF00FFULL;
534
535 while (height--) {
536 /* *INDENT-OFF* */
537 DUFFS_LOOP4({
538 Uint32 alpha;
539
540 _m_prefetch(srcp + 16);
541 _m_prefetch(dstp + 16);
542
543 alpha = *srcp & amask;
544 if (alpha == 0) {
545 /* do nothing */
546 } else if (alpha == amask) {
547 *dstp = *srcp;
548 } else {
549 src1 = _mm_cvtsi32_si64(*srcp); /* src(ARGB) -> src1 (0000ARGB) */
550 src1 = _mm_unpacklo_pi8(src1, mm_zero); /* 0A0R0G0B -> src1 */
551
552 dst1 = _mm_cvtsi32_si64(*dstp); /* dst(ARGB) -> dst1 (0000ARGB) */
553 dst1 = _mm_unpacklo_pi8(dst1, mm_zero); /* 0A0R0G0B -> dst1 */
554
555 mm_alpha = _mm_cvtsi32_si64(alpha); /* alpha -> mm_alpha (0000000A) */
556 mm_alpha = _mm_srli_si64(mm_alpha, ashift); /* mm_alpha >> ashift -> mm_alpha(0000000A) */
557 mm_alpha = _mm_unpacklo_pi16(mm_alpha, mm_alpha); /* 00000A0A -> mm_alpha */
558 mm_alpha2 = _mm_unpacklo_pi32(mm_alpha, mm_alpha); /* 0A0A0A0A -> mm_alpha2 */
559 mm_alpha = _mm_or_si64(mm_alpha2, *(__m64 *) & multmask); /* 0F0A0A0A -> mm_alpha */
560 mm_alpha2 = _mm_xor_si64(mm_alpha2, *(__m64 *) & multmask2); /* 255 - mm_alpha -> mm_alpha */
561
562
563 /* blend */
564 src1 = _mm_mullo_pi16(src1, mm_alpha);
565 src1 = _mm_srli_pi16(src1, 8);
566 dst1 = _mm_mullo_pi16(dst1, mm_alpha2);
567 dst1 = _mm_srli_pi16(dst1, 8);
568 dst1 = _mm_add_pi16(src1, dst1);
569 dst1 = _mm_packs_pu16(dst1, mm_zero);
570
571 *dstp = _mm_cvtsi64_si32(dst1); /* dst1 -> pixel */
572 }
573 ++srcp;
574 ++dstp;
575 }, width);
576 /* *INDENT-ON* */
577 srcp += srcskip;
578 dstp += dstskip;
579 }
580 _mm_empty();
581 }
582
583 #endif /* __3dNOW__ */
584
585 /* 16bpp special case for per-surface alpha=50%: blend 2 pixels in parallel */
586
/*
 * Blend a single 16 bit pixel at 50%.
 *
 * `mask` selects every channel bit except the lowest bit of each channel
 * (e.g. 0xf7de for RGB565).  The masked values are added and halved, then
 * the low channel bits that are set in both `s` and `d` are added back.
 * All arguments are parenthesized so that expressions may be passed.
 */
#define BLEND16_50(d, s, mask)                                          \
    (((((s) & (mask)) + ((d) & (mask))) >> 1) +                         \
     ((s) & (d) & (~(mask) & 0xffff)))

/*
 * The 16 bit channel mask replicated into both halves of a 32 bit word.
 * The arithmetic is done in Uint32: shifting the promoted (signed int)
 * mask left by 16 would overflow, which is undefined behaviour.
 */
#define BLEND2x16_MASK32(mask)                                          \
    ((Uint32) (mask) | ((Uint32) (mask) << 16))

/*
 * Blend two 16 bit pixels, packed into one 32 bit word, at 50%.
 * Each operand is masked and halved separately before the addition.
 */
#define BLEND2x16_50(d, s, mask)                                        \
    ((((s) & BLEND2x16_MASK32(mask)) >> 1) +                            \
     (((d) & BLEND2x16_MASK32(mask)) >> 1) +                            \
     ((s) & (d) & ~BLEND2x16_MASK32(mask)))
595
/*
 * 16bpp -> 16bpp blit at 50% surface alpha, two pixels per 32-bit access
 * where possible.
 *
 * info: blit description; dst_w/dst_h, src/dst and src_skip/dst_skip are
 *       read (the skips are in bytes and are halved to pixel units here).
 * mask: passed straight to BLEND16_50 / BLEND2x16_50 (callers in this file
 *       pass 0xf7de for RGB565 and 0xfbde for RGB555).
 *
 * Each row takes one of two paths depending on whether the source and
 * destination addresses differ in bit 1, i.e. whether they can be made
 * 32-bit aligned at the same time.
 */
static void
Blit16to16SurfaceAlpha128(SDL_BlitInfo * info, Uint16 mask)
{
    int width = info->dst_w;
    int height = info->dst_h;
    Uint16 *srcp = (Uint16 *) info->src;
    int srcskip = info->src_skip >> 1;
    Uint16 *dstp = (Uint16 *) info->dst;
    int dstskip = info->dst_skip >> 1;

    while (height--) {
        if (((uintptr_t) srcp ^ (uintptr_t) dstp) & 2) {
            /*
             * Source and destination not aligned, pipeline it.
             * This is mostly a win for big blits but no loss for
             * small ones
             */
            Uint32 prev_sw;
            int w = width;

            /* handle odd destination */
            if ((uintptr_t) dstp & 2) {
                Uint16 d = *dstp, s = *srcp;
                *dstp = BLEND16_50(d, s, mask);
                dstp++;
                srcp++;
                w--;
            }
            /* srcp now runs one pixel ahead of dstp; undone at the row end */
            srcp++;             /* srcp is now 32-bit aligned */

            /* bootstrap pipeline with first halfword */
            /*
             * NOTE(review): this reads the whole 32-bit word before srcp.
             * When no odd destination pixel was handled above, that word
             * appears to include the halfword in front of the row's first
             * source pixel - confirm the source buffer tolerates this.
             */
            prev_sw = ((Uint32 *) srcp)[-1];

            while (w > 1) {
                Uint32 sw, dw, s;
                sw = *(Uint32 *) srcp;
                dw = *(Uint32 *) dstp;
                /* pair up the pending pixel from prev_sw with the first pixel of sw */
#if SDL_BYTEORDER == SDL_BIG_ENDIAN
                s = (prev_sw << 16) + (sw >> 16);
#else
                s = (prev_sw >> 16) + (sw << 16);
#endif
                prev_sw = sw;
                *(Uint32 *) dstp = BLEND2x16_50(dw, s, mask);
                dstp += 2;
                srcp += 2;
                w -= 2;
            }

            /* final pixel if any */
            if (w) {
                Uint16 d = *dstp, s;
                /* the pending pixel is still held in prev_sw */
#if SDL_BYTEORDER == SDL_BIG_ENDIAN
                s = (Uint16) prev_sw;
#else
                s = (Uint16) (prev_sw >> 16);
#endif
                *dstp = BLEND16_50(d, s, mask);
                srcp++;
                dstp++;
            }
            /* the -1 compensates for the extra srcp++ made before the loop */
            srcp += srcskip - 1;
            dstp += dstskip;
        } else {
            /* source and destination are aligned */
            int w = width;

            /* first odd pixel? */
            if ((uintptr_t) srcp & 2) {
                Uint16 d = *dstp, s = *srcp;
                *dstp = BLEND16_50(d, s, mask);
                srcp++;
                dstp++;
                w--;
            }
            /* srcp and dstp are now 32-bit aligned */

            while (w > 1) {
                Uint32 sw = *(Uint32 *) srcp;
                Uint32 dw = *(Uint32 *) dstp;
                *(Uint32 *) dstp = BLEND2x16_50(dw, sw, mask);
                srcp += 2;
                dstp += 2;
                w -= 2;
            }

            /* last odd pixel? */
            if (w) {
                Uint16 d = *dstp, s = *srcp;
                *dstp = BLEND16_50(d, s, mask);
                srcp++;
                dstp++;
            }
            srcp += srcskip;
            dstp += dstskip;
        }
    }
}
694
695 #ifdef __MMX__
696
/*
 * Fast RGB565->RGB565 blending with surface alpha (MMX).
 *
 * info->a is the blend factor.  Exactly 128 is delegated to
 * Blit16to16SurfaceAlpha128() with mask 0xf7de.  Otherwise alpha is reduced
 * to 5 bits and each channel is blended as d + (((s - d) * alpha) >> 5).
 * The three bodies handed to DUFFS_LOOP_124 consume one, two and four
 * pixels respectively; presumably the macro (defined outside this file)
 * uses the first two for leftover pixels and the MMX body for groups of
 * four - confirm against its definition.
 */
static void
Blit565to565SurfaceAlphaMMX(SDL_BlitInfo * info)
{
    unsigned alpha = info->a;
    if (alpha == 128) {
        Blit16to16SurfaceAlpha128(info, 0xf7de);
    } else {
        int width = info->dst_w;
        int height = info->dst_h;
        Uint16 *srcp = (Uint16 *) info->src;
        int srcskip = info->src_skip >> 1;
        Uint16 *dstp = (Uint16 *) info->dst;
        int dstskip = info->dst_skip >> 1;
        Uint32 s, d;

        __m64 src1, dst1, src2, dst2, gmask, bmask, mm_res, mm_alpha;

        alpha &= ~(1 + 2 + 4);  /* cut alpha to get the exact same behaviour */
        mm_alpha = _mm_set_pi32(0, alpha);      /* 0000000A -> mm_alpha */
        alpha >>= 3;            /* downscale alpha to 5 bits */

        mm_alpha = _mm_unpacklo_pi16(mm_alpha, mm_alpha);       /* 00000A0A -> mm_alpha */
        mm_alpha = _mm_unpacklo_pi32(mm_alpha, mm_alpha);       /* 0A0A0A0A -> mm_alpha */
        /* position alpha to allow for mullo and mulhi on diff channels
           to reduce the number of operations */
        /*
         * Each lane now holds (5-bit alpha) << 6.  A channel difference in
         * the low bits is scaled with mullo followed by >> 11; a difference
         * left in place at bit 5 is scaled with mulhi (an implicit >> 16)
         * followed by << 5.  Both amount to (diff * alpha) >> 5.
         */
        mm_alpha = _mm_slli_si64(mm_alpha, 3);

        /* Setup the 565 color channel masks */
        gmask = _mm_set_pi32(0x07E007E0, 0x07E007E0);   /* MASKGREEN -> gmask */
        bmask = _mm_set_pi32(0x001F001F, 0x001F001F);   /* MASKBLUE -> bmask */

        while (height--) {
            /* *INDENT-OFF* */
            DUFFS_LOOP_124(
            {
                /* one pixel, scalar */
                s = *srcp++;
                d = *dstp;
                /*
                 * shift out the middle component (green) to
                 * the high 16 bits, and process all three RGB
                 * components at the same time.
                 */
                s = (s | s << 16) & 0x07e0f81f;
                d = (d | d << 16) & 0x07e0f81f;
                d += (s - d) * alpha >> 5;
                d &= 0x07e0f81f;
                *dstp++ = (Uint16)(d | d >> 16);
            },{
                /* two pixels, scalar */
                s = *srcp++;
                d = *dstp;
                /*
                 * shift out the middle component (green) to
                 * the high 16 bits, and process all three RGB
                 * components at the same time.
                 */
                s = (s | s << 16) & 0x07e0f81f;
                d = (d | d << 16) & 0x07e0f81f;
                d += (s - d) * alpha >> 5;
                d &= 0x07e0f81f;
                *dstp++ = (Uint16)(d | d >> 16);
                s = *srcp++;
                d = *dstp;
                /*
                 * shift out the middle component (green) to
                 * the high 16 bits, and process all three RGB
                 * components at the same time.
                 */
                s = (s | s << 16) & 0x07e0f81f;
                d = (d | d << 16) & 0x07e0f81f;
                d += (s - d) * alpha >> 5;
                d &= 0x07e0f81f;
                *dstp++ = (Uint16)(d | d >> 16);
            },{
                /* four pixels, MMX */
                src1 = *(__m64*)srcp; /* 4 src pixels -> src1 */
                dst1 = *(__m64*)dstp; /* 4 dst pixels -> dst1 */

                /* red */
                src2 = src1;
                src2 = _mm_srli_pi16(src2, 11); /* src2 >> 11 -> src2 [000r 000r 000r 000r] */

                dst2 = dst1;
                dst2 = _mm_srli_pi16(dst2, 11); /* dst2 >> 11 -> dst2 [000r 000r 000r 000r] */

                /* blend */
                src2 = _mm_sub_pi16(src2, dst2);/* src - dst -> src2 */
                src2 = _mm_mullo_pi16(src2, mm_alpha); /* src2 * alpha -> src2 */
                src2 = _mm_srli_pi16(src2, 11); /* src2 >> 11 -> src2 */
                dst2 = _mm_add_pi16(src2, dst2); /* src2 + dst2 -> dst2 */
                dst2 = _mm_slli_pi16(dst2, 11); /* dst2 << 11 -> dst2 */

                mm_res = dst2; /* RED -> mm_res */

                /* green -- process the bits in place */
                src2 = src1;
                src2 = _mm_and_si64(src2, gmask); /* src & MASKGREEN -> src2 */

                dst2 = dst1;
                dst2 = _mm_and_si64(dst2, gmask); /* dst & MASKGREEN -> dst2 */

                /* blend */
                src2 = _mm_sub_pi16(src2, dst2);/* src - dst -> src2 */
                src2 = _mm_mulhi_pi16(src2, mm_alpha); /* src2 * alpha -> src2 */
                src2 = _mm_slli_pi16(src2, 5); /* src2 << 5 -> src2 */
                dst2 = _mm_add_pi16(src2, dst2); /* src2 + dst2 -> dst2 */

                mm_res = _mm_or_si64(mm_res, dst2); /* RED | GREEN -> mm_res */

                /* blue */
                src2 = src1;
                src2 = _mm_and_si64(src2, bmask); /* src & MASKBLUE -> src2[000b 000b 000b 000b] */

                dst2 = dst1;
                dst2 = _mm_and_si64(dst2, bmask); /* dst & MASKBLUE -> dst2[000b 000b 000b 000b] */

                /* blend */
                src2 = _mm_sub_pi16(src2, dst2);/* src - dst -> src2 */
                src2 = _mm_mullo_pi16(src2, mm_alpha); /* src2 * alpha -> src2 */
                src2 = _mm_srli_pi16(src2, 11); /* src2 >> 11 -> src2 */
                dst2 = _mm_add_pi16(src2, dst2); /* src2 + dst2 -> dst2 */
                dst2 = _mm_and_si64(dst2, bmask); /* dst2 & MASKBLUE -> dst2 */

                mm_res = _mm_or_si64(mm_res, dst2); /* RED | GREEN | BLUE -> mm_res */

                *(__m64*)dstp = mm_res; /* mm_res -> 4 dst pixels */

                srcp += 4;
                dstp += 4;
            }, width);
            /* *INDENT-ON* */
            srcp += srcskip;
            dstp += dstskip;
        }
        _mm_empty();
    }
}
833
/*
 * Fast RGB555->RGB555 blending with surface alpha (MMX).
 *
 * info->a is the blend factor.  Exactly 128 is delegated to
 * Blit16to16SurfaceAlpha128() with mask 0xfbde.  Otherwise alpha is reduced
 * to 5 bits and each channel is blended as d + (((s - d) * alpha) >> 5).
 * The three bodies handed to DUFFS_LOOP_124 consume one, two and four
 * pixels respectively; presumably the macro (defined outside this file)
 * uses the first two for leftover pixels and the MMX body for groups of
 * four - confirm against its definition.
 */
static void
Blit555to555SurfaceAlphaMMX(SDL_BlitInfo * info)
{
    unsigned alpha = info->a;
    if (alpha == 128) {
        Blit16to16SurfaceAlpha128(info, 0xfbde);
    } else {
        int width = info->dst_w;
        int height = info->dst_h;
        Uint16 *srcp = (Uint16 *) info->src;
        int srcskip = info->src_skip >> 1;
        Uint16 *dstp = (Uint16 *) info->dst;
        int dstskip = info->dst_skip >> 1;
        Uint32 s, d;

        __m64 src1, dst1, src2, dst2, rmask, gmask, bmask, mm_res, mm_alpha;

        alpha &= ~(1 + 2 + 4);  /* cut alpha to get the exact same behaviour */
        mm_alpha = _mm_set_pi32(0, alpha);      /* 0000000A -> mm_alpha */
        alpha >>= 3;            /* downscale alpha to 5 bits */

        mm_alpha = _mm_unpacklo_pi16(mm_alpha, mm_alpha);       /* 00000A0A -> mm_alpha */
        mm_alpha = _mm_unpacklo_pi32(mm_alpha, mm_alpha);       /* 0A0A0A0A -> mm_alpha */
        /* position alpha to allow for mullo and mulhi on diff channels
           to reduce the number of operations */
        /*
         * Each lane now holds (5-bit alpha) << 6.  The blue difference sits
         * in the low bits and is scaled with mullo followed by >> 11; the
         * red and green differences are left in place and scaled with mulhi
         * (an implicit >> 16) followed by << 5.
         */
        mm_alpha = _mm_slli_si64(mm_alpha, 3);

        /* Setup the 555 color channel masks */
        rmask = _mm_set_pi32(0x7C007C00, 0x7C007C00);   /* MASKRED -> rmask */
        gmask = _mm_set_pi32(0x03E003E0, 0x03E003E0);   /* MASKGREEN -> gmask */
        bmask = _mm_set_pi32(0x001F001F, 0x001F001F);   /* MASKBLUE -> bmask */

        while (height--) {
            /* *INDENT-OFF* */
            DUFFS_LOOP_124(
            {
                /* one pixel, scalar */
                s = *srcp++;
                d = *dstp;
                /*
                 * shift out the middle component (green) to
                 * the high 16 bits, and process all three RGB
                 * components at the same time.
                 */
                s = (s | s << 16) & 0x03e07c1f;
                d = (d | d << 16) & 0x03e07c1f;
                d += (s - d) * alpha >> 5;
                d &= 0x03e07c1f;
                *dstp++ = (Uint16)(d | d >> 16);
            },{
                /* two pixels, scalar */
                s = *srcp++;
                d = *dstp;
                /*
                 * shift out the middle component (green) to
                 * the high 16 bits, and process all three RGB
                 * components at the same time.
                 */
                s = (s | s << 16) & 0x03e07c1f;
                d = (d | d << 16) & 0x03e07c1f;
                d += (s - d) * alpha >> 5;
                d &= 0x03e07c1f;
                *dstp++ = (Uint16)(d | d >> 16);
                s = *srcp++;
                d = *dstp;
                /*
                 * shift out the middle component (green) to
                 * the high 16 bits, and process all three RGB
                 * components at the same time.
                 */
                s = (s | s << 16) & 0x03e07c1f;
                d = (d | d << 16) & 0x03e07c1f;
                d += (s - d) * alpha >> 5;
                d &= 0x03e07c1f;
                *dstp++ = (Uint16)(d | d >> 16);
            },{
                /* four pixels, MMX */
                src1 = *(__m64*)srcp; /* 4 src pixels -> src1 */
                dst1 = *(__m64*)dstp; /* 4 dst pixels -> dst1 */

                /* red -- process the bits in place */
                src2 = src1;
                src2 = _mm_and_si64(src2, rmask); /* src & MASKRED -> src2 */

                dst2 = dst1;
                dst2 = _mm_and_si64(dst2, rmask); /* dst & MASKRED -> dst2 */

                /* blend */
                src2 = _mm_sub_pi16(src2, dst2);/* src - dst -> src2 */
                src2 = _mm_mulhi_pi16(src2, mm_alpha); /* src2 * alpha -> src2 */
                src2 = _mm_slli_pi16(src2, 5); /* src2 << 5 -> src2 */
                dst2 = _mm_add_pi16(src2, dst2); /* src2 + dst2 -> dst2 */
                dst2 = _mm_and_si64(dst2, rmask); /* dst2 & MASKRED -> dst2 */

                mm_res = dst2; /* RED -> mm_res */

                /* green -- process the bits in place */
                src2 = src1;
                src2 = _mm_and_si64(src2, gmask); /* src & MASKGREEN -> src2 */

                dst2 = dst1;
                dst2 = _mm_and_si64(dst2, gmask); /* dst & MASKGREEN -> dst2 */

                /* blend */
                src2 = _mm_sub_pi16(src2, dst2);/* src - dst -> src2 */
                src2 = _mm_mulhi_pi16(src2, mm_alpha); /* src2 * alpha -> src2 */
                src2 = _mm_slli_pi16(src2, 5); /* src2 << 5 -> src2 */
                dst2 = _mm_add_pi16(src2, dst2); /* src2 + dst2 -> dst2 */

                mm_res = _mm_or_si64(mm_res, dst2); /* RED | GREEN -> mm_res */

                /* blue */
                src2 = src1; /* src -> src2 */
                src2 = _mm_and_si64(src2, bmask); /* src & MASKBLUE -> src2[000b 000b 000b 000b] */

                dst2 = dst1; /* dst -> dst2 */
                dst2 = _mm_and_si64(dst2, bmask); /* dst & MASKBLUE -> dst2[000b 000b 000b 000b] */

                /* blend */
                src2 = _mm_sub_pi16(src2, dst2);/* src - dst -> src2 */
                src2 = _mm_mullo_pi16(src2, mm_alpha); /* src2 * alpha -> src2 */
                src2 = _mm_srli_pi16(src2, 11); /* src2 >> 11 -> src2 */
                dst2 = _mm_add_pi16(src2, dst2); /* src2 + dst2 -> dst2 */
                dst2 = _mm_and_si64(dst2, bmask); /* dst2 & MASKBLUE -> dst2 */

                mm_res = _mm_or_si64(mm_res, dst2); /* RED | GREEN | BLUE -> mm_res */

                *(__m64*)dstp = mm_res; /* mm_res -> 4 dst pixels */

                srcp += 4;
                dstp += 4;
            }, width);
            /* *INDENT-ON* */
            srcp += srcskip;
            dstp += dstskip;
        }
        _mm_empty();
    }
}
971
972 #endif /* __MMX__ */
973
974 /* fast RGB565->RGB565 blending with surface alpha */
975 static void
Blit565to565SurfaceAlpha(SDL_BlitInfo * info)976 Blit565to565SurfaceAlpha(SDL_BlitInfo * info)
977 {
978 unsigned alpha = info->a;
979 if (alpha == 128) {
980 Blit16to16SurfaceAlpha128(info, 0xf7de);
981 } else {
982 int width = info->dst_w;
983 int height = info->dst_h;
984 Uint16 *srcp = (Uint16 *) info->src;
985 int srcskip = info->src_skip >> 1;
986 Uint16 *dstp = (Uint16 *) info->dst;
987 int dstskip = info->dst_skip >> 1;
988 alpha >>= 3; /* downscale alpha to 5 bits */
989
990 while (height--) {
991 /* *INDENT-OFF* */
992 DUFFS_LOOP4({
993 Uint32 s = *srcp++;
994 Uint32 d = *dstp;
995 /*
996 * shift out the middle component (green) to
997 * the high 16 bits, and process all three RGB
998 * components at the same time.
999 */
1000 s = (s | s << 16) & 0x07e0f81f;
1001 d = (d | d << 16) & 0x07e0f81f;
1002 d += (s - d) * alpha >> 5;
1003 d &= 0x07e0f81f;
1004 *dstp++ = (Uint16)(d | d >> 16);
1005 }, width);
1006 /* *INDENT-ON* */
1007 srcp += srcskip;
1008 dstp += dstskip;
1009 }
1010 }
1011 }
1012
1013 /* fast RGB555->RGB555 blending with surface alpha */
1014 static void
Blit555to555SurfaceAlpha(SDL_BlitInfo * info)1015 Blit555to555SurfaceAlpha(SDL_BlitInfo * info)
1016 {
1017 unsigned alpha = info->a; /* downscale alpha to 5 bits */
1018 if (alpha == 128) {
1019 Blit16to16SurfaceAlpha128(info, 0xfbde);
1020 } else {
1021 int width = info->dst_w;
1022 int height = info->dst_h;
1023 Uint16 *srcp = (Uint16 *) info->src;
1024 int srcskip = info->src_skip >> 1;
1025 Uint16 *dstp = (Uint16 *) info->dst;
1026 int dstskip = info->dst_skip >> 1;
1027 alpha >>= 3; /* downscale alpha to 5 bits */
1028
1029 while (height--) {
1030 /* *INDENT-OFF* */
1031 DUFFS_LOOP4({
1032 Uint32 s = *srcp++;
1033 Uint32 d = *dstp;
1034 /*
1035 * shift out the middle component (green) to
1036 * the high 16 bits, and process all three RGB
1037 * components at the same time.
1038 */
1039 s = (s | s << 16) & 0x03e07c1f;
1040 d = (d | d << 16) & 0x03e07c1f;
1041 d += (s - d) * alpha >> 5;
1042 d &= 0x03e07c1f;
1043 *dstp++ = (Uint16)(d | d >> 16);
1044 }, width);
1045 /* *INDENT-ON* */
1046 srcp += srcskip;
1047 dstp += dstskip;
1048 }
1049 }
1050 }
1051
1052 /* fast ARGB8888->RGB565 blending with pixel alpha */
1053 static void
BlitARGBto565PixelAlpha(SDL_BlitInfo * info)1054 BlitARGBto565PixelAlpha(SDL_BlitInfo * info)
1055 {
1056 int width = info->dst_w;
1057 int height = info->dst_h;
1058 Uint32 *srcp = (Uint32 *) info->src;
1059 int srcskip = info->src_skip >> 2;
1060 Uint16 *dstp = (Uint16 *) info->dst;
1061 int dstskip = info->dst_skip >> 1;
1062
1063 while (height--) {
1064 /* *INDENT-OFF* */
1065 DUFFS_LOOP4({
1066 Uint32 s = *srcp;
1067 unsigned alpha = s >> 27; /* downscale alpha to 5 bits */
1068 /* FIXME: Here we special-case opaque alpha since the
1069 compositioning used (>>8 instead of /255) doesn't handle
1070 it correctly. Also special-case alpha=0 for speed?
1071 Benchmark this! */
1072 if(alpha) {
1073 if(alpha == (SDL_ALPHA_OPAQUE >> 3)) {
1074 *dstp = (Uint16)((s >> 8 & 0xf800) + (s >> 5 & 0x7e0) + (s >> 3 & 0x1f));
1075 } else {
1076 Uint32 d = *dstp;
1077 /*
1078 * convert source and destination to G0RAB65565
1079 * and blend all components at the same time
1080 */
1081 s = ((s & 0xfc00) << 11) + (s >> 8 & 0xf800)
1082 + (s >> 3 & 0x1f);
1083 d = (d | d << 16) & 0x07e0f81f;
1084 d += (s - d) * alpha >> 5;
1085 d &= 0x07e0f81f;
1086 *dstp = (Uint16)(d | d >> 16);
1087 }
1088 }
1089 srcp++;
1090 dstp++;
1091 }, width);
1092 /* *INDENT-ON* */
1093 srcp += srcskip;
1094 dstp += dstskip;
1095 }
1096 }
1097
1098 /* fast ARGB8888->RGB555 blending with pixel alpha */
1099 static void
BlitARGBto555PixelAlpha(SDL_BlitInfo * info)1100 BlitARGBto555PixelAlpha(SDL_BlitInfo * info)
1101 {
1102 int width = info->dst_w;
1103 int height = info->dst_h;
1104 Uint32 *srcp = (Uint32 *) info->src;
1105 int srcskip = info->src_skip >> 2;
1106 Uint16 *dstp = (Uint16 *) info->dst;
1107 int dstskip = info->dst_skip >> 1;
1108
1109 while (height--) {
1110 /* *INDENT-OFF* */
1111 DUFFS_LOOP4({
1112 unsigned alpha;
1113 Uint32 s = *srcp;
1114 alpha = s >> 27; /* downscale alpha to 5 bits */
1115 /* FIXME: Here we special-case opaque alpha since the
1116 compositioning used (>>8 instead of /255) doesn't handle
1117 it correctly. Also special-case alpha=0 for speed?
1118 Benchmark this! */
1119 if(alpha) {
1120 if(alpha == (SDL_ALPHA_OPAQUE >> 3)) {
1121 *dstp = (Uint16)((s >> 9 & 0x7c00) + (s >> 6 & 0x3e0) + (s >> 3 & 0x1f));
1122 } else {
1123 Uint32 d = *dstp;
1124 /*
1125 * convert source and destination to G0RAB65565
1126 * and blend all components at the same time
1127 */
1128 s = ((s & 0xf800) << 10) + (s >> 9 & 0x7c00)
1129 + (s >> 3 & 0x1f);
1130 d = (d | d << 16) & 0x03e07c1f;
1131 d += (s - d) * alpha >> 5;
1132 d &= 0x03e07c1f;
1133 *dstp = (Uint16)(d | d >> 16);
1134 }
1135 }
1136 srcp++;
1137 dstp++;
1138 }, width);
1139 /* *INDENT-ON* */
1140 srcp += srcskip;
1141 dstp += dstskip;
1142 }
1143 }
1144
1145 /* General (slow) N->N blending with per-surface alpha */
1146 static void
BlitNtoNSurfaceAlpha(SDL_BlitInfo * info)1147 BlitNtoNSurfaceAlpha(SDL_BlitInfo * info)
1148 {
1149 int width = info->dst_w;
1150 int height = info->dst_h;
1151 Uint8 *src = info->src;
1152 int srcskip = info->src_skip;
1153 Uint8 *dst = info->dst;
1154 int dstskip = info->dst_skip;
1155 SDL_PixelFormat *srcfmt = info->src_fmt;
1156 SDL_PixelFormat *dstfmt = info->dst_fmt;
1157 int srcbpp = srcfmt->BytesPerPixel;
1158 int dstbpp = dstfmt->BytesPerPixel;
1159 Uint32 Pixel;
1160 unsigned sR, sG, sB;
1161 unsigned dR, dG, dB, dA;
1162 const unsigned sA = info->a;
1163
1164 if (sA) {
1165 while (height--) {
1166 /* *INDENT-OFF* */
1167 DUFFS_LOOP4(
1168 {
1169 DISEMBLE_RGB(src, srcbpp, srcfmt, Pixel, sR, sG, sB);
1170 DISEMBLE_RGBA(dst, dstbpp, dstfmt, Pixel, dR, dG, dB, dA);
1171 ALPHA_BLEND_RGBA(sR, sG, sB, sA, dR, dG, dB, dA);
1172 ASSEMBLE_RGBA(dst, dstbpp, dstfmt, dR, dG, dB, dA);
1173 src += srcbpp;
1174 dst += dstbpp;
1175 },
1176 width);
1177 /* *INDENT-ON* */
1178 src += srcskip;
1179 dst += dstskip;
1180 }
1181 }
1182 }
1183
1184 /* General (slow) colorkeyed N->N blending with per-surface alpha */
1185 static void
BlitNtoNSurfaceAlphaKey(SDL_BlitInfo * info)1186 BlitNtoNSurfaceAlphaKey(SDL_BlitInfo * info)
1187 {
1188 int width = info->dst_w;
1189 int height = info->dst_h;
1190 Uint8 *src = info->src;
1191 int srcskip = info->src_skip;
1192 Uint8 *dst = info->dst;
1193 int dstskip = info->dst_skip;
1194 SDL_PixelFormat *srcfmt = info->src_fmt;
1195 SDL_PixelFormat *dstfmt = info->dst_fmt;
1196 Uint32 ckey = info->colorkey;
1197 int srcbpp = srcfmt->BytesPerPixel;
1198 int dstbpp = dstfmt->BytesPerPixel;
1199 Uint32 Pixel;
1200 unsigned sR, sG, sB;
1201 unsigned dR, dG, dB, dA;
1202 const unsigned sA = info->a;
1203
1204 while (height--) {
1205 /* *INDENT-OFF* */
1206 DUFFS_LOOP4(
1207 {
1208 RETRIEVE_RGB_PIXEL(src, srcbpp, Pixel);
1209 if(sA && Pixel != ckey) {
1210 RGB_FROM_PIXEL(Pixel, srcfmt, sR, sG, sB);
1211 DISEMBLE_RGBA(dst, dstbpp, dstfmt, Pixel, dR, dG, dB, dA);
1212 ALPHA_BLEND_RGBA(sR, sG, sB, sA, dR, dG, dB, dA);
1213 ASSEMBLE_RGBA(dst, dstbpp, dstfmt, dR, dG, dB, dA);
1214 }
1215 src += srcbpp;
1216 dst += dstbpp;
1217 },
1218 width);
1219 /* *INDENT-ON* */
1220 src += srcskip;
1221 dst += dstskip;
1222 }
1223 }
1224
1225 /* General (slow) N->N blending with pixel alpha */
1226 static void
BlitNtoNPixelAlpha(SDL_BlitInfo * info)1227 BlitNtoNPixelAlpha(SDL_BlitInfo * info)
1228 {
1229 int width = info->dst_w;
1230 int height = info->dst_h;
1231 Uint8 *src = info->src;
1232 int srcskip = info->src_skip;
1233 Uint8 *dst = info->dst;
1234 int dstskip = info->dst_skip;
1235 SDL_PixelFormat *srcfmt = info->src_fmt;
1236 SDL_PixelFormat *dstfmt = info->dst_fmt;
1237 int srcbpp;
1238 int dstbpp;
1239 Uint32 Pixel;
1240 unsigned sR, sG, sB, sA;
1241 unsigned dR, dG, dB, dA;
1242
1243 /* Set up some basic variables */
1244 srcbpp = srcfmt->BytesPerPixel;
1245 dstbpp = dstfmt->BytesPerPixel;
1246
1247 while (height--) {
1248 /* *INDENT-OFF* */
1249 DUFFS_LOOP4(
1250 {
1251 DISEMBLE_RGBA(src, srcbpp, srcfmt, Pixel, sR, sG, sB, sA);
1252 if(sA) {
1253 DISEMBLE_RGBA(dst, dstbpp, dstfmt, Pixel, dR, dG, dB, dA);
1254 ALPHA_BLEND_RGBA(sR, sG, sB, sA, dR, dG, dB, dA);
1255 ASSEMBLE_RGBA(dst, dstbpp, dstfmt, dR, dG, dB, dA);
1256 }
1257 src += srcbpp;
1258 dst += dstbpp;
1259 },
1260 width);
1261 /* *INDENT-ON* */
1262 src += srcskip;
1263 dst += dstskip;
1264 }
1265 }
1266
1267
1268 SDL_BlitFunc
SDL_CalculateBlitA(SDL_Surface * surface)1269 SDL_CalculateBlitA(SDL_Surface * surface)
1270 {
1271 SDL_PixelFormat *sf = surface->format;
1272 SDL_PixelFormat *df = surface->map->dst->format;
1273
1274 switch (surface->map->info.flags & ~SDL_COPY_RLE_MASK) {
1275 case SDL_COPY_BLEND:
1276 /* Per-pixel alpha blits */
1277 switch (df->BytesPerPixel) {
1278 case 1:
1279 return BlitNto1PixelAlpha;
1280
1281 case 2:
1282 if (sf->BytesPerPixel == 4 && sf->Amask == 0xff000000
1283 && sf->Gmask == 0xff00
1284 && ((sf->Rmask == 0xff && df->Rmask == 0x1f)
1285 || (sf->Bmask == 0xff && df->Bmask == 0x1f))) {
1286 if (df->Gmask == 0x7e0)
1287 return BlitARGBto565PixelAlpha;
1288 else if (df->Gmask == 0x3e0)
1289 return BlitARGBto555PixelAlpha;
1290 }
1291 return BlitNtoNPixelAlpha;
1292
1293 case 4:
1294 if (sf->Rmask == df->Rmask
1295 && sf->Gmask == df->Gmask
1296 && sf->Bmask == df->Bmask && sf->BytesPerPixel == 4) {
1297 #if defined(__MMX__) || defined(__3dNOW__)
1298 if (sf->Rshift % 8 == 0
1299 && sf->Gshift % 8 == 0
1300 && sf->Bshift % 8 == 0
1301 && sf->Ashift % 8 == 0 && sf->Aloss == 0) {
1302 #ifdef __3dNOW__
1303 if (SDL_Has3DNow())
1304 return BlitRGBtoRGBPixelAlphaMMX3DNOW;
1305 #endif
1306 #ifdef __MMX__
1307 if (SDL_HasMMX())
1308 return BlitRGBtoRGBPixelAlphaMMX;
1309 #endif
1310 }
1311 #endif /* __MMX__ || __3dNOW__ */
1312 if (sf->Amask == 0xff000000) {
1313 return BlitRGBtoRGBPixelAlpha;
1314 }
1315 }
1316 return BlitNtoNPixelAlpha;
1317
1318 case 3:
1319 default:
1320 return BlitNtoNPixelAlpha;
1321 }
1322 break;
1323
1324 case SDL_COPY_MODULATE_ALPHA | SDL_COPY_BLEND:
1325 if (sf->Amask == 0) {
1326 /* Per-surface alpha blits */
1327 switch (df->BytesPerPixel) {
1328 case 1:
1329 return BlitNto1SurfaceAlpha;
1330
1331 case 2:
1332 if (surface->map->identity) {
1333 if (df->Gmask == 0x7e0) {
1334 #ifdef __MMX__
1335 if (SDL_HasMMX())
1336 return Blit565to565SurfaceAlphaMMX;
1337 else
1338 #endif
1339 return Blit565to565SurfaceAlpha;
1340 } else if (df->Gmask == 0x3e0) {
1341 #ifdef __MMX__
1342 if (SDL_HasMMX())
1343 return Blit555to555SurfaceAlphaMMX;
1344 else
1345 #endif
1346 return Blit555to555SurfaceAlpha;
1347 }
1348 }
1349 return BlitNtoNSurfaceAlpha;
1350
1351 case 4:
1352 if (sf->Rmask == df->Rmask
1353 && sf->Gmask == df->Gmask
1354 && sf->Bmask == df->Bmask && sf->BytesPerPixel == 4) {
1355 #ifdef __MMX__
1356 if (sf->Rshift % 8 == 0
1357 && sf->Gshift % 8 == 0
1358 && sf->Bshift % 8 == 0 && SDL_HasMMX())
1359 return BlitRGBtoRGBSurfaceAlphaMMX;
1360 #endif
1361 if ((sf->Rmask | sf->Gmask | sf->Bmask) == 0xffffff) {
1362 return BlitRGBtoRGBSurfaceAlpha;
1363 }
1364 }
1365 return BlitNtoNSurfaceAlpha;
1366
1367 case 3:
1368 default:
1369 return BlitNtoNSurfaceAlpha;
1370 }
1371 }
1372 break;
1373
1374 case SDL_COPY_COLORKEY | SDL_COPY_MODULATE_ALPHA | SDL_COPY_BLEND:
1375 if (sf->Amask == 0) {
1376 if (df->BytesPerPixel == 1) {
1377 return BlitNto1SurfaceAlphaKey;
1378 } else {
1379 return BlitNtoNSurfaceAlphaKey;
1380 }
1381 }
1382 break;
1383 }
1384
1385 return NULL;
1386 }
1387
1388 /* vi: set ts=4 sw=4 expandtab: */
1389