• Home
  • Line#
  • Scopes#
  • Navigate#
  • Raw
  • Download
1 /*
2     SDL - Simple DirectMedia Layer
3     Copyright (C) 1997-2006 Sam Lantinga
4 
5     This library is free software; you can redistribute it and/or
6     modify it under the terms of the GNU Lesser General Public
7     License as published by the Free Software Foundation; either
8     version 2.1 of the License, or (at your option) any later version.
9 
10     This library is distributed in the hope that it will be useful,
11     but WITHOUT ANY WARRANTY; without even the implied warranty of
12     MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
13     Lesser General Public License for more details.
14 
15     You should have received a copy of the GNU Lesser General Public
16     License along with this library; if not, write to the Free Software
17     Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA  02110-1301  USA
18 
19     Sam Lantinga
20     slouken@libsdl.org
21 */
22 #include "SDL_config.h"
23 
24 /*
25  * RLE encoding for software colorkey and alpha-channel acceleration
26  *
27  * Original version by Sam Lantinga
28  *
29  * Mattias Engdegård (Yorick): Rewrite. New encoding format, encoder and
30  * decoder. Added per-surface alpha blitter. Added per-pixel alpha
31  * format, encoder and blitter.
32  *
33  * Many thanks to Xark and johns for hints, benchmarks and useful comments
34  * leading to this code.
35  *
36  * Welcome to Macro Mayhem.
37  */
38 
39 /*
40  * The encoding translates the image data to a stream of segments of the form
41  *
42  * <skip> <run> <data>
43  *
44  * where <skip> is the number of transparent pixels to skip,
45  *       <run>  is the number of opaque pixels to blit,
46  * and   <data> are the pixels themselves.
47  *
48  * This basic structure is used both for colorkeyed surfaces, used for simple
49  * binary transparency and for per-surface alpha blending, and for surfaces
50  * with per-pixel alpha. The details differ, however:
51  *
52  * Encoding of colorkeyed surfaces:
53  *
54  *   Encoded pixels always have the same format as the target surface.
55  *   <skip> and <run> are unsigned 8 bit integers, except for 32 bit depth
56  *   where they are 16 bit. This makes the pixel data aligned at all times.
57  *   Segments never wrap around from one scan line to the next.
58  *
59  *   The end of the sequence is marked by a zero <skip>,<run> pair at the
60  *   beginning of a line.
61  *
62  * Encoding of surfaces with per-pixel alpha:
63  *
64  *   The sequence begins with a struct RLEDestFormat describing the target
65  *   pixel format, to provide reliable un-encoding.
66  *
67  *   Each scan line is encoded twice: First all completely opaque pixels,
68  *   encoded in the target format as described above, and then all
69  *   partially transparent (translucent) pixels (where 1 <= alpha <= 254),
70  *   in the following 32-bit format:
71  *
72  *   For 32-bit targets, each pixel has the target RGB format but with
73  *   the alpha value occupying the highest 8 bits. The <skip> and <run>
74  *   counts are 16 bit.
75  *
76  *   For 16-bit targets, each pixel has the target RGB format, but with
77  *   the middle component (usually green) shifted 16 steps to the left,
78  *   and the hole filled with the 5 most significant bits of the alpha value.
79  *   i.e. if the target has the format         rrrrrggggggbbbbb,
80  *   the encoded pixel will be 00000gggggg00000rrrrr0aaaaabbbbb.
81  *   The <skip> and <run> counts are 8 bit for the opaque lines, 16 bit
82  *   for the translucent lines. Two padding bytes may be inserted
83  *   before each translucent line to keep them 32-bit aligned.
84  *
85  *   The end of the sequence is marked by a zero <skip>,<run> pair at the
86  *   beginning of an opaque line.
87  */
88 
89 #include "SDL_video.h"
90 #include "SDL_sysvideo.h"
91 #include "SDL_blit.h"
92 #include "SDL_RLEaccel_c.h"
93 
94 #if defined(__GNUC__) && (defined(__i386__) || defined(__x86_64__)) && SDL_ASSEMBLY_ROUTINES
95 #define MMX_ASMBLIT
96 #endif
97 
98 #ifdef MMX_ASMBLIT
99 #include "mmx.h"
100 #include "SDL_cpuinfo.h"
101 #endif
102 
/*
 * Larger / smaller of two values.  Only defined when the including
 * environment has not already supplied them.  Each argument may be
 * evaluated more than once, so do not pass expressions with side effects.
 */
#ifndef MAX
#define MAX(a, b) (((b) < (a)) ? (a) : (b))
#endif
#ifndef MIN
#define MIN(a, b) (((b) > (a)) ? (a) : (b))
#endif
109 
/*
 * Copy `len` pixels of `bpp` bytes each from `from` to `to`.
 *
 * 4-byte pixels are handed to SDL_memcpy4 with the pixel count as its
 * length (presumably a count of 32-bit units -- confirm against its
 * definition); every other depth is a plain byte copy of len * bpp bytes.
 */
#define PIXEL_COPY(to, from, len, bpp) \
do { \
    if(bpp != 4) { \
        SDL_memcpy(to, from, (size_t)(len) * (bpp)); \
    } else { \
        SDL_memcpy4(to, from, (size_t)(len)); \
    } \
} while(0)
118 
119 /*
120  * Various colorkey blit methods, for opaque and per-surface alpha
121  */
122 
/* Fully opaque run: no blending, the pixels are simply copied (alpha is unused). */
#define OPAQUE_BLIT(dst_px, src_px, npixels, bpp, alpha) \
    PIXEL_COPY(dst_px, src_px, npixels, bpp)
125 
126 #ifdef MMX_ASMBLIT
127 
/*
 * MMX per-surface alpha blend of `length` 32bpp pixels from `from` onto `to`.
 *
 * Register use:
 *   mm3 = 0x00FF00FF in both halves  (keeps the low byte of every 16-bit word)
 *   mm4 = alpha in every 16-bit word
 *   mm5 = 0x00FFFFFF in both halves  (clears the top byte of each result pixel)
 *   mm7 = 0xFF000000 in both halves  (only used to build mm5)
 * An odd leading pixel is blended on its own; the rest are done two at a
 * time.  The instruction macros come from mmx.h.
 */
#define ALPHA_BLIT32_888MMX(to, from, length, bpp, alpha) \
    do { \
        Uint32 *mmx_src = (Uint32 *)(from); \
        Uint32 *mmx_dst = (Uint32 *)(to); \
        int mmx_word_mask = 0x00FF00FF; \
        int mmx_top_byte = 0xFF000000; \
        int mmx_alpha2 = alpha | alpha << 16; \
        int mmx_todo; \
        movd_m2r(*(&mmx_word_mask), mm3); \
        punpckldq_r2r(mm3, mm3); \
        movd_m2r(*(&mmx_top_byte), mm7); \
        punpckldq_r2r(mm7, mm7); \
        movd_m2r(*(&mmx_alpha2), mm4); \
        punpckldq_r2r(mm4, mm4); \
        pcmpeqd_r2r(mm5, mm5); /* all bits of mm5 set */ \
        pxor_r2r(mm7, mm5);    /* mm5 = mask that clears the top byte */ \
        mmx_todo = length; \
        if(mmx_todo & 1) { \
            /* single leading pixel */ \
            movd_m2r((*mmx_src), mm1);  /* src -> mm1 */ \
            punpcklbw_r2r(mm1, mm1); \
            pand_r2r(mm3, mm1);         /* src bytes widened to words */ \
            movd_m2r((*mmx_dst), mm2);  /* dst -> mm2 */ \
            punpcklbw_r2r(mm2, mm2); \
            pand_r2r(mm3, mm2);         /* dst bytes widened to words */ \
            psubw_r2r(mm2, mm1);        /* src - dst */ \
            pmullw_r2r(mm4, mm1);       /* * alpha */ \
            psrlw_i2r(8, mm1);          /* >> 8 */ \
            paddw_r2r(mm1, mm2);        /* + dst */ \
            pand_r2r(mm3, mm2); \
            packuswb_r2r(mm2, mm2); \
            pand_r2r(mm5, mm2);         /* 00000RGB -> mm2 */ \
            movd_r2m(mm2, *mmx_dst); \
            ++mmx_src; \
            ++mmx_dst; \
            mmx_todo--; \
        } \
        while(mmx_todo > 0) { \
            /* two pixels per pass: mm0/mm2 = low pixel, mm1/mm6 = high pixel */ \
            movq_m2r((*mmx_src), mm0); \
            movq_r2r(mm0, mm1); \
            punpcklbw_r2r(mm0, mm0); \
            movq_m2r((*mmx_dst), mm2); \
            punpckhbw_r2r(mm1, mm1); \
            movq_r2r(mm2, mm6); \
            pand_r2r(mm3, mm0); \
            punpcklbw_r2r(mm2, mm2); \
            pand_r2r(mm3, mm1); \
            punpckhbw_r2r(mm6, mm6); \
            pand_r2r(mm3, mm2); \
            psubw_r2r(mm2, mm0); \
            pmullw_r2r(mm4, mm0); \
            pand_r2r(mm3, mm6); \
            psubw_r2r(mm6, mm1); \
            pmullw_r2r(mm4, mm1); \
            psrlw_i2r(8, mm0); \
            paddw_r2r(mm0, mm2); \
            psrlw_i2r(8, mm1); \
            paddw_r2r(mm1, mm6); \
            pand_r2r(mm3, mm2); \
            pand_r2r(mm3, mm6); \
            packuswb_r2r(mm2, mm2); \
            packuswb_r2r(mm6, mm6); \
            psrlq_i2r(32, mm2); \
            psllq_i2r(32, mm6); \
            por_r2r(mm6, mm2);          /* recombine both pixels */ \
            pand_r2r(mm5, mm2);         /* 00000RGB -> mm2 */ \
            movq_r2m(mm2, *mmx_dst); \
            mmx_src += 2; \
            mmx_dst += 2; \
            mmx_todo -= 2; \
        } \
        emms(); \
    } while(0)
199 
/*
 * MMX per-surface alpha blend of `length` RGB565 pixels from `from` onto `to`.
 *
 * The first (length & 3) pixels are blended with scalar code using a 5-bit
 * alpha; the remainder is processed four pixels per pass with MMX.  The
 * low three bits of alpha are dropped so that the MMX path
 * ((s - d) * alpha >> 8) applies the same factor as the scalar path
 * ((s - d) * (alpha >> 3) >> 5).
 *
 * The truncated alpha is kept in a local: the caller's `alpha` is not
 * modified and does not need to be an lvalue.
 *
 * Register use:
 *   mm1 = 0xF800 x4 (top field)   mm4 = 0x07E0 x4 (middle field)
 *   mm7 = 0x001F x4 (low field)   mm0 = alpha x4
 */
#define ALPHA_BLIT16_565MMX(to, from, length, bpp, alpha) \
    do { \
        Uint16 *mmx_src = (Uint16 *)(from); \
        Uint16 *mmx_dst = (Uint16 *)(to); \
        const Uint32 mmx_a8 = (Uint32)(alpha) & ~(Uint32)7; \
        const Uint32 mmx_a5 = mmx_a8 >> 3; \
        const int mmx_lead = ((int)(length) & 3); \
        Uint32 mmx_field; \
        int mmx_k; \
        mmx_field = 0xF800; \
        movd_m2r(*(&mmx_field), mm1); \
        punpcklwd_r2r(mm1, mm1); \
        punpcklwd_r2r(mm1, mm1); \
        mmx_field = 0x07E0; \
        movd_m2r(*(&mmx_field), mm4); \
        punpcklwd_r2r(mm4, mm4); \
        punpcklwd_r2r(mm4, mm4); \
        mmx_field = 0x001F; \
        movd_m2r(*(&mmx_field), mm7); \
        punpcklwd_r2r(mm7, mm7); \
        punpcklwd_r2r(mm7, mm7); \
        mmx_k = mmx_a8 | mmx_a8 << 16; \
        movd_m2r(*(&mmx_k), mm0); \
        punpckldq_r2r(mm0, mm0); \
        /* scalar lead-in so the MMX loop sees a multiple of four pixels */ \
        for(mmx_k = mmx_lead; mmx_k > 0; --mmx_k) { \
            Uint32 mmx_s = *mmx_src++; \
            Uint32 mmx_d = *mmx_dst; \
            mmx_s = (mmx_s | mmx_s << 16) & 0x07e0f81f; \
            mmx_d = (mmx_d | mmx_d << 16) & 0x07e0f81f; \
            mmx_d += (mmx_s - mmx_d) * mmx_a5 >> 5; \
            mmx_d &= 0x07e0f81f; \
            *mmx_dst++ = (Uint16)(mmx_d | mmx_d >> 16); \
        } \
        for(mmx_k = (int)(length) - mmx_lead; mmx_k > 0; mmx_k -= 4) { \
            movq_m2r((*mmx_dst), mm3);  /* 4 dst pixels */ \
            movq_m2r((*mmx_src), mm2);  /* 4 src pixels */ \
            /* top field (bits 11-15) */ \
            movq_r2r(mm2, mm5); \
            pand_r2r(mm1, mm5); \
            psrlq_i2r(11, mm5); \
            movq_r2r(mm3, mm6); \
            pand_r2r(mm1, mm6); \
            psrlq_i2r(11, mm6); \
            psubw_r2r(mm6, mm5); \
            pmullw_r2r(mm0, mm5); \
            psrlw_i2r(8, mm5); \
            paddw_r2r(mm5, mm6); \
            psllq_i2r(11, mm6); \
            pand_r2r(mm1, mm6); \
            movq_r2r(mm4, mm5); \
            por_r2r(mm7, mm5); \
            pand_r2r(mm5, mm3);         /* keep the other two fields of dst */ \
            por_r2r(mm6, mm3); \
            /* middle field (bits 5-10) */ \
            movq_r2r(mm2, mm5); \
            pand_r2r(mm4, mm5); \
            psrlq_i2r(5, mm5); \
            movq_r2r(mm3, mm6); \
            pand_r2r(mm4, mm6); \
            psrlq_i2r(5, mm6); \
            psubw_r2r(mm6, mm5); \
            pmullw_r2r(mm0, mm5); \
            psrlw_i2r(8, mm5); \
            paddw_r2r(mm5, mm6); \
            psllq_i2r(5, mm6); \
            pand_r2r(mm4, mm6); \
            movq_r2r(mm1, mm5); \
            por_r2r(mm7, mm5); \
            pand_r2r(mm5, mm3); \
            por_r2r(mm6, mm3); \
            /* low field (bits 0-4) */ \
            movq_r2r(mm2, mm5); \
            pand_r2r(mm7, mm5); \
            movq_r2r(mm3, mm6); \
            pand_r2r(mm7, mm6); \
            psubw_r2r(mm6, mm5); \
            pmullw_r2r(mm0, mm5); \
            psrlw_i2r(8, mm5); \
            paddw_r2r(mm5, mm6); \
            pand_r2r(mm7, mm6); \
            movq_r2r(mm1, mm5); \
            por_r2r(mm4, mm5); \
            pand_r2r(mm5, mm3); \
            por_r2r(mm6, mm3); \
            movq_r2m(mm3, *mmx_dst); \
            mmx_src += 4; \
            mmx_dst += 4; \
        } \
        emms(); \
    } while(0)
289 
/*
 * MMX per-surface alpha blend of `length` RGB555 pixels from `from` onto `to`.
 *
 * The first (length & 3) pixels are blended with scalar code using a 5-bit
 * alpha; the remainder is processed four pixels per pass with MMX.  The
 * low three bits of alpha are dropped so that the MMX path
 * ((s - d) * alpha >> 8) applies the same factor as the scalar path
 * ((s - d) * (alpha >> 3) >> 5).
 *
 * The truncated alpha is kept in a local: the caller's `alpha` is not
 * modified and does not need to be an lvalue.
 *
 * Register use:
 *   mm1 = 0x7C00 x4 (top field)   mm4 = 0x03E0 x4 (middle field)
 *   mm7 = 0x001F x4 (low field)   mm0 = alpha x4
 */
#define ALPHA_BLIT16_555MMX(to, from, length, bpp, alpha) \
    do { \
        Uint16 *mmx_src = (Uint16 *)(from); \
        Uint16 *mmx_dst = (Uint16 *)(to); \
        const Uint32 mmx_a8 = (Uint32)(alpha) & ~(Uint32)7; \
        const Uint32 mmx_a5 = mmx_a8 >> 3; \
        const int mmx_lead = ((int)(length) & 3); \
        Uint32 mmx_field; \
        int mmx_k; \
        mmx_field = 0x7C00; \
        movd_m2r(*(&mmx_field), mm1); \
        punpcklwd_r2r(mm1, mm1); \
        punpcklwd_r2r(mm1, mm1); \
        mmx_field = 0x03E0; \
        movd_m2r(*(&mmx_field), mm4); \
        punpcklwd_r2r(mm4, mm4); \
        punpcklwd_r2r(mm4, mm4); \
        mmx_field = 0x001F; \
        movd_m2r(*(&mmx_field), mm7); \
        punpcklwd_r2r(mm7, mm7); \
        punpcklwd_r2r(mm7, mm7); \
        mmx_k = mmx_a8 | mmx_a8 << 16; \
        movd_m2r(*(&mmx_k), mm0); \
        punpckldq_r2r(mm0, mm0); \
        /* scalar lead-in so the MMX loop sees a multiple of four pixels */ \
        for(mmx_k = mmx_lead; mmx_k > 0; --mmx_k) { \
            Uint32 mmx_s = *mmx_src++; \
            Uint32 mmx_d = *mmx_dst; \
            mmx_s = (mmx_s | mmx_s << 16) & 0x03e07c1f; \
            mmx_d = (mmx_d | mmx_d << 16) & 0x03e07c1f; \
            mmx_d += (mmx_s - mmx_d) * mmx_a5 >> 5; \
            mmx_d &= 0x03e07c1f; \
            *mmx_dst++ = (Uint16)(mmx_d | mmx_d >> 16); \
        } \
        for(mmx_k = (int)(length) - mmx_lead; mmx_k > 0; mmx_k -= 4) { \
            movq_m2r((*mmx_dst), mm3);  /* 4 dst pixels */ \
            movq_m2r((*mmx_src), mm2);  /* 4 src pixels */ \
            /* top field (bits 10-14) */ \
            movq_r2r(mm2, mm5); \
            pand_r2r(mm1, mm5); \
            psrlq_i2r(10, mm5); \
            movq_r2r(mm3, mm6); \
            pand_r2r(mm1, mm6); \
            psrlq_i2r(10, mm6); \
            psubw_r2r(mm6, mm5); \
            pmullw_r2r(mm0, mm5); \
            psrlw_i2r(8, mm5); \
            paddw_r2r(mm5, mm6); \
            psllq_i2r(10, mm6); \
            pand_r2r(mm1, mm6); \
            movq_r2r(mm4, mm5); \
            por_r2r(mm7, mm5); \
            pand_r2r(mm5, mm3);         /* keep the other two fields of dst */ \
            por_r2r(mm6, mm3); \
            /* middle field (bits 5-9) */ \
            movq_r2r(mm2, mm5); \
            pand_r2r(mm4, mm5); \
            psrlq_i2r(5, mm5); \
            movq_r2r(mm3, mm6); \
            pand_r2r(mm4, mm6); \
            psrlq_i2r(5, mm6); \
            psubw_r2r(mm6, mm5); \
            pmullw_r2r(mm0, mm5); \
            psrlw_i2r(8, mm5); \
            paddw_r2r(mm5, mm6); \
            psllq_i2r(5, mm6); \
            pand_r2r(mm4, mm6); \
            movq_r2r(mm1, mm5); \
            por_r2r(mm7, mm5); \
            pand_r2r(mm5, mm3); \
            por_r2r(mm6, mm3); \
            /* low field (bits 0-4) */ \
            movq_r2r(mm2, mm5); \
            pand_r2r(mm7, mm5); \
            movq_r2r(mm3, mm6); \
            pand_r2r(mm7, mm6); \
            psubw_r2r(mm6, mm5); \
            pmullw_r2r(mm0, mm5); \
            psrlw_i2r(8, mm5); \
            paddw_r2r(mm5, mm6); \
            pand_r2r(mm7, mm6); \
            movq_r2r(mm1, mm5); \
            por_r2r(mm4, mm5); \
            pand_r2r(mm5, mm3); \
            por_r2r(mm6, mm3); \
            movq_r2m(mm3, *mmx_dst); \
            mmx_src += 4; \
            mmx_dst += 4; \
        } \
        emms(); \
    } while(0)
379 
380 #endif
381 
/*
 * For 32bpp pixels of the form 0x00rrggbb:
 * If we treat the middle component separately, we can process the two
 * remaining in parallel. This is safe to do because of the gap to the left
 * of each component, so the bits from the multiplication don't collide.
 * This can be used for any RGB permutation of course.
 *
 * Blends `length` pixels from `from` onto `to` as
 *     dst += (src - dst) * alpha >> 8
 * per component; the top byte of every written pixel is zero.  `bpp` is
 * unused.  `alpha` and `length` are evaluated once, so they may be
 * arbitrary expressions.
 */
#define ALPHA_BLIT32_888(to, from, length, bpp, alpha) \
    do { \
        const Uint32 *blit_src32 = (const Uint32 *)(from); \
        Uint32 *blit_dst32 = (Uint32 *)(to); \
        const Uint32 blit_a = (Uint32)(alpha); \
        int blit_todo; \
        for(blit_todo = (int)(length); blit_todo > 0; --blit_todo) { \
            const Uint32 blit_s = *blit_src32++; \
            const Uint32 blit_d = *blit_dst32; \
            /* outer two components together */ \
            const Uint32 blit_s_rb = blit_s & 0xff00ff; \
            Uint32 blit_d_rb = blit_d & 0xff00ff; \
            /* middle component on its own */ \
            const Uint32 blit_s_g = blit_s & 0xff00; \
            Uint32 blit_d_g = blit_d & 0xff00; \
            blit_d_rb = (blit_d_rb + ((blit_s_rb - blit_d_rb) * blit_a >> 8)) & 0xff00ff; \
            blit_d_g = (blit_d_g + ((blit_s_g - blit_d_g) * blit_a >> 8)) & 0xff00; \
            *blit_dst32++ = blit_d_rb | blit_d_g; \
        } \
    } while(0)
406 
/*
 * For 16bpp pixels we can go a step further: put the middle component
 * in the high 16 bits of a 32 bit word, and process all three RGB
 * components at the same time. Since the smallest gap is here just
 * 5 bits, we have to scale alpha down to 5 bits as well.
 *
 * Blends `length` RGB565 pixels from `from` onto `to`.  `bpp` is unused.
 * `alpha` and `length` are evaluated once, so they may be arbitrary
 * expressions.
 */
#define ALPHA_BLIT16_565(to, from, length, bpp, alpha) \
    do { \
        const Uint16 *blit_src16 = (const Uint16 *)(from); \
        Uint16 *blit_dst16 = (Uint16 *)(to); \
        const Uint32 blit_a5 = (Uint32)(alpha) >> 3; \
        int blit_todo; \
        for(blit_todo = (int)(length); blit_todo > 0; --blit_todo) { \
            Uint32 blit_s = *blit_src16++; \
            Uint32 blit_d = *blit_dst16; \
            /* spread to 0x07e0f81f: middle field high, outer fields low */ \
            blit_s = (blit_s | blit_s << 16) & 0x07e0f81f; \
            blit_d = (blit_d | blit_d << 16) & 0x07e0f81f; \
            blit_d += (blit_s - blit_d) * blit_a5 >> 5; \
            blit_d &= 0x07e0f81f; \
            *blit_dst16++ = (Uint16)(blit_d | blit_d >> 16); \
        } \
    } while(0)
429 
/*
 * Per-surface alpha blend of `length` RGB555 pixels from `from` onto `to`.
 * Each pixel is spread into a 32-bit word (middle field in the high half)
 * so all three fields are blended by one multiply with a 5-bit alpha.
 * `bpp` is unused.  `alpha` and `length` are evaluated once, so they may
 * be arbitrary expressions.
 */
#define ALPHA_BLIT16_555(to, from, length, bpp, alpha) \
    do { \
        const Uint16 *blit_src16 = (const Uint16 *)(from); \
        Uint16 *blit_dst16 = (Uint16 *)(to); \
        const Uint32 blit_a5 = (Uint32)(alpha) >> 3; \
        int blit_todo; \
        for(blit_todo = (int)(length); blit_todo > 0; --blit_todo) { \
            Uint32 blit_s = *blit_src16++; \
            Uint32 blit_d = *blit_dst16; \
            blit_s = (blit_s | blit_s << 16) & 0x03e07c1f; \
            blit_d = (blit_d | blit_d << 16) & 0x03e07c1f; \
            blit_d += (blit_s - blit_d) * blit_a5 >> 5; \
            blit_d &= 0x03e07c1f; \
            *blit_dst16++ = (Uint16)(blit_d | blit_d >> 16); \
        } \
    } while(0)
446 
/*
 * The general slow catch-all function, for remaining depths and formats.
 *
 * Blends `length` pixels of `bpp` (2, 3 or 4) bytes from `from` onto `to`,
 * one component at a time: each pixel is split with RGB_FROM_PIXEL,
 * blended as c += (cs - c) * alpha >> 8, and rebuilt with PIXEL_FROM_RGB.
 * Relies on a pixel-format pointer named `fmt` being in scope at the
 * point of expansion.  For any other `bpp` the source and destination
 * values are taken as 0 and nothing is stored.
 */
#define ALPHA_BLIT_ANY(to, from, length, bpp, alpha) \
    do { \
        int any_i; \
        Uint8 *any_src = (from); \
        Uint8 *any_dst = (to); \
        for(any_i = 0; any_i < (int)(length); any_i++) { \
            Uint32 any_s = 0, any_d = 0; \
            unsigned rs, gs, bs, rd, gd, bd; \
            /* fetch one source and one destination pixel */ \
            switch(bpp) { \
            case 2: \
                any_s = *(Uint16 *)any_src; \
                any_d = *(Uint16 *)any_dst; \
                break; \
            case 3: \
                if(SDL_BYTEORDER == SDL_BIG_ENDIAN) { \
                    any_s = (any_src[0] << 16) | (any_src[1] << 8) | any_src[2]; \
                    any_d = (any_dst[0] << 16) | (any_dst[1] << 8) | any_dst[2]; \
                } else { \
                    any_s = (any_src[2] << 16) | (any_src[1] << 8) | any_src[0]; \
                    any_d = (any_dst[2] << 16) | (any_dst[1] << 8) | any_dst[0]; \
                } \
                break; \
            case 4: \
                any_s = *(Uint32 *)any_src; \
                any_d = *(Uint32 *)any_dst; \
                break; \
            } \
            RGB_FROM_PIXEL(any_s, fmt, rs, gs, bs); \
            RGB_FROM_PIXEL(any_d, fmt, rd, gd, bd); \
            rd += (rs - rd) * (alpha) >> 8; \
            gd += (gs - gd) * (alpha) >> 8; \
            bd += (bs - bd) * (alpha) >> 8; \
            PIXEL_FROM_RGB(any_d, fmt, rd, gd, bd); \
            /* store the blended pixel back */ \
            switch(bpp) { \
            case 2: \
                *(Uint16 *)any_dst = (Uint16)any_d; \
                break; \
            case 3: \
                if(SDL_BYTEORDER == SDL_BIG_ENDIAN) { \
                    any_dst[0] = (Uint8)(any_d >> 16); \
                    any_dst[1] = (Uint8)(any_d >> 8); \
                    any_dst[2] = (Uint8)(any_d); \
                } else { \
                    any_dst[0] = (Uint8)any_d; \
                    any_dst[1] = (Uint8)(any_d >> 8); \
                    any_dst[2] = (Uint8)(any_d >> 16); \
                } \
                break; \
            case 4: \
                *(Uint32 *)any_dst = any_d; \
                break; \
            } \
            any_src += bpp; \
            any_dst += bpp; \
        } \
    } while(0)
506 
507 #ifdef MMX_ASMBLIT
508 
/*
 * MMX 50% blend of `length` 32bpp pixels from `from` onto `to`:
 *     dst = (((s & 0x00fefefe) + (d & 0x00fefefe)) >> 1) + (s & d & 0x00010101)
 * An odd leading pixel is handled with scalar code, the rest two at a time.
 *   mm4 = 0x00fefefe in both halves, mm3 = 0x00010101 in both halves.
 * `bpp` and `alpha` are unused.
 */
#define ALPHA_BLIT32_888_50MMX(to, from, length, bpp, alpha) \
    do { \
        Uint32 *mmx_src = (Uint32 *)(from); \
        Uint32 *mmx_dst = (Uint32 *)(to); \
        int mmx_hi_bits = 0x00fefefe; \
        int mmx_lo_bits = 0x00010101; \
        int mmx_todo = (int)(length); \
        movd_m2r(*(&mmx_hi_bits), mm4); \
        punpckldq_r2r(mm4, mm4); \
        movd_m2r(*(&mmx_lo_bits), mm3); \
        punpckldq_r2r(mm3, mm3); \
        if( mmx_todo & 1 ) { \
            Uint32 mmx_s = *mmx_src++; \
            Uint32 mmx_d = *mmx_dst; \
            *mmx_dst++ = (((mmx_s & 0x00fefefe) + (mmx_d & 0x00fefefe)) >> 1) \
                         + (mmx_s & mmx_d & 0x00010101); \
            mmx_todo--; \
        } \
        while(mmx_todo > 0) { \
            movq_m2r((*mmx_dst), mm2);  /* dst -> mm2 */ \
            movq_r2r(mm2, mm6);         /* dst -> mm6 */ \
            movq_m2r((*mmx_src), mm1);  /* src -> mm1 */ \
            movq_r2r(mm1, mm5);         /* src -> mm5 */ \
            pand_r2r(mm4, mm6);         /* dst & 0x00fefefe -> mm6 */ \
            pand_r2r(mm4, mm5);         /* src & 0x00fefefe -> mm5 */ \
            paddd_r2r(mm6, mm5);        /* (src & 0x00fefefe) + (dst & 0x00fefefe) -> mm5 */ \
            psrld_i2r(1, mm5);          /* halve the sum */ \
            pand_r2r(mm1, mm2);         /* s & d -> mm2 */ \
            pand_r2r(mm3, mm2);         /* s & d & 0x00010101 -> mm2 */ \
            paddd_r2r(mm5, mm2); \
            movq_r2m(mm2, (*mmx_dst)); \
            mmx_dst += 2; \
            mmx_src += 2; \
            mmx_todo -= 2; \
        } \
        emms(); \
    } while(0)
546 
547 #endif
548 
/*
 * Special case: 50% alpha (alpha=128)
 * This is treated specially because it can be optimized very well, and
 * since it is good for many cases of semi-translucency.
 * The theory is to do all three components at the same time:
 * First zero the lowest bit of each component, which gives us room to
 * add them. Then shift right and add back the low bit wherever both
 * source and destination had it set.
 * `bpp` and `alpha` are unused.
 */
#define ALPHA_BLIT32_888_50(to, from, length, bpp, alpha) \
    do { \
        Uint32 *half_src = (Uint32 *)(from); \
        Uint32 *half_dst = (Uint32 *)(to); \
        int half_todo = (int)(length); \
        while(half_todo-- > 0) { \
            const Uint32 half_s = *half_src++; \
            const Uint32 half_d = *half_dst; \
            const Uint32 half_sum = (half_s & 0x00fefefe) + (half_d & 0x00fefefe); \
            *half_dst++ = (half_sum >> 1) + (half_s & half_d & 0x00010101); \
        } \
    } while(0)
569 
/*
 * For 16bpp, we can actually blend two pixels in parallel, if we take
 * care to shift before we add, not after.
 */

/*
 * helper: blend a single 16 bit pixel at 50% and advance both pointers.
 * `mask` selects the bits kept when adding (every bit except the lowest
 * bit of each colour field).
 */
#define BLEND16_50(dst, src, mask) \
    do { \
        const Uint32 b16_keep = (Uint32)(mask); \
        const Uint32 b16_s = *(src)++; \
        const Uint32 b16_d = *(dst); \
        *(dst)++ = (Uint16)((((b16_s & b16_keep) + (b16_d & b16_keep)) >> 1) + \
                            (b16_s & b16_d & (~b16_keep & 0xffff))); \
    } while(0)

/*
 * basic 16bpp blender. mask is the pixels to keep when adding.
 *
 * When source and destination differ in their position within a 4-byte
 * word, pixels are blended one at a time.  Otherwise at most one leading
 * pixel brings both pointers to a 4-byte boundary, pairs are blended as
 * 32-bit words, and a trailing odd pixel is finished singly.
 *
 * The doubled mask is built in Uint32 so that shifting a constant such as
 * 0xf7de left by 16 does not overflow a signed int, and a zero `length`
 * never touches memory.  `bpp` and `alpha` are unused.
 */
#define ALPHA_BLIT16_50(to, from, length, bpp, alpha, mask) \
    do { \
        unsigned b16_left = (length); \
        Uint16 *b16_src = (Uint16 *)(from); \
        Uint16 *b16_dst = (Uint16 *)(to); \
        const Uint32 b16_pair = (Uint32)(mask) | ((Uint32)(mask) << 16); \
        if(((uintptr_t)b16_src ^ (uintptr_t)b16_dst) & 3) { \
            /* source and destination not in phase, blit one by one */ \
            for(; b16_left > 0; b16_left--) \
                BLEND16_50(b16_dst, b16_src, mask); \
        } else { \
            if(b16_left > 0 && ((uintptr_t)b16_src & 3)) { \
                /* first odd pixel */ \
                BLEND16_50(b16_dst, b16_src, mask); \
                b16_left--; \
            } \
            for(; b16_left > 1; b16_left -= 2) { \
                const Uint32 b16_s2 = *(Uint32 *)b16_src; \
                const Uint32 b16_d2 = *(Uint32 *)b16_dst; \
                *(Uint32 *)b16_dst = ((b16_s2 & b16_pair) >> 1) \
                                   + ((b16_d2 & b16_pair) >> 1) \
                                   + (b16_s2 & b16_d2 & ~b16_pair); \
                b16_src += 2; \
                b16_dst += 2; \
            } \
            if(b16_left) \
                BLEND16_50(b16_dst, b16_src, mask); /* last odd pixel */ \
        } \
    } while(0)

/* 0xf7de: all 16 bits except bits 0, 5 and 11 (lowest bit of each 565 field) */
#define ALPHA_BLIT16_565_50(to, from, length, bpp, alpha) \
    ALPHA_BLIT16_50(to, from, length, bpp, alpha, 0xf7de)

/* 0xfbde: all 16 bits except bits 0, 5 and 10 (lowest bit of each 555 field) */
#define ALPHA_BLIT16_555_50(to, from, length, bpp, alpha) \
    ALPHA_BLIT16_50(to, from, length, bpp, alpha, 0xfbde)
619 
620 #ifdef MMX_ASMBLIT
621 
/*
 * Expand `blitter(bpp, Type, do_blit)` exactly once, with the blit macro
 * matching the pixel format `fmt` and the per-surface `alpha`.
 * (Variant used when MMX_ASMBLIT is defined.)
 *
 *   alpha == 255 : OPAQUE_BLIT for every depth.
 *   1 byte/pixel : nothing is expanded (no 8bpp alpha blitting).
 *   2 bytes/pixel: 565 or 555 layouts get their dedicated blitters (the
 *                  _50 form when alpha == 128, else the MMX form when
 *                  SDL_HasMMX() is true); everything else ALPHA_BLIT_ANY.
 *   3 bytes/pixel: ALPHA_BLIT_ANY.
 *   4 bytes/pixel: 8-8-8 layouts in the low 24 bits get the 888 blitters
 *                  (_50 for alpha == 128, MMX when available); everything
 *                  else ALPHA_BLIT_ANY.
 * `Type` is Uint8 for 1-3 byte pixels and Uint16 for 4-byte pixels.
 */
#define CHOOSE_BLIT(blitter, alpha, fmt) \
    do { \
        if(alpha != 255) { \
            switch(fmt->BytesPerPixel) { \
            case 1: \
                /* No 8bpp alpha blitting */ \
                break; \
 \
            case 2: \
                if((fmt->Rmask | fmt->Gmask | fmt->Bmask) == 0xffff \
                   && (fmt->Gmask == 0x07e0 \
                       || fmt->Rmask == 0x07e0 \
                       || fmt->Bmask == 0x07e0)) { \
                    if(alpha == 128) { \
                        blitter(2, Uint8, ALPHA_BLIT16_565_50); \
                    } else if(SDL_HasMMX()) { \
                        blitter(2, Uint8, ALPHA_BLIT16_565MMX); \
                    } else { \
                        blitter(2, Uint8, ALPHA_BLIT16_565); \
                    } \
                } else if((fmt->Rmask | fmt->Gmask | fmt->Bmask) == 0x7fff \
                          && (fmt->Gmask == 0x03e0 \
                              || fmt->Rmask == 0x03e0 \
                              || fmt->Bmask == 0x03e0)) { \
                    if(alpha == 128) { \
                        blitter(2, Uint8, ALPHA_BLIT16_555_50); \
                    } else if(SDL_HasMMX()) { \
                        blitter(2, Uint8, ALPHA_BLIT16_555MMX); \
                    } else { \
                        blitter(2, Uint8, ALPHA_BLIT16_555); \
                    } \
                } else { \
                    blitter(2, Uint8, ALPHA_BLIT_ANY); \
                } \
                break; \
 \
            case 3: \
                blitter(3, Uint8, ALPHA_BLIT_ANY); \
                break; \
 \
            case 4: \
                if((fmt->Rmask | fmt->Gmask | fmt->Bmask) == 0x00ffffff \
                   && (fmt->Gmask == 0xff00 || fmt->Rmask == 0xff00 \
                       || fmt->Bmask == 0xff00)) { \
                    if(alpha == 128) { \
                        if(SDL_HasMMX()) { \
                            blitter(4, Uint16, ALPHA_BLIT32_888_50MMX); \
                        } else { \
                            blitter(4, Uint16, ALPHA_BLIT32_888_50); \
                        } \
                    } else { \
                        if(SDL_HasMMX()) { \
                            blitter(4, Uint16, ALPHA_BLIT32_888MMX); \
                        } else { \
                            blitter(4, Uint16, ALPHA_BLIT32_888); \
                        } \
                    } \
                } else { \
                    blitter(4, Uint16, ALPHA_BLIT_ANY); \
                } \
                break; \
            } \
        } else { \
            switch(fmt->BytesPerPixel) { \
            case 1: blitter(1, Uint8, OPAQUE_BLIT); break; \
            case 2: blitter(2, Uint8, OPAQUE_BLIT); break; \
            case 3: blitter(3, Uint8, OPAQUE_BLIT); break; \
            case 4: blitter(4, Uint16, OPAQUE_BLIT); break; \
            } \
        } \
    } while(0)
705 
706 #else
707 
/*
 * Expand `blitter(bpp, Type, do_blit)` exactly once, with the blit macro
 * matching the pixel format `fmt` and the per-surface `alpha`.
 * (Variant used when MMX_ASMBLIT is not defined.)
 *
 *   alpha == 255 : OPAQUE_BLIT for every depth.
 *   1 byte/pixel : nothing is expanded (no 8bpp alpha blitting).
 *   2 bytes/pixel: 565 or 555 layouts get their dedicated blitters (the
 *                  _50 form when alpha == 128); everything else
 *                  ALPHA_BLIT_ANY.
 *   3 bytes/pixel: ALPHA_BLIT_ANY.
 *   4 bytes/pixel: 8-8-8 layouts in the low 24 bits get the 888 blitters
 *                  (_50 for alpha == 128); everything else ALPHA_BLIT_ANY.
 * `Type` is Uint8 for 1-3 byte pixels and Uint16 for 4-byte pixels.
 */
#define CHOOSE_BLIT(blitter, alpha, fmt) \
    do { \
        if(alpha != 255) { \
            switch(fmt->BytesPerPixel) { \
            case 1: \
                /* No 8bpp alpha blitting */ \
                break; \
 \
            case 2: \
                if((fmt->Rmask | fmt->Gmask | fmt->Bmask) == 0xffff \
                   && (fmt->Gmask == 0x07e0 \
                       || fmt->Rmask == 0x07e0 \
                       || fmt->Bmask == 0x07e0)) { \
                    if(alpha == 128) { \
                        blitter(2, Uint8, ALPHA_BLIT16_565_50); \
                    } else { \
                        blitter(2, Uint8, ALPHA_BLIT16_565); \
                    } \
                } else if((fmt->Rmask | fmt->Gmask | fmt->Bmask) == 0x7fff \
                          && (fmt->Gmask == 0x03e0 \
                              || fmt->Rmask == 0x03e0 \
                              || fmt->Bmask == 0x03e0)) { \
                    if(alpha == 128) { \
                        blitter(2, Uint8, ALPHA_BLIT16_555_50); \
                    } else { \
                        blitter(2, Uint8, ALPHA_BLIT16_555); \
                    } \
                } else { \
                    blitter(2, Uint8, ALPHA_BLIT_ANY); \
                } \
                break; \
 \
            case 3: \
                blitter(3, Uint8, ALPHA_BLIT_ANY); \
                break; \
 \
            case 4: \
                if((fmt->Rmask | fmt->Gmask | fmt->Bmask) == 0x00ffffff \
                   && (fmt->Gmask == 0xff00 || fmt->Rmask == 0xff00 \
                       || fmt->Bmask == 0xff00)) { \
                    if(alpha == 128) { \
                        blitter(4, Uint16, ALPHA_BLIT32_888_50); \
                    } else { \
                        blitter(4, Uint16, ALPHA_BLIT32_888); \
                    } \
                } else { \
                    blitter(4, Uint16, ALPHA_BLIT_ANY); \
                } \
                break; \
            } \
        } else { \
            switch(fmt->BytesPerPixel) { \
            case 1: blitter(1, Uint8, OPAQUE_BLIT); break; \
            case 2: blitter(2, Uint8, OPAQUE_BLIT); break; \
            case 3: blitter(3, Uint8, OPAQUE_BLIT); break; \
            case 4: blitter(4, Uint16, OPAQUE_BLIT); break; \
            } \
        } \
    } while(0)
775 
776 #endif
777 
778 /*
779  * This takes care of the case when the surface is clipped on the left and/or
780  * right. Top clipping has already been taken care of.
781  */
RLEClipBlit(int w,Uint8 * srcbuf,SDL_Surface * dst,Uint8 * dstbuf,SDL_Rect * srcrect,unsigned alpha)782 static void RLEClipBlit(int w, Uint8 *srcbuf, SDL_Surface *dst,
783 			Uint8 *dstbuf, SDL_Rect *srcrect, unsigned alpha)
784 {
785     SDL_PixelFormat *fmt = dst->format;
786 
787 #define RLECLIPBLIT(bpp, Type, do_blit)					   \
788     do {								   \
789 	int linecount = srcrect->h;					   \
790 	int ofs = 0;							   \
791 	int left = srcrect->x;						   \
792 	int right = left + srcrect->w;					   \
793 	dstbuf -= left * bpp;						   \
794 	for(;;) {							   \
795 	    int run;							   \
796 	    ofs += *(Type *)srcbuf;					   \
797 	    run = ((Type *)srcbuf)[1];					   \
798 	    srcbuf += 2 * sizeof(Type);					   \
799 	    if(run) {							   \
800 		/* clip to left and right borders */			   \
801 		if(ofs < right) {					   \
802 		    int start = 0;					   \
803 		    int len = run;					   \
804 		    int startcol;					   \
805 		    if(left - ofs > 0) {				   \
806 			start = left - ofs;				   \
807 			len -= start;					   \
808 			if(len <= 0)					   \
809 			    goto nocopy ## bpp ## do_blit;		   \
810 		    }							   \
811 		    startcol = ofs + start;				   \
812 		    if(len > right - startcol)				   \
813 			len = right - startcol;				   \
814 		    do_blit(dstbuf + startcol * bpp, srcbuf + start * bpp, \
815 			    len, bpp, alpha);				   \
816 		}							   \
817 	    nocopy ## bpp ## do_blit:					   \
818 		srcbuf += run * bpp;					   \
819 		ofs += run;						   \
820 	    } else if(!ofs)						   \
821 		break;							   \
822 	    if(ofs == w) {						   \
823 		ofs = 0;						   \
824 		dstbuf += dst->pitch;					   \
825 		if(!--linecount)					   \
826 		    break;						   \
827 	    }								   \
828 	}								   \
829     } while(0)
830 
831     CHOOSE_BLIT(RLECLIPBLIT, alpha, fmt);
832 
833 #undef RLECLIPBLIT
834 
835 }
836 
837 
838 /* blit a colorkeyed RLE surface */
SDL_RLEBlit(SDL_Surface * src,SDL_Rect * srcrect,SDL_Surface * dst,SDL_Rect * dstrect)839 int SDL_RLEBlit(SDL_Surface *src, SDL_Rect *srcrect,
840 		SDL_Surface *dst, SDL_Rect *dstrect)
841 {
842 	Uint8 *dstbuf;
843 	Uint8 *srcbuf;
844 	int x, y;
845 	int w = src->w;
846 	unsigned alpha;
847 
848 	/* Lock the destination if necessary */
849 	if ( SDL_MUSTLOCK(dst) ) {
850 		if ( SDL_LockSurface(dst) < 0 ) {
851 			return(-1);
852 		}
853 	}
854 
855 	/* Set up the source and destination pointers */
856 	x = dstrect->x;
857 	y = dstrect->y;
858 	dstbuf = (Uint8 *)dst->pixels
859 	         + y * dst->pitch + x * src->format->BytesPerPixel;
860 	srcbuf = (Uint8 *)src->map->sw_data->aux_data;
861 
862 	{
863 	    /* skip lines at the top if neccessary */
864 	    int vskip = srcrect->y;
865 	    int ofs = 0;
866 	    if(vskip) {
867 
868 #define RLESKIP(bpp, Type)			\
869 		for(;;) {			\
870 		    int run;			\
871 		    ofs += *(Type *)srcbuf;	\
872 		    run = ((Type *)srcbuf)[1];	\
873 		    srcbuf += sizeof(Type) * 2;	\
874 		    if(run) {			\
875 			srcbuf += run * bpp;	\
876 			ofs += run;		\
877 		    } else if(!ofs)		\
878 			goto done;		\
879 		    if(ofs == w) {		\
880 			ofs = 0;		\
881 			if(!--vskip)		\
882 			    break;		\
883 		    }				\
884 		}
885 
886 		switch(src->format->BytesPerPixel) {
887 		case 1: RLESKIP(1, Uint8); break;
888 		case 2: RLESKIP(2, Uint8); break;
889 		case 3: RLESKIP(3, Uint8); break;
890 		case 4: RLESKIP(4, Uint16); break;
891 		}
892 
893 #undef RLESKIP
894 
895 	    }
896 	}
897 
898 	alpha = (src->flags & SDL_SRCALPHA) == SDL_SRCALPHA
899 	        ? src->format->alpha : 255;
900 	/* if left or right edge clipping needed, call clip blit */
901 	if ( srcrect->x || srcrect->w != src->w ) {
902 	    RLEClipBlit(w, srcbuf, dst, dstbuf, srcrect, alpha);
903 	} else {
904 	    SDL_PixelFormat *fmt = src->format;
905 
906 #define RLEBLIT(bpp, Type, do_blit)					      \
907 	    do {							      \
908 		int linecount = srcrect->h;				      \
909 		int ofs = 0;						      \
910 		for(;;) {						      \
911 		    unsigned run;					      \
912 		    ofs += *(Type *)srcbuf;				      \
913 		    run = ((Type *)srcbuf)[1];				      \
914 		    srcbuf += 2 * sizeof(Type);				      \
915 		    if(run) {						      \
916 			do_blit(dstbuf + ofs * bpp, srcbuf, run, bpp, alpha); \
917 			srcbuf += run * bpp;				      \
918 			ofs += run;					      \
919 		    } else if(!ofs)					      \
920 			break;						      \
921 		    if(ofs == w) {					      \
922 			ofs = 0;					      \
923 			dstbuf += dst->pitch;				      \
924 			if(!--linecount)				      \
925 			    break;					      \
926 		    }							      \
927 		}							      \
928 	    } while(0)
929 
930 	    CHOOSE_BLIT(RLEBLIT, alpha, fmt);
931 
932 #undef RLEBLIT
933 	}
934 
935 done:
936 	/* Unlock the destination if necessary */
937 	if ( SDL_MUSTLOCK(dst) ) {
938 		SDL_UnlockSurface(dst);
939 	}
940 	return(0);
941 }
942 
943 #undef OPAQUE_BLIT
944 
945 /*
946  * Per-pixel blitting macros for translucent pixels:
947  * These use the same techniques as the per-surface blitting macros
948  */
949 
/*
 * Blend one 32bpp source pixel onto a 32bpp destination pixel.
 * The source alpha is taken from the top 8 bits of the source pixel.
 * Red and blue (mask 0xff00ff) are interpolated together in a single
 * multiply, green (mask 0xff00) on its own. Only the low 24 bits of the
 * result are written back to dst.
 */
#define BLIT_TRANSL_888(src, dst) \
    do { \
        Uint32 fg = src; \
        Uint32 bg = dst; \
        unsigned a = fg >> 24; \
        Uint32 fg_rb = fg & 0xff00ff; \
        Uint32 bg_rb = bg & 0xff00ff; \
        Uint32 fg_g = fg & 0xff00; \
        Uint32 bg_g = bg & 0xff00; \
        bg_rb = (bg_rb + ((fg_rb - bg_rb) * a >> 8)) & 0xff00ff; \
        bg_g = (bg_g + ((fg_g - bg_g) * a >> 8)) & 0xff00; \
        dst = bg_rb | bg_g; \
    } while(0)
967 
/*
 * Blend one translucent source pixel onto a 16bpp 565 destination pixel.
 * The source is a 32-bit word in "spread" layout (mask 0x07e0f81f: the
 * middle colour field in the upper half-word, the two outer fields in the
 * lower one) with a 5-bit alpha value in bits 5-9. The destination is
 * spread the same way, so all three components are interpolated with one
 * multiply, and the result is folded back into 16 bits.
 */
#define BLIT_TRANSL_565(src, dst) \
    do { \
        Uint32 fg = src; \
        Uint32 bg = dst; \
        unsigned a = (fg >> 5) & 0x1f; \
        fg &= 0x07e0f81f; \
        bg = ((bg << 16) | bg) & 0x07e0f81f; \
        bg = (bg + ((fg - bg) * a >> 5)) & 0x07e0f81f; \
        dst = (Uint16)((bg >> 16) | bg); \
    } while(0)
983 
/*
 * Blend one translucent source pixel onto a 16bpp 555 destination pixel.
 * Same scheme as the 565 variant, using the spread mask 0x03e07c1f and a
 * 5-bit alpha value read from bits 5-9 of the source word.
 */
#define BLIT_TRANSL_555(src, dst) \
    do { \
        Uint32 fg = src; \
        Uint32 bg = dst; \
        unsigned a = (fg >> 5) & 0x1f; \
        fg &= 0x03e07c1f; \
        bg = ((bg << 16) | bg) & 0x03e07c1f; \
        bg = (bg + ((fg - bg) * a >> 5)) & 0x03e07c1f; \
        dst = (Uint16)((bg >> 16) | bg); \
    } while(0)
995 
996 /* used to save the destination format in the encoding. Designed to be
997    macro-compatible with SDL_PixelFormat but without the unneeded fields */
998 typedef struct {
999 	Uint8  BytesPerPixel;
1000 	Uint8  Rloss;
1001 	Uint8  Gloss;
1002 	Uint8  Bloss;
1003 	Uint8  Rshift;
1004 	Uint8  Gshift;
1005 	Uint8  Bshift;
1006 	Uint8  Ashift;
1007 	Uint32 Rmask;
1008 	Uint32 Gmask;
1009 	Uint32 Bmask;
1010 	Uint32 Amask;
1011 } RLEDestFormat;
1012 
1013 /* blit a pixel-alpha RLE surface clipped at the right and/or left edges */
RLEAlphaClipBlit(int w,Uint8 * srcbuf,SDL_Surface * dst,Uint8 * dstbuf,SDL_Rect * srcrect)1014 static void RLEAlphaClipBlit(int w, Uint8 *srcbuf, SDL_Surface *dst,
1015 			     Uint8 *dstbuf, SDL_Rect *srcrect)
1016 {
1017     SDL_PixelFormat *df = dst->format;
1018     /*
1019      * clipped blitter: Ptype is the destination pixel type,
1020      * Ctype the translucent count type, and do_blend the macro
1021      * to blend one pixel.
1022      */
1023 #define RLEALPHACLIPBLIT(Ptype, Ctype, do_blend)			  \
1024     do {								  \
1025 	int linecount = srcrect->h;					  \
1026 	int left = srcrect->x;						  \
1027 	int right = left + srcrect->w;					  \
1028 	dstbuf -= left * sizeof(Ptype);					  \
1029 	do {								  \
1030 	    int ofs = 0;						  \
1031 	    /* blit opaque pixels on one line */			  \
1032 	    do {							  \
1033 		unsigned run;						  \
1034 		ofs += ((Ctype *)srcbuf)[0];				  \
1035 		run = ((Ctype *)srcbuf)[1];				  \
1036 		srcbuf += 2 * sizeof(Ctype);				  \
1037 		if(run) {						  \
1038 		    /* clip to left and right borders */		  \
1039 		    int cofs = ofs;					  \
1040 		    int crun = run;					  \
1041 		    if(left - cofs > 0) {				  \
1042 			crun -= left - cofs;				  \
1043 			cofs = left;					  \
1044 		    }							  \
1045 		    if(crun > right - cofs)				  \
1046 			crun = right - cofs;				  \
1047 		    if(crun > 0)					  \
1048 			PIXEL_COPY(dstbuf + cofs * sizeof(Ptype),	  \
1049 				   srcbuf + (cofs - ofs) * sizeof(Ptype), \
1050 				   (unsigned)crun, sizeof(Ptype));	  \
1051 		    srcbuf += run * sizeof(Ptype);			  \
1052 		    ofs += run;						  \
1053 		} else if(!ofs)						  \
1054 		    return;						  \
1055 	    } while(ofs < w);						  \
1056 	    /* skip padding if necessary */				  \
1057 	    if(sizeof(Ptype) == 2)					  \
1058 		srcbuf += (uintptr_t)srcbuf & 2;			  \
1059 	    /* blit translucent pixels on the same line */		  \
1060 	    ofs = 0;							  \
1061 	    do {							  \
1062 		unsigned run;						  \
1063 		ofs += ((Uint16 *)srcbuf)[0];				  \
1064 		run = ((Uint16 *)srcbuf)[1];				  \
1065 		srcbuf += 4;						  \
1066 		if(run) {						  \
1067 		    /* clip to left and right borders */		  \
1068 		    int cofs = ofs;					  \
1069 		    int crun = run;					  \
1070 		    if(left - cofs > 0) {				  \
1071 			crun -= left - cofs;				  \
1072 			cofs = left;					  \
1073 		    }							  \
1074 		    if(crun > right - cofs)				  \
1075 			crun = right - cofs;				  \
1076 		    if(crun > 0) {					  \
1077 			Ptype *dst = (Ptype *)dstbuf + cofs;		  \
1078 			Uint32 *src = (Uint32 *)srcbuf + (cofs - ofs);	  \
1079 			int i;						  \
1080 			for(i = 0; i < crun; i++)			  \
1081 			    do_blend(src[i], dst[i]);			  \
1082 		    }							  \
1083 		    srcbuf += run * 4;					  \
1084 		    ofs += run;						  \
1085 		}							  \
1086 	    } while(ofs < w);						  \
1087 	    dstbuf += dst->pitch;					  \
1088 	} while(--linecount);						  \
1089     } while(0)
1090 
1091     switch(df->BytesPerPixel) {
1092     case 2:
1093 	if(df->Gmask == 0x07e0 || df->Rmask == 0x07e0
1094 	   || df->Bmask == 0x07e0)
1095 	    RLEALPHACLIPBLIT(Uint16, Uint8, BLIT_TRANSL_565);
1096 	else
1097 	    RLEALPHACLIPBLIT(Uint16, Uint8, BLIT_TRANSL_555);
1098 	break;
1099     case 4:
1100 	RLEALPHACLIPBLIT(Uint32, Uint16, BLIT_TRANSL_888);
1101 	break;
1102     }
1103 }
1104 
1105 /* blit a pixel-alpha RLE surface */
SDL_RLEAlphaBlit(SDL_Surface * src,SDL_Rect * srcrect,SDL_Surface * dst,SDL_Rect * dstrect)1106 int SDL_RLEAlphaBlit(SDL_Surface *src, SDL_Rect *srcrect,
1107 		     SDL_Surface *dst, SDL_Rect *dstrect)
1108 {
1109     int x, y;
1110     int w = src->w;
1111     Uint8 *srcbuf, *dstbuf;
1112     SDL_PixelFormat *df = dst->format;
1113 
1114     /* Lock the destination if necessary */
1115     if ( SDL_MUSTLOCK(dst) ) {
1116 	if ( SDL_LockSurface(dst) < 0 ) {
1117 	    return -1;
1118 	}
1119     }
1120 
1121     x = dstrect->x;
1122     y = dstrect->y;
1123     dstbuf = (Uint8 *)dst->pixels
1124 	     + y * dst->pitch + x * df->BytesPerPixel;
1125     srcbuf = (Uint8 *)src->map->sw_data->aux_data + sizeof(RLEDestFormat);
1126 
1127     {
1128 	/* skip lines at the top if necessary */
1129 	int vskip = srcrect->y;
1130 	if(vskip) {
1131 	    int ofs;
1132 	    if(df->BytesPerPixel == 2) {
1133 		/* the 16/32 interleaved format */
1134 		do {
1135 		    /* skip opaque line */
1136 		    ofs = 0;
1137 		    do {
1138 			int run;
1139 			ofs += srcbuf[0];
1140 			run = srcbuf[1];
1141 			srcbuf += 2;
1142 			if(run) {
1143 			    srcbuf += 2 * run;
1144 			    ofs += run;
1145 			} else if(!ofs)
1146 			    goto done;
1147 		    } while(ofs < w);
1148 
1149 		    /* skip padding */
1150 		    srcbuf += (uintptr_t)srcbuf & 2;
1151 
1152 		    /* skip translucent line */
1153 		    ofs = 0;
1154 		    do {
1155 			int run;
1156 			ofs += ((Uint16 *)srcbuf)[0];
1157 			run = ((Uint16 *)srcbuf)[1];
1158 			srcbuf += 4 * (run + 1);
1159 			ofs += run;
1160 		    } while(ofs < w);
1161 		} while(--vskip);
1162 	    } else {
1163 		/* the 32/32 interleaved format */
1164 		vskip <<= 1;	/* opaque and translucent have same format */
1165 		do {
1166 		    ofs = 0;
1167 		    do {
1168 			int run;
1169 			ofs += ((Uint16 *)srcbuf)[0];
1170 			run = ((Uint16 *)srcbuf)[1];
1171 			srcbuf += 4;
1172 			if(run) {
1173 			    srcbuf += 4 * run;
1174 			    ofs += run;
1175 			} else if(!ofs)
1176 			    goto done;
1177 		    } while(ofs < w);
1178 		} while(--vskip);
1179 	    }
1180 	}
1181     }
1182 
1183     /* if left or right edge clipping needed, call clip blit */
1184     if(srcrect->x || srcrect->w != src->w) {
1185 	RLEAlphaClipBlit(w, srcbuf, dst, dstbuf, srcrect);
1186     } else {
1187 
1188 	/*
1189 	 * non-clipped blitter. Ptype is the destination pixel type,
1190 	 * Ctype the translucent count type, and do_blend the
1191 	 * macro to blend one pixel.
1192 	 */
1193 #define RLEALPHABLIT(Ptype, Ctype, do_blend)				 \
1194 	do {								 \
1195 	    int linecount = srcrect->h;					 \
1196 	    do {							 \
1197 		int ofs = 0;						 \
1198 		/* blit opaque pixels on one line */			 \
1199 		do {							 \
1200 		    unsigned run;					 \
1201 		    ofs += ((Ctype *)srcbuf)[0];			 \
1202 		    run = ((Ctype *)srcbuf)[1];				 \
1203 		    srcbuf += 2 * sizeof(Ctype);			 \
1204 		    if(run) {						 \
1205 			PIXEL_COPY(dstbuf + ofs * sizeof(Ptype), srcbuf, \
1206 				   run, sizeof(Ptype));			 \
1207 			srcbuf += run * sizeof(Ptype);			 \
1208 			ofs += run;					 \
1209 		    } else if(!ofs)					 \
1210 			goto done;					 \
1211 		} while(ofs < w);					 \
1212 		/* skip padding if necessary */				 \
1213 		if(sizeof(Ptype) == 2)					 \
1214 		    srcbuf += (uintptr_t)srcbuf & 2;		 	 \
1215 		/* blit translucent pixels on the same line */		 \
1216 		ofs = 0;						 \
1217 		do {							 \
1218 		    unsigned run;					 \
1219 		    ofs += ((Uint16 *)srcbuf)[0];			 \
1220 		    run = ((Uint16 *)srcbuf)[1];			 \
1221 		    srcbuf += 4;					 \
1222 		    if(run) {						 \
1223 			Ptype *dst = (Ptype *)dstbuf + ofs;		 \
1224 			unsigned i;					 \
1225 			for(i = 0; i < run; i++) {			 \
1226 			    Uint32 src = *(Uint32 *)srcbuf;		 \
1227 			    do_blend(src, *dst);			 \
1228 			    srcbuf += 4;				 \
1229 			    dst++;					 \
1230 			}						 \
1231 			ofs += run;					 \
1232 		    }							 \
1233 		} while(ofs < w);					 \
1234 		dstbuf += dst->pitch;					 \
1235 	    } while(--linecount);					 \
1236 	} while(0)
1237 
1238 	switch(df->BytesPerPixel) {
1239 	case 2:
1240 	    if(df->Gmask == 0x07e0 || df->Rmask == 0x07e0
1241 	       || df->Bmask == 0x07e0)
1242 		RLEALPHABLIT(Uint16, Uint8, BLIT_TRANSL_565);
1243 	    else
1244 		RLEALPHABLIT(Uint16, Uint8, BLIT_TRANSL_555);
1245 	    break;
1246 	case 4:
1247 	    RLEALPHABLIT(Uint32, Uint16, BLIT_TRANSL_888);
1248 	    break;
1249 	}
1250     }
1251 
1252  done:
1253     /* Unlock the destination if necessary */
1254     if ( SDL_MUSTLOCK(dst) ) {
1255 	SDL_UnlockSurface(dst);
1256     }
1257     return 0;
1258 }
1259 
1260 /*
1261  * Auxiliary functions:
1262  * The encoding functions take 32bpp rgb + a, and
1263  * return the number of bytes copied to the destination.
1264  * The decoding functions copy to 32bpp rgb + a, and
1265  * return the number of bytes copied from the source.
1266  * These are only used in the encoder and un-RLE code and are therefore not
1267  * highly optimised.
1268  */
1269 
1270 /* encode 32bpp rgb + a into 16bpp rgb, losing alpha */
copy_opaque_16(void * dst,Uint32 * src,int n,SDL_PixelFormat * sfmt,SDL_PixelFormat * dfmt)1271 static int copy_opaque_16(void *dst, Uint32 *src, int n,
1272 			  SDL_PixelFormat *sfmt, SDL_PixelFormat *dfmt)
1273 {
1274     int i;
1275     Uint16 *d = dst;
1276     for(i = 0; i < n; i++) {
1277 	unsigned r, g, b;
1278 	RGB_FROM_PIXEL(*src, sfmt, r, g, b);
1279 	PIXEL_FROM_RGB(*d, dfmt, r, g, b);
1280 	src++;
1281 	d++;
1282     }
1283     return n * 2;
1284 }
1285 
1286 /* decode opaque pixels from 16bpp to 32bpp rgb + a */
uncopy_opaque_16(Uint32 * dst,void * src,int n,RLEDestFormat * sfmt,SDL_PixelFormat * dfmt)1287 static int uncopy_opaque_16(Uint32 *dst, void *src, int n,
1288 			    RLEDestFormat *sfmt, SDL_PixelFormat *dfmt)
1289 {
1290     int i;
1291     Uint16 *s = src;
1292     unsigned alpha = dfmt->Amask ? 255 : 0;
1293     for(i = 0; i < n; i++) {
1294 	unsigned r, g, b;
1295 	RGB_FROM_PIXEL(*s, sfmt, r, g, b);
1296 	PIXEL_FROM_RGBA(*dst, dfmt, r, g, b, alpha);
1297 	s++;
1298 	dst++;
1299     }
1300     return n * 2;
1301 }
1302 
1303 
1304 
1305 /* encode 32bpp rgb + a into 32bpp G0RAB format for blitting into 565 */
copy_transl_565(void * dst,Uint32 * src,int n,SDL_PixelFormat * sfmt,SDL_PixelFormat * dfmt)1306 static int copy_transl_565(void *dst, Uint32 *src, int n,
1307 			   SDL_PixelFormat *sfmt, SDL_PixelFormat *dfmt)
1308 {
1309     int i;
1310     Uint32 *d = dst;
1311     for(i = 0; i < n; i++) {
1312 	unsigned r, g, b, a;
1313 	Uint16 pix;
1314 	RGBA_FROM_8888(*src, sfmt, r, g, b, a);
1315 	PIXEL_FROM_RGB(pix, dfmt, r, g, b);
1316 	*d = ((pix & 0x7e0) << 16) | (pix & 0xf81f) | ((a << 2) & 0x7e0);
1317 	src++;
1318 	d++;
1319     }
1320     return n * 4;
1321 }
1322 
1323 /* encode 32bpp rgb + a into 32bpp G0RAB format for blitting into 555 */
copy_transl_555(void * dst,Uint32 * src,int n,SDL_PixelFormat * sfmt,SDL_PixelFormat * dfmt)1324 static int copy_transl_555(void *dst, Uint32 *src, int n,
1325 			   SDL_PixelFormat *sfmt, SDL_PixelFormat *dfmt)
1326 {
1327     int i;
1328     Uint32 *d = dst;
1329     for(i = 0; i < n; i++) {
1330 	unsigned r, g, b, a;
1331 	Uint16 pix;
1332 	RGBA_FROM_8888(*src, sfmt, r, g, b, a);
1333 	PIXEL_FROM_RGB(pix, dfmt, r, g, b);
1334 	*d = ((pix & 0x3e0) << 16) | (pix & 0xfc1f) | ((a << 2) & 0x3e0);
1335 	src++;
1336 	d++;
1337     }
1338     return n * 4;
1339 }
1340 
1341 /* decode translucent pixels from 32bpp GORAB to 32bpp rgb + a */
uncopy_transl_16(Uint32 * dst,void * src,int n,RLEDestFormat * sfmt,SDL_PixelFormat * dfmt)1342 static int uncopy_transl_16(Uint32 *dst, void *src, int n,
1343 			    RLEDestFormat *sfmt, SDL_PixelFormat *dfmt)
1344 {
1345     int i;
1346     Uint32 *s = src;
1347     for(i = 0; i < n; i++) {
1348 	unsigned r, g, b, a;
1349 	Uint32 pix = *s++;
1350 	a = (pix & 0x3e0) >> 2;
1351 	pix = (pix & ~0x3e0) | pix >> 16;
1352 	RGB_FROM_PIXEL(pix, sfmt, r, g, b);
1353 	PIXEL_FROM_RGBA(*dst, dfmt, r, g, b, a);
1354 	dst++;
1355     }
1356     return n * 4;
1357 }
1358 
1359 /* encode 32bpp rgba into 32bpp rgba, keeping alpha (dual purpose) */
copy_32(void * dst,Uint32 * src,int n,SDL_PixelFormat * sfmt,SDL_PixelFormat * dfmt)1360 static int copy_32(void *dst, Uint32 *src, int n,
1361 		   SDL_PixelFormat *sfmt, SDL_PixelFormat *dfmt)
1362 {
1363     int i;
1364     Uint32 *d = dst;
1365     for(i = 0; i < n; i++) {
1366 	unsigned r, g, b, a;
1367 	Uint32 pixel;
1368 	RGBA_FROM_8888(*src, sfmt, r, g, b, a);
1369 	PIXEL_FROM_RGB(pixel, dfmt, r, g, b);
1370 	*d++ = pixel | a << 24;
1371 	src++;
1372     }
1373     return n * 4;
1374 }
1375 
1376 /* decode 32bpp rgba into 32bpp rgba, keeping alpha (dual purpose) */
uncopy_32(Uint32 * dst,void * src,int n,RLEDestFormat * sfmt,SDL_PixelFormat * dfmt)1377 static int uncopy_32(Uint32 *dst, void *src, int n,
1378 		     RLEDestFormat *sfmt, SDL_PixelFormat *dfmt)
1379 {
1380     int i;
1381     Uint32 *s = src;
1382     for(i = 0; i < n; i++) {
1383 	unsigned r, g, b, a;
1384 	Uint32 pixel = *s++;
1385 	RGB_FROM_PIXEL(pixel, sfmt, r, g, b);
1386 	a = pixel >> 24;
1387 	PIXEL_FROM_RGBA(*dst, dfmt, r, g, b, a);
1388 	dst++;
1389     }
1390     return n * 4;
1391 }
1392 
/*
 * Alpha classification of a pixel in format fmt (a pointer to a structure
 * with Amask and Ashift members):
 *   ISOPAQUE  - alpha is exactly 255
 *   ISTRANSL  - alpha is in the range 1..254 (neither fully transparent
 *               nor fully opaque); subtracting 1 as unsigned makes an
 *               alpha of 0 wrap around and fail the single comparison
 * The fmt argument is parenthesized so that any pointer expression
 * (e.g. &format) can be passed safely.
 */
#define ISOPAQUE(pixel, fmt) \
    ((((pixel) & (fmt)->Amask) >> (fmt)->Ashift) == 255)

#define ISTRANSL(pixel, fmt) \
    ((unsigned)((((pixel) & (fmt)->Amask) >> (fmt)->Ashift) - 1U) < 254U)
1397 
1398 /* convert surface to be quickly alpha-blittable onto dest, if possible */
RLEAlphaSurface(SDL_Surface * surface)1399 static int RLEAlphaSurface(SDL_Surface *surface)
1400 {
1401     SDL_Surface *dest;
1402     SDL_PixelFormat *df;
1403     int maxsize = 0;
1404     int max_opaque_run;
1405     int max_transl_run = 65535;
1406     unsigned masksum;
1407     Uint8 *rlebuf, *dst;
1408     int (*copy_opaque)(void *, Uint32 *, int,
1409 		       SDL_PixelFormat *, SDL_PixelFormat *);
1410     int (*copy_transl)(void *, Uint32 *, int,
1411 		       SDL_PixelFormat *, SDL_PixelFormat *);
1412 
1413     dest = surface->map->dst;
1414     if(!dest)
1415 	return -1;
1416     df = dest->format;
1417     if(surface->format->BitsPerPixel != 32)
1418 	return -1;		/* only 32bpp source supported */
1419 
1420     /* find out whether the destination is one we support,
1421        and determine the max size of the encoded result */
1422     masksum = df->Rmask | df->Gmask | df->Bmask;
1423     switch(df->BytesPerPixel) {
1424     case 2:
1425 	/* 16bpp: only support 565 and 555 formats */
1426 	switch(masksum) {
1427 	case 0xffff:
1428 	    if(df->Gmask == 0x07e0
1429 	       || df->Rmask == 0x07e0 || df->Bmask == 0x07e0) {
1430 		copy_opaque = copy_opaque_16;
1431 		copy_transl = copy_transl_565;
1432 	    } else
1433 		return -1;
1434 	    break;
1435 	case 0x7fff:
1436 	    if(df->Gmask == 0x03e0
1437 	       || df->Rmask == 0x03e0 || df->Bmask == 0x03e0) {
1438 		copy_opaque = copy_opaque_16;
1439 		copy_transl = copy_transl_555;
1440 	    } else
1441 		return -1;
1442 	    break;
1443 	default:
1444 	    return -1;
1445 	}
1446 	max_opaque_run = 255;	/* runs stored as bytes */
1447 
1448 	/* worst case is alternating opaque and translucent pixels,
1449 	   with room for alignment padding between lines */
1450 	maxsize = surface->h * (2 + (4 + 2) * (surface->w + 1)) + 2;
1451 	break;
1452     case 4:
1453 	if(masksum != 0x00ffffff)
1454 	    return -1;		/* requires unused high byte */
1455 	copy_opaque = copy_32;
1456 	copy_transl = copy_32;
1457 	max_opaque_run = 255;	/* runs stored as short ints */
1458 
1459 	/* worst case is alternating opaque and translucent pixels */
1460 	maxsize = surface->h * 2 * 4 * (surface->w + 1) + 4;
1461 	break;
1462     default:
1463 	return -1;		/* anything else unsupported right now */
1464     }
1465 
1466     maxsize += sizeof(RLEDestFormat);
1467     rlebuf = (Uint8 *)SDL_malloc(maxsize);
1468     if(!rlebuf) {
1469 	SDL_OutOfMemory();
1470 	return -1;
1471     }
1472     {
1473 	/* save the destination format so we can undo the encoding later */
1474 	RLEDestFormat *r = (RLEDestFormat *)rlebuf;
1475 	r->BytesPerPixel = df->BytesPerPixel;
1476 	r->Rloss = df->Rloss;
1477 	r->Gloss = df->Gloss;
1478 	r->Bloss = df->Bloss;
1479 	r->Rshift = df->Rshift;
1480 	r->Gshift = df->Gshift;
1481 	r->Bshift = df->Bshift;
1482 	r->Ashift = df->Ashift;
1483 	r->Rmask = df->Rmask;
1484 	r->Gmask = df->Gmask;
1485 	r->Bmask = df->Bmask;
1486 	r->Amask = df->Amask;
1487     }
1488     dst = rlebuf + sizeof(RLEDestFormat);
1489 
1490     /* Do the actual encoding */
1491     {
1492 	int x, y;
1493 	int h = surface->h, w = surface->w;
1494 	SDL_PixelFormat *sf = surface->format;
1495 	Uint32 *src = (Uint32 *)surface->pixels;
1496 	Uint8 *lastline = dst;	/* end of last non-blank line */
1497 
1498 	/* opaque counts are 8 or 16 bits, depending on target depth */
1499 #define ADD_OPAQUE_COUNTS(n, m)			\
1500 	if(df->BytesPerPixel == 4) {		\
1501 	    ((Uint16 *)dst)[0] = n;		\
1502 	    ((Uint16 *)dst)[1] = m;		\
1503 	    dst += 4;				\
1504 	} else {				\
1505 	    dst[0] = n;				\
1506 	    dst[1] = m;				\
1507 	    dst += 2;				\
1508 	}
1509 
1510 	/* translucent counts are always 16 bit */
1511 #define ADD_TRANSL_COUNTS(n, m)		\
1512 	(((Uint16 *)dst)[0] = n, ((Uint16 *)dst)[1] = m, dst += 4)
1513 
1514 	for(y = 0; y < h; y++) {
1515 	    int runstart, skipstart;
1516 	    int blankline = 0;
1517 	    /* First encode all opaque pixels of a scan line */
1518 	    x = 0;
1519 	    do {
1520 		int run, skip, len;
1521 		skipstart = x;
1522 		while(x < w && !ISOPAQUE(src[x], sf))
1523 		    x++;
1524 		runstart = x;
1525 		while(x < w && ISOPAQUE(src[x], sf))
1526 		    x++;
1527 		skip = runstart - skipstart;
1528 		if(skip == w)
1529 		    blankline = 1;
1530 		run = x - runstart;
1531 		while(skip > max_opaque_run) {
1532 		    ADD_OPAQUE_COUNTS(max_opaque_run, 0);
1533 		    skip -= max_opaque_run;
1534 		}
1535 		len = MIN(run, max_opaque_run);
1536 		ADD_OPAQUE_COUNTS(skip, len);
1537 		dst += copy_opaque(dst, src + runstart, len, sf, df);
1538 		runstart += len;
1539 		run -= len;
1540 		while(run) {
1541 		    len = MIN(run, max_opaque_run);
1542 		    ADD_OPAQUE_COUNTS(0, len);
1543 		    dst += copy_opaque(dst, src + runstart, len, sf, df);
1544 		    runstart += len;
1545 		    run -= len;
1546 		}
1547 	    } while(x < w);
1548 
1549 	    /* Make sure the next output address is 32-bit aligned */
1550 	    dst += (uintptr_t)dst & 2;
1551 
1552 	    /* Next, encode all translucent pixels of the same scan line */
1553 	    x = 0;
1554 	    do {
1555 		int run, skip, len;
1556 		skipstart = x;
1557 		while(x < w && !ISTRANSL(src[x], sf))
1558 		    x++;
1559 		runstart = x;
1560 		while(x < w && ISTRANSL(src[x], sf))
1561 		    x++;
1562 		skip = runstart - skipstart;
1563 		blankline &= (skip == w);
1564 		run = x - runstart;
1565 		while(skip > max_transl_run) {
1566 		    ADD_TRANSL_COUNTS(max_transl_run, 0);
1567 		    skip -= max_transl_run;
1568 		}
1569 		len = MIN(run, max_transl_run);
1570 		ADD_TRANSL_COUNTS(skip, len);
1571 		dst += copy_transl(dst, src + runstart, len, sf, df);
1572 		runstart += len;
1573 		run -= len;
1574 		while(run) {
1575 		    len = MIN(run, max_transl_run);
1576 		    ADD_TRANSL_COUNTS(0, len);
1577 		    dst += copy_transl(dst, src + runstart, len, sf, df);
1578 		    runstart += len;
1579 		    run -= len;
1580 		}
1581 		if(!blankline)
1582 		    lastline = dst;
1583 	    } while(x < w);
1584 
1585 	    src += surface->pitch >> 2;
1586 	}
1587 	dst = lastline;		/* back up past trailing blank lines */
1588 	ADD_OPAQUE_COUNTS(0, 0);
1589     }
1590 
1591 #undef ADD_OPAQUE_COUNTS
1592 #undef ADD_TRANSL_COUNTS
1593 
1594     /* Now that we have it encoded, release the original pixels */
1595     if((surface->flags & SDL_PREALLOC) != SDL_PREALLOC
1596        && (surface->flags & SDL_HWSURFACE) != SDL_HWSURFACE) {
1597 	SDL_free( surface->pixels );
1598 	surface->pixels = NULL;
1599     }
1600 
1601     /* realloc the buffer to release unused memory */
1602     {
1603 	Uint8 *p = SDL_realloc(rlebuf, dst - rlebuf);
1604 	if(!p)
1605 	    p = rlebuf;
1606 	surface->map->sw_data->aux_data = p;
1607     }
1608 
1609     return 0;
1610 }
1611 
getpix_8(Uint8 * srcbuf)1612 static Uint32 getpix_8(Uint8 *srcbuf)
1613 {
1614     return *srcbuf;
1615 }
1616 
getpix_16(Uint8 * srcbuf)1617 static Uint32 getpix_16(Uint8 *srcbuf)
1618 {
1619     return *(Uint16 *)srcbuf;
1620 }
1621 
getpix_24(Uint8 * srcbuf)1622 static Uint32 getpix_24(Uint8 *srcbuf)
1623 {
1624 #if SDL_BYTEORDER == SDL_LIL_ENDIAN
1625     return srcbuf[0] + (srcbuf[1] << 8) + (srcbuf[2] << 16);
1626 #else
1627     return (srcbuf[0] << 16) + (srcbuf[1] << 8) + srcbuf[2];
1628 #endif
1629 }
1630 
getpix_32(Uint8 * srcbuf)1631 static Uint32 getpix_32(Uint8 *srcbuf)
1632 {
1633     return *(Uint32 *)srcbuf;
1634 }
1635 
1636 typedef Uint32 (*getpix_func)(Uint8 *);
1637 
1638 static getpix_func getpixes[4] = {
1639     getpix_8, getpix_16, getpix_24, getpix_32
1640 };
1641 
/*
 * RLE-encode a colorkeyed surface.
 *
 * Every scan line is written as a sequence of <skip> <run> <pixels>
 * segments, where skip counts pixels equal to the colorkey (compared with
 * the alpha bits masked off) and run counts the pixels that follow them.
 * Counts are single bytes for 1-3 bytes per pixel and Uint16 for 4 bytes
 * per pixel. Trailing blank lines are dropped and the stream ends with a
 * (0, 0) segment. On success the buffer is stored in
 * surface->map->sw_data->aux_data, and the original pixels are freed
 * unless the surface is SDL_PREALLOC or SDL_HWSURFACE.
 *
 * Returns 0 on success, or -1 if the pixel size is not 1-4 bytes or the
 * buffer cannot be allocated.
 */
static int RLEColorkeySurface(SDL_Surface *surface)
{
    Uint8 *rlebuf, *dst;
    int maxn;
    int y;
    Uint8 *srcbuf, *lastline;
    int maxsize = 0;
    int bpp = surface->format->BytesPerPixel;
    getpix_func getpix;
    Uint32 ckey, rgbmask;
    int w, h;

    /* calculate the worst case size for the compressed surface */
    switch(bpp) {
    case 1:
        /* worst case is alternating opaque and transparent pixels,
           starting with an opaque pixel */
        maxsize = surface->h * 3 * (surface->w / 2 + 1) + 2;
        break;
    case 2:
    case 3:
        /* worst case is solid runs, at most 255 pixels wide */
        maxsize = surface->h * (2 * (surface->w / 255 + 1)
                                + surface->w * bpp) + 2;
        break;
    case 4:
        /* worst case is solid runs, at most 65535 pixels wide */
        maxsize = surface->h * (4 * (surface->w / 65535 + 1)
                                + surface->w * 4) + 4;
        break;
    default:
        /* no size estimate and no pixel reader exist for other depths;
           refuse rather than index getpixes[] out of range */
        return(-1);
    }

    rlebuf = (Uint8 *)SDL_malloc(maxsize);
    if ( rlebuf == NULL ) {
        SDL_OutOfMemory();
        return(-1);
    }

    /* Set up the conversion */
    srcbuf = (Uint8 *)surface->pixels;
    maxn = bpp == 4 ? 65535 : 255;      /* largest value a count holds */
    dst = rlebuf;
    rgbmask = ~surface->format->Amask;  /* ignore alpha when comparing */
    ckey = surface->format->colorkey & rgbmask;
    lastline = dst;                     /* end of last non-blank line */
    getpix = getpixes[bpp - 1];
    w = surface->w;
    h = surface->h;

    /* counts are 16 bits wide for 4 bytes per pixel, 8 bits otherwise */
#define ADD_COUNTS(n, m) \
    if(bpp == 4) { \
        ((Uint16 *)dst)[0] = n; \
        ((Uint16 *)dst)[1] = m; \
        dst += 4; \
    } else { \
        dst[0] = n; \
        dst[1] = m; \
        dst += 2; \
    }

    for(y = 0; y < h; y++) {
        int x = 0;
        int blankline = 0;
        do {
            int run, skip, len;
            int runstart;
            int skipstart = x;

            /* find run of transparent, then opaque pixels */
            while(x < w && (getpix(srcbuf + x * bpp) & rgbmask) == ckey)
                x++;
            runstart = x;
            while(x < w && (getpix(srcbuf + x * bpp) & rgbmask) != ckey)
                x++;
            skip = runstart - skipstart;
            if(skip == w)
                blankline = 1;
            run = x - runstart;

            /* encode segment: a skip too long for one count becomes a
               chain of empty segments, a run too long for one count is
               split into several segments with zero skip */
            while(skip > maxn) {
                ADD_COUNTS(maxn, 0);
                skip -= maxn;
            }
            len = MIN(run, maxn);
            ADD_COUNTS(skip, len);
            SDL_memcpy(dst, srcbuf + runstart * bpp, len * bpp);
            dst += len * bpp;
            run -= len;
            runstart += len;
            while(run) {
                len = MIN(run, maxn);
                ADD_COUNTS(0, len);
                SDL_memcpy(dst, srcbuf + runstart * bpp, len * bpp);
                dst += len * bpp;
                runstart += len;
                run -= len;
            }
            if(!blankline)
                lastline = dst;
        } while(x < w);

        srcbuf += surface->pitch;
    }
    dst = lastline;             /* back up past trailing blank lines */
    ADD_COUNTS(0, 0);           /* end-of-stream marker */

#undef ADD_COUNTS

    /* Now that we have it encoded, release the original pixels */
    if((surface->flags & SDL_PREALLOC) != SDL_PREALLOC
       && (surface->flags & SDL_HWSURFACE) != SDL_HWSURFACE) {
        SDL_free( surface->pixels );
        surface->pixels = NULL;
    }

    /* realloc the buffer to release unused memory */
    {
        /* If realloc returns NULL, the original block is left intact */
        Uint8 *p = SDL_realloc(rlebuf, dst - rlebuf);
        if(!p)
            p = rlebuf;
        surface->map->sw_data->aux_data = p;
    }

    return(0);
}
1772 
/*
 * RLE-encode a surface for accelerated blitting.
 *
 * Any previous encoding is undone first. Surfaces with SDL_SRCCOLORKEY
 * are encoded by RLEColorkeySurface(); surfaces with SDL_SRCALPHA and an
 * alpha mask are encoded by RLEAlphaSurface(). On success the
 * SDL_RLEACCEL flag is set.
 *
 * Returns 0 on success, or -1 if the surface has fewer than 8 bits per
 * pixel, cannot be locked, has no pixel data to read, has neither a
 * colorkey nor per-pixel alpha, or if the encoder fails.
 */
int SDL_RLESurface(SDL_Surface *surface)
{
    int retcode;

    /* Clear any previous RLE conversion */
    if ( (surface->flags & SDL_RLEACCEL) == SDL_RLEACCEL ) {
        SDL_UnRLESurface(surface, 1);
    }

    /* We don't support RLE encoding of bitmaps */
    if ( surface->format->BitsPerPixel < 8 ) {
        return(-1);
    }

    /* Lock the surface if it's in hardware */
    if ( SDL_MUSTLOCK(surface) ) {
        if ( SDL_LockSurface(surface) < 0 ) {
            return(-1);
        }
    }

    /* Encode */
    if ( surface->pixels == NULL && surface->w > 0 && surface->h > 0 ) {
        /* SDL_UnRLESurface() leaves pixels NULL when it cannot
           reallocate them; the encoders would dereference that pointer
           for any non-empty surface, so give up instead */
        retcode = -1;
    } else if((surface->flags & SDL_SRCCOLORKEY) == SDL_SRCCOLORKEY) {
        retcode = RLEColorkeySurface(surface);
    } else {
        if((surface->flags & SDL_SRCALPHA) == SDL_SRCALPHA
           && surface->format->Amask != 0)
            retcode = RLEAlphaSurface(surface);
        else
            retcode = -1;   /* no RLE for per-surface alpha sans ckey */
    }

    /* Unlock the surface if it's in hardware */
    if ( SDL_MUSTLOCK(surface) ) {
        SDL_UnlockSurface(surface);
    }

    if(retcode < 0)
        return -1;

    /* The surface is now accelerated */
    surface->flags |= SDL_RLEACCEL;

    return(0);
}
1818 
1819 /*
1820  * Un-RLE a surface with pixel alpha
1821  * This may not give back exactly the image before RLE-encoding; all
1822  * completely transparent pixels will be lost, and colour and alpha depth
1823  * may have been reduced (when encoding for 16bpp targets).
1824  */
UnRLEAlpha(SDL_Surface * surface)1825 static SDL_bool UnRLEAlpha(SDL_Surface *surface)
1826 {
1827     Uint8 *srcbuf;
1828     Uint32 *dst;
1829     SDL_PixelFormat *sf = surface->format;
1830     RLEDestFormat *df = surface->map->sw_data->aux_data;
1831     int (*uncopy_opaque)(Uint32 *, void *, int,
1832 			 RLEDestFormat *, SDL_PixelFormat *);
1833     int (*uncopy_transl)(Uint32 *, void *, int,
1834 			 RLEDestFormat *, SDL_PixelFormat *);
1835     int w = surface->w;
1836     int bpp = df->BytesPerPixel;
1837 
1838     if(bpp == 2) {
1839 	uncopy_opaque = uncopy_opaque_16;
1840 	uncopy_transl = uncopy_transl_16;
1841     } else {
1842 	uncopy_opaque = uncopy_transl = uncopy_32;
1843     }
1844 
1845     surface->pixels = SDL_malloc(surface->h * surface->pitch);
1846     if ( !surface->pixels ) {
1847         return(SDL_FALSE);
1848     }
1849     /* fill background with transparent pixels */
1850     SDL_memset(surface->pixels, 0, surface->h * surface->pitch);
1851 
1852     dst = surface->pixels;
1853     srcbuf = (Uint8 *)(df + 1);
1854     for(;;) {
1855 	/* copy opaque pixels */
1856 	int ofs = 0;
1857 	do {
1858 	    unsigned run;
1859 	    if(bpp == 2) {
1860 		ofs += srcbuf[0];
1861 		run = srcbuf[1];
1862 		srcbuf += 2;
1863 	    } else {
1864 		ofs += ((Uint16 *)srcbuf)[0];
1865 		run = ((Uint16 *)srcbuf)[1];
1866 		srcbuf += 4;
1867 	    }
1868 	    if(run) {
1869 		srcbuf += uncopy_opaque(dst + ofs, srcbuf, run, df, sf);
1870 		ofs += run;
1871 	    } else if(!ofs)
1872 		return(SDL_TRUE);
1873 	} while(ofs < w);
1874 
1875 	/* skip padding if needed */
1876 	if(bpp == 2)
1877 	    srcbuf += (uintptr_t)srcbuf & 2;
1878 
1879 	/* copy translucent pixels */
1880 	ofs = 0;
1881 	do {
1882 	    unsigned run;
1883 	    ofs += ((Uint16 *)srcbuf)[0];
1884 	    run = ((Uint16 *)srcbuf)[1];
1885 	    srcbuf += 4;
1886 	    if(run) {
1887 		srcbuf += uncopy_transl(dst + ofs, srcbuf, run, df, sf);
1888 		ofs += run;
1889 	    }
1890 	} while(ofs < w);
1891 	dst += surface->pitch >> 2;
1892     }
1893     /* Make the compiler happy */
1894     return(SDL_TRUE);
1895 }
1896 
/*
 * Undo the RLE encoding of a surface. Does nothing unless SDL_RLEACCEL
 * is set.
 *
 * When recode is non-zero and the surface is neither SDL_PREALLOC nor
 * SDL_HWSURFACE, the pixel buffer is rebuilt from the encoded data:
 * colorkeyed surfaces are filled with the colorkey and then drawn onto
 * themselves with an opaque RLE blit, all others are decoded by
 * UnRLEAlpha(). If the pixel buffer cannot be allocated, the surface is
 * left encoded (SDL_RLEACCEL is set again) and the function returns.
 * Otherwise the encoded data is freed.
 */
void SDL_UnRLESurface(SDL_Surface *surface, int recode)
{
    int rebuild;

    if ( (surface->flags & SDL_RLEACCEL) != SDL_RLEACCEL ) {
        return;
    }
    surface->flags &= ~SDL_RLEACCEL;

    rebuild = recode
              && (surface->flags & SDL_PREALLOC) != SDL_PREALLOC
              && (surface->flags & SDL_HWSURFACE) != SDL_HWSURFACE;

    if ( rebuild ) {
        if ( (surface->flags & SDL_SRCCOLORKEY) == SDL_SRCCOLORKEY ) {
            SDL_Rect whole;
            unsigned saved_alpha;

            /* allocate a fresh pixel buffer */
            surface->pixels = SDL_malloc(surface->h * surface->pitch);
            if ( !surface->pixels ) {
                /* out of memory: stay encoded */
                surface->flags |= SDL_RLEACCEL;
                return;
            }

            /* start from a background of the colorkey */
            SDL_FillRect(surface, NULL, surface->format->colorkey);

            /* draw the encoded image on top, with per-surface alpha
               switched off so the pixels are copied unchanged */
            whole.x = 0;
            whole.y = 0;
            whole.w = surface->w;
            whole.h = surface->h;
            saved_alpha = surface->flags & SDL_SRCALPHA;
            surface->flags &= ~SDL_SRCALPHA;
            SDL_RLEBlit(surface, &whole, surface, &whole);
            surface->flags |= saved_alpha;
        } else if ( !UnRLEAlpha(surface) ) {
            /* out of memory: stay encoded */
            surface->flags |= SDL_RLEACCEL;
            return;
        }
    }

    /* the encoded data is no longer needed */
    if ( surface->map && surface->map->sw_data->aux_data ) {
        SDL_free(surface->map->sw_data->aux_data);
        surface->map->sw_data->aux_data = NULL;
    }
}
1942 
1943 
1944