;
; pII-optimised MMX format converters for HERMES
; Copyright (c) 1998 Christian Nentwich (c.nentwich@cs.ucl.ac.uk)
;   and (c) 1999 Jonathan Matthew (jmatthew@uq.net.au)
; This source code is licensed under the GNU LGPL
;
; Please refer to the file COPYING.LIB contained in the distribution for
; licensing conditions
;
; COPYRIGHT NOTICE
;
; This file partly contains code that is (c) Intel Corporation, specifically
; the mode detection routine, and the converter to 15 bit (8 pixel
; conversion routine from the mmx programming tutorial pages).
;
;
; These routines aren't exactly pII optimised - it's just that as they
; are, they're terrible on p5 MMXs, but less so on pIIs.  Someone needs to
; optimise them for p5 MMXs..
;
; Syntax: NASM (Intel operand order), 32-bit code.
;
; Register usage shared by every converter in this file, as read off the
; code below:
;   esi = source pointer, 4 bytes per pixel (blue in the lowest byte,
;         then green, then red; the top byte is ignored)
;   edi = destination pointer
;   ecx = number of pixels to convert
; esi and edi are advanced past the converted data.  eax, ecx, edx and the
; MMX registers are used as scratch; the 24RGB888 and 565 tails also
; overwrite part of ebx.  None of the routines executes emms.
; NOTE(review): the caller is presumably responsible for emms and for
; preserving ebx -- confirm against the dispatcher that calls these.

BITS 32

%include "common.inc"

SDL_FUNC _ConvertMMXpII32_24RGB888
SDL_FUNC _ConvertMMXpII32_16RGB565
SDL_FUNC _ConvertMMXpII32_16BGR565
SDL_FUNC _ConvertMMXpII32_16RGB555
SDL_FUNC _ConvertMMXpII32_16BGR555

;; Macros for conversion routines

; Push a 32-bit immediate twice, leaving a 64-bit constant made of two
; identical dwords at [esp].  Costs 8 bytes of stack until cleaned up.
%macro _push_immq_mask 1
        push    dword %1
        push    dword %1
%endmacro

; load_immq mmreg, imm32 : mmreg = imm32 replicated into both dwords.
; Leaves 8 bytes on the stack (see CLEANUP_IMMQ_LOADS).
%macro load_immq 2
        _push_immq_mask %2
        movq    %1, [esp]
%endmacro

; pand_immq mmreg, imm32 : mmreg &= imm32 replicated into both dwords.
; Leaves 8 bytes on the stack (see CLEANUP_IMMQ_LOADS).
; Not used by the routines below any more; kept for anything that shares
; these macros.
%macro pand_immq 2
        _push_immq_mask %2
        pand    %1, [esp]
%endmacro

; Release the stack space left behind by `num` load_immq / pand_immq /
; _push_immq_mask expansions.  Note: modifies the flags.
%define CLEANUP_IMMQ_LOADS(num) \
        add     esp, byte 8 * num

%define mmx32_rgb888_mask 00ffffffh
%define mmx32_rgb565_b 000000f8h
%define mmx32_rgb565_g 0000fc00h
%define mmx32_rgb565_r 00f80000h

%define mmx32_rgb555_rb 00f800f8h
%define mmx32_rgb555_g 0000f800h
%define mmx32_rgb555_mul 20000008h
%define mmx32_bgr555_mul 00082000h

SECTION .text

;-----------------------------------------------------------------------
; _ConvertMMXpII32_24RGB888
; Copies the low three bytes of every 32-bit source pixel to the
; destination, dropping the top byte (4 bytes in -> 3 bytes out).
; In:    esi = source, edi = destination, ecx = pixel count
; Clobb: eax (al), ebx (bl), ecx, edx, mm0-mm4, mm6, mm7, flags
;-----------------------------------------------------------------------
_ConvertMMXpII32_24RGB888:

        ; set up mm6 as the mask, mm7 as zero
        load_immq mm6, mmx32_rgb888_mask
        CLEANUP_IMMQ_LOADS(1)
        pxor    mm7, mm7

        mov     edx, ecx                ; save ecx (full count, for the tail)
        and     ecx, 0fffffffch         ; clear lower two bits: 4 pixels per pass
        jz      .L2                     ; fewer than 4 pixels: tail only

.L1:
        ; Byte diagrams list the most significant byte first.
        ; Upper case = the higher-addressed pixel of the pair.
        movq    mm0, [esi]              ; A R G B a r g b   (pixels 1, 0)
        pand    mm0, mm6                ; 0 R G B 0 r g b
        movq    mm1, [esi+8]            ; A R G B a r g b   (pixels 3, 2)
        pand    mm1, mm6                ; 0 R G B 0 r g b

        movq    mm2, mm0                ; 0 R G B 0 r g b
        punpckhdq mm2, mm7              ; 0 0 0 0 0 R G B
        punpckldq mm0, mm7              ; 0 0 0 0 0 r g b
        psllq   mm2, 24                 ; 0 0 R G B 0 0 0
        por     mm0, mm2                ; 0 0 R G B r g b

        movq    mm3, mm1                ; 0 R G B 0 r g b
        psllq   mm3, 48                 ; g b 0 0 0 0 0 0   (low 2 bytes of pixel 2)
        por     mm0, mm3                ; g b R G B r g b   = output bytes 0..7

        movq    mm4, mm1                ; 0 R G B 0 r g b
        punpckhdq mm4, mm7              ; 0 0 0 0 0 R G B   (pixel 3)
        punpckldq mm1, mm7              ; 0 0 0 0 0 r g b   (pixel 2)
        psrlq   mm1, 16                 ; 0 0 0 0 0 0 0 r   (g, b already stored via mm0)
        psllq   mm4, 8                  ; 0 0 0 0 R G B 0
        por     mm1, mm4                ; 0 0 0 0 R G B r   = output bytes 8..11

        movq    [edi], mm0
        add     esi, BYTE 16
        movd    [edi+8], mm1
        add     edi, BYTE 12
        sub     ecx, BYTE 4
        jnz     .L1

.L2:
        mov     ecx, edx
        and     ecx, BYTE 3             ; 0..3 leftover pixels
        jz      .L4
.L3:
        ; one pixel at a time; dl may be reused because the count
        ; now lives in ecx
        mov     al, [esi]
        mov     bl, [esi+1]
        mov     dl, [esi+2]
        mov     [edi], al
        mov     [edi+1], bl
        mov     [edi+2], dl
        add     esi, BYTE 4
        add     edi, BYTE 3
        dec     ecx
        jnz     .L3
.L4:
        retn



;-----------------------------------------------------------------------
; _ConvertMMXpII32_16RGB565
; Packs each 32-bit pixel into 16 bits: source byte 2 (red) -> bits
; 15..11, byte 1 (green) -> bits 10..5, byte 0 (blue) -> bits 4..0.
; In:    esi = source, edi = destination, ecx = pixel count
; Clobb: eax, ebx, ecx, edx, mm0-mm7, flags
;-----------------------------------------------------------------------
_ConvertMMXpII32_16RGB565:

        ; set up masks
        load_immq mm5, mmx32_rgb565_b
        load_immq mm6, mmx32_rgb565_g
        load_immq mm7, mmx32_rgb565_r
        CLEANUP_IMMQ_LOADS(3)

        mov     edx, ecx                ; keep the full count for the tail
        shr     ecx, 2                  ; ecx = number of 4-pixel groups
        jz      .L2

.L1:
        movq    mm0, [esi]              ; argb
        movq    mm1, mm0                ; argb
        pand    mm0, mm6                ; 00g0
        movq    mm3, mm1                ; argb
        pand    mm1, mm5                ; 000b
        pand    mm3, mm7                ; 0r00
        pslld   mm1, 2                  ; 0 0 000000bb bbb00000
        por     mm0, mm1                ; 0 0 ggggggbb bbb00000
        psrld   mm0, 5                  ; 0 0 00000ggg gggbbbbb

        movq    mm4, [esi+8]            ; argb
        movq    mm2, mm4                ; argb
        pand    mm4, mm6                ; 00g0
        movq    mm1, mm2                ; argb
        pand    mm2, mm5                ; 000b
        pand    mm1, mm7                ; 0r00
        pslld   mm2, 2                  ; 0 0 000000bb bbb00000
        por     mm4, mm2                ; 0 0 ggggggbb bbb00000
        psrld   mm4, 5                  ; 0 0 00000ggg gggbbbbb

        ; Each dword of mm3/mm1 is 00f80000h-masked, i.e. words (00xx, 0000).
        ; Packing words to bytes leaves xx in the high byte of each output
        ; word, which is exactly red in bits 15..11.
        packuswb mm3, mm1               ; R 0 r 0
        packssdw mm0, mm4               ; as above.. ish (values < 0800h, no saturation)
        por     mm0, mm3                ; done.
        movq    [edi], mm0

        add     esi, 16
        add     edi, 8
        dec     ecx
        jnz     .L1

.L2:
        mov     ecx, edx
        and     ecx, BYTE 3             ; 0..3 leftover pixels
        jz      .L4
.L3:
        mov     al, [esi]               ; blue
        mov     bh, [esi+1]             ; green, in bits 15..8 of ebx
        mov     ah, [esi+2]             ; red
        shr     al, 3
        and     eax, 0F81Fh             ; rrrrr000 000bbbbb
        shr     ebx, 5                  ; green -> bits 10..3
        and     ebx, 07E0h              ; keep bits 10..5; drops stale ebx bits
        add     eax, ebx
        mov     [edi], al
        mov     [edi+1], ah
        add     esi, BYTE 4
        add     edi, BYTE 2
        dec     ecx
        jnz     .L3

.L4:
        retn


;-----------------------------------------------------------------------
; _ConvertMMXpII32_16BGR565
; As _ConvertMMXpII32_16RGB565 but with the outer channels swapped:
; source byte 0 (blue) -> bits 15..11, byte 1 (green) -> bits 10..5,
; byte 2 (red) -> bits 4..0.
; In:    esi = source, edi = destination, ecx = pixel count
; Clobb: eax, ebx, ecx, edx, mm0-mm7, flags
;-----------------------------------------------------------------------
_ConvertMMXpII32_16BGR565:

        load_immq mm5, mmx32_rgb565_r
        load_immq mm6, mmx32_rgb565_g
        load_immq mm7, mmx32_rgb565_b
        CLEANUP_IMMQ_LOADS(3)

        mov     edx, ecx                ; keep the full count for the tail
        shr     ecx, 2                  ; ecx = number of 4-pixel groups
        jz      .L2

.L1:
        movq    mm0, [esi]              ; a r g b
        movq    mm1, mm0                ; a r g b
        pand    mm0, mm6                ; 0 0 g 0
        movq    mm3, mm1                ; a r g b
        pand    mm1, mm5                ; 0 r 0 0
        pand    mm3, mm7                ; 0 0 0 b

        psllq   mm3, 16                 ; 0 b 0 0
        psrld   mm1, 14                 ; 0 0 000000rr rrr00000
        por     mm0, mm1                ; 0 0 ggggggrr rrr00000
        psrld   mm0, 5                  ; 0 0 00000ggg gggrrrrr

        movq    mm4, [esi+8]            ; a r g b
        movq    mm2, mm4                ; a r g b
        pand    mm4, mm6                ; 0 0 g 0
        movq    mm1, mm2                ; a r g b
        pand    mm2, mm5                ; 0 r 0 0
        pand    mm1, mm7                ; 0 0 0 b

        psllq   mm1, 16                 ; 0 b 0 0
        psrld   mm2, 14                 ; 0 0 000000rr rrr00000
        por     mm4, mm2                ; 0 0 ggggggrr rrr00000
        psrld   mm4, 5                  ; 0 0 00000ggg gggrrrrr

        packuswb mm3, mm1               ; BBBBB000 00000000 bbbbb000 00000000
        packssdw mm0, mm4               ; 00000GGG GGGRRRRR 00000GGG GGGRRRRR
        por     mm0, mm3                ; BBBBBGGG GGGRRRRR bbbbbggg gggrrrrr
        movq    [edi], mm0

        add     esi, BYTE 16
        add     edi, BYTE 8
        dec     ecx
        jnz     .L1

.L2:
        and     edx, BYTE 3             ; 0..3 leftover pixels, counted in edx
        jz      .L4
.L3:
        mov     al, [esi+2]             ; red
        mov     bh, [esi+1]             ; green, in bits 15..8 of ebx
        mov     ah, [esi]               ; blue
        shr     al, 3
        and     eax, 0F81Fh             ; bbbbb000 000rrrrr
        shr     ebx, 5                  ; green -> bits 10..3
        and     ebx, 07E0h              ; keep bits 10..5; drops stale ebx bits
        add     eax, ebx
        mov     [edi], al
        mov     [edi+1], ah
        add     esi, BYTE 4
        add     edi, BYTE 2
        dec     edx
        jnz     .L3

.L4:
        retn

;-----------------------------------------------------------------------
; _ConvertMMXpII32_16BGR555 / _ConvertMMXpII32_16RGB555
; Pack each 32-bit pixel into 15 bits (bit 15 of the output is zero).
;   RGB555: source byte 2 (red) -> bits 14..10, green -> 9..5,
;           byte 0 (blue) -> bits 4..0
;   BGR555: source byte 0 (blue) -> bits 14..10, green -> 9..5,
;           byte 2 (red) -> bits 4..0
; In:    esi = source, edi = destination, ecx = pixel count
; Clobb: eax, ecx, edx, mm0-mm3, mm6, mm7, flags
; Stack: 8 bytes are held for the duration of the call (R/B mask).
;-----------------------------------------------------------------------
_ConvertMMXpII32_16BGR555:

        ; the 16BGR555 converter is identical to the RGB555 one,
        ; except it uses a different multiplier for the pmaddwd
        ; instruction.  cool huh.

        load_immq mm7, mmx32_bgr555_mul
        jmp     _convert_bgr555_cheat

; The per-pixel arithmetic is the same as the Intel version, but the loop
; is deliberately NOT expanded/coiled the way theirs is: the pipelined
; form preloaded the next group of pixels on every pass and so read 16
; bytes beyond the last converted group.
; 'mmx32_rgb555_add' was renamed to 'mmx32_rgb555_mul', which is
; (I think) a more accurate name..
_ConvertMMXpII32_16RGB555:

        load_immq mm7, mmx32_rgb555_mul
_convert_bgr555_cheat:
        load_immq mm6, mmx32_rgb555_g
        CLEANUP_IMMQ_LOADS(2)

        ; The R/B mask does not fit in a spare MMX register alongside the
        ; working set, so it is parked at [esp] once for the whole call
        ; instead of being pushed again on every use.  Every exit path
        ; must leave through .L4, which releases it.
        _push_immq_mask mmx32_rgb555_rb

        mov     edx, ecx                ; Save ecx (full count, for the tail)

        and     ecx, DWORD 0fffffff8h   ; clear lower three bits: 8 pixels per pass
        jz      near .L2                ; fewer than 8 pixels: tail only

.L1:
        ; Per pixel (one dword):
        ;   pmaddwd: (low word & f8h) * mul.lo + (high word & f8h) * mul.hi
        ;            places the two outer channels at bits 20..16 and 10..6
        ;   por    : green (mask f800h) already sits at bits 15..11
        ;   psrld 6: moves the 15 significant bits down to bits 14..0

        ; ---- pixels 0..3 ----
        movq    mm0, [esi]
        movq    mm2, [esi+8]
        movq    mm1, mm0
        movq    mm3, mm2
        pand    mm1, [esp]              ; keep red/blue
        pand    mm3, [esp]
        pmaddwd mm1, mm7
        pmaddwd mm3, mm7
        pand    mm0, mm6                ; keep green
        pand    mm2, mm6
        por     mm1, mm0
        por     mm3, mm2
        psrld   mm1, 6
        psrld   mm3, 6
        packssdw mm1, mm3               ; values <= 7fffh, so no saturation
        movq    [edi], mm1

        ; ---- pixels 4..7 ----
        movq    mm0, [esi+16]
        movq    mm2, [esi+24]
        movq    mm1, mm0
        movq    mm3, mm2
        pand    mm1, [esp]
        pand    mm3, [esp]
        pmaddwd mm1, mm7
        pmaddwd mm3, mm7
        pand    mm0, mm6
        pand    mm2, mm6
        por     mm1, mm0
        por     mm3, mm2
        psrld   mm1, 6
        psrld   mm3, 6
        packssdw mm1, mm3
        movq    [edi+8], mm1

        add     esi, BYTE 32
        add     edi, BYTE 16
        sub     ecx, BYTE 8
        jnz     near .L1

.L2:
        mov     ecx, edx

        and     ecx, BYTE 7             ; 0..7 leftover pixels
        jz      .L4

.L3:
        ; Same mask / pmaddwd / shift sequence as the main loop, one pixel
        ; at a time.  Because it goes through mm7 it honours whichever
        ; channel order the entry point selected; a fixed scalar RGB555
        ; tail would swap red and blue for the BGR555 entry point.
        movd    mm0, [esi]              ; upper dword is zeroed
        movq    mm1, mm0
        pand    mm1, [esp]
        pmaddwd mm1, mm7
        pand    mm0, mm6
        por     mm1, mm0
        psrld   mm1, 6
        movd    eax, mm1

        mov     [edi], ax
        add     esi, BYTE 4
        add     edi, BYTE 2

        dec     ecx
        jnz     .L3

.L4:
        CLEANUP_IMMQ_LOADS(1)           ; release the R/B mask
        retn

%ifidn __OUTPUT_FORMAT__,elf32
section .note.GNU-stack noalloc noexec nowrite progbits
%endif