;
; x86 format converters for HERMES
; Some routines Copyright (c) 1998 Christian Nentwich (brn@eleet.mcb.at)
; This source code is licensed under the GNU LGPL
;
; Please refer to the file COPYING.LIB contained in the distribution for
; licensing conditions
;
; Most routines are (c) Glenn Fiedler (ptc@gaffer.org), used with permission
;

BITS 32

%include "common.inc"

SDL_FUNC _ConvertX86p32_32BGR888
SDL_FUNC _ConvertX86p32_32RGBA888
SDL_FUNC _ConvertX86p32_32BGRA888
SDL_FUNC _ConvertX86p32_24RGB888
SDL_FUNC _ConvertX86p32_24BGR888
SDL_FUNC _ConvertX86p32_16RGB565
SDL_FUNC _ConvertX86p32_16BGR565
SDL_FUNC _ConvertX86p32_16RGB555
SDL_FUNC _ConvertX86p32_16BGR555
SDL_FUNC _ConvertX86p32_8RGB332

SECTION .text

;; _Convert_*
;; Syntax: NASM, 32-bit code.  These are register-parameter routines,
;; not C-callable functions.
;;
;; Parameters:
;;   ESI = source (4 bytes per pixel; byte 0 = blue, 1 = green, 2 = red)
;;   EDI = dest
;;   ECX = amount in pixels
;;         (NOT 0!!! (the _ConvertX86 routine checks for that though))
;; Destroys:
;;   EAX, EBX, ECX, EDX, ESI, EDI and the flags.
;;   ESI and EDI are advanced as pixels are read and written.
;;   EBP is saved and restored by the routines that use it.
;;
;; Notation used in the comments: [A][R][G][B] lists the four bytes of a
;; register from most significant to least significant.


;; 32 bit RGB 888 to 32 bit BGR 888
;; Swaps the red and blue bytes of every pixel, the top byte stays put.

_ConvertX86p32_32BGR888:

        ; check short: small runs are not worth the unrolled setup
        cmp     ecx,BYTE 32
        ja      .L3

.L1:    ; short loop, one pixel per iteration
        mov     edx,[esi]               ; edx = [A][R][G][B]
        bswap   edx                     ; edx = [B][G][R][A]
        ror     edx,8                   ; edx = [A][B][G][R]
        mov     [edi],edx
        add     esi,BYTE 4
        add     edi,BYTE 4
        dec     ecx
        jnz     .L1
.L2:
        retn

.L3:    ; save ebp, it is used as the block counter below
        push    ebp

        ; unroll four times
        mov     ebp,ecx
        shr     ebp,2                   ; ebp = number of 4-pixel blocks

        ; save count (ecx is used as a data register in the loop)
        push    ecx

.L4:
        mov     eax,[esi]
        mov     ebx,[esi+4]

        bswap   eax

        bswap   ebx

        ror     eax,8
        mov     ecx,[esi+8]

        ror     ebx,8
        mov     edx,[esi+12]

        bswap   ecx

        bswap   edx

        ror     ecx,8
        mov     [edi+0],eax

        ror     edx,8
        mov     [edi+4],ebx

        mov     [edi+8],ecx
        mov     [edi+12],edx

        add     esi,BYTE 16
        add     edi,BYTE 16

        dec     ebp
        jnz     .L4

        ; check tail: count mod 4 pixels are left over
        pop     ecx
        and     ecx,BYTE 11b
        jz      .L6

.L5:    ; tail loop
        mov     edx,[esi]
        bswap   edx
        ror     edx,8
        mov     [edi],edx
        add     esi,BYTE 4
        add     edi,BYTE 4
        dec     ecx
        jnz     .L5

.L6:
        pop     ebp
        retn




;; 32 bit RGB 888 to 32 bit RGBA 888
;; Rotates every pixel left by one byte: [A][R][G][B] -> [R][G][B][A].

_ConvertX86p32_32RGBA888:

        ; check short
        cmp     ecx,BYTE 32
        ja      .L3

.L1:    ; short loop
        mov     edx,[esi]
        rol     edx,8
        mov     [edi],edx
        add     esi,BYTE 4
        add     edi,BYTE 4
        dec     ecx
        jnz     .L1
.L2:
        retn

.L3:    ; save ebp, it is used as the block counter below
        push    ebp

        ; unroll four times
        mov     ebp,ecx
        shr     ebp,2                   ; ebp = number of 4-pixel blocks

        ; save count (ecx is used as a data register in the loop)
        push    ecx

.L4:
        mov     eax,[esi]
        mov     ebx,[esi+4]

        rol     eax,8
        mov     ecx,[esi+8]

        rol     ebx,8
        mov     edx,[esi+12]

        rol     ecx,8
        mov     [edi+0],eax

        rol     edx,8
        mov     [edi+4],ebx

        mov     [edi+8],ecx
        mov     [edi+12],edx

        add     esi,BYTE 16
        add     edi,BYTE 16

        dec     ebp
        jnz     .L4

        ; check tail
        pop     ecx
        and     ecx,BYTE 11b
        jz      .L6

.L5:    ; tail loop
        mov     edx,[esi]
        rol     edx,8
        mov     [edi],edx
        add     esi,BYTE 4
        add     edi,BYTE 4
        dec     ecx
        jnz     .L5

.L6:
        pop     ebp
        retn




;; 32 bit RGB 888 to 32 bit BGRA 888
;; Reverses the byte order of every pixel: [A][R][G][B] -> [B][G][R][A].

_ConvertX86p32_32BGRA888:

        ; check short
        cmp     ecx,BYTE 32
        ja      .L3

.L1:    ; short loop
        mov     edx,[esi]
        bswap   edx
        mov     [edi],edx
        add     esi,BYTE 4
        add     edi,BYTE 4
        dec     ecx
        jnz     .L1
.L2:
        retn

.L3:    ; save ebp, it is used as the block counter below
        push    ebp

        ; unroll four times
        mov     ebp,ecx
        shr     ebp,2                   ; ebp = number of 4-pixel blocks

        ; save count (ecx is used as a data register in the loop)
        push    ecx

.L4:
        mov     eax,[esi]
        mov     ebx,[esi+4]

        mov     ecx,[esi+8]
        mov     edx,[esi+12]

        bswap   eax

        bswap   ebx

        bswap   ecx

        bswap   edx

        mov     [edi+0],eax
        mov     [edi+4],ebx

        mov     [edi+8],ecx
        mov     [edi+12],edx

        add     esi,BYTE 16
        add     edi,BYTE 16

        dec     ebp
        jnz     .L4

        ; check tail
        pop     ecx
        and     ecx,BYTE 11b
        jz      .L6

.L5:    ; tail loop
        mov     edx,[esi]
        bswap   edx
        mov     [edi],edx
        add     esi,BYTE 4
        add     edi,BYTE 4
        dec     ecx
        jnz     .L5

.L6:
        pop     ebp
        retn




;; 32 bit RGB 888 to 24 BIT RGB 888
;; Copies bytes 0..2 of every source pixel and drops byte 3.  The
;; unrolled loop packs four source pixels into three destination dwords.

_ConvertX86p32_24RGB888:

        ; check short
        cmp     ecx,BYTE 32
        ja      .L3

.L1:    ; short loop
        mov     al,[esi]
        mov     bl,[esi+1]
        mov     dl,[esi+2]
        mov     [edi],al
        mov     [edi+1],bl
        mov     [edi+2],dl
        add     esi,BYTE 4
        add     edi,BYTE 3
        dec     ecx
        jnz     .L1
.L2:
        retn

.L3:    ; head: convert single pixels until edi is dword aligned
        mov     edx,edi
        and     edx,BYTE 11b
        jz      .L4
        mov     al,[esi]
        mov     bl,[esi+1]
        mov     dl,[esi+2]
        mov     [edi],al
        mov     [edi+1],bl
        mov     [edi+2],dl
        add     esi,BYTE 4
        add     edi,BYTE 3
        dec     ecx
        jmp     SHORT .L3

.L4:    ; unroll 4 times
        push    ebp
        mov     ebp,ecx
        shr     ebp,2                   ; ebp = number of 4-pixel blocks

        ; save count (ecx is used as a data register in the loop)
        push    ecx

.L5:
        mov     eax,[esi]               ; first dword   eax = [A][R][G][B]
        mov     ebx,[esi+4]             ; second dword  ebx = [a][r][g][b]

        shl     eax,8                   ; eax = [R][G][B][.]
        mov     ecx,[esi+12]            ; fourth dword  ecx = [a][r][g][b]

        shl     ebx,8                   ; ebx = [r][g][b][.]
        mov     al,[esi+4]              ; eax = [R][G][B][b]

        ror     eax,8                   ; eax = [b][R][G][B] (done)
        mov     bh,[esi+8+1]            ; ebx = [r][g][G][.]  (G of third pixel)

        mov     [edi],eax
        add     edi,BYTE 3*4            ; edi already points past this block

        shl     ecx,8                   ; ecx = [r][g][b][.]
        mov     bl,[esi+8+0]            ; ebx = [r][g][G][B]  (B of third pixel)

        rol     ebx,16                  ; ebx = [G][B][r][g] (done)
        mov     cl,[esi+8+2]            ; ecx = [r][g][b][R] (done, R of third pixel)

        mov     [edi+4-3*4],ebx         ; offsets compensate for the early add
        add     esi,BYTE 4*4

        mov     [edi+8-3*4],ecx
        dec     ebp

        jnz     .L5

        ; check tail
        pop     ecx
        and     ecx,BYTE 11b
        jz      .L7

.L6:    ; tail loop
        mov     al,[esi]
        mov     bl,[esi+1]
        mov     dl,[esi+2]
        mov     [edi],al
        mov     [edi+1],bl
        mov     [edi+2],dl
        add     esi,BYTE 4
        add     edi,BYTE 3
        dec     ecx
        jnz     .L6

.L7:
        pop     ebp
        retn




;; 32 bit RGB 888 to 24 bit BGR 888
;; Like the RGB 888 routine above, but source bytes 0 and 2 swap places
;; in the output.

_ConvertX86p32_24BGR888:

        ; check short
        cmp     ecx,BYTE 32
        ja      .L3


.L1:    ; short loop
        mov     dl,[esi]
        mov     bl,[esi+1]
        mov     al,[esi+2]
        mov     [edi],al
        mov     [edi+1],bl
        mov     [edi+2],dl
        add     esi,BYTE 4
        add     edi,BYTE 3
        dec     ecx
        jnz     .L1
.L2:
        retn

.L3:    ; head: convert single pixels until edi is dword aligned
        mov     edx,edi
        and     edx,BYTE 11b
        jz      .L4
        mov     dl,[esi]
        mov     bl,[esi+1]
        mov     al,[esi+2]
        mov     [edi],al
        mov     [edi+1],bl
        mov     [edi+2],dl
        add     esi,BYTE 4
        add     edi,BYTE 3
        dec     ecx
        jmp     SHORT .L3

.L4:    ; unroll 4 times
        push    ebp
        mov     ebp,ecx
        shr     ebp,2                   ; ebp = number of 4-pixel blocks

        ; save count (ecx is used as a data register in the loop)
        push    ecx

.L5:
        mov     eax,[esi]               ; first dword   eax = [A][R][G][B]
        mov     ebx,[esi+4]             ; second dword  ebx = [a][r][g][b]

        bswap   eax                     ; eax = [B][G][R][A]

        bswap   ebx                     ; ebx = [b][g][r][a]

        mov     al,[esi+4+2]            ; eax = [B][G][R][r]
        mov     bh,[esi+4+4+1]          ; ebx = [b][g][G][a]  (G of third pixel)

        ror     eax,8                   ; eax = [r][B][G][R] (done)
        mov     bl,[esi+4+4+2]          ; ebx = [b][g][G][R]  (R of third pixel)

        ror     ebx,16                  ; ebx = [G][R][b][g] (done)
        mov     [edi],eax

        mov     [edi+4],ebx
        mov     ecx,[esi+12]            ; fourth dword  ecx = [a][r][g][b]

        bswap   ecx                     ; ecx = [b][g][r][a]

        mov     cl,[esi+8]              ; ecx = [b][g][r][B] (done, B of third pixel)
        add     esi,BYTE 4*4

        mov     [edi+8],ecx
        add     edi,BYTE 3*4

        dec     ebp
        jnz     .L5

        ; check tail
        pop     ecx
        and     ecx,BYTE 11b
        jz      .L7

.L6:    ; tail loop
        mov     dl,[esi]
        mov     bl,[esi+1]
        mov     al,[esi+2]
        mov     [edi],al
        mov     [edi+1],bl
        mov     [edi+2],dl
        add     esi,BYTE 4
        add     edi,BYTE 3
        dec     ecx
        jnz     .L6

.L7:
        pop     ebp
        retn




;; 32 bit RGB 888 to 16 BIT RGB 565
;; Output word: red in bits 11-15, green in bits 5-10, blue in bits 0-4.
;; The main loop converts two pixels per iteration and writes one dword.

_ConvertX86p32_16RGB565:
        ; check short
        cmp     ecx,BYTE 16
        ja      .L3

.L1:    ; short loop
        mov     bl,[esi+0]              ; blue
        mov     al,[esi+1]              ; green
        mov     ah,[esi+2]              ; red
        shr     ah,3                    ; red   -> 5 bits
        and     al,11111100b            ; green -> top 6 bits
        shl     eax,3                   ; ax = rrrrrggg ggg00000
        shr     bl,3                    ; blue  -> 5 bits
        add     al,bl
        mov     [edi+0],al
        mov     [edi+1],ah
        add     esi,BYTE 4
        add     edi,BYTE 2
        dec     ecx
        jnz     .L1

.L2:    ; End of short loop
        retn


.L3:    ; head: one single pixel if edi is not dword aligned
        mov     ebx,edi
        and     ebx,BYTE 11b
        jz      .L4

        mov     bl,[esi+0]              ; blue
        mov     al,[esi+1]              ; green
        mov     ah,[esi+2]              ; red
        shr     ah,3
        and     al,11111100b
        shl     eax,3
        shr     bl,3
        add     al,bl
        mov     [edi+0],al
        mov     [edi+1],ah
        add     esi,BYTE 4
        add     edi,BYTE 2
        dec     ecx

.L4:
        ; save count
        push    ecx

        ; unroll twice
        shr     ecx,1                   ; ecx = number of pixel pairs

        ; point arrays to end
        lea     esi,[esi+ecx*8]
        lea     edi,[edi+ecx*4]

        ; negative counter, runs up to zero
        neg     ecx
        jmp     SHORT .L6

.L5:
        ; store the pair finished in the previous iteration
        ; (ecx was already incremented, hence the -4)
        mov     [edi+ecx*4-4],eax
.L6:
        mov     eax,[esi+ecx*8]         ; first pixel of the pair

        shr     ah,2                    ; first green -> 6 bits
        mov     ebx,[esi+ecx*8+4]       ; second pixel of the pair

        shr     eax,3                   ; blue at bits 0-4, green at bits 5-10
        mov     edx,[esi+ecx*8+4]       ; second pixel again, for its red

        shr     bh,2                    ; second green -> 6 bits
        mov     dl,[esi+ecx*8+2]        ; dl = red of the first pixel

        shl     ebx,13                  ; second blue/green into the high word
        and     eax,000007FFh           ; first pixel: blue + green

        shl     edx,8                   ; both reds to the top of their words
        and     ebx,07FF0000h           ; second pixel: blue + green

        and     edx,0F800F800h          ; 5 red bits of each pixel
        add     eax,ebx

        add     eax,edx
        inc     ecx

        jnz     .L5

        ; store the last pair (ecx is 0 here)
        mov     [edi+ecx*4-4],eax

        ; tail: one pixel left if the saved count is odd
        pop     ecx
        test    cl,1
        jz      .L7

        mov     bl,[esi+0]              ; blue
        mov     al,[esi+1]              ; green
        mov     ah,[esi+2]              ; red
        shr     ah,3
        and     al,11111100b
        shl     eax,3
        shr     bl,3
        add     al,bl
        mov     [edi+0],al
        mov     [edi+1],ah
        add     esi,BYTE 4
        add     edi,BYTE 2

.L7:
        retn




;; 32 bit RGB 888 to 16 BIT BGR 565
;; Output word: blue in bits 11-15, green in bits 5-10, red in bits 0-4.

_ConvertX86p32_16BGR565:

        ; check short
        cmp     ecx,BYTE 16
        ja      .L3

.L1:    ; short loop
        mov     ah,[esi+0]              ; blue
        mov     al,[esi+1]              ; green
        mov     bl,[esi+2]              ; red
        shr     ah,3                    ; blue  -> 5 bits
        and     al,11111100b            ; green -> top 6 bits
        shl     eax,3                   ; ax = bbbbbggg ggg00000
        shr     bl,3                    ; red   -> 5 bits
        add     al,bl
        mov     [edi+0],al
        mov     [edi+1],ah
        add     esi,BYTE 4
        add     edi,BYTE 2
        dec     ecx
        jnz     .L1
.L2:
        retn

.L3:    ; head: one single pixel if edi is not dword aligned
        mov     ebx,edi
        and     ebx,BYTE 11b
        jz      .L4
        mov     ah,[esi+0]              ; blue
        mov     al,[esi+1]              ; green
        mov     bl,[esi+2]              ; red
        shr     ah,3
        and     al,11111100b
        shl     eax,3
        shr     bl,3
        add     al,bl
        mov     [edi+0],al
        mov     [edi+1],ah
        add     esi,BYTE 4
        add     edi,BYTE 2
        dec     ecx

.L4:    ; save count
        push    ecx

        ; unroll twice
        shr     ecx,1                   ; ecx = number of pixel pairs

        ; point arrays to end
        lea     esi,[esi+ecx*8]
        lea     edi,[edi+ecx*4]

        ; negative count, runs up to zero
        neg     ecx
        jmp     SHORT .L6

.L5:
        ; store the pair finished in the previous iteration
        ; (ecx was already incremented, hence the -4)
        mov     [edi+ecx*4-4],eax
.L6:
        mov     edx,[esi+ecx*8+4]       ; second pixel, for its red

        mov     bh,[esi+ecx*8+4]        ; second blue
        mov     ah,[esi+ecx*8]          ; first blue

        shr     bh,3                    ; second blue -> 5 bits
        mov     al,[esi+ecx*8+1]        ; first green

        shr     ah,3                    ; first blue -> 5 bits
        mov     bl,[esi+ecx*8+5]        ; second green

        shl     eax,3                   ; first blue to bits 11-15
        mov     dl,[esi+ecx*8+2]        ; dl = red of the first pixel

        shl     ebx,19                  ; second blue/green into the high word
        and     eax,0000FFE0h           ; first pixel: blue + green

        shr     edx,3                   ; both reds -> 5 bits
        and     ebx,0FFE00000h          ; second pixel: blue + green

        and     edx,001F001Fh           ; 5 red bits of each pixel
        add     eax,ebx

        add     eax,edx
        inc     ecx

        jnz     .L5

        ; store the last pair (ecx is 0 here)
        mov     [edi+ecx*4-4],eax

        ; tail: one pixel left if the saved count is odd
        pop     ecx
        and     ecx,BYTE 1
        jz      .L7
        mov     ah,[esi+0]              ; blue
        mov     al,[esi+1]              ; green
        mov     bl,[esi+2]              ; red
        shr     ah,3
        and     al,11111100b
        shl     eax,3
        shr     bl,3
        add     al,bl
        mov     [edi+0],al
        mov     [edi+1],ah
        add     esi,BYTE 4
        add     edi,BYTE 2

.L7:
        retn




;; 32 BIT RGB TO 16 BIT RGB 555
;; Output word: red in bits 10-14, green in bits 5-9, blue in bits 0-4.

_ConvertX86p32_16RGB555:

        ; check short
        cmp     ecx,BYTE 16
        ja      .L3

.L1:    ; short loop
        mov     bl,[esi+0]              ; blue
        mov     al,[esi+1]              ; green
        mov     ah,[esi+2]              ; red
        shr     ah,3                    ; red   -> 5 bits
        and     al,11111000b            ; green -> top 5 bits
        shl     eax,2                   ; ax = 0rrrrrgg ggg00000
        shr     bl,3                    ; blue  -> 5 bits
        add     al,bl
        mov     [edi+0],al
        mov     [edi+1],ah
        add     esi,BYTE 4
        add     edi,BYTE 2
        dec     ecx
        jnz     .L1
.L2:
        retn

.L3:    ; head: one single pixel if edi is not dword aligned
        mov     ebx,edi
        and     ebx,BYTE 11b
        jz      .L4
        mov     bl,[esi+0]              ; blue
        mov     al,[esi+1]              ; green
        mov     ah,[esi+2]              ; red
        shr     ah,3
        and     al,11111000b
        shl     eax,2
        shr     bl,3
        add     al,bl
        mov     [edi+0],al
        mov     [edi+1],ah
        add     esi,BYTE 4
        add     edi,BYTE 2
        dec     ecx

.L4:    ; save count
        push    ecx

        ; unroll twice
        shr     ecx,1                   ; ecx = number of pixel pairs

        ; point arrays to end
        lea     esi,[esi+ecx*8]
        lea     edi,[edi+ecx*4]

        ; negative counter, runs up to zero
        neg     ecx
        jmp     SHORT .L6

.L5:
        ; store the pair finished in the previous iteration
        ; (ecx was already incremented, hence the -4)
        mov     [edi+ecx*4-4],eax
.L6:
        mov     eax,[esi+ecx*8]         ; first pixel of the pair

        shr     ah,3                    ; first green -> 5 bits
        mov     ebx,[esi+ecx*8+4]       ; second pixel of the pair

        shr     eax,3                   ; blue at bits 0-4, green at bits 5-9
        mov     edx,[esi+ecx*8+4]       ; second pixel again, for its red

        shr     bh,3                    ; second green -> 5 bits
        mov     dl,[esi+ecx*8+2]        ; dl = red of the first pixel

        shl     ebx,13                  ; second blue/green into the high word
        and     eax,000007FFh           ; first pixel: blue + green

        shl     edx,7                   ; both reds to bits 10-14 of their words
        and     ebx,07FF0000h           ; second pixel: blue + green

        and     edx,07C007C00h          ; 5 red bits of each pixel
        add     eax,ebx

        add     eax,edx
        inc     ecx

        jnz     .L5

        ; store the last pair (ecx is 0 here)
        mov     [edi+ecx*4-4],eax

        ; tail: one pixel left if the saved count is odd
        pop     ecx
        and     ecx,BYTE 1
        jz      .L7
        mov     bl,[esi+0]              ; blue
        mov     al,[esi+1]              ; green
        mov     ah,[esi+2]              ; red
        shr     ah,3
        and     al,11111000b
        shl     eax,2
        shr     bl,3
        add     al,bl
        mov     [edi+0],al
        mov     [edi+1],ah
        add     esi,BYTE 4
        add     edi,BYTE 2

.L7:
        retn




;; 32 BIT RGB TO 16 BIT BGR 555
;; Output word: blue in bits 10-14, green in bits 5-9, red in bits 0-4.

_ConvertX86p32_16BGR555:

        ; check short
        cmp     ecx,BYTE 16
        ja      .L3


.L1:    ; short loop
        mov     ah,[esi+0]              ; blue
        mov     al,[esi+1]              ; green
        mov     bl,[esi+2]              ; red
        shr     ah,3                    ; blue  -> 5 bits
        and     al,11111000b            ; green -> top 5 bits
        shl     eax,2                   ; ax = 0bbbbbgg ggg00000
        shr     bl,3                    ; red   -> 5 bits
        add     al,bl
        mov     [edi+0],al
        mov     [edi+1],ah
        add     esi,BYTE 4
        add     edi,BYTE 2
        dec     ecx
        jnz     .L1
.L2:
        retn

.L3:    ; head: one single pixel if edi is not dword aligned
        mov     ebx,edi
        and     ebx,BYTE 11b
        jz      .L4
        mov     ah,[esi+0]              ; blue
        mov     al,[esi+1]              ; green
        mov     bl,[esi+2]              ; red
        shr     ah,3
        and     al,11111000b
        shl     eax,2
        shr     bl,3
        add     al,bl
        mov     [edi+0],al
        mov     [edi+1],ah
        add     esi,BYTE 4
        add     edi,BYTE 2
        dec     ecx

.L4:    ; save count
        push    ecx

        ; unroll twice
        shr     ecx,1                   ; ecx = number of pixel pairs

        ; point arrays to end
        lea     esi,[esi+ecx*8]
        lea     edi,[edi+ecx*4]

        ; negative counter, runs up to zero
        neg     ecx
        jmp     SHORT .L6

.L5:
        ; store the pair finished in the previous iteration
        ; (ecx was already incremented, hence the -4)
        mov     [edi+ecx*4-4],eax
.L6:
        mov     edx,[esi+ecx*8+4]       ; second pixel, for its red

        mov     bh,[esi+ecx*8+4]        ; second blue
        mov     ah,[esi+ecx*8]          ; first blue

        shr     bh,3                    ; second blue -> 5 bits
        mov     al,[esi+ecx*8+1]        ; first green

        shr     ah,3                    ; first blue -> 5 bits
        mov     bl,[esi+ecx*8+5]        ; second green

        shl     eax,2                   ; first blue to bits 10-14
        mov     dl,[esi+ecx*8+2]        ; dl = red of the first pixel

        shl     ebx,18                  ; second blue/green into the high word
        and     eax,00007FE0h           ; first pixel: blue + green

        shr     edx,3                   ; both reds -> 5 bits
        and     ebx,07FE00000h          ; second pixel: blue + green

        and     edx,001F001Fh           ; 5 red bits of each pixel
        add     eax,ebx

        add     eax,edx
        inc     ecx

        jnz     .L5

        ; store the last pair (ecx is 0 here)
        mov     [edi+ecx*4-4],eax

        ; tail: one pixel left if the saved count is odd
        pop     ecx
        and     ecx,BYTE 1
        jz      .L7
        mov     ah,[esi+0]              ; blue
        mov     al,[esi+1]              ; green
        mov     bl,[esi+2]              ; red
        shr     ah,3
        and     al,11111000b
        shl     eax,2
        shr     bl,3
        add     al,bl
        mov     [edi+0],al
        mov     [edi+1],ah
        add     esi,BYTE 4
        add     edi,BYTE 2

.L7:
        retn





;; FROM 32 BIT RGB to 8 BIT RGB (rrrgggbb)
;; This routine writes FOUR pixels at once (dword) and then, if they
;; exist, the trailing one to three pixels one byte at a time.
;; Output byte: red in bits 5-7, green in bits 2-4, blue in bits 0-1.
_ConvertX86p32_8RGB332:


.L_ALIGNED:
        push    ecx                     ; save count for the tail

        shr     ecx,2                   ; We will draw 4 pixels at once
        jz      NEAR .L2                ; fewer than 4 pixels: tail only
                                        ; (NEAR: target is out of short range)

.L1:
        mov     eax,[esi]               ; first pair of pixels
        mov     edx,[esi+4]

        shr     dl,6                    ; second blue -> 2 bits
        mov     ebx,eax

        shr     al,6                    ; first blue -> 2 bits
        and     ah,0e0h                 ; first green -> top 3 bits

        shr     ebx,16                  ; bl = first red
        and     dh,0e0h                 ; second green -> top 3 bits

        shr     ah,3                    ; first green to bits 2-4
        and     bl,0e0h                 ; first red -> top 3 bits

        shr     dh,3                    ; second green to bits 2-4

        or      al,bl

        mov     ebx,edx
        or      al,ah                   ; al = first output pixel

        shr     ebx,16                  ; bl = second red
        or      dl,dh

        and     bl,0e0h

        or      dl,bl                   ; dl = second output pixel

        mov     ah,dl                   ; ax = pixels 1 and 0



        mov     ebx,[esi+8]             ; second pair of pixels

        mov     edx,ebx
        and     bh,0e0h                 ; third green -> top 3 bits

        shr     bl,6                    ; third blue -> 2 bits
        and     edx,0e00000h            ; third red -> top 3 bits

        shr     edx,16                  ; dl = third red bits

        shr     bh,3                    ; third green to bits 2-4

        ror     eax,16                  ; park pixels 0/1 in the high word
        or      bl,dl

        mov     edx,[esi+12]
        or      bl,bh                   ; bl = third output pixel

        mov     al,bl

        mov     ebx,edx
        and     dh,0e0h                 ; fourth green -> top 3 bits

        shr     dl,6                    ; fourth blue -> 2 bits
        and     ebx,0e00000h            ; fourth red -> top 3 bits

        shr     dh,3                    ; fourth green to bits 2-4
        mov     ah,dl

        shr     ebx,16                  ; bl = fourth red bits
        or      ah,dh

        or      ah,bl                   ; ah = fourth output pixel

        rol     eax,16                  ; eax = pixels 3,2,1,0 (pixel 0 lowest)
        add     esi,BYTE 16

        mov     [edi],eax
        add     edi,BYTE 4

        dec     ecx
        jnz     NEAR .L1                ; NEAR: .L1 is out of short range

.L2:

        pop     ecx
        and     ecx,BYTE 3              ; mask out number of pixels to draw

        jz      .L4                     ; Nothing to do anymore

.L3:
        mov     eax,[esi]               ; single pixel conversion for trailing pixels

        mov     ebx,eax

        shr     al,6                    ; blue -> 2 bits
        and     ah,0e0h                 ; green -> top 3 bits

        shr     ebx,16                  ; bl = red

        shr     ah,3                    ; green to bits 2-4
        and     bl,0e0h                 ; red -> top 3 bits

        or      al,ah
        or      al,bl

        mov     [edi],al

        inc     edi
        add     esi,BYTE 4

        dec     ecx
        jnz     .L3

.L4:
        retn

%ifidn __OUTPUT_FORMAT__,elf
section .note.GNU-stack noalloc noexec nowrite progbits
%endif