/*
 * (C) Copyright IBM Corporation 2004
 * All Rights Reserved.
 *
 * Permission is hereby granted, free of charge, to any person obtaining a
 * copy of this software and associated documentation files (the "Software"),
 * to deal in the Software without restriction, including without limitation
 * on the rights to use, copy, modify, merge, publish, distribute, sub
 * license, and/or sell copies of the Software, and to permit persons to whom
 * the Software is furnished to do so, subject to the following conditions:
 *
 * The above copyright notice and this permission notice (including the next
 * paragraph) shall be included in all copies or substantial portions of the
 * Software.
 *
 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
 * FITNESS FOR A PARTICULAR PURPOSE AND NON-INFRINGEMENT.  IN NO EVENT SHALL
 * IBM AND/OR THEIR SUPPLIERS BE LIABLE FOR ANY CLAIM,
 * DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR
 * OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE
 * USE OR OTHER DEALINGS IN THE SOFTWARE.
 */

/**
 * \file read_rgba_span_x86.S
 * Optimized routines to transfer pixel data from the framebuffer to a
 * buffer in main memory.
 *
 * \author Ian Romanick <idr@us.ibm.com>
 */

	.file	"read_rgba_span_x86.S"
#if !defined(__MINGW32__) && !defined(__APPLE__) /* this one cries for assyntax.h */
/* Kevin F. Quinn 2nd July 2006
 * Replaced data segment constants with text-segment instructions.
 */
#define LOAD_MASK(mvins,m1,m2) \
	pushl	$0xff00ff00 ;\
	pushl	$0xff00ff00 ;\
	pushl	$0xff00ff00 ;\
	pushl	$0xff00ff00 ;\
	mvins	(%esp), m1 ;\
	pushl	$0x00ff0000 ;\
	pushl	$0x00ff0000 ;\
	pushl	$0x00ff0000 ;\
	pushl	$0x00ff0000 ;\
	mvins	(%esp), m2 ;\
	addl	$32, %esp

/* I implemented these as macros because they appear in several places,
 * and I've tweaked them a number of times.  I got tired of changing every
 * place they appear. :)
 */

#define DO_ONE_PIXEL() \
	movl	(%ebx), %eax ; \
	addl	$4, %ebx ; \
	bswap	%eax /* ARGB -> BGRA */ ; \
	rorl	$8, %eax /* BGRA -> ABGR */ ; \
	movl	%eax, (%ecx) /* ABGR -> R, G, B, A */ ; \
	addl	$4, %ecx

#define DO_ONE_LAST_PIXEL() \
	movl	(%ebx), %eax ; \
	bswap	%eax /* ARGB -> BGRA */ ; \
	rorl	$8, %eax /* BGRA -> ABGR */ ; \
	movl	%eax, (%ecx) /* ABGR -> R, G, B, A */ ;


/**
 * MMX optimized version of the BGRA8888_REV to RGBA copy routine.
 *
 * \warning
 * This function assumes that the caller will issue the EMMS instruction
 * at the correct places.
 */

.globl _generic_read_RGBA_span_BGRA8888_REV_MMX
.hidden _generic_read_RGBA_span_BGRA8888_REV_MMX
	.type _generic_read_RGBA_span_BGRA8888_REV_MMX, @function
_generic_read_RGBA_span_BGRA8888_REV_MMX:
	pushl	%ebx

#ifdef USE_INNER_EMMS
	emms
#endif
	LOAD_MASK(movq,%mm1,%mm2)

	movl	8(%esp), %ebx	/* source pointer */
	movl	16(%esp), %edx	/* number of pixels to copy */
	movl	12(%esp), %ecx	/* destination pointer */

	testl	%edx, %edx
	jle	.L20		/* Bail if there's nothing to do. */

	movl	%ebx, %eax

	negl	%eax
	sarl	$2, %eax
	andl	$1, %eax
	je	.L17

	subl	%eax, %edx
	DO_ONE_PIXEL()
.L17:

	/* Would it be faster to unroll this loop once and process 4 pixels
	 * per pass, instead of just two?
	 */
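
	/* For reference: the mask-and-shift sequence in the loop below swaps
	 * bytes 0 and 2 of each 32-bit pixel while leaving bytes 1 and 3 in
	 * place, two pixels per pass.  A C sketch of the per-pixel operation
	 * (an illustration only; this helper is not part of Mesa):
	 *
	 *     uint32_t swap_bytes_0_and_2(uint32_t p)
	 *     {
	 *         return (p & 0xff00ff00u)           // bytes 1 and 3 stay put
	 *              | ((p >> 16) & 0x000000ffu)   // byte 2 -> byte 0
	 *              | ((p << 16) & 0x00ff0000u);  // byte 0 -> byte 2
	 *     }
	 *
	 * In the naming of DO_ONE_PIXEL() above, p = 0xAARRGGBB becomes
	 * 0xAABBGGRR, i.e. memory order B, G, R, A becomes R, G, B, A.
	 */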

	movl	%edx, %eax
	shrl	%eax
	jmp	.L18
.L19:
	movq	(%ebx), %mm0
	addl	$8, %ebx

	/* These 9 instructions do what PSHUFB (if there were such an
	 * instruction) could do in 1. :(
	 */

	movq	%mm0, %mm3
	movq	%mm0, %mm4

	pand	%mm2, %mm3
	psllq	$16, %mm4
	psrlq	$16, %mm3
	pand	%mm2, %mm4

	pand	%mm1, %mm0
	por	%mm4, %mm3
	por	%mm3, %mm0

	movq	%mm0, (%ecx)
	addl	$8, %ecx
	subl	$1, %eax
.L18:
	jne	.L19

#ifdef USE_INNER_EMMS
	emms
#endif

	/* At this point there are either 1 or 0 pixels remaining to be
	 * converted.  Convert the last pixel, if needed.
	 */

	testl	$1, %edx
	je	.L20

	DO_ONE_LAST_PIXEL()

.L20:
	popl	%ebx
	ret
	.size _generic_read_RGBA_span_BGRA8888_REV_MMX, .-_generic_read_RGBA_span_BGRA8888_REV_MMX


/**
 * SSE optimized version of the BGRA8888_REV to RGBA copy routine.  SSE
 * instructions are only actually used to read data from the framebuffer.
 * In practice, the speed-up is pretty small.
 *
 * \todo
 * Do some more testing and determine if there's any reason to have this
 * function in addition to the MMX version.
 *
 * \warning
 * This function assumes that the caller will issue the EMMS instruction
 * at the correct places.
 */

.globl _generic_read_RGBA_span_BGRA8888_REV_SSE
.hidden _generic_read_RGBA_span_BGRA8888_REV_SSE
	.type _generic_read_RGBA_span_BGRA8888_REV_SSE, @function
_generic_read_RGBA_span_BGRA8888_REV_SSE:
	pushl	%esi
	pushl	%ebx
	pushl	%ebp

#ifdef USE_INNER_EMMS
	emms
#endif

	LOAD_MASK(movq,%mm1,%mm2)

	movl	16(%esp), %ebx	/* source pointer */
	movl	24(%esp), %edx	/* number of pixels to copy */
	movl	20(%esp), %ecx	/* destination pointer */

	testl	%edx, %edx
	jle	.L35		/* Bail if there's nothing to do. */

	movl	%esp, %ebp
	subl	$16, %esp
	andl	$0xfffffff0, %esp

	movl	%ebx, %eax
	movl	%edx, %esi

	negl	%eax
	andl	$15, %eax
	sarl	$2, %eax
	cmpl	%edx, %eax
	cmovle	%eax, %esi

	subl	%esi, %edx

	testl	$1, %esi
	je	.L32

	DO_ONE_PIXEL()
.L32:

	testl	$2, %esi
	je	.L31

	movq	(%ebx), %mm0
	addl	$8, %ebx

	movq	%mm0, %mm3
	movq	%mm0, %mm4

	pand	%mm2, %mm3
	psllq	$16, %mm4
	psrlq	$16, %mm3
	pand	%mm2, %mm4

	pand	%mm1, %mm0
	por	%mm4, %mm3
	por	%mm3, %mm0

	movq	%mm0, (%ecx)
	addl	$8, %ecx
.L31:

	movl	%edx, %eax
	shrl	$2, %eax
	jmp	.L33
.L34:
	movaps	(%ebx), %xmm0
	addl	$16, %ebx

	/* This would be so much better if we could just move directly from
	 * an SSE register to an MMX register.  Unfortunately, that
	 * functionality wasn't introduced until SSE2 with the MOVDQ2Q
	 * instruction.
	 */
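
	/* In C terms, the bounce through the 16-byte scratch block carved
	 * out of the stack above looks roughly like this (a sketch;
	 * `scratch` stands for the aligned block at %esp):
	 *
	 *     unsigned char scratch[16];    // 16-byte aligned
	 *     uint64_t lo, hi;
	 *     memcpy(scratch, src, 16);     // movaps load + movaps store
	 *     memcpy(&lo, scratch + 0, 8);  // movq -> %mm0
	 *     memcpy(&hi, scratch + 8, 8);  // movq -> %mm5
	 *
	 * Each 64-bit half is then swizzled exactly as in the MMX routine.
	 */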

	movaps	%xmm0, (%esp)
	movq	(%esp), %mm0
	movq	8(%esp), %mm5

	movq	%mm0, %mm3
	movq	%mm0, %mm4
	movq	%mm5, %mm6
	movq	%mm5, %mm7

	pand	%mm2, %mm3
	pand	%mm2, %mm6

	psllq	$16, %mm4
	psllq	$16, %mm7

	psrlq	$16, %mm3
	psrlq	$16, %mm6

	pand	%mm2, %mm4
	pand	%mm2, %mm7

	pand	%mm1, %mm0
	pand	%mm1, %mm5

	por	%mm4, %mm3
	por	%mm7, %mm6

	por	%mm3, %mm0
	por	%mm6, %mm5

	movq	%mm0, (%ecx)
	movq	%mm5, 8(%ecx)
	addl	$16, %ecx

	subl	$1, %eax
.L33:
	jne	.L34

#ifdef USE_INNER_EMMS
	emms
#endif
	movl	%ebp, %esp

	/* At this point there are either [0, 3] pixels remaining to be
	 * converted.
	 */

	testl	$2, %edx
	je	.L36

	movq	(%ebx), %mm0
	addl	$8, %ebx

	movq	%mm0, %mm3
	movq	%mm0, %mm4

	pand	%mm2, %mm3
	psllq	$16, %mm4
	psrlq	$16, %mm3
	pand	%mm2, %mm4

	pand	%mm1, %mm0
	por	%mm4, %mm3
	por	%mm3, %mm0

	movq	%mm0, (%ecx)
	addl	$8, %ecx
.L36:

	testl	$1, %edx
	je	.L35

	DO_ONE_LAST_PIXEL()
.L35:
	popl	%ebp
	popl	%ebx
	popl	%esi
	ret
	.size _generic_read_RGBA_span_BGRA8888_REV_SSE, .-_generic_read_RGBA_span_BGRA8888_REV_SSE


/**
 * SSE2 optimized version of the BGRA8888_REV to RGBA copy routine.
 */

	.text
.globl _generic_read_RGBA_span_BGRA8888_REV_SSE2
.hidden _generic_read_RGBA_span_BGRA8888_REV_SSE2
	.type _generic_read_RGBA_span_BGRA8888_REV_SSE2, @function
_generic_read_RGBA_span_BGRA8888_REV_SSE2:
	pushl	%esi
	pushl	%ebx

	LOAD_MASK(movdqu,%xmm1,%xmm2)

	movl	12(%esp), %ebx	/* source pointer */
	movl	20(%esp), %edx	/* number of pixels to copy */
	movl	16(%esp), %ecx	/* destination pointer */

	movl	%ebx, %eax
	movl	%edx, %esi

	testl	%edx, %edx
	jle	.L46		/* Bail if there's nothing to do. */

	/* If the source pointer isn't a multiple of 16 we have to process
	 * a few pixels the "slow" way to get the address aligned for
	 * the SSE fetch instructions.
	 */

	negl	%eax
	andl	$15, %eax
	sarl	$2, %eax

	cmpl	%edx, %eax
	cmovbe	%eax, %esi
	subl	%esi, %edx

	testl	$1, %esi
	je	.L41

	DO_ONE_PIXEL()
.L41:
	testl	$2, %esi
	je	.L40

	movq	(%ebx), %xmm0
	addl	$8, %ebx

	movdqa	%xmm0, %xmm3
	movdqa	%xmm0, %xmm4
	andps	%xmm1, %xmm0

	andps	%xmm2, %xmm3
	pslldq	$2, %xmm4
	psrldq	$2, %xmm3
	andps	%xmm2, %xmm4

	orps	%xmm4, %xmm3
	orps	%xmm3, %xmm0

	movq	%xmm0, (%ecx)
	addl	$8, %ecx
.L40:

	/* Would it be worth having a specialized version of this loop for
	 * the case where the destination is 16-byte aligned?  That version
	 * would be identical except that it could use movdqa instead of
	 * movdqu.
	 */

	movl	%edx, %eax
	shrl	$2, %eax
	jmp	.L42
.L43:
	movdqa	(%ebx), %xmm0
	addl	$16, %ebx

	movdqa	%xmm0, %xmm3
	movdqa	%xmm0, %xmm4
	andps	%xmm1, %xmm0

	andps	%xmm2, %xmm3
	pslldq	$2, %xmm4
	psrldq	$2, %xmm3
	andps	%xmm2, %xmm4

	orps	%xmm4, %xmm3
	orps	%xmm3, %xmm0

	movdqu	%xmm0, (%ecx)
	addl	$16, %ecx
	subl	$1, %eax
.L42:
	jne	.L43


	/* There may be up to 3 pixels remaining to be copied.  Take care
	 * of them now.  We do the 2 pixel case first because the data
	 * will be aligned.
	 */
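
	/* A C sketch of the remainder handling below (illustration only;
	 * `n` stands for the leftover count in %edx, which is 0..3 here,
	 * and swap_bytes_0_and_2() is the hypothetical helper sketched
	 * before the MMX loop above):
	 *
	 *     if (n & 2) {             // two pixels, aligned 8-byte load
	 *         uint64_t q;
	 *         memcpy(&q, src, 8);
	 *         q = swizzle(q);      // same mask/shift trick as the loop
	 *         memcpy(dst, &q, 8);
	 *         src += 8; dst += 8;
	 *     }
	 *     if (n & 1)               // final odd pixel
	 *         *(uint32_t *) dst = swap_bytes_0_and_2(*(uint32_t *) src);
	 */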

	testl	$2, %edx
	je	.L47

	movq	(%ebx), %xmm0
	addl	$8, %ebx

	movdqa	%xmm0, %xmm3
	movdqa	%xmm0, %xmm4
	andps	%xmm1, %xmm0

	andps	%xmm2, %xmm3
	pslldq	$2, %xmm4
	psrldq	$2, %xmm3
	andps	%xmm2, %xmm4

	orps	%xmm4, %xmm3
	orps	%xmm3, %xmm0

	movq	%xmm0, (%ecx)
	addl	$8, %ecx
.L47:

	testl	$1, %edx
	je	.L46

	DO_ONE_LAST_PIXEL()
.L46:

	popl	%ebx
	popl	%esi
	ret
	.size _generic_read_RGBA_span_BGRA8888_REV_SSE2, .-_generic_read_RGBA_span_BGRA8888_REV_SSE2



#define MASK_565_L	0x07e0f800
#define MASK_565_H	0x0000001f
/* Setting SCALE_ADJUST to 5 gives a perfect match with the
 * classic C implementation in Mesa.  Setting SCALE_ADJUST
 * to 0 is slightly faster but at a small cost to accuracy.
 */
#define SCALE_ADJUST	5
#if SCALE_ADJUST == 5
#define PRESCALE_L	0x00100001
#define PRESCALE_H	0x00000200
#define SCALE_L		0x40C620E8
#define SCALE_H		0x0000839d
#elif SCALE_ADJUST == 0
#define PRESCALE_L	0x00200001
#define PRESCALE_H	0x00000800
#define SCALE_L		0x01040108
#define SCALE_H		0x00000108
#else
#error SCALE_ADJUST must either be 5 or 0.
#endif
#define ALPHA_L		0x00000000
#define ALPHA_H		0x00ff0000

/**
 * MMX optimized version of the RGB565 to RGBA copy routine.
 */

	.text
	.globl	_generic_read_RGBA_span_RGB565_MMX
	.hidden	_generic_read_RGBA_span_RGB565_MMX
	.type	_generic_read_RGBA_span_RGB565_MMX, @function

_generic_read_RGBA_span_RGB565_MMX:

#ifdef USE_INNER_EMMS
	emms
#endif

	movl	4(%esp), %eax	/* source pointer */
	movl	8(%esp), %edx	/* destination pointer */
	movl	12(%esp), %ecx	/* number of pixels to copy */

	pushl	$MASK_565_H
	pushl	$MASK_565_L
	movq	(%esp), %mm5
	pushl	$PRESCALE_H
	pushl	$PRESCALE_L
	movq	(%esp), %mm6
	pushl	$SCALE_H
	pushl	$SCALE_L
	movq	(%esp), %mm7
	pushl	$ALPHA_H
	pushl	$ALPHA_L
	movq	(%esp), %mm3
	addl	$32,%esp

	sarl	$2, %ecx
	jl	.L01		/* Bail early if the count is negative. */
	jmp	.L02

.L03:
	/* Fetch 4 RGB565 pixels into %mm4.  Distribute the first and
	 * second pixels into the four words of %mm0 and %mm2.
	 */

	movq	(%eax), %mm4
	addl	$8, %eax

	pshufw	$0x00, %mm4, %mm0
	pshufw	$0x55, %mm4, %mm2


	/* Mask the pixels so that each word of each register contains only
	 * one color component.
	 */

	pand	%mm5, %mm0
	pand	%mm5, %mm2


	/* Adjust the component values so that they are as small as possible,
	 * but large enough so that we can multiply them by an unsigned 16-bit
	 * number and get a value as large as 0x00ff0000.
	 */

	pmullw	%mm6, %mm0
	pmullw	%mm6, %mm2
#if SCALE_ADJUST > 0
	psrlw	$SCALE_ADJUST, %mm0
	psrlw	$SCALE_ADJUST, %mm2
#endif

	/* Scale the input component values to be on the range
	 * [0, 0x00ff0000].  This is the real magic of the whole routine.
	 */

	pmulhuw	%mm7, %mm0
	pmulhuw	%mm7, %mm2


	/* Always set the alpha value to 0xff.
	 */

	por	%mm3, %mm0
	por	%mm3, %mm2


	/* Pack the 16-bit values to 8-bit values and store the converted
	 * pixel data.
	 */
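
	/* For reference: together with the packuswb below, the mask /
	 * pmullw / psrlw / pmulhuw pipeline above converts each RGB565
	 * pixel as in this C sketch (an illustration, not Mesa's actual
	 * code; with SCALE_ADJUST == 5 the output should match the usual
	 * expand-and-replicate formulas shown here):
	 *
	 *     uint32_t rgb565_to_rgba8888(uint16_t p)
	 *     {
	 *         uint32_t r = (p >> 11) & 0x1f;
	 *         uint32_t g = (p >>  5) & 0x3f;
	 *         uint32_t b =  p        & 0x1f;
	 *
	 *         r = (r << 3) | (r >> 2);  // 5 bits -> 8 bits
	 *         g = (g << 2) | (g >> 4);  // 6 bits -> 8 bits
	 *         b = (b << 3) | (b >> 2);  // 5 bits -> 8 bits
	 *
	 *         // little-endian memory order R, G, B, A
	 *         return r | (g << 8) | (b << 16) | 0xff000000u;
	 *     }
	 */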

	packuswb	%mm2, %mm0
	movq	%mm0, (%edx)
	addl	$8, %edx

	pshufw	$0xaa, %mm4, %mm0
	pshufw	$0xff, %mm4, %mm2

	pand	%mm5, %mm0
	pand	%mm5, %mm2
	pmullw	%mm6, %mm0
	pmullw	%mm6, %mm2
#if SCALE_ADJUST > 0
	psrlw	$SCALE_ADJUST, %mm0
	psrlw	$SCALE_ADJUST, %mm2
#endif
	pmulhuw	%mm7, %mm0
	pmulhuw	%mm7, %mm2

	por	%mm3, %mm0
	por	%mm3, %mm2

	packuswb	%mm2, %mm0

	movq	%mm0, (%edx)
	addl	$8, %edx

	subl	$1, %ecx
.L02:
	jne	.L03


	/* At this point there can be at most 3 pixels left to process.  If
	 * there is either 2 or 3 left, process 2.
	 */

	movl	12(%esp), %ecx
	testl	$0x02, %ecx
	je	.L04

	movd	(%eax), %mm4
	addl	$4, %eax

	pshufw	$0x00, %mm4, %mm0
	pshufw	$0x55, %mm4, %mm2

	pand	%mm5, %mm0
	pand	%mm5, %mm2
	pmullw	%mm6, %mm0
	pmullw	%mm6, %mm2
#if SCALE_ADJUST > 0
	psrlw	$SCALE_ADJUST, %mm0
	psrlw	$SCALE_ADJUST, %mm2
#endif
	pmulhuw	%mm7, %mm0
	pmulhuw	%mm7, %mm2

	por	%mm3, %mm0
	por	%mm3, %mm2

	packuswb	%mm2, %mm0

	movq	%mm0, (%edx)
	addl	$8, %edx

.L04:
	/* At this point there can be at most 1 pixel left to process.
	 * Process it if needed.
	 */

	testl	$0x01, %ecx
	je	.L01

	movzwl	(%eax), %ecx
	movd	%ecx, %mm4

	pshufw	$0x00, %mm4, %mm0

	pand	%mm5, %mm0
	pmullw	%mm6, %mm0
#if SCALE_ADJUST > 0
	psrlw	$SCALE_ADJUST, %mm0
#endif
	pmulhuw	%mm7, %mm0

	por	%mm3, %mm0

	packuswb	%mm0, %mm0

	movd	%mm0, (%edx)

.L01:
#ifdef USE_INNER_EMMS
	emms
#endif
	ret
#endif /* !defined(__MINGW32__) && !defined(__APPLE__) */

#if defined (__ELF__) && defined (__linux__)
	.section .note.GNU-stack,"",%progbits
#endif
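
/* From the caller's perspective, all four routines have the same shape.
 * A sketch of the C prototypes (Mesa's actual declarations may differ in
 * the exact pointer types):
 *
 *     void _generic_read_RGBA_span_BGRA8888_REV_MMX(const unsigned char *src,
 *                                                   unsigned char *dest,
 *                                                   unsigned count);
 *
 * with the same signature for the _SSE, _SSE2, and _RGB565_MMX variants.
 * As the \warning notes above say, the routines that touch MMX registers
 * leave the FPU in MMX state unless USE_INNER_EMMS is defined, so the
 * caller is responsible for executing EMMS afterwards.
 */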