/*
 * inffast.S is a hand tuned assembler version of:
 *
 * inffast.c -- fast decoding
 * Copyright (C) 1995-2003 Mark Adler
 * For conditions of distribution and use, see copyright notice in zlib.h
 *
 * Copyright (C) 2003 Chris Anderson <christop@charm.net>
 * Please use the copyright conditions above.
 *
 * This version (Jan-23-2003) of inflate_fast was coded and tested under
 * GNU/Linux on a pentium 3, using the gcc-3.2 compiler distribution. On that
 * machine, I found that gzip style archives decompressed about 20% faster than
 * the gcc-3.2 -O3 -fomit-frame-pointer compiled version. Your results will
 * depend on how large of a buffer is used for z_stream.next_in & next_out
 * (8K-32K worked best for my 256K cpu cache) and how much overhead there is in
 * stream processing I/O and crc32/adler32. In my case, this routine used
 * 70% of the cpu time and crc32 used 20%.
 *
 * I am confident that this version will work in the general case, but I have
 * not tested a wide variety of datasets or a wide variety of platforms.
 *
 * Jan-24-2003 -- Added -DUSE_MMX define for slightly faster inflating.
 * It should be a runtime flag instead of compile time flag...
 *
 * Jan-26-2003 -- Added runtime check for MMX support with cpuid instruction.
 * With -DUSE_MMX, only MMX code is compiled. With -DNO_MMX, only non-MMX code
 * is compiled. Without either option, runtime detection is enabled. Runtime
 * detection should work on all modern cpus and the recommended algorithm (flip
 * ID bit on eflags and then use the cpuid instruction) is used in many
 * multimedia applications. Tested under win2k with gcc-2.95 and gas-2.12
 * distributed with cygwin3. Compiling with gcc-2.95 -c inffast.S -o
 * inffast.obj generates a COFF object which can then be linked with MSVC++
 * compiled code. Tested under FreeBSD 4.7 with gcc-2.95.
 *
 * Jan-28-2003 -- Tested Athlon XP... MMX mode is slower than no MMX (and
 * slower than compiler generated code). Adjusted cpuid check to use the MMX
 * code only for Pentiums < P4 until I have more data on the P4. Speed
 * improvement is only about 15% on the Athlon when compared with code generated
 * with MSVC++. Not sure yet, but I think the P4 will also be slower using the
 * MMX mode because many of its x86 ALU instructions execute in .5 cycles and
 * have less latency than MMX ops. Added code to buffer the last 11 bytes of
 * the input stream since the MMX code grabs bits in chunks of 32, which
 * differs from the inffast.c algorithm. I don't think there would have been
 * read overruns where a page boundary was crossed (a segfault), but there
 * could have been overruns when next_in ends on unaligned memory
 * (uninitialized memory read).
 *
 * Mar-13-2003 -- P4 MMX is slightly slower than P4 NO_MMX. I created a C
 * version of the non-MMX code so that it doesn't depend on zstrm and zstate
 * structure offsets which are hard coded in this file. This was last tested
 * with zlib-1.2.0 which is currently in beta testing, newer versions of this
 * and inffas86.c can be found at http://www.eetbeetee.com/zlib/ and
 * http://www.charm.net/~christop/zlib/
 */


/*
 * Object format selection.
 *
 * Unless the builder forces a format with -DGAS_COFF or -DGAS_ELF, pick one
 * from the compiler's predefined macros.  GAS_COFF makes the exported symbols
 * carry a leading underscore (see the renames keyed on GAS_COFF below);
 * GAS_ELF leaves them undecorated.
 *
 * _WIN32 is tested in addition to WIN32: Windows toolchains predefine _WIN32,
 * while the unprefixed WIN32 spelling is not guaranteed to be present, and
 * missing it would silently select GAS_ELF on a COFF target.
 *
 * if you have underscore linking problems (_inflate_fast undefined), try
 * using -DGAS_COFF
 */
#if ! defined( GAS_COFF ) && ! defined( GAS_ELF )

#if defined( WIN32 ) || defined( _WIN32 ) || defined( __CYGWIN__ )
#define GAS_COFF /* windows object format */
#else
#define GAS_ELF
#endif

#endif /* ! GAS_COFF && !
GAS_ELF */


#ifdef GAS_COFF

/* COFF externals are decorated with a leading underscore */
#define inflate_fast         _inflate_fast
#define inflate_fast_use_mmx _inflate_fast_use_mmx

#endif /* GAS_COFF */


.file "inffast.S"

.globl inflate_fast

/*
 * Error messages stored into strm->msg by the error exits of inflate_fast.
 */
.text
.align 4,0
.L_invalid_literal_length_code_msg:
.string "invalid literal/length code"

.align 4,0
.L_invalid_distance_code_msg:
.string "invalid distance code"

.align 4,0
.L_invalid_distance_too_far_msg:
.string "invalid distance too far back"

#ifndef NO_MMX
/*
 * Bit mask lookup table, only assembled when MMX code is built:
 * mask[N] = ( 1 << N ) - 1, for N = 0 .. 32
 */
.align 4,0
.L_mask:
.long 0x00000000, 0x00000001, 0x00000003, 0x00000007    /* N =  0 ..  3 */
.long 0x0000000f, 0x0000001f, 0x0000003f, 0x0000007f    /* N =  4 ..  7 */
.long 0x000000ff, 0x000001ff, 0x000003ff, 0x000007ff    /* N =  8 .. 11 */
.long 0x00000fff, 0x00001fff, 0x00003fff, 0x00007fff    /* N = 12 .. 15 */
.long 0x0000ffff, 0x0001ffff, 0x0003ffff, 0x0007ffff    /* N = 16 .. 19 */
.long 0x000fffff, 0x001fffff, 0x003fffff, 0x007fffff    /* N = 20 .. 23 */
.long 0x00ffffff, 0x01ffffff, 0x03ffffff, 0x07ffffff    /* N = 24 .. 27 */
.long 0x0fffffff, 0x1fffffff, 0x3fffffff, 0x7fffffff    /* N = 28 .. 31 */
.long 0xffffffff                                        /* N = 32       */
#endif /* NO_MMX */

.text

/*
 * Byte offsets of the struct z_stream fields (zlib.h) used by this file.
 * They are hard coded here rather than derived from the header.
 */
#define next_in_strm    0       /* strm->next_in */
#define avail_in_strm   4       /* strm->avail_in */
#define next_out_strm   12      /* strm->next_out */
#define avail_out_strm  16      /* strm->avail_out */
#define msg_strm        24      /* strm->msg */
#define state_strm      28      /* strm->state */

/*
 * Byte offsets of the struct inflate_state fields (inflate.h) used by this
 * file; hard coded in the same way.
 */
#define mode_state      0       /* state->mode */
#define wsize_state     32      /* state->wsize */
#define write_state     40      /* state->write */
#define window_state    44      /* state->window */
#define hold_state      48      /* state->hold */
#define bits_state      52      /* state->bits */
#define lencode_state   68      /* state->lencode */
#define distcode_state  72      /* state->distcode */
#define lenbits_state   76      /* state->lenbits */
#define distbits_state  80      /* state->distbits */

/*
 * inflate_fast's activation record, addressed from %esp once the prologue
 * has pushed four registers and eflags and reserved local_var_size bytes.
 */
#define local_var_size  64      /* how much local space for vars */
#define strm_sp         88      /* first arg: z_stream * (local_var_size + 24) */
#define start_sp        92      /* second arg: unsigned int (local_var_size + 28) */

/*
 * Stack offsets of the local variables, lowest address first.
 */
#define lmask   0       /* unsigned int */
#define dmask   4       /* unsigned int */
#define lcode   8       /* code* */
#define dcode   12      /* code* */
#define end     16      /* unsigned char* */
#define last    20      /* unsigned char* */
#define len     24      /* unsigned int */
#define buf     28      /* char[ 12 ] */
#define beg     40      /* unsigned char* */
#define in      44      /* unsigned char* */
#define write   48      /* unsigned int */
#define wsize   52      /* unsigned int */
#define window  56      /* unsigned char* */
#define out     60      /* unsigned char* */

/*
 * Values of the inflate_mode enum (inflate.h) that this file stores into
 * state->mode.
 */
#define INFLATE_MODE_TYPE 11    /* state->mode flags enum-ed in inflate.h */
#define INFLATE_MODE_BAD  26


#if ! defined( USE_MMX ) && !
defined( NO_MMX ) 196 197#define RUN_TIME_MMX 198 199#define CHECK_MMX 1 200#define DO_USE_MMX 2 201#define DONT_USE_MMX 3 202 203.globl inflate_fast_use_mmx 204 205.data 206 207.align 4,0 208inflate_fast_use_mmx: /* integer flag for run time control 1=check,2=mmx,3=no */ 209.long CHECK_MMX 210 211#if defined( GAS_ELF ) 212/* elf info */ 213.type inflate_fast_use_mmx,@object 214.size inflate_fast_use_mmx,4 215#endif 216 217#endif /* RUN_TIME_MMX */ 218 219#if defined( GAS_COFF ) 220/* coff info: scl 2 = extern, type 32 = function */ 221.def inflate_fast; .scl 2; .type 32; .endef 222#endif 223 224.text 225 226.align 32,0x90 227inflate_fast: 228 pushl %edi 229 pushl %esi 230 pushl %ebp 231 pushl %ebx 232 pushf /* save eflags (strm_sp, state_sp assumes this is 32 bits) */ 233 subl $local_var_size, %esp 234 cld 235 236#define strm_r %esi 237#define state_r %edi 238 239 movl strm_sp(%esp), strm_r 240 movl state_strm(strm_r), state_r 241 242 /* in = strm->next_in; 243 * out = strm->next_out; 244 * last = in + strm->avail_in - 11; 245 * beg = out - (start - strm->avail_out); 246 * end = out + (strm->avail_out - 257); 247 */ 248 movl avail_in_strm(strm_r), %edx 249 movl next_in_strm(strm_r), %eax 250 251 addl %eax, %edx /* avail_in += next_in */ 252 subl $11, %edx /* avail_in -= 11 */ 253 254 movl %eax, in(%esp) 255 movl %edx, last(%esp) 256 257 movl start_sp(%esp), %ebp 258 movl avail_out_strm(strm_r), %ecx 259 movl next_out_strm(strm_r), %ebx 260 261 subl %ecx, %ebp /* start -= avail_out */ 262 negl %ebp /* start = -start */ 263 addl %ebx, %ebp /* start += next_out */ 264 265 subl $257, %ecx /* avail_out -= 257 */ 266 addl %ebx, %ecx /* avail_out += out */ 267 268 movl %ebx, out(%esp) 269 movl %ebp, beg(%esp) 270 movl %ecx, end(%esp) 271 272 /* wsize = state->wsize; 273 * write = state->write; 274 * window = state->window; 275 * hold = state->hold; 276 * bits = state->bits; 277 * lcode = state->lencode; 278 * dcode = state->distcode; 279 * lmask = ( 1 << state->lenbits ) 
- 1; 280 * dmask = ( 1 << state->distbits ) - 1; 281 */ 282 283 movl lencode_state(state_r), %eax 284 movl distcode_state(state_r), %ecx 285 286 movl %eax, lcode(%esp) 287 movl %ecx, dcode(%esp) 288 289 movl $1, %eax 290 movl lenbits_state(state_r), %ecx 291 shll %cl, %eax 292 decl %eax 293 movl %eax, lmask(%esp) 294 295 movl $1, %eax 296 movl distbits_state(state_r), %ecx 297 shll %cl, %eax 298 decl %eax 299 movl %eax, dmask(%esp) 300 301 movl wsize_state(state_r), %eax 302 movl write_state(state_r), %ecx 303 movl window_state(state_r), %edx 304 305 movl %eax, wsize(%esp) 306 movl %ecx, write(%esp) 307 movl %edx, window(%esp) 308 309 movl hold_state(state_r), %ebp 310 movl bits_state(state_r), %ebx 311 312#undef strm_r 313#undef state_r 314 315#define in_r %esi 316#define from_r %esi 317#define out_r %edi 318 319 movl in(%esp), in_r 320 movl last(%esp), %ecx 321 cmpl in_r, %ecx 322 ja .L_align_long /* if in < last */ 323 324 addl $11, %ecx /* ecx = &in[ avail_in ] */ 325 subl in_r, %ecx /* ecx = avail_in */ 326 movl $12, %eax 327 subl %ecx, %eax /* eax = 12 - avail_in */ 328 leal buf(%esp), %edi 329 rep movsb /* memcpy( buf, in, avail_in ) */ 330 movl %eax, %ecx 331 xorl %eax, %eax 332 rep stosb /* memset( &buf[ avail_in ], 0, 12 - avail_in ) */ 333 leal buf(%esp), in_r /* in = buf */ 334 movl in_r, last(%esp) /* last = in, do just one iteration */ 335 jmp .L_is_aligned 336 337 /* align in_r on long boundary */ 338.L_align_long: 339 testl $3, in_r 340 jz .L_is_aligned 341 xorl %eax, %eax 342 movb (in_r), %al 343 incl in_r 344 movl %ebx, %ecx 345 addl $8, %ebx 346 shll %cl, %eax 347 orl %eax, %ebp 348 jmp .L_align_long 349 350.L_is_aligned: 351 movl out(%esp), out_r 352 353#if defined( NO_MMX ) 354 jmp .L_do_loop 355#endif 356 357#if defined( USE_MMX ) 358 jmp .L_init_mmx 359#endif 360 361/*** Runtime MMX check ***/ 362 363#if defined( RUN_TIME_MMX ) 364.L_check_mmx: 365 cmpl $DO_USE_MMX, inflate_fast_use_mmx 366 je .L_init_mmx 367 ja .L_do_loop /* > 2 */ 368 369 
pushl %eax 370 pushl %ebx 371 pushl %ecx 372 pushl %edx 373 pushf 374 movl (%esp), %eax /* copy eflags to eax */ 375 xorl $0x200000, (%esp) /* try toggling ID bit of eflags (bit 21) 376 * to see if cpu supports cpuid... 377 * ID bit method not supported by NexGen but 378 * bios may load a cpuid instruction and 379 * cpuid may be disabled on Cyrix 5-6x86 */ 380 popf 381 pushf 382 popl %edx /* copy new eflags to edx */ 383 xorl %eax, %edx /* test if ID bit is flipped */ 384 jz .L_dont_use_mmx /* not flipped if zero */ 385 xorl %eax, %eax 386 cpuid 387 cmpl $0x756e6547, %ebx /* check for GenuineIntel in ebx,ecx,edx */ 388 jne .L_dont_use_mmx 389 cmpl $0x6c65746e, %ecx 390 jne .L_dont_use_mmx 391 cmpl $0x49656e69, %edx 392 jne .L_dont_use_mmx 393 movl $1, %eax 394 cpuid /* get cpu features */ 395 shrl $8, %eax 396 andl $15, %eax 397 cmpl $6, %eax /* check for Pentium family, is 0xf for P4 */ 398 jne .L_dont_use_mmx 399 testl $0x800000, %edx /* test if MMX feature is set (bit 23) */ 400 jnz .L_use_mmx 401 jmp .L_dont_use_mmx 402.L_use_mmx: 403 movl $DO_USE_MMX, inflate_fast_use_mmx 404 jmp .L_check_mmx_pop 405.L_dont_use_mmx: 406 movl $DONT_USE_MMX, inflate_fast_use_mmx 407.L_check_mmx_pop: 408 popl %edx 409 popl %ecx 410 popl %ebx 411 popl %eax 412 jmp .L_check_mmx 413#endif 414 415 416/*** Non-MMX code ***/ 417 418#if defined ( NO_MMX ) || defined( RUN_TIME_MMX ) 419 420#define hold_r %ebp 421#define bits_r %bl 422#define bitslong_r %ebx 423 424.align 32,0x90 425.L_while_test: 426 /* while (in < last && out < end) 427 */ 428 cmpl out_r, end(%esp) 429 jbe .L_break_loop /* if (out >= end) */ 430 431 cmpl in_r, last(%esp) 432 jbe .L_break_loop 433 434.L_do_loop: 435 /* regs: %esi = in, %ebp = hold, %bl = bits, %edi = out 436 * 437 * do { 438 * if (bits < 15) { 439 * hold |= *((unsigned short *)in)++ << bits; 440 * bits += 16 441 * } 442 * this = lcode[hold & lmask] 443 */ 444 cmpb $15, bits_r 445 ja .L_get_length_code /* if (15 < bits) */ 446 447 xorl %eax, %eax 448 
lodsw /* al = *(ushort *)in++ */ 449 movb bits_r, %cl /* cl = bits, needs it for shifting */ 450 addb $16, bits_r /* bits += 16 */ 451 shll %cl, %eax 452 orl %eax, hold_r /* hold |= *((ushort *)in)++ << bits */ 453 454.L_get_length_code: 455 movl lmask(%esp), %edx /* edx = lmask */ 456 movl lcode(%esp), %ecx /* ecx = lcode */ 457 andl hold_r, %edx /* edx &= hold */ 458 movl (%ecx,%edx,4), %eax /* eax = lcode[hold & lmask] */ 459 460.L_dolen: 461 /* regs: %esi = in, %ebp = hold, %bl = bits, %edi = out 462 * 463 * dolen: 464 * bits -= this.bits; 465 * hold >>= this.bits 466 */ 467 movb %ah, %cl /* cl = this.bits */ 468 subb %ah, bits_r /* bits -= this.bits */ 469 shrl %cl, hold_r /* hold >>= this.bits */ 470 471 /* check if op is a literal 472 * if (op == 0) { 473 * PUP(out) = this.val; 474 * } 475 */ 476 testb %al, %al 477 jnz .L_test_for_length_base /* if (op != 0) 45.7% */ 478 479 shrl $16, %eax /* output this.val char */ 480 stosb 481 jmp .L_while_test 482 483.L_test_for_length_base: 484 /* regs: %esi = in, %ebp = hold, %bl = bits, %edi = out, %edx = len 485 * 486 * else if (op & 16) { 487 * len = this.val 488 * op &= 15 489 * if (op) { 490 * if (op > bits) { 491 * hold |= *((unsigned short *)in)++ << bits; 492 * bits += 16 493 * } 494 * len += hold & mask[op]; 495 * bits -= op; 496 * hold >>= op; 497 * } 498 */ 499#define len_r %edx 500 movl %eax, len_r /* len = this */ 501 shrl $16, len_r /* len = this.val */ 502 movb %al, %cl 503 504 testb $16, %al 505 jz .L_test_for_second_level_length /* if ((op & 16) == 0) 8% */ 506 andb $15, %cl /* op &= 15 */ 507 jz .L_save_len /* if (!op) */ 508 cmpb %cl, bits_r 509 jae .L_add_bits_to_len /* if (op <= bits) */ 510 511 movb %cl, %ch /* stash op in ch, freeing cl */ 512 xorl %eax, %eax 513 lodsw /* al = *(ushort *)in++ */ 514 movb bits_r, %cl /* cl = bits, needs it for shifting */ 515 addb $16, bits_r /* bits += 16 */ 516 shll %cl, %eax 517 orl %eax, hold_r /* hold |= *((ushort *)in)++ << bits */ 518 movb %ch, %cl /* move 
op back to ecx */ 519 520.L_add_bits_to_len: 521 movl $1, %eax 522 shll %cl, %eax 523 decl %eax 524 subb %cl, bits_r 525 andl hold_r, %eax /* eax &= hold */ 526 shrl %cl, hold_r 527 addl %eax, len_r /* len += hold & mask[op] */ 528 529.L_save_len: 530 movl len_r, len(%esp) /* save len */ 531#undef len_r 532 533.L_decode_distance: 534 /* regs: %esi = in, %ebp = hold, %bl = bits, %edi = out, %edx = dist 535 * 536 * if (bits < 15) { 537 * hold |= *((unsigned short *)in)++ << bits; 538 * bits += 16 539 * } 540 * this = dcode[hold & dmask]; 541 * dodist: 542 * bits -= this.bits; 543 * hold >>= this.bits; 544 * op = this.op; 545 */ 546 547 cmpb $15, bits_r 548 ja .L_get_distance_code /* if (15 < bits) */ 549 550 xorl %eax, %eax 551 lodsw /* al = *(ushort *)in++ */ 552 movb bits_r, %cl /* cl = bits, needs it for shifting */ 553 addb $16, bits_r /* bits += 16 */ 554 shll %cl, %eax 555 orl %eax, hold_r /* hold |= *((ushort *)in)++ << bits */ 556 557.L_get_distance_code: 558 movl dmask(%esp), %edx /* edx = dmask */ 559 movl dcode(%esp), %ecx /* ecx = dcode */ 560 andl hold_r, %edx /* edx &= hold */ 561 movl (%ecx,%edx,4), %eax /* eax = dcode[hold & dmask] */ 562 563#define dist_r %edx 564.L_dodist: 565 movl %eax, dist_r /* dist = this */ 566 shrl $16, dist_r /* dist = this.val */ 567 movb %ah, %cl 568 subb %ah, bits_r /* bits -= this.bits */ 569 shrl %cl, hold_r /* hold >>= this.bits */ 570 571 /* if (op & 16) { 572 * dist = this.val 573 * op &= 15 574 * if (op > bits) { 575 * hold |= *((unsigned short *)in)++ << bits; 576 * bits += 16 577 * } 578 * dist += hold & mask[op]; 579 * bits -= op; 580 * hold >>= op; 581 */ 582 movb %al, %cl /* cl = this.op */ 583 584 testb $16, %al /* if ((op & 16) == 0) */ 585 jz .L_test_for_second_level_dist 586 andb $15, %cl /* op &= 15 */ 587 jz .L_check_dist_one 588 cmpb %cl, bits_r 589 jae .L_add_bits_to_dist /* if (op <= bits) 97.6% */ 590 591 movb %cl, %ch /* stash op in ch, freeing cl */ 592 xorl %eax, %eax 593 lodsw /* al = *(ushort 
*)in++ */ 594 movb bits_r, %cl /* cl = bits, needs it for shifting */ 595 addb $16, bits_r /* bits += 16 */ 596 shll %cl, %eax 597 orl %eax, hold_r /* hold |= *((ushort *)in)++ << bits */ 598 movb %ch, %cl /* move op back to ecx */ 599 600.L_add_bits_to_dist: 601 movl $1, %eax 602 shll %cl, %eax 603 decl %eax /* (1 << op) - 1 */ 604 subb %cl, bits_r 605 andl hold_r, %eax /* eax &= hold */ 606 shrl %cl, hold_r 607 addl %eax, dist_r /* dist += hold & ((1 << op) - 1) */ 608 jmp .L_check_window 609 610.L_check_window: 611 /* regs: %esi = from, %ebp = hold, %bl = bits, %edi = out, %edx = dist 612 * %ecx = nbytes 613 * 614 * nbytes = out - beg; 615 * if (dist <= nbytes) { 616 * from = out - dist; 617 * do { 618 * PUP(out) = PUP(from); 619 * } while (--len > 0) { 620 * } 621 */ 622 623 movl in_r, in(%esp) /* save in so from can use it's reg */ 624 movl out_r, %eax 625 subl beg(%esp), %eax /* nbytes = out - beg */ 626 627 cmpl dist_r, %eax 628 jb .L_clip_window /* if (dist > nbytes) 4.2% */ 629 630 movl len(%esp), %ecx 631 movl out_r, from_r 632 subl dist_r, from_r /* from = out - dist */ 633 634 subl $3, %ecx 635 movb (from_r), %al 636 movb %al, (out_r) 637 movb 1(from_r), %al 638 movb 2(from_r), %dl 639 addl $3, from_r 640 movb %al, 1(out_r) 641 movb %dl, 2(out_r) 642 addl $3, out_r 643 rep movsb 644 645 movl in(%esp), in_r /* move in back to %esi, toss from */ 646 jmp .L_while_test 647 648.align 16,0x90 649.L_check_dist_one: 650 cmpl $1, dist_r 651 jne .L_check_window 652 cmpl out_r, beg(%esp) 653 je .L_check_window 654 655 decl out_r 656 movl len(%esp), %ecx 657 movb (out_r), %al 658 subl $3, %ecx 659 660 movb %al, 1(out_r) 661 movb %al, 2(out_r) 662 movb %al, 3(out_r) 663 addl $4, out_r 664 rep stosb 665 666 jmp .L_while_test 667 668.align 16,0x90 669.L_test_for_second_level_length: 670 /* else if ((op & 64) == 0) { 671 * this = lcode[this.val + (hold & mask[op])]; 672 * } 673 */ 674 testb $64, %al 675 jnz .L_test_for_end_of_block /* if ((op & 64) != 0) */ 676 677 
movl $1, %eax 678 shll %cl, %eax 679 decl %eax 680 andl hold_r, %eax /* eax &= hold */ 681 addl %edx, %eax /* eax += this.val */ 682 movl lcode(%esp), %edx /* edx = lcode */ 683 movl (%edx,%eax,4), %eax /* eax = lcode[val + (hold&mask[op])] */ 684 jmp .L_dolen 685 686.align 16,0x90 687.L_test_for_second_level_dist: 688 /* else if ((op & 64) == 0) { 689 * this = dcode[this.val + (hold & mask[op])]; 690 * } 691 */ 692 testb $64, %al 693 jnz .L_invalid_distance_code /* if ((op & 64) != 0) */ 694 695 movl $1, %eax 696 shll %cl, %eax 697 decl %eax 698 andl hold_r, %eax /* eax &= hold */ 699 addl %edx, %eax /* eax += this.val */ 700 movl dcode(%esp), %edx /* edx = dcode */ 701 movl (%edx,%eax,4), %eax /* eax = dcode[val + (hold&mask[op])] */ 702 jmp .L_dodist 703 704.align 16,0x90 705.L_clip_window: 706 /* regs: %esi = from, %ebp = hold, %bl = bits, %edi = out, %edx = dist 707 * %ecx = nbytes 708 * 709 * else { 710 * if (dist > wsize) { 711 * invalid distance 712 * } 713 * from = window; 714 * nbytes = dist - nbytes; 715 * if (write == 0) { 716 * from += wsize - nbytes; 717 */ 718#define nbytes_r %ecx 719 movl %eax, nbytes_r 720 movl wsize(%esp), %eax /* prepare for dist compare */ 721 negl nbytes_r /* nbytes = -nbytes */ 722 movl window(%esp), from_r /* from = window */ 723 724 cmpl dist_r, %eax 725 jb .L_invalid_distance_too_far /* if (dist > wsize) */ 726 727 addl dist_r, nbytes_r /* nbytes = dist - nbytes */ 728 cmpl $0, write(%esp) 729 jne .L_wrap_around_window /* if (write != 0) */ 730 731 subl nbytes_r, %eax 732 addl %eax, from_r /* from += wsize - nbytes */ 733 734 /* regs: %esi = from, %ebp = hold, %bl = bits, %edi = out, %edx = dist 735 * %ecx = nbytes, %eax = len 736 * 737 * if (nbytes < len) { 738 * len -= nbytes; 739 * do { 740 * PUP(out) = PUP(from); 741 * } while (--nbytes); 742 * from = out - dist; 743 * } 744 * } 745 */ 746#define len_r %eax 747 movl len(%esp), len_r 748 cmpl nbytes_r, len_r 749 jbe .L_do_copy1 /* if (nbytes >= len) */ 750 751 subl 
nbytes_r, len_r /* len -= nbytes */ 752 rep movsb 753 movl out_r, from_r 754 subl dist_r, from_r /* from = out - dist */ 755 jmp .L_do_copy1 756 757 cmpl nbytes_r, len_r 758 jbe .L_do_copy1 /* if (nbytes >= len) */ 759 760 subl nbytes_r, len_r /* len -= nbytes */ 761 rep movsb 762 movl out_r, from_r 763 subl dist_r, from_r /* from = out - dist */ 764 jmp .L_do_copy1 765 766.L_wrap_around_window: 767 /* regs: %esi = from, %ebp = hold, %bl = bits, %edi = out, %edx = dist 768 * %ecx = nbytes, %eax = write, %eax = len 769 * 770 * else if (write < nbytes) { 771 * from += wsize + write - nbytes; 772 * nbytes -= write; 773 * if (nbytes < len) { 774 * len -= nbytes; 775 * do { 776 * PUP(out) = PUP(from); 777 * } while (--nbytes); 778 * from = window; 779 * nbytes = write; 780 * if (nbytes < len) { 781 * len -= nbytes; 782 * do { 783 * PUP(out) = PUP(from); 784 * } while(--nbytes); 785 * from = out - dist; 786 * } 787 * } 788 * } 789 */ 790#define write_r %eax 791 movl write(%esp), write_r 792 cmpl write_r, nbytes_r 793 jbe .L_contiguous_in_window /* if (write >= nbytes) */ 794 795 addl wsize(%esp), from_r 796 addl write_r, from_r 797 subl nbytes_r, from_r /* from += wsize + write - nbytes */ 798 subl write_r, nbytes_r /* nbytes -= write */ 799#undef write_r 800 801 movl len(%esp), len_r 802 cmpl nbytes_r, len_r 803 jbe .L_do_copy1 /* if (nbytes >= len) */ 804 805 subl nbytes_r, len_r /* len -= nbytes */ 806 rep movsb 807 movl window(%esp), from_r /* from = window */ 808 movl write(%esp), nbytes_r /* nbytes = write */ 809 cmpl nbytes_r, len_r 810 jbe .L_do_copy1 /* if (nbytes >= len) */ 811 812 subl nbytes_r, len_r /* len -= nbytes */ 813 rep movsb 814 movl out_r, from_r 815 subl dist_r, from_r /* from = out - dist */ 816 jmp .L_do_copy1 817 818.L_contiguous_in_window: 819 /* regs: %esi = from, %ebp = hold, %bl = bits, %edi = out, %edx = dist 820 * %ecx = nbytes, %eax = write, %eax = len 821 * 822 * else { 823 * from += write - nbytes; 824 * if (nbytes < len) { 825 * len -= 
nbytes; 826 * do { 827 * PUP(out) = PUP(from); 828 * } while (--nbytes); 829 * from = out - dist; 830 * } 831 * } 832 */ 833#define write_r %eax 834 addl write_r, from_r 835 subl nbytes_r, from_r /* from += write - nbytes */ 836#undef write_r 837 838 movl len(%esp), len_r 839 cmpl nbytes_r, len_r 840 jbe .L_do_copy1 /* if (nbytes >= len) */ 841 842 subl nbytes_r, len_r /* len -= nbytes */ 843 rep movsb 844 movl out_r, from_r 845 subl dist_r, from_r /* from = out - dist */ 846 847.L_do_copy1: 848 /* regs: %esi = from, %esi = in, %ebp = hold, %bl = bits, %edi = out 849 * %eax = len 850 * 851 * while (len > 0) { 852 * PUP(out) = PUP(from); 853 * len--; 854 * } 855 * } 856 * } while (in < last && out < end); 857 */ 858#undef nbytes_r 859#define in_r %esi 860 movl len_r, %ecx 861 rep movsb 862 863 movl in(%esp), in_r /* move in back to %esi, toss from */ 864 jmp .L_while_test 865 866#undef len_r 867#undef dist_r 868 869#endif /* NO_MMX || RUN_TIME_MMX */ 870 871 872/*** MMX code ***/ 873 874#if defined( USE_MMX ) || defined( RUN_TIME_MMX ) 875 876.align 32,0x90 877.L_init_mmx: 878 emms 879 880#undef bits_r 881#undef bitslong_r 882#define bitslong_r %ebp 883#define hold_mm %mm0 884 movd %ebp, hold_mm 885 movl %ebx, bitslong_r 886 887#define used_mm %mm1 888#define dmask2_mm %mm2 889#define lmask2_mm %mm3 890#define lmask_mm %mm4 891#define dmask_mm %mm5 892#define tmp_mm %mm6 893 894 movd lmask(%esp), lmask_mm 895 movq lmask_mm, lmask2_mm 896 movd dmask(%esp), dmask_mm 897 movq dmask_mm, dmask2_mm 898 pxor used_mm, used_mm 899 movl lcode(%esp), %ebx /* ebx = lcode */ 900 jmp .L_do_loop_mmx 901 902.align 32,0x90 903.L_while_test_mmx: 904 /* while (in < last && out < end) 905 */ 906 cmpl out_r, end(%esp) 907 jbe .L_break_loop /* if (out >= end) */ 908 909 cmpl in_r, last(%esp) 910 jbe .L_break_loop 911 912.L_do_loop_mmx: 913 psrlq used_mm, hold_mm /* hold_mm >>= last bit length */ 914 915 cmpl $32, bitslong_r 916 ja .L_get_length_code_mmx /* if (32 < bits) */ 917 918 movd 
bitslong_r, tmp_mm 919 movd (in_r), %mm7 920 addl $4, in_r 921 psllq tmp_mm, %mm7 922 addl $32, bitslong_r 923 por %mm7, hold_mm /* hold_mm |= *((uint *)in)++ << bits */ 924 925.L_get_length_code_mmx: 926 pand hold_mm, lmask_mm 927 movd lmask_mm, %eax 928 movq lmask2_mm, lmask_mm 929 movl (%ebx,%eax,4), %eax /* eax = lcode[hold & lmask] */ 930 931.L_dolen_mmx: 932 movzbl %ah, %ecx /* ecx = this.bits */ 933 movd %ecx, used_mm 934 subl %ecx, bitslong_r /* bits -= this.bits */ 935 936 testb %al, %al 937 jnz .L_test_for_length_base_mmx /* if (op != 0) 45.7% */ 938 939 shrl $16, %eax /* output this.val char */ 940 stosb 941 jmp .L_while_test_mmx 942 943.L_test_for_length_base_mmx: 944#define len_r %edx 945 movl %eax, len_r /* len = this */ 946 shrl $16, len_r /* len = this.val */ 947 948 testb $16, %al 949 jz .L_test_for_second_level_length_mmx /* if ((op & 16) == 0) 8% */ 950 andl $15, %eax /* op &= 15 */ 951 jz .L_decode_distance_mmx /* if (!op) */ 952 953 psrlq used_mm, hold_mm /* hold_mm >>= last bit length */ 954 movd %eax, used_mm 955 movd hold_mm, %ecx 956 subl %eax, bitslong_r 957 andl .L_mask(,%eax,4), %ecx 958 addl %ecx, len_r /* len += hold & mask[op] */ 959 960.L_decode_distance_mmx: 961 psrlq used_mm, hold_mm /* hold_mm >>= last bit length */ 962 963 cmpl $32, bitslong_r 964 ja .L_get_dist_code_mmx /* if (32 < bits) */ 965 966 movd bitslong_r, tmp_mm 967 movd (in_r), %mm7 968 addl $4, in_r 969 psllq tmp_mm, %mm7 970 addl $32, bitslong_r 971 por %mm7, hold_mm /* hold_mm |= *((uint *)in)++ << bits */ 972 973.L_get_dist_code_mmx: 974 movl dcode(%esp), %ebx /* ebx = dcode */ 975 pand hold_mm, dmask_mm 976 movd dmask_mm, %eax 977 movq dmask2_mm, dmask_mm 978 movl (%ebx,%eax,4), %eax /* eax = dcode[hold & lmask] */ 979 980.L_dodist_mmx: 981#define dist_r %ebx 982 movzbl %ah, %ecx /* ecx = this.bits */ 983 movl %eax, dist_r 984 shrl $16, dist_r /* dist = this.val */ 985 subl %ecx, bitslong_r /* bits -= this.bits */ 986 movd %ecx, used_mm 987 988 testb $16, %al /* 
if ((op & 16) == 0) */ 989 jz .L_test_for_second_level_dist_mmx 990 andl $15, %eax /* op &= 15 */ 991 jz .L_check_dist_one_mmx 992 993.L_add_bits_to_dist_mmx: 994 psrlq used_mm, hold_mm /* hold_mm >>= last bit length */ 995 movd %eax, used_mm /* save bit length of current op */ 996 movd hold_mm, %ecx /* get the next bits on input stream */ 997 subl %eax, bitslong_r /* bits -= op bits */ 998 andl .L_mask(,%eax,4), %ecx /* ecx = hold & mask[op] */ 999 addl %ecx, dist_r /* dist += hold & mask[op] */ 1000 1001.L_check_window_mmx: 1002 movl in_r, in(%esp) /* save in so from can use it's reg */ 1003 movl out_r, %eax 1004 subl beg(%esp), %eax /* nbytes = out - beg */ 1005 1006 cmpl dist_r, %eax 1007 jb .L_clip_window_mmx /* if (dist > nbytes) 4.2% */ 1008 1009 movl len_r, %ecx 1010 movl out_r, from_r 1011 subl dist_r, from_r /* from = out - dist */ 1012 1013 subl $3, %ecx 1014 movb (from_r), %al 1015 movb %al, (out_r) 1016 movb 1(from_r), %al 1017 movb 2(from_r), %dl 1018 addl $3, from_r 1019 movb %al, 1(out_r) 1020 movb %dl, 2(out_r) 1021 addl $3, out_r 1022 rep movsb 1023 1024 movl in(%esp), in_r /* move in back to %esi, toss from */ 1025 movl lcode(%esp), %ebx /* move lcode back to %ebx, toss dist */ 1026 jmp .L_while_test_mmx 1027 1028.align 16,0x90 1029.L_check_dist_one_mmx: 1030 cmpl $1, dist_r 1031 jne .L_check_window_mmx 1032 cmpl out_r, beg(%esp) 1033 je .L_check_window_mmx 1034 1035 decl out_r 1036 movl len_r, %ecx 1037 movb (out_r), %al 1038 subl $3, %ecx 1039 1040 movb %al, 1(out_r) 1041 movb %al, 2(out_r) 1042 movb %al, 3(out_r) 1043 addl $4, out_r 1044 rep stosb 1045 1046 movl lcode(%esp), %ebx /* move lcode back to %ebx, toss dist */ 1047 jmp .L_while_test_mmx 1048 1049.align 16,0x90 1050.L_test_for_second_level_length_mmx: 1051 testb $64, %al 1052 jnz .L_test_for_end_of_block /* if ((op & 64) != 0) */ 1053 1054 andl $15, %eax 1055 psrlq used_mm, hold_mm /* hold_mm >>= last bit length */ 1056 movd hold_mm, %ecx 1057 andl .L_mask(,%eax,4), %ecx 1058 addl 
len_r, %ecx                             /* (operands of the instruction begun on the preceding line) */
        movl    (%ebx,%ecx,4), %eax     /* eax = lcode[hold & lmask] */
        jmp     .L_dolen_mmx

.align 16,0x90
/*
 * Second-level distance table lookup (MMX path).
 *
 * %al holds op.  If bit 6 of op is set the distance code is invalid.
 * Otherwise the low four bits of op select an entry of .L_mask, the masked
 * low bits of hold are added to dist_r, and the sum indexes dcode (4-byte
 * entries).  The new table entry is left in %eax for .L_dodist_mmx.
 */
.L_test_for_second_level_dist_mmx:
        testb   $64, %al
        jnz     .L_invalid_distance_code /* if ((op & 64) != 0) */

        andl    $15, %eax               /* eax = op & 15 */
        psrlq   used_mm, hold_mm        /* hold_mm >>= last bit length */
        movd    hold_mm, %ecx           /* ecx = low 32 bits of hold */
        andl    .L_mask(,%eax,4), %ecx  /* ecx &= .L_mask[op & 15] */
        movl    dcode(%esp), %eax       /* eax = dcode */
        addl    dist_r, %ecx            /* ecx += dist */
        movl    (%eax,%ecx,4), %eax     /* eax = dcode[dist + (hold & mask)] */
        jmp     .L_dodist_mmx

.align 16,0x90
/*
 * Match source reaches into the window (MMX path).
 *
 * Entered with the initial nbytes value in %eax.  Computes
 * nbytes = dist - nbytes, bails out if dist > wsize, and then selects the
 * source pointer (from_r) inside the window.  Each "rep movsb" below copies
 * nbytes_r (%ecx) bytes from from_r to out_r.  Whatever part of len is left
 * after the window portion is copied from out - dist at .L_do_copy1_mmx.
 */
.L_clip_window_mmx:
#define nbytes_r %ecx
        movl    %eax, nbytes_r
        movl    wsize(%esp), %eax       /* prepare for dist compare */
        negl    nbytes_r                /* nbytes = -nbytes */
        movl    window(%esp), from_r    /* from = window */

        cmpl    dist_r, %eax
        jb      .L_invalid_distance_too_far /* if (dist > wsize) */

        addl    dist_r, nbytes_r        /* nbytes = dist - nbytes */
        cmpl    $0, write(%esp)
        jne     .L_wrap_around_window_mmx /* if (write != 0) */

        /* write == 0: source is the last nbytes bytes of the window */
        subl    nbytes_r, %eax
        addl    %eax, from_r            /* from += wsize - nbytes */

        cmpl    nbytes_r, len_r
        jbe     .L_do_copy1_mmx         /* if (nbytes >= len) */

        subl    nbytes_r, len_r         /* len -= nbytes */
        rep movsb
        movl    out_r, from_r
        subl    dist_r, from_r          /* from = out - dist */
        jmp     .L_do_copy1_mmx

/*
 * write != 0.  If write >= nbytes the source is one contiguous run ending
 * at window + write.  Otherwise it is split: the first (nbytes - write)
 * bytes are the tail of the window (ending at window + wsize), followed by
 * up to write bytes from the start of the window.
 */
.L_wrap_around_window_mmx:
#define write_r %eax
        movl    write(%esp), write_r
        cmpl    write_r, nbytes_r
        jbe     .L_contiguous_in_window_mmx /* if (write >= nbytes) */

        addl    wsize(%esp), from_r
        addl    write_r, from_r
        subl    nbytes_r, from_r        /* from += wsize + write - nbytes */
        subl    write_r, nbytes_r       /* nbytes -= write */
#undef write_r

        cmpl    nbytes_r, len_r
        jbe     .L_do_copy1_mmx         /* if (nbytes >= len) */

        subl    nbytes_r, len_r         /* len -= nbytes */
        rep movsb                       /* copy the tail of the window */
        movl    window(%esp), from_r    /* from = window */
        movl    write(%esp), nbytes_r   /* nbytes = write */
        cmpl    nbytes_r, len_r
        jbe     .L_do_copy1_mmx         /* if (nbytes >= len) */

        subl    nbytes_r, len_r         /* len -= nbytes */
        rep movsb                       /* copy from the start of the window */
        movl    out_r, from_r
        subl    dist_r, from_r          /* from = out - dist */
        jmp     .L_do_copy1_mmx

.L_contiguous_in_window_mmx:
#define write_r %eax
        addl    write_r, from_r
        subl    nbytes_r, from_r        /* from += write - nbytes */
#undef write_r

        cmpl    nbytes_r, len_r
        jbe     .L_do_copy1_mmx         /* if (nbytes >= len) */

        subl    nbytes_r, len_r         /* len -= nbytes */
        rep movsb
        movl    out_r, from_r
        subl    dist_r, from_r          /* from = out - dist */

/*
 * Copy the remaining len bytes from from_r to out_r, then restore the
 * registers that were borrowed for the copy and resume the decode loop.
 */
.L_do_copy1_mmx:
#undef nbytes_r
#define in_r %esi
        movl    len_r, %ecx
        rep movsb

        movl    in(%esp), in_r          /* move in back to %esi, toss from */
        movl    lcode(%esp), %ebx       /* move lcode back to %ebx, toss dist */
        jmp     .L_while_test_mmx

#undef hold_r
#undef bitslong_r

#endif /* USE_MMX || RUN_TIME_MMX */


/*** USE_MMX, NO_MMX, and RUN_TIME_MMX from here on ***/

/*
 * Exit paths.  Each one loads %ecx with the message pointer (0 for none)
 * and %edx with the new state->mode, then goes to .L_update_stream_state.
 */

.L_invalid_distance_code:
        /* else {
         *   strm->msg = "invalid distance code";
         *   state->mode = BAD;
         * }
         */
        movl    $.L_invalid_distance_code_msg, %ecx
        movl    $INFLATE_MODE_BAD, %edx
        jmp     .L_update_stream_state

.L_test_for_end_of_block:
        /* else if (op & 32) {
         *   state->mode = TYPE;
         *   break;
         * }
         */
        testb   $32, %al
        jz      .L_invalid_literal_length_code /* if ((op & 32) == 0) */

        movl    $0, %ecx                /* no message: strm->msg is left untouched */
        movl    $INFLATE_MODE_TYPE, %edx
        jmp     .L_update_stream_state

.L_invalid_literal_length_code:
        /* else {
         *   strm->msg = "invalid literal/length code";
         *   state->mode = BAD;
         * }
         */
        movl    $.L_invalid_literal_length_code_msg, %ecx
        movl    $INFLATE_MODE_BAD, %edx
        jmp     .L_update_stream_state

.L_invalid_distance_too_far:
        /* strm->msg = "invalid distance too far back";
         * state->mode = BAD;
         */
        movl    in(%esp), in_r          /* from_r has in's reg, put in back */
        movl    $.L_invalid_distance_too_far_msg, %ecx
        movl    $INFLATE_MODE_BAD, %edx
        jmp     .L_update_stream_state

.L_update_stream_state:
        /* set strm->msg = %ecx (only when non-zero), strm->state->mode = %edx */
        movl    strm_sp(%esp), %eax
        testl   %ecx, %ecx              /* if (msg != NULL) */
        jz      .L_skip_msg
        movl    %ecx, msg_strm(%eax)    /* strm->msg = msg */
.L_skip_msg:
        movl    state_strm(%eax), %eax  /* state = strm->state */
        movl    %edx, mode_state(%eax)  /* state->mode = edx (BAD | TYPE) */
        jmp     .L_break_loop           /* jump over the alignment padding */

.align 32,0x90
.L_break_loop:

/*
 * Write the decoder state back to strm / state and return.
 *
 * Regs:
 *
 * bits = %ebp when mmx, and in %ebx when non-mmx
 * hold = %hold_mm when mmx, and in %ebp when non-mmx
 * in   = %esi
 * out  = %edi
 */

#if defined( USE_MMX ) || defined( RUN_TIME_MMX )

#if defined( RUN_TIME_MMX )

        cmpl    $DO_USE_MMX, inflate_fast_use_mmx
        jne     .L_update_next_in

#endif /* RUN_TIME_MMX */

        movl    %ebp, %ebx              /* mmx: move bits into %ebx, as in the non-mmx case */

.L_update_next_in:

#endif /* USE_MMX || RUN_TIME_MMX */

#define strm_r  %eax
#define state_r %edx

        /* len = bits >> 3;
         * in -= len;
         * bits -= len << 3;
         * hold &= (1U << bits) - 1;
         * state->hold = hold;
         * state->bits = bits;
         * strm->next_in = in;
         * strm->next_out = out;
         */
        movl    strm_sp(%esp), strm_r
        movl    %ebx, %ecx
        movl    state_strm(strm_r), state_r
        shrl    $3, %ecx                /* ecx = len = bits >> 3 */
        subl    %ecx, in_r              /* in -= len */
        shll    $3, %ecx
        subl    %ecx, %ebx              /* bits -= len << 3 */
        movl    out_r, next_out_strm(strm_r)
        movl    %ebx, bits_state(state_r)
        movl    %ebx, %ecx              /* keep bits in %cl for the mask shift below */

        /*
         * If last points at the local buf, in is an address inside buf:
         * rebase in onto strm->next_in and recompute last from the stream.
         */
        leal    buf(%esp), %ebx
        cmpl    %ebx, last(%esp)
        jne     .L_buf_not_used         /* if buf != last */

        subl    %ebx, in_r              /* in -= buf */
        movl    next_in_strm(strm_r), %ebx
        movl    %ebx, last(%esp)        /* last = strm->next_in */
        addl    %ebx, in_r              /* in += strm->next_in */
        movl    avail_in_strm(strm_r), %ebx
        subl    $11, %ebx
        addl    %ebx, last(%esp)        /* last = &strm->next_in[ avail_in - 11 ] */

.L_buf_not_used:
        movl    in_r, next_in_strm(strm_r)

        movl    $1, %ebx
        shll    %cl, %ebx
        decl    %ebx                    /* ebx = (1U << bits) - 1 */

#if defined( USE_MMX ) || defined( RUN_TIME_MMX )

#if defined( RUN_TIME_MMX )

        cmpl    $DO_USE_MMX, inflate_fast_use_mmx
        jne     .L_update_hold

#endif /* RUN_TIME_MMX */

        psrlq   used_mm, hold_mm        /* hold_mm >>= last bit length */
        movd    hold_mm, %ebp           /* hold = low 32 bits of hold_mm */

        emms

.L_update_hold:

#endif /* USE_MMX || RUN_TIME_MMX */

        andl    %ebx, %ebp              /* hold &= (1U << bits) - 1 */
        movl    %ebp, hold_state(state_r)

#define last_r %ebx

        /* strm->avail_in = in < last ? 11 + (last - in) : 11 - (in - last) */
        movl    last(%esp), last_r
        cmpl    in_r, last_r
        jbe     .L_last_is_smaller      /* if (in >= last) */

        subl    in_r, last_r            /* last -= in */
        addl    $11, last_r             /* last += 11 */
        movl    last_r, avail_in_strm(strm_r)
        jmp     .L_fixup_out
.L_last_is_smaller:
        subl    last_r, in_r            /* in -= last */
        negl    in_r                    /* in = -in */
        addl    $11, in_r               /* in += 11 */
        movl    in_r, avail_in_strm(strm_r)

#undef last_r
#define end_r %ebx

.L_fixup_out:
        /* strm->avail_out = out < end ? 257 + (end - out) : 257 - (out - end) */
        movl    end(%esp), end_r
        cmpl    out_r, end_r
        jbe     .L_end_is_smaller       /* if (out >= end) */

        subl    out_r, end_r            /* end -= out */
        addl    $257, end_r             /* end += 257 */
        movl    end_r, avail_out_strm(strm_r)
        jmp     .L_done
.L_end_is_smaller:
        subl    end_r, out_r            /* out -= end */
        negl    out_r                   /* out = -out */
        addl    $257, out_r             /* out += 257 */
        movl    out_r, avail_out_strm(strm_r)

#undef end_r
#undef strm_r
#undef state_r

.L_done:
        /* release the local variable area, then restore flags and registers */
        addl    $local_var_size, %esp
        popf
        popl    %ebx
        popl    %ebp
        popl    %esi
        popl    %edi
        ret

#if defined( GAS_ELF )
/* elf info */
.type inflate_fast,@function
.size inflate_fast,.-inflate_fast
#endif