1; gvmat32.asm -- Asm portion of the optimized longest_match for 32 bits x86 2; Copyright (C) 1995-1996 Jean-loup Gailly and Gilles Vollant. 3; File written by Gilles Vollant, by modifiying the longest_match 4; from Jean-loup Gailly in deflate.c 5; 6; http://www.zlib.net 7; http://www.winimage.com/zLibDll 8; http://www.muppetlabs.com/~breadbox/software/assembly.html 9; 10; For Visual C++ 4.x and higher and ML 6.x and higher 11; ml.exe is in directory \MASM611C of Win95 DDK 12; ml.exe is also distributed in http://www.masm32.com/masmdl.htm 13; and in VC++2003 toolkit at http://msdn.microsoft.com/visualc/vctoolkit2003/ 14; 15; this file contain two implementation of longest_match 16; 17; longest_match_7fff : written 1996 by Gilles Vollant optimized for 18; first Pentium. Assume s->w_mask == 0x7fff 19; longest_match_686 : written by Brian raiter (1998), optimized for Pentium Pro 20; 21; for using an seembly version of longest_match, you need define ASMV in project 22; There is two way in using gvmat32.asm 23; 24; A) Suggested method 25; if you want include both longest_match_7fff and longest_match_686 26; compile the asm file running 27; ml /coff /Zi /Flgvmat32.lst /c gvmat32.asm 28; and include gvmat32c.c in your project 29; if you have an old cpu (386,486 or first Pentium) and s->w_mask==0x7fff, 30; longest_match_7fff will be used 31; if you have a more modern CPU (Pentium Pro, II and higher) 32; longest_match_686 will be used 33; on old cpu with s->w_mask!=0x7fff, longest_match_686 will be used, 34; but this is not a sitation you'll find often 35; 36; B) Alternative 37; if you are not interresed in old cpu performance and want the smaller 38; binaries possible 39; 40; compile the asm file running 41; ml /coff /Zi /c /Flgvmat32.lst /DNOOLDPENTIUMCODE gvmat32.asm 42; and do not include gvmat32c.c in your project (ou define also 43; NOOLDPENTIUMCODE) 44; 45; note : as I known, longest_match_686 is very faster than longest_match_7fff 46; on pentium Pro/II/III, faster (but less) in P4, but it seem 47; longest_match_7fff can be faster (very very litte) on AMD Athlon64/K8 48; 49; see below : zlib1222add must be adjuster if you use a zlib version < 1.2.2.2 50 51;uInt longest_match_7fff(s, cur_match) 52; deflate_state *s; 53; IPos cur_match; /* current match */ 54 55 NbStack equ 76 56 cur_match equ dword ptr[esp+NbStack-0] 57 str_s equ dword ptr[esp+NbStack-4] 58; 5 dword on top (ret,ebp,esi,edi,ebx) 59 adrret equ dword ptr[esp+NbStack-8] 60 pushebp equ dword ptr[esp+NbStack-12] 61 pushedi equ dword ptr[esp+NbStack-16] 62 pushesi equ dword ptr[esp+NbStack-20] 63 pushebx equ dword ptr[esp+NbStack-24] 64 65 chain_length equ dword ptr [esp+NbStack-28] 66 limit equ dword ptr [esp+NbStack-32] 67 best_len equ dword ptr [esp+NbStack-36] 68 window equ dword ptr [esp+NbStack-40] 69 prev equ dword ptr [esp+NbStack-44] 70 scan_start equ word ptr [esp+NbStack-48] 71 wmask equ dword ptr [esp+NbStack-52] 72 match_start_ptr equ dword ptr [esp+NbStack-56] 73 nice_match equ dword ptr [esp+NbStack-60] 74 scan equ dword ptr [esp+NbStack-64] 75 76 windowlen equ dword ptr [esp+NbStack-68] 77 match_start equ dword ptr [esp+NbStack-72] 78 strend equ dword ptr [esp+NbStack-76] 79 NbStackAdd equ (NbStack-24) 80 81 .386p 82 83 name gvmatch 84 .MODEL FLAT 85 86 87 88; all the +zlib1222add offsets are due to the addition of fields 89; in zlib in the deflate_state structure since the asm code was first written 90; (if you compile with zlib 1.0.4 or older, use "zlib1222add equ (-4)"). 91; (if you compile with zlib between 1.0.5 and 1.2.2.1, use "zlib1222add equ 0"). 92; if you compile with zlib 1.2.2.2 or later , use "zlib1222add equ 8"). 93 94 zlib1222add equ 8 95 96; Note : these value are good with a 8 bytes boundary pack structure 97 dep_chain_length equ 74h+zlib1222add 98 dep_window equ 30h+zlib1222add 99 dep_strstart equ 64h+zlib1222add 100 dep_prev_length equ 70h+zlib1222add 101 dep_nice_match equ 88h+zlib1222add 102 dep_w_size equ 24h+zlib1222add 103 dep_prev equ 38h+zlib1222add 104 dep_w_mask equ 2ch+zlib1222add 105 dep_good_match equ 84h+zlib1222add 106 dep_match_start equ 68h+zlib1222add 107 dep_lookahead equ 6ch+zlib1222add 108 109 110_TEXT segment 111 112IFDEF NOUNDERLINE 113 IFDEF NOOLDPENTIUMCODE 114 public longest_match 115 public match_init 116 ELSE 117 public longest_match_7fff 118 public cpudetect32 119 public longest_match_686 120 ENDIF 121ELSE 122 IFDEF NOOLDPENTIUMCODE 123 public _longest_match 124 public _match_init 125 ELSE 126 public _longest_match_7fff 127 public _cpudetect32 128 public _longest_match_686 129 ENDIF 130ENDIF 131 132 MAX_MATCH equ 258 133 MIN_MATCH equ 3 134 MIN_LOOKAHEAD equ (MAX_MATCH+MIN_MATCH+1) 135 136 137 138IFNDEF NOOLDPENTIUMCODE 139IFDEF NOUNDERLINE 140longest_match_7fff proc near 141ELSE 142_longest_match_7fff proc near 143ENDIF 144 145 mov edx,[esp+4] 146 147 148 149 push ebp 150 push edi 151 push esi 152 push ebx 153 154 sub esp,NbStackAdd 155 156; initialize or check the variables used in match.asm. 157 mov ebp,edx 158 159; chain_length = s->max_chain_length 160; if (prev_length>=good_match) chain_length >>= 2 161 mov edx,[ebp+dep_chain_length] 162 mov ebx,[ebp+dep_prev_length] 163 cmp [ebp+dep_good_match],ebx 164 ja noshr 165 shr edx,2 166noshr: 167; we increment chain_length because in the asm, the --chain_lenght is in the beginning of the loop 168 inc edx 169 mov edi,[ebp+dep_nice_match] 170 mov chain_length,edx 171 mov eax,[ebp+dep_lookahead] 172 cmp eax,edi 173; if ((uInt)nice_match > s->lookahead) nice_match = s->lookahead; 174 jae nolookaheadnicematch 175 mov edi,eax 176nolookaheadnicematch: 177; best_len = s->prev_length 178 mov best_len,ebx 179 180; window = s->window 181 mov esi,[ebp+dep_window] 182 mov ecx,[ebp+dep_strstart] 183 mov window,esi 184 185 mov nice_match,edi 186; scan = window + strstart 187 add esi,ecx 188 mov scan,esi 189; dx = *window 190 mov dx,word ptr [esi] 191; bx = *(window+best_len-1) 192 mov bx,word ptr [esi+ebx-1] 193 add esi,MAX_MATCH-1 194; scan_start = *scan 195 mov scan_start,dx 196; strend = scan + MAX_MATCH-1 197 mov strend,esi 198; bx = scan_end = *(window+best_len-1) 199 200; IPos limit = s->strstart > (IPos)MAX_DIST(s) ? 201; s->strstart - (IPos)MAX_DIST(s) : NIL; 202 203 mov esi,[ebp+dep_w_size] 204 sub esi,MIN_LOOKAHEAD 205; here esi = MAX_DIST(s) 206 sub ecx,esi 207 ja nodist 208 xor ecx,ecx 209nodist: 210 mov limit,ecx 211 212; prev = s->prev 213 mov edx,[ebp+dep_prev] 214 mov prev,edx 215 216; 217 mov edx,dword ptr [ebp+dep_match_start] 218 mov bp,scan_start 219 mov eax,cur_match 220 mov match_start,edx 221 222 mov edx,window 223 mov edi,edx 224 add edi,best_len 225 mov esi,prev 226 dec edi 227; windowlen = window + best_len -1 228 mov windowlen,edi 229 230 jmp beginloop2 231 align 4 232 233; here, in the loop 234; eax = ax = cur_match 235; ecx = limit 236; bx = scan_end 237; bp = scan_start 238; edi = windowlen (window + best_len -1) 239; esi = prev 240 241 242;// here; chain_length <=16 243normalbeg0add16: 244 add chain_length,16 245 jz exitloop 246normalbeg0: 247 cmp word ptr[edi+eax],bx 248 je normalbeg2noroll 249rcontlabnoroll: 250; cur_match = prev[cur_match & wmask] 251 and eax,7fffh 252 mov ax,word ptr[esi+eax*2] 253; if cur_match > limit, go to exitloop 254 cmp ecx,eax 255 jnb exitloop 256; if --chain_length != 0, go to exitloop 257 dec chain_length 258 jnz normalbeg0 259 jmp exitloop 260 261normalbeg2noroll: 262; if (scan_start==*(cur_match+window)) goto normalbeg2 263 cmp bp,word ptr[edx+eax] 264 jne rcontlabnoroll 265 jmp normalbeg2 266 267contloop3: 268 mov edi,windowlen 269 270; cur_match = prev[cur_match & wmask] 271 and eax,7fffh 272 mov ax,word ptr[esi+eax*2] 273; if cur_match > limit, go to exitloop 274 cmp ecx,eax 275jnbexitloopshort1: 276 jnb exitloop 277; if --chain_length != 0, go to exitloop 278 279 280; begin the main loop 281beginloop2: 282 sub chain_length,16+1 283; if chain_length <=16, don't use the unrolled loop 284 jna normalbeg0add16 285 286do16: 287 cmp word ptr[edi+eax],bx 288 je normalbeg2dc0 289 290maccn MACRO lab 291 and eax,7fffh 292 mov ax,word ptr[esi+eax*2] 293 cmp ecx,eax 294 jnb exitloop 295 cmp word ptr[edi+eax],bx 296 je lab 297 ENDM 298 299rcontloop0: 300 maccn normalbeg2dc1 301 302rcontloop1: 303 maccn normalbeg2dc2 304 305rcontloop2: 306 maccn normalbeg2dc3 307 308rcontloop3: 309 maccn normalbeg2dc4 310 311rcontloop4: 312 maccn normalbeg2dc5 313 314rcontloop5: 315 maccn normalbeg2dc6 316 317rcontloop6: 318 maccn normalbeg2dc7 319 320rcontloop7: 321 maccn normalbeg2dc8 322 323rcontloop8: 324 maccn normalbeg2dc9 325 326rcontloop9: 327 maccn normalbeg2dc10 328 329rcontloop10: 330 maccn short normalbeg2dc11 331 332rcontloop11: 333 maccn short normalbeg2dc12 334 335rcontloop12: 336 maccn short normalbeg2dc13 337 338rcontloop13: 339 maccn short normalbeg2dc14 340 341rcontloop14: 342 maccn short normalbeg2dc15 343 344rcontloop15: 345 and eax,7fffh 346 mov ax,word ptr[esi+eax*2] 347 cmp ecx,eax 348 jnb exitloop 349 350 sub chain_length,16 351 ja do16 352 jmp normalbeg0add16 353 354;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; 355 356normbeg MACRO rcontlab,valsub 357; if we are here, we know that *(match+best_len-1) == scan_end 358 cmp bp,word ptr[edx+eax] 359; if (match != scan_start) goto rcontlab 360 jne rcontlab 361; calculate the good chain_length, and we'll compare scan and match string 362 add chain_length,16-valsub 363 jmp iseq 364 ENDM 365 366 367normalbeg2dc11: 368 normbeg rcontloop11,11 369 370normalbeg2dc12: 371 normbeg short rcontloop12,12 372 373normalbeg2dc13: 374 normbeg short rcontloop13,13 375 376normalbeg2dc14: 377 normbeg short rcontloop14,14 378 379normalbeg2dc15: 380 normbeg short rcontloop15,15 381 382normalbeg2dc10: 383 normbeg rcontloop10,10 384 385normalbeg2dc9: 386 normbeg rcontloop9,9 387 388normalbeg2dc8: 389 normbeg rcontloop8,8 390 391normalbeg2dc7: 392 normbeg rcontloop7,7 393 394normalbeg2dc6: 395 normbeg rcontloop6,6 396 397normalbeg2dc5: 398 normbeg rcontloop5,5 399 400normalbeg2dc4: 401 normbeg rcontloop4,4 402 403normalbeg2dc3: 404 normbeg rcontloop3,3 405 406normalbeg2dc2: 407 normbeg rcontloop2,2 408 409normalbeg2dc1: 410 normbeg rcontloop1,1 411 412normalbeg2dc0: 413 normbeg rcontloop0,0 414 415 416; we go in normalbeg2 because *(ushf*)(match+best_len-1) == scan_end 417 418normalbeg2: 419 mov edi,window 420 421 cmp bp,word ptr[edi+eax] 422 jne contloop3 ; if *(ushf*)match != scan_start, continue 423 424iseq: 425; if we are here, we know that *(match+best_len-1) == scan_end 426; and (match == scan_start) 427 428 mov edi,edx 429 mov esi,scan ; esi = scan 430 add edi,eax ; edi = window + cur_match = match 431 432 mov edx,[esi+3] ; compare manually dword at match+3 433 xor edx,[edi+3] ; and scan +3 434 435 jz begincompare ; if equal, go to long compare 436 437; we will determine the unmatch byte and calculate len (in esi) 438 or dl,dl 439 je eq1rr 440 mov esi,3 441 jmp trfinval 442eq1rr: 443 or dx,dx 444 je eq1 445 446 mov esi,4 447 jmp trfinval 448eq1: 449 and edx,0ffffffh 450 jz eq11 451 mov esi,5 452 jmp trfinval 453eq11: 454 mov esi,6 455 jmp trfinval 456 457begincompare: 458 ; here we now scan and match begin same 459 add edi,6 460 add esi,6 461 mov ecx,(MAX_MATCH-(2+4))/4 ; scan for at most MAX_MATCH bytes 462 repe cmpsd ; loop until mismatch 463 464 je trfin ; go to trfin if not unmatch 465; we determine the unmatch byte 466 sub esi,4 467 mov edx,[edi-4] 468 xor edx,[esi] 469 470 or dl,dl 471 jnz trfin 472 inc esi 473 474 or dx,dx 475 jnz trfin 476 inc esi 477 478 and edx,0ffffffh 479 jnz trfin 480 inc esi 481 482trfin: 483 sub esi,scan ; esi = len 484trfinval: 485; here we have finised compare, and esi contain len of equal string 486 cmp esi,best_len ; if len > best_len, go newbestlen 487 ja short newbestlen 488; now we restore edx, ecx and esi, for the big loop 489 mov esi,prev 490 mov ecx,limit 491 mov edx,window 492 jmp contloop3 493 494newbestlen: 495 mov best_len,esi ; len become best_len 496 497 mov match_start,eax ; save new position as match_start 498 cmp esi,nice_match ; if best_len >= nice_match, exit 499 jae exitloop 500 mov ecx,scan 501 mov edx,window ; restore edx=window 502 add ecx,esi 503 add esi,edx 504 505 dec esi 506 mov windowlen,esi ; windowlen = window + best_len-1 507 mov bx,[ecx-1] ; bx = *(scan+best_len-1) = scan_end 508 509; now we restore ecx and esi, for the big loop : 510 mov esi,prev 511 mov ecx,limit 512 jmp contloop3 513 514exitloop: 515; exit : s->match_start=match_start 516 mov ebx,match_start 517 mov ebp,str_s 518 mov ecx,best_len 519 mov dword ptr [ebp+dep_match_start],ebx 520 mov eax,dword ptr [ebp+dep_lookahead] 521 cmp ecx,eax 522 ja minexlo 523 mov eax,ecx 524minexlo: 525; return min(best_len,s->lookahead) 526 527; restore stack and register ebx,esi,edi,ebp 528 add esp,NbStackAdd 529 530 pop ebx 531 pop esi 532 pop edi 533 pop ebp 534 ret 535InfoAuthor: 536; please don't remove this string ! 537; Your are free use gvmat32 in any fre or commercial apps if you don't remove the string in the binary! 538 db 0dh,0ah,"GVMat32 optimised assembly code written 1996-98 by Gilles Vollant",0dh,0ah 539 540 541 542IFDEF NOUNDERLINE 543longest_match_7fff endp 544ELSE 545_longest_match_7fff endp 546ENDIF 547 548 549IFDEF NOUNDERLINE 550cpudetect32 proc near 551ELSE 552_cpudetect32 proc near 553ENDIF 554 555 push ebx 556 557 pushfd ; push original EFLAGS 558 pop eax ; get original EFLAGS 559 mov ecx, eax ; save original EFLAGS 560 xor eax, 40000h ; flip AC bit in EFLAGS 561 push eax ; save new EFLAGS value on stack 562 popfd ; replace current EFLAGS value 563 pushfd ; get new EFLAGS 564 pop eax ; store new EFLAGS in EAX 565 xor eax, ecx ; can�t toggle AC bit, processor=80386 566 jz end_cpu_is_386 ; jump if 80386 processor 567 push ecx 568 popfd ; restore AC bit in EFLAGS first 569 570 pushfd 571 pushfd 572 pop ecx 573 574 mov eax, ecx ; get original EFLAGS 575 xor eax, 200000h ; flip ID bit in EFLAGS 576 push eax ; save new EFLAGS value on stack 577 popfd ; replace current EFLAGS value 578 pushfd ; get new EFLAGS 579 pop eax ; store new EFLAGS in EAX 580 popfd ; restore original EFLAGS 581 xor eax, ecx ; can�t toggle ID bit, 582 je is_old_486 ; processor=old 583 584 mov eax,1 585 db 0fh,0a2h ;CPUID 586 587exitcpudetect: 588 pop ebx 589 ret 590 591end_cpu_is_386: 592 mov eax,0300h 593 jmp exitcpudetect 594 595is_old_486: 596 mov eax,0400h 597 jmp exitcpudetect 598 599IFDEF NOUNDERLINE 600cpudetect32 endp 601ELSE 602_cpudetect32 endp 603ENDIF 604ENDIF 605 606MAX_MATCH equ 258 607MIN_MATCH equ 3 608MIN_LOOKAHEAD equ (MAX_MATCH + MIN_MATCH + 1) 609MAX_MATCH_8_ equ ((MAX_MATCH + 7) AND 0FFF0h) 610 611 612;;; stack frame offsets 613 614chainlenwmask equ esp + 0 ; high word: current chain len 615 ; low word: s->wmask 616window equ esp + 4 ; local copy of s->window 617windowbestlen equ esp + 8 ; s->window + bestlen 618scanstart equ esp + 16 ; first two bytes of string 619scanend equ esp + 12 ; last two bytes of string 620scanalign equ esp + 20 ; dword-misalignment of string 621nicematch equ esp + 24 ; a good enough match size 622bestlen equ esp + 28 ; size of best match so far 623scan equ esp + 32 ; ptr to string wanting match 624 625LocalVarsSize equ 36 626; saved ebx byte esp + 36 627; saved edi byte esp + 40 628; saved esi byte esp + 44 629; saved ebp byte esp + 48 630; return address byte esp + 52 631deflatestate equ esp + 56 ; the function arguments 632curmatch equ esp + 60 633 634;;; Offsets for fields in the deflate_state structure. These numbers 635;;; are calculated from the definition of deflate_state, with the 636;;; assumption that the compiler will dword-align the fields. (Thus, 637;;; changing the definition of deflate_state could easily cause this 638;;; program to crash horribly, without so much as a warning at 639;;; compile time. Sigh.) 640 641dsWSize equ 36+zlib1222add 642dsWMask equ 44+zlib1222add 643dsWindow equ 48+zlib1222add 644dsPrev equ 56+zlib1222add 645dsMatchLen equ 88+zlib1222add 646dsPrevMatch equ 92+zlib1222add 647dsStrStart equ 100+zlib1222add 648dsMatchStart equ 104+zlib1222add 649dsLookahead equ 108+zlib1222add 650dsPrevLen equ 112+zlib1222add 651dsMaxChainLen equ 116+zlib1222add 652dsGoodMatch equ 132+zlib1222add 653dsNiceMatch equ 136+zlib1222add 654 655 656;;; match.asm -- Pentium-Pro-optimized version of longest_match() 657;;; Written for zlib 1.1.2 658;;; Copyright (C) 1998 Brian Raiter <breadbox@muppetlabs.com> 659;;; You can look at http://www.muppetlabs.com/~breadbox/software/assembly.html 660;;; 661;;; This is free software; you can redistribute it and/or modify it 662;;; under the terms of the GNU General Public License. 663 664;GLOBAL _longest_match, _match_init 665 666 667;SECTION .text 668 669;;; uInt longest_match(deflate_state *deflatestate, IPos curmatch) 670 671;_longest_match: 672IFDEF NOOLDPENTIUMCODE 673 IFDEF NOUNDERLINE 674 longest_match proc near 675 ELSE 676 _longest_match proc near 677 ENDIF 678ELSE 679 IFDEF NOUNDERLINE 680 longest_match_686 proc near 681 ELSE 682 _longest_match_686 proc near 683 ENDIF 684ENDIF 685 686;;; Save registers that the compiler may be using, and adjust esp to 687;;; make room for our stack frame. 688 689 push ebp 690 push edi 691 push esi 692 push ebx 693 sub esp, LocalVarsSize 694 695;;; Retrieve the function arguments. ecx will hold cur_match 696;;; throughout the entire function. edx will hold the pointer to the 697;;; deflate_state structure during the function's setup (before 698;;; entering the main loop. 699 700 mov edx, [deflatestate] 701 mov ecx, [curmatch] 702 703;;; uInt wmask = s->w_mask; 704;;; unsigned chain_length = s->max_chain_length; 705;;; if (s->prev_length >= s->good_match) { 706;;; chain_length >>= 2; 707;;; } 708 709 mov eax, [edx + dsPrevLen] 710 mov ebx, [edx + dsGoodMatch] 711 cmp eax, ebx 712 mov eax, [edx + dsWMask] 713 mov ebx, [edx + dsMaxChainLen] 714 jl LastMatchGood 715 shr ebx, 2 716LastMatchGood: 717 718;;; chainlen is decremented once beforehand so that the function can 719;;; use the sign flag instead of the zero flag for the exit test. 720;;; It is then shifted into the high word, to make room for the wmask 721;;; value, which it will always accompany. 722 723 dec ebx 724 shl ebx, 16 725 or ebx, eax 726 mov [chainlenwmask], ebx 727 728;;; if ((uInt)nice_match > s->lookahead) nice_match = s->lookahead; 729 730 mov eax, [edx + dsNiceMatch] 731 mov ebx, [edx + dsLookahead] 732 cmp ebx, eax 733 jl LookaheadLess 734 mov ebx, eax 735LookaheadLess: mov [nicematch], ebx 736 737;;; register Bytef *scan = s->window + s->strstart; 738 739 mov esi, [edx + dsWindow] 740 mov [window], esi 741 mov ebp, [edx + dsStrStart] 742 lea edi, [esi + ebp] 743 mov [scan], edi 744 745;;; Determine how many bytes the scan ptr is off from being 746;;; dword-aligned. 747 748 mov eax, edi 749 neg eax 750 and eax, 3 751 mov [scanalign], eax 752 753;;; IPos limit = s->strstart > (IPos)MAX_DIST(s) ? 754;;; s->strstart - (IPos)MAX_DIST(s) : NIL; 755 756 mov eax, [edx + dsWSize] 757 sub eax, MIN_LOOKAHEAD 758 sub ebp, eax 759 jg LimitPositive 760 xor ebp, ebp 761LimitPositive: 762 763;;; int best_len = s->prev_length; 764 765 mov eax, [edx + dsPrevLen] 766 mov [bestlen], eax 767 768;;; Store the sum of s->window + best_len in esi locally, and in esi. 769 770 add esi, eax 771 mov [windowbestlen], esi 772 773;;; register ush scan_start = *(ushf*)scan; 774;;; register ush scan_end = *(ushf*)(scan+best_len-1); 775;;; Posf *prev = s->prev; 776 777 movzx ebx, word ptr [edi] 778 mov [scanstart], ebx 779 movzx ebx, word ptr [edi + eax - 1] 780 mov [scanend], ebx 781 mov edi, [edx + dsPrev] 782 783;;; Jump into the main loop. 784 785 mov edx, [chainlenwmask] 786 jmp short LoopEntry 787 788align 4 789 790;;; do { 791;;; match = s->window + cur_match; 792;;; if (*(ushf*)(match+best_len-1) != scan_end || 793;;; *(ushf*)match != scan_start) continue; 794;;; [...] 795;;; } while ((cur_match = prev[cur_match & wmask]) > limit 796;;; && --chain_length != 0); 797;;; 798;;; Here is the inner loop of the function. The function will spend the 799;;; majority of its time in this loop, and majority of that time will 800;;; be spent in the first ten instructions. 801;;; 802;;; Within this loop: 803;;; ebx = scanend 804;;; ecx = curmatch 805;;; edx = chainlenwmask - i.e., ((chainlen << 16) | wmask) 806;;; esi = windowbestlen - i.e., (window + bestlen) 807;;; edi = prev 808;;; ebp = limit 809 810LookupLoop: 811 and ecx, edx 812 movzx ecx, word ptr [edi + ecx*2] 813 cmp ecx, ebp 814 jbe LeaveNow 815 sub edx, 00010000h 816 js LeaveNow 817LoopEntry: movzx eax, word ptr [esi + ecx - 1] 818 cmp eax, ebx 819 jnz LookupLoop 820 mov eax, [window] 821 movzx eax, word ptr [eax + ecx] 822 cmp eax, [scanstart] 823 jnz LookupLoop 824 825;;; Store the current value of chainlen. 826 827 mov [chainlenwmask], edx 828 829;;; Point edi to the string under scrutiny, and esi to the string we 830;;; are hoping to match it up with. In actuality, esi and edi are 831;;; both pointed (MAX_MATCH_8 - scanalign) bytes ahead, and edx is 832;;; initialized to -(MAX_MATCH_8 - scanalign). 833 834 mov esi, [window] 835 mov edi, [scan] 836 add esi, ecx 837 mov eax, [scanalign] 838 mov edx, 0fffffef8h; -(MAX_MATCH_8) 839 lea edi, [edi + eax + 0108h] ;MAX_MATCH_8] 840 lea esi, [esi + eax + 0108h] ;MAX_MATCH_8] 841 842;;; Test the strings for equality, 8 bytes at a time. At the end, 843;;; adjust edx so that it is offset to the exact byte that mismatched. 844;;; 845;;; We already know at this point that the first three bytes of the 846;;; strings match each other, and they can be safely passed over before 847;;; starting the compare loop. So what this code does is skip over 0-3 848;;; bytes, as much as necessary in order to dword-align the edi 849;;; pointer. (esi will still be misaligned three times out of four.) 850;;; 851;;; It should be confessed that this loop usually does not represent 852;;; much of the total running time. Replacing it with a more 853;;; straightforward "rep cmpsb" would not drastically degrade 854;;; performance. 855 856LoopCmps: 857 mov eax, [esi + edx] 858 xor eax, [edi + edx] 859 jnz LeaveLoopCmps 860 mov eax, [esi + edx + 4] 861 xor eax, [edi + edx + 4] 862 jnz LeaveLoopCmps4 863 add edx, 8 864 jnz LoopCmps 865 jmp short LenMaximum 866LeaveLoopCmps4: add edx, 4 867LeaveLoopCmps: test eax, 0000FFFFh 868 jnz LenLower 869 add edx, 2 870 shr eax, 16 871LenLower: sub al, 1 872 adc edx, 0 873 874;;; Calculate the length of the match. If it is longer than MAX_MATCH, 875;;; then automatically accept it as the best possible match and leave. 876 877 lea eax, [edi + edx] 878 mov edi, [scan] 879 sub eax, edi 880 cmp eax, MAX_MATCH 881 jge LenMaximum 882 883;;; If the length of the match is not longer than the best match we 884;;; have so far, then forget it and return to the lookup loop. 885 886 mov edx, [deflatestate] 887 mov ebx, [bestlen] 888 cmp eax, ebx 889 jg LongerMatch 890 mov esi, [windowbestlen] 891 mov edi, [edx + dsPrev] 892 mov ebx, [scanend] 893 mov edx, [chainlenwmask] 894 jmp LookupLoop 895 896;;; s->match_start = cur_match; 897;;; best_len = len; 898;;; if (len >= nice_match) break; 899;;; scan_end = *(ushf*)(scan+best_len-1); 900 901LongerMatch: mov ebx, [nicematch] 902 mov [bestlen], eax 903 mov [edx + dsMatchStart], ecx 904 cmp eax, ebx 905 jge LeaveNow 906 mov esi, [window] 907 add esi, eax 908 mov [windowbestlen], esi 909 movzx ebx, word ptr [edi + eax - 1] 910 mov edi, [edx + dsPrev] 911 mov [scanend], ebx 912 mov edx, [chainlenwmask] 913 jmp LookupLoop 914 915;;; Accept the current string, with the maximum possible length. 916 917LenMaximum: mov edx, [deflatestate] 918 mov dword ptr [bestlen], MAX_MATCH 919 mov [edx + dsMatchStart], ecx 920 921;;; if ((uInt)best_len <= s->lookahead) return (uInt)best_len; 922;;; return s->lookahead; 923 924LeaveNow: 925 mov edx, [deflatestate] 926 mov ebx, [bestlen] 927 mov eax, [edx + dsLookahead] 928 cmp ebx, eax 929 jg LookaheadRet 930 mov eax, ebx 931LookaheadRet: 932 933;;; Restore the stack and return from whence we came. 934 935 add esp, LocalVarsSize 936 pop ebx 937 pop esi 938 pop edi 939 pop ebp 940 941 ret 942; please don't remove this string ! 943; Your can freely use gvmat32 in any free or commercial app if you don't remove the string in the binary! 944 db 0dh,0ah,"asm686 with masm, optimised assembly code from Brian Raiter, written 1998",0dh,0ah 945 946 947IFDEF NOOLDPENTIUMCODE 948 IFDEF NOUNDERLINE 949 longest_match endp 950 ELSE 951 _longest_match endp 952 ENDIF 953 954 IFDEF NOUNDERLINE 955 match_init proc near 956 ret 957 match_init endp 958 ELSE 959 _match_init proc near 960 ret 961 _match_init endp 962 ENDIF 963ELSE 964 IFDEF NOUNDERLINE 965 longest_match_686 endp 966 ELSE 967 _longest_match_686 endp 968 ENDIF 969ENDIF 970 971_TEXT ends 972end 973