1 2; match.asm -- Pentium-Pro optimized version of longest_match() 3; 4; Updated for zlib 1.1.3 and converted to MASM 6.1x 5; Copyright (C) 2000 Dan Higdon <hdan@kinesoft.com> 6; and Chuck Walbourn <chuckw@kinesoft.com> 7; Corrections by Cosmin Truta <cosmint@cs.ubbcluj.ro> 8; 9; This is free software; you can redistribute it and/or modify it 10; under the terms of the GNU General Public License. 11 12; Based on match.S 13; Written for zlib 1.1.2 14; Copyright (C) 1998 Brian Raiter <breadbox@muppetlabs.com> 15; 16; Modified by Gilles Vollant (2005) for add gzhead and gzindex 17 18 .686P 19 .MODEL FLAT 20 21;=========================================================================== 22; EQUATES 23;=========================================================================== 24 25MAX_MATCH EQU 258 26MIN_MATCH EQU 3 27MIN_LOOKAHEAD EQU (MAX_MATCH + MIN_MATCH + 1) 28MAX_MATCH_8 EQU ((MAX_MATCH + 7) AND (NOT 7)) 29 30;=========================================================================== 31; STRUCTURES 32;=========================================================================== 33 34; This STRUCT assumes a 4-byte alignment 35 36DEFLATE_STATE STRUCT 37ds_strm dd ? 38ds_status dd ? 39ds_pending_buf dd ? 40ds_pending_buf_size dd ? 41ds_pending_out dd ? 42ds_pending dd ? 43ds_wrap dd ? 44; gzhead and gzindex are added in zlib 1.2.2.2 (see deflate.h) 45ds_gzhead dd ? 46ds_gzindex dd ? 47ds_data_type db ? 48ds_method db ? 49 db ? ; padding 50 db ? ; padding 51ds_last_flush dd ? 52ds_w_size dd ? ; used 53ds_w_bits dd ? 54ds_w_mask dd ? ; used 55ds_window dd ? ; used 56ds_window_size dd ? 57ds_prev dd ? ; used 58ds_head dd ? 59ds_ins_h dd ? 60ds_hash_size dd ? 61ds_hash_bits dd ? 62ds_hash_mask dd ? 63ds_hash_shift dd ? 64ds_block_start dd ? 65ds_match_length dd ? ; used 66ds_prev_match dd ? ; used 67ds_match_available dd ? 68ds_strstart dd ? ; used 69ds_match_start dd ? ; used 70ds_lookahead dd ? ; used 71ds_prev_length dd ? ; used 72ds_max_chain_length dd ? ; used 73ds_max_laxy_match dd ? 74ds_level dd ? 75ds_strategy dd ? 76ds_good_match dd ? ; used 77ds_nice_match dd ? ; used 78 79; Don't need anymore of the struct for match 80DEFLATE_STATE ENDS 81 82;=========================================================================== 83; CODE 84;=========================================================================== 85_TEXT SEGMENT 86 87;--------------------------------------------------------------------------- 88; match_init 89;--------------------------------------------------------------------------- 90 ALIGN 4 91PUBLIC _match_init 92_match_init PROC 93 ; no initialization needed 94 ret 95_match_init ENDP 96 97;--------------------------------------------------------------------------- 98; uInt longest_match(deflate_state *deflatestate, IPos curmatch) 99;--------------------------------------------------------------------------- 100 ALIGN 4 101 102PUBLIC _longest_match 103_longest_match PROC 104 105; Since this code uses EBP for a scratch register, the stack frame must 106; be manually constructed and referenced relative to the ESP register. 107 108; Stack image 109; Variables 110chainlenwmask = 0 ; high word: current chain len 111 ; low word: s->wmask 112window = 4 ; local copy of s->window 113windowbestlen = 8 ; s->window + bestlen 114scanend = 12 ; last two bytes of string 115scanstart = 16 ; first two bytes of string 116scanalign = 20 ; dword-misalignment of string 117nicematch = 24 ; a good enough match size 118bestlen = 28 ; size of best match so far 119scan = 32 ; ptr to string wanting match 120varsize = 36 ; number of bytes (also offset to last saved register) 121 122; Saved Registers (actually pushed into place) 123ebx_save = 36 124edi_save = 40 125esi_save = 44 126ebp_save = 48 127 128; Parameters 129retaddr = 52 130deflatestate = 56 131curmatch = 60 132 133; Save registers that the compiler may be using 134 push ebp 135 push edi 136 push esi 137 push ebx 138 139; Allocate local variable space 140 sub esp,varsize 141 142; Retrieve the function arguments. ecx will hold cur_match 143; throughout the entire function. edx will hold the pointer to the 144; deflate_state structure during the function's setup (before 145; entering the main loop). 146 147 mov edx, [esp+deflatestate] 148ASSUME edx:PTR DEFLATE_STATE 149 150 mov ecx, [esp+curmatch] 151 152; uInt wmask = s->w_mask; 153; unsigned chain_length = s->max_chain_length; 154; if (s->prev_length >= s->good_match) { 155; chain_length >>= 2; 156; } 157 158 mov eax, [edx].ds_prev_length 159 mov ebx, [edx].ds_good_match 160 cmp eax, ebx 161 mov eax, [edx].ds_w_mask 162 mov ebx, [edx].ds_max_chain_length 163 jl SHORT LastMatchGood 164 shr ebx, 2 165LastMatchGood: 166 167; chainlen is decremented once beforehand so that the function can 168; use the sign flag instead of the zero flag for the exit test. 169; It is then shifted into the high word, to make room for the wmask 170; value, which it will always accompany. 171 172 dec ebx 173 shl ebx, 16 174 or ebx, eax 175 mov [esp+chainlenwmask], ebx 176 177; if ((uInt)nice_match > s->lookahead) nice_match = s->lookahead; 178 179 mov eax, [edx].ds_nice_match 180 mov ebx, [edx].ds_lookahead 181 cmp ebx, eax 182 jl SHORT LookaheadLess 183 mov ebx, eax 184LookaheadLess: 185 mov [esp+nicematch], ebx 186 187;/* register Bytef *scan = s->window + s->strstart; */ 188 189 mov esi, [edx].ds_window 190 mov [esp+window], esi 191 mov ebp, [edx].ds_strstart 192 lea edi, [esi+ebp] 193 mov [esp+scan],edi 194 195;/* Determine how many bytes the scan ptr is off from being */ 196;/* dword-aligned. */ 197 198 mov eax, edi 199 neg eax 200 and eax, 3 201 mov [esp+scanalign], eax 202 203;/* IPos limit = s->strstart > (IPos)MAX_DIST(s) ? */ 204;/* s->strstart - (IPos)MAX_DIST(s) : NIL; */ 205 206 mov eax, [edx].ds_w_size 207 sub eax, MIN_LOOKAHEAD 208 sub ebp, eax 209 jg SHORT LimitPositive 210 xor ebp, ebp 211LimitPositive: 212 213;/* int best_len = s->prev_length; */ 214 215 mov eax, [edx].ds_prev_length 216 mov [esp+bestlen], eax 217 218;/* Store the sum of s->window + best_len in %esi locally, and in %esi. */ 219 220 add esi, eax 221 mov [esp+windowbestlen], esi 222 223;/* register ush scan_start = *(ushf*)scan; */ 224;/* register ush scan_end = *(ushf*)(scan+best_len-1); */ 225;/* Posf *prev = s->prev; */ 226 227 movzx ebx, WORD PTR[edi] 228 mov [esp+scanstart], ebx 229 movzx ebx, WORD PTR[eax+edi-1] 230 mov [esp+scanend], ebx 231 mov edi, [edx].ds_prev 232 233;/* Jump into the main loop. */ 234 235 mov edx, [esp+chainlenwmask] 236 jmp SHORT LoopEntry 237 238;/* do { 239; * match = s->window + cur_match; 240; * if (*(ushf*)(match+best_len-1) != scan_end || 241; * *(ushf*)match != scan_start) continue; 242; * [...] 243; * } while ((cur_match = prev[cur_match & wmask]) > limit 244; * && --chain_length != 0); 245; * 246; * Here is the inner loop of the function. The function will spend the 247; * majority of its time in this loop, and majority of that time will 248; * be spent in the first ten instructions. 249; * 250; * Within this loop: 251; * %ebx = scanend 252; * %ecx = curmatch 253; * %edx = chainlenwmask - i.e., ((chainlen << 16) | wmask) 254; * %esi = windowbestlen - i.e., (window + bestlen) 255; * %edi = prev 256; * %ebp = limit 257; */ 258 259 ALIGN 4 260LookupLoop: 261 and ecx, edx 262 movzx ecx, WORD PTR[edi+ecx*2] 263 cmp ecx, ebp 264 jbe LeaveNow 265 sub edx, 000010000H 266 js LeaveNow 267 268LoopEntry: 269 movzx eax, WORD PTR[esi+ecx-1] 270 cmp eax, ebx 271 jnz SHORT LookupLoop 272 273 mov eax, [esp+window] 274 movzx eax, WORD PTR[eax+ecx] 275 cmp eax, [esp+scanstart] 276 jnz SHORT LookupLoop 277 278;/* Store the current value of chainlen. */ 279 280 mov [esp+chainlenwmask], edx 281 282;/* Point %edi to the string under scrutiny, and %esi to the string we */ 283;/* are hoping to match it up with. In actuality, %esi and %edi are */ 284;/* both pointed (MAX_MATCH_8 - scanalign) bytes ahead, and %edx is */ 285;/* initialized to -(MAX_MATCH_8 - scanalign). */ 286 287 mov esi, [esp+window] 288 mov edi, [esp+scan] 289 add esi, ecx 290 mov eax, [esp+scanalign] 291 mov edx, -MAX_MATCH_8 292 lea edi, [edi+eax+MAX_MATCH_8] 293 lea esi, [esi+eax+MAX_MATCH_8] 294 295;/* Test the strings for equality, 8 bytes at a time. At the end, 296; * adjust %edx so that it is offset to the exact byte that mismatched. 297; * 298; * We already know at this point that the first three bytes of the 299; * strings match each other, and they can be safely passed over before 300; * starting the compare loop. So what this code does is skip over 0-3 301; * bytes, as much as necessary in order to dword-align the %edi 302; * pointer. (%esi will still be misaligned three times out of four.) 303; * 304; * It should be confessed that this loop usually does not represent 305; * much of the total running time. Replacing it with a more 306; * straightforward "rep cmpsb" would not drastically degrade 307; * performance. 308; */ 309 310LoopCmps: 311 mov eax, DWORD PTR[esi+edx] 312 xor eax, DWORD PTR[edi+edx] 313 jnz SHORT LeaveLoopCmps 314 315 mov eax, DWORD PTR[esi+edx+4] 316 xor eax, DWORD PTR[edi+edx+4] 317 jnz SHORT LeaveLoopCmps4 318 319 add edx, 8 320 jnz SHORT LoopCmps 321 jmp LenMaximum 322 ALIGN 4 323 324LeaveLoopCmps4: 325 add edx, 4 326 327LeaveLoopCmps: 328 test eax, 00000FFFFH 329 jnz SHORT LenLower 330 331 add edx, 2 332 shr eax, 16 333 334LenLower: 335 sub al, 1 336 adc edx, 0 337 338;/* Calculate the length of the match. If it is longer than MAX_MATCH, */ 339;/* then automatically accept it as the best possible match and leave. */ 340 341 lea eax, [edi+edx] 342 mov edi, [esp+scan] 343 sub eax, edi 344 cmp eax, MAX_MATCH 345 jge SHORT LenMaximum 346 347;/* If the length of the match is not longer than the best match we */ 348;/* have so far, then forget it and return to the lookup loop. */ 349 350 mov edx, [esp+deflatestate] 351 mov ebx, [esp+bestlen] 352 cmp eax, ebx 353 jg SHORT LongerMatch 354 mov esi, [esp+windowbestlen] 355 mov edi, [edx].ds_prev 356 mov ebx, [esp+scanend] 357 mov edx, [esp+chainlenwmask] 358 jmp LookupLoop 359 ALIGN 4 360 361;/* s->match_start = cur_match; */ 362;/* best_len = len; */ 363;/* if (len >= nice_match) break; */ 364;/* scan_end = *(ushf*)(scan+best_len-1); */ 365 366LongerMatch: 367 mov ebx, [esp+nicematch] 368 mov [esp+bestlen], eax 369 mov [edx].ds_match_start, ecx 370 cmp eax, ebx 371 jge SHORT LeaveNow 372 mov esi, [esp+window] 373 add esi, eax 374 mov [esp+windowbestlen], esi 375 movzx ebx, WORD PTR[edi+eax-1] 376 mov edi, [edx].ds_prev 377 mov [esp+scanend], ebx 378 mov edx, [esp+chainlenwmask] 379 jmp LookupLoop 380 ALIGN 4 381 382;/* Accept the current string, with the maximum possible length. */ 383 384LenMaximum: 385 mov edx, [esp+deflatestate] 386 mov DWORD PTR[esp+bestlen], MAX_MATCH 387 mov [edx].ds_match_start, ecx 388 389;/* if ((uInt)best_len <= s->lookahead) return (uInt)best_len; */ 390;/* return s->lookahead; */ 391 392LeaveNow: 393 mov edx, [esp+deflatestate] 394 mov ebx, [esp+bestlen] 395 mov eax, [edx].ds_lookahead 396 cmp ebx, eax 397 jg SHORT LookaheadRet 398 mov eax, ebx 399LookaheadRet: 400 401; Restore the stack and return from whence we came. 402 403 add esp, varsize 404 pop ebx 405 pop esi 406 pop edi 407 pop ebp 408 ret 409 410_longest_match ENDP 411 412_TEXT ENDS 413END 414