1; LzFindOpt.asm -- ASM version of GetMatchesSpecN_2() function 2; 2021-07-21: Igor Pavlov : Public domain 3; 4 5ifndef x64 6; x64=1 7; .err <x64_IS_REQUIRED> 8endif 9 10include 7zAsm.asm 11 12MY_ASM_START 13 14_TEXT$LZFINDOPT SEGMENT ALIGN(64) 'CODE' 15 16MY_ALIGN macro num:req 17 align num 18endm 19 20MY_ALIGN_32 macro 21 MY_ALIGN 32 22endm 23 24MY_ALIGN_64 macro 25 MY_ALIGN 64 26endm 27 28 29t0_L equ x0_L 30t0_x equ x0 31t0 equ r0 32t1_x equ x3 33t1 equ r3 34 35cp_x equ t1_x 36cp_r equ t1 37m equ x5 38m_r equ r5 39len_x equ x6 40len equ r6 41diff_x equ x7 42diff equ r7 43len0 equ r10 44len1_x equ x11 45len1 equ r11 46maxLen_x equ x12 47maxLen equ r12 48d equ r13 49ptr0 equ r14 50ptr1 equ r15 51 52d_lim equ m_r 53cycSize equ len_x 54hash_lim equ len0 55delta1_x equ len1_x 56delta1_r equ len1 57delta_x equ maxLen_x 58delta_r equ maxLen 59hash equ ptr0 60src equ ptr1 61 62 63 64if (IS_LINUX gt 0) 65 66; r1 r2 r8 r9 : win32 67; r7 r6 r2 r1 r8 r9 : linux 68 69lenLimit equ r8 70lenLimit_x equ x8 71; pos_r equ r2 72pos equ x2 73cur equ r1 74son equ r9 75 76else 77 78lenLimit equ REG_ABI_PARAM_2 79lenLimit_x equ REG_ABI_PARAM_2_x 80pos equ REG_ABI_PARAM_1_x 81cur equ REG_ABI_PARAM_0 82son equ REG_ABI_PARAM_3 83 84endif 85 86 87if (IS_LINUX gt 0) 88 maxLen_OFFS equ (REG_SIZE * (6 + 1)) 89else 90 cutValue_OFFS equ (REG_SIZE * (8 + 1 + 4)) 91 d_OFFS equ (REG_SIZE + cutValue_OFFS) 92 maxLen_OFFS equ (REG_SIZE + d_OFFS) 93endif 94 hash_OFFS equ (REG_SIZE + maxLen_OFFS) 95 limit_OFFS equ (REG_SIZE + hash_OFFS) 96 size_OFFS equ (REG_SIZE + limit_OFFS) 97 cycPos_OFFS equ (REG_SIZE + size_OFFS) 98 cycSize_OFFS equ (REG_SIZE + cycPos_OFFS) 99 posRes_OFFS equ (REG_SIZE + cycSize_OFFS) 100 101if (IS_LINUX gt 0) 102else 103 cutValue_PAR equ [r0 + cutValue_OFFS] 104 d_PAR equ [r0 + d_OFFS] 105endif 106 maxLen_PAR equ [r0 + maxLen_OFFS] 107 hash_PAR equ [r0 + hash_OFFS] 108 limit_PAR equ [r0 + limit_OFFS] 109 size_PAR equ [r0 + size_OFFS] 110 cycPos_PAR equ [r0 + cycPos_OFFS] 111 cycSize_PAR equ [r0 + cycSize_OFFS] 112 posRes_PAR equ [r0 + posRes_OFFS] 113 114 115 cutValue_VAR equ DWORD PTR [r4 + 8 * 0] 116 cutValueCur_VAR equ DWORD PTR [r4 + 8 * 0 + 4] 117 cycPos_VAR equ DWORD PTR [r4 + 8 * 1 + 0] 118 cycSize_VAR equ DWORD PTR [r4 + 8 * 1 + 4] 119 hash_VAR equ QWORD PTR [r4 + 8 * 2] 120 limit_VAR equ QWORD PTR [r4 + 8 * 3] 121 size_VAR equ QWORD PTR [r4 + 8 * 4] 122 distances equ QWORD PTR [r4 + 8 * 5] 123 maxLen_VAR equ QWORD PTR [r4 + 8 * 6] 124 125 Old_RSP equ QWORD PTR [r4 + 8 * 7] 126 LOCAL_SIZE equ 8 * 8 127 128COPY_VAR_32 macro dest_var, src_var 129 mov x3, src_var 130 mov dest_var, x3 131endm 132 133COPY_VAR_64 macro dest_var, src_var 134 mov r3, src_var 135 mov dest_var, r3 136endm 137 138 139; MY_ALIGN_64 140MY_PROC GetMatchesSpecN_2, 13 141MY_PUSH_PRESERVED_ABI_REGS 142 mov r0, RSP 143 lea r3, [r0 - LOCAL_SIZE] 144 and r3, -64 145 mov RSP, r3 146 mov Old_RSP, r0 147 148if (IS_LINUX gt 0) 149 mov d, REG_ABI_PARAM_5 ; r13 = r9 150 mov cutValue_VAR, REG_ABI_PARAM_4_x ; = r8 151 mov son, REG_ABI_PARAM_3 ; r9 = r1 152 mov r8, REG_ABI_PARAM_2 ; r8 = r2 153 mov pos, REG_ABI_PARAM_1_x ; r2 = x6 154 mov r1, REG_ABI_PARAM_0 ; r1 = r7 155else 156 COPY_VAR_32 cutValue_VAR, cutValue_PAR 157 mov d, d_PAR 158endif 159 160 COPY_VAR_64 limit_VAR, limit_PAR 161 162 mov hash_lim, size_PAR 163 mov size_VAR, hash_lim 164 165 mov cp_x, cycPos_PAR 166 mov hash, hash_PAR 167 168 mov cycSize, cycSize_PAR 169 mov cycSize_VAR, cycSize 170 171 ; we want cur in (rcx). So we change the cur and lenLimit variables 172 sub lenLimit, cur 173 neg lenLimit_x 174 inc lenLimit_x 175 176 mov t0_x, maxLen_PAR 177 sub t0, lenLimit 178 mov maxLen_VAR, t0 179 180 jmp main_loop 181 182MY_ALIGN_64 183fill_empty: 184 ; ptr0 = *ptr1 = kEmptyHashValue; 185 mov QWORD PTR [ptr1], 0 186 inc pos 187 inc cp_x 188 mov DWORD PTR [d - 4], 0 189 cmp d, limit_VAR 190 jae fin 191 cmp hash, hash_lim 192 je fin 193 194; MY_ALIGN_64 195main_loop: 196 ; UInt32 delta = *hash++; 197 mov diff_x, [hash] ; delta 198 add hash, 4 199 ; mov cycPos_VAR, cp_x 200 201 inc cur 202 add d, 4 203 mov m, pos 204 sub m, diff_x; ; matchPos 205 206 ; CLzRef *ptr1 = son + ((size_t)(pos) << 1) - CYC_TO_POS_OFFSET * 2; 207 lea ptr1, [son + 8 * cp_r] 208 ; mov cycSize, cycSize_VAR 209 cmp pos, cycSize 210 jb directMode ; if (pos < cycSize_VAR) 211 212 ; CYC MODE 213 214 cmp diff_x, cycSize 215 jae fill_empty ; if (delta >= cycSize_VAR) 216 217 xor t0_x, t0_x 218 mov cycPos_VAR, cp_x 219 sub cp_x, diff_x 220 ; jae prepare_for_tree_loop 221 ; add cp_x, cycSize 222 cmovb t0_x, cycSize 223 add cp_x, t0_x ; cp_x += (cycPos < delta ? cycSize : 0) 224 jmp prepare_for_tree_loop 225 226 227directMode: 228 cmp diff_x, pos 229 je fill_empty ; if (delta == pos) 230 jae fin_error ; if (delta >= pos) 231 232 mov cycPos_VAR, cp_x 233 mov cp_x, m 234 235prepare_for_tree_loop: 236 mov len0, lenLimit 237 mov hash_VAR, hash 238 ; CLzRef *ptr0 = son + ((size_t)(pos) << 1) - CYC_TO_POS_OFFSET * 2 + 1; 239 lea ptr0, [ptr1 + 4] 240 ; UInt32 *_distances = ++d; 241 mov distances, d 242 243 neg len0 244 mov len1, len0 245 246 mov t0_x, cutValue_VAR 247 mov maxLen, maxLen_VAR 248 mov cutValueCur_VAR, t0_x 249 250MY_ALIGN_32 251tree_loop: 252 neg diff 253 mov len, len0 254 cmp len1, len0 255 cmovb len, len1 ; len = (len1 < len0 ? len1 : len0); 256 add diff, cur 257 258 mov t0_x, [son + cp_r * 8] ; prefetch 259 movzx t0_x, BYTE PTR [diff + 1 * len] 260 lea cp_r, [son + cp_r * 8] 261 cmp [cur + 1 * len], t0_L 262 je matched_1 263 264 jb left_0 265 266 mov [ptr1], m 267 mov m, [cp_r + 4] 268 lea ptr1, [cp_r + 4] 269 sub diff, cur ; FIX32 270 jmp next_node 271 272MY_ALIGN_32 273left_0: 274 mov [ptr0], m 275 mov m, [cp_r] 276 mov ptr0, cp_r 277 sub diff, cur ; FIX32 278 ; jmp next_node 279 280; ------------ NEXT NODE ------------ 281; MY_ALIGN_32 282next_node: 283 mov cycSize, cycSize_VAR 284 dec cutValueCur_VAR 285 je finish_tree 286 287 add diff_x, pos ; prev_match = pos + diff 288 cmp m, diff_x 289 jae fin_error ; if (new_match >= prev_match) 290 291 mov diff_x, pos 292 sub diff_x, m ; delta = pos - new_match 293 cmp pos, cycSize 294 jae cyc_mode_2 ; if (pos >= cycSize) 295 296 mov cp_x, m 297 test m, m 298 jne tree_loop ; if (m != 0) 299 300finish_tree: 301 ; ptr0 = *ptr1 = kEmptyHashValue; 302 mov DWORD PTR [ptr0], 0 303 mov DWORD PTR [ptr1], 0 304 305 inc pos 306 307 ; _distances[-1] = (UInt32)(d - _distances); 308 mov t0, distances 309 mov t1, d 310 sub t1, t0 311 shr t1_x, 2 312 mov [t0 - 4], t1_x 313 314 cmp d, limit_VAR 315 jae fin ; if (d >= limit) 316 317 mov cp_x, cycPos_VAR 318 mov hash, hash_VAR 319 mov hash_lim, size_VAR 320 inc cp_x 321 cmp hash, hash_lim 322 jne main_loop ; if (hash != size) 323 jmp fin 324 325 326MY_ALIGN_32 327cyc_mode_2: 328 cmp diff_x, cycSize 329 jae finish_tree ; if (delta >= cycSize) 330 331 mov cp_x, cycPos_VAR 332 xor t0_x, t0_x 333 sub cp_x, diff_x ; cp_x = cycPos - delta 334 cmovb t0_x, cycSize 335 add cp_x, t0_x ; cp_x += (cycPos < delta ? cycSize : 0) 336 jmp tree_loop 337 338 339MY_ALIGN_32 340matched_1: 341 342 inc len 343 ; cmp len_x, lenLimit_x 344 je short lenLimit_reach 345 movzx t0_x, BYTE PTR [diff + 1 * len] 346 cmp [cur + 1 * len], t0_L 347 jne mismatch 348 349 350MY_ALIGN_32 351match_loop: 352 ; while (++len != lenLimit) (len[diff] != len[0]) ; 353 354 inc len 355 ; cmp len_x, lenLimit_x 356 je short lenLimit_reach 357 movzx t0_x, BYTE PTR [diff + 1 * len] 358 cmp BYTE PTR [cur + 1 * len], t0_L 359 je match_loop 360 361mismatch: 362 jb left_2 363 364 mov [ptr1], m 365 mov m, [cp_r + 4] 366 lea ptr1, [cp_r + 4] 367 mov len1, len 368 369 jmp max_update 370 371MY_ALIGN_32 372left_2: 373 mov [ptr0], m 374 mov m, [cp_r] 375 mov ptr0, cp_r 376 mov len0, len 377 378max_update: 379 sub diff, cur ; restore diff 380 381 cmp maxLen, len 382 jae next_node 383 384 mov maxLen, len 385 add len, lenLimit 386 mov [d], len_x 387 mov t0_x, diff_x 388 not t0_x 389 mov [d + 4], t0_x 390 add d, 8 391 392 jmp next_node 393 394 395 396MY_ALIGN_32 397lenLimit_reach: 398 399 mov delta_r, cur 400 sub delta_r, diff 401 lea delta1_r, [delta_r - 1] 402 403 mov t0_x, [cp_r] 404 mov [ptr1], t0_x 405 mov t0_x, [cp_r + 4] 406 mov [ptr0], t0_x 407 408 mov [d], lenLimit_x 409 mov [d + 4], delta1_x 410 add d, 8 411 412 ; _distances[-1] = (UInt32)(d - _distances); 413 mov t0, distances 414 mov t1, d 415 sub t1, t0 416 shr t1_x, 2 417 mov [t0 - 4], t1_x 418 419 mov hash, hash_VAR 420 mov hash_lim, size_VAR 421 422 inc pos 423 mov cp_x, cycPos_VAR 424 inc cp_x 425 426 mov d_lim, limit_VAR 427 mov cycSize, cycSize_VAR 428 ; if (hash == size || *hash != delta || lenLimit[diff] != lenLimit[0] || d >= limit) 429 ; break; 430 cmp hash, hash_lim 431 je fin 432 cmp d, d_lim 433 jae fin 434 cmp delta_x, [hash] 435 jne main_loop 436 movzx t0_x, BYTE PTR [diff] 437 cmp [cur], t0_L 438 jne main_loop 439 440 ; jmp main_loop ; bypass for debug 441 442 mov cycPos_VAR, cp_x 443 shl len, 3 ; cycSize * 8 444 sub diff, cur ; restore diff 445 xor t0_x, t0_x 446 cmp cp_x, delta_x ; cmp (cycPos_VAR, delta) 447 lea cp_r, [son + 8 * cp_r] ; dest 448 lea src, [cp_r + 8 * diff] 449 cmovb t0, len ; t0 = (cycPos_VAR < delta ? cycSize * 8 : 0) 450 add src, t0 451 add len, son ; len = son + cycSize * 8 452 453 454MY_ALIGN_32 455long_loop: 456 add hash, 4 457 458 ; *(UInt64 *)(void *)ptr = ((const UInt64 *)(const void *)ptr)[diff]; 459 460 mov t0, [src] 461 add src, 8 462 mov [cp_r], t0 463 add cp_r, 8 464 cmp src, len 465 cmove src, son ; if end of (son) buffer is reached, we wrap to begin 466 467 mov DWORD PTR [d], 2 468 mov [d + 4], lenLimit_x 469 mov [d + 8], delta1_x 470 add d, 12 471 472 inc cur 473 474 cmp hash, hash_lim 475 je long_footer 476 cmp delta_x, [hash] 477 jne long_footer 478 movzx t0_x, BYTE PTR [diff + 1 * cur] 479 cmp [cur], t0_L 480 jne long_footer 481 cmp d, d_lim 482 jb long_loop 483 484long_footer: 485 sub cp_r, son 486 shr cp_r, 3 487 add pos, cp_x 488 sub pos, cycPos_VAR 489 mov cycSize, cycSize_VAR 490 491 cmp d, d_lim 492 jae fin 493 cmp hash, hash_lim 494 jne main_loop 495 jmp fin 496 497 498 499fin_error: 500 xor d, d 501 502fin: 503 mov RSP, Old_RSP 504 mov t0, [r4 + posRes_OFFS] 505 mov [t0], pos 506 mov r0, d 507 508MY_POP_PRESERVED_ABI_REGS 509MY_ENDP 510 511_TEXT$LZFINDOPT ENDS 512 513end 514