1/* 2 * Copyright (c) Meta Platforms, Inc. and affiliates. 3 * All rights reserved. 4 * 5 * This source code is licensed under both the BSD-style license (found in the 6 * LICENSE file in the root directory of this source tree) and the GPLv2 (found 7 * in the COPYING file in the root directory of this source tree). 8 * You may select, at your option, one of the above-listed licenses. 9 */ 10 11#include "../common/portability_macros.h" 12 13#if defined(__ELF__) && defined(__GNUC__) 14/* Stack marking 15 * ref: https://wiki.gentoo.org/wiki/Hardened/GNU_stack_quickstart 16 */ 17.section .note.GNU-stack,"",%progbits 18 19#if defined(__aarch64__) 20/* Mark that this assembly supports BTI & PAC, because it is empty for aarch64. 21 * See: https://github.com/facebook/zstd/issues/3841 22 * See: https://gcc.godbolt.org/z/sqr5T4ffK 23 * See: https://lore.kernel.org/linux-arm-kernel/20200429211641.9279-8-broonie@kernel.org/ 24 * See: https://reviews.llvm.org/D62609 25 */ 26.pushsection .note.gnu.property, "a" 27.p2align 3 28.long 4 /* size of the name - "GNU\0" */ 29.long 0x10 /* size of descriptor */ 30.long 0x5 /* NT_GNU_PROPERTY_TYPE_0 */ 31.asciz "GNU" 32.long 0xc0000000 /* pr_type - GNU_PROPERTY_AARCH64_FEATURE_1_AND */ 33.long 4 /* pr_datasz - 4 bytes */ 34.long 3 /* pr_data - GNU_PROPERTY_AARCH64_FEATURE_1_BTI | GNU_PROPERTY_AARCH64_FEATURE_1_PAC */ 35.p2align 3 /* pr_padding - bring everything to 8 byte alignment */ 36.popsection 37#endif 38 39#endif 40 41#if ZSTD_ENABLE_ASM_X86_64_BMI2 42 43/* Calling convention: 44 * 45 * %rdi (or %rcx on Windows) contains the first argument: HUF_DecompressAsmArgs*. 46 * %rbp isn't maintained (no frame pointer). 47 * %rsp contains the stack pointer that grows down. 48 * No red-zone is assumed, only addresses >= %rsp are used. 49 * All register contents are preserved. 50 */ 51 52ZSTD_HIDE_ASM_FUNCTION(HUF_decompress4X1_usingDTable_internal_fast_asm_loop) 53ZSTD_HIDE_ASM_FUNCTION(HUF_decompress4X2_usingDTable_internal_fast_asm_loop) 54ZSTD_HIDE_ASM_FUNCTION(_HUF_decompress4X2_usingDTable_internal_fast_asm_loop) 55ZSTD_HIDE_ASM_FUNCTION(_HUF_decompress4X1_usingDTable_internal_fast_asm_loop) 56.global HUF_decompress4X1_usingDTable_internal_fast_asm_loop 57.global HUF_decompress4X2_usingDTable_internal_fast_asm_loop 58.global _HUF_decompress4X1_usingDTable_internal_fast_asm_loop 59.global _HUF_decompress4X2_usingDTable_internal_fast_asm_loop 60.text 61 62/* Sets up register mappings for clarity. 63 * op[], bits[], dtable & ip[0] each get their own register. 64 * ip[1,2,3] & olimit alias var[]. 65 * %rax is a scratch register. 66 */ 67 68#define op0 rsi 69#define op1 rbx 70#define op2 rcx 71#define op3 rdi 72 73#define ip0 r8 74#define ip1 r9 75#define ip2 r10 76#define ip3 r11 77 78#define bits0 rbp 79#define bits1 rdx 80#define bits2 r12 81#define bits3 r13 82#define dtable r14 83#define olimit r15 84 85/* var[] aliases ip[1,2,3] & olimit 86 * ip[1,2,3] are saved every iteration. 87 * olimit is only used in compute_olimit. 88 */ 89#define var0 r15 90#define var1 r9 91#define var2 r10 92#define var3 r11 93 94/* 32-bit var registers */ 95#define vard0 r15d 96#define vard1 r9d 97#define vard2 r10d 98#define vard3 r11d 99 100/* Calls X(N) for each stream 0, 1, 2, 3. */ 101#define FOR_EACH_STREAM(X) \ 102 X(0); \ 103 X(1); \ 104 X(2); \ 105 X(3) 106 107/* Calls X(N, idx) for each stream 0, 1, 2, 3. */ 108#define FOR_EACH_STREAM_WITH_INDEX(X, idx) \ 109 X(0, idx); \ 110 X(1, idx); \ 111 X(2, idx); \ 112 X(3, idx) 113 114/* Define both _HUF_* & HUF_* symbols because MacOS 115 * C symbols are prefixed with '_' & Linux symbols aren't. 116 */ 117_HUF_decompress4X1_usingDTable_internal_fast_asm_loop: 118HUF_decompress4X1_usingDTable_internal_fast_asm_loop: 119 ZSTD_CET_ENDBRANCH 120 /* Save all registers - even if they are callee saved for simplicity. */ 121 push %rax 122 push %rbx 123 push %rcx 124 push %rdx 125 push %rbp 126 push %rsi 127 push %rdi 128 push %r8 129 push %r9 130 push %r10 131 push %r11 132 push %r12 133 push %r13 134 push %r14 135 push %r15 136 137 /* Read HUF_DecompressAsmArgs* args from %rax */ 138#if defined(_WIN32) 139 movq %rcx, %rax 140#else 141 movq %rdi, %rax 142#endif 143 movq 0(%rax), %ip0 144 movq 8(%rax), %ip1 145 movq 16(%rax), %ip2 146 movq 24(%rax), %ip3 147 movq 32(%rax), %op0 148 movq 40(%rax), %op1 149 movq 48(%rax), %op2 150 movq 56(%rax), %op3 151 movq 64(%rax), %bits0 152 movq 72(%rax), %bits1 153 movq 80(%rax), %bits2 154 movq 88(%rax), %bits3 155 movq 96(%rax), %dtable 156 push %rax /* argument */ 157 push 104(%rax) /* ilowest */ 158 push 112(%rax) /* oend */ 159 push %olimit /* olimit space */ 160 161 subq $24, %rsp 162 163.L_4X1_compute_olimit: 164 /* Computes how many iterations we can do safely 165 * %r15, %rax may be clobbered 166 * rbx, rdx must be saved 167 * op3 & ip0 mustn't be clobbered 168 */ 169 movq %rbx, 0(%rsp) 170 movq %rdx, 8(%rsp) 171 172 movq 32(%rsp), %rax /* rax = oend */ 173 subq %op3, %rax /* rax = oend - op3 */ 174 175 /* r15 = (oend - op3) / 5 */ 176 movabsq $-3689348814741910323, %rdx 177 mulq %rdx 178 movq %rdx, %r15 179 shrq $2, %r15 180 181 movq %ip0, %rax /* rax = ip0 */ 182 movq 40(%rsp), %rdx /* rdx = ilowest */ 183 subq %rdx, %rax /* rax = ip0 - ilowest */ 184 movq %rax, %rbx /* rbx = ip0 - ilowest */ 185 186 /* rdx = (ip0 - ilowest) / 7 */ 187 movabsq $2635249153387078803, %rdx 188 mulq %rdx 189 subq %rdx, %rbx 190 shrq %rbx 191 addq %rbx, %rdx 192 shrq $2, %rdx 193 194 /* r15 = min(%rdx, %r15) */ 195 cmpq %rdx, %r15 196 cmova %rdx, %r15 197 198 /* r15 = r15 * 5 */ 199 leaq (%r15, %r15, 4), %r15 200 201 /* olimit = op3 + r15 */ 202 addq %op3, %olimit 203 204 movq 8(%rsp), %rdx 205 movq 0(%rsp), %rbx 206 207 /* If (op3 + 20 > olimit) */ 208 movq %op3, %rax /* rax = op3 */ 209 cmpq %rax, %olimit /* op3 == olimit */ 210 je .L_4X1_exit 211 212 /* If (ip1 < ip0) go to exit */ 213 cmpq %ip0, %ip1 214 jb .L_4X1_exit 215 216 /* If (ip2 < ip1) go to exit */ 217 cmpq %ip1, %ip2 218 jb .L_4X1_exit 219 220 /* If (ip3 < ip2) go to exit */ 221 cmpq %ip2, %ip3 222 jb .L_4X1_exit 223 224/* Reads top 11 bits from bits[n] 225 * Loads dt[bits[n]] into var[n] 226 */ 227#define GET_NEXT_DELT(n) \ 228 movq $53, %var##n; \ 229 shrxq %var##n, %bits##n, %var##n; \ 230 movzwl (%dtable,%var##n,2),%vard##n 231 232/* var[n] must contain the DTable entry computed with GET_NEXT_DELT 233 * Moves var[n] to %rax 234 * bits[n] <<= var[n] & 63 235 * op[n][idx] = %rax >> 8 236 * %ah is a way to access bits [8, 16) of %rax 237 */ 238#define DECODE_FROM_DELT(n, idx) \ 239 movq %var##n, %rax; \ 240 shlxq %var##n, %bits##n, %bits##n; \ 241 movb %ah, idx(%op##n) 242 243/* Assumes GET_NEXT_DELT has been called. 244 * Calls DECODE_FROM_DELT then GET_NEXT_DELT 245 */ 246#define DECODE_AND_GET_NEXT(n, idx) \ 247 DECODE_FROM_DELT(n, idx); \ 248 GET_NEXT_DELT(n) \ 249 250/* // ctz & nbBytes is stored in bits[n] 251 * // nbBits is stored in %rax 252 * ctz = CTZ[bits[n]] 253 * nbBits = ctz & 7 254 * nbBytes = ctz >> 3 255 * op[n] += 5 256 * ip[n] -= nbBytes 257 * // Note: x86-64 is little-endian ==> no bswap 258 * bits[n] = MEM_readST(ip[n]) | 1 259 * bits[n] <<= nbBits 260 */ 261#define RELOAD_BITS(n) \ 262 bsfq %bits##n, %bits##n; \ 263 movq %bits##n, %rax; \ 264 andq $7, %rax; \ 265 shrq $3, %bits##n; \ 266 leaq 5(%op##n), %op##n; \ 267 subq %bits##n, %ip##n; \ 268 movq (%ip##n), %bits##n; \ 269 orq $1, %bits##n; \ 270 shlx %rax, %bits##n, %bits##n 271 272 /* Store clobbered variables on the stack */ 273 movq %olimit, 24(%rsp) 274 movq %ip1, 0(%rsp) 275 movq %ip2, 8(%rsp) 276 movq %ip3, 16(%rsp) 277 278 /* Call GET_NEXT_DELT for each stream */ 279 FOR_EACH_STREAM(GET_NEXT_DELT) 280 281 .p2align 6 282 283.L_4X1_loop_body: 284 /* Decode 5 symbols in each of the 4 streams (20 total) 285 * Must have called GET_NEXT_DELT for each stream 286 */ 287 FOR_EACH_STREAM_WITH_INDEX(DECODE_AND_GET_NEXT, 0) 288 FOR_EACH_STREAM_WITH_INDEX(DECODE_AND_GET_NEXT, 1) 289 FOR_EACH_STREAM_WITH_INDEX(DECODE_AND_GET_NEXT, 2) 290 FOR_EACH_STREAM_WITH_INDEX(DECODE_AND_GET_NEXT, 3) 291 FOR_EACH_STREAM_WITH_INDEX(DECODE_FROM_DELT, 4) 292 293 /* Load ip[1,2,3] from stack (var[] aliases them) 294 * ip[] is needed for RELOAD_BITS 295 * Each will be stored back to the stack after RELOAD 296 */ 297 movq 0(%rsp), %ip1 298 movq 8(%rsp), %ip2 299 movq 16(%rsp), %ip3 300 301 /* Reload each stream & fetch the next table entry 302 * to prepare for the next iteration 303 */ 304 RELOAD_BITS(0) 305 GET_NEXT_DELT(0) 306 307 RELOAD_BITS(1) 308 movq %ip1, 0(%rsp) 309 GET_NEXT_DELT(1) 310 311 RELOAD_BITS(2) 312 movq %ip2, 8(%rsp) 313 GET_NEXT_DELT(2) 314 315 RELOAD_BITS(3) 316 movq %ip3, 16(%rsp) 317 GET_NEXT_DELT(3) 318 319 /* If op3 < olimit: continue the loop */ 320 cmp %op3, 24(%rsp) 321 ja .L_4X1_loop_body 322 323 /* Reload ip[1,2,3] from stack */ 324 movq 0(%rsp), %ip1 325 movq 8(%rsp), %ip2 326 movq 16(%rsp), %ip3 327 328 /* Re-compute olimit */ 329 jmp .L_4X1_compute_olimit 330 331#undef GET_NEXT_DELT 332#undef DECODE_FROM_DELT 333#undef DECODE 334#undef RELOAD_BITS 335.L_4X1_exit: 336 addq $24, %rsp 337 338 /* Restore stack (oend & olimit) */ 339 pop %rax /* olimit */ 340 pop %rax /* oend */ 341 pop %rax /* ilowest */ 342 pop %rax /* arg */ 343 344 /* Save ip / op / bits */ 345 movq %ip0, 0(%rax) 346 movq %ip1, 8(%rax) 347 movq %ip2, 16(%rax) 348 movq %ip3, 24(%rax) 349 movq %op0, 32(%rax) 350 movq %op1, 40(%rax) 351 movq %op2, 48(%rax) 352 movq %op3, 56(%rax) 353 movq %bits0, 64(%rax) 354 movq %bits1, 72(%rax) 355 movq %bits2, 80(%rax) 356 movq %bits3, 88(%rax) 357 358 /* Restore registers */ 359 pop %r15 360 pop %r14 361 pop %r13 362 pop %r12 363 pop %r11 364 pop %r10 365 pop %r9 366 pop %r8 367 pop %rdi 368 pop %rsi 369 pop %rbp 370 pop %rdx 371 pop %rcx 372 pop %rbx 373 pop %rax 374 ret 375 376_HUF_decompress4X2_usingDTable_internal_fast_asm_loop: 377HUF_decompress4X2_usingDTable_internal_fast_asm_loop: 378 ZSTD_CET_ENDBRANCH 379 /* Save all registers - even if they are callee saved for simplicity. */ 380 push %rax 381 push %rbx 382 push %rcx 383 push %rdx 384 push %rbp 385 push %rsi 386 push %rdi 387 push %r8 388 push %r9 389 push %r10 390 push %r11 391 push %r12 392 push %r13 393 push %r14 394 push %r15 395 396 /* Read HUF_DecompressAsmArgs* args from %rax */ 397#if defined(_WIN32) 398 movq %rcx, %rax 399#else 400 movq %rdi, %rax 401#endif 402 movq 0(%rax), %ip0 403 movq 8(%rax), %ip1 404 movq 16(%rax), %ip2 405 movq 24(%rax), %ip3 406 movq 32(%rax), %op0 407 movq 40(%rax), %op1 408 movq 48(%rax), %op2 409 movq 56(%rax), %op3 410 movq 64(%rax), %bits0 411 movq 72(%rax), %bits1 412 movq 80(%rax), %bits2 413 movq 88(%rax), %bits3 414 movq 96(%rax), %dtable 415 push %rax /* argument */ 416 push %rax /* olimit */ 417 push 104(%rax) /* ilowest */ 418 419 movq 112(%rax), %rax 420 push %rax /* oend3 */ 421 422 movq %op3, %rax 423 push %rax /* oend2 */ 424 425 movq %op2, %rax 426 push %rax /* oend1 */ 427 428 movq %op1, %rax 429 push %rax /* oend0 */ 430 431 /* Scratch space */ 432 subq $8, %rsp 433 434.L_4X2_compute_olimit: 435 /* Computes how many iterations we can do safely 436 * %r15, %rax may be clobbered 437 * rdx must be saved 438 * op[1,2,3,4] & ip0 mustn't be clobbered 439 */ 440 movq %rdx, 0(%rsp) 441 442 /* We can consume up to 7 input bytes each iteration. */ 443 movq %ip0, %rax /* rax = ip0 */ 444 movq 40(%rsp), %rdx /* rdx = ilowest */ 445 subq %rdx, %rax /* rax = ip0 - ilowest */ 446 movq %rax, %r15 /* r15 = ip0 - ilowest */ 447 448 /* rdx = rax / 7 */ 449 movabsq $2635249153387078803, %rdx 450 mulq %rdx 451 subq %rdx, %r15 452 shrq %r15 453 addq %r15, %rdx 454 shrq $2, %rdx 455 456 /* r15 = (ip0 - ilowest) / 7 */ 457 movq %rdx, %r15 458 459 /* r15 = min(r15, min(oend0 - op0, oend1 - op1, oend2 - op2, oend3 - op3) / 10) */ 460 movq 8(%rsp), %rax /* rax = oend0 */ 461 subq %op0, %rax /* rax = oend0 - op0 */ 462 movq 16(%rsp), %rdx /* rdx = oend1 */ 463 subq %op1, %rdx /* rdx = oend1 - op1 */ 464 465 cmpq %rax, %rdx 466 cmova %rax, %rdx /* rdx = min(%rdx, %rax) */ 467 468 movq 24(%rsp), %rax /* rax = oend2 */ 469 subq %op2, %rax /* rax = oend2 - op2 */ 470 471 cmpq %rax, %rdx 472 cmova %rax, %rdx /* rdx = min(%rdx, %rax) */ 473 474 movq 32(%rsp), %rax /* rax = oend3 */ 475 subq %op3, %rax /* rax = oend3 - op3 */ 476 477 cmpq %rax, %rdx 478 cmova %rax, %rdx /* rdx = min(%rdx, %rax) */ 479 480 movabsq $-3689348814741910323, %rax 481 mulq %rdx 482 shrq $3, %rdx /* rdx = rdx / 10 */ 483 484 /* r15 = min(%rdx, %r15) */ 485 cmpq %rdx, %r15 486 cmova %rdx, %r15 487 488 /* olimit = op3 + 5 * r15 */ 489 movq %r15, %rax 490 leaq (%op3, %rax, 4), %olimit 491 addq %rax, %olimit 492 493 movq 0(%rsp), %rdx 494 495 /* If (op3 + 10 > olimit) */ 496 movq %op3, %rax /* rax = op3 */ 497 cmpq %rax, %olimit /* op3 == olimit */ 498 je .L_4X2_exit 499 500 /* If (ip1 < ip0) go to exit */ 501 cmpq %ip0, %ip1 502 jb .L_4X2_exit 503 504 /* If (ip2 < ip1) go to exit */ 505 cmpq %ip1, %ip2 506 jb .L_4X2_exit 507 508 /* If (ip3 < ip2) go to exit */ 509 cmpq %ip2, %ip3 510 jb .L_4X2_exit 511 512#define DECODE(n, idx) \ 513 movq %bits##n, %rax; \ 514 shrq $53, %rax; \ 515 movzwl 0(%dtable,%rax,4),%r8d; \ 516 movzbl 2(%dtable,%rax,4),%r15d; \ 517 movzbl 3(%dtable,%rax,4),%eax; \ 518 movw %r8w, (%op##n); \ 519 shlxq %r15, %bits##n, %bits##n; \ 520 addq %rax, %op##n 521 522#define RELOAD_BITS(n) \ 523 bsfq %bits##n, %bits##n; \ 524 movq %bits##n, %rax; \ 525 shrq $3, %bits##n; \ 526 andq $7, %rax; \ 527 subq %bits##n, %ip##n; \ 528 movq (%ip##n), %bits##n; \ 529 orq $1, %bits##n; \ 530 shlxq %rax, %bits##n, %bits##n 531 532 533 movq %olimit, 48(%rsp) 534 535 .p2align 6 536 537.L_4X2_loop_body: 538 /* We clobber r8, so store it on the stack */ 539 movq %r8, 0(%rsp) 540 541 /* Decode 5 symbols from each of the 4 streams (20 symbols total). */ 542 FOR_EACH_STREAM_WITH_INDEX(DECODE, 0) 543 FOR_EACH_STREAM_WITH_INDEX(DECODE, 1) 544 FOR_EACH_STREAM_WITH_INDEX(DECODE, 2) 545 FOR_EACH_STREAM_WITH_INDEX(DECODE, 3) 546 FOR_EACH_STREAM_WITH_INDEX(DECODE, 4) 547 548 /* Reload r8 */ 549 movq 0(%rsp), %r8 550 551 FOR_EACH_STREAM(RELOAD_BITS) 552 553 cmp %op3, 48(%rsp) 554 ja .L_4X2_loop_body 555 jmp .L_4X2_compute_olimit 556 557#undef DECODE 558#undef RELOAD_BITS 559.L_4X2_exit: 560 addq $8, %rsp 561 /* Restore stack (oend & olimit) */ 562 pop %rax /* oend0 */ 563 pop %rax /* oend1 */ 564 pop %rax /* oend2 */ 565 pop %rax /* oend3 */ 566 pop %rax /* ilowest */ 567 pop %rax /* olimit */ 568 pop %rax /* arg */ 569 570 /* Save ip / op / bits */ 571 movq %ip0, 0(%rax) 572 movq %ip1, 8(%rax) 573 movq %ip2, 16(%rax) 574 movq %ip3, 24(%rax) 575 movq %op0, 32(%rax) 576 movq %op1, 40(%rax) 577 movq %op2, 48(%rax) 578 movq %op3, 56(%rax) 579 movq %bits0, 64(%rax) 580 movq %bits1, 72(%rax) 581 movq %bits2, 80(%rax) 582 movq %bits3, 88(%rax) 583 584 /* Restore registers */ 585 pop %r15 586 pop %r14 587 pop %r13 588 pop %r12 589 pop %r11 590 pop %r10 591 pop %r9 592 pop %r8 593 pop %rdi 594 pop %rsi 595 pop %rbp 596 pop %rdx 597 pop %rcx 598 pop %rbx 599 pop %rax 600 ret 601 602#endif 603