• Home
  • Line#
  • Scopes#
  • Navigate#
  • Raw
  • Download
1/*
2 * Copyright (c) Meta Platforms, Inc. and affiliates.
3 * All rights reserved.
4 *
5 * This source code is licensed under both the BSD-style license (found in the
6 * LICENSE file in the root directory of this source tree) and the GPLv2 (found
7 * in the COPYING file in the root directory of this source tree).
8 * You may select, at your option, one of the above-listed licenses.
9 */
10
11#include "../common/portability_macros.h"
12
13#if defined(__ELF__) && defined(__GNUC__)
14/* Stack marking
15 * ref: https://wiki.gentoo.org/wiki/Hardened/GNU_stack_quickstart
16 */
17.section .note.GNU-stack,"",%progbits
18
19#if defined(__aarch64__)
20/* Mark that this assembly supports BTI & PAC, because it is empty for aarch64.
21 * See: https://github.com/facebook/zstd/issues/3841
22 * See: https://gcc.godbolt.org/z/sqr5T4ffK
23 * See: https://lore.kernel.org/linux-arm-kernel/20200429211641.9279-8-broonie@kernel.org/
24 * See: https://reviews.llvm.org/D62609
25 */
26.pushsection .note.gnu.property, "a"
27.p2align 3
28.long 4                 /* size of the name - "GNU\0" */
29.long 0x10              /* size of descriptor */
30.long 0x5               /* NT_GNU_PROPERTY_TYPE_0 */
31.asciz "GNU"
32.long 0xc0000000        /* pr_type - GNU_PROPERTY_AARCH64_FEATURE_1_AND */
33.long 4                 /* pr_datasz - 4 bytes */
34.long 3                 /* pr_data - GNU_PROPERTY_AARCH64_FEATURE_1_BTI | GNU_PROPERTY_AARCH64_FEATURE_1_PAC */
35.p2align 3              /* pr_padding - bring everything to 8 byte alignment */
36.popsection
37#endif
38
39#endif
40
41#if ZSTD_ENABLE_ASM_X86_64_BMI2
42
43/* Calling convention:
44 *
45 * %rdi (or %rcx on Windows) contains the first argument: HUF_DecompressAsmArgs*.
46 * %rbp isn't maintained (no frame pointer).
47 * %rsp contains the stack pointer that grows down.
48 *      No red-zone is assumed, only addresses >= %rsp are used.
49 * All register contents are preserved.
50 */
51
52ZSTD_HIDE_ASM_FUNCTION(HUF_decompress4X1_usingDTable_internal_fast_asm_loop)
53ZSTD_HIDE_ASM_FUNCTION(HUF_decompress4X2_usingDTable_internal_fast_asm_loop)
54ZSTD_HIDE_ASM_FUNCTION(_HUF_decompress4X2_usingDTable_internal_fast_asm_loop)
55ZSTD_HIDE_ASM_FUNCTION(_HUF_decompress4X1_usingDTable_internal_fast_asm_loop)
56.global HUF_decompress4X1_usingDTable_internal_fast_asm_loop
57.global HUF_decompress4X2_usingDTable_internal_fast_asm_loop
58.global _HUF_decompress4X1_usingDTable_internal_fast_asm_loop
59.global _HUF_decompress4X2_usingDTable_internal_fast_asm_loop
60.text
61
62/* Sets up register mappings for clarity.
63 * op[], bits[], dtable & ip[0] each get their own register.
64 * ip[1,2,3] & olimit alias var[].
65 * %rax is a scratch register.
66 */
67
68#define op0    rsi
69#define op1    rbx
70#define op2    rcx
71#define op3    rdi
72
73#define ip0    r8
74#define ip1    r9
75#define ip2    r10
76#define ip3    r11
77
78#define bits0  rbp
79#define bits1  rdx
80#define bits2  r12
81#define bits3  r13
82#define dtable r14
83#define olimit r15
84
85/* var[] aliases ip[1,2,3] & olimit
86 * ip[1,2,3] are saved every iteration.
87 * olimit is only used in compute_olimit.
88 */
89#define var0   r15
90#define var1   r9
91#define var2   r10
92#define var3   r11
93
94/* 32-bit var registers */
95#define vard0  r15d
96#define vard1  r9d
97#define vard2  r10d
98#define vard3  r11d
99
100/* Calls X(N) for each stream 0, 1, 2, 3. */
101#define FOR_EACH_STREAM(X) \
102    X(0);                  \
103    X(1);                  \
104    X(2);                  \
105    X(3)
106
107/* Calls X(N, idx) for each stream 0, 1, 2, 3. */
108#define FOR_EACH_STREAM_WITH_INDEX(X, idx) \
109    X(0, idx);                             \
110    X(1, idx);                             \
111    X(2, idx);                             \
112    X(3, idx)
113
114/* Define both _HUF_* & HUF_* symbols because MacOS
115 * C symbols are prefixed with '_' & Linux symbols aren't.
116 */
117_HUF_decompress4X1_usingDTable_internal_fast_asm_loop:
118HUF_decompress4X1_usingDTable_internal_fast_asm_loop:
119    ZSTD_CET_ENDBRANCH
120    /* Save all registers - even if they are callee saved for simplicity. */
121    push %rax
122    push %rbx
123    push %rcx
124    push %rdx
125    push %rbp
126    push %rsi
127    push %rdi
128    push %r8
129    push %r9
130    push %r10
131    push %r11
132    push %r12
133    push %r13
134    push %r14
135    push %r15
136
137    /* Read HUF_DecompressAsmArgs* args from %rax */
138#if defined(_WIN32)
139    movq %rcx, %rax
140#else
141    movq %rdi, %rax
142#endif
143    movq  0(%rax), %ip0
144    movq  8(%rax), %ip1
145    movq 16(%rax), %ip2
146    movq 24(%rax), %ip3
147    movq 32(%rax), %op0
148    movq 40(%rax), %op1
149    movq 48(%rax), %op2
150    movq 56(%rax), %op3
151    movq 64(%rax), %bits0
152    movq 72(%rax), %bits1
153    movq 80(%rax), %bits2
154    movq 88(%rax), %bits3
155    movq 96(%rax), %dtable
156    push %rax      /* argument */
157    push 104(%rax) /* ilowest */
158    push 112(%rax) /* oend */
159    push %olimit   /* olimit space */
160
161    subq $24, %rsp
162
163.L_4X1_compute_olimit:
164    /* Computes how many iterations we can do safely
165     * %r15, %rax may be clobbered
166     * rbx, rdx must be saved
167     * op3 & ip0 mustn't be clobbered
168     */
169    movq %rbx, 0(%rsp)
170    movq %rdx, 8(%rsp)
171
172    movq 32(%rsp), %rax /* rax = oend */
173    subq %op3,    %rax  /* rax = oend - op3 */
174
175    /* r15 = (oend - op3) / 5 */
176    movabsq $-3689348814741910323, %rdx
177    mulq %rdx
178    movq %rdx, %r15
179    shrq $2, %r15
180
181    movq %ip0,     %rax /* rax = ip0 */
182    movq 40(%rsp), %rdx /* rdx = ilowest */
183    subq %rdx,     %rax /* rax = ip0 - ilowest */
184    movq %rax,     %rbx /* rbx = ip0 - ilowest */
185
186    /* rdx = (ip0 - ilowest) / 7 */
187    movabsq $2635249153387078803, %rdx
188    mulq %rdx
189    subq %rdx, %rbx
190    shrq %rbx
191    addq %rbx, %rdx
192    shrq $2, %rdx
193
194    /* r15 = min(%rdx, %r15) */
195    cmpq %rdx, %r15
196    cmova %rdx, %r15
197
198    /* r15 = r15 * 5 */
199    leaq (%r15, %r15, 4), %r15
200
201    /* olimit = op3 + r15 */
202    addq %op3, %olimit
203
204    movq 8(%rsp), %rdx
205    movq 0(%rsp), %rbx
206
207    /* If (op3 + 20 > olimit) */
208    movq %op3, %rax    /* rax = op3 */
209    cmpq %rax, %olimit /* op3 == olimit */
210    je .L_4X1_exit
211
212    /* If (ip1 < ip0) go to exit */
213    cmpq %ip0, %ip1
214    jb .L_4X1_exit
215
216    /* If (ip2 < ip1) go to exit */
217    cmpq %ip1, %ip2
218    jb .L_4X1_exit
219
220    /* If (ip3 < ip2) go to exit */
221    cmpq %ip2, %ip3
222    jb .L_4X1_exit
223
224/* Reads top 11 bits from bits[n]
225 * Loads dt[bits[n]] into var[n]
226 */
227#define GET_NEXT_DELT(n)                \
228    movq $53, %var##n;                  \
229    shrxq %var##n, %bits##n, %var##n;   \
230    movzwl (%dtable,%var##n,2),%vard##n
231
232/* var[n] must contain the DTable entry computed with GET_NEXT_DELT
233 * Moves var[n] to %rax
234 * bits[n] <<= var[n] & 63
235 * op[n][idx] = %rax >> 8
236 * %ah is a way to access bits [8, 16) of %rax
237 */
238#define DECODE_FROM_DELT(n, idx)       \
239    movq %var##n, %rax;                \
240    shlxq %var##n, %bits##n, %bits##n; \
241    movb %ah, idx(%op##n)
242
243/* Assumes GET_NEXT_DELT has been called.
244 * Calls DECODE_FROM_DELT then GET_NEXT_DELT
245 */
246#define DECODE_AND_GET_NEXT(n, idx) \
247    DECODE_FROM_DELT(n, idx);       \
248    GET_NEXT_DELT(n)                \
249
250/* // ctz & nbBytes is stored in bits[n]
251 * // nbBits is stored in %rax
252 * ctz  = CTZ[bits[n]]
253 * nbBits  = ctz & 7
254 * nbBytes = ctz >> 3
255 * op[n]  += 5
256 * ip[n]  -= nbBytes
257 * // Note: x86-64 is little-endian ==> no bswap
258 * bits[n] = MEM_readST(ip[n]) | 1
259 * bits[n] <<= nbBits
260 */
261#define RELOAD_BITS(n)             \
262    bsfq %bits##n, %bits##n;       \
263    movq %bits##n, %rax;           \
264    andq $7, %rax;                 \
265    shrq $3, %bits##n;             \
266    leaq 5(%op##n), %op##n;        \
267    subq %bits##n, %ip##n;         \
268    movq (%ip##n), %bits##n;       \
269    orq $1, %bits##n;              \
270    shlx %rax, %bits##n, %bits##n
271
272    /* Store clobbered variables on the stack */
273    movq %olimit, 24(%rsp)
274    movq %ip1, 0(%rsp)
275    movq %ip2, 8(%rsp)
276    movq %ip3, 16(%rsp)
277
278    /* Call GET_NEXT_DELT for each stream */
279    FOR_EACH_STREAM(GET_NEXT_DELT)
280
281    .p2align 6
282
283.L_4X1_loop_body:
284    /* Decode 5 symbols in each of the 4 streams (20 total)
285     * Must have called GET_NEXT_DELT for each stream
286     */
287    FOR_EACH_STREAM_WITH_INDEX(DECODE_AND_GET_NEXT, 0)
288    FOR_EACH_STREAM_WITH_INDEX(DECODE_AND_GET_NEXT, 1)
289    FOR_EACH_STREAM_WITH_INDEX(DECODE_AND_GET_NEXT, 2)
290    FOR_EACH_STREAM_WITH_INDEX(DECODE_AND_GET_NEXT, 3)
291    FOR_EACH_STREAM_WITH_INDEX(DECODE_FROM_DELT, 4)
292
293    /* Load ip[1,2,3] from stack (var[] aliases them)
294     * ip[] is needed for RELOAD_BITS
295     * Each will be stored back to the stack after RELOAD
296     */
297    movq 0(%rsp), %ip1
298    movq 8(%rsp), %ip2
299    movq 16(%rsp), %ip3
300
301    /* Reload each stream & fetch the next table entry
302     * to prepare for the next iteration
303     */
304    RELOAD_BITS(0)
305    GET_NEXT_DELT(0)
306
307    RELOAD_BITS(1)
308    movq %ip1, 0(%rsp)
309    GET_NEXT_DELT(1)
310
311    RELOAD_BITS(2)
312    movq %ip2, 8(%rsp)
313    GET_NEXT_DELT(2)
314
315    RELOAD_BITS(3)
316    movq %ip3, 16(%rsp)
317    GET_NEXT_DELT(3)
318
319    /* If op3 < olimit: continue the loop */
320    cmp %op3, 24(%rsp)
321    ja .L_4X1_loop_body
322
323    /* Reload ip[1,2,3] from stack */
324    movq 0(%rsp), %ip1
325    movq 8(%rsp), %ip2
326    movq 16(%rsp), %ip3
327
328    /* Re-compute olimit */
329    jmp .L_4X1_compute_olimit
330
331#undef GET_NEXT_DELT
332#undef DECODE_FROM_DELT
333#undef DECODE
334#undef RELOAD_BITS
335.L_4X1_exit:
336    addq $24, %rsp
337
338    /* Restore stack (oend & olimit) */
339    pop %rax /* olimit */
340    pop %rax /* oend */
341    pop %rax /* ilowest */
342    pop %rax /* arg */
343
344    /* Save ip / op / bits */
345    movq %ip0,  0(%rax)
346    movq %ip1,  8(%rax)
347    movq %ip2, 16(%rax)
348    movq %ip3, 24(%rax)
349    movq %op0, 32(%rax)
350    movq %op1, 40(%rax)
351    movq %op2, 48(%rax)
352    movq %op3, 56(%rax)
353    movq %bits0, 64(%rax)
354    movq %bits1, 72(%rax)
355    movq %bits2, 80(%rax)
356    movq %bits3, 88(%rax)
357
358    /* Restore registers */
359    pop %r15
360    pop %r14
361    pop %r13
362    pop %r12
363    pop %r11
364    pop %r10
365    pop %r9
366    pop %r8
367    pop %rdi
368    pop %rsi
369    pop %rbp
370    pop %rdx
371    pop %rcx
372    pop %rbx
373    pop %rax
374    ret
375
376_HUF_decompress4X2_usingDTable_internal_fast_asm_loop:
377HUF_decompress4X2_usingDTable_internal_fast_asm_loop:
378    ZSTD_CET_ENDBRANCH
379    /* Save all registers - even if they are callee saved for simplicity. */
380    push %rax
381    push %rbx
382    push %rcx
383    push %rdx
384    push %rbp
385    push %rsi
386    push %rdi
387    push %r8
388    push %r9
389    push %r10
390    push %r11
391    push %r12
392    push %r13
393    push %r14
394    push %r15
395
396    /* Read HUF_DecompressAsmArgs* args from %rax */
397#if defined(_WIN32)
398    movq %rcx, %rax
399#else
400    movq %rdi, %rax
401#endif
402    movq  0(%rax), %ip0
403    movq  8(%rax), %ip1
404    movq 16(%rax), %ip2
405    movq 24(%rax), %ip3
406    movq 32(%rax), %op0
407    movq 40(%rax), %op1
408    movq 48(%rax), %op2
409    movq 56(%rax), %op3
410    movq 64(%rax), %bits0
411    movq 72(%rax), %bits1
412    movq 80(%rax), %bits2
413    movq 88(%rax), %bits3
414    movq 96(%rax), %dtable
415    push %rax      /* argument */
416    push %rax      /* olimit */
417    push 104(%rax) /* ilowest */
418
419    movq 112(%rax), %rax
420    push %rax /* oend3 */
421
422    movq %op3, %rax
423    push %rax /* oend2 */
424
425    movq %op2, %rax
426    push %rax /* oend1 */
427
428    movq %op1, %rax
429    push %rax /* oend0 */
430
431    /* Scratch space */
432    subq $8, %rsp
433
434.L_4X2_compute_olimit:
435    /* Computes how many iterations we can do safely
436     * %r15, %rax may be clobbered
437     * rdx must be saved
438     * op[1,2,3,4] & ip0 mustn't be clobbered
439     */
440    movq %rdx, 0(%rsp)
441
442    /* We can consume up to 7 input bytes each iteration. */
443    movq %ip0,     %rax  /* rax = ip0 */
444    movq 40(%rsp), %rdx  /* rdx = ilowest */
445    subq %rdx,     %rax  /* rax = ip0 - ilowest */
446    movq %rax,    %r15   /* r15 = ip0 - ilowest */
447
448    /* rdx = rax / 7 */
449    movabsq $2635249153387078803, %rdx
450    mulq %rdx
451    subq %rdx, %r15
452    shrq %r15
453    addq %r15, %rdx
454    shrq $2, %rdx
455
456    /* r15 = (ip0 - ilowest) / 7 */
457    movq %rdx, %r15
458
459    /* r15 = min(r15, min(oend0 - op0, oend1 - op1, oend2 - op2, oend3 - op3) / 10) */
460    movq 8(%rsp),  %rax /* rax = oend0 */
461    subq %op0,     %rax /* rax = oend0 - op0 */
462    movq 16(%rsp), %rdx /* rdx = oend1 */
463    subq %op1,     %rdx /* rdx = oend1 - op1 */
464
465    cmpq  %rax,    %rdx
466    cmova %rax,    %rdx /* rdx = min(%rdx, %rax) */
467
468    movq 24(%rsp), %rax /* rax = oend2 */
469    subq %op2,     %rax /* rax = oend2 - op2 */
470
471    cmpq  %rax,    %rdx
472    cmova %rax,    %rdx /* rdx = min(%rdx, %rax) */
473
474    movq 32(%rsp), %rax /* rax = oend3 */
475    subq %op3,     %rax /* rax = oend3 - op3 */
476
477    cmpq  %rax,    %rdx
478    cmova %rax,    %rdx /* rdx = min(%rdx, %rax) */
479
480    movabsq $-3689348814741910323, %rax
481    mulq %rdx
482    shrq $3,       %rdx /* rdx = rdx / 10 */
483
484    /* r15 = min(%rdx, %r15) */
485    cmpq  %rdx, %r15
486    cmova %rdx, %r15
487
488    /* olimit = op3 + 5 * r15 */
489    movq %r15, %rax
490    leaq (%op3, %rax, 4), %olimit
491    addq %rax, %olimit
492
493    movq 0(%rsp), %rdx
494
495    /* If (op3 + 10 > olimit) */
496    movq %op3, %rax    /* rax = op3 */
497    cmpq %rax, %olimit /* op3 == olimit */
498    je .L_4X2_exit
499
500    /* If (ip1 < ip0) go to exit */
501    cmpq %ip0, %ip1
502    jb .L_4X2_exit
503
504    /* If (ip2 < ip1) go to exit */
505    cmpq %ip1, %ip2
506    jb .L_4X2_exit
507
508    /* If (ip3 < ip2) go to exit */
509    cmpq %ip2, %ip3
510    jb .L_4X2_exit
511
512#define DECODE(n, idx)              \
513    movq %bits##n, %rax;            \
514    shrq $53, %rax;                 \
515    movzwl 0(%dtable,%rax,4),%r8d;  \
516    movzbl 2(%dtable,%rax,4),%r15d; \
517    movzbl 3(%dtable,%rax,4),%eax;  \
518    movw %r8w, (%op##n);            \
519    shlxq %r15, %bits##n, %bits##n; \
520    addq %rax, %op##n
521
522#define RELOAD_BITS(n)              \
523    bsfq %bits##n, %bits##n;        \
524    movq %bits##n, %rax;            \
525    shrq $3, %bits##n;              \
526    andq $7, %rax;                  \
527    subq %bits##n, %ip##n;          \
528    movq (%ip##n), %bits##n;        \
529    orq $1, %bits##n;               \
530    shlxq %rax, %bits##n, %bits##n
531
532
533    movq %olimit, 48(%rsp)
534
535    .p2align 6
536
537.L_4X2_loop_body:
538    /* We clobber r8, so store it on the stack */
539    movq %r8, 0(%rsp)
540
541    /* Decode 5 symbols from each of the 4 streams (20 symbols total). */
542    FOR_EACH_STREAM_WITH_INDEX(DECODE, 0)
543    FOR_EACH_STREAM_WITH_INDEX(DECODE, 1)
544    FOR_EACH_STREAM_WITH_INDEX(DECODE, 2)
545    FOR_EACH_STREAM_WITH_INDEX(DECODE, 3)
546    FOR_EACH_STREAM_WITH_INDEX(DECODE, 4)
547
548    /* Reload r8 */
549    movq 0(%rsp), %r8
550
551    FOR_EACH_STREAM(RELOAD_BITS)
552
553    cmp %op3, 48(%rsp)
554    ja .L_4X2_loop_body
555    jmp .L_4X2_compute_olimit
556
557#undef DECODE
558#undef RELOAD_BITS
559.L_4X2_exit:
560    addq $8, %rsp
561    /* Restore stack (oend & olimit) */
562    pop %rax /* oend0 */
563    pop %rax /* oend1 */
564    pop %rax /* oend2 */
565    pop %rax /* oend3 */
566    pop %rax /* ilowest */
567    pop %rax /* olimit */
568    pop %rax /* arg */
569
570    /* Save ip / op / bits */
571    movq %ip0,  0(%rax)
572    movq %ip1,  8(%rax)
573    movq %ip2, 16(%rax)
574    movq %ip3, 24(%rax)
575    movq %op0, 32(%rax)
576    movq %op1, 40(%rax)
577    movq %op2, 48(%rax)
578    movq %op3, 56(%rax)
579    movq %bits0, 64(%rax)
580    movq %bits1, 72(%rax)
581    movq %bits2, 80(%rax)
582    movq %bits3, 88(%rax)
583
584    /* Restore registers */
585    pop %r15
586    pop %r14
587    pop %r13
588    pop %r12
589    pop %r11
590    pop %r10
591    pop %r9
592    pop %r8
593    pop %rdi
594    pop %rsi
595    pop %rbp
596    pop %rdx
597    pop %rcx
598    pop %rbx
599    pop %rax
600    ret
601
602#endif
603