• Home
  • Line#
  • Scopes#
  • Navigate#
  • Raw
  • Download
1 /* crc32.c -- compute the CRC-32 of a data stream
2  * Copyright (C) 1995-2022 Mark Adler
3  * For conditions of distribution and use, see copyright notice in zlib.h
4  *
5  * This interleaved implementation of a CRC makes use of pipelined multiple
6  * arithmetic-logic units, commonly found in modern CPU cores. It is due to
7  * Kadatch and Jenkins (2010). See doc/crc-doc.1.0.pdf in this distribution.
8  */
9 
10 /* @(#) $Id$ */
11 
12 /*
13   Note on the use of DYNAMIC_CRC_TABLE: there is no mutex or semaphore
14   protection on the static variables used to control the first-use generation
15   of the crc tables. Therefore, if you #define DYNAMIC_CRC_TABLE, you should
16   first call get_crc_table() to initialize the tables before allowing more than
17   one thread to use crc32().
18 
19   MAKECRCH can be #defined to write out crc32.h. A main() routine is also
20   produced, so that this one source file can be compiled to an executable.
21  */
22 
23 #ifdef MAKECRCH
24 #  include <stdio.h>
25 #  ifndef DYNAMIC_CRC_TABLE
26 #    define DYNAMIC_CRC_TABLE
27 #  endif /* !DYNAMIC_CRC_TABLE */
28 #endif /* MAKECRCH */
29 
30 #include "deflate.h"
31 #include "cpu_features.h"
32 #include "zutil.h"      /* for Z_U4, Z_U8, z_crc_t, and FAR definitions */
33 
34 #if defined(CRC32_SIMD_SSE42_PCLMUL) || defined(CRC32_ARMV8_CRC32)
35 #include "crc32_simd.h"
36 #endif
37 
38  /*
39   A CRC of a message is computed on N braids of words in the message, where
40   each word consists of W bytes (4 or 8). If N is 3, for example, then three
41   running sparse CRCs are calculated respectively on each braid, at these
42   indices in the array of words: 0, 3, 6, ..., 1, 4, 7, ..., and 2, 5, 8, ...
43   This is done starting at a word boundary, and continues until as many blocks
44   of N * W bytes as are available have been processed. The results are combined
45   into a single CRC at the end. For this code, N must be in the range 1..6 and
46   W must be 4 or 8. The upper limit on N can be increased if desired by adding
47   more #if blocks, extending the patterns apparent in the code. In addition,
48   crc32.h would need to be regenerated, if the maximum N value is increased.
49 
50   N and W are chosen empirically by benchmarking the execution time on a given
51   processor. The choices for N and W below were based on testing on Intel Kaby
52   Lake i7, AMD Ryzen 7, ARM Cortex-A57, Sparc64-VII, PowerPC POWER9, and MIPS64
53   Octeon II processors. The Intel, AMD, and ARM processors were all fastest
54   with N=5, W=8. The Sparc, PowerPC, and MIPS64 were all fastest at N=5, W=4.
55   They were all tested with either gcc or clang, all using the -O3 optimization
56   level. Your mileage may vary.
57  */
58 
59 /* Define N */
60 #ifdef Z_TESTN
61 #  define N Z_TESTN
62 #else
63 #  define N 5
64 #endif
65 #if N < 1 || N > 6
66 #  error N must be in 1..6
67 #endif
68 
69 /*
70   z_crc_t must be at least 32 bits. z_word_t must be at least as long as
71   z_crc_t. It is assumed here that z_word_t is either 32 bits or 64 bits, and
72   that bytes are eight bits.
73  */
74 
75 /*
76   Define W and the associated z_word_t type. If W is not defined, then a
77   braided calculation is not used, and the associated tables and code are not
78   compiled.
79  */
80 #ifdef Z_TESTW
81 #  if Z_TESTW-1 != -1
82 #    define W Z_TESTW
83 #  endif
84 #else
85 #  ifdef MAKECRCH
86 #    define W 8         /* required for MAKECRCH */
87 #  else
88 #    if defined(__x86_64__) || defined(__aarch64__)
89 #      define W 8
90 #    else
91 #      define W 4
92 #    endif
93 #  endif
94 #endif
95 #ifdef W
96 #  if W == 8 && defined(Z_U8)
97      typedef Z_U8 z_word_t;
98 #  elif defined(Z_U4)
99 #    undef W
100 #    define W 4
101      typedef Z_U4 z_word_t;
102 #  else
103 #    undef W
104 #  endif
105 #endif
106 
107 /* If available, use the ARM processor CRC32 instruction. */
108 #if defined(__aarch64__) && defined(__ARM_FEATURE_CRC32) && W == 8 \
109     && defined(USE_CANONICAL_ARMV8_CRC32)
110 #  define ARMCRC32_CANONICAL_ZLIB
111 #endif
112 
113 /* Local functions. */
114 local z_crc_t multmodp OF((z_crc_t a, z_crc_t b));
115 local z_crc_t x2nmodp OF((z_off64_t n, unsigned k));
116 
117 #if defined(W) && (!defined(ARMCRC32_CANONICAL_ZLIB) || defined(DYNAMIC_CRC_TABLE))
118     local z_word_t byte_swap OF((z_word_t word));
119 #endif
120 
121 #if defined(W) && !defined(ARMCRC32_CANONICAL_ZLIB)
122     local z_crc_t crc_word OF((z_word_t data));
123     local z_word_t crc_word_big OF((z_word_t data));
124 #endif
125 
126 #if defined(W) && (!defined(ARMCRC32_CANONICAL_ZLIB) || defined(DYNAMIC_CRC_TABLE))
127 /*
128   Swap the bytes in a z_word_t to convert between little and big endian. Any
129   self-respecting compiler will optimize this to a single machine byte-swap
130   instruction, if one is available. This assumes that word_t is either 32 bits
131   or 64 bits.
132  */
byte_swap(word)133 local z_word_t byte_swap(word)
134     z_word_t word;
135 {
136 #  if W == 8
137     return
138         (word & 0xff00000000000000) >> 56 |
139         (word & 0xff000000000000) >> 40 |
140         (word & 0xff0000000000) >> 24 |
141         (word & 0xff00000000) >> 8 |
142         (word & 0xff000000) << 8 |
143         (word & 0xff0000) << 24 |
144         (word & 0xff00) << 40 |
145         (word & 0xff) << 56;
146 #  else   /* W == 4 */
147     return
148         (word & 0xff000000) >> 24 |
149         (word & 0xff0000) >> 8 |
150         (word & 0xff00) << 8 |
151         (word & 0xff) << 24;
152 #  endif
153 }
154 #endif
155 
156 /* CRC polynomial. */
157 #define POLY 0xedb88320         /* p(x) reflected, with x^32 implied */
158 
159 #ifdef DYNAMIC_CRC_TABLE
160 
161 local z_crc_t FAR crc_table[256];
162 local z_crc_t FAR x2n_table[32];
163 local void make_crc_table OF((void));
164 #ifdef W
165    local z_word_t FAR crc_big_table[256];
166    local z_crc_t FAR crc_braid_table[W][256];
167    local z_word_t FAR crc_braid_big_table[W][256];
168    local void braid OF((z_crc_t [][256], z_word_t [][256], int, int));
169 #endif
170 #ifdef MAKECRCH
171    local void write_table OF((FILE *, const z_crc_t FAR *, int));
172    local void write_table32hi OF((FILE *, const z_word_t FAR *, int));
173    local void write_table64 OF((FILE *, const z_word_t FAR *, int));
174 #endif /* MAKECRCH */
175 
176 /*
177   Define a once() function depending on the availability of atomics. If this is
178   compiled with DYNAMIC_CRC_TABLE defined, and if CRCs will be computed in
179   multiple threads, and if atomics are not available, then get_crc_table() must
180   be called to initialize the tables and must return before any threads are
181   allowed to compute or combine CRCs.
182  */
183 
184 /* Definition of once functionality. */
185 typedef struct once_s once_t;
186 local void once OF((once_t *, void (*)(void)));
187 
188 /* Check for the availability of atomics. */
189 #if defined(__STDC__) && __STDC_VERSION__ >= 201112L && \
190     !defined(__STDC_NO_ATOMICS__)
191 
192 #include <stdatomic.h>
193 
194 /* Structure for once(), which must be initialized with ONCE_INIT. */
195 struct once_s {
196     atomic_flag begun;
197     atomic_int done;
198 };
199 #define ONCE_INIT {ATOMIC_FLAG_INIT, 0}
200 
201 /*
202   Run the provided init() function exactly once, even if multiple threads
203   invoke once() at the same time. The state must be a once_t initialized with
204   ONCE_INIT.
205  */
once(state,init)206 local void once(state, init)
207     once_t *state;
208     void (*init)(void);
209 {
210     if (!atomic_load(&state->done)) {
211         if (atomic_flag_test_and_set(&state->begun))
212             while (!atomic_load(&state->done))
213                 ;
214         else {
215             init();
216             atomic_store(&state->done, 1);
217         }
218     }
219 }
220 
221 #else   /* no atomics */
222 
223 /* Structure for once(), which must be initialized with ONCE_INIT. */
224 struct once_s {
225     volatile int begun;
226     volatile int done;
227 };
228 #define ONCE_INIT {0, 0}
229 
230 /* Test and set. Alas, not atomic, but tries to minimize the period of
231    vulnerability. */
232 local int test_and_set OF((int volatile *));
test_and_set(flag)233 local int test_and_set(flag)
234     int volatile *flag;
235 {
236     int was;
237 
238     was = *flag;
239     *flag = 1;
240     return was;
241 }
242 
243 /* Run the provided init() function once. This is not thread-safe. */
once(state,init)244 local void once(state, init)
245     once_t *state;
246     void (*init)(void);
247 {
248     if (!state->done) {
249         if (test_and_set(&state->begun))
250             while (!state->done)
251                 ;
252         else {
253             init();
254             state->done = 1;
255         }
256     }
257 }
258 
259 #endif
260 
261 /* State for once(). */
262 local once_t made = ONCE_INIT;
263 
264 /*
265   Generate tables for a byte-wise 32-bit CRC calculation on the polynomial:
266   x^32+x^26+x^23+x^22+x^16+x^12+x^11+x^10+x^8+x^7+x^5+x^4+x^2+x+1.
267 
268   Polynomials over GF(2) are represented in binary, one bit per coefficient,
269   with the lowest powers in the most significant bit. Then adding polynomials
270   is just exclusive-or, and multiplying a polynomial by x is a right shift by
271   one. If we call the above polynomial p, and represent a byte as the
272   polynomial q, also with the lowest power in the most significant bit (so the
273   byte 0xb1 is the polynomial x^7+x^3+x^2+1), then the CRC is (q*x^32) mod p,
274   where a mod b means the remainder after dividing a by b.
275 
276   This calculation is done using the shift-register method of multiplying and
277   taking the remainder. The register is initialized to zero, and for each
278   incoming bit, x^32 is added mod p to the register if the bit is a one (where
279   x^32 mod p is p+x^32 = x^26+...+1), and the register is multiplied mod p by x
280   (which is shifting right by one and adding x^32 mod p if the bit shifted out
281   is a one). We start with the highest power (least significant bit) of q and
282   repeat for all eight bits of q.
283 
284   The table is simply the CRC of all possible eight bit values. This is all the
285   information needed to generate CRCs on data a byte at a time for all
286   combinations of CRC register values and incoming bytes.
287  */
make_crc_table()288 local void make_crc_table()
289 {
290     unsigned i, j, n;
291     z_crc_t p;
292 
293     /* initialize the CRC of bytes tables */
294     for (i = 0; i < 256; i++) {
295         p = i;
296         for (j = 0; j < 8; j++)
297             p = p & 1 ? (p >> 1) ^ POLY : p >> 1;
298         crc_table[i] = p;
299 #ifdef W
300         crc_big_table[i] = byte_swap(p);
301 #endif
302     }
303 
304     /* initialize the x^2^n mod p(x) table */
305     p = (z_crc_t)1 << 30;         /* x^1 */
306     x2n_table[0] = p;
307     for (n = 1; n < 32; n++)
308         x2n_table[n] = p = multmodp(p, p);
309 
310 #ifdef W
311     /* initialize the braiding tables -- needs x2n_table[] */
312     braid(crc_braid_table, crc_braid_big_table, N, W);
313 #endif
314 
315 #ifdef MAKECRCH
316     {
317         /*
318           The crc32.h header file contains tables for both 32-bit and 64-bit
319           z_word_t's, and so requires a 64-bit type be available. In that case,
320           z_word_t must be defined to be 64-bits. This code then also generates
321           and writes out the tables for the case that z_word_t is 32 bits.
322          */
323 #if !defined(W) || W != 8
324 #  error Need a 64-bit integer type in order to generate crc32.h.
325 #endif
326         FILE *out;
327         int k, n;
328         z_crc_t ltl[8][256];
329         z_word_t big[8][256];
330 
331         out = fopen("crc32.h", "w");
332         if (out == NULL) return;
333 
334         /* write out little-endian CRC table to crc32.h */
335         fprintf(out,
336             "/* crc32.h -- tables for rapid CRC calculation\n"
337             " * Generated automatically by crc32.c\n */\n"
338             "\n"
339             "local const z_crc_t FAR crc_table[] = {\n"
340             "    ");
341         write_table(out, crc_table, 256);
342         fprintf(out,
343             "};\n");
344 
345         /* write out big-endian CRC table for 64-bit z_word_t to crc32.h */
346         fprintf(out,
347             "\n"
348             "#ifdef W\n"
349             "\n"
350             "#if W == 8\n"
351             "\n"
352             "local const z_word_t FAR crc_big_table[] = {\n"
353             "    ");
354         write_table64(out, crc_big_table, 256);
355         fprintf(out,
356             "};\n");
357 
358         /* write out big-endian CRC table for 32-bit z_word_t to crc32.h */
359         fprintf(out,
360             "\n"
361             "#else /* W == 4 */\n"
362             "\n"
363             "local const z_word_t FAR crc_big_table[] = {\n"
364             "    ");
365         write_table32hi(out, crc_big_table, 256);
366         fprintf(out,
367             "};\n"
368             "\n"
369             "#endif\n");
370 
371         /* write out braid tables for each value of N */
372         for (n = 1; n <= 6; n++) {
373             fprintf(out,
374             "\n"
375             "#if N == %d\n", n);
376 
377             /* compute braid tables for this N and 64-bit word_t */
378             braid(ltl, big, n, 8);
379 
380             /* write out braid tables for 64-bit z_word_t to crc32.h */
381             fprintf(out,
382             "\n"
383             "#if W == 8\n"
384             "\n"
385             "local const z_crc_t FAR crc_braid_table[][256] = {\n");
386             for (k = 0; k < 8; k++) {
387                 fprintf(out, "   {");
388                 write_table(out, ltl[k], 256);
389                 fprintf(out, "}%s", k < 7 ? ",\n" : "");
390             }
391             fprintf(out,
392             "};\n"
393             "\n"
394             "local const z_word_t FAR crc_braid_big_table[][256] = {\n");
395             for (k = 0; k < 8; k++) {
396                 fprintf(out, "   {");
397                 write_table64(out, big[k], 256);
398                 fprintf(out, "}%s", k < 7 ? ",\n" : "");
399             }
400             fprintf(out,
401             "};\n");
402 
403             /* compute braid tables for this N and 32-bit word_t */
404             braid(ltl, big, n, 4);
405 
406             /* write out braid tables for 32-bit z_word_t to crc32.h */
407             fprintf(out,
408             "\n"
409             "#else /* W == 4 */\n"
410             "\n"
411             "local const z_crc_t FAR crc_braid_table[][256] = {\n");
412             for (k = 0; k < 4; k++) {
413                 fprintf(out, "   {");
414                 write_table(out, ltl[k], 256);
415                 fprintf(out, "}%s", k < 3 ? ",\n" : "");
416             }
417             fprintf(out,
418             "};\n"
419             "\n"
420             "local const z_word_t FAR crc_braid_big_table[][256] = {\n");
421             for (k = 0; k < 4; k++) {
422                 fprintf(out, "   {");
423                 write_table32hi(out, big[k], 256);
424                 fprintf(out, "}%s", k < 3 ? ",\n" : "");
425             }
426             fprintf(out,
427             "};\n"
428             "\n"
429             "#endif\n"
430             "\n"
431             "#endif\n");
432         }
433         fprintf(out,
434             "\n"
435             "#endif\n");
436 
437         /* write out zeros operator table to crc32.h */
438         fprintf(out,
439             "\n"
440             "local const z_crc_t FAR x2n_table[] = {\n"
441             "    ");
442         write_table(out, x2n_table, 32);
443         fprintf(out,
444             "};\n");
445         fclose(out);
446     }
447 #endif /* MAKECRCH */
448 }
449 
450 #ifdef MAKECRCH
451 
452 /*
453    Write the 32-bit values in table[0..k-1] to out, five per line in
454    hexadecimal separated by commas.
455  */
write_table(out,table,k)456 local void write_table(out, table, k)
457     FILE *out;
458     const z_crc_t FAR *table;
459     int k;
460 {
461     int n;
462 
463     for (n = 0; n < k; n++)
464         fprintf(out, "%s0x%08lx%s", n == 0 || n % 5 ? "" : "    ",
465                 (unsigned long)(table[n]),
466                 n == k - 1 ? "" : (n % 5 == 4 ? ",\n" : ", "));
467 }
468 
469 /*
470    Write the high 32-bits of each value in table[0..k-1] to out, five per line
471    in hexadecimal separated by commas.
472  */
write_table32hi(out,table,k)473 local void write_table32hi(out, table, k)
474 FILE *out;
475 const z_word_t FAR *table;
476 int k;
477 {
478     int n;
479 
480     for (n = 0; n < k; n++)
481         fprintf(out, "%s0x%08lx%s", n == 0 || n % 5 ? "" : "    ",
482                 (unsigned long)(table[n] >> 32),
483                 n == k - 1 ? "" : (n % 5 == 4 ? ",\n" : ", "));
484 }
485 
486 /*
487   Write the 64-bit values in table[0..k-1] to out, three per line in
488   hexadecimal separated by commas. This assumes that if there is a 64-bit
489   type, then there is also a long long integer type, and it is at least 64
490   bits. If not, then the type cast and format string can be adjusted
491   accordingly.
492  */
write_table64(out,table,k)493 local void write_table64(out, table, k)
494     FILE *out;
495     const z_word_t FAR *table;
496     int k;
497 {
498     int n;
499 
500     for (n = 0; n < k; n++)
501         fprintf(out, "%s0x%016llx%s", n == 0 || n % 3 ? "" : "    ",
502                 (unsigned long long)(table[n]),
503                 n == k - 1 ? "" : (n % 3 == 2 ? ",\n" : ", "));
504 }
505 
506 /* Actually do the deed. */
main()507 int main()
508 {
509     make_crc_table();
510     return 0;
511 }
512 
513 #endif /* MAKECRCH */
514 
515 #ifdef W
516 /*
517   Generate the little and big-endian braid tables for the given n and z_word_t
518   size w. Each array must have room for w blocks of 256 elements.
519  */
braid(ltl,big,n,w)520 local void braid(ltl, big, n, w)
521     z_crc_t ltl[][256];
522     z_word_t big[][256];
523     int n;
524     int w;
525 {
526     int k;
527     z_crc_t i, p, q;
528     for (k = 0; k < w; k++) {
529         p = x2nmodp((n * w + 3 - k) << 3, 0);
530         ltl[k][0] = 0;
531         big[w - 1 - k][0] = 0;
532         for (i = 1; i < 256; i++) {
533             ltl[k][i] = q = multmodp(i << 24, p);
534             big[w - 1 - k][i] = byte_swap(q);
535         }
536     }
537 }
538 #endif
539 
540 #else /* !DYNAMIC_CRC_TABLE */
541 /* ========================================================================
542  * Tables for byte-wise and braided CRC-32 calculations, and a table of powers
543  * of x for combining CRC-32s, all made by make_crc_table().
544  */
545 #include "crc32.h"
546 #endif /* DYNAMIC_CRC_TABLE */
547 
548 /* ========================================================================
549  * Routines used for CRC calculation. Some are also required for the table
550  * generation above.
551  */
552 
553 /*
554   Return a(x) multiplied by b(x) modulo p(x), where p(x) is the CRC polynomial,
555   reflected. For speed, this requires that a not be zero.
556  */
multmodp(a,b)557 local z_crc_t multmodp(a, b)
558     z_crc_t a;
559     z_crc_t b;
560 {
561     z_crc_t m, p;
562 
563     m = (z_crc_t)1 << 31;
564     p = 0;
565     for (;;) {
566         if (a & m) {
567             p ^= b;
568             if ((a & (m - 1)) == 0)
569                 break;
570         }
571         m >>= 1;
572         b = b & 1 ? (b >> 1) ^ POLY : b >> 1;
573     }
574     return p;
575 }
576 
577 /*
578   Return x^(n * 2^k) modulo p(x). Requires that x2n_table[] has been
579   initialized.
580  */
x2nmodp(n,k)581 local z_crc_t x2nmodp(n, k)
582     z_off64_t n;
583     unsigned k;
584 {
585     z_crc_t p;
586 
587     p = (z_crc_t)1 << 31;           /* x^0 == 1 */
588     while (n) {
589         if (n & 1)
590             p = multmodp(x2n_table[k & 31], p);
591         n >>= 1;
592         k++;
593     }
594     return p;
595 }
596 
597 /* =========================================================================
598  * This function can be used by asm versions of crc32(), and to force the
599  * generation of the CRC tables in a threaded application.
600  */
get_crc_table()601 const z_crc_t FAR * ZEXPORT get_crc_table()
602 {
603 #ifdef DYNAMIC_CRC_TABLE
604     once(&made, make_crc_table);
605 #endif /* DYNAMIC_CRC_TABLE */
606     return (const z_crc_t FAR *)crc_table;
607 }
608 
609 /* =========================================================================
610  * Use ARM machine instructions if available. This will compute the CRC about
611  * ten times faster than the braided calculation. This code does not check for
612  * the presence of the CRC instruction at run time. __ARM_FEATURE_CRC32 will
613  * only be defined if the compilation specifies an ARM processor architecture
614  * that has the instructions. For example, compiling with -march=armv8.1-a or
615  * -march=armv8-a+crc, or -march=native if the compile machine has the crc32
616  * instructions.
617  */
618 #if ARMCRC32_CANONICAL_ZLIB
619 
620 /*
621    Constants empirically determined to maximize speed. These values are from
622    measurements on a Cortex-A57. Your mileage may vary.
623  */
624 #define Z_BATCH 3990                /* number of words in a batch */
625 #define Z_BATCH_ZEROS 0xa10d3d0c    /* computed from Z_BATCH = 3990 */
626 #define Z_BATCH_MIN 800             /* fewest words in a final batch */
627 
crc32_z(crc,buf,len)628 unsigned long ZEXPORT crc32_z(crc, buf, len)
629     unsigned long crc;
630     const unsigned char FAR *buf;
631     z_size_t len;
632 {
633     z_crc_t val;
634     z_word_t crc1, crc2;
635     const z_word_t *word;
636     z_word_t val0, val1, val2;
637     z_size_t last, last2, i;
638     z_size_t num;
639 
640     /* Return initial CRC, if requested. */
641     if (buf == Z_NULL) return 0;
642 
643 #ifdef DYNAMIC_CRC_TABLE
644     once(&made, make_crc_table);
645 #endif /* DYNAMIC_CRC_TABLE */
646 
647     /* Pre-condition the CRC */
648     crc = (~crc) & 0xffffffff;
649 
650     /* Compute the CRC up to a word boundary. */
651     while (len && ((z_size_t)buf & 7) != 0) {
652         len--;
653         val = *buf++;
654         __asm__ volatile("crc32b %w0, %w0, %w1" : "+r"(crc) : "r"(val));
655     }
656 
657     /* Prepare to compute the CRC on full 64-bit words word[0..num-1]. */
658     word = (z_word_t const *)buf;
659     num = len >> 3;
660     len &= 7;
661 
662     /* Do three interleaved CRCs to realize the throughput of one crc32x
663        instruction per cycle. Each CRC is calculated on Z_BATCH words. The
664        three CRCs are combined into a single CRC after each set of batches. */
665     while (num >= 3 * Z_BATCH) {
666         crc1 = 0;
667         crc2 = 0;
668         for (i = 0; i < Z_BATCH; i++) {
669             val0 = word[i];
670             val1 = word[i + Z_BATCH];
671             val2 = word[i + 2 * Z_BATCH];
672             __asm__ volatile("crc32x %w0, %w0, %x1" : "+r"(crc) : "r"(val0));
673             __asm__ volatile("crc32x %w0, %w0, %x1" : "+r"(crc1) : "r"(val1));
674             __asm__ volatile("crc32x %w0, %w0, %x1" : "+r"(crc2) : "r"(val2));
675         }
676         word += 3 * Z_BATCH;
677         num -= 3 * Z_BATCH;
678         crc = multmodp(Z_BATCH_ZEROS, crc) ^ crc1;
679         crc = multmodp(Z_BATCH_ZEROS, crc) ^ crc2;
680     }
681 
682     /* Do one last smaller batch with the remaining words, if there are enough
683        to pay for the combination of CRCs. */
684     last = num / 3;
685     if (last >= Z_BATCH_MIN) {
686         last2 = last << 1;
687         crc1 = 0;
688         crc2 = 0;
689         for (i = 0; i < last; i++) {
690             val0 = word[i];
691             val1 = word[i + last];
692             val2 = word[i + last2];
693             __asm__ volatile("crc32x %w0, %w0, %x1" : "+r"(crc) : "r"(val0));
694             __asm__ volatile("crc32x %w0, %w0, %x1" : "+r"(crc1) : "r"(val1));
695             __asm__ volatile("crc32x %w0, %w0, %x1" : "+r"(crc2) : "r"(val2));
696         }
697         word += 3 * last;
698         num -= 3 * last;
699         val = x2nmodp(last, 6);
700         crc = multmodp(val, crc) ^ crc1;
701         crc = multmodp(val, crc) ^ crc2;
702     }
703 
704     /* Compute the CRC on any remaining words. */
705     for (i = 0; i < num; i++) {
706         val0 = word[i];
707         __asm__ volatile("crc32x %w0, %w0, %x1" : "+r"(crc) : "r"(val0));
708     }
709     word += num;
710 
711     /* Complete the CRC on any remaining bytes. */
712     buf = (const unsigned char FAR *)word;
713     while (len) {
714         len--;
715         val = *buf++;
716         __asm__ volatile("crc32b %w0, %w0, %w1" : "+r"(crc) : "r"(val));
717     }
718 
719     /* Return the CRC, post-conditioned. */
720     return crc ^ 0xffffffff;
721 }
722 
723 #else
724 
725 #ifdef W
726 
727 /*
728   Return the CRC of the W bytes in the word_t data, taking the
729   least-significant byte of the word as the first byte of data, without any pre
730   or post conditioning. This is used to combine the CRCs of each braid.
731  */
crc_word(data)732 local z_crc_t crc_word(data)
733     z_word_t data;
734 {
735     int k;
736     for (k = 0; k < W; k++)
737         data = (data >> 8) ^ crc_table[data & 0xff];
738     return (z_crc_t)data;
739 }
740 
crc_word_big(data)741 local z_word_t crc_word_big(data)
742     z_word_t data;
743 {
744     int k;
745     for (k = 0; k < W; k++)
746         data = (data << 8) ^
747             crc_big_table[(data >> ((W - 1) << 3)) & 0xff];
748     return data;
749 }
750 
751 #endif
752 
753 /* ========================================================================= */
crc32_z(crc,buf,len)754 unsigned long ZEXPORT crc32_z(crc, buf, len)
755     unsigned long crc;
756     const unsigned char FAR *buf;
757     z_size_t len;
758 {
759     /*
760      * zlib convention is to call crc32(0, NULL, 0); before making
761      * calls to crc32(). So this is a good, early (and infrequent)
762      * place to cache CPU features if needed for those later, more
763      * interesting crc32() calls.
764      */
765 #if defined(CRC32_SIMD_SSE42_PCLMUL) || defined(CRC32_ARMV8_CRC32)
766     /*
767      * Since this routine can be freely used, check CPU features here.
768      */
769     if (buf == Z_NULL) {
770         if (!len) /* Assume user is calling crc32(0, NULL, 0); */
771             cpu_check_features();
772         return 0UL;
773     }
774 
775 #endif
776 #if defined(CRC32_SIMD_SSE42_PCLMUL)
777     if (x86_cpu_enable_simd && len >= Z_CRC32_SSE42_MINIMUM_LENGTH) {
778         /* crc32 16-byte chunks */
779         z_size_t chunk_size = len & ~Z_CRC32_SSE42_CHUNKSIZE_MASK;
780         crc = ~crc32_sse42_simd_(buf, chunk_size, ~(uint32_t)crc);
781         /* check remaining data */
782         len -= chunk_size;
783         if (!len)
784             return crc;
785         /* Fall into the default crc32 for the remaining data. */
786         buf += chunk_size;
787     }
788 #elif defined(CRC32_ARMV8_CRC32)
789     if (arm_cpu_enable_crc32) {
790 #if defined(__aarch64__)
791         /* PMULL is 64bit only, plus code needs at least a 64 bytes buffer. */
792         if (arm_cpu_enable_pmull && (len > Z_CRC32_PMULL_MINIMUM_LENGTH)) {
793             const size_t chunk_size = len & ~Z_CRC32_PMULL_CHUNKSIZE_MASK;
794             crc = ~armv8_crc32_pmull_little(buf, chunk_size, ~(uint32_t)crc);
795             /* Check remaining data. */
796             len -= chunk_size;
797             if (!len)
798                 return crc;
799 
800             /* Fall through for the remaining data. */
801             buf += chunk_size;
802         }
803 #endif
804         return armv8_crc32_little(buf, len, crc); /* Armv8@32bit or tail. */
805     }
806 #else
807     if (buf == Z_NULL) {
808         return 0UL;
809     }
810 #endif /* CRC32_SIMD */
811 
812 #ifdef DYNAMIC_CRC_TABLE
813     once(&made, make_crc_table);
814 #endif /* DYNAMIC_CRC_TABLE */
815     /* Pre-condition the CRC */
816     crc = (~crc) & 0xffffffff;
817 
818 #ifdef W
819 
820     /* If provided enough bytes, do a braided CRC calculation. */
821     if (len >= N * W + W - 1) {
822         z_size_t blks;
823         z_word_t const *words;
824         unsigned endian;
825         int k;
826 
827         /* Compute the CRC up to a z_word_t boundary. */
828         while (len && ((z_size_t)buf & (W - 1)) != 0) {
829             len--;
830             crc = (crc >> 8) ^ crc_table[(crc ^ *buf++) & 0xff];
831         }
832 
833         /* Compute the CRC on as many N z_word_t blocks as are available. */
834         blks = len / (N * W);
835         len -= blks * N * W;
836         words = (z_word_t const *)buf;
837 
838         /* Do endian check at execution time instead of compile time, since ARM
839            processors can change the endianess at execution time. If the
840            compiler knows what the endianess will be, it can optimize out the
841            check and the unused branch. */
842         endian = 1;
843         if (*(unsigned char *)&endian) {
844             /* Little endian. */
845 
846             z_crc_t crc0;
847             z_word_t word0;
848 #if N > 1
849             z_crc_t crc1;
850             z_word_t word1;
851 #if N > 2
852             z_crc_t crc2;
853             z_word_t word2;
854 #if N > 3
855             z_crc_t crc3;
856             z_word_t word3;
857 #if N > 4
858             z_crc_t crc4;
859             z_word_t word4;
860 #if N > 5
861             z_crc_t crc5;
862             z_word_t word5;
863 #endif
864 #endif
865 #endif
866 #endif
867 #endif
868 
869             /* Initialize the CRC for each braid. */
870             crc0 = crc;
871 #if N > 1
872             crc1 = 0;
873 #if N > 2
874             crc2 = 0;
875 #if N > 3
876             crc3 = 0;
877 #if N > 4
878             crc4 = 0;
879 #if N > 5
880             crc5 = 0;
881 #endif
882 #endif
883 #endif
884 #endif
885 #endif
886 
887             /*
888               Process the first blks-1 blocks, computing the CRCs on each braid
889               independently.
890              */
891             while (--blks) {
892                 /* Load the word for each braid into registers. */
893                 word0 = crc0 ^ words[0];
894 #if N > 1
895                 word1 = crc1 ^ words[1];
896 #if N > 2
897                 word2 = crc2 ^ words[2];
898 #if N > 3
899                 word3 = crc3 ^ words[3];
900 #if N > 4
901                 word4 = crc4 ^ words[4];
902 #if N > 5
903                 word5 = crc5 ^ words[5];
904 #endif
905 #endif
906 #endif
907 #endif
908 #endif
909                 words += N;
910 
911                 /* Compute and update the CRC for each word. The loop should
912                    get unrolled. */
913                 crc0 = crc_braid_table[0][word0 & 0xff];
914 #if N > 1
915                 crc1 = crc_braid_table[0][word1 & 0xff];
916 #if N > 2
917                 crc2 = crc_braid_table[0][word2 & 0xff];
918 #if N > 3
919                 crc3 = crc_braid_table[0][word3 & 0xff];
920 #if N > 4
921                 crc4 = crc_braid_table[0][word4 & 0xff];
922 #if N > 5
923                 crc5 = crc_braid_table[0][word5 & 0xff];
924 #endif
925 #endif
926 #endif
927 #endif
928 #endif
929                 for (k = 1; k < W; k++) {
930                     crc0 ^= crc_braid_table[k][(word0 >> (k << 3)) & 0xff];
931 #if N > 1
932                     crc1 ^= crc_braid_table[k][(word1 >> (k << 3)) & 0xff];
933 #if N > 2
934                     crc2 ^= crc_braid_table[k][(word2 >> (k << 3)) & 0xff];
935 #if N > 3
936                     crc3 ^= crc_braid_table[k][(word3 >> (k << 3)) & 0xff];
937 #if N > 4
938                     crc4 ^= crc_braid_table[k][(word4 >> (k << 3)) & 0xff];
939 #if N > 5
940                     crc5 ^= crc_braid_table[k][(word5 >> (k << 3)) & 0xff];
941 #endif
942 #endif
943 #endif
944 #endif
945 #endif
946                 }
947             }
948 
949             /*
950               Process the last block, combining the CRCs of the N braids at the
951               same time.
952              */
953             crc = crc_word(crc0 ^ words[0]);
954 #if N > 1
955             crc = crc_word(crc1 ^ words[1] ^ crc);
956 #if N > 2
957             crc = crc_word(crc2 ^ words[2] ^ crc);
958 #if N > 3
959             crc = crc_word(crc3 ^ words[3] ^ crc);
960 #if N > 4
961             crc = crc_word(crc4 ^ words[4] ^ crc);
962 #if N > 5
963             crc = crc_word(crc5 ^ words[5] ^ crc);
964 #endif
965 #endif
966 #endif
967 #endif
968 #endif
969             words += N;
970         }
971         else {
972             /* Big endian. */
973 
974             z_word_t crc0, word0, comb;
975 #if N > 1
976             z_word_t crc1, word1;
977 #if N > 2
978             z_word_t crc2, word2;
979 #if N > 3
980             z_word_t crc3, word3;
981 #if N > 4
982             z_word_t crc4, word4;
983 #if N > 5
984             z_word_t crc5, word5;
985 #endif
986 #endif
987 #endif
988 #endif
989 #endif
990 
991             /* Initialize the CRC for each braid. */
992             crc0 = byte_swap(crc);
993 #if N > 1
994             crc1 = 0;
995 #if N > 2
996             crc2 = 0;
997 #if N > 3
998             crc3 = 0;
999 #if N > 4
1000             crc4 = 0;
1001 #if N > 5
1002             crc5 = 0;
1003 #endif
1004 #endif
1005 #endif
1006 #endif
1007 #endif
1008 
1009             /*
1010               Process the first blks-1 blocks, computing the CRCs on each braid
1011               independently.
1012              */
1013             while (--blks) {
1014                 /* Load the word for each braid into registers. */
1015                 word0 = crc0 ^ words[0];
1016 #if N > 1
1017                 word1 = crc1 ^ words[1];
1018 #if N > 2
1019                 word2 = crc2 ^ words[2];
1020 #if N > 3
1021                 word3 = crc3 ^ words[3];
1022 #if N > 4
1023                 word4 = crc4 ^ words[4];
1024 #if N > 5
1025                 word5 = crc5 ^ words[5];
1026 #endif
1027 #endif
1028 #endif
1029 #endif
1030 #endif
1031                 words += N;
1032 
1033                 /* Compute and update the CRC for each word. The loop should
1034                    get unrolled. */
1035                 crc0 = crc_braid_big_table[0][word0 & 0xff];
1036 #if N > 1
1037                 crc1 = crc_braid_big_table[0][word1 & 0xff];
1038 #if N > 2
1039                 crc2 = crc_braid_big_table[0][word2 & 0xff];
1040 #if N > 3
1041                 crc3 = crc_braid_big_table[0][word3 & 0xff];
1042 #if N > 4
1043                 crc4 = crc_braid_big_table[0][word4 & 0xff];
1044 #if N > 5
1045                 crc5 = crc_braid_big_table[0][word5 & 0xff];
1046 #endif
1047 #endif
1048 #endif
1049 #endif
1050 #endif
1051                 for (k = 1; k < W; k++) {
1052                     crc0 ^= crc_braid_big_table[k][(word0 >> (k << 3)) & 0xff];
1053 #if N > 1
1054                     crc1 ^= crc_braid_big_table[k][(word1 >> (k << 3)) & 0xff];
1055 #if N > 2
1056                     crc2 ^= crc_braid_big_table[k][(word2 >> (k << 3)) & 0xff];
1057 #if N > 3
1058                     crc3 ^= crc_braid_big_table[k][(word3 >> (k << 3)) & 0xff];
1059 #if N > 4
1060                     crc4 ^= crc_braid_big_table[k][(word4 >> (k << 3)) & 0xff];
1061 #if N > 5
1062                     crc5 ^= crc_braid_big_table[k][(word5 >> (k << 3)) & 0xff];
1063 #endif
1064 #endif
1065 #endif
1066 #endif
1067 #endif
1068                 }
1069             }
1070 
1071             /*
1072               Process the last block, combining the CRCs of the N braids at the
1073               same time.
1074              */
1075             comb = crc_word_big(crc0 ^ words[0]);
1076 #if N > 1
1077             comb = crc_word_big(crc1 ^ words[1] ^ comb);
1078 #if N > 2
1079             comb = crc_word_big(crc2 ^ words[2] ^ comb);
1080 #if N > 3
1081             comb = crc_word_big(crc3 ^ words[3] ^ comb);
1082 #if N > 4
1083             comb = crc_word_big(crc4 ^ words[4] ^ comb);
1084 #if N > 5
1085             comb = crc_word_big(crc5 ^ words[5] ^ comb);
1086 #endif
1087 #endif
1088 #endif
1089 #endif
1090 #endif
1091             words += N;
1092             crc = byte_swap(comb);
1093         }
1094 
1095         /*
1096           Update the pointer to the remaining bytes to process.
1097          */
1098         buf = (unsigned char const *)words;
1099     }
1100 
1101 #endif /* W */
1102 
1103     /* Complete the computation of the CRC on any remaining bytes. */
1104     while (len >= 8) {
1105         len -= 8;
1106         crc = (crc >> 8) ^ crc_table[(crc ^ *buf++) & 0xff];
1107         crc = (crc >> 8) ^ crc_table[(crc ^ *buf++) & 0xff];
1108         crc = (crc >> 8) ^ crc_table[(crc ^ *buf++) & 0xff];
1109         crc = (crc >> 8) ^ crc_table[(crc ^ *buf++) & 0xff];
1110         crc = (crc >> 8) ^ crc_table[(crc ^ *buf++) & 0xff];
1111         crc = (crc >> 8) ^ crc_table[(crc ^ *buf++) & 0xff];
1112         crc = (crc >> 8) ^ crc_table[(crc ^ *buf++) & 0xff];
1113         crc = (crc >> 8) ^ crc_table[(crc ^ *buf++) & 0xff];
1114     }
1115     while (len) {
1116         len--;
1117         crc = (crc >> 8) ^ crc_table[(crc ^ *buf++) & 0xff];
1118     }
1119 
1120     /* Return the CRC, post-conditioned. */
1121     return crc ^ 0xffffffff;
1122 }
1123 
1124 #endif
1125 
1126 /* ========================================================================= */
crc32(crc,buf,len)1127 unsigned long ZEXPORT crc32(crc, buf, len)
1128     unsigned long crc;
1129     const unsigned char FAR *buf;
1130     uInt len;
1131 {
1132     /* Some bots compile with optimizations disabled, others will emulate
1133      * ARM on x86 and other weird combinations.
1134      */
1135 #if defined(CRC32_SIMD_SSE42_PCLMUL) || defined(CRC32_ARMV8_CRC32)
1136     /* We got to verify CPU features, so exploit the common usage pattern
1137      * of calling this function with Z_NULL for an initial valid crc value.
1138      * This allows to cache the result of the feature check and avoid extraneous
1139      * function calls.
1140      */
1141     if (buf == Z_NULL) {
1142         if (!len) /* Assume user is calling crc32(0, NULL, 0); */
1143             cpu_check_features();
1144         return 0UL;
1145     }
1146 #endif
1147 
1148 #if defined(CRC32_ARMV8_CRC32)
1149     if (arm_cpu_enable_crc32) {
1150 #if defined(__aarch64__)
1151         /* PMULL is 64bit only, plus code needs at least a 64 bytes buffer. */
1152         if (arm_cpu_enable_pmull && (len > Z_CRC32_PMULL_MINIMUM_LENGTH)) {
1153             const size_t chunk_size = len & ~Z_CRC32_PMULL_CHUNKSIZE_MASK;
1154             crc = ~armv8_crc32_pmull_little(buf, chunk_size, ~(uint32_t)crc);
1155             /* Check remaining data. */
1156             len -= chunk_size;
1157             if (!len)
1158                 return crc;
1159 
1160             /* Fall through for the remaining data. */
1161             buf += chunk_size;
1162         }
1163 #endif
1164         return armv8_crc32_little(buf, len, crc); /* Armv8@32bit or tail. */
1165     }
1166 #endif
1167     return crc32_z(crc, buf, len); /* Armv7 or Armv8 w/o crypto extensions. */
1168 }
1169 
1170 /* ========================================================================= */
crc32_combine64(crc1,crc2,len2)1171 uLong ZEXPORT crc32_combine64(crc1, crc2, len2)
1172     uLong crc1;
1173     uLong crc2;
1174     z_off64_t len2;
1175 {
1176 #ifdef DYNAMIC_CRC_TABLE
1177     once(&made, make_crc_table);
1178 #endif /* DYNAMIC_CRC_TABLE */
1179     return multmodp(x2nmodp(len2, 3), crc1) ^ (crc2 & 0xffffffff);
1180 }
1181 
1182 /* ========================================================================= */
crc32_combine(crc1,crc2,len2)1183 uLong ZEXPORT crc32_combine(crc1, crc2, len2)
1184     uLong crc1;
1185     uLong crc2;
1186     z_off_t len2;
1187 {
1188     return crc32_combine64(crc1, crc2, (z_off64_t)len2);
1189 }
1190 /* ========================================================================= */
crc32_combine_gen64(len2)1191 uLong ZEXPORT crc32_combine_gen64(len2)
1192     z_off64_t len2;
1193 {
1194 #ifdef DYNAMIC_CRC_TABLE
1195     once(&made, make_crc_table);
1196 #endif /* DYNAMIC_CRC_TABLE */
1197     return x2nmodp(len2, 3);
1198 }
1199 
1200 /* ========================================================================= */
crc32_combine_gen(len2)1201 uLong ZEXPORT crc32_combine_gen(len2)
1202     z_off_t len2;
1203 {
1204     return crc32_combine_gen64((z_off64_t)len2);
1205 }
1206 
1207 /* ========================================================================= */
crc32_combine_op(crc1,crc2,op)1208 uLong ZEXPORT crc32_combine_op(crc1, crc2, op)
1209     uLong crc1;
1210     uLong crc2;
1211     uLong op;
1212 {
1213     return multmodp(op, crc1) ^ (crc2 & 0xffffffff);
1214 }
1215 
crc_reset(deflate_state * const s)1216 ZLIB_INTERNAL void crc_reset(deflate_state *const s)
1217 {
1218 #ifdef CRC32_SIMD_SSE42_PCLMUL
1219     if (x86_cpu_enable_simd) {
1220         crc_fold_init(s);
1221         return;
1222     }
1223 #endif
1224     s->strm->adler = crc32(0L, Z_NULL, 0);
1225 }
1226 
crc_finalize(deflate_state * const s)1227 ZLIB_INTERNAL void crc_finalize(deflate_state *const s)
1228 {
1229 #ifdef CRC32_SIMD_SSE42_PCLMUL
1230     if (x86_cpu_enable_simd)
1231         s->strm->adler = crc_fold_512to32(s);
1232 #endif
1233 }
1234 
copy_with_crc(z_streamp strm,Bytef * dst,long size)1235 ZLIB_INTERNAL void copy_with_crc(z_streamp strm, Bytef *dst, long size)
1236 {
1237 #ifdef CRC32_SIMD_SSE42_PCLMUL
1238     if (x86_cpu_enable_simd) {
1239         crc_fold_copy(strm->state, dst, strm->next_in, size);
1240         return;
1241     }
1242 #endif
1243     zmemcpy(dst, strm->next_in, size);
1244     strm->adler = crc32(strm->adler, dst, size);
1245 }
1246