/* crc32_simd.c
 *
 * Copyright 2017 The Chromium Authors. All rights reserved.
 * Use of this source code is governed by a BSD-style license that can be
 * found in the Chromium source repository LICENSE file.
 */

#include "crc32_simd.h"

#if defined(CRC32_SIMD_SSE42_PCLMUL)
#ifndef __GNUC__
#define __attribute__(x)
#endif

/*
 * crc32_sse42_simd_(): compute the crc32 of the buffer, where the buffer
 * length must be at least 64, and a multiple of 16. Based on:
 *
 * "Fast CRC Computation for Generic Polynomials Using PCLMULQDQ Instruction"
 *  V. Gopal, E. Ozturk, et al., 2009, http://intel.ly/2ySEwL0
 */

#include <emmintrin.h>
#include <smmintrin.h>
#include <wmmintrin.h>

__attribute__((target("sse4.2,pclmul")))
uint32_t ZLIB_INTERNAL crc32_sse42_simd_(  /* SSE4.2+PCLMUL */
    const unsigned char *buf,
    z_size_t len,
    uint32_t crc)
{
    /*
     * Definitions of the bit-reflected domain constants k1,k2,k3, etc and
     * the CRC32+Barrett polynomials given at the end of the paper.
     */
    static const uint64_t zalign(16) k1k2[] = { 0x0154442bd4, 0x01c6e41596 };
    static const uint64_t zalign(16) k3k4[] = { 0x01751997d0, 0x00ccaa009e };
    static const uint64_t zalign(16) k5k0[] = { 0x0163cd6124, 0x0000000000 };
    static const uint64_t zalign(16) poly[] = { 0x01db710641, 0x01f7011641 };
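    /*
     * (Per the paper: k1/k2 fold a 128-bit lane forward across the 512-bit
     * stride of the main loop, k3/k4 fold across 128 bits, k5 performs the
     * final 96-to-64-bit reduction, and poly[] holds the bit-reflected
     * CRC-32 polynomial P' and the Barrett constant mu.)
     */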

    __m128i x0, x1, x2, x3, x4, x5, x6, x7, x8, y5, y6, y7, y8;

    /*
     * There's at least one block of 64.
     */
    x1 = _mm_loadu_si128((__m128i *)(buf + 0x00));
    x2 = _mm_loadu_si128((__m128i *)(buf + 0x10));
    x3 = _mm_loadu_si128((__m128i *)(buf + 0x20));
    x4 = _mm_loadu_si128((__m128i *)(buf + 0x30));

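    /*
     * Fold the incoming CRC state into the first block: since CRC is linear
     * over GF(2), XOR-ing the running crc into the low 32 bits of the first
     * 16 bytes is equivalent to continuing the CRC from that state.
     */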
    x1 = _mm_xor_si128(x1, _mm_cvtsi32_si128(crc));

    x0 = _mm_load_si128((__m128i *)k1k2);

    buf += 64;
    len -= 64;

    /*
     * Parallel fold blocks of 64, if any.
     */
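    /*
     * Each iteration advances the four accumulators by 512 bits: the low and
     * high 64-bit halves of every x_i are carry-less multiplied by k1 and k2
     * (imm 0x00 and 0x11 select the halves), and both products are XOR-ed
     * together with the next 64 bytes of input.
     */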
    while (len >= 64)
    {
        x5 = _mm_clmulepi64_si128(x1, x0, 0x00);
        x6 = _mm_clmulepi64_si128(x2, x0, 0x00);
        x7 = _mm_clmulepi64_si128(x3, x0, 0x00);
        x8 = _mm_clmulepi64_si128(x4, x0, 0x00);

        x1 = _mm_clmulepi64_si128(x1, x0, 0x11);
        x2 = _mm_clmulepi64_si128(x2, x0, 0x11);
        x3 = _mm_clmulepi64_si128(x3, x0, 0x11);
        x4 = _mm_clmulepi64_si128(x4, x0, 0x11);

        y5 = _mm_loadu_si128((__m128i *)(buf + 0x00));
        y6 = _mm_loadu_si128((__m128i *)(buf + 0x10));
        y7 = _mm_loadu_si128((__m128i *)(buf + 0x20));
        y8 = _mm_loadu_si128((__m128i *)(buf + 0x30));

        x1 = _mm_xor_si128(x1, x5);
        x2 = _mm_xor_si128(x2, x6);
        x3 = _mm_xor_si128(x3, x7);
        x4 = _mm_xor_si128(x4, x8);

        x1 = _mm_xor_si128(x1, y5);
        x2 = _mm_xor_si128(x2, y6);
        x3 = _mm_xor_si128(x3, y7);
        x4 = _mm_xor_si128(x4, y8);

        buf += 64;
        len -= 64;
    }

    /*
     * Fold into 128-bits.
     */
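    /*
     * Combine the four accumulators into one: x2, x3 and x4 are folded into
     * x1 one after another using k3/k4, each fold bridging the 128-bit gap
     * between adjacent lanes.
     */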
    x0 = _mm_load_si128((__m128i *)k3k4);

    x5 = _mm_clmulepi64_si128(x1, x0, 0x00);
    x1 = _mm_clmulepi64_si128(x1, x0, 0x11);
    x1 = _mm_xor_si128(x1, x2);
    x1 = _mm_xor_si128(x1, x5);

    x5 = _mm_clmulepi64_si128(x1, x0, 0x00);
    x1 = _mm_clmulepi64_si128(x1, x0, 0x11);
    x1 = _mm_xor_si128(x1, x3);
    x1 = _mm_xor_si128(x1, x5);

    x5 = _mm_clmulepi64_si128(x1, x0, 0x00);
    x1 = _mm_clmulepi64_si128(x1, x0, 0x11);
    x1 = _mm_xor_si128(x1, x4);
    x1 = _mm_xor_si128(x1, x5);

    /*
     * Single fold blocks of 16, if any.
     */
    while (len >= 16)
    {
        x2 = _mm_loadu_si128((__m128i *)buf);

        x5 = _mm_clmulepi64_si128(x1, x0, 0x00);
        x1 = _mm_clmulepi64_si128(x1, x0, 0x11);
        x1 = _mm_xor_si128(x1, x2);
        x1 = _mm_xor_si128(x1, x5);

        buf += 16;
        len -= 16;
    }

    /*
     * Fold 128-bits to 64-bits.
     */
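    /*
     * x2 = low 64 bits of x1 times k4 (imm 0x10 picks the high half of k3k4),
     * XOR-ed with the high 64 bits of x1; the result is then reduced from
     * 96 to 64 bits with k5 applied to its low 32 bits. x3 is the mask used
     * to isolate 32-bit pieces.
     */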
    x2 = _mm_clmulepi64_si128(x1, x0, 0x10);
    x3 = _mm_setr_epi32(~0, 0, ~0, 0);
    x1 = _mm_srli_si128(x1, 8);
    x1 = _mm_xor_si128(x1, x2);

    x0 = _mm_loadl_epi64((__m128i*)k5k0);

    x2 = _mm_srli_si128(x1, 4);
    x1 = _mm_and_si128(x1, x3);
    x1 = _mm_clmulepi64_si128(x1, x0, 0x00);
    x1 = _mm_xor_si128(x1, x2);

    /*
     * Barrett reduce to 32-bits.
     */
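    /*
     * poly[] holds the bit-reflected polynomial P' and the Barrett constant
     * mu: the low 32 bits of x1 are multiplied by mu to estimate the
     * quotient, the quotient is multiplied by P', and the product is XOR-ed
     * back into x1. The 32-bit remainder, i.e. the CRC, lands in bits 32..63.
     */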
    x0 = _mm_load_si128((__m128i*)poly);

    x2 = _mm_and_si128(x1, x3);
    x2 = _mm_clmulepi64_si128(x2, x0, 0x10);
    x2 = _mm_and_si128(x2, x3);
    x2 = _mm_clmulepi64_si128(x2, x0, 0x00);
    x1 = _mm_xor_si128(x1, x2);

    /*
     * Return the crc32.
     */
    return _mm_extract_epi32(x1, 1);
}
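
/*
 * Usage sketch (hypothetical, for illustration only): the kernel above works
 * on the non-inverted CRC register, so a caller is expected to pre-invert the
 * running CRC, hand the kernel a length that is >= 64 and a multiple of 16,
 * finish the tail with a scalar routine, and invert the result back. zlib's
 * real dispatch lives in crc32.c and additionally performs runtime CPU
 * feature detection. crc32_bitwise() below is a made-up helper, not part of
 * zlib.
 *
 *   static uint32_t crc32_bitwise(uint32_t c, const unsigned char *p,
 *                                 z_size_t n)
 *   {
 *       while (n--) {
 *           c ^= *p++;
 *           for (int k = 0; k < 8; k++)
 *               c = (c >> 1) ^ (0xEDB88320u & (0u - (c & 1u)));
 *       }
 *       return c;
 *   }
 *
 *   uint32_t crc32_example(uint32_t crc, const unsigned char *buf,
 *                          z_size_t len)
 *   {
 *       uint32_t c = ~crc;                         // enter the raw domain
 *       if (len >= 64) {
 *           z_size_t chunk = len & ~(z_size_t)15;  // multiple of 16, >= 64
 *           c = crc32_sse42_simd_(buf, chunk, c);
 *           buf += chunk;
 *           len -= chunk;
 *       }
 *       return ~crc32_bitwise(c, buf, len);        // leave the raw domain
 *   }
 */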

#elif defined(CRC32_ARMV8_CRC32)

/* CRC32 checksums using ARMv8-a crypto instructions.
 *
 * TODO: implement a version using the PMULL instruction.
 */

#if defined(__clang__)
/* CRC32 intrinsics are #ifdef'ed out of arm_acle.h unless we build with an
 * armv8 target, which is incompatible with ThinLTO optimizations on Android.
 * (Namely, mixing and matching different module-level targets makes ThinLTO
 * warn, and Android defaults to armv7-a. This restriction does not apply to
 * function-level `target`s, however.)
 *
 * Since we only need four crc intrinsics, and since clang's implementations
 * of those are just wrappers around compiler builtins, it's simplest to
 * #define those builtins directly. If this #define list grows too much (or we
 * depend on an intrinsic that isn't a trivial wrapper), we may have to find a
 * better way to go about this.
 *
 * NOTE: clang currently complains that "'+soft-float-abi' is not a recognized
 * feature for this target (ignoring feature)." This appears to be a harmless
 * bug in clang.
 */
#define __crc32b __builtin_arm_crc32b
#define __crc32d __builtin_arm_crc32d
#define __crc32w __builtin_arm_crc32w
#define __crc32cw __builtin_arm_crc32cw

#if defined(__aarch64__)
#define TARGET_ARMV8_WITH_CRC __attribute__((target("crc")))
#else  // !defined(__aarch64__)
#define TARGET_ARMV8_WITH_CRC __attribute__((target("armv8-a,crc")))
#endif  // defined(__aarch64__)

#elif defined(__GNUC__)
/* For GCC, we are setting CRC extensions at module level, so ThinLTO is not
 * allowed. We can just include arm_acle.h.
 */
#include <arm_acle.h>
#define TARGET_ARMV8_WITH_CRC
#else  // !defined(__GNUC__) && !defined(__clang__)
#error ARM CRC32 SIMD extensions only supported for Clang and GCC
#endif

TARGET_ARMV8_WITH_CRC
uint32_t ZLIB_INTERNAL armv8_crc32_little(unsigned long crc,
                                          const unsigned char *buf,
                                          z_size_t len)
{
    uint32_t c = (uint32_t) ~crc;

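    /*
     * Consume bytes one at a time until buf is 8-byte aligned, so that the
     * 64-bit loads through buf8 below are aligned.
     */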
    while (len && ((uintptr_t)buf & 7)) {
        c = __crc32b(c, *buf++);
        --len;
    }

    const uint64_t *buf8 = (const uint64_t *)buf;

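    /*
     * Main loop: 64 bytes per iteration, eight __crc32d steps of 8 bytes
     * each, unrolled to amortize the loop overhead.
     */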
    while (len >= 64) {
        c = __crc32d(c, *buf8++);
        c = __crc32d(c, *buf8++);
        c = __crc32d(c, *buf8++);
        c = __crc32d(c, *buf8++);

        c = __crc32d(c, *buf8++);
        c = __crc32d(c, *buf8++);
        c = __crc32d(c, *buf8++);
        c = __crc32d(c, *buf8++);
        len -= 64;
    }

    while (len >= 8) {
        c = __crc32d(c, *buf8++);
        len -= 8;
    }

    buf = (const unsigned char *)buf8;

    while (len--) {
        c = __crc32b(c, *buf++);
    }

    return ~c;
}
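
/*
 * Note: unlike crc32_sse42_simd_() above, this routine performs the ~crc
 * pre- and post-conditioning itself, so callers can pass the running zlib
 * CRC and a buffer of any length (including zero) directly.
 */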

#endif