/* crc32_simd.c
 *
 * Copyright 2017 The Chromium Authors. All rights reserved.
 * Use of this source code is governed by a BSD-style license that can be
 * found in the Chromium source repository LICENSE file.
 */

#include "crc32_simd.h"

#if defined(CRC32_SIMD_SSE42_PCLMUL)
#ifndef __GNUC__
#define __attribute__(x)
#endif
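
/* With non-GNU compilers (e.g. MSVC), the function-like macro above expands
 * __attribute__((target(...))) to nothing so this file still compiles.
 */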

/*
 * crc32_sse42_simd_(): compute the crc32 of the buffer, where the buffer
 * length must be at least 64, and a multiple of 16. Based on:
 *
 * "Fast CRC Computation for Generic Polynomials Using PCLMULQDQ Instruction"
 *  V. Gopal, E. Ozturk, et al., 2009, http://intel.ly/2ySEwL0
 */
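
/* A sketch of the underlying idea, following the paper: a message is a
 * polynomial over GF(2), and for any split M(x) = A(x)*x^N + B(x),
 *
 *     M(x) mod P(x) = (A(x) * (x^N mod P(x)) + B(x)) mod P(x)
 *
 * so a precomputed constant x^N mod P(x) lets a carry-less multiply "fold"
 * the leading bits of the message onto later bits, N bits at a time,
 * deferring the full reduction to the very end.
 */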

#include <emmintrin.h>
#include <smmintrin.h>
#include <wmmintrin.h>

__attribute__((target("sse4.2,pclmul")))
uint32_t ZLIB_INTERNAL crc32_sse42_simd_(  /* SSE4.2+PCLMUL */
    const unsigned char *buf,
    z_size_t len,
    uint32_t crc)
{
    /*
     * Definitions of the bit-reflected domain constants k1,k2,k3, etc and
     * the CRC32+Barrett polynomials given at the end of the paper.
     */
    static const uint64_t zalign(16) k1k2[] = { 0x0154442bd4, 0x01c6e41596 };
    static const uint64_t zalign(16) k3k4[] = { 0x01751997d0, 0x00ccaa009e };
    static const uint64_t zalign(16) k5k0[] = { 0x0163cd6124, 0x0000000000 };
    static const uint64_t zalign(16) poly[] = { 0x01db710641, 0x01f7011641 };
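
    /* For reference, in the bit-reflected form used here (these match the
     * derivations documented for this polynomial in other PCLMUL-based CRC32
     * implementations):
     *
     *   k1 = x^(4*128+32) mod P(x)    k2 = x^(4*128-32) mod P(x)
     *   k3 = x^(128+32)   mod P(x)    k4 = x^(128-32)   mod P(x)
     *   k5 = x^64         mod P(x)
     *
     * poly[] holds P(x) itself alongside mu = floor(x^64 / P(x)), the
     * Barrett constant used for the final reduction.
     */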

    __m128i x0, x1, x2, x3, x4, x5, x6, x7, x8, y5, y6, y7, y8;

    /*
     * There's at least one block of 64.
     */
    x1 = _mm_loadu_si128((__m128i *)(buf + 0x00));
    x2 = _mm_loadu_si128((__m128i *)(buf + 0x10));
    x3 = _mm_loadu_si128((__m128i *)(buf + 0x20));
    x4 = _mm_loadu_si128((__m128i *)(buf + 0x30));

    x1 = _mm_xor_si128(x1, _mm_cvtsi32_si128(crc));

    x0 = _mm_load_si128((__m128i *)k1k2);

    buf += 64;
    len -= 64;

    /*
     * Parallel fold blocks of 64, if any.
     */
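    /* Each pass folds the four accumulators forward by 512 bits. In the
     * PCLMULQDQ immediate, bit 0 selects the low or high qword of the first
     * operand and bit 4 that of the second: 0x00 multiplies the two low
     * qwords (by k1), 0x11 the two high qwords (by k2). XORing both products
     * with the next 64 bytes of input completes the fold.
     */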
    while (len >= 64)
    {
        x5 = _mm_clmulepi64_si128(x1, x0, 0x00);
        x6 = _mm_clmulepi64_si128(x2, x0, 0x00);
        x7 = _mm_clmulepi64_si128(x3, x0, 0x00);
        x8 = _mm_clmulepi64_si128(x4, x0, 0x00);

        x1 = _mm_clmulepi64_si128(x1, x0, 0x11);
        x2 = _mm_clmulepi64_si128(x2, x0, 0x11);
        x3 = _mm_clmulepi64_si128(x3, x0, 0x11);
        x4 = _mm_clmulepi64_si128(x4, x0, 0x11);

        y5 = _mm_loadu_si128((__m128i *)(buf + 0x00));
        y6 = _mm_loadu_si128((__m128i *)(buf + 0x10));
        y7 = _mm_loadu_si128((__m128i *)(buf + 0x20));
        y8 = _mm_loadu_si128((__m128i *)(buf + 0x30));

        x1 = _mm_xor_si128(x1, x5);
        x2 = _mm_xor_si128(x2, x6);
        x3 = _mm_xor_si128(x3, x7);
        x4 = _mm_xor_si128(x4, x8);

        x1 = _mm_xor_si128(x1, y5);
        x2 = _mm_xor_si128(x2, y6);
        x3 = _mm_xor_si128(x3, y7);
        x4 = _mm_xor_si128(x4, y8);

        buf += 64;
        len -= 64;
    }

    /*
     * Fold into 128-bits.
     */
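    /* Chain the four accumulators into one: each step folds the running
     * value forward by 128 bits via k3/k4 and XORs in the next accumulator,
     * leaving a single 128-bit value in x1.
     */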
    x0 = _mm_load_si128((__m128i *)k3k4);

    x5 = _mm_clmulepi64_si128(x1, x0, 0x00);
    x1 = _mm_clmulepi64_si128(x1, x0, 0x11);
    x1 = _mm_xor_si128(x1, x2);
    x1 = _mm_xor_si128(x1, x5);

    x5 = _mm_clmulepi64_si128(x1, x0, 0x00);
    x1 = _mm_clmulepi64_si128(x1, x0, 0x11);
    x1 = _mm_xor_si128(x1, x3);
    x1 = _mm_xor_si128(x1, x5);

    x5 = _mm_clmulepi64_si128(x1, x0, 0x00);
    x1 = _mm_clmulepi64_si128(x1, x0, 0x11);
    x1 = _mm_xor_si128(x1, x4);
    x1 = _mm_xor_si128(x1, x5);

    /*
     * Single fold blocks of 16, if any.
     */
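    /* Remaining 16-byte blocks are folded in with the same k3/k4 step,
     * one block per iteration.
     */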
    while (len >= 16)
    {
        x2 = _mm_loadu_si128((__m128i *)buf);

        x5 = _mm_clmulepi64_si128(x1, x0, 0x00);
        x1 = _mm_clmulepi64_si128(x1, x0, 0x11);
        x1 = _mm_xor_si128(x1, x2);
        x1 = _mm_xor_si128(x1, x5);

        buf += 16;
        len -= 16;
    }

    /*
     * Fold 128-bits to 64-bits.
     */
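    /* The 0x10 immediate multiplies the low qword of x1 by k4 (the high
     * qword of x0), folding the low 64 bits onto the high 64. The masked
     * multiply by k5 below then reduces the remaining 96 bits to 64.
     */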
    x2 = _mm_clmulepi64_si128(x1, x0, 0x10);
    x3 = _mm_setr_epi32(~0, 0, ~0, 0);
    x1 = _mm_srli_si128(x1, 8);
    x1 = _mm_xor_si128(x1, x2);

    x0 = _mm_loadl_epi64((__m128i*)k5k0);

    x2 = _mm_srli_si128(x1, 4);
    x1 = _mm_and_si128(x1, x3);
    x1 = _mm_clmulepi64_si128(x1, x0, 0x00);
    x1 = _mm_xor_si128(x1, x2);

    /*
     * Barrett reduce to 32-bits.
     */
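    /* Barrett reduction in the reflected domain: multiply the low 32 bits
     * by mu (poly[1], selected by 0x10) to estimate the quotient, multiply
     * the estimate by P(x) (poly[0], selected by 0x00), and XOR. The final
     * CRC ends up in the second 32-bit lane of x1.
     */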
    x0 = _mm_load_si128((__m128i*)poly);

    x2 = _mm_and_si128(x1, x3);
    x2 = _mm_clmulepi64_si128(x2, x0, 0x10);
    x2 = _mm_and_si128(x2, x3);
    x2 = _mm_clmulepi64_si128(x2, x0, 0x00);
    x1 = _mm_xor_si128(x1, x2);

    /*
     * Return the crc32.
     */
    return _mm_extract_epi32(x1, 1);
}
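
/* Illustrative only, not part of this file: a caller is expected to uphold
 * the length contract (at least 64 bytes, a multiple of 16) and zlib's
 * pre/post-inversion convention, along the lines of:
 *
 *     if (len >= 64) {
 *         z_size_t chunk = len & ~(z_size_t)15;    // multiple of 16
 *         crc = ~crc32_sse42_simd_(buf, chunk, ~crc);
 *         buf += chunk;
 *         len -= chunk;
 *     }
 *     // any remaining tail goes through the scalar crc32 path
 *
 * The real dispatch lives in the caller (crc32.c), not here.
 */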

#elif defined(CRC32_ARMV8_CRC32)

/* CRC32 checksums using ARMv8-a crypto instructions.
 *
 * TODO: implement a version using the PMULL instruction.
 */

#if defined(__clang__)
/* CRC32 intrinsics are #ifdef'ed out of arm_acle.h unless we build with an
 * armv8 target, which is incompatible with ThinLTO optimizations on Android.
 * (Namely, mixing and matching different module-level targets makes ThinLTO
 * warn, and Android defaults to armv7-a. This restriction does not apply to
 * function-level `target`s, however.)
 *
 * Since we only need four crc intrinsics, and since clang's implementations
 * of those are just wrappers around compiler builtins, it's simplest to
 * #define those builtins directly. If this #define list grows too much (or
 * we depend on an intrinsic that isn't a trivial wrapper), we may have to
 * find a better way to go about this.
 *
 * NOTE: clang currently complains that "'+soft-float-abi' is not a recognized
 * feature for this target (ignoring feature)." This appears to be a harmless
 * bug in clang.
 */
#define __crc32b __builtin_arm_crc32b
#define __crc32d __builtin_arm_crc32d
#define __crc32w __builtin_arm_crc32w
#define __crc32cw __builtin_arm_crc32cw

#if defined(__aarch64__)
#define TARGET_ARMV8_WITH_CRC __attribute__((target("crc")))
#else  // !defined(__aarch64__)
#define TARGET_ARMV8_WITH_CRC __attribute__((target("armv8-a,crc")))
#endif  // defined(__aarch64__)

#elif defined(__GNUC__)
/* For GCC, we are setting CRC extensions at module level, so ThinLTO is not
 * allowed. We can just include arm_acle.h.
 */
#include <arm_acle.h>
#define TARGET_ARMV8_WITH_CRC
#else  // !defined(__GNUC__) && !defined(__clang__)
#error ARM CRC32 SIMD extensions only supported for Clang and GCC
#endif

TARGET_ARMV8_WITH_CRC
uint32_t ZLIB_INTERNAL armv8_crc32_little(unsigned long crc,
                                          const unsigned char *buf,
                                          z_size_t len)
{
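    /* The ARMv8 CRC32 instructions operate on the raw (inverted) value, so
     * complement on entry and again on return to match the convention of
     * zlib's crc32().
     */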
    uint32_t c = (uint32_t) ~crc;

    while (len && ((uintptr_t)buf & 7)) {
        c = __crc32b(c, *buf++);
        --len;
    }

    const uint64_t *buf8 = (const uint64_t *)buf;

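    /* The main loop is unrolled eight times, with each __crc32d consuming 8
     * bytes. The unroll amortizes loop overhead rather than adding
     * parallelism: every step depends on the previous value of c.
     */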
    while (len >= 64) {
        c = __crc32d(c, *buf8++);
        c = __crc32d(c, *buf8++);
        c = __crc32d(c, *buf8++);
        c = __crc32d(c, *buf8++);

        c = __crc32d(c, *buf8++);
        c = __crc32d(c, *buf8++);
        c = __crc32d(c, *buf8++);
        c = __crc32d(c, *buf8++);
        len -= 64;
    }

    while (len >= 8) {
        c = __crc32d(c, *buf8++);
        len -= 8;
    }

    buf = (const unsigned char *)buf8;

    while (len--) {
        c = __crc32b(c, *buf++);
    }

    return ~c;
}

#endif