/*
 * Compute the CRC32 using a parallelized folding approach with the PCLMULQDQ
 * instruction.
 *
 * A white paper describing this algorithm can be found at:
 * http://www.intel.com/content/dam/www/public/us/en/documents/white-papers/fast-crc-computation-generic-polynomials-pclmulqdq-paper.pdf
 *
 * Copyright (C) 2013 Intel Corporation. All rights reserved.
 * Authors:
 *     Wajdi Feghali   <wajdi.k.feghali@intel.com>
 *     Jim Guilford    <james.guilford@intel.com>
 *     Vinodh Gopal    <vinodh.gopal@intel.com>
 *     Erdinc Ozturk   <erdinc.ozturk@intel.com>
 *     Jim Kukunas     <james.t.kukunas@linux.intel.com>
 *
 * For conditions of distribution and use, see copyright notice in zlib.h
 */
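
/*
 * Usage sketch (illustrative only; assumes a fully initialized
 * deflate_state `s` with s->strm valid and enough room at `dst`):
 *
 *     crc_fold_init(s);
 *     crc_fold_copy(s, dst, src, len);       (copy + fold, may be repeated)
 *     s->strm->adler = crc_fold_512to32(s);  (reduce to the final CRC-32)
 *
 * The intermediate 4 x 128-bit folding state lives in s->crc0 between calls.
 */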

#include "deflate.h"

#include <inttypes.h>
#include <emmintrin.h>
#include <immintrin.h>
#include <wmmintrin.h>

#ifndef __GNUC__
#define __attribute__(x)
#endif

#define CRC_LOAD(s) \
    do { \
        __m128i xmm_crc0 = _mm_loadu_si128((__m128i *)s->crc0 + 0);\
        __m128i xmm_crc1 = _mm_loadu_si128((__m128i *)s->crc0 + 1);\
        __m128i xmm_crc2 = _mm_loadu_si128((__m128i *)s->crc0 + 2);\
        __m128i xmm_crc3 = _mm_loadu_si128((__m128i *)s->crc0 + 3);\
        __m128i xmm_crc_part = _mm_loadu_si128((__m128i *)s->crc0 + 4);

#define CRC_SAVE(s) \
        _mm_storeu_si128((__m128i *)s->crc0 + 0, xmm_crc0);\
        _mm_storeu_si128((__m128i *)s->crc0 + 1, xmm_crc1);\
        _mm_storeu_si128((__m128i *)s->crc0 + 2, xmm_crc2);\
        _mm_storeu_si128((__m128i *)s->crc0 + 3, xmm_crc3);\
        _mm_storeu_si128((__m128i *)s->crc0 + 4, xmm_crc_part);\
    } while (0);
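
/*
 * CRC_LOAD and CRC_SAVE must be used as a matched pair within one
 * function: CRC_LOAD opens a do { ... } block and declares the local
 * xmm_crc0..xmm_crc3 and xmm_crc_part registers from s->crc0, and
 * CRC_SAVE writes them back to s->crc0 and closes that block.
 */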

__attribute__((target("sse4.2,pclmul")))
ZLIB_INTERNAL void crc_fold_init(deflate_state *const s)
{
    CRC_LOAD(s)

    xmm_crc0 = _mm_cvtsi32_si128(0x9db42487);
    xmm_crc1 = _mm_setzero_si128();
    xmm_crc2 = _mm_setzero_si128();
    xmm_crc3 = _mm_setzero_si128();

    CRC_SAVE(s)

    s->strm->adler = 0;
}
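
/*
 * fold_1 .. fold_4 advance the 4 x 128-bit CRC state by 1..4 16-byte
 * blocks.  xmm_fold4 holds the white paper's carry-less multiplication
 * constants for folding a 128-bit lane forward across 64 bytes of
 * input; the caller XORs the next block(s) of data into the lanes that
 * become free (see crc_fold_copy below).
 */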
__attribute__((target("sse4.2,pclmul")))
local void fold_1(deflate_state *const s,
        __m128i *xmm_crc0, __m128i *xmm_crc1,
        __m128i *xmm_crc2, __m128i *xmm_crc3)
{
    const __m128i xmm_fold4 = _mm_set_epi32(
            0x00000001, 0x54442bd4,
            0x00000001, 0xc6e41596);

    __m128i x_tmp3;
    __m128 ps_crc0, ps_crc3, ps_res;

    x_tmp3 = *xmm_crc3;

    *xmm_crc3 = *xmm_crc0;
    *xmm_crc0 = _mm_clmulepi64_si128(*xmm_crc0, xmm_fold4, 0x01);
    *xmm_crc3 = _mm_clmulepi64_si128(*xmm_crc3, xmm_fold4, 0x10);
    ps_crc0 = _mm_castsi128_ps(*xmm_crc0);
    ps_crc3 = _mm_castsi128_ps(*xmm_crc3);
    ps_res = _mm_xor_ps(ps_crc0, ps_crc3);

    *xmm_crc0 = *xmm_crc1;
    *xmm_crc1 = *xmm_crc2;
    *xmm_crc2 = x_tmp3;
    *xmm_crc3 = _mm_castps_si128(ps_res);
}
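
/* As fold_1, but advances the state by two 16-byte blocks. */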
__attribute__((target("sse4.2,pclmul")))
local void fold_2(deflate_state *const s,
        __m128i *xmm_crc0, __m128i *xmm_crc1,
        __m128i *xmm_crc2, __m128i *xmm_crc3)
{
    const __m128i xmm_fold4 = _mm_set_epi32(
            0x00000001, 0x54442bd4,
            0x00000001, 0xc6e41596);

    __m128i x_tmp3, x_tmp2;
    __m128 ps_crc0, ps_crc1, ps_crc2, ps_crc3, ps_res31, ps_res20;

    x_tmp3 = *xmm_crc3;
    x_tmp2 = *xmm_crc2;

    *xmm_crc3 = *xmm_crc1;
    *xmm_crc1 = _mm_clmulepi64_si128(*xmm_crc1, xmm_fold4, 0x01);
    *xmm_crc3 = _mm_clmulepi64_si128(*xmm_crc3, xmm_fold4, 0x10);
    ps_crc3 = _mm_castsi128_ps(*xmm_crc3);
    ps_crc1 = _mm_castsi128_ps(*xmm_crc1);
    ps_res31 = _mm_xor_ps(ps_crc3, ps_crc1);

    *xmm_crc2 = *xmm_crc0;
    *xmm_crc0 = _mm_clmulepi64_si128(*xmm_crc0, xmm_fold4, 0x01);
    *xmm_crc2 = _mm_clmulepi64_si128(*xmm_crc2, xmm_fold4, 0x10);
    ps_crc0 = _mm_castsi128_ps(*xmm_crc0);
    ps_crc2 = _mm_castsi128_ps(*xmm_crc2);
    ps_res20 = _mm_xor_ps(ps_crc0, ps_crc2);

    *xmm_crc0 = x_tmp2;
    *xmm_crc1 = x_tmp3;
    *xmm_crc2 = _mm_castps_si128(ps_res20);
    *xmm_crc3 = _mm_castps_si128(ps_res31);
}
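
/* As fold_1, but advances the state by three 16-byte blocks. */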
__attribute__((target("sse4.2,pclmul")))
local void fold_3(deflate_state *const s,
        __m128i *xmm_crc0, __m128i *xmm_crc1,
        __m128i *xmm_crc2, __m128i *xmm_crc3)
{
    const __m128i xmm_fold4 = _mm_set_epi32(
            0x00000001, 0x54442bd4,
            0x00000001, 0xc6e41596);

    __m128i x_tmp3;
    __m128 ps_crc0, ps_crc1, ps_crc2, ps_crc3, ps_res32, ps_res21, ps_res10;

    x_tmp3 = *xmm_crc3;

    *xmm_crc3 = *xmm_crc2;
    *xmm_crc2 = _mm_clmulepi64_si128(*xmm_crc2, xmm_fold4, 0x01);
    *xmm_crc3 = _mm_clmulepi64_si128(*xmm_crc3, xmm_fold4, 0x10);
    ps_crc2 = _mm_castsi128_ps(*xmm_crc2);
    ps_crc3 = _mm_castsi128_ps(*xmm_crc3);
    ps_res32 = _mm_xor_ps(ps_crc2, ps_crc3);

    *xmm_crc2 = *xmm_crc1;
    *xmm_crc1 = _mm_clmulepi64_si128(*xmm_crc1, xmm_fold4, 0x01);
    *xmm_crc2 = _mm_clmulepi64_si128(*xmm_crc2, xmm_fold4, 0x10);
    ps_crc1 = _mm_castsi128_ps(*xmm_crc1);
    ps_crc2 = _mm_castsi128_ps(*xmm_crc2);
    ps_res21 = _mm_xor_ps(ps_crc1, ps_crc2);

    *xmm_crc1 = *xmm_crc0;
    *xmm_crc0 = _mm_clmulepi64_si128(*xmm_crc0, xmm_fold4, 0x01);
    *xmm_crc1 = _mm_clmulepi64_si128(*xmm_crc1, xmm_fold4, 0x10);
    ps_crc0 = _mm_castsi128_ps(*xmm_crc0);
    ps_crc1 = _mm_castsi128_ps(*xmm_crc1);
    ps_res10 = _mm_xor_ps(ps_crc0, ps_crc1);

    *xmm_crc0 = x_tmp3;
    *xmm_crc1 = _mm_castps_si128(ps_res10);
    *xmm_crc2 = _mm_castps_si128(ps_res21);
    *xmm_crc3 = _mm_castps_si128(ps_res32);
}
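
/* Advance all four 128-bit lanes by 64 bytes (four 16-byte blocks). */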
__attribute__((target("sse4.2,pclmul")))
local void fold_4(deflate_state *const s,
        __m128i *xmm_crc0, __m128i *xmm_crc1,
        __m128i *xmm_crc2, __m128i *xmm_crc3)
{
    const __m128i xmm_fold4 = _mm_set_epi32(
            0x00000001, 0x54442bd4,
            0x00000001, 0xc6e41596);

    __m128i x_tmp0, x_tmp1, x_tmp2, x_tmp3;
    __m128 ps_crc0, ps_crc1, ps_crc2, ps_crc3;
    __m128 ps_t0, ps_t1, ps_t2, ps_t3;
    __m128 ps_res0, ps_res1, ps_res2, ps_res3;

    x_tmp0 = *xmm_crc0;
    x_tmp1 = *xmm_crc1;
    x_tmp2 = *xmm_crc2;
    x_tmp3 = *xmm_crc3;

    *xmm_crc0 = _mm_clmulepi64_si128(*xmm_crc0, xmm_fold4, 0x01);
    x_tmp0 = _mm_clmulepi64_si128(x_tmp0, xmm_fold4, 0x10);
    ps_crc0 = _mm_castsi128_ps(*xmm_crc0);
    ps_t0 = _mm_castsi128_ps(x_tmp0);
    ps_res0 = _mm_xor_ps(ps_crc0, ps_t0);

    *xmm_crc1 = _mm_clmulepi64_si128(*xmm_crc1, xmm_fold4, 0x01);
    x_tmp1 = _mm_clmulepi64_si128(x_tmp1, xmm_fold4, 0x10);
    ps_crc1 = _mm_castsi128_ps(*xmm_crc1);
    ps_t1 = _mm_castsi128_ps(x_tmp1);
    ps_res1 = _mm_xor_ps(ps_crc1, ps_t1);

    *xmm_crc2 = _mm_clmulepi64_si128(*xmm_crc2, xmm_fold4, 0x01);
    x_tmp2 = _mm_clmulepi64_si128(x_tmp2, xmm_fold4, 0x10);
    ps_crc2 = _mm_castsi128_ps(*xmm_crc2);
    ps_t2 = _mm_castsi128_ps(x_tmp2);
    ps_res2 = _mm_xor_ps(ps_crc2, ps_t2);

    *xmm_crc3 = _mm_clmulepi64_si128(*xmm_crc3, xmm_fold4, 0x01);
    x_tmp3 = _mm_clmulepi64_si128(x_tmp3, xmm_fold4, 0x10);
    ps_crc3 = _mm_castsi128_ps(*xmm_crc3);
    ps_t3 = _mm_castsi128_ps(x_tmp3);
    ps_res3 = _mm_xor_ps(ps_crc3, ps_t3);

    *xmm_crc0 = _mm_castps_si128(ps_res0);
    *xmm_crc1 = _mm_castps_si128(ps_res1);
    *xmm_crc2 = _mm_castps_si128(ps_res2);
    *xmm_crc3 = _mm_castps_si128(ps_res3);
}
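
/*
 * PSHUFB control masks used by partial_fold.  Entry (len - 1) shifts a
 * 16-byte register left by (16 - len) bytes; XORing the mask with
 * 0x80808080 turns it into a right shift by len bytes, since PSHUFB
 * zeroes every byte whose control value has the top bit set.
 */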
local const unsigned zalign(32) pshufb_shf_table[60] = {
    0x84838281, 0x88878685, 0x8c8b8a89, 0x008f8e8d, /* shl 15 (16 - 1)/shr1 */
    0x85848382, 0x89888786, 0x8d8c8b8a, 0x01008f8e, /* shl 14 (16 - 2)/shr2 */
    0x86858483, 0x8a898887, 0x8e8d8c8b, 0x0201008f, /* shl 13 (16 - 3)/shr3 */
    0x87868584, 0x8b8a8988, 0x8f8e8d8c, 0x03020100, /* shl 12 (16 - 4)/shr4 */
    0x88878685, 0x8c8b8a89, 0x008f8e8d, 0x04030201, /* shl 11 (16 - 5)/shr5 */
    0x89888786, 0x8d8c8b8a, 0x01008f8e, 0x05040302, /* shl 10 (16 - 6)/shr6 */
    0x8a898887, 0x8e8d8c8b, 0x0201008f, 0x06050403, /* shl  9 (16 - 7)/shr7 */
    0x8b8a8988, 0x8f8e8d8c, 0x03020100, 0x07060504, /* shl  8 (16 - 8)/shr8 */
    0x8c8b8a89, 0x008f8e8d, 0x04030201, 0x08070605, /* shl  7 (16 - 9)/shr9 */
    0x8d8c8b8a, 0x01008f8e, 0x05040302, 0x09080706, /* shl  6 (16 -10)/shr10 */
    0x8e8d8c8b, 0x0201008f, 0x06050403, 0x0a090807, /* shl  5 (16 -11)/shr11 */
    0x8f8e8d8c, 0x03020100, 0x07060504, 0x0b0a0908, /* shl  4 (16 -12)/shr12 */
    0x008f8e8d, 0x04030201, 0x08070605, 0x0c0b0a09, /* shl  3 (16 -13)/shr13 */
    0x01008f8e, 0x05040302, 0x09080706, 0x0d0c0b0a, /* shl  2 (16 -14)/shr14 */
    0x0201008f, 0x06050403, 0x0a090807, 0x0e0d0c0b  /* shl  1 (16 -15)/shr15 */
};
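
/*
 * Fold `len` (1..15) bytes of new input, already placed in
 * *xmm_crc_part, into the CRC state: the four lanes are shifted by
 * `len` bytes using the pshufb table above, the new bytes are merged
 * into the newest lane, and the bytes that overflow the oldest lane
 * are carried forward with a 64-byte fold.
 */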
__attribute__((target("sse4.2,pclmul")))
local void partial_fold(deflate_state *const s, const size_t len,
        __m128i *xmm_crc0, __m128i *xmm_crc1,
        __m128i *xmm_crc2, __m128i *xmm_crc3,
        __m128i *xmm_crc_part)
{
    const __m128i xmm_fold4 = _mm_set_epi32(
            0x00000001, 0x54442bd4,
            0x00000001, 0xc6e41596);
    const __m128i xmm_mask3 = _mm_set1_epi32(0x80808080);

    __m128i xmm_shl, xmm_shr, xmm_tmp1, xmm_tmp2, xmm_tmp3;
    __m128i xmm_a0_0, xmm_a0_1;
    __m128 ps_crc3, psa0_0, psa0_1, ps_res;

    xmm_shl = _mm_load_si128((__m128i *)pshufb_shf_table + (len - 1));
    xmm_shr = xmm_shl;
    xmm_shr = _mm_xor_si128(xmm_shr, xmm_mask3);

    xmm_a0_0 = _mm_shuffle_epi8(*xmm_crc0, xmm_shl);

    *xmm_crc0 = _mm_shuffle_epi8(*xmm_crc0, xmm_shr);
    xmm_tmp1 = _mm_shuffle_epi8(*xmm_crc1, xmm_shl);
    *xmm_crc0 = _mm_or_si128(*xmm_crc0, xmm_tmp1);

    *xmm_crc1 = _mm_shuffle_epi8(*xmm_crc1, xmm_shr);
    xmm_tmp2 = _mm_shuffle_epi8(*xmm_crc2, xmm_shl);
    *xmm_crc1 = _mm_or_si128(*xmm_crc1, xmm_tmp2);

    *xmm_crc2 = _mm_shuffle_epi8(*xmm_crc2, xmm_shr);
    xmm_tmp3 = _mm_shuffle_epi8(*xmm_crc3, xmm_shl);
    *xmm_crc2 = _mm_or_si128(*xmm_crc2, xmm_tmp3);

    *xmm_crc3 = _mm_shuffle_epi8(*xmm_crc3, xmm_shr);
    *xmm_crc_part = _mm_shuffle_epi8(*xmm_crc_part, xmm_shl);
    *xmm_crc3 = _mm_or_si128(*xmm_crc3, *xmm_crc_part);

    xmm_a0_1 = _mm_clmulepi64_si128(xmm_a0_0, xmm_fold4, 0x10);
    xmm_a0_0 = _mm_clmulepi64_si128(xmm_a0_0, xmm_fold4, 0x01);

    ps_crc3 = _mm_castsi128_ps(*xmm_crc3);
    psa0_0 = _mm_castsi128_ps(xmm_a0_0);
    psa0_1 = _mm_castsi128_ps(xmm_a0_1);

    ps_res = _mm_xor_ps(ps_crc3, psa0_0);
    ps_res = _mm_xor_ps(ps_res, psa0_1);

    *xmm_crc3 = _mm_castps_si128(ps_res);
}
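
/*
 * Copy `len` bytes from src to dst while folding them into the running
 * CRC state stored in s->crc0.  The bulk of the work is done on
 * 16-byte-aligned source data in 64-byte chunks; a prologue and the
 * fold_3/fold_2/fold_1 and partial_fold paths handle the unaligned
 * head and the sub-64-byte tail.
 */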
__attribute__((target("sse4.2,pclmul")))
ZLIB_INTERNAL void crc_fold_copy(deflate_state *const s,
        unsigned char *dst, const unsigned char *src, long len)
{
    unsigned long algn_diff;
    __m128i xmm_t0, xmm_t1, xmm_t2, xmm_t3;

    CRC_LOAD(s)

    if (len < 16) {
        if (len == 0)
            return;
        goto partial;
    }
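
    /*
     * Align src to a 16-byte boundary: copy the first algn_diff bytes
     * (via a full 16-byte load/store; the excess is overwritten by the
     * stores that follow) and fold them into the state.
     */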
    algn_diff = (0 - (uintptr_t)src) & 0xF;
    if (algn_diff) {
        xmm_crc_part = _mm_loadu_si128((__m128i *)src);
        _mm_storeu_si128((__m128i *)dst, xmm_crc_part);

        dst += algn_diff;
        src += algn_diff;
        len -= algn_diff;

        partial_fold(s, algn_diff, &xmm_crc0, &xmm_crc1, &xmm_crc2, &xmm_crc3,
            &xmm_crc_part);
    }
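
    /*
     * Main loop: copy and fold 64 bytes (four 16-byte blocks) per
     * iteration from the now 16-byte-aligned source.
     */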
    while ((len -= 64) >= 0) {
        xmm_t0 = _mm_load_si128((__m128i *)src);
        xmm_t1 = _mm_load_si128((__m128i *)src + 1);
        xmm_t2 = _mm_load_si128((__m128i *)src + 2);
        xmm_t3 = _mm_load_si128((__m128i *)src + 3);

        fold_4(s, &xmm_crc0, &xmm_crc1, &xmm_crc2, &xmm_crc3);

        _mm_storeu_si128((__m128i *)dst, xmm_t0);
        _mm_storeu_si128((__m128i *)dst + 1, xmm_t1);
        _mm_storeu_si128((__m128i *)dst + 2, xmm_t2);
        _mm_storeu_si128((__m128i *)dst + 3, xmm_t3);

        xmm_crc0 = _mm_xor_si128(xmm_crc0, xmm_t0);
        xmm_crc1 = _mm_xor_si128(xmm_crc1, xmm_t1);
        xmm_crc2 = _mm_xor_si128(xmm_crc2, xmm_t2);
        xmm_crc3 = _mm_xor_si128(xmm_crc3, xmm_t3);

        src += 64;
        dst += 64;
    }
    /*
     * At this point len is (number of bytes left) - 64, i.e. in the
     * range [-64, -1].  Handle the remaining 48-, 32- or 16-byte chunk
     * with fold_3/fold_2/fold_1, then fall through to the final
     * 1..15-byte partial block.
     */
    if (len + 16 >= 0) {
        len += 16;

        xmm_t0 = _mm_load_si128((__m128i *)src);
        xmm_t1 = _mm_load_si128((__m128i *)src + 1);
        xmm_t2 = _mm_load_si128((__m128i *)src + 2);

        fold_3(s, &xmm_crc0, &xmm_crc1, &xmm_crc2, &xmm_crc3);

        _mm_storeu_si128((__m128i *)dst, xmm_t0);
        _mm_storeu_si128((__m128i *)dst + 1, xmm_t1);
        _mm_storeu_si128((__m128i *)dst + 2, xmm_t2);

        xmm_crc1 = _mm_xor_si128(xmm_crc1, xmm_t0);
        xmm_crc2 = _mm_xor_si128(xmm_crc2, xmm_t1);
        xmm_crc3 = _mm_xor_si128(xmm_crc3, xmm_t2);

        if (len == 0)
            goto done;

        dst += 48;
        src += 48;
    } else if (len + 32 >= 0) {
        len += 32;

        xmm_t0 = _mm_load_si128((__m128i *)src);
        xmm_t1 = _mm_load_si128((__m128i *)src + 1);

        fold_2(s, &xmm_crc0, &xmm_crc1, &xmm_crc2, &xmm_crc3);

        _mm_storeu_si128((__m128i *)dst, xmm_t0);
        _mm_storeu_si128((__m128i *)dst + 1, xmm_t1);

        xmm_crc2 = _mm_xor_si128(xmm_crc2, xmm_t0);
        xmm_crc3 = _mm_xor_si128(xmm_crc3, xmm_t1);

        if (len == 0)
            goto done;

        dst += 32;
        src += 32;
    } else if (len + 48 >= 0) {
        len += 48;

        xmm_t0 = _mm_load_si128((__m128i *)src);

        fold_1(s, &xmm_crc0, &xmm_crc1, &xmm_crc2, &xmm_crc3);

        _mm_storeu_si128((__m128i *)dst, xmm_t0);

        xmm_crc3 = _mm_xor_si128(xmm_crc3, xmm_t0);

        if (len == 0)
            goto done;

        dst += 16;
        src += 16;
    } else {
        len += 64;
        if (len == 0)
            goto done;
    }
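
    /*
     * 1..15 bytes remain.  Load them zero-padded into xmm_crc_part,
     * store the register to dst and fold the bytes into the CRC state.
     */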
partial:

#if defined(_MSC_VER)
    /* VS does not permit the use of _mm_set_epi64x in 32-bit builds */
    {
        int32_t parts[4] = {0, 0, 0, 0};
        memcpy(&parts, src, len);
        xmm_crc_part = _mm_set_epi32(parts[3], parts[2], parts[1], parts[0]);
    }
#else
    {
        int64_t parts[2] = {0, 0};
        memcpy(&parts, src, len);
        xmm_crc_part = _mm_set_epi64x(parts[1], parts[0]);
    }
#endif

    _mm_storeu_si128((__m128i *)dst, xmm_crc_part);
    partial_fold(s, len, &xmm_crc0, &xmm_crc1, &xmm_crc2, &xmm_crc3,
        &xmm_crc_part);
done:
    CRC_SAVE(s)
}
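
/*
 * Constants for the final reduction (rk1..rk8 in the white paper's
 * notation): rk1/rk2 fold the four 128-bit lanes into one, rk5/rk6
 * reduce 128 bits to 64, and rk7/rk8 are the Barrett reduction
 * constants that produce the final 32-bit CRC.
 */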
local const unsigned zalign(16) crc_k[] = {
    0xccaa009e, 0x00000000, /* rk1 */
    0x751997d0, 0x00000001, /* rk2 */
    0xccaa009e, 0x00000000, /* rk5 */
    0x63cd6124, 0x00000001, /* rk6 */
    0xf7011640, 0x00000001, /* rk7 */
    0xdb710640, 0x00000001  /* rk8 */
};

local const unsigned zalign(16) crc_mask[4] = {
    0xFFFFFFFF, 0xFFFFFFFF, 0x00000000, 0x00000000
};

local const unsigned zalign(16) crc_mask2[4] = {
    0x00000000, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF
};
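
/*
 * Reduce the 4 x 128-bit folded state in s->crc0 to the final 32-bit
 * CRC and return its bitwise complement.  The CRC_SAVE after the
 * return is unreachable, but it is needed to close the do { } block
 * opened by CRC_LOAD.
 */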
__attribute__((target("sse4.2,pclmul")))
unsigned ZLIB_INTERNAL crc_fold_512to32(deflate_state *const s)
{
    const __m128i xmm_mask  = _mm_load_si128((__m128i *)crc_mask);
    const __m128i xmm_mask2 = _mm_load_si128((__m128i *)crc_mask2);

    unsigned crc;
    __m128i x_tmp0, x_tmp1, x_tmp2, crc_fold;

    CRC_LOAD(s)

    /*
     * rk1, rk2: fold the four 128-bit lanes down to a single 128-bit value.
     */
    crc_fold = _mm_load_si128((__m128i *)crc_k);

    x_tmp0 = _mm_clmulepi64_si128(xmm_crc0, crc_fold, 0x10);
    xmm_crc0 = _mm_clmulepi64_si128(xmm_crc0, crc_fold, 0x01);
    xmm_crc1 = _mm_xor_si128(xmm_crc1, x_tmp0);
    xmm_crc1 = _mm_xor_si128(xmm_crc1, xmm_crc0);

    x_tmp1 = _mm_clmulepi64_si128(xmm_crc1, crc_fold, 0x10);
    xmm_crc1 = _mm_clmulepi64_si128(xmm_crc1, crc_fold, 0x01);
    xmm_crc2 = _mm_xor_si128(xmm_crc2, x_tmp1);
    xmm_crc2 = _mm_xor_si128(xmm_crc2, xmm_crc1);

    x_tmp2 = _mm_clmulepi64_si128(xmm_crc2, crc_fold, 0x10);
    xmm_crc2 = _mm_clmulepi64_si128(xmm_crc2, crc_fold, 0x01);
    xmm_crc3 = _mm_xor_si128(xmm_crc3, x_tmp2);
    xmm_crc3 = _mm_xor_si128(xmm_crc3, xmm_crc2);

    /*
     * rk5, rk6: reduce the 128-bit result to 64 bits.
     */
    crc_fold = _mm_load_si128((__m128i *)crc_k + 1);

    xmm_crc0 = xmm_crc3;
    xmm_crc3 = _mm_clmulepi64_si128(xmm_crc3, crc_fold, 0);
    xmm_crc0 = _mm_srli_si128(xmm_crc0, 8);
    xmm_crc3 = _mm_xor_si128(xmm_crc3, xmm_crc0);

    xmm_crc0 = xmm_crc3;
    xmm_crc3 = _mm_slli_si128(xmm_crc3, 4);
    xmm_crc3 = _mm_clmulepi64_si128(xmm_crc3, crc_fold, 0x10);
    xmm_crc3 = _mm_xor_si128(xmm_crc3, xmm_crc0);
    xmm_crc3 = _mm_and_si128(xmm_crc3, xmm_mask2);

    /*
     * rk7, rk8: Barrett reduction from 64 bits to the final 32-bit CRC.
     */
    xmm_crc1 = xmm_crc3;
    xmm_crc2 = xmm_crc3;
    crc_fold = _mm_load_si128((__m128i *)crc_k + 2);

    xmm_crc3 = _mm_clmulepi64_si128(xmm_crc3, crc_fold, 0);
    xmm_crc3 = _mm_xor_si128(xmm_crc3, xmm_crc2);
    xmm_crc3 = _mm_and_si128(xmm_crc3, xmm_mask);

    xmm_crc2 = xmm_crc3;
    xmm_crc3 = _mm_clmulepi64_si128(xmm_crc3, crc_fold, 0x10);
    xmm_crc3 = _mm_xor_si128(xmm_crc3, xmm_crc2);
    xmm_crc3 = _mm_xor_si128(xmm_crc3, xmm_crc1);

    crc = _mm_extract_epi32(xmm_crc3, 2);
    return ~crc;
    CRC_SAVE(s)
}