• Home
  • Line#
  • Scopes#
  • Navigate#
  • Raw
  • Download
1diff --git a/crc32.c b/crc32.c
2index 9580440c0e6b..9162429cc7b4 100644
3--- a/crc32.c
4+++ b/crc32.c
5@@ -28,6 +28,8 @@
6 #  endif /* !DYNAMIC_CRC_TABLE */
7 #endif /* MAKECRCH */
8
9+#include "deflate.h"
10+#include "x86.h"
11 #include "zutil.h"      /* for STDC and FAR definitions */
12
13 /* Definitions for doing the crc four data bytes at a time. */
14@@ -440,3 +442,28 @@ uLong ZEXPORT crc32_combine64(crc1, crc2, len2)
15 {
16     return crc32_combine_(crc1, crc2, len2);
17 }
18+
19+ZLIB_INTERNAL void crc_reset(deflate_state *const s)
20+{
21+    if (x86_cpu_enable_simd) {
22+        crc_fold_init(s);
23+        return;
24+    }
25+    s->strm->adler = crc32(0L, Z_NULL, 0);
26+}
27+
28+ZLIB_INTERNAL void crc_finalize(deflate_state *const s)
29+{
30+    if (x86_cpu_enable_simd)
31+        s->strm->adler = crc_fold_512to32(s);
32+}
33+
34+ZLIB_INTERNAL void copy_with_crc(z_streamp strm, Bytef *dst, long size)
35+{
36+    if (x86_cpu_enable_simd) {
37+        crc_fold_copy(strm->state, dst, strm->next_in, size);
38+        return;
39+    }
40+    zmemcpy(dst, strm->next_in, size);
41+    strm->adler = crc32(strm->adler, dst, size);
42+}
43diff --git a/crc_folding.c b/crc_folding.c
44new file mode 100644
45index 000000000000..48d77744aaf4
46--- /dev/null
47+++ b/crc_folding.c
48@@ -0,0 +1,493 @@
49+/*
50+ * Compute the CRC32 using a parallelized folding approach with the PCLMULQDQ
51+ * instruction.
52+ *
53+ * A white paper describing this algorithm can be found at:
54+ * http://www.intel.com/content/dam/www/public/us/en/documents/white-papers/fast-crc-computation-generic-polynomials-pclmulqdq-paper.pdf
55+ *
56+ * Copyright (C) 2013 Intel Corporation. All rights reserved.
57+ * Authors:
58+ * 	Wajdi Feghali   <wajdi.k.feghali@intel.com>
59+ * 	Jim Guilford    <james.guilford@intel.com>
60+ * 	Vinodh Gopal    <vinodh.gopal@intel.com>
61+ * 	Erdinc Ozturk   <erdinc.ozturk@intel.com>
62+ * 	Jim Kukunas     <james.t.kukunas@linux.intel.com>
63+ *
64+ * For conditions of distribution and use, see copyright notice in zlib.h
65+ */
66+
67+#include "deflate.h"
68+
69+#include <inttypes.h>
70+#include <emmintrin.h>
71+#include <immintrin.h>
72+#include <wmmintrin.h>
73+
74+#define CRC_LOAD(s) \
75+    do { \
76+        __m128i xmm_crc0 = _mm_loadu_si128((__m128i *)s->crc0 + 0);\
77+        __m128i xmm_crc1 = _mm_loadu_si128((__m128i *)s->crc0 + 1);\
78+        __m128i xmm_crc2 = _mm_loadu_si128((__m128i *)s->crc0 + 2);\
79+        __m128i xmm_crc3 = _mm_loadu_si128((__m128i *)s->crc0 + 3);\
80+        __m128i xmm_crc_part = _mm_loadu_si128((__m128i *)s->crc0 + 4);
81+
82+#define CRC_SAVE(s) \
83+        _mm_storeu_si128((__m128i *)s->crc0 + 0, xmm_crc0);\
84+        _mm_storeu_si128((__m128i *)s->crc0 + 1, xmm_crc1);\
85+        _mm_storeu_si128((__m128i *)s->crc0 + 2, xmm_crc2);\
86+        _mm_storeu_si128((__m128i *)s->crc0 + 3, xmm_crc3);\
87+        _mm_storeu_si128((__m128i *)s->crc0 + 4, xmm_crc_part);\
88+    } while (0);
89+
90+ZLIB_INTERNAL void crc_fold_init(deflate_state *const s)
91+{
92+    CRC_LOAD(s)
93+
94+    xmm_crc0 = _mm_cvtsi32_si128(0x9db42487);
95+    xmm_crc1 = _mm_setzero_si128();
96+    xmm_crc2 = _mm_setzero_si128();
97+    xmm_crc3 = _mm_setzero_si128();
98+
99+    CRC_SAVE(s)
100+
101+    s->strm->adler = 0;
102+}
103+
104+local void fold_1(deflate_state *const s,
105+        __m128i *xmm_crc0, __m128i *xmm_crc1,
106+        __m128i *xmm_crc2, __m128i *xmm_crc3)
107+{
108+    const __m128i xmm_fold4 = _mm_set_epi32(
109+            0x00000001, 0x54442bd4,
110+            0x00000001, 0xc6e41596);
111+
112+    __m128i x_tmp3;
113+    __m128 ps_crc0, ps_crc3, ps_res;
114+
115+    x_tmp3 = *xmm_crc3;
116+
117+    *xmm_crc3 = *xmm_crc0;
118+    *xmm_crc0 = _mm_clmulepi64_si128(*xmm_crc0, xmm_fold4, 0x01);
119+    *xmm_crc3 = _mm_clmulepi64_si128(*xmm_crc3, xmm_fold4, 0x10);
120+    ps_crc0 = _mm_castsi128_ps(*xmm_crc0);
121+    ps_crc3 = _mm_castsi128_ps(*xmm_crc3);
122+    ps_res = _mm_xor_ps(ps_crc0, ps_crc3);
123+
124+    *xmm_crc0 = *xmm_crc1;
125+    *xmm_crc1 = *xmm_crc2;
126+    *xmm_crc2 = x_tmp3;
127+    *xmm_crc3 = _mm_castps_si128(ps_res);
128+}
129+
130+local void fold_2(deflate_state *const s,
131+        __m128i *xmm_crc0, __m128i *xmm_crc1,
132+        __m128i *xmm_crc2, __m128i *xmm_crc3)
133+{
134+    const __m128i xmm_fold4 = _mm_set_epi32(
135+            0x00000001, 0x54442bd4,
136+            0x00000001, 0xc6e41596);
137+
138+    __m128i x_tmp3, x_tmp2;
139+    __m128 ps_crc0, ps_crc1, ps_crc2, ps_crc3, ps_res31, ps_res20;
140+
141+    x_tmp3 = *xmm_crc3;
142+    x_tmp2 = *xmm_crc2;
143+
144+    *xmm_crc3 = *xmm_crc1;
145+    *xmm_crc1 = _mm_clmulepi64_si128(*xmm_crc1, xmm_fold4, 0x01);
146+    *xmm_crc3 = _mm_clmulepi64_si128(*xmm_crc3, xmm_fold4, 0x10);
147+    ps_crc3 = _mm_castsi128_ps(*xmm_crc3);
148+    ps_crc1 = _mm_castsi128_ps(*xmm_crc1);
149+    ps_res31= _mm_xor_ps(ps_crc3, ps_crc1);
150+
151+    *xmm_crc2 = *xmm_crc0;
152+    *xmm_crc0 = _mm_clmulepi64_si128(*xmm_crc0, xmm_fold4, 0x01);
153+    *xmm_crc2 = _mm_clmulepi64_si128(*xmm_crc2, xmm_fold4, 0x10);
154+    ps_crc0 = _mm_castsi128_ps(*xmm_crc0);
155+    ps_crc2 = _mm_castsi128_ps(*xmm_crc2);
156+    ps_res20= _mm_xor_ps(ps_crc0, ps_crc2);
157+
158+    *xmm_crc0 = x_tmp2;
159+    *xmm_crc1 = x_tmp3;
160+    *xmm_crc2 = _mm_castps_si128(ps_res20);
161+    *xmm_crc3 = _mm_castps_si128(ps_res31);
162+}
163+
164+local void fold_3(deflate_state *const s,
165+        __m128i *xmm_crc0, __m128i *xmm_crc1,
166+        __m128i *xmm_crc2, __m128i *xmm_crc3)
167+{
168+    const __m128i xmm_fold4 = _mm_set_epi32(
169+            0x00000001, 0x54442bd4,
170+            0x00000001, 0xc6e41596);
171+
172+    __m128i x_tmp3;
173+    __m128 ps_crc0, ps_crc1, ps_crc2, ps_crc3, ps_res32, ps_res21, ps_res10;
174+
175+    x_tmp3 = *xmm_crc3;
176+
177+    *xmm_crc3 = *xmm_crc2;
178+    *xmm_crc2 = _mm_clmulepi64_si128(*xmm_crc2, xmm_fold4, 0x01);
179+    *xmm_crc3 = _mm_clmulepi64_si128(*xmm_crc3, xmm_fold4, 0x10);
180+    ps_crc2 = _mm_castsi128_ps(*xmm_crc2);
181+    ps_crc3 = _mm_castsi128_ps(*xmm_crc3);
182+    ps_res32 = _mm_xor_ps(ps_crc2, ps_crc3);
183+
184+    *xmm_crc2 = *xmm_crc1;
185+    *xmm_crc1 = _mm_clmulepi64_si128(*xmm_crc1, xmm_fold4, 0x01);
186+    *xmm_crc2 = _mm_clmulepi64_si128(*xmm_crc2, xmm_fold4, 0x10);
187+    ps_crc1 = _mm_castsi128_ps(*xmm_crc1);
188+    ps_crc2 = _mm_castsi128_ps(*xmm_crc2);
189+    ps_res21= _mm_xor_ps(ps_crc1, ps_crc2);
190+
191+    *xmm_crc1 = *xmm_crc0;
192+    *xmm_crc0 = _mm_clmulepi64_si128(*xmm_crc0, xmm_fold4, 0x01);
193+    *xmm_crc1 = _mm_clmulepi64_si128(*xmm_crc1, xmm_fold4, 0x10);
194+    ps_crc0 = _mm_castsi128_ps(*xmm_crc0);
195+    ps_crc1 = _mm_castsi128_ps(*xmm_crc1);
196+    ps_res10= _mm_xor_ps(ps_crc0, ps_crc1);
197+
198+    *xmm_crc0 = x_tmp3;
199+    *xmm_crc1 = _mm_castps_si128(ps_res10);
200+    *xmm_crc2 = _mm_castps_si128(ps_res21);
201+    *xmm_crc3 = _mm_castps_si128(ps_res32);
202+}
203+
204+local void fold_4(deflate_state *const s,
205+        __m128i *xmm_crc0, __m128i *xmm_crc1,
206+        __m128i *xmm_crc2, __m128i *xmm_crc3)
207+{
208+    const __m128i xmm_fold4 = _mm_set_epi32(
209+            0x00000001, 0x54442bd4,
210+            0x00000001, 0xc6e41596);
211+
212+    __m128i x_tmp0, x_tmp1, x_tmp2, x_tmp3;
213+    __m128 ps_crc0, ps_crc1, ps_crc2, ps_crc3;
214+    __m128 ps_t0, ps_t1, ps_t2, ps_t3;
215+    __m128 ps_res0, ps_res1, ps_res2, ps_res3;
216+
217+    x_tmp0 = *xmm_crc0;
218+    x_tmp1 = *xmm_crc1;
219+    x_tmp2 = *xmm_crc2;
220+    x_tmp3 = *xmm_crc3;
221+
222+    *xmm_crc0 = _mm_clmulepi64_si128(*xmm_crc0, xmm_fold4, 0x01);
223+    x_tmp0 = _mm_clmulepi64_si128(x_tmp0, xmm_fold4, 0x10);
224+    ps_crc0 = _mm_castsi128_ps(*xmm_crc0);
225+    ps_t0 = _mm_castsi128_ps(x_tmp0);
226+    ps_res0 = _mm_xor_ps(ps_crc0, ps_t0);
227+
228+    *xmm_crc1 = _mm_clmulepi64_si128(*xmm_crc1, xmm_fold4, 0x01);
229+    x_tmp1 = _mm_clmulepi64_si128(x_tmp1, xmm_fold4, 0x10);
230+    ps_crc1 = _mm_castsi128_ps(*xmm_crc1);
231+    ps_t1 = _mm_castsi128_ps(x_tmp1);
232+    ps_res1 = _mm_xor_ps(ps_crc1, ps_t1);
233+
234+    *xmm_crc2 = _mm_clmulepi64_si128(*xmm_crc2, xmm_fold4, 0x01);
235+    x_tmp2 = _mm_clmulepi64_si128(x_tmp2, xmm_fold4, 0x10);
236+    ps_crc2 = _mm_castsi128_ps(*xmm_crc2);
237+    ps_t2 = _mm_castsi128_ps(x_tmp2);
238+    ps_res2 = _mm_xor_ps(ps_crc2, ps_t2);
239+
240+    *xmm_crc3 = _mm_clmulepi64_si128(*xmm_crc3, xmm_fold4, 0x01);
241+    x_tmp3 = _mm_clmulepi64_si128(x_tmp3, xmm_fold4, 0x10);
242+    ps_crc3 = _mm_castsi128_ps(*xmm_crc3);
243+    ps_t3 = _mm_castsi128_ps(x_tmp3);
244+    ps_res3 = _mm_xor_ps(ps_crc3, ps_t3);
245+
246+    *xmm_crc0 = _mm_castps_si128(ps_res0);
247+    *xmm_crc1 = _mm_castps_si128(ps_res1);
248+    *xmm_crc2 = _mm_castps_si128(ps_res2);
249+    *xmm_crc3 = _mm_castps_si128(ps_res3);
250+}
251+
252+local const unsigned zalign(32) pshufb_shf_table[60] = {
253+	0x84838281,0x88878685,0x8c8b8a89,0x008f8e8d, /* shl 15 (16 - 1)/shr1 */
254+	0x85848382,0x89888786,0x8d8c8b8a,0x01008f8e, /* shl 14 (16 - 3)/shr2 */
255+	0x86858483,0x8a898887,0x8e8d8c8b,0x0201008f, /* shl 13 (16 - 4)/shr3 */
256+	0x87868584,0x8b8a8988,0x8f8e8d8c,0x03020100, /* shl 12 (16 - 4)/shr4 */
257+	0x88878685,0x8c8b8a89,0x008f8e8d,0x04030201, /* shl 11 (16 - 5)/shr5 */
258+	0x89888786,0x8d8c8b8a,0x01008f8e,0x05040302, /* shl 10 (16 - 6)/shr6 */
259+	0x8a898887,0x8e8d8c8b,0x0201008f,0x06050403, /* shl  9 (16 - 7)/shr7 */
260+	0x8b8a8988,0x8f8e8d8c,0x03020100,0x07060504, /* shl  8 (16 - 8)/shr8 */
261+	0x8c8b8a89,0x008f8e8d,0x04030201,0x08070605, /* shl  7 (16 - 9)/shr9 */
262+	0x8d8c8b8a,0x01008f8e,0x05040302,0x09080706, /* shl  6 (16 -10)/shr10*/
263+	0x8e8d8c8b,0x0201008f,0x06050403,0x0a090807, /* shl  5 (16 -11)/shr11*/
264+	0x8f8e8d8c,0x03020100,0x07060504,0x0b0a0908, /* shl  4 (16 -12)/shr12*/
265+	0x008f8e8d,0x04030201,0x08070605,0x0c0b0a09, /* shl  3 (16 -13)/shr13*/
266+	0x01008f8e,0x05040302,0x09080706,0x0d0c0b0a, /* shl  2 (16 -14)/shr14*/
267+	0x0201008f,0x06050403,0x0a090807,0x0e0d0c0b  /* shl  1 (16 -15)/shr15*/
268+};
269+
270+local void partial_fold(deflate_state *const s, const size_t len,
271+        __m128i *xmm_crc0, __m128i *xmm_crc1,
272+        __m128i *xmm_crc2, __m128i *xmm_crc3,
273+        __m128i *xmm_crc_part)
274+{
275+
276+    const __m128i xmm_fold4 = _mm_set_epi32(
277+            0x00000001, 0x54442bd4,
278+            0x00000001, 0xc6e41596);
279+    const __m128i xmm_mask3 = _mm_set1_epi32(0x80808080);
280+
281+    __m128i xmm_shl, xmm_shr, xmm_tmp1, xmm_tmp2, xmm_tmp3;
282+    __m128i xmm_a0_0, xmm_a0_1;
283+    __m128 ps_crc3, psa0_0, psa0_1, ps_res;
284+
285+    xmm_shl = _mm_load_si128((__m128i *)pshufb_shf_table + (len - 1));
286+    xmm_shr = xmm_shl;
287+    xmm_shr = _mm_xor_si128(xmm_shr, xmm_mask3);
288+
289+    xmm_a0_0 = _mm_shuffle_epi8(*xmm_crc0, xmm_shl);
290+
291+    *xmm_crc0 = _mm_shuffle_epi8(*xmm_crc0, xmm_shr);
292+    xmm_tmp1 = _mm_shuffle_epi8(*xmm_crc1, xmm_shl);
293+    *xmm_crc0 = _mm_or_si128(*xmm_crc0, xmm_tmp1);
294+
295+    *xmm_crc1 = _mm_shuffle_epi8(*xmm_crc1, xmm_shr);
296+    xmm_tmp2 = _mm_shuffle_epi8(*xmm_crc2, xmm_shl);
297+    *xmm_crc1 = _mm_or_si128(*xmm_crc1, xmm_tmp2);
298+
299+    *xmm_crc2 = _mm_shuffle_epi8(*xmm_crc2, xmm_shr);
300+    xmm_tmp3 = _mm_shuffle_epi8(*xmm_crc3, xmm_shl);
301+    *xmm_crc2 = _mm_or_si128(*xmm_crc2, xmm_tmp3);
302+
303+    *xmm_crc3 = _mm_shuffle_epi8(*xmm_crc3, xmm_shr);
304+    *xmm_crc_part = _mm_shuffle_epi8(*xmm_crc_part, xmm_shl);
305+    *xmm_crc3 = _mm_or_si128(*xmm_crc3, *xmm_crc_part);
306+
307+    xmm_a0_1 = _mm_clmulepi64_si128(xmm_a0_0, xmm_fold4, 0x10);
308+    xmm_a0_0 = _mm_clmulepi64_si128(xmm_a0_0, xmm_fold4, 0x01);
309+
310+    ps_crc3 = _mm_castsi128_ps(*xmm_crc3);
311+    psa0_0 = _mm_castsi128_ps(xmm_a0_0);
312+    psa0_1 = _mm_castsi128_ps(xmm_a0_1);
313+
314+    ps_res = _mm_xor_ps(ps_crc3, psa0_0);
315+    ps_res = _mm_xor_ps(ps_res, psa0_1);
316+
317+    *xmm_crc3 = _mm_castps_si128(ps_res);
318+}
319+
320+ZLIB_INTERNAL void crc_fold_copy(deflate_state *const s,
321+        unsigned char *dst, const unsigned char *src, long len)
322+{
323+    unsigned long algn_diff;
324+    __m128i xmm_t0, xmm_t1, xmm_t2, xmm_t3;
325+
326+    CRC_LOAD(s)
327+
328+    if (len < 16) {
329+        if (len == 0)
330+            return;
331+        goto partial;
332+    }
333+
334+    algn_diff = 0 - (uintptr_t)src & 0xF;
335+    if (algn_diff) {
336+        xmm_crc_part = _mm_loadu_si128((__m128i *)src);
337+        _mm_storeu_si128((__m128i *)dst, xmm_crc_part);
338+
339+        dst += algn_diff;
340+        src += algn_diff;
341+        len -= algn_diff;
342+
343+        partial_fold(s, algn_diff, &xmm_crc0, &xmm_crc1, &xmm_crc2, &xmm_crc3,
344+            &xmm_crc_part);
345+    }
346+
347+    while ((len -= 64) >= 0) {
348+        xmm_t0 = _mm_load_si128((__m128i *)src);
349+        xmm_t1 = _mm_load_si128((__m128i *)src + 1);
350+        xmm_t2 = _mm_load_si128((__m128i *)src + 2);
351+        xmm_t3 = _mm_load_si128((__m128i *)src + 3);
352+
353+        fold_4(s, &xmm_crc0, &xmm_crc1, &xmm_crc2, &xmm_crc3);
354+
355+        _mm_storeu_si128((__m128i *)dst, xmm_t0);
356+        _mm_storeu_si128((__m128i *)dst + 1, xmm_t1);
357+        _mm_storeu_si128((__m128i *)dst + 2, xmm_t2);
358+        _mm_storeu_si128((__m128i *)dst + 3, xmm_t3);
359+
360+        xmm_crc0 = _mm_xor_si128(xmm_crc0, xmm_t0);
361+        xmm_crc1 = _mm_xor_si128(xmm_crc1, xmm_t1);
362+        xmm_crc2 = _mm_xor_si128(xmm_crc2, xmm_t2);
363+        xmm_crc3 = _mm_xor_si128(xmm_crc3, xmm_t3);
364+
365+        src += 64;
366+        dst += 64;
367+    }
368+
369+    /*
370+     * len = num bytes left - 64
371+     */
372+    if (len + 16 >= 0) {
373+        len += 16;
374+
375+        xmm_t0 = _mm_load_si128((__m128i *)src);
376+        xmm_t1 = _mm_load_si128((__m128i *)src + 1);
377+        xmm_t2 = _mm_load_si128((__m128i *)src + 2);
378+
379+        fold_3(s, &xmm_crc0, &xmm_crc1, &xmm_crc2, &xmm_crc3);
380+
381+        _mm_storeu_si128((__m128i *)dst, xmm_t0);
382+        _mm_storeu_si128((__m128i *)dst + 1, xmm_t1);
383+        _mm_storeu_si128((__m128i *)dst + 2, xmm_t2);
384+
385+        xmm_crc1 = _mm_xor_si128(xmm_crc1, xmm_t0);
386+        xmm_crc2 = _mm_xor_si128(xmm_crc2, xmm_t1);
387+        xmm_crc3 = _mm_xor_si128(xmm_crc3, xmm_t2);
388+
389+        if (len == 0)
390+            goto done;
391+
392+        dst += 48;
393+        src += 48;
394+    } else if (len + 32 >= 0) {
395+        len += 32;
396+
397+        xmm_t0 = _mm_load_si128((__m128i *)src);
398+        xmm_t1 = _mm_load_si128((__m128i *)src + 1);
399+
400+        fold_2(s, &xmm_crc0, &xmm_crc1, &xmm_crc2, &xmm_crc3);
401+
402+        _mm_storeu_si128((__m128i *)dst, xmm_t0);
403+        _mm_storeu_si128((__m128i *)dst + 1, xmm_t1);
404+
405+        xmm_crc2 = _mm_xor_si128(xmm_crc2, xmm_t0);
406+        xmm_crc3 = _mm_xor_si128(xmm_crc3, xmm_t1);
407+
408+        if (len == 0)
409+            goto done;
410+
411+        dst += 32;
412+        src += 32;
413+    } else if (len + 48 >= 0) {
414+        len += 48;
415+
416+        xmm_t0 = _mm_load_si128((__m128i *)src);
417+
418+        fold_1(s, &xmm_crc0, &xmm_crc1, &xmm_crc2, &xmm_crc3);
419+
420+        _mm_storeu_si128((__m128i *)dst, xmm_t0);
421+
422+        xmm_crc3 = _mm_xor_si128(xmm_crc3, xmm_t0);
423+
424+        if (len == 0)
425+            goto done;
426+
427+        dst += 16;
428+        src += 16;
429+    } else {
430+        len += 64;
431+        if (len == 0)
432+            goto done;
433+    }
434+
435+partial:
436+
437+#if defined(_MSC_VER)
438+    /* VS does not permit the use of _mm_set_epi64x in 32-bit builds */
439+    {
440+        int32_t parts[4] = {0, 0, 0, 0};
441+        memcpy(&parts, src, len);
442+        xmm_crc_part = _mm_set_epi32(parts[3], parts[2], parts[1], parts[0]);
443+    }
444+#else
445+    {
446+        int64_t parts[2] = {0, 0};
447+        memcpy(&parts, src, len);
448+        xmm_crc_part = _mm_set_epi64x(parts[1], parts[0]);
449+    }
450+#endif
451+
452+    _mm_storeu_si128((__m128i *)dst, xmm_crc_part);
453+    partial_fold(s, len, &xmm_crc0, &xmm_crc1, &xmm_crc2, &xmm_crc3,
454+        &xmm_crc_part);
455+done:
456+    CRC_SAVE(s)
457+}
458+
459+local const unsigned zalign(16) crc_k[] = {
460+    0xccaa009e, 0x00000000, /* rk1 */
461+    0x751997d0, 0x00000001, /* rk2 */
462+    0xccaa009e, 0x00000000, /* rk5 */
463+    0x63cd6124, 0x00000001, /* rk6 */
464+    0xf7011640, 0x00000001, /* rk7 */
465+    0xdb710640, 0x00000001  /* rk8 */
466+};
467+
468+local const unsigned zalign(16) crc_mask[4] = {
469+    0xFFFFFFFF, 0xFFFFFFFF, 0x00000000, 0x00000000
470+};
471+
472+local const unsigned zalign(16) crc_mask2[4] = {
473+    0x00000000, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF
474+};
475+
476+unsigned ZLIB_INTERNAL crc_fold_512to32(deflate_state *const s)
477+{
478+    const __m128i xmm_mask  = _mm_load_si128((__m128i *)crc_mask);
479+    const __m128i xmm_mask2 = _mm_load_si128((__m128i *)crc_mask2);
480+
481+    unsigned crc;
482+    __m128i x_tmp0, x_tmp1, x_tmp2, crc_fold;
483+
484+    CRC_LOAD(s)
485+
486+    /*
487+     * k1
488+     */
489+    crc_fold = _mm_load_si128((__m128i *)crc_k);
490+
491+    x_tmp0 = _mm_clmulepi64_si128(xmm_crc0, crc_fold, 0x10);
492+    xmm_crc0 = _mm_clmulepi64_si128(xmm_crc0, crc_fold, 0x01);
493+    xmm_crc1 = _mm_xor_si128(xmm_crc1, x_tmp0);
494+    xmm_crc1 = _mm_xor_si128(xmm_crc1, xmm_crc0);
495+
496+    x_tmp1 = _mm_clmulepi64_si128(xmm_crc1, crc_fold, 0x10);
497+    xmm_crc1 = _mm_clmulepi64_si128(xmm_crc1, crc_fold, 0x01);
498+    xmm_crc2 = _mm_xor_si128(xmm_crc2, x_tmp1);
499+    xmm_crc2 = _mm_xor_si128(xmm_crc2, xmm_crc1);
500+
501+    x_tmp2 = _mm_clmulepi64_si128(xmm_crc2, crc_fold, 0x10);
502+    xmm_crc2 = _mm_clmulepi64_si128(xmm_crc2, crc_fold, 0x01);
503+    xmm_crc3 = _mm_xor_si128(xmm_crc3, x_tmp2);
504+    xmm_crc3 = _mm_xor_si128(xmm_crc3, xmm_crc2);
505+
506+    /*
507+     * k5
508+     */
509+    crc_fold = _mm_load_si128((__m128i *)crc_k + 1);
510+
511+    xmm_crc0 = xmm_crc3;
512+    xmm_crc3 = _mm_clmulepi64_si128(xmm_crc3, crc_fold, 0);
513+    xmm_crc0 = _mm_srli_si128(xmm_crc0, 8);
514+    xmm_crc3 = _mm_xor_si128(xmm_crc3, xmm_crc0);
515+
516+    xmm_crc0 = xmm_crc3;
517+    xmm_crc3 = _mm_slli_si128(xmm_crc3, 4);
518+    xmm_crc3 = _mm_clmulepi64_si128(xmm_crc3, crc_fold, 0x10);
519+    xmm_crc3 = _mm_xor_si128(xmm_crc3, xmm_crc0);
520+    xmm_crc3 = _mm_and_si128(xmm_crc3, xmm_mask2);
521+
522+    /*
523+     * k7
524+     */
525+    xmm_crc1 = xmm_crc3;
526+    xmm_crc2 = xmm_crc3;
527+    crc_fold = _mm_load_si128((__m128i *)crc_k + 2);
528+
529+    xmm_crc3 = _mm_clmulepi64_si128(xmm_crc3, crc_fold, 0);
530+    xmm_crc3 = _mm_xor_si128(xmm_crc3, xmm_crc2);
531+    xmm_crc3 = _mm_and_si128(xmm_crc3, xmm_mask);
532+
533+    xmm_crc2 = xmm_crc3;
534+    xmm_crc3 = _mm_clmulepi64_si128(xmm_crc3, crc_fold, 0x10);
535+    xmm_crc3 = _mm_xor_si128(xmm_crc3, xmm_crc2);
536+    xmm_crc3 = _mm_xor_si128(xmm_crc3, xmm_crc1);
537+
538+    crc = _mm_extract_epi32(xmm_crc3, 2);
539+    return ~crc;
540+    CRC_SAVE(s)
541+}
542diff --git a/deflate.c b/deflate.c
543index 1ec761448de9..aa0c9c67a6dc 100644
544--- a/deflate.c
545+++ b/deflate.c
546@@ -48,8 +48,9 @@
547  */
548
549 /* @(#) $Id$ */
550-
551+#include <assert.h>
552 #include "deflate.h"
553+#include "x86.h"
554
555 const char deflate_copyright[] =
556    " deflate 1.2.11 Copyright 1995-2017 Jean-loup Gailly and Mark Adler ";
557@@ -86,7 +87,7 @@ local block_state deflate_huff   OF((deflate_state *s, int flush));
558 local void lm_init        OF((deflate_state *s));
559 local void putShortMSB    OF((deflate_state *s, uInt b));
560 local void flush_pending  OF((z_streamp strm));
561-local unsigned read_buf   OF((z_streamp strm, Bytef *buf, unsigned size));
562+unsigned ZLIB_INTERNAL deflate_read_buf OF((z_streamp strm, Bytef *buf, unsigned size));
563 #ifdef ASMV
564 #  pragma message("Assembler code may have bugs -- use at your own risk")
565       void match_init OF((void)); /* asm code initialization */
566@@ -100,6 +101,20 @@ local  void check_match OF((deflate_state *s, IPos start, IPos match,
567                             int length));
568 #endif
569
570+/* From crc32.c */
571+extern void ZLIB_INTERNAL crc_reset(deflate_state *const s);
572+extern void ZLIB_INTERNAL crc_finalize(deflate_state *const s);
573+extern void ZLIB_INTERNAL copy_with_crc(z_streamp strm, Bytef *dst, long size);
574+
575+#ifdef _MSC_VER
576+#define INLINE __inline
577+#else
578+#define INLINE inline
579+#endif
580+
581+/* Inline optimisation */
582+local INLINE Pos insert_string_sse(deflate_state *const s, const Pos str);
583+
584 /* ===========================================================================
585  * Local data
586  */
587@@ -162,7 +177,6 @@ local const config configuration_table[10] = {
588  */
589 #define UPDATE_HASH(s,h,c) (h = (((h)<<s->hash_shift) ^ (c)) & s->hash_mask)
590
591-
592 /* ===========================================================================
593  * Insert string str in the dictionary and set match_head to the previous head
594  * of the hash chain (the most recent string with same hash key). Return
595@@ -173,17 +187,28 @@ local const config configuration_table[10] = {
596  *    characters and the first MIN_MATCH bytes of str are valid (except for
597  *    the last MIN_MATCH-1 bytes of the input file).
598  */
599+local INLINE Pos insert_string_c(deflate_state *const s, const Pos str)
600+{
601+    Pos ret;
602+
603+    UPDATE_HASH(s, s->ins_h, s->window[(str) + (MIN_MATCH-1)]);
604 #ifdef FASTEST
605-#define INSERT_STRING(s, str, match_head) \
606-   (UPDATE_HASH(s, s->ins_h, s->window[(str) + (MIN_MATCH-1)]), \
607-    match_head = s->head[s->ins_h], \
608-    s->head[s->ins_h] = (Pos)(str))
609+    ret = s->head[s->ins_h];
610 #else
611-#define INSERT_STRING(s, str, match_head) \
612-   (UPDATE_HASH(s, s->ins_h, s->window[(str) + (MIN_MATCH-1)]), \
613-    match_head = s->prev[(str) & s->w_mask] = s->head[s->ins_h], \
614-    s->head[s->ins_h] = (Pos)(str))
615+    ret = s->prev[str & s->w_mask] = s->head[s->ins_h];
616 #endif
617+    s->head[s->ins_h] = str;
618+
619+    return ret;
620+}
621+
622+local INLINE Pos insert_string(deflate_state *const s, const Pos str)
623+{
624+    if (x86_cpu_enable_simd)
625+        return insert_string_sse(s, str);
626+    return insert_string_c(s, str);
627+}
628+
629
630 /* ===========================================================================
631  * Initialize the hash table (avoiding 64K overflow for 16 bit systems).
632@@ -248,6 +273,7 @@ int ZEXPORT deflateInit2_(strm, level, method, windowBits, memLevel, strategy,
633     const char *version;
634     int stream_size;
635 {
636+    unsigned window_padding = 8;
637     deflate_state *s;
638     int wrap = 1;
639     static const char my_version[] = ZLIB_VERSION;
640@@ -257,6 +283,8 @@ int ZEXPORT deflateInit2_(strm, level, method, windowBits, memLevel, strategy,
641      * output size for (length,distance) codes is <= 24 bits.
642      */
643
644+    x86_check_features();
645+
646     if (version == Z_NULL || version[0] != my_version[0] ||
647         stream_size != sizeof(z_stream)) {
648         return Z_VERSION_ERROR;
649@@ -313,12 +341,19 @@ int ZEXPORT deflateInit2_(strm, level, method, windowBits, memLevel, strategy,
650     s->w_size = 1 << s->w_bits;
651     s->w_mask = s->w_size - 1;
652
653-    s->hash_bits = (uInt)memLevel + 7;
654+    if (x86_cpu_enable_simd) {
655+        s->hash_bits = 15;
656+    } else {
657+        s->hash_bits = memLevel + 7;
658+    }
659+
660     s->hash_size = 1 << s->hash_bits;
661     s->hash_mask = s->hash_size - 1;
662     s->hash_shift =  ((s->hash_bits+MIN_MATCH-1)/MIN_MATCH);
663
664-    s->window = (Bytef *) ZALLOC(strm, s->w_size, 2*sizeof(Byte));
665+    s->window = (Bytef *) ZALLOC(strm,
666+                                 s->w_size + window_padding,
667+                                 2*sizeof(Byte));
668     s->prev   = (Posf *)  ZALLOC(strm, s->w_size, sizeof(Pos));
669     s->head   = (Posf *)  ZALLOC(strm, s->hash_size, sizeof(Pos));
670
671@@ -418,11 +453,7 @@ int ZEXPORT deflateSetDictionary (strm, dictionary, dictLength)
672         str = s->strstart;
673         n = s->lookahead - (MIN_MATCH-1);
674         do {
675-            UPDATE_HASH(s, s->ins_h, s->window[str + MIN_MATCH-1]);
676-#ifndef FASTEST
677-            s->prev[str & s->w_mask] = s->head[s->ins_h];
678-#endif
679-            s->head[s->ins_h] = (Pos)str;
680+            insert_string(s, str);
681             str++;
682         } while (--n);
683         s->strstart = str;
684@@ -848,7 +879,7 @@ int ZEXPORT deflate (strm, flush)
685 #ifdef GZIP
686     if (s->status == GZIP_STATE) {
687         /* gzip header */
688-        strm->adler = crc32(0L, Z_NULL, 0);
689+        crc_reset(s);
690         put_byte(s, 31);
691         put_byte(s, 139);
692         put_byte(s, 8);
693@@ -1049,6 +1080,7 @@ int ZEXPORT deflate (strm, flush)
694     /* Write the trailer */
695 #ifdef GZIP
696     if (s->wrap == 2) {
697+        crc_finalize(s);
698         put_byte(s, (Byte)(strm->adler & 0xff));
699         put_byte(s, (Byte)((strm->adler >> 8) & 0xff));
700         put_byte(s, (Byte)((strm->adler >> 16) & 0xff));
701@@ -1161,7 +1193,7 @@ int ZEXPORT deflateCopy (dest, source)
702  * allocating a large strm->next_in buffer and copying from it.
703  * (See also flush_pending()).
704  */
705-local unsigned read_buf(strm, buf, size)
706+ZLIB_INTERNAL unsigned deflate_read_buf(strm, buf, size)
707     z_streamp strm;
708     Bytef *buf;
709     unsigned size;
710@@ -1173,15 +1205,16 @@ local unsigned read_buf(strm, buf, size)
711
712     strm->avail_in  -= len;
713
714-    zmemcpy(buf, strm->next_in, len);
715-    if (strm->state->wrap == 1) {
716-        strm->adler = adler32(strm->adler, buf, len);
717-    }
718 #ifdef GZIP
719-    else if (strm->state->wrap == 2) {
720-        strm->adler = crc32(strm->adler, buf, len);
721-    }
722+    if (strm->state->wrap == 2)
723+        copy_with_crc(strm, buf, len);
724+    else
725 #endif
726+    {
727+        zmemcpy(buf, strm->next_in, len);
728+        if (strm->state->wrap == 1)
729+            strm->adler = adler32(strm->adler, buf, len);
730+    }
731     strm->next_in  += len;
732     strm->total_in += len;
733
734@@ -1479,7 +1512,19 @@ local void check_match(s, start, match, length)
735  *    performed for at least two bytes (required for the zip translate_eol
736  *    option -- not supported here).
737  */
738-local void fill_window(s)
739+local void fill_window_c(deflate_state *s);
740+
741+local void fill_window(deflate_state *s)
742+{
743+    if (x86_cpu_enable_simd) {
744+        fill_window_sse(s);
745+        return;
746+    }
747+
748+    fill_window_c(s);
749+}
750+
751+local void fill_window_c(s)
752     deflate_state *s;
753 {
754     unsigned n;
755@@ -1847,7 +1892,7 @@ local block_state deflate_fast(s, flush)
756          */
757         hash_head = NIL;
758         if (s->lookahead >= MIN_MATCH) {
759-            INSERT_STRING(s, s->strstart, hash_head);
760+            hash_head = insert_string(s, s->strstart);
761         }
762
763         /* Find the longest match, discarding those <= prev_length.
764@@ -1878,7 +1923,7 @@ local block_state deflate_fast(s, flush)
765                 s->match_length--; /* string at strstart already in table */
766                 do {
767                     s->strstart++;
768-                    INSERT_STRING(s, s->strstart, hash_head);
769+                    hash_head = insert_string(s, s->strstart);
770                     /* strstart never exceeds WSIZE-MAX_MATCH, so there are
771                      * always MIN_MATCH bytes ahead.
772                      */
773@@ -1950,7 +1995,7 @@ local block_state deflate_slow(s, flush)
774          */
775         hash_head = NIL;
776         if (s->lookahead >= MIN_MATCH) {
777-            INSERT_STRING(s, s->strstart, hash_head);
778+            hash_head = insert_string(s, s->strstart);
779         }
780
781         /* Find the longest match, discarding those <= prev_length.
782@@ -2001,7 +2046,7 @@ local block_state deflate_slow(s, flush)
783             s->prev_length -= 2;
784             do {
785                 if (++s->strstart <= max_insert) {
786-                    INSERT_STRING(s, s->strstart, hash_head);
787+                    hash_head = insert_string(s, s->strstart);
788                 }
789             } while (--s->prev_length != 0);
790             s->match_available = 0;
791@@ -2161,3 +2206,37 @@ local block_state deflate_huff(s, flush)
792         FLUSH_BLOCK(s, 0);
793     return block_done;
794 }
795+
796+/* Safe to inline this as GCC/clang will use inline asm and Visual Studio will
797+ * use intrinsic without extra params
798+ */
799+local INLINE Pos insert_string_sse(deflate_state *const s, const Pos str)
800+{
801+    Pos ret;
802+    unsigned *ip, val, h = 0;
803+
804+    ip = (unsigned *)&s->window[str];
805+    val = *ip;
806+
807+    if (s->level >= 6)
808+        val &= 0xFFFFFF;
809+
810+/* Windows clang should use inline asm */
811+#if defined(_MSC_VER) && !defined(__clang__)
812+    h = _mm_crc32_u32(h, val);
813+#elif defined(__i386__) || defined(__amd64__)
814+    __asm__ __volatile__ (
815+        "crc32 %1,%0\n\t"
816+    : "+r" (h)
817+    : "r" (val)
818+    );
819+#else
820+    /* This should never happen */
821+    assert(0);
822+#endif
823+
824+    ret = s->head[h & s->hash_mask];
825+    s->head[h & s->hash_mask] = str;
826+    s->prev[str & s->w_mask] = ret;
827+    return ret;
828+}
829diff --git a/deflate.h b/deflate.h
830index 23ecdd312bc0..ab56df7663b6 100644
831--- a/deflate.h
832+++ b/deflate.h
833@@ -109,7 +109,7 @@ typedef struct internal_state {
834     ulg   gzindex;       /* where in extra, name, or comment */
835     Byte  method;        /* can only be DEFLATED */
836     int   last_flush;    /* value of flush param for previous deflate call */
837-
838+    unsigned zalign(16) crc0[4 * 5];
839                 /* used by deflate.c: */
840
841     uInt  w_size;        /* LZ77 window size (32K by default) */
842@@ -346,4 +346,14 @@ void ZLIB_INTERNAL _tr_stored_block OF((deflate_state *s, charf *buf,
843               flush = _tr_tally(s, distance, length)
844 #endif
845
846+/* Functions that are SIMD optimised on x86 */
847+void ZLIB_INTERNAL crc_fold_init(deflate_state* const s);
848+void ZLIB_INTERNAL crc_fold_copy(deflate_state* const s,
849+                                 unsigned char* dst,
850+                                 const unsigned char* src,
851+                                 long len);
852+unsigned ZLIB_INTERNAL crc_fold_512to32(deflate_state* const s);
853+
854+void ZLIB_INTERNAL fill_window_sse(deflate_state* s);
855+
856 #endif /* DEFLATE_H */
857diff --git a/fill_window_sse.c b/fill_window_sse.c
858new file mode 100644
859index 000000000000..949ccce1ba9c
860--- /dev/null
861+++ b/fill_window_sse.c
862@@ -0,0 +1,177 @@
863+/*
864+ * Fill Window with SSE2-optimized hash shifting
865+ *
866+ * Copyright (C) 2013 Intel Corporation
867+ * Authors:
868+ *  Arjan van de Ven    <arjan@linux.intel.com>
869+ *  Jim Kukunas         <james.t.kukunas@linux.intel.com>
870+ *
871+ * For conditions of distribution and use, see copyright notice in zlib.h
872+ */
873+
874+#include <immintrin.h>
875+#include "deflate.h"
876+
877+#define UPDATE_HASH(s,h,i) \
878+    {\
879+        if (s->level < 6) { \
880+            h = (3483 * (s->window[i]) +\
881+                 23081* (s->window[i+1]) +\
882+                 6954 * (s->window[i+2]) +\
883+                 20947* (s->window[i+3])) & s->hash_mask;\
884+        } else {\
885+            h = (25881* (s->window[i]) +\
886+                 24674* (s->window[i+1]) +\
887+                 25811* (s->window[i+2])) & s->hash_mask;\
888+        }\
889+    }\
890+
891+extern int deflate_read_buf OF((z_streamp strm, Bytef *buf, unsigned size));
892+
893+void fill_window_sse(deflate_state *s)
894+{
895+    const __m128i xmm_wsize = _mm_set1_epi16(s->w_size);
896+
897+    register unsigned n;
898+    register Posf *p;
899+    unsigned more;    /* Amount of free space at the end of the window. */
900+    uInt wsize = s->w_size;
901+
902+    Assert(s->lookahead < MIN_LOOKAHEAD, "already enough lookahead");
903+
904+    do {
905+        more = (unsigned)(s->window_size -(ulg)s->lookahead -(ulg)s->strstart);
906+
907+        /* Deal with !@#$% 64K limit: */
908+        if (sizeof(int) <= 2) {
909+            if (more == 0 && s->strstart == 0 && s->lookahead == 0) {
910+                more = wsize;
911+
912+            } else if (more == (unsigned)(-1)) {
913+                /* Very unlikely, but possible on 16 bit machine if
914+                 * strstart == 0 && lookahead == 1 (input done a byte at time)
915+                 */
916+                more--;
917+            }
918+        }
919+
920+        /* If the window is almost full and there is insufficient lookahead,
921+         * move the upper half to the lower one to make room in the upper half.
922+         */
923+        if (s->strstart >= wsize+MAX_DIST(s)) {
924+
925+            zmemcpy(s->window, s->window+wsize, (unsigned)wsize);
926+            s->match_start -= wsize;
927+            s->strstart    -= wsize; /* we now have strstart >= MAX_DIST */
928+            s->block_start -= (long) wsize;
929+
930+            /* Slide the hash table (could be avoided with 32 bit values
931+               at the expense of memory usage). We slide even when level == 0
932+               to keep the hash table consistent if we switch back to level > 0
933+               later. (Using level 0 permanently is not an optimal usage of
934+               zlib, so we don't care about this pathological case.)
935+             */
936+            n = s->hash_size;
937+            p = &s->head[n];
938+            p -= 8;
939+            do {
940+                __m128i value, result;
941+
942+                value = _mm_loadu_si128((__m128i *)p);
943+                result = _mm_subs_epu16(value, xmm_wsize);
944+                _mm_storeu_si128((__m128i *)p, result);
945+
946+                p -= 8;
947+                n -= 8;
948+            } while (n > 0);
949+
950+            n = wsize;
951+#ifndef FASTEST
952+            p = &s->prev[n];
953+            p -= 8;
954+            do {
955+                __m128i value, result;
956+
957+                value = _mm_loadu_si128((__m128i *)p);
958+                result = _mm_subs_epu16(value, xmm_wsize);
959+                _mm_storeu_si128((__m128i *)p, result);
960+
961+                p -= 8;
962+                n -= 8;
963+            } while (n > 0);
964+#endif
965+            more += wsize;
966+        }
967+        if (s->strm->avail_in == 0) break;
968+
969+        /* If there was no sliding:
970+         *    strstart <= WSIZE+MAX_DIST-1 && lookahead <= MIN_LOOKAHEAD - 1 &&
971+         *    more == window_size - lookahead - strstart
972+         * => more >= window_size - (MIN_LOOKAHEAD-1 + WSIZE + MAX_DIST-1)
973+         * => more >= window_size - 2*WSIZE + 2
974+         * In the BIG_MEM or MMAP case (not yet supported),
975+         *   window_size == input_size + MIN_LOOKAHEAD  &&
976+         *   strstart + s->lookahead <= input_size => more >= MIN_LOOKAHEAD.
977+         * Otherwise, window_size == 2*WSIZE so more >= 2.
978+         * If there was sliding, more >= WSIZE. So in all cases, more >= 2.
979+         */
980+        Assert(more >= 2, "more < 2");
981+
982+        n = deflate_read_buf(s->strm,
983+                             s->window + s->strstart + s->lookahead,
984+                             more);
985+        s->lookahead += n;
986+
987+        /* Initialize the hash value now that we have some input: */
988+        if (s->lookahead >= MIN_MATCH) {
989+            uInt str = s->strstart;
990+            s->ins_h = s->window[str];
991+            if (str >= 1)
992+                UPDATE_HASH(s, s->ins_h, str + 1 - (MIN_MATCH-1));
993+#if MIN_MATCH != 3
994+            Call UPDATE_HASH() MIN_MATCH-3 more times
995+#endif
996+        }
997+        /* If the whole input has less than MIN_MATCH bytes, ins_h is garbage,
998+         * but this is not important since only literal bytes will be emitted.
999+         */
1000+
1001+    } while (s->lookahead < MIN_LOOKAHEAD && s->strm->avail_in != 0);
1002+
1003+    /* If the WIN_INIT bytes after the end of the current data have never been
1004+     * written, then zero those bytes in order to avoid memory check reports of
1005+     * the use of uninitialized (or uninitialised as Julian writes) bytes by
1006+     * the longest match routines.  Update the high water mark for the next
1007+     * time through here.  WIN_INIT is set to MAX_MATCH since the longest match
1008+     * routines allow scanning to strstart + MAX_MATCH, ignoring lookahead.
1009+     */
1010+    if (s->high_water < s->window_size) {
1011+        ulg curr = s->strstart + (ulg)(s->lookahead);
1012+        ulg init;
1013+
1014+        if (s->high_water < curr) {
1015+            /* Previous high water mark below current data -- zero WIN_INIT
1016+             * bytes or up to end of window, whichever is less.
1017+             */
1018+            init = s->window_size - curr;
1019+            if (init > WIN_INIT)
1020+                init = WIN_INIT;
1021+            zmemzero(s->window + curr, (unsigned)init);
1022+            s->high_water = curr + init;
1023+        }
1024+        else if (s->high_water < (ulg)curr + WIN_INIT) {
1025+            /* High water mark at or above current data, but below current data
1026+             * plus WIN_INIT -- zero out to current data plus WIN_INIT, or up
1027+             * to end of window, whichever is less.
1028+             */
1029+            init = (ulg)curr + WIN_INIT - s->high_water;
1030+            if (init > s->window_size - s->high_water)
1031+                init = s->window_size - s->high_water;
1032+            zmemzero(s->window + s->high_water, (unsigned)init);
1033+            s->high_water += init;
1034+        }
1035+    }
1036+
1037+    Assert((ulg)s->strstart <= s->window_size - MIN_LOOKAHEAD,
1038+           "not enough room for search");
1039+}
1040diff --git a/simd_stub.c b/simd_stub.c
1041new file mode 100644
1042index 000000000000..c6d46051498f
1043--- /dev/null
1044+++ b/simd_stub.c
1045@@ -0,0 +1,35 @@
1046+/* simd_stub.c -- stub implementations
1047+* Copyright (C) 2014 Intel Corporation
1048+* For conditions of distribution and use, see copyright notice in zlib.h
1049+*/
1050+#include <assert.h>
1051+
1052+#include "deflate.h"
1053+#include "x86.h"
1054+
1055+int ZLIB_INTERNAL x86_cpu_enable_simd = 0;
1056+
1057+void ZLIB_INTERNAL crc_fold_init(deflate_state *const s) {
1058+    assert(0);
1059+}
1060+
1061+void ZLIB_INTERNAL crc_fold_copy(deflate_state *const s,
1062+                                 unsigned char *dst,
1063+                                 const unsigned char *src,
1064+                                 long len) {
1065+    assert(0);
1066+}
1067+
1068+unsigned ZLIB_INTERNAL crc_fold_512to32(deflate_state *const s) {
1069+    assert(0);
1070+    return 0;
1071+}
1072+
1073+void ZLIB_INTERNAL fill_window_sse(deflate_state *s)
1074+{
1075+    assert(0);
1076+}
1077+
1078+void x86_check_features(void)
1079+{
1080+}
1081diff --git a/x86.c b/x86.c
1082new file mode 100644
1083index 000000000000..e56fe8b85a39
1084--- /dev/null
1085+++ b/x86.c
1086@@ -0,0 +1,92 @@
1087+/*
1088+ * x86 feature check
1089+ *
1090+ * Copyright (C) 2013 Intel Corporation. All rights reserved.
1091+ * Author:
1092+ *  Jim Kukunas
1093+ *
1094+ * For conditions of distribution and use, see copyright notice in zlib.h
1095+ */
1096+
1097+#include "x86.h"
1098+#include "zutil.h"
1099+
1100+int ZLIB_INTERNAL x86_cpu_enable_simd = 0;
1101+
1102+#ifndef _MSC_VER
1103+#include <pthread.h>
1104+
1105+pthread_once_t cpu_check_inited_once = PTHREAD_ONCE_INIT;
1106+static void _x86_check_features(void);
1107+
1108+void x86_check_features(void)
1109+{
1110+  pthread_once(&cpu_check_inited_once, _x86_check_features);
1111+}
1112+
1113+static void _x86_check_features(void)
1114+{
1115+    int x86_cpu_has_sse2;
1116+    int x86_cpu_has_sse42;
1117+    int x86_cpu_has_pclmulqdq;
1118+    unsigned eax, ebx, ecx, edx;
1119+
1120+    eax = 1;
1121+#ifdef __i386__
1122+    __asm__ __volatile__ (
1123+        "xchg %%ebx, %1\n\t"
1124+        "cpuid\n\t"
1125+        "xchg %1, %%ebx\n\t"
1126+    : "+a" (eax), "=S" (ebx), "=c" (ecx), "=d" (edx)
1127+    );
1128+#else
1129+    __asm__ __volatile__ (
1130+        "cpuid\n\t"
1131+    : "+a" (eax), "=b" (ebx), "=c" (ecx), "=d" (edx)
1132+    );
1133+#endif  /* (__i386__) */
1134+
1135+    x86_cpu_has_sse2 = edx & 0x4000000;
1136+    x86_cpu_has_sse42 = ecx & 0x100000;
1137+    x86_cpu_has_pclmulqdq = ecx & 0x2;
1138+
1139+    x86_cpu_enable_simd = x86_cpu_has_sse2 &&
1140+                          x86_cpu_has_sse42 &&
1141+                          x86_cpu_has_pclmulqdq;
1142+}
1143+#else
1144+#include <intrin.h>
1145+#include <windows.h>
1146+
1147+static BOOL CALLBACK _x86_check_features(PINIT_ONCE once,
1148+                                         PVOID param,
1149+                                         PVOID *context);
1150+static INIT_ONCE cpu_check_inited_once = INIT_ONCE_STATIC_INIT;
1151+
1152+void x86_check_features(void)
1153+{
1154+    InitOnceExecuteOnce(&cpu_check_inited_once, _x86_check_features,
1155+                        NULL, NULL);
1156+}
1157+
1158+static BOOL CALLBACK _x86_check_features(PINIT_ONCE once,
1159+                                         PVOID param,
1160+                                         PVOID *context)
1161+{
1162+    int x86_cpu_has_sse2;
1163+    int x86_cpu_has_sse42;
1164+    int x86_cpu_has_pclmulqdq;
1165+    int regs[4];
1166+
1167+    __cpuid(regs, 1);
1168+
1169+    x86_cpu_has_sse2 = regs[3] & 0x4000000;
1170+    x86_cpu_has_sse42= regs[2] & 0x100000;
1171+    x86_cpu_has_pclmulqdq = regs[2] & 0x2;
1172+
1173+    x86_cpu_enable_simd = x86_cpu_has_sse2 &&
1174+                          x86_cpu_has_sse42 &&
1175+                          x86_cpu_has_pclmulqdq;
1176+    return TRUE;
1177+}
1178+#endif  /* _MSC_VER */
1179diff --git a/x86.h b/x86.h
1180new file mode 100644
1181index 000000000000..ebcf10ab09d2
1182--- /dev/null
1183+++ b/x86.h
1184@@ -0,0 +1,15 @@
1185+/* x86.h -- check for x86 CPU features
1186+* Copyright (C) 2013 Intel Corporation Jim Kukunas
1187+* For conditions of distribution and use, see copyright notice in zlib.h
1188+*/
1189+
1190+#ifndef X86_H
1191+#define X86_H
1192+
1193+#include "zlib.h"
1194+
1195+extern int x86_cpu_enable_simd;
1196+
1197+void x86_check_features(void);
1198+
1199+#endif  /* X86_H */
1200diff --git a/zutil.h b/zutil.h
1201index 80375b8b6109..4425bcf75eb3 100644
1202--- a/zutil.h
1203+++ b/zutil.h
1204@@ -283,4 +283,10 @@ extern z_const char * const z_errmsg[10]; /* indexed by 2-zlib_error */
1205 #define ZSWAP32(q) ((((q) >> 24) & 0xff) + (((q) >> 8) & 0xff00) + \
1206                     (((q) & 0xff00) << 8) + (((q) & 0xff) << 24))
1207
1208+#ifdef _MSC_VER
1209+#define zalign(x) __declspec(align(x))
1210+#else
1211+#define zalign(x) __attribute__((aligned((x))))
1212+#endif
1213+
1214 #endif /* ZUTIL_H */
1215