1 /*
2 * Fill Window with SSE2-optimized hash shifting
3 *
4 * Copyright (C) 2013 Intel Corporation
5 * Authors:
6 * Arjan van de Ven <arjan@linux.intel.com>
7 * Jim Kukunas <james.t.kukunas@linux.intel.com>
8 *
9 * For conditions of distribution and use, see copyright notice in zlib.h
10 */
11
12 #pragma GCC target ("sse2")
13 #include <immintrin.h>
14 #include "deflate.h"
15
/*
 * UPDATE_HASH(s, h, i): recompute the insertion hash h from the bytes at
 * s->window[i..].  Two multiplier sets are used: a 4-byte mix for fast
 * levels (< 6) and a 3-byte mix otherwise; both are masked into the hash
 * table range with s->hash_mask.
 *
 * Wrapped in do { ... } while (0) so the macro expands to exactly one
 * statement and remains safe in unbraced if/else bodies (the old bare
 * { ... } form left a dangling `;` after expansion).
 */
#define UPDATE_HASH(s,h,i) \
    do { \
        if (s->level < 6) { \
            h = (3483 * (s->window[i]) +\
                 23081* (s->window[i+1]) +\
                 6954 * (s->window[i+2]) +\
                 20947* (s->window[i+3])) & s->hash_mask;\
        } else { \
            h = (25881* (s->window[i]) +\
                 24674* (s->window[i+1]) +\
                 25811* (s->window[i+2])) & s->hash_mask;\
        } \
    } while (0)
29
30 extern int deflate_read_buf OF((z_streamp strm, Bytef *buf, unsigned size));
31
/*
 * fill_window_sse -- SSE2-accelerated variant of zlib's fill_window().
 *
 * Refills s->window with input until the lookahead reaches MIN_LOOKAHEAD
 * or the stream's input is exhausted.  When strstart has advanced past
 * wsize + MAX_DIST(s), the upper half of the window is copied down and
 * the head[]/prev[] hash tables are rebased by subtracting wsize -- done
 * eight 16-bit entries at a time with _mm_subs_epu16 (unsigned saturating
 * subtract, so entries pointing into the discarded half clamp to 0, i.e.
 * NIL, instead of wrapping).
 *
 * NOTE(review): the vector loops assume s->hash_size and s->w_size are
 * multiples of 8 (Posf is 16-bit, 8 per 128-bit register) -- true for
 * zlib's power-of-two sizes, but worth confirming for any new caller.
 */
void fill_window_sse(deflate_state *s)
{
    /* Eight lanes of w_size for the parallel table rebase below. */
    const __m128i xmm_wsize = _mm_set1_epi16(s->w_size);

    register unsigned n;
    register Posf *p;
    unsigned more; /* Amount of free space at the end of the window. */
    uInt wsize = s->w_size;

    Assert(s->lookahead < MIN_LOOKAHEAD, "already enough lookahead");

    do {
        more = (unsigned)(s->window_size -(ulg)s->lookahead -(ulg)s->strstart);

        /* Deal with !@#$% 64K limit: */
        if (sizeof(int) <= 2) {
            if (more == 0 && s->strstart == 0 && s->lookahead == 0) {
                more = wsize;

            } else if (more == (unsigned)(-1)) {
                /* Very unlikely, but possible on 16 bit machine if
                 * strstart == 0 && lookahead == 1 (input done a byte at time)
                 */
                more--;
            }
        }

        /* If the window is almost full and there is insufficient lookahead,
         * move the upper half to the lower one to make room in the upper half.
         */
        if (s->strstart >= wsize+MAX_DIST(s)) {

            zmemcpy(s->window, s->window+wsize, (unsigned)wsize);
            s->match_start -= wsize;
            s->strstart -= wsize; /* we now have strstart >= MAX_DIST */
            s->block_start -= (long) wsize;

            /* Slide the hash table (could be avoided with 32 bit values
               at the expense of memory usage). We slide even when level == 0
               to keep the hash table consistent if we switch back to level > 0
               later. (Using level 0 permanently is not an optimal usage of
               zlib, so we don't care about this pathological case.)
             */
            /* Walk head[] from the top down, rebasing 8 entries per
             * iteration; saturating subtract sends stale positions to 0.
             */
            n = s->hash_size;
            p = &s->head[n];
            p -= 8;
            do {
                __m128i value, result;

                value = _mm_loadu_si128((__m128i *)p);
                result = _mm_subs_epu16(value, xmm_wsize);
                _mm_storeu_si128((__m128i *)p, result);

                p -= 8;
                n -= 8;
            } while (n > 0);

            /* Same rebase for the prev[] chain links (wsize entries). */
            n = wsize;
#ifndef FASTEST
            p = &s->prev[n];
            p -= 8;
            do {
                __m128i value, result;

                value = _mm_loadu_si128((__m128i *)p);
                result = _mm_subs_epu16(value, xmm_wsize);
                _mm_storeu_si128((__m128i *)p, result);

                p -= 8;
                n -= 8;
            } while (n > 0);
#endif
            more += wsize;
        }
        if (s->strm->avail_in == 0) break;

        /* If there was no sliding:
         *    strstart <= WSIZE+MAX_DIST-1 && lookahead <= MIN_LOOKAHEAD - 1 &&
         *    more == window_size - lookahead - strstart
         * => more >= window_size - (MIN_LOOKAHEAD-1 + WSIZE + MAX_DIST-1)
         * => more >= window_size - 2*WSIZE + 2
         * In the BIG_MEM or MMAP case (not yet supported),
         *   window_size == input_size + MIN_LOOKAHEAD  &&
         *   strstart + s->lookahead <= input_size => more >= MIN_LOOKAHEAD.
         * Otherwise, window_size == 2*WSIZE so more >= 2.
         * If there was sliding, more >= WSIZE. So in all cases, more >= 2.
         */
        Assert(more >= 2, "more < 2");

        /* Append up to `more` fresh input bytes after the current data;
         * n is the count actually copied. */
        n = deflate_read_buf(s->strm,
                             s->window + s->strstart + s->lookahead,
                             more);
        s->lookahead += n;

        /* Initialize the hash value now that we have some input: */
        if (s->lookahead >= MIN_MATCH) {
            uInt str = s->strstart;
            s->ins_h = s->window[str];
            if (str >= 1)
                UPDATE_HASH(s, s->ins_h, str + 1 - (MIN_MATCH-1));
#if MIN_MATCH != 3
            Call UPDATE_HASH() MIN_MATCH-3 more times
#endif
        }
        /* If the whole input has less than MIN_MATCH bytes, ins_h is garbage,
         * but this is not important since only literal bytes will be emitted.
         */

    } while (s->lookahead < MIN_LOOKAHEAD && s->strm->avail_in != 0);

    /* If the WIN_INIT bytes after the end of the current data have never been
     * written, then zero those bytes in order to avoid memory check reports of
     * the use of uninitialized (or uninitialised as Julian writes) bytes by
     * the longest match routines.  Update the high water mark for the next
     * time through here.  WIN_INIT is set to MAX_MATCH since the longest match
     * routines allow scanning to strstart + MAX_MATCH, ignoring lookahead.
     */
    if (s->high_water < s->window_size) {
        ulg curr = s->strstart + (ulg)(s->lookahead);
        ulg init;

        if (s->high_water < curr) {
            /* Previous high water mark below current data -- zero WIN_INIT
             * bytes or up to end of window, whichever is less.
             */
            init = s->window_size - curr;
            if (init > WIN_INIT)
                init = WIN_INIT;
            zmemzero(s->window + curr, (unsigned)init);
            s->high_water = curr + init;
        }
        else if (s->high_water < (ulg)curr + WIN_INIT) {
            /* High water mark at or above current data, but below current data
             * plus WIN_INIT -- zero out to current data plus WIN_INIT, or up
             * to end of window, whichever is less.
             */
            init = (ulg)curr + WIN_INIT - s->high_water;
            if (init > s->window_size - s->high_water)
                init = s->window_size - s->high_water;
            zmemzero(s->window + s->high_water, (unsigned)init);
            s->high_water += init;
        }
    }

    Assert((ulg)s->strstart <= s->window_size - MIN_LOOKAHEAD,
           "not enough room for search");
}
179