1 /* NOLINT(build/header_guard) */
2 /* Copyright 2016 Google Inc. All Rights Reserved.
3
4 Distributed under MIT license.
5 See file LICENSE for detail or copy at https://opensource.org/licenses/MIT
6 */
7
8 /* template parameters: FN, BUCKET_BITS, MAX_TREE_COMP_LENGTH,
9 MAX_TREE_SEARCH_DEPTH */
10
11 /* A (forgetful) hash table where each hash bucket contains a binary tree of
12 sequences whose first 4 bytes share the same hash code.
13 Each sequence is MAX_TREE_COMP_LENGTH long and is identified by its starting
14 position in the input data. The binary tree is sorted by the lexicographic
15 order of the sequences, and it is also a max-heap with respect to the
16 starting positions. */
17
18 #define HashToBinaryTree HASHER()
19
20 #define BUCKET_SIZE (1 << BUCKET_BITS)
21
FN(HashTypeLength)22 static BROTLI_INLINE size_t FN(HashTypeLength)(void) { return 4; }
FN(StoreLookahead)23 static BROTLI_INLINE size_t FN(StoreLookahead)(void) {
24 return MAX_TREE_COMP_LENGTH;
25 }
26
FN(HashBytes)27 static uint32_t FN(HashBytes)(const uint8_t* BROTLI_RESTRICT data) {
28 uint32_t h = BROTLI_UNALIGNED_LOAD32LE(data) * kHashMul32;
29 /* The higher bits contain more mixture from the multiplication,
30 so we take our results from there. */
31 return h >> (32 - BUCKET_BITS);
32 }
33
34 typedef struct HashToBinaryTree {
35 /* The window size minus 1 */
36 size_t window_mask_;
37
38 /* Hash table that maps the 4-byte hashes of the sequence to the last
39 position where this hash was found, which is the root of the binary
40 tree of sequences that share this hash bucket. */
41 uint32_t* buckets_; /* uint32_t[BUCKET_SIZE]; */
42
43 /* A position used to mark a non-existent sequence, i.e. a tree is empty if
44 its root is at invalid_pos_ and a node is a leaf if both its children
45 are at invalid_pos_. */
46 uint32_t invalid_pos_;
47
48 /* --- Dynamic size members --- */
49
50 /* The union of the binary trees of each hash bucket. The root of the tree
51 corresponding to a hash is a sequence starting at buckets_[hash] and
52 the left and right children of a sequence starting at pos are
53 forest_[2 * pos] and forest_[2 * pos + 1]. */
54 uint32_t* forest_; /* uint32_t[2 * num_nodes] */
55 } HashToBinaryTree;
56
FN(Initialize)57 static void FN(Initialize)(
58 HasherCommon* common, HashToBinaryTree* BROTLI_RESTRICT self,
59 const BrotliEncoderParams* params) {
60 self->buckets_ = (uint32_t*)common->extra;
61 self->forest_ = &self->buckets_[BUCKET_SIZE];
62
63 self->window_mask_ = (1u << params->lgwin) - 1u;
64 self->invalid_pos_ = (uint32_t)(0 - self->window_mask_);
65 }
66
FN(Prepare)67 static void FN(Prepare)
68 (HashToBinaryTree* BROTLI_RESTRICT self, BROTLI_BOOL one_shot,
69 size_t input_size, const uint8_t* BROTLI_RESTRICT data) {
70 uint32_t invalid_pos = self->invalid_pos_;
71 uint32_t i;
72 uint32_t* BROTLI_RESTRICT buckets = self->buckets_;
73 BROTLI_UNUSED(data);
74 BROTLI_UNUSED(one_shot);
75 BROTLI_UNUSED(input_size);
76 for (i = 0; i < BUCKET_SIZE; i++) {
77 buckets[i] = invalid_pos;
78 }
79 }
80
FN(HashMemAllocInBytes)81 static BROTLI_INLINE size_t FN(HashMemAllocInBytes)(
82 const BrotliEncoderParams* params, BROTLI_BOOL one_shot,
83 size_t input_size) {
84 size_t num_nodes = (size_t)1 << params->lgwin;
85 if (one_shot && input_size < num_nodes) {
86 num_nodes = input_size;
87 }
88 return sizeof(uint32_t) * BUCKET_SIZE + 2 * sizeof(uint32_t) * num_nodes;
89 }
90
FN(LeftChildIndex)91 static BROTLI_INLINE size_t FN(LeftChildIndex)(
92 HashToBinaryTree* BROTLI_RESTRICT self,
93 const size_t pos) {
94 return 2 * (pos & self->window_mask_);
95 }
96
FN(RightChildIndex)97 static BROTLI_INLINE size_t FN(RightChildIndex)(
98 HashToBinaryTree* BROTLI_RESTRICT self,
99 const size_t pos) {
100 return 2 * (pos & self->window_mask_) + 1;
101 }
102
103 /* Stores the hash of the next 4 bytes and in a single tree-traversal, the
104 hash bucket's binary tree is searched for matches and is re-rooted at the
105 current position.
106
107 If less than MAX_TREE_COMP_LENGTH data is available, the hash bucket of the
108 current position is searched for matches, but the state of the hash table
109 is not changed, since we can not know the final sorting order of the
110 current (incomplete) sequence.
111
112 This function must be called with increasing cur_ix positions. */
FN(StoreAndFindMatches)113 static BROTLI_INLINE BackwardMatch* FN(StoreAndFindMatches)(
114 HashToBinaryTree* BROTLI_RESTRICT self, const uint8_t* BROTLI_RESTRICT data,
115 const size_t cur_ix, const size_t ring_buffer_mask, const size_t max_length,
116 const size_t max_backward, size_t* const BROTLI_RESTRICT best_len,
117 BackwardMatch* BROTLI_RESTRICT matches) {
118 const size_t cur_ix_masked = cur_ix & ring_buffer_mask;
119 const size_t max_comp_len =
120 BROTLI_MIN(size_t, max_length, MAX_TREE_COMP_LENGTH);
121 const BROTLI_BOOL should_reroot_tree =
122 TO_BROTLI_BOOL(max_length >= MAX_TREE_COMP_LENGTH);
123 const uint32_t key = FN(HashBytes)(&data[cur_ix_masked]);
124 uint32_t* BROTLI_RESTRICT buckets = self->buckets_;
125 uint32_t* BROTLI_RESTRICT forest = self->forest_;
126 size_t prev_ix = buckets[key];
127 /* The forest index of the rightmost node of the left subtree of the new
128 root, updated as we traverse and re-root the tree of the hash bucket. */
129 size_t node_left = FN(LeftChildIndex)(self, cur_ix);
130 /* The forest index of the leftmost node of the right subtree of the new
131 root, updated as we traverse and re-root the tree of the hash bucket. */
132 size_t node_right = FN(RightChildIndex)(self, cur_ix);
133 /* The match length of the rightmost node of the left subtree of the new
134 root, updated as we traverse and re-root the tree of the hash bucket. */
135 size_t best_len_left = 0;
136 /* The match length of the leftmost node of the right subtree of the new
137 root, updated as we traverse and re-root the tree of the hash bucket. */
138 size_t best_len_right = 0;
139 size_t depth_remaining;
140 if (should_reroot_tree) {
141 buckets[key] = (uint32_t)cur_ix;
142 }
143 for (depth_remaining = MAX_TREE_SEARCH_DEPTH; ; --depth_remaining) {
144 const size_t backward = cur_ix - prev_ix;
145 const size_t prev_ix_masked = prev_ix & ring_buffer_mask;
146 if (backward == 0 || backward > max_backward || depth_remaining == 0) {
147 if (should_reroot_tree) {
148 forest[node_left] = self->invalid_pos_;
149 forest[node_right] = self->invalid_pos_;
150 }
151 break;
152 }
153 {
154 const size_t cur_len = BROTLI_MIN(size_t, best_len_left, best_len_right);
155 size_t len;
156 BROTLI_DCHECK(cur_len <= MAX_TREE_COMP_LENGTH);
157 len = cur_len +
158 FindMatchLengthWithLimit(&data[cur_ix_masked + cur_len],
159 &data[prev_ix_masked + cur_len],
160 max_length - cur_len);
161 BROTLI_DCHECK(
162 0 == memcmp(&data[cur_ix_masked], &data[prev_ix_masked], len));
163 if (matches && len > *best_len) {
164 *best_len = len;
165 InitBackwardMatch(matches++, backward, len);
166 }
167 if (len >= max_comp_len) {
168 if (should_reroot_tree) {
169 forest[node_left] = forest[FN(LeftChildIndex)(self, prev_ix)];
170 forest[node_right] = forest[FN(RightChildIndex)(self, prev_ix)];
171 }
172 break;
173 }
174 if (data[cur_ix_masked + len] > data[prev_ix_masked + len]) {
175 best_len_left = len;
176 if (should_reroot_tree) {
177 forest[node_left] = (uint32_t)prev_ix;
178 }
179 node_left = FN(RightChildIndex)(self, prev_ix);
180 prev_ix = forest[node_left];
181 } else {
182 best_len_right = len;
183 if (should_reroot_tree) {
184 forest[node_right] = (uint32_t)prev_ix;
185 }
186 node_right = FN(LeftChildIndex)(self, prev_ix);
187 prev_ix = forest[node_right];
188 }
189 }
190 }
191 return matches;
192 }
193
194 /* Finds all backward matches of &data[cur_ix & ring_buffer_mask] up to the
195 length of max_length and stores the position cur_ix in the hash table.
196
197 Sets *num_matches to the number of matches found, and stores the found
198 matches in matches[0] to matches[*num_matches - 1]. The matches will be
199 sorted by strictly increasing length and (non-strictly) increasing
200 distance. */
FN(FindAllMatches)201 static BROTLI_INLINE size_t FN(FindAllMatches)(
202 HashToBinaryTree* BROTLI_RESTRICT self,
203 const BrotliEncoderDictionary* dictionary,
204 const uint8_t* BROTLI_RESTRICT data,
205 const size_t ring_buffer_mask, const size_t cur_ix,
206 const size_t max_length, const size_t max_backward,
207 const size_t dictionary_distance, const BrotliEncoderParams* params,
208 BackwardMatch* matches) {
209 BackwardMatch* const orig_matches = matches;
210 const size_t cur_ix_masked = cur_ix & ring_buffer_mask;
211 size_t best_len = 1;
212 const size_t short_match_max_backward =
213 params->quality != HQ_ZOPFLIFICATION_QUALITY ? 16 : 64;
214 size_t stop = cur_ix - short_match_max_backward;
215 uint32_t dict_matches[BROTLI_MAX_STATIC_DICTIONARY_MATCH_LEN + 1];
216 size_t i;
217 if (cur_ix < short_match_max_backward) { stop = 0; }
218 for (i = cur_ix - 1; i > stop && best_len <= 2; --i) {
219 size_t prev_ix = i;
220 const size_t backward = cur_ix - prev_ix;
221 if (BROTLI_PREDICT_FALSE(backward > max_backward)) {
222 break;
223 }
224 prev_ix &= ring_buffer_mask;
225 if (data[cur_ix_masked] != data[prev_ix] ||
226 data[cur_ix_masked + 1] != data[prev_ix + 1]) {
227 continue;
228 }
229 {
230 const size_t len =
231 FindMatchLengthWithLimit(&data[prev_ix], &data[cur_ix_masked],
232 max_length);
233 if (len > best_len) {
234 best_len = len;
235 InitBackwardMatch(matches++, backward, len);
236 }
237 }
238 }
239 if (best_len < max_length) {
240 matches = FN(StoreAndFindMatches)(self, data, cur_ix,
241 ring_buffer_mask, max_length, max_backward, &best_len, matches);
242 }
243 for (i = 0; i <= BROTLI_MAX_STATIC_DICTIONARY_MATCH_LEN; ++i) {
244 dict_matches[i] = kInvalidMatch;
245 }
246 {
247 size_t minlen = BROTLI_MAX(size_t, 4, best_len + 1);
248 if (BrotliFindAllStaticDictionaryMatches(dictionary,
249 &data[cur_ix_masked], minlen, max_length, &dict_matches[0])) {
250 size_t maxlen = BROTLI_MIN(
251 size_t, BROTLI_MAX_STATIC_DICTIONARY_MATCH_LEN, max_length);
252 size_t l;
253 for (l = minlen; l <= maxlen; ++l) {
254 uint32_t dict_id = dict_matches[l];
255 if (dict_id < kInvalidMatch) {
256 size_t distance = dictionary_distance + (dict_id >> 5) + 1;
257 if (distance <= params->dist.max_distance) {
258 InitDictionaryBackwardMatch(matches++, distance, l, dict_id & 31);
259 }
260 }
261 }
262 }
263 }
264 return (size_t)(matches - orig_matches);
265 }
266
267 /* Stores the hash of the next 4 bytes and re-roots the binary tree at the
268 current sequence, without returning any matches.
269 REQUIRES: ix + MAX_TREE_COMP_LENGTH <= end-of-current-block */
FN(Store)270 static BROTLI_INLINE void FN(Store)(HashToBinaryTree* BROTLI_RESTRICT self,
271 const uint8_t* BROTLI_RESTRICT data,
272 const size_t mask, const size_t ix) {
273 /* Maximum distance is window size - 16, see section 9.1. of the spec. */
274 const size_t max_backward = self->window_mask_ - BROTLI_WINDOW_GAP + 1;
275 FN(StoreAndFindMatches)(self, data, ix, mask, MAX_TREE_COMP_LENGTH,
276 max_backward, NULL, NULL);
277 }
278
FN(StoreRange)279 static BROTLI_INLINE void FN(StoreRange)(HashToBinaryTree* BROTLI_RESTRICT self,
280 const uint8_t* BROTLI_RESTRICT data, const size_t mask,
281 const size_t ix_start, const size_t ix_end) {
282 size_t i = ix_start;
283 size_t j = ix_start;
284 if (ix_start + 63 <= ix_end) {
285 i = ix_end - 63;
286 }
287 if (ix_start + 512 <= i) {
288 for (; j < i; j += 8) {
289 FN(Store)(self, data, mask, j);
290 }
291 }
292 for (; i < ix_end; ++i) {
293 FN(Store)(self, data, mask, i);
294 }
295 }
296
FN(StitchToPreviousBlock)297 static BROTLI_INLINE void FN(StitchToPreviousBlock)(
298 HashToBinaryTree* BROTLI_RESTRICT self,
299 size_t num_bytes, size_t position, const uint8_t* ringbuffer,
300 size_t ringbuffer_mask) {
301 if (num_bytes >= FN(HashTypeLength)() - 1 &&
302 position >= MAX_TREE_COMP_LENGTH) {
303 /* Store the last `MAX_TREE_COMP_LENGTH - 1` positions in the hasher.
304 These could not be calculated before, since they require knowledge
305 of both the previous and the current block. */
306 const size_t i_start = position - MAX_TREE_COMP_LENGTH + 1;
307 const size_t i_end = BROTLI_MIN(size_t, position, i_start + num_bytes);
308 size_t i;
309 for (i = i_start; i < i_end; ++i) {
310 /* Maximum distance is window size - 16, see section 9.1. of the spec.
311 Furthermore, we have to make sure that we don't look further back
312 from the start of the next block than the window size, otherwise we
313 could access already overwritten areas of the ring-buffer. */
314 const size_t max_backward =
315 self->window_mask_ - BROTLI_MAX(size_t,
316 BROTLI_WINDOW_GAP - 1,
317 position - i);
318 /* We know that i + MAX_TREE_COMP_LENGTH <= position + num_bytes, i.e. the
319 end of the current block and that we have at least
320 MAX_TREE_COMP_LENGTH tail in the ring-buffer. */
321 FN(StoreAndFindMatches)(self, ringbuffer, i, ringbuffer_mask,
322 MAX_TREE_COMP_LENGTH, max_backward, NULL, NULL);
323 }
324 }
325 }
326
327 #undef BUCKET_SIZE
328
329 #undef HashToBinaryTree
330