1 /* NOLINT(build/header_guard) */
2 /* Copyright 2016 Google Inc. All Rights Reserved.
3
4 Distributed under MIT license.
5 See file LICENSE for detail or copy at https://opensource.org/licenses/MIT
6 */
7
8 /* template parameters: FN, BUCKET_BITS, MAX_TREE_COMP_LENGTH,
9 MAX_TREE_SEARCH_DEPTH */
10
11 /* A (forgetful) hash table where each hash bucket contains a binary tree of
12 sequences whose first 4 bytes share the same hash code.
13 Each sequence is MAX_TREE_COMP_LENGTH long and is identified by its starting
14 position in the input data. The binary tree is sorted by the lexicographic
15 order of the sequences, and it is also a max-heap with respect to the
16 starting positions. */
17
18 #define HashToBinaryTree HASHER()
19
20 #define BUCKET_SIZE (1 << BUCKET_BITS)
21
/* Number of input bytes that feed each hash code (one 32-bit load). */
static size_t FN(HashTypeLength)(void) { return 4; }
/* Number of bytes that must be readable past a position for Store(). */
static size_t FN(StoreLookahead)(void) { return MAX_TREE_COMP_LENGTH; }
24
FN(HashBytes)25 static uint32_t FN(HashBytes)(const uint8_t *data) {
26 uint32_t h = BROTLI_UNALIGNED_LOAD32(data) * kHashMul32;
27 /* The higher bits contain more mixture from the multiplication,
28 so we take our results from there. */
29 return h >> (32 - BUCKET_BITS);
30 }
31
/* Hasher state: fixed-size header, followed in memory by the dynamically
   sized forest array (see FN(Forest) / FN(HashMemAllocInBytes)). */
typedef struct HashToBinaryTree {
  /* The window size minus 1 */
  size_t window_mask_;

  /* Hash table that maps the 4-byte hashes of the sequence to the last
     position where this hash was found, which is the root of the binary
     tree of sequences that share this hash bucket. */
  uint32_t buckets_[BUCKET_SIZE];

  /* A position used to mark a non-existent sequence, i.e. a tree is empty if
     its root is at invalid_pos_ and a node is a leaf if both its children
     are at invalid_pos_. */
  uint32_t invalid_pos_;

  /* --- Dynamic size members --- */

  /* The union of the binary trees of each hash bucket. The root of the tree
     corresponding to a hash is a sequence starting at buckets_[hash] and
     the left and right children of a sequence starting at pos are
     forest_[2 * pos] and forest_[2 * pos + 1]. */
  /* uint32_t forest[2 * num_nodes] */
} HashToBinaryTree;
54
/* Returns this hasher's state, which lives immediately after the common
   hasher header inside the |handle| allocation. */
static BROTLI_INLINE HashToBinaryTree* FN(Self)(HasherHandle handle) {
  return (HashToBinaryTree*)&(GetHasherCommon(handle)[1]);
}
58
/* Returns the forest array, which lives immediately after the fixed-size
   HashToBinaryTree struct (see the "Dynamic size members" note above). */
static BROTLI_INLINE uint32_t* FN(Forest)(HashToBinaryTree* self) {
  return (uint32_t*)(&self[1]);
}
62
/* One-time initialization of the window mask and the invalid-position
   sentinel from the encoder parameters. */
static void FN(Initialize)(
    HasherHandle handle, const BrotliEncoderParams* params) {
  HashToBinaryTree* self = FN(Self)(handle);
  self->window_mask_ = (1u << params->lgwin) - 1u;
  /* Chosen so that (cur_ix - invalid_pos_), computed modulo 2^32, always
     exceeds the window size, making sentinel entries fail the backward
     distance check in StoreAndFindMatches. */
  self->invalid_pos_ = (uint32_t)(0 - self->window_mask_);
}
69
FN(Prepare)70 static void FN(Prepare)(HasherHandle handle, BROTLI_BOOL one_shot,
71 size_t input_size, const uint8_t* data) {
72 HashToBinaryTree* self = FN(Self)(handle);
73 uint32_t invalid_pos = self->invalid_pos_;
74 uint32_t i;
75 BROTLI_UNUSED(data);
76 BROTLI_UNUSED(one_shot);
77 BROTLI_UNUSED(input_size);
78 for (i = 0; i < BUCKET_SIZE; i++) {
79 self->buckets_[i] = invalid_pos;
80 }
81 }
82
FN(HashMemAllocInBytes)83 static BROTLI_INLINE size_t FN(HashMemAllocInBytes)(
84 const BrotliEncoderParams* params, BROTLI_BOOL one_shot,
85 size_t input_size) {
86 size_t num_nodes = (size_t)1 << params->lgwin;
87 if (one_shot && input_size < num_nodes) {
88 num_nodes = input_size;
89 }
90 return sizeof(HashToBinaryTree) + 2 * sizeof(uint32_t) * num_nodes;
91 }
92
/* Forest index of the left child of the sequence starting at |pos|.
   The window mask folds positions into the ring-buffer-sized forest. */
static BROTLI_INLINE size_t FN(LeftChildIndex)(HashToBinaryTree* self,
    const size_t pos) {
  return 2 * (pos & self->window_mask_);
}
97
/* Forest index of the right child of the sequence starting at |pos|. */
static BROTLI_INLINE size_t FN(RightChildIndex)(HashToBinaryTree* self,
    const size_t pos) {
  return 2 * (pos & self->window_mask_) + 1;
}
102
/* Stores the hash of the next 4 bytes and in a single tree-traversal, the
   hash bucket's binary tree is searched for matches and is re-rooted at the
   current position.

   If less than MAX_TREE_COMP_LENGTH data is available, the hash bucket of the
   current position is searched for matches, but the state of the hash table
   is not changed, since we can not know the final sorting order of the
   current (incomplete) sequence.

   This function must be called with increasing cur_ix positions.

   If |matches| is non-NULL, every match longer than *best_len is appended to
   it and *best_len is updated; the (possibly advanced) matches pointer is
   returned. With matches == NULL the call only updates the tree. */
static BROTLI_INLINE BackwardMatch* FN(StoreAndFindMatches)(
    HashToBinaryTree* self, const uint8_t* const BROTLI_RESTRICT data,
    const size_t cur_ix, const size_t ring_buffer_mask, const size_t max_length,
    const size_t max_backward, size_t* const BROTLI_RESTRICT best_len,
    BackwardMatch* BROTLI_RESTRICT matches) {
  const size_t cur_ix_masked = cur_ix & ring_buffer_mask;
  const size_t max_comp_len =
      BROTLI_MIN(size_t, max_length, MAX_TREE_COMP_LENGTH);
  const BROTLI_BOOL should_reroot_tree =
      TO_BROTLI_BOOL(max_length >= MAX_TREE_COMP_LENGTH);
  const uint32_t key = FN(HashBytes)(&data[cur_ix_masked]);
  uint32_t* forest = FN(Forest)(self);
  size_t prev_ix = self->buckets_[key];
  /* The forest index of the rightmost node of the left subtree of the new
     root, updated as we traverse and re-root the tree of the hash bucket. */
  size_t node_left = FN(LeftChildIndex)(self, cur_ix);
  /* The forest index of the leftmost node of the right subtree of the new
     root, updated as we traverse and re-root the tree of the hash bucket. */
  size_t node_right = FN(RightChildIndex)(self, cur_ix);
  /* The match length of the rightmost node of the left subtree of the new
     root, updated as we traverse and re-root the tree of the hash bucket. */
  size_t best_len_left = 0;
  /* The match length of the leftmost node of the right subtree of the new
     root, updated as we traverse and re-root the tree of the hash bucket. */
  size_t best_len_right = 0;
  size_t depth_remaining;
  if (should_reroot_tree) {
    self->buckets_[key] = (uint32_t)cur_ix;
  }
  for (depth_remaining = MAX_TREE_SEARCH_DEPTH; ; --depth_remaining) {
    const size_t backward = cur_ix - prev_ix;
    const size_t prev_ix_masked = prev_ix & ring_buffer_mask;
    /* Stop on an empty subtree (sentinel makes backward exceed max_backward),
       a distance outside the window, or search-depth exhaustion. */
    if (backward == 0 || backward > max_backward || depth_remaining == 0) {
      if (should_reroot_tree) {
        forest[node_left] = self->invalid_pos_;
        forest[node_right] = self->invalid_pos_;
      }
      break;
    }
    {
      /* Both bounding sequences already agree with the current one for at
         least min(best_len_left, best_len_right) bytes, so comparison can
         skip that prefix. */
      const size_t cur_len = BROTLI_MIN(size_t, best_len_left, best_len_right);
      size_t len;
      assert(cur_len <= MAX_TREE_COMP_LENGTH);
      len = cur_len +
          FindMatchLengthWithLimit(&data[cur_ix_masked + cur_len],
                                   &data[prev_ix_masked + cur_len],
                                   max_length - cur_len);
      assert(0 == memcmp(&data[cur_ix_masked], &data[prev_ix_masked], len));
      if (matches && len > *best_len) {
        *best_len = len;
        InitBackwardMatch(matches++, backward, len);
      }
      if (len >= max_comp_len) {
        /* The sequences are equal up to the comparison limit; adopt the
           previous node's children directly and stop. */
        if (should_reroot_tree) {
          forest[node_left] = forest[FN(LeftChildIndex)(self, prev_ix)];
          forest[node_right] = forest[FN(RightChildIndex)(self, prev_ix)];
        }
        break;
      }
      if (data[cur_ix_masked + len] > data[prev_ix_masked + len]) {
        /* Previous sequence sorts below the current one: it becomes part of
           the new root's left subtree; descend into its right child. */
        best_len_left = len;
        if (should_reroot_tree) {
          forest[node_left] = (uint32_t)prev_ix;
        }
        node_left = FN(RightChildIndex)(self, prev_ix);
        prev_ix = forest[node_left];
      } else {
        /* Previous sequence sorts above: goes to the right subtree; descend
           into its left child. */
        best_len_right = len;
        if (should_reroot_tree) {
          forest[node_right] = (uint32_t)prev_ix;
        }
        node_right = FN(LeftChildIndex)(self, prev_ix);
        prev_ix = forest[node_right];
      }
    }
  }
  return matches;
}
191
/* Finds all backward matches of &data[cur_ix & ring_buffer_mask] up to the
   length of max_length and stores the position cur_ix in the hash table.

   Sets *num_matches to the number of matches found, and stores the found
   matches in matches[0] to matches[*num_matches - 1]. The matches will be
   sorted by strictly increasing length and (non-strictly) increasing
   distance.

   Three phases: (1) a brute-force scan over the nearest few positions for
   short (length <= 2) matches the tree cannot represent, (2) the binary-tree
   search via StoreAndFindMatches, (3) static dictionary lookups encoded as
   matches beyond max_backward. Returns the number of matches emitted. */
static BROTLI_INLINE size_t FN(FindAllMatches)(HasherHandle handle,
    const BrotliDictionary* dictionary, const uint8_t* data,
    const size_t ring_buffer_mask, const size_t cur_ix,
    const size_t max_length, const size_t max_backward,
    const BrotliEncoderParams* params, BackwardMatch* matches) {
  BackwardMatch* const orig_matches = matches;
  const size_t cur_ix_masked = cur_ix & ring_buffer_mask;
  size_t best_len = 1;
  /* Zopflification searches a longer short-match window. */
  const size_t short_match_max_backward =
      params->quality != HQ_ZOPFLIFICATION_QUALITY ? 16 : 64;
  size_t stop = cur_ix - short_match_max_backward;
  uint32_t dict_matches[BROTLI_MAX_STATIC_DICTIONARY_MATCH_LEN + 1];
  size_t i;
  if (cur_ix < short_match_max_backward) { stop = 0; }
  /* Phase 1: nearby length-2+ matches, newest first, until a match longer
     than 2 is found (longer ones are the tree search's job). */
  for (i = cur_ix - 1; i > stop && best_len <= 2; --i) {
    size_t prev_ix = i;
    const size_t backward = cur_ix - prev_ix;
    if (BROTLI_PREDICT_FALSE(backward > max_backward)) {
      break;
    }
    prev_ix &= ring_buffer_mask;
    if (data[cur_ix_masked] != data[prev_ix] ||
        data[cur_ix_masked + 1] != data[prev_ix + 1]) {
      continue;
    }
    {
      const size_t len =
          FindMatchLengthWithLimit(&data[prev_ix], &data[cur_ix_masked],
                                   max_length);
      if (len > best_len) {
        best_len = len;
        InitBackwardMatch(matches++, backward, len);
      }
    }
  }
  /* Phase 2: binary-tree search; also stores cur_ix in the hash table. */
  if (best_len < max_length) {
    matches = FN(StoreAndFindMatches)(FN(Self)(handle), data, cur_ix,
        ring_buffer_mask, max_length, max_backward, &best_len, matches);
  }
  for (i = 0; i <= BROTLI_MAX_STATIC_DICTIONARY_MATCH_LEN; ++i) {
    dict_matches[i] = kInvalidMatch;
  }
  /* Phase 3: static dictionary matches strictly longer than anything found
     so far, encoded with distances past max_backward. */
  {
    size_t minlen = BROTLI_MAX(size_t, 4, best_len + 1);
    if (BrotliFindAllStaticDictionaryMatches(dictionary,
        &data[cur_ix_masked], minlen, max_length, &dict_matches[0])) {
      size_t maxlen = BROTLI_MIN(
          size_t, BROTLI_MAX_STATIC_DICTIONARY_MATCH_LEN, max_length);
      size_t l;
      for (l = minlen; l <= maxlen; ++l) {
        uint32_t dict_id = dict_matches[l];
        if (dict_id < kInvalidMatch) {
          InitDictionaryBackwardMatch(matches++,
              max_backward + (dict_id >> 5) + 1, l, dict_id & 31);
        }
      }
    }
  }
  return (size_t)(matches - orig_matches);
}
259
/* Stores the hash of the next 4 bytes and re-roots the binary tree at the
   current sequence, without returning any matches.
   REQUIRES: ix + MAX_TREE_COMP_LENGTH <= end-of-current-block */
static BROTLI_INLINE void FN(Store)(HasherHandle handle, const uint8_t *data,
    const size_t mask, const size_t ix) {
  HashToBinaryTree* self = FN(Self)(handle);
  /* Maximum distance is window size - 16, see section 9.1. of the spec. */
  const size_t max_backward = self->window_mask_ - BROTLI_WINDOW_GAP + 1;
  /* NULL best_len/matches: update the tree only, collect nothing. */
  FN(StoreAndFindMatches)(self, data, ix, mask, MAX_TREE_COMP_LENGTH,
      max_backward, NULL, NULL);
}
271
FN(StoreRange)272 static BROTLI_INLINE void FN(StoreRange)(HasherHandle handle,
273 const uint8_t *data, const size_t mask, const size_t ix_start,
274 const size_t ix_end) {
275 size_t i = ix_start;
276 size_t j = ix_start;
277 if (ix_start + 63 <= ix_end) {
278 i = ix_end - 63;
279 }
280 if (ix_start + 512 <= i) {
281 for (; j < i; j += 8) {
282 FN(Store)(handle, data, mask, j);
283 }
284 }
285 for (; i < ix_end; ++i) {
286 FN(Store)(handle, data, mask, i);
287 }
288 }
289
/* Re-stores the tail positions of the previous block now that the first
   bytes of the current block are available to complete their sequences. */
static BROTLI_INLINE void FN(StitchToPreviousBlock)(HasherHandle handle,
    size_t num_bytes, size_t position, const uint8_t* ringbuffer,
    size_t ringbuffer_mask) {
  HashToBinaryTree* self = FN(Self)(handle);
  if (num_bytes >= FN(HashTypeLength)() - 1 &&
      position >= MAX_TREE_COMP_LENGTH) {
    /* Store the last `MAX_TREE_COMP_LENGTH - 1` positions in the hasher.
       These could not be calculated before, since they require knowledge
       of both the previous and the current block. */
    const size_t i_start = position - MAX_TREE_COMP_LENGTH + 1;
    const size_t i_end = BROTLI_MIN(size_t, position, i_start + num_bytes);
    size_t i;
    for (i = i_start; i < i_end; ++i) {
      /* Maximum distance is window size - 16, see section 9.1. of the spec.
         Furthermore, we have to make sure that we don't look further back
         from the start of the next block than the window size, otherwise we
         could access already overwritten areas of the ring-buffer. */
      const size_t max_backward =
          self->window_mask_ - BROTLI_MAX(size_t,
                                          BROTLI_WINDOW_GAP - 1,
                                          position - i);
      /* We know that i + MAX_TREE_COMP_LENGTH <= position + num_bytes, i.e. the
         end of the current block and that we have at least
         MAX_TREE_COMP_LENGTH tail in the ring-buffer. */
      FN(StoreAndFindMatches)(self, ringbuffer, i, ringbuffer_mask,
          MAX_TREE_COMP_LENGTH, max_backward, NULL, NULL);
    }
  }
}
319
320 #undef BUCKET_SIZE
321
322 #undef HashToBinaryTree
323