1 /* Copyright 2010 Google Inc. All Rights Reserved.
2
3 Distributed under MIT license.
4 See file LICENSE for detail or copy at https://opensource.org/licenses/MIT
5 */
6
7 /* A (forgetful) hash table to the data seen by the compressor, to
8 help create backward references to previous data. */
9
10 #ifndef BROTLI_ENC_HASH_H_
11 #define BROTLI_ENC_HASH_H_
12
13 #include <string.h> /* memcmp, memset */
14
15 #include "../common/constants.h"
16 #include "../common/dictionary.h"
17 #include <brotli/types.h>
18 #include "./fast_log.h"
19 #include "./find_match_length.h"
20 #include "./memory.h"
21 #include "./port.h"
22 #include "./quality.h"
23 #include "./static_dict.h"
24
25 #if defined(__cplusplus) || defined(c_plusplus)
26 extern "C" {
27 #endif
28
29 /* Pointer to hasher data.
30 *
31 * Excluding initialization and destruction, hasher can be passed as
32 * HasherHandle by value.
33 *
34 * Typically hasher data consists of 3 sections:
35 * * HasherCommon structure
36 * * private structured hasher data, depending on hasher type
37 * * private dynamic hasher data, depending on hasher type and parameters
38 */
39 typedef uint8_t* HasherHandle;
40
41 typedef struct {
42 BrotliHasherParams params;
43
44 /* False if hasher needs to be "prepared" before use. */
45 BROTLI_BOOL is_prepared_;
46
47 size_t dict_num_lookups;
48 size_t dict_num_matches;
49 } HasherCommon;
50
GetHasherCommon(HasherHandle handle)51 static BROTLI_INLINE HasherCommon* GetHasherCommon(HasherHandle handle) {
52 return (HasherCommon*)handle;
53 }
54
55 #define score_t size_t
56
57 static const uint32_t kCutoffTransformsCount = 10;
58 /* 0, 12, 27, 23, 42, 63, 56, 48, 59, 64 */
59 /* 0+0, 4+8, 8+19, 12+11, 16+26, 20+43, 24+32, 28+20, 32+27, 36+28 */
60 static const uint64_t kCutoffTransforms =
61 BROTLI_MAKE_UINT64_T(0x071B520A, 0xDA2D3200);
62
63 typedef struct HasherSearchResult {
64 size_t len;
65 size_t len_x_code; /* == len ^ len_code */
66 size_t distance;
67 score_t score;
68 } HasherSearchResult;
69
70 /* kHashMul32 multiplier has these properties:
71 * The multiplier must be odd. Otherwise we may lose the highest bit.
72 * No long streaks of ones or zeros.
73 * There is no effort to ensure that it is a prime, the oddity is enough
74 for this use.
75 * The number has been tuned heuristically against compression benchmarks. */
76 static const uint32_t kHashMul32 = 0x1e35a7bd;
77 static const uint64_t kHashMul64 = BROTLI_MAKE_UINT64_T(0x1e35a7bd, 0x1e35a7bd);
78 static const uint64_t kHashMul64Long =
79 BROTLI_MAKE_UINT64_T(0x1fe35a7bU, 0xd3579bd3U);
80
Hash14(const uint8_t * data)81 static BROTLI_INLINE uint32_t Hash14(const uint8_t* data) {
82 uint32_t h = BROTLI_UNALIGNED_LOAD32(data) * kHashMul32;
83 /* The higher bits contain more mixture from the multiplication,
84 so we take our results from there. */
85 return h >> (32 - 14);
86 }
87
PrepareDistanceCache(int * BROTLI_RESTRICT distance_cache,const int num_distances)88 static BROTLI_INLINE void PrepareDistanceCache(
89 int* BROTLI_RESTRICT distance_cache, const int num_distances) {
90 if (num_distances > 4) {
91 int last_distance = distance_cache[0];
92 distance_cache[4] = last_distance - 1;
93 distance_cache[5] = last_distance + 1;
94 distance_cache[6] = last_distance - 2;
95 distance_cache[7] = last_distance + 2;
96 distance_cache[8] = last_distance - 3;
97 distance_cache[9] = last_distance + 3;
98 if (num_distances > 10) {
99 int next_last_distance = distance_cache[1];
100 distance_cache[10] = next_last_distance - 1;
101 distance_cache[11] = next_last_distance + 1;
102 distance_cache[12] = next_last_distance - 2;
103 distance_cache[13] = next_last_distance + 2;
104 distance_cache[14] = next_last_distance - 3;
105 distance_cache[15] = next_last_distance + 3;
106 }
107 }
108 }
109
110 #define BROTLI_LITERAL_BYTE_SCORE 135
111 #define BROTLI_DISTANCE_BIT_PENALTY 30
112 /* Score must be positive after applying maximal penalty. */
113 #define BROTLI_SCORE_BASE (BROTLI_DISTANCE_BIT_PENALTY * 8 * sizeof(size_t))
114
115 /* Usually, we always choose the longest backward reference. This function
116 allows for the exception of that rule.
117
118 If we choose a backward reference that is further away, it will
119 usually be coded with more bits. We approximate this by assuming
120 log2(distance). If the distance can be expressed in terms of the
121 last four distances, we use some heuristic constants to estimate
122 the bits cost. For the first up to four literals we use the bit
123 cost of the literals from the literal cost model, after that we
124 use the average bit cost of the cost model.
125
126 This function is used to sometimes discard a longer backward reference
127 when it is not much longer and the bit cost for encoding it is more
128 than the saved literals.
129
130 backward_reference_offset MUST be positive. */
BackwardReferenceScore(size_t copy_length,size_t backward_reference_offset)131 static BROTLI_INLINE score_t BackwardReferenceScore(
132 size_t copy_length, size_t backward_reference_offset) {
133 return BROTLI_SCORE_BASE + BROTLI_LITERAL_BYTE_SCORE * (score_t)copy_length -
134 BROTLI_DISTANCE_BIT_PENALTY * Log2FloorNonZero(backward_reference_offset);
135 }
136
BackwardReferenceScoreUsingLastDistance(size_t copy_length)137 static BROTLI_INLINE score_t BackwardReferenceScoreUsingLastDistance(
138 size_t copy_length) {
139 return BROTLI_LITERAL_BYTE_SCORE * (score_t)copy_length +
140 BROTLI_SCORE_BASE + 15;
141 }
142
BackwardReferencePenaltyUsingLastDistance(size_t distance_short_code)143 static BROTLI_INLINE score_t BackwardReferencePenaltyUsingLastDistance(
144 size_t distance_short_code) {
145 return (score_t)39 + ((0x1CA10 >> (distance_short_code & 0xE)) & 0xE);
146 }
147
TestStaticDictionaryItem(const BrotliDictionary * dictionary,size_t item,const uint8_t * data,size_t max_length,size_t max_backward,HasherSearchResult * out)148 static BROTLI_INLINE BROTLI_BOOL TestStaticDictionaryItem(
149 const BrotliDictionary* dictionary, size_t item, const uint8_t* data,
150 size_t max_length, size_t max_backward, HasherSearchResult* out) {
151 size_t len;
152 size_t dist;
153 size_t offset;
154 size_t matchlen;
155 size_t backward;
156 score_t score;
157 len = item & 0x1F;
158 dist = item >> 5;
159 offset = dictionary->offsets_by_length[len] + len * dist;
160 if (len > max_length) {
161 return BROTLI_FALSE;
162 }
163
164 matchlen =
165 FindMatchLengthWithLimit(data, &dictionary->data[offset], len);
166 if (matchlen + kCutoffTransformsCount <= len || matchlen == 0) {
167 return BROTLI_FALSE;
168 }
169 {
170 size_t cut = len - matchlen;
171 size_t transform_id =
172 (cut << 2) + (size_t)((kCutoffTransforms >> (cut * 6)) & 0x3F);
173 backward = max_backward + dist + 1 +
174 (transform_id << dictionary->size_bits_by_length[len]);
175 }
176 score = BackwardReferenceScore(matchlen, backward);
177 if (score < out->score) {
178 return BROTLI_FALSE;
179 }
180 out->len = matchlen;
181 out->len_x_code = len ^ matchlen;
182 out->distance = backward;
183 out->score = score;
184 return BROTLI_TRUE;
185 }
186
SearchInStaticDictionary(const BrotliDictionary * dictionary,const uint16_t * dictionary_hash,HasherHandle handle,const uint8_t * data,size_t max_length,size_t max_backward,HasherSearchResult * out,BROTLI_BOOL shallow)187 static BROTLI_INLINE BROTLI_BOOL SearchInStaticDictionary(
188 const BrotliDictionary* dictionary, const uint16_t* dictionary_hash,
189 HasherHandle handle, const uint8_t* data, size_t max_length,
190 size_t max_backward, HasherSearchResult* out, BROTLI_BOOL shallow) {
191 size_t key;
192 size_t i;
193 BROTLI_BOOL is_match_found = BROTLI_FALSE;
194 HasherCommon* self = GetHasherCommon(handle);
195 if (self->dict_num_matches < (self->dict_num_lookups >> 7)) {
196 return BROTLI_FALSE;
197 }
198 key = Hash14(data) << 1;
199 for (i = 0; i < (shallow ? 1u : 2u); ++i, ++key) {
200 size_t item = dictionary_hash[key];
201 self->dict_num_lookups++;
202 if (item != 0) {
203 BROTLI_BOOL item_matches = TestStaticDictionaryItem(
204 dictionary, item, data, max_length, max_backward, out);
205 if (item_matches) {
206 self->dict_num_matches++;
207 is_match_found = BROTLI_TRUE;
208 }
209 }
210 }
211 return is_match_found;
212 }
213
214 typedef struct BackwardMatch {
215 uint32_t distance;
216 uint32_t length_and_code;
217 } BackwardMatch;
218
InitBackwardMatch(BackwardMatch * self,size_t dist,size_t len)219 static BROTLI_INLINE void InitBackwardMatch(BackwardMatch* self,
220 size_t dist, size_t len) {
221 self->distance = (uint32_t)dist;
222 self->length_and_code = (uint32_t)(len << 5);
223 }
224
InitDictionaryBackwardMatch(BackwardMatch * self,size_t dist,size_t len,size_t len_code)225 static BROTLI_INLINE void InitDictionaryBackwardMatch(BackwardMatch* self,
226 size_t dist, size_t len, size_t len_code) {
227 self->distance = (uint32_t)dist;
228 self->length_and_code =
229 (uint32_t)((len << 5) | (len == len_code ? 0 : len_code));
230 }
231
BackwardMatchLength(const BackwardMatch * self)232 static BROTLI_INLINE size_t BackwardMatchLength(const BackwardMatch* self) {
233 return self->length_and_code >> 5;
234 }
235
BackwardMatchLengthCode(const BackwardMatch * self)236 static BROTLI_INLINE size_t BackwardMatchLengthCode(const BackwardMatch* self) {
237 size_t code = self->length_and_code & 31;
238 return code ? code : BackwardMatchLength(self);
239 }
240
241 #define EXPAND_CAT(a, b) CAT(a, b)
242 #define CAT(a, b) a ## b
243 #define FN(X) EXPAND_CAT(X, HASHER())
244
245 #define HASHER() H10
246 #define BUCKET_BITS 17
247 #define MAX_TREE_SEARCH_DEPTH 64
248 #define MAX_TREE_COMP_LENGTH 128
249 #include "./hash_to_binary_tree_inc.h" /* NOLINT(build/include) */
250 #undef MAX_TREE_SEARCH_DEPTH
251 #undef MAX_TREE_COMP_LENGTH
252 #undef BUCKET_BITS
253 #undef HASHER
254 /* MAX_NUM_MATCHES == 64 + MAX_TREE_SEARCH_DEPTH */
255 #define MAX_NUM_MATCHES_H10 128
256
257 /* For BUCKET_SWEEP == 1, enabling the dictionary lookup makes compression
258 a little faster (0.5% - 1%) and it compresses 0.15% better on small text
259 and HTML inputs. */
260
261 #define HASHER() H2
262 #define BUCKET_BITS 16
263 #define BUCKET_SWEEP 1
264 #define HASH_LEN 5
265 #define USE_DICTIONARY 1
266 #include "./hash_longest_match_quickly_inc.h" /* NOLINT(build/include) */
267 #undef BUCKET_SWEEP
268 #undef USE_DICTIONARY
269 #undef HASHER
270
271 #define HASHER() H3
272 #define BUCKET_SWEEP 2
273 #define USE_DICTIONARY 0
274 #include "./hash_longest_match_quickly_inc.h" /* NOLINT(build/include) */
275 #undef USE_DICTIONARY
276 #undef BUCKET_SWEEP
277 #undef BUCKET_BITS
278 #undef HASHER
279
280 #define HASHER() H4
281 #define BUCKET_BITS 17
282 #define BUCKET_SWEEP 4
283 #define USE_DICTIONARY 1
284 #include "./hash_longest_match_quickly_inc.h" /* NOLINT(build/include) */
285 #undef USE_DICTIONARY
286 #undef HASH_LEN
287 #undef BUCKET_SWEEP
288 #undef BUCKET_BITS
289 #undef HASHER
290
291 #define HASHER() H5
292 #include "./hash_longest_match_inc.h" /* NOLINT(build/include) */
293 #undef HASHER
294
295 #define HASHER() H6
296 #include "./hash_longest_match64_inc.h" /* NOLINT(build/include) */
297 #undef HASHER
298
299 #define BUCKET_BITS 15
300
301 #define NUM_LAST_DISTANCES_TO_CHECK 4
302 #define NUM_BANKS 1
303 #define BANK_BITS 16
304 #define HASHER() H40
305 #include "./hash_forgetful_chain_inc.h" /* NOLINT(build/include) */
306 #undef HASHER
307 #undef NUM_LAST_DISTANCES_TO_CHECK
308
309 #define NUM_LAST_DISTANCES_TO_CHECK 10
310 #define HASHER() H41
311 #include "./hash_forgetful_chain_inc.h" /* NOLINT(build/include) */
312 #undef HASHER
313 #undef NUM_LAST_DISTANCES_TO_CHECK
314 #undef NUM_BANKS
315 #undef BANK_BITS
316
317 #define NUM_LAST_DISTANCES_TO_CHECK 16
318 #define NUM_BANKS 512
319 #define BANK_BITS 9
320 #define HASHER() H42
321 #include "./hash_forgetful_chain_inc.h" /* NOLINT(build/include) */
322 #undef HASHER
323 #undef NUM_LAST_DISTANCES_TO_CHECK
324 #undef NUM_BANKS
325 #undef BANK_BITS
326
327 #undef BUCKET_BITS
328
329 #define HASHER() H54
330 #define BUCKET_BITS 20
331 #define BUCKET_SWEEP 4
332 #define HASH_LEN 7
333 #define USE_DICTIONARY 0
334 #include "./hash_longest_match_quickly_inc.h" /* NOLINT(build/include) */
335 #undef USE_DICTIONARY
336 #undef HASH_LEN
337 #undef BUCKET_SWEEP
338 #undef BUCKET_BITS
339 #undef HASHER
340
341 #undef FN
342 #undef CAT
343 #undef EXPAND_CAT
344
345 #define FOR_GENERIC_HASHERS(H) H(2) H(3) H(4) H(5) H(6) H(40) H(41) H(42) H(54)
346 #define FOR_ALL_HASHERS(H) FOR_GENERIC_HASHERS(H) H(10)
347
DestroyHasher(MemoryManager * m,HasherHandle * handle)348 static BROTLI_INLINE void DestroyHasher(
349 MemoryManager* m, HasherHandle* handle) {
350 if (*handle == NULL) return;
351 BROTLI_FREE(m, *handle);
352 }
353
HasherReset(HasherHandle handle)354 static BROTLI_INLINE void HasherReset(HasherHandle handle) {
355 if (handle == NULL) return;
356 GetHasherCommon(handle)->is_prepared_ = BROTLI_FALSE;
357 }
358
HasherSize(const BrotliEncoderParams * params,BROTLI_BOOL one_shot,const size_t input_size)359 static BROTLI_INLINE size_t HasherSize(const BrotliEncoderParams* params,
360 BROTLI_BOOL one_shot, const size_t input_size) {
361 size_t result = sizeof(HasherCommon);
362 switch (params->hasher.type) {
363 #define SIZE_(N) \
364 case N: \
365 result += HashMemAllocInBytesH ## N(params, one_shot, input_size); \
366 break;
367 FOR_ALL_HASHERS(SIZE_)
368 #undef SIZE_
369 default:
370 break;
371 }
372 return result;
373 }
374
HasherSetup(MemoryManager * m,HasherHandle * handle,BrotliEncoderParams * params,const uint8_t * data,size_t position,size_t input_size,BROTLI_BOOL is_last)375 static BROTLI_INLINE void HasherSetup(MemoryManager* m, HasherHandle* handle,
376 BrotliEncoderParams* params, const uint8_t* data, size_t position,
377 size_t input_size, BROTLI_BOOL is_last) {
378 HasherHandle self = NULL;
379 HasherCommon* common = NULL;
380 BROTLI_BOOL one_shot = (position == 0 && is_last);
381 if (*handle == NULL) {
382 size_t alloc_size;
383 ChooseHasher(params, ¶ms->hasher);
384 alloc_size = HasherSize(params, one_shot, input_size);
385 self = BROTLI_ALLOC(m, uint8_t, alloc_size);
386 if (BROTLI_IS_OOM(m)) return;
387 *handle = self;
388 common = GetHasherCommon(self);
389 common->params = params->hasher;
390 switch (common->params.type) {
391 #define INITIALIZE_(N) \
392 case N: \
393 InitializeH ## N(*handle, params); \
394 break;
395 FOR_ALL_HASHERS(INITIALIZE_);
396 #undef INITIALIZE_
397 default:
398 break;
399 }
400 HasherReset(*handle);
401 }
402
403 self = *handle;
404 common = GetHasherCommon(self);
405 if (!common->is_prepared_) {
406 switch (common->params.type) {
407 #define PREPARE_(N) \
408 case N: \
409 PrepareH ## N(self, one_shot, input_size, data); \
410 break;
411 FOR_ALL_HASHERS(PREPARE_)
412 #undef PREPARE_
413 default: break;
414 }
415 if (position == 0) {
416 common->dict_num_lookups = 0;
417 common->dict_num_matches = 0;
418 }
419 common->is_prepared_ = BROTLI_TRUE;
420 }
421 }
422
423 /* Custom LZ77 window. */
HasherPrependCustomDictionary(MemoryManager * m,HasherHandle * handle,BrotliEncoderParams * params,const size_t size,const uint8_t * dict)424 static BROTLI_INLINE void HasherPrependCustomDictionary(
425 MemoryManager* m, HasherHandle* handle, BrotliEncoderParams* params,
426 const size_t size, const uint8_t* dict) {
427 size_t overlap;
428 size_t i;
429 HasherHandle self;
430 HasherSetup(m, handle, params, dict, 0, size, BROTLI_FALSE);
431 if (BROTLI_IS_OOM(m)) return;
432 self = *handle;
433 switch (GetHasherCommon(self)->params.type) {
434 #define PREPEND_(N) \
435 case N: \
436 overlap = (StoreLookaheadH ## N()) - 1; \
437 for (i = 0; i + overlap < size; i++) { \
438 StoreH ## N(self, dict, ~(size_t)0, i); \
439 } \
440 break;
441 FOR_ALL_HASHERS(PREPEND_)
442 #undef PREPEND_
443 default: break;
444 }
445 }
446
InitOrStitchToPreviousBlock(MemoryManager * m,HasherHandle * handle,const uint8_t * data,size_t mask,BrotliEncoderParams * params,size_t position,size_t input_size,BROTLI_BOOL is_last)447 static BROTLI_INLINE void InitOrStitchToPreviousBlock(
448 MemoryManager* m, HasherHandle* handle, const uint8_t* data, size_t mask,
449 BrotliEncoderParams* params, size_t position, size_t input_size,
450 BROTLI_BOOL is_last) {
451 HasherHandle self;
452 HasherSetup(m, handle, params, data, position, input_size, is_last);
453 if (BROTLI_IS_OOM(m)) return;
454 self = *handle;
455 switch (GetHasherCommon(self)->params.type) {
456 #define INIT_(N) \
457 case N: \
458 StitchToPreviousBlockH ## N(self, input_size, position, data, mask); \
459 break;
460 FOR_ALL_HASHERS(INIT_)
461 #undef INIT_
462 default: break;
463 }
464 }
465
466 #if defined(__cplusplus) || defined(c_plusplus)
467 } /* extern "C" */
468 #endif
469
470 #endif /* BROTLI_ENC_HASH_H_ */
471