• Home
  • Line#
  • Scopes#
  • Navigate#
  • Raw
  • Download
1 // Copyright 2010 Google Inc. All Rights Reserved.
2 //
3 // Licensed under the Apache License, Version 2.0 (the "License");
4 // you may not use this file except in compliance with the License.
5 // You may obtain a copy of the License at
6 //
7 // http://www.apache.org/licenses/LICENSE-2.0
8 //
9 // Unless required by applicable law or agreed to in writing, software
10 // distributed under the License is distributed on an "AS IS" BASIS,
11 // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12 // See the License for the specific language governing permissions and
13 // limitations under the License.
14 //
15 // A (forgetful) hash table to the data seen by the compressor, to
16 // help create backward references to previous data.
17 
18 #ifndef BROTLI_ENC_HASH_H_
19 #define BROTLI_ENC_HASH_H_
20 
21 #include <stddef.h>
22 #include <stdint.h>
23 #include <string.h>
24 #include <sys/types.h>
25 #include <algorithm>
26 #include <cstdlib>
27 #include <memory>
28 #include <string>
29 
30 #include "./transform.h"
31 #include "./fast_log.h"
32 #include "./find_match_length.h"
33 #include "./port.h"
34 #include "./static_dict.h"
35 
36 namespace brotli {
37 
38 // kHashMul32 multiplier has these properties:
39 // * The multiplier must be odd. Otherwise we may lose the highest bit.
40 // * No long streaks of 1s or 0s.
41 // * There is no effort to ensure that it is a prime, the oddity is enough
42 //   for this use.
43 // * The number has been tuned heuristically against compression benchmarks.
44 static const uint32_t kHashMul32 = 0x1e35a7bd;
45 
46 template<int kShiftBits, int kMinLength>
Hash(const uint8_t * data)47 inline uint32_t Hash(const uint8_t *data) {
48   if (kMinLength <= 3) {
49     // If kMinLength is 2 or 3, we hash the first 3 bytes of data.
50     uint32_t h = (BROTLI_UNALIGNED_LOAD32(data) & 0xffffff) * kHashMul32;
51     // The higher bits contain more mixture from the multiplication,
52     // so we take our results from there.
53     return h >> (32 - kShiftBits);
54   } else {
55     // If kMinLength is at least 4, we hash the first 4 bytes of data.
56     uint32_t h = BROTLI_UNALIGNED_LOAD32(data) * kHashMul32;
57     // The higher bits contain more mixture from the multiplication,
58     // so we take our results from there.
59     return h >> (32 - kShiftBits);
60   }
61 }
62 
63 // Usually, we always choose the longest backward reference. This function
64 // allows for the exception of that rule.
65 //
66 // If we choose a backward reference that is further away, it will
67 // usually be coded with more bits. We approximate this by assuming
68 // log2(distance). If the distance can be expressed in terms of the
69 // last four distances, we use some heuristic constants to estimate
70 // the bits cost. For the first up to four literals we use the bit
71 // cost of the literals from the literal cost model, after that we
72 // use the average bit cost of the cost model.
73 //
74 // This function is used to sometimes discard a longer backward reference
75 // when it is not much longer and the bit cost for encoding it is more
76 // than the saved literals.
BackwardReferenceScore(double average_cost,double start_cost4,double start_cost3,double start_cost2,int copy_length,int backward_reference_offset)77 inline double BackwardReferenceScore(double average_cost,
78                                      double start_cost4,
79                                      double start_cost3,
80                                      double start_cost2,
81                                      int copy_length,
82                                      int backward_reference_offset) {
83   double retval = 0;
84   switch (copy_length) {
85     case 2: retval = start_cost2; break;
86     case 3: retval = start_cost3; break;
87     default: retval = start_cost4 + (copy_length - 4) * average_cost; break;
88   }
89   retval -= 1.20 * Log2Floor(backward_reference_offset);
90   return retval;
91 }
92 
BackwardReferenceScoreUsingLastDistance(double average_cost,double start_cost4,double start_cost3,double start_cost2,int copy_length,int distance_short_code)93 inline double BackwardReferenceScoreUsingLastDistance(double average_cost,
94                                                       double start_cost4,
95                                                       double start_cost3,
96                                                       double start_cost2,
97                                                       int copy_length,
98                                                       int distance_short_code) {
99   double retval = 0;
100   switch (copy_length) {
101     case 2: retval = start_cost2; break;
102     case 3: retval = start_cost3; break;
103     default: retval = start_cost4 + (copy_length - 4) * average_cost; break;
104   }
105   static const double kDistanceShortCodeBitCost[16] = {
106     -0.6, 0.95, 1.17, 1.27,
107     0.93, 0.93, 0.96, 0.96, 0.99, 0.99,
108     1.05, 1.05, 1.15, 1.15, 1.25, 1.25
109   };
110   retval -= kDistanceShortCodeBitCost[distance_short_code];
111   return retval;
112 }
113 
114 // A (forgetful) hash table to the data seen by the compressor, to
115 // help create backward references to previous data.
116 //
117 // This is a hash map of fixed size (kBucketSize) to a ring buffer of
118 // fixed size (kBlockSize). The ring buffer contains the last kBlockSize
119 // index positions of the given hash key in the compressed data.
120 template <int kBucketBits, int kBlockBits, int kMinLength>
121 class HashLongestMatch {
122  public:
HashLongestMatch()123   HashLongestMatch()
124       : last_distance1_(4),
125         last_distance2_(11),
126         last_distance3_(15),
127         last_distance4_(16),
128         insert_length_(0),
129         average_cost_(5.4),
130         static_dict_(NULL) {
131     Reset();
132   }
Reset()133   void Reset() {
134     std::fill(&num_[0], &num_[sizeof(num_) / sizeof(num_[0])], 0);
135   }
SetStaticDictionary(const StaticDictionary * dict)136   void SetStaticDictionary(const StaticDictionary *dict) {
137     static_dict_ = dict;
138   }
HasStaticDictionary()139   bool HasStaticDictionary() const {
140     return static_dict_ != NULL;
141   }
142 
143   // Look at 3 bytes at data.
144   // Compute a hash from these, and store the value of ix at that position.
Store(const uint8_t * data,const int ix)145   inline void Store(const uint8_t *data, const int ix) {
146     const uint32_t key = Hash<kBucketBits, kMinLength>(data);
147     const int minor_ix = num_[key] & kBlockMask;
148     buckets_[key][minor_ix] = ix;
149     ++num_[key];
150   }
151 
152   // Store hashes for a range of data.
StoreHashes(const uint8_t * data,size_t len,int startix,int mask)153   void StoreHashes(const uint8_t *data, size_t len, int startix, int mask) {
154     for (int p = 0; p < len; ++p) {
155       Store(&data[p & mask], startix + p);
156     }
157   }
158 
159   // Find a longest backward match of &data[cur_ix] up to the length of
160   // max_length.
161   //
162   // Does not look for matches longer than max_length.
163   // Does not look for matches further away than max_backward.
164   // Writes the best found match length into best_len_out.
165   // Writes the index (&data[index]) offset from the start of the best match
166   // into best_distance_out.
167   // Write the score of the best match into best_score_out.
FindLongestMatch(const uint8_t * __restrict data,const float * __restrict literal_cost,const size_t ring_buffer_mask,const uint32_t cur_ix,uint32_t max_length,const uint32_t max_backward,size_t * __restrict best_len_out,size_t * __restrict best_len_code_out,size_t * __restrict best_distance_out,double * __restrict best_score_out,bool * __restrict in_dictionary)168   bool FindLongestMatch(const uint8_t * __restrict data,
169                         const float * __restrict literal_cost,
170                         const size_t ring_buffer_mask,
171                         const uint32_t cur_ix,
172                         uint32_t max_length,
173                         const uint32_t max_backward,
174                         size_t * __restrict best_len_out,
175                         size_t * __restrict best_len_code_out,
176                         size_t * __restrict best_distance_out,
177                         double * __restrict best_score_out,
178                         bool * __restrict in_dictionary) {
179     *in_dictionary = true;
180     *best_len_code_out = 0;
181     const size_t cur_ix_masked = cur_ix & ring_buffer_mask;
182     const double start_cost4 = literal_cost == NULL ? 20 :
183         literal_cost[cur_ix_masked] +
184         literal_cost[(cur_ix + 1) & ring_buffer_mask] +
185         literal_cost[(cur_ix + 2) & ring_buffer_mask] +
186         literal_cost[(cur_ix + 3) & ring_buffer_mask];
187     const double start_cost3 = literal_cost == NULL ? 15 :
188         literal_cost[cur_ix_masked] +
189         literal_cost[(cur_ix + 1) & ring_buffer_mask] +
190         literal_cost[(cur_ix + 2) & ring_buffer_mask] + 0.3;
191     double start_cost2 = literal_cost == NULL ? 10 :
192         literal_cost[cur_ix_masked] +
193         literal_cost[(cur_ix + 1) & ring_buffer_mask] + 1.2;
194     bool match_found = false;
195     // Don't accept a short copy from far away.
196     double best_score = 8.115;
197     if (insert_length_ < 4) {
198       double cost_diff[4] = { 0.10, 0.04, 0.02, 0.01 };
199       best_score += cost_diff[insert_length_];
200     }
201     size_t best_len = *best_len_out;
202     *best_len_out = 0;
203     size_t best_ix = 1;
204     // Try last distance first.
205     for (int i = 0; i < 16; ++i) {
206       size_t prev_ix = cur_ix;
207       switch(i) {
208         case 0: prev_ix -= last_distance1_; break;
209         case 1: prev_ix -= last_distance2_; break;
210         case 2: prev_ix -= last_distance3_; break;
211         case 3: prev_ix -= last_distance4_; break;
212 
213         case 4: prev_ix -= last_distance1_ - 1; break;
214         case 5: prev_ix -= last_distance1_ + 1; break;
215         case 6: prev_ix -= last_distance1_ - 2; break;
216         case 7: prev_ix -= last_distance1_ + 2; break;
217         case 8: prev_ix -= last_distance1_ - 3; break;
218         case 9: prev_ix -= last_distance1_ + 3; break;
219 
220         case 10: prev_ix -= last_distance2_ - 1; break;
221         case 11: prev_ix -= last_distance2_ + 1; break;
222         case 12: prev_ix -= last_distance2_ - 2; break;
223         case 13: prev_ix -= last_distance2_ + 2; break;
224         case 14: prev_ix -= last_distance2_ - 3; break;
225         case 15: prev_ix -= last_distance2_ + 3; break;
226       }
227       if (prev_ix >= cur_ix) {
228         continue;
229       }
230       const size_t backward = cur_ix - prev_ix;
231       if (PREDICT_FALSE(backward > max_backward)) {
232         continue;
233       }
234       prev_ix &= ring_buffer_mask;
235       if (cur_ix_masked + best_len > ring_buffer_mask ||
236           prev_ix + best_len > ring_buffer_mask ||
237           data[cur_ix_masked + best_len] != data[prev_ix + best_len]) {
238         continue;
239       }
240       const size_t len =
241           FindMatchLengthWithLimit(&data[prev_ix], &data[cur_ix_masked],
242                                    max_length);
243       if (len >= std::max(kMinLength, 3) ||
244           (kMinLength == 2 && len == 2 && i < 2)) {
245         // Comparing for >= 2 does not change the semantics, but just saves for
246         // a few unnecessary binary logarithms in backward reference score,
247         // since we are not interested in such short matches.
248         const double score = BackwardReferenceScoreUsingLastDistance(
249             average_cost_,
250             start_cost4,
251             start_cost3,
252             start_cost2,
253             len, i);
254         if (best_score < score) {
255           best_score = score;
256           best_len = len;
257           best_ix = backward;
258           *best_len_out = best_len;
259           *best_len_code_out = best_len;
260           *best_distance_out = best_ix;
261           *best_score_out = best_score;
262           match_found = true;
263           *in_dictionary = backward > max_backward;
264         }
265       }
266     }
267     if (kMinLength == 2) {
268       int stop = int(cur_ix) - 64;
269       if (stop < 0) { stop = 0; }
270       start_cost2 -= 1.0;
271       for (int i = cur_ix - 1; i > stop; --i) {
272         size_t prev_ix = i;
273         const size_t backward = cur_ix - prev_ix;
274         if (PREDICT_FALSE(backward > max_backward)) {
275           break;
276         }
277         prev_ix &= ring_buffer_mask;
278         if (data[cur_ix_masked] != data[prev_ix] ||
279             data[cur_ix_masked + 1] != data[prev_ix + 1]) {
280           continue;
281         }
282         int len = 2;
283         const double score = start_cost2 - 2.3 * Log2Floor(backward);
284 
285         if (best_score < score) {
286           best_score = score;
287           best_len = len;
288           best_ix = backward;
289           *best_len_out = best_len;
290           *best_len_code_out = best_len;
291           *best_distance_out = best_ix;
292           match_found = true;
293         }
294       }
295     }
296     const uint32_t key = Hash<kBucketBits, kMinLength>(&data[cur_ix_masked]);
297     const int * __restrict const bucket = &buckets_[key][0];
298     const int down = (num_[key] > kBlockSize) ? (num_[key] - kBlockSize) : 0;
299     for (int i = num_[key] - 1; i >= down; --i) {
300       int prev_ix = bucket[i & kBlockMask];
301       if (prev_ix >= 0) {
302         const size_t backward = cur_ix - prev_ix;
303         if (PREDICT_FALSE(backward > max_backward)) {
304           break;
305         }
306         prev_ix &= ring_buffer_mask;
307         if (cur_ix_masked + best_len > ring_buffer_mask ||
308             prev_ix + best_len > ring_buffer_mask ||
309             data[cur_ix_masked + best_len] != data[prev_ix + best_len]) {
310           continue;
311         }
312         const size_t len =
313             FindMatchLengthWithLimit(&data[prev_ix], &data[cur_ix_masked],
314                                      max_length);
315         if (len >= std::max(kMinLength, 3)) {
316           // Comparing for >= 3 does not change the semantics, but just saves
317           // for a few unnecessary binary logarithms in backward reference
318           // score, since we are not interested in such short matches.
319           const double score = BackwardReferenceScore(average_cost_,
320                                                       start_cost4,
321                                                       start_cost3,
322                                                       start_cost2,
323                                                       len, backward);
324           if (best_score < score) {
325             best_score = score;
326             best_len = len;
327             best_ix = backward;
328             *best_len_out = best_len;
329             *best_len_code_out = best_len;
330             *best_distance_out = best_ix;
331             *best_score_out = best_score;
332             match_found = true;
333             *in_dictionary = false;
334           }
335         }
336       }
337     }
338     if (static_dict_ != NULL) {
339       // We decide based on first 4 bytes how many bytes to test for.
340       int prefix = BROTLI_UNALIGNED_LOAD32(&data[cur_ix_masked]);
341       int maxlen = static_dict_->GetLength(prefix);
342       for (int len = std::min<size_t>(maxlen, max_length);
343            len > best_len && len >= 4; --len) {
344         std::string snippet((const char *)&data[cur_ix_masked], len);
345         int copy_len_code;
346         int word_id;
347         if (static_dict_->Get(snippet, &copy_len_code, &word_id)) {
348           const size_t backward = max_backward + word_id + 1;
349           const double score = BackwardReferenceScore(average_cost_,
350                                                       start_cost4,
351                                                       start_cost3,
352                                                       start_cost2,
353                                                       len, backward);
354           if (best_score < score) {
355             best_score = score;
356             best_len = len;
357             best_ix = backward;
358             *best_len_out = best_len;
359             *best_len_code_out = copy_len_code;
360             *best_distance_out = best_ix;
361             *best_score_out = best_score;
362             match_found = true;
363             *in_dictionary = true;
364           }
365         }
366       }
367     }
368     return match_found;
369   }
370 
set_last_distance(int v)371   void set_last_distance(int v) {
372     if (last_distance1_ != v) {
373       last_distance4_ = last_distance3_;
374       last_distance3_ = last_distance2_;
375       last_distance2_ = last_distance1_;
376       last_distance1_ = v;
377     }
378   }
379 
last_distance()380   int last_distance() const { return last_distance1_; }
381 
set_insert_length(int v)382   void set_insert_length(int v) { insert_length_ = v; }
383 
set_average_cost(double v)384   void set_average_cost(double v) { average_cost_ = v; }
385 
386  private:
387   // Number of hash buckets.
388   static const uint32_t kBucketSize = 1 << kBucketBits;
389 
390   // Only kBlockSize newest backward references are kept,
391   // and the older are forgotten.
392   static const uint32_t kBlockSize = 1 << kBlockBits;
393 
394   // Mask for accessing entries in a block (in a ringbuffer manner).
395   static const uint32_t kBlockMask = (1 << kBlockBits) - 1;
396 
397   // Number of entries in a particular bucket.
398   uint16_t num_[kBucketSize];
399 
400   // Buckets containing kBlockSize of backward references.
401   int buckets_[kBucketSize][kBlockSize];
402 
403   int last_distance1_;
404   int last_distance2_;
405   int last_distance3_;
406   int last_distance4_;
407 
408   // Cost adjustment for how many literals we are planning to insert
409   // anyway.
410   int insert_length_;
411 
412   double average_cost_;
413 
414   const StaticDictionary *static_dict_;
415 };
416 
417 struct Hashers {
418   enum Type {
419     HASH_15_8_4 = 0,
420     HASH_15_8_2 = 1,
421   };
422 
InitHashers423   void Init(Type type) {
424     switch (type) {
425       case HASH_15_8_4:
426         hash_15_8_4.reset(new HashLongestMatch<15, 8, 4>());
427         break;
428       case HASH_15_8_2:
429         hash_15_8_2.reset(new HashLongestMatch<15, 8, 2>());
430         break;
431       default:
432         break;
433     }
434   }
435 
SetStaticDictionaryHashers436   void SetStaticDictionary(const StaticDictionary *dict) {
437     if (hash_15_8_4.get() != NULL) hash_15_8_4->SetStaticDictionary(dict);
438     if (hash_15_8_2.get() != NULL) hash_15_8_2->SetStaticDictionary(dict);
439   }
440 
441   std::unique_ptr<HashLongestMatch<15, 8, 4> > hash_15_8_4;
442   std::unique_ptr<HashLongestMatch<15, 8, 2> > hash_15_8_2;
443 };
444 
445 }  // namespace brotli
446 
447 #endif  // BROTLI_ENC_HASH_H_
448