• Home
  • Line#
  • Scopes#
  • Navigate#
  • Raw
  • Download
1 // Copyright 2013 Google Inc. All Rights Reserved.
2 //
3 // Licensed under the Apache License, Version 2.0 (the "License");
4 // you may not use this file except in compliance with the License.
5 // You may obtain a copy of the License at
6 //
7 // http://www.apache.org/licenses/LICENSE-2.0
8 //
9 // Unless required by applicable law or agreed to in writing, software
10 // distributed under the License is distributed on an "AS IS" BASIS,
11 // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12 // See the License for the specific language governing permissions and
13 // limitations under the License.
14 //
15 // Implementation of Brotli compressor.
16 
17 #include "./encode.h"
18 
19 #include <algorithm>
20 #include <limits>
21 
22 #include "./backward_references.h"
23 #include "./bit_cost.h"
24 #include "./block_splitter.h"
25 #include "./cluster.h"
26 #include "./context.h"
27 #include "./transform.h"
28 #include "./entropy_encode.h"
29 #include "./fast_log.h"
30 #include "./hash.h"
31 #include "./histogram.h"
32 #include "./literal_cost.h"
33 #include "./prefix.h"
34 #include "./write_bits.h"
35 
36 namespace brotli {
37 
38 static const int kWindowBits = 22;
39 // To make decoding faster, we allow the decoder to write 16 bytes ahead in
40 // its ringbuffer, therefore the encoder has to decrease max distance by this
41 // amount.
42 static const int kDecoderRingBufferWriteAheadSlack = 16;
43 static const int kMaxBackwardDistance =
44     (1 << kWindowBits) - kDecoderRingBufferWriteAheadSlack;
45 
46 static const int kMetaBlockSizeBits = 21;
47 static const int kRingBufferBits = 23;
48 static const int kRingBufferMask = (1 << kRingBufferBits) - 1;
49 
50 template<int kSize>
Entropy(const std::vector<Histogram<kSize>> & histograms)51 double Entropy(const std::vector<Histogram<kSize> >& histograms) {
52   double retval = 0;
53   for (int i = 0; i < histograms.size(); ++i) {
54     retval += histograms[i].EntropyBitCost();
55   }
56   return retval;
57 }
58 
59 template<int kSize>
TotalBitCost(const std::vector<Histogram<kSize>> & histograms)60 double TotalBitCost(const std::vector<Histogram<kSize> >& histograms) {
61   double retval = 0;
62   for (int i = 0; i < histograms.size(); ++i) {
63     retval += PopulationCost(histograms[i]);
64   }
65   return retval;
66 }
67 
EncodeVarLenUint8(int n,int * storage_ix,uint8_t * storage)68 void EncodeVarLenUint8(int n, int* storage_ix, uint8_t* storage) {
69   if (n == 0) {
70     WriteBits(1, 0, storage_ix, storage);
71   } else {
72     WriteBits(1, 1, storage_ix, storage);
73     int nbits = Log2Floor(n);
74     WriteBits(3, nbits, storage_ix, storage);
75     if (nbits > 0) {
76       WriteBits(nbits, n - (1 << nbits), storage_ix, storage);
77     }
78   }
79 }
80 
ParseAsUTF8(int * symbol,const uint8_t * input,int size)81 int ParseAsUTF8(int* symbol, const uint8_t* input, int size) {
82   // ASCII
83   if ((input[0] & 0x80) == 0) {
84     *symbol = input[0];
85     if (*symbol > 0) {
86       return 1;
87     }
88   }
89   // 2-byte UTF8
90   if (size > 1 &&
91       (input[0] & 0xe0) == 0xc0 &&
92       (input[1] & 0xc0) == 0x80) {
93     *symbol = (((input[0] & 0x1f) << 6) |
94                (input[1] & 0x3f));
95     if (*symbol > 0x7f) {
96       return 2;
97     }
98   }
99   // 3-byte UFT8
100   if (size > 2 &&
101       (input[0] & 0xf0) == 0xe0 &&
102       (input[1] & 0xc0) == 0x80 &&
103       (input[2] & 0xc0) == 0x80) {
104     *symbol = (((input[0] & 0x0f) << 12) |
105                ((input[1] & 0x3f) << 6) |
106                (input[2] & 0x3f));
107     if (*symbol > 0x7ff) {
108       return 3;
109     }
110   }
111   // 4-byte UFT8
112   if (size > 3 &&
113       (input[0] & 0xf8) == 0xf0 &&
114       (input[1] & 0xc0) == 0x80 &&
115       (input[2] & 0xc0) == 0x80 &&
116       (input[3] & 0xc0) == 0x80) {
117     *symbol = (((input[0] & 0x07) << 18) |
118                ((input[1] & 0x3f) << 12) |
119                ((input[2] & 0x3f) << 6) |
120                (input[3] & 0x3f));
121     if (*symbol > 0xffff && *symbol <= 0x10ffff) {
122       return 4;
123     }
124   }
125   // Not UTF8, emit a special symbol above the UTF8-code space
126   *symbol = 0x110000 | input[0];
127   return 1;
128 }
129 
130 // Returns true if at least min_fraction of the data is UTF8-encoded.
IsMostlyUTF8(const uint8_t * data,size_t length,double min_fraction)131 bool IsMostlyUTF8(const uint8_t* data, size_t length, double min_fraction) {
132   size_t size_utf8 = 0;
133   size_t pos = 0;
134   while (pos < length) {
135     int symbol;
136     int bytes_read = ParseAsUTF8(&symbol, data + pos, length - pos);
137     pos += bytes_read;
138     if (symbol < 0x110000) size_utf8 += bytes_read;
139   }
140   return size_utf8 > min_fraction * length;
141 }
142 
EncodeMetaBlockLength(size_t meta_block_size,bool is_last,bool is_uncompressed,int * storage_ix,uint8_t * storage)143 void EncodeMetaBlockLength(size_t meta_block_size,
144                            bool is_last,
145                            bool is_uncompressed,
146                            int* storage_ix, uint8_t* storage) {
147   WriteBits(1, is_last, storage_ix, storage);
148   if (is_last) {
149     if (meta_block_size == 0) {
150       WriteBits(1, 1, storage_ix, storage);
151       return;
152     }
153     WriteBits(1, 0, storage_ix, storage);
154   }
155   --meta_block_size;
156   int num_bits = Log2Floor(meta_block_size) + 1;
157   if (num_bits < 16) {
158     num_bits = 16;
159   }
160   WriteBits(2, (num_bits - 13) >> 2, storage_ix, storage);
161   while (num_bits > 0) {
162     WriteBits(4, meta_block_size & 0xf, storage_ix, storage);
163     meta_block_size >>= 4;
164     num_bits -= 4;
165   }
166   if (!is_last) {
167     WriteBits(1, is_uncompressed, storage_ix, storage);
168   }
169 }
170 
StoreHuffmanTreeOfHuffmanTreeToBitMask(const uint8_t * code_length_bitdepth,int * storage_ix,uint8_t * storage)171 void StoreHuffmanTreeOfHuffmanTreeToBitMask(
172     const uint8_t* code_length_bitdepth,
173     int* storage_ix, uint8_t* storage) {
174   static const uint8_t kStorageOrder[kCodeLengthCodes] = {
175     1, 2, 3, 4, 0, 5, 17, 6, 16, 7, 8, 9, 10, 11, 12, 13, 14, 15,
176   };
177   // Throw away trailing zeros:
178   int codes_to_store = kCodeLengthCodes;
179   for (; codes_to_store > 0; --codes_to_store) {
180     if (code_length_bitdepth[kStorageOrder[codes_to_store - 1]] != 0) {
181       break;
182     }
183   }
184   int num_codes = 0;
185   for (int i = 0; i < codes_to_store; ++i) {
186     if (code_length_bitdepth[kStorageOrder[i]] != 0) {
187       ++num_codes;
188     }
189   }
190   if (num_codes == 1) {
191     codes_to_store = kCodeLengthCodes;
192   }
193   int skip_some = 0;  // skips none.
194   if (code_length_bitdepth[kStorageOrder[0]] == 0 &&
195       code_length_bitdepth[kStorageOrder[1]] == 0) {
196     skip_some = 2;  // skips two.
197     if (code_length_bitdepth[kStorageOrder[2]] == 0) {
198       skip_some = 3;  // skips three.
199     }
200   }
201   WriteBits(2, skip_some, storage_ix, storage);
202   for (int i = skip_some; i < codes_to_store; ++i) {
203     uint8_t len[] = { 2, 4, 3, 2, 2, 4 };
204     uint8_t bits[] = { 0, 7, 3, 2, 1, 15 };
205     int v = code_length_bitdepth[kStorageOrder[i]];
206     WriteBits(len[v], bits[v], storage_ix, storage);
207   }
208 }
209 
StoreHuffmanTreeToBitMask(const uint8_t * huffman_tree,const uint8_t * huffman_tree_extra_bits,const int huffman_tree_size,const EntropyCode<kCodeLengthCodes> & entropy,int * storage_ix,uint8_t * storage)210 void StoreHuffmanTreeToBitMask(
211     const uint8_t* huffman_tree,
212     const uint8_t* huffman_tree_extra_bits,
213     const int huffman_tree_size,
214     const EntropyCode<kCodeLengthCodes>& entropy,
215     int* storage_ix, uint8_t* storage) {
216   for (int i = 0; i < huffman_tree_size; ++i) {
217     const int ix = huffman_tree[i];
218     const int extra_bits = huffman_tree_extra_bits[i];
219     if (entropy.count_ > 1) {
220       WriteBits(entropy.depth_[ix], entropy.bits_[ix], storage_ix, storage);
221     }
222     switch (ix) {
223       case 16:
224         WriteBits(2, extra_bits, storage_ix, storage);
225         break;
226       case 17:
227         WriteBits(3, extra_bits, storage_ix, storage);
228         break;
229     }
230   }
231 }
232 
233 template<int kSize>
StoreHuffmanCodeSimple(const EntropyCode<kSize> & code,int alphabet_size,int max_bits,int * storage_ix,uint8_t * storage)234 void StoreHuffmanCodeSimple(
235     const EntropyCode<kSize>& code, int alphabet_size,
236     int max_bits, int* storage_ix, uint8_t* storage) {
237   const uint8_t *depth = &code.depth_[0];
238   int symbols[4];
239   // Quadratic sort.
240   int k, j;
241   for (k = 0; k < code.count_; ++k) {
242     symbols[k] = code.symbols_[k];
243   }
244   for (k = 0; k < code.count_; ++k) {
245     for (j = k + 1; j < code.count_; ++j) {
246       if (depth[symbols[j]] < depth[symbols[k]]) {
247         int t = symbols[k];
248         symbols[k] = symbols[j];
249         symbols[j] = t;
250       }
251     }
252   }
253   // Small tree marker to encode 1-4 symbols.
254   WriteBits(2, 1, storage_ix, storage);
255   WriteBits(2, code.count_ - 1, storage_ix, storage);
256   for (int i = 0; i < code.count_; ++i) {
257     WriteBits(max_bits, symbols[i], storage_ix, storage);
258   }
259   if (code.count_ == 4) {
260     if (depth[symbols[0]] == 2 &&
261         depth[symbols[1]] == 2 &&
262         depth[symbols[2]] == 2 &&
263         depth[symbols[3]] == 2) {
264       WriteBits(1, 0, storage_ix, storage);
265     } else {
266       WriteBits(1, 1, storage_ix, storage);
267     }
268   }
269 }
270 
271 template<int kSize>
StoreHuffmanCodeComplex(const EntropyCode<kSize> & code,int alphabet_size,int * storage_ix,uint8_t * storage)272 void StoreHuffmanCodeComplex(
273     const EntropyCode<kSize>& code, int alphabet_size,
274     int* storage_ix, uint8_t* storage) {
275   const uint8_t *depth = &code.depth_[0];
276   uint8_t huffman_tree[kSize];
277   uint8_t huffman_tree_extra_bits[kSize];
278   int huffman_tree_size = 0;
279   WriteHuffmanTree(depth,
280                    alphabet_size,
281                    &huffman_tree[0],
282                    &huffman_tree_extra_bits[0],
283                    &huffman_tree_size);
284   Histogram<kCodeLengthCodes> huffman_tree_histogram;
285   memset(huffman_tree_histogram.data_, 0, sizeof(huffman_tree_histogram.data_));
286   for (int i = 0; i < huffman_tree_size; ++i) {
287     huffman_tree_histogram.Add(huffman_tree[i]);
288   }
289   EntropyCode<kCodeLengthCodes> huffman_tree_entropy;
290   BuildEntropyCode(huffman_tree_histogram, 5, kCodeLengthCodes,
291                    &huffman_tree_entropy);
292   StoreHuffmanTreeOfHuffmanTreeToBitMask(
293       &huffman_tree_entropy.depth_[0], storage_ix, storage);
294   StoreHuffmanTreeToBitMask(&huffman_tree[0], &huffman_tree_extra_bits[0],
295                             huffman_tree_size, huffman_tree_entropy,
296                             storage_ix, storage);
297 }
298 
299 template<int kSize>
BuildAndStoreEntropyCode(const Histogram<kSize> & histogram,const int tree_limit,const int alphabet_size,EntropyCode<kSize> * code,int * storage_ix,uint8_t * storage)300 void BuildAndStoreEntropyCode(const Histogram<kSize>& histogram,
301                               const int tree_limit,
302                               const int alphabet_size,
303                               EntropyCode<kSize>* code,
304                               int* storage_ix, uint8_t* storage) {
305   memset(code->depth_, 0, sizeof(code->depth_));
306   memset(code->bits_, 0, sizeof(code->bits_));
307   memset(code->symbols_, 0, sizeof(code->symbols_));
308   code->count_ = 0;
309 
310   int max_bits_counter = alphabet_size - 1;
311   int max_bits = 0;
312   while (max_bits_counter) {
313     max_bits_counter >>= 1;
314     ++max_bits;
315   }
316 
317   for (size_t i = 0; i < alphabet_size; i++) {
318     if (histogram.data_[i] > 0) {
319       if (code->count_ < 4) code->symbols_[code->count_] = i;
320       ++code->count_;
321     }
322   }
323 
324   if (code->count_ <= 1) {
325     WriteBits(2, 1, storage_ix, storage);
326     WriteBits(2, 0, storage_ix, storage);
327     WriteBits(max_bits, code->symbols_[0], storage_ix, storage);
328     return;
329   }
330 
331   if (alphabet_size >= 50 && code->count_ >= 16) {
332     std::vector<int> counts(alphabet_size);
333     memcpy(&counts[0], histogram.data_, sizeof(counts[0]) * alphabet_size);
334     OptimizeHuffmanCountsForRle(alphabet_size, &counts[0]);
335     CreateHuffmanTree(&counts[0], alphabet_size, tree_limit, code->depth_);
336   } else {
337     CreateHuffmanTree(histogram.data_, alphabet_size, tree_limit, code->depth_);
338   }
339   ConvertBitDepthsToSymbols(code->depth_, alphabet_size, code->bits_);
340 
341   if (code->count_ <= 4) {
342     StoreHuffmanCodeSimple(*code, alphabet_size, max_bits, storage_ix, storage);
343   } else {
344     StoreHuffmanCodeComplex(*code, alphabet_size, storage_ix, storage);
345   }
346 }
347 
348 template<int kSize>
BuildAndStoreEntropyCodes(const std::vector<Histogram<kSize>> & histograms,int alphabet_size,std::vector<EntropyCode<kSize>> * entropy_codes,int * storage_ix,uint8_t * storage)349 void BuildAndStoreEntropyCodes(
350     const std::vector<Histogram<kSize> >& histograms,
351     int alphabet_size,
352     std::vector<EntropyCode<kSize> >* entropy_codes,
353     int* storage_ix, uint8_t* storage) {
354   entropy_codes->resize(histograms.size());
355   for (int i = 0; i < histograms.size(); ++i) {
356     BuildAndStoreEntropyCode(histograms[i], 15, alphabet_size,
357                              &(*entropy_codes)[i],
358                              storage_ix, storage);
359   }
360 }
361 
EncodeCommand(const Command & cmd,const EntropyCodeCommand & entropy,int * storage_ix,uint8_t * storage)362 void EncodeCommand(const Command& cmd,
363                    const EntropyCodeCommand& entropy,
364                    int* storage_ix, uint8_t* storage) {
365   int code = cmd.command_prefix_;
366   WriteBits(entropy.depth_[code], entropy.bits_[code], storage_ix, storage);
367   if (code >= 128) {
368     code -= 128;
369   }
370   int insert_extra_bits = InsertLengthExtraBits(code);
371   uint64_t insert_extra_bits_val =
372       cmd.insert_length_ - InsertLengthOffset(code);
373   int copy_extra_bits = CopyLengthExtraBits(code);
374   uint64_t copy_extra_bits_val = cmd.copy_length_code_ - CopyLengthOffset(code);
375   if (insert_extra_bits > 0) {
376     WriteBits(insert_extra_bits, insert_extra_bits_val, storage_ix, storage);
377   }
378   if (copy_extra_bits > 0) {
379     WriteBits(copy_extra_bits, copy_extra_bits_val, storage_ix, storage);
380   }
381 }
382 
EncodeCopyDistance(const Command & cmd,const EntropyCodeDistance & entropy,int * storage_ix,uint8_t * storage)383 void EncodeCopyDistance(const Command& cmd, const EntropyCodeDistance& entropy,
384                         int* storage_ix, uint8_t* storage) {
385   int code = cmd.distance_prefix_;
386   int extra_bits = cmd.distance_extra_bits_;
387   uint64_t extra_bits_val = cmd.distance_extra_bits_value_;
388   WriteBits(entropy.depth_[code], entropy.bits_[code], storage_ix, storage);
389   if (extra_bits > 0) {
390     WriteBits(extra_bits, extra_bits_val, storage_ix, storage);
391   }
392 }
393 
ComputeDistanceShortCodes(std::vector<Command> * cmds,size_t pos,const size_t max_backward,int * dist_ringbuffer,size_t * ringbuffer_idx)394 void ComputeDistanceShortCodes(std::vector<Command>* cmds,
395                                size_t pos,
396                                const size_t max_backward,
397                                int* dist_ringbuffer,
398                                size_t* ringbuffer_idx) {
399   static const int kIndexOffset[16] = {
400     3, 2, 1, 0, 3, 3, 3, 3, 3, 3, 2, 2, 2, 2, 2, 2
401   };
402   static const int kValueOffset[16] = {
403     0, 0, 0, 0, -1, 1, -2, 2, -3, 3, -1, 1, -2, 2, -3, 3
404   };
405   for (int i = 0; i < cmds->size(); ++i) {
406     pos += (*cmds)[i].insert_length_;
407     size_t max_distance = std::min(pos, max_backward);
408     int cur_dist = (*cmds)[i].copy_distance_;
409     int dist_code = cur_dist + 16;
410     if (cur_dist <= max_distance) {
411       if (cur_dist == 0) break;
412       int limits[16] = { 0, 0, 0, 0,
413                          6, 6, 11, 11,
414                          11, 11, 11, 11,
415                          12, 12, 12, 12 };
416       for (int k = 0; k < 16; ++k) {
417         // Only accept more popular choices.
418         if (cur_dist < limits[k]) {
419           // Typically unpopular ranges, don't replace a short distance
420           // with them.
421           continue;
422         }
423         int comp = (dist_ringbuffer[(*ringbuffer_idx + kIndexOffset[k]) & 3] +
424                     kValueOffset[k]);
425         if (cur_dist == comp) {
426           dist_code = k + 1;
427           break;
428         }
429       }
430       if (dist_code > 1) {
431         dist_ringbuffer[*ringbuffer_idx & 3] = cur_dist;
432         ++(*ringbuffer_idx);
433       }
434       pos += (*cmds)[i].copy_length_;
435     } else {
436       int word_idx = cur_dist - max_distance - 1;
437       const std::string word =
438           GetTransformedDictionaryWord((*cmds)[i].copy_length_code_, word_idx);
439       pos += word.size();
440     }
441     (*cmds)[i].distance_code_ = dist_code;
442   }
443 }
444 
ComputeCommandPrefixes(std::vector<Command> * cmds,int num_direct_distance_codes,int distance_postfix_bits)445 void ComputeCommandPrefixes(std::vector<Command>* cmds,
446                             int num_direct_distance_codes,
447                             int distance_postfix_bits) {
448   for (int i = 0; i < cmds->size(); ++i) {
449     Command* cmd = &(*cmds)[i];
450     cmd->command_prefix_ = CommandPrefix(cmd->insert_length_,
451                                          cmd->copy_length_code_);
452     if (cmd->copy_length_code_ > 0) {
453       PrefixEncodeCopyDistance(cmd->distance_code_,
454                                num_direct_distance_codes,
455                                distance_postfix_bits,
456                                &cmd->distance_prefix_,
457                                &cmd->distance_extra_bits_,
458                                &cmd->distance_extra_bits_value_);
459     }
460     if (cmd->command_prefix_ < 128 && cmd->distance_prefix_ == 0) {
461       cmd->distance_prefix_ = 0xffff;
462     } else {
463       cmd->command_prefix_ += 128;
464     }
465   }
466 }
467 
IndexOf(const std::vector<int> & v,int value)468 int IndexOf(const std::vector<int>& v, int value) {
469   for (int i = 0; i < v.size(); ++i) {
470     if (v[i] == value) return i;
471   }
472   return -1;
473 }
474 
MoveToFront(std::vector<int> * v,int index)475 void MoveToFront(std::vector<int>* v, int index) {
476   int value = (*v)[index];
477   for (int i = index; i > 0; --i) {
478     (*v)[i] = (*v)[i - 1];
479   }
480   (*v)[0] = value;
481 }
482 
MoveToFrontTransform(const std::vector<int> & v)483 std::vector<int> MoveToFrontTransform(const std::vector<int>& v) {
484   if (v.empty()) return v;
485   std::vector<int> mtf(*max_element(v.begin(), v.end()) + 1);
486   for (int i = 0; i < mtf.size(); ++i) mtf[i] = i;
487   std::vector<int> result(v.size());
488   for (int i = 0; i < v.size(); ++i) {
489     int index = IndexOf(mtf, v[i]);
490     result[i] = index;
491     MoveToFront(&mtf, index);
492   }
493   return result;
494 }
495 
496 // Finds runs of zeros in v_in and replaces them with a prefix code of the run
497 // length plus extra bits in *v_out and *extra_bits. Non-zero values in v_in are
498 // shifted by *max_length_prefix. Will not create prefix codes bigger than the
499 // initial value of *max_run_length_prefix. The prefix code of run length L is
500 // simply Log2Floor(L) and the number of extra bits is the same as the prefix
501 // code.
RunLengthCodeZeros(const std::vector<int> & v_in,int * max_run_length_prefix,std::vector<int> * v_out,std::vector<int> * extra_bits)502 void RunLengthCodeZeros(const std::vector<int>& v_in,
503                         int* max_run_length_prefix,
504                         std::vector<int>* v_out,
505                         std::vector<int>* extra_bits) {
506   int max_reps = 0;
507   for (int i = 0; i < v_in.size();) {
508     for (; i < v_in.size() && v_in[i] != 0; ++i) ;
509     int reps = 0;
510     for (; i < v_in.size() && v_in[i] == 0; ++i) {
511       ++reps;
512     }
513     max_reps = std::max(reps, max_reps);
514   }
515   int max_prefix = max_reps > 0 ? Log2Floor(max_reps) : 0;
516   *max_run_length_prefix = std::min(max_prefix, *max_run_length_prefix);
517   for (int i = 0; i < v_in.size();) {
518     if (v_in[i] != 0) {
519       v_out->push_back(v_in[i] + *max_run_length_prefix);
520       extra_bits->push_back(0);
521       ++i;
522     } else {
523       int reps = 1;
524       for (uint32_t k = i + 1; k < v_in.size() && v_in[k] == 0; ++k) {
525         ++reps;
526       }
527       i += reps;
528       while (reps) {
529         if (reps < (2 << *max_run_length_prefix)) {
530           int run_length_prefix = Log2Floor(reps);
531           v_out->push_back(run_length_prefix);
532           extra_bits->push_back(reps - (1 << run_length_prefix));
533           break;
534         } else {
535           v_out->push_back(*max_run_length_prefix);
536           extra_bits->push_back((1 << *max_run_length_prefix) - 1);
537           reps -= (2 << *max_run_length_prefix) - 1;
538         }
539       }
540     }
541   }
542 }
543 
544 // Returns a maximum zero-run-length-prefix value such that run-length coding
545 // zeros in v with this maximum prefix value and then encoding the resulting
546 // histogram and entropy-coding v produces the least amount of bits.
BestMaxZeroRunLengthPrefix(const std::vector<int> & v)547 int BestMaxZeroRunLengthPrefix(const std::vector<int>& v) {
548   int min_cost = std::numeric_limits<int>::max();
549   int best_max_prefix = 0;
550   for (int max_prefix = 0; max_prefix <= 16; ++max_prefix) {
551     std::vector<int> rle_symbols;
552     std::vector<int> extra_bits;
553     int max_run_length_prefix = max_prefix;
554     RunLengthCodeZeros(v, &max_run_length_prefix, &rle_symbols, &extra_bits);
555     if (max_run_length_prefix < max_prefix) break;
556     HistogramContextMap histogram;
557     for (int i = 0; i < rle_symbols.size(); ++i) {
558       histogram.Add(rle_symbols[i]);
559     }
560     int bit_cost = PopulationCost(histogram);
561     if (max_prefix > 0) {
562       bit_cost += 4;
563     }
564     for (int i = 1; i <= max_prefix; ++i) {
565       bit_cost += histogram.data_[i] * i;  // extra bits
566     }
567     if (bit_cost < min_cost) {
568       min_cost = bit_cost;
569       best_max_prefix = max_prefix;
570     }
571   }
572   return best_max_prefix;
573 }
574 
EncodeContextMap(const std::vector<int> & context_map,int num_clusters,int * storage_ix,uint8_t * storage)575 void EncodeContextMap(const std::vector<int>& context_map,
576                       int num_clusters,
577                       int* storage_ix, uint8_t* storage) {
578   EncodeVarLenUint8(num_clusters - 1, storage_ix, storage);
579 
580   if (num_clusters == 1) {
581     return;
582   }
583 
584   std::vector<int> transformed_symbols = MoveToFrontTransform(context_map);
585   std::vector<int> rle_symbols;
586   std::vector<int> extra_bits;
587   int max_run_length_prefix = BestMaxZeroRunLengthPrefix(transformed_symbols);
588   RunLengthCodeZeros(transformed_symbols, &max_run_length_prefix,
589                      &rle_symbols, &extra_bits);
590   HistogramContextMap symbol_histogram;
591   for (int i = 0; i < rle_symbols.size(); ++i) {
592     symbol_histogram.Add(rle_symbols[i]);
593   }
594   bool use_rle = max_run_length_prefix > 0;
595   WriteBits(1, use_rle, storage_ix, storage);
596   if (use_rle) {
597     WriteBits(4, max_run_length_prefix - 1, storage_ix, storage);
598   }
599   EntropyCodeContextMap symbol_code;
600   BuildAndStoreEntropyCode(symbol_histogram, 15,
601                            num_clusters + max_run_length_prefix,
602                            &symbol_code,
603                            storage_ix, storage);
604   for (int i = 0; i < rle_symbols.size(); ++i) {
605     WriteBits(symbol_code.depth_[rle_symbols[i]],
606               symbol_code.bits_[rle_symbols[i]],
607               storage_ix, storage);
608     if (rle_symbols[i] > 0 && rle_symbols[i] <= max_run_length_prefix) {
609       WriteBits(rle_symbols[i], extra_bits[i], storage_ix, storage);
610     }
611   }
612   WriteBits(1, 1, storage_ix, storage);  // use move-to-front
613 }
614 
615 struct BlockSplitCode {
616   EntropyCodeBlockType block_type_code;
617   EntropyCodeBlockLength block_len_code;
618 };
619 
EncodeBlockLength(const EntropyCodeBlockLength & entropy,int length,int * storage_ix,uint8_t * storage)620 void EncodeBlockLength(const EntropyCodeBlockLength& entropy,
621                        int length,
622                        int* storage_ix, uint8_t* storage) {
623   int len_code = BlockLengthPrefix(length);
624   int extra_bits = BlockLengthExtraBits(len_code);
625   int extra_bits_value = length - BlockLengthOffset(len_code);
626   WriteBits(entropy.depth_[len_code], entropy.bits_[len_code],
627             storage_ix, storage);
628   if (extra_bits > 0) {
629     WriteBits(extra_bits, extra_bits_value, storage_ix, storage);
630   }
631 }
632 
ComputeBlockTypeShortCodes(BlockSplit * split)633 void ComputeBlockTypeShortCodes(BlockSplit* split) {
634   if (split->num_types_ <= 1) {
635     split->num_types_ = 1;
636     return;
637   }
638   int ringbuffer[2] = { 0, 1 };
639   size_t index = 0;
640   for (int i = 0; i < split->types_.size(); ++i) {
641     int type = split->types_[i];
642     int type_code;
643     if (type == ringbuffer[index & 1]) {
644       type_code = 0;
645     } else if (type == ringbuffer[(index - 1) & 1] + 1) {
646       type_code = 1;
647     } else {
648       type_code = type + 2;
649     }
650     ringbuffer[index & 1] = type;
651     ++index;
652     split->type_codes_.push_back(type_code);
653   }
654 }
655 
BuildAndEncodeBlockSplitCode(const BlockSplit & split,BlockSplitCode * code,int * storage_ix,uint8_t * storage)656 void BuildAndEncodeBlockSplitCode(const BlockSplit& split,
657                                   BlockSplitCode* code,
658                                   int* storage_ix, uint8_t* storage) {
659   EncodeVarLenUint8(split.num_types_ - 1, storage_ix, storage);
660 
661   if (split.num_types_ == 1) {
662     return;
663   }
664 
665   HistogramBlockType type_histo;
666   for (int i = 1; i < split.type_codes_.size(); ++i) {
667     type_histo.Add(split.type_codes_[i]);
668   }
669   HistogramBlockLength length_histo;
670   for (int i = 0; i < split.lengths_.size(); ++i) {
671     length_histo.Add(BlockLengthPrefix(split.lengths_[i]));
672   }
673   BuildAndStoreEntropyCode(type_histo, 15, split.num_types_ + 2,
674                            &code->block_type_code,
675                            storage_ix, storage);
676   BuildAndStoreEntropyCode(length_histo, 15, kNumBlockLenPrefixes,
677                            &code->block_len_code,
678                            storage_ix, storage);
679   EncodeBlockLength(code->block_len_code, split.lengths_[0],
680                     storage_ix, storage);
681 }
682 
MoveAndEncode(const BlockSplitCode & code,BlockSplitIterator * it,int * storage_ix,uint8_t * storage)683 void MoveAndEncode(const BlockSplitCode& code,
684                    BlockSplitIterator* it,
685                    int* storage_ix, uint8_t* storage) {
686   if (it->length_ == 0) {
687     ++it->idx_;
688     it->type_ = it->split_.types_[it->idx_];
689     it->length_ = it->split_.lengths_[it->idx_];
690     int type_code = it->split_.type_codes_[it->idx_];
691     WriteBits(code.block_type_code.depth_[type_code],
692               code.block_type_code.bits_[type_code],
693               storage_ix, storage);
694     EncodeBlockLength(code.block_len_code, it->length_, storage_ix, storage);
695   }
696   --it->length_;
697 }
698 
699 struct EncodingParams {
700   int num_direct_distance_codes;
701   int distance_postfix_bits;
702   int literal_context_mode;
703 };
704 
705 struct MetaBlock {
706   std::vector<Command> cmds;
707   EncodingParams params;
708   BlockSplit literal_split;
709   BlockSplit command_split;
710   BlockSplit distance_split;
711   std::vector<int> literal_context_modes;
712   std::vector<int> literal_context_map;
713   std::vector<int> distance_context_map;
714   std::vector<HistogramLiteral> literal_histograms;
715   std::vector<HistogramCommand> command_histograms;
716   std::vector<HistogramDistance> distance_histograms;
717 };
718 
BuildMetaBlock(const EncodingParams & params,const std::vector<Command> & cmds,const uint8_t * ringbuffer,const size_t pos,const size_t mask,MetaBlock * mb)719 void BuildMetaBlock(const EncodingParams& params,
720                     const std::vector<Command>& cmds,
721                     const uint8_t* ringbuffer,
722                     const size_t pos,
723                     const size_t mask,
724                     MetaBlock* mb) {
725   mb->cmds = cmds;
726   mb->params = params;
727   if (cmds.empty()) {
728     return;
729   }
730   ComputeCommandPrefixes(&mb->cmds,
731                          mb->params.num_direct_distance_codes,
732                          mb->params.distance_postfix_bits);
733   SplitBlock(mb->cmds,
734              &ringbuffer[pos & mask],
735              &mb->literal_split,
736              &mb->command_split,
737              &mb->distance_split);
738   ComputeBlockTypeShortCodes(&mb->literal_split);
739   ComputeBlockTypeShortCodes(&mb->command_split);
740   ComputeBlockTypeShortCodes(&mb->distance_split);
741 
742   mb->literal_context_modes.resize(mb->literal_split.num_types_,
743                                    mb->params.literal_context_mode);
744 
745 
746   int num_literal_contexts =
747       mb->literal_split.num_types_ << kLiteralContextBits;
748   int num_distance_contexts =
749       mb->distance_split.num_types_ << kDistanceContextBits;
750   std::vector<HistogramLiteral> literal_histograms(num_literal_contexts);
751   mb->command_histograms.resize(mb->command_split.num_types_);
752   std::vector<HistogramDistance> distance_histograms(num_distance_contexts);
753   BuildHistograms(mb->cmds,
754                   mb->literal_split,
755                   mb->command_split,
756                   mb->distance_split,
757                   ringbuffer,
758                   pos,
759                   mask,
760                   mb->literal_context_modes,
761                   &literal_histograms,
762                   &mb->command_histograms,
763                   &distance_histograms);
764 
765   // Histogram ids need to fit in one byte.
766   static const int kMaxNumberOfHistograms = 256;
767 
768   mb->literal_histograms = literal_histograms;
769   ClusterHistograms(literal_histograms,
770                     1 << kLiteralContextBits,
771                     mb->literal_split.num_types_,
772                     kMaxNumberOfHistograms,
773                     &mb->literal_histograms,
774                     &mb->literal_context_map);
775 
776   mb->distance_histograms = distance_histograms;
777   ClusterHistograms(distance_histograms,
778                     1 << kDistanceContextBits,
779                     mb->distance_split.num_types_,
780                     kMaxNumberOfHistograms,
781                     &mb->distance_histograms,
782                     &mb->distance_context_map);
783 }
784 
MetaBlockLength(const std::vector<Command> & cmds)785 size_t MetaBlockLength(const std::vector<Command>& cmds) {
786   size_t length = 0;
787   for (int i = 0; i < cmds.size(); ++i) {
788     const Command& cmd = cmds[i];
789     length += cmd.insert_length_ + cmd.copy_length_;
790   }
791   return length;
792 }
793 
StoreMetaBlock(const MetaBlock & mb,const bool is_last,const uint8_t * ringbuffer,const size_t mask,size_t * pos,int * storage_ix,uint8_t * storage)794 void StoreMetaBlock(const MetaBlock& mb,
795                     const bool is_last,
796                     const uint8_t* ringbuffer,
797                     const size_t mask,
798                     size_t* pos,
799                     int* storage_ix, uint8_t* storage) {
800   size_t length = MetaBlockLength(mb.cmds);
801   const size_t end_pos = *pos + length;
802   EncodeMetaBlockLength(length, is_last, false, storage_ix, storage);
803 
804   if (length == 0) {
805     return;
806   }
807   BlockSplitCode literal_split_code;
808   BlockSplitCode command_split_code;
809   BlockSplitCode distance_split_code;
810   BuildAndEncodeBlockSplitCode(mb.literal_split, &literal_split_code,
811                                storage_ix, storage);
812   BuildAndEncodeBlockSplitCode(mb.command_split, &command_split_code,
813                                storage_ix, storage);
814   BuildAndEncodeBlockSplitCode(mb.distance_split, &distance_split_code,
815                                storage_ix, storage);
816   WriteBits(2, mb.params.distance_postfix_bits, storage_ix, storage);
817   WriteBits(4,
818             mb.params.num_direct_distance_codes >>
819             mb.params.distance_postfix_bits,
820             storage_ix, storage);
821   int num_distance_codes =
822       kNumDistanceShortCodes + mb.params.num_direct_distance_codes +
823       (48 << mb.params.distance_postfix_bits);
824   for (int i = 0; i < mb.literal_split.num_types_; ++i) {
825     WriteBits(2, mb.literal_context_modes[i], storage_ix, storage);
826   }
827   EncodeContextMap(mb.literal_context_map, mb.literal_histograms.size(),
828                    storage_ix, storage);
829   EncodeContextMap(mb.distance_context_map, mb.distance_histograms.size(),
830                    storage_ix, storage);
831   std::vector<EntropyCodeLiteral> literal_codes;
832   std::vector<EntropyCodeCommand> command_codes;
833   std::vector<EntropyCodeDistance> distance_codes;
834   BuildAndStoreEntropyCodes(mb.literal_histograms, 256, &literal_codes,
835                             storage_ix, storage);
836   BuildAndStoreEntropyCodes(mb.command_histograms, kNumCommandPrefixes,
837                             &command_codes, storage_ix, storage);
838   BuildAndStoreEntropyCodes(mb.distance_histograms, num_distance_codes,
839                             &distance_codes, storage_ix, storage);
840   BlockSplitIterator literal_it(mb.literal_split);
841   BlockSplitIterator command_it(mb.command_split);
842   BlockSplitIterator distance_it(mb.distance_split);
843   for (int i = 0; i < mb.cmds.size(); ++i) {
844     const Command& cmd = mb.cmds[i];
845     MoveAndEncode(command_split_code, &command_it, storage_ix, storage);
846     EncodeCommand(cmd, command_codes[command_it.type_], storage_ix, storage);
847     for (int j = 0; j < cmd.insert_length_; ++j) {
848       MoveAndEncode(literal_split_code, &literal_it, storage_ix, storage);
849       int histogram_idx = literal_it.type_;
850       uint8_t prev_byte = *pos > 0 ? ringbuffer[(*pos - 1) & mask] : 0;
851       uint8_t prev_byte2 = *pos > 1 ? ringbuffer[(*pos - 2) & mask] : 0;
852       int context = ((literal_it.type_ << kLiteralContextBits) +
853                      Context(prev_byte, prev_byte2,
854                              mb.literal_context_modes[literal_it.type_]));
855       histogram_idx = mb.literal_context_map[context];
856       int literal = ringbuffer[*pos & mask];
857       WriteBits(literal_codes[histogram_idx].depth_[literal],
858                 literal_codes[histogram_idx].bits_[literal],
859                 storage_ix, storage);
860       ++(*pos);
861     }
862     if (*pos < end_pos && cmd.distance_prefix_ != 0xffff) {
863       MoveAndEncode(distance_split_code, &distance_it, storage_ix, storage);
864       int context = (distance_it.type_ << 2) +
865           ((cmd.copy_length_code_ > 4) ? 3 : cmd.copy_length_code_ - 2);
866       int histogram_index = mb.distance_context_map[context];
867       size_t max_distance = std::min(*pos, (size_t)kMaxBackwardDistance);
868       EncodeCopyDistance(cmd, distance_codes[histogram_index],
869                          storage_ix, storage);
870     }
871     *pos += cmd.copy_length_;
872   }
873 }
874 
BrotliCompressor(BrotliParams params)875 BrotliCompressor::BrotliCompressor(BrotliParams params)
876     : params_(params),
877       window_bits_(kWindowBits),
878       hashers_(new Hashers()),
879       dist_ringbuffer_idx_(0),
880       input_pos_(0),
881       ringbuffer_(kRingBufferBits, kMetaBlockSizeBits),
882       literal_cost_(1 << kRingBufferBits),
883       storage_ix_(0),
884       storage_(new uint8_t[2 << kMetaBlockSizeBits]) {
885   dist_ringbuffer_[0] = 16;
886   dist_ringbuffer_[1] = 15;
887   dist_ringbuffer_[2] = 11;
888   dist_ringbuffer_[3] = 4;
889   storage_[0] = 0;
890   switch (params.mode) {
891     case BrotliParams::MODE_TEXT: hash_type_ = Hashers::HASH_15_8_4; break;
892     case BrotliParams::MODE_FONT: hash_type_ = Hashers::HASH_15_8_2; break;
893     default: break;
894   }
895   hashers_->Init(hash_type_);
896   if (params.mode == BrotliParams::MODE_TEXT) {
897     StoreDictionaryWordHashes();
898   }
899 }
900 
~BrotliCompressor()901 BrotliCompressor::~BrotliCompressor() {
902   delete[] storage_;
903 }
904 
905 StaticDictionary *BrotliCompressor::static_dictionary_ = NULL;
906 
StoreDictionaryWordHashes()907 void BrotliCompressor::StoreDictionaryWordHashes() {
908   const int num_transforms = kNumTransforms;
909   if (static_dictionary_ == NULL) {
910     static_dictionary_ = new StaticDictionary;
911     for (int t = num_transforms - 1; t >= 0; --t) {
912       for (int i = kMaxDictionaryWordLength;
913            i >= kMinDictionaryWordLength; --i) {
914         const int num_words = 1 << kBrotliDictionarySizeBitsByLength[i];
915         for (int j = num_words - 1; j >= 0; --j) {
916           int word_id = t * num_words + j;
917           std::string word = GetTransformedDictionaryWord(i, word_id);
918           if (word.size() >= 4) {
919             static_dictionary_->Insert(word, i, word_id);
920           }
921         }
922       }
923     }
924   }
925   hashers_->SetStaticDictionary(static_dictionary_);
926 }
927 
WriteStreamHeader()928 void BrotliCompressor::WriteStreamHeader() {
929   // Encode window size.
930   if (window_bits_ == 16) {
931     WriteBits(1, 0, &storage_ix_, storage_);
932   } else {
933     WriteBits(1, 1, &storage_ix_, storage_);
934     WriteBits(3, window_bits_ - 17, &storage_ix_, storage_);
935   }
936 }
937 
WriteMetaBlock(const size_t input_size,const uint8_t * input_buffer,const bool is_last,size_t * encoded_size,uint8_t * encoded_buffer)938 void BrotliCompressor::WriteMetaBlock(const size_t input_size,
939                                       const uint8_t* input_buffer,
940                                       const bool is_last,
941                                       size_t* encoded_size,
942                                       uint8_t* encoded_buffer) {
943   static const double kMinUTF8Ratio = 0.75;
944   bool utf8_mode = false;
945   std::vector<Command> commands;
946   if (input_size > 0) {
947     ringbuffer_.Write(input_buffer, input_size);
948     utf8_mode = IsMostlyUTF8(
949       &ringbuffer_.start()[input_pos_ & kRingBufferMask],
950       input_size, kMinUTF8Ratio);
951     if (utf8_mode) {
952       EstimateBitCostsForLiteralsUTF8(input_pos_, input_size,
953                                       kRingBufferMask, kRingBufferMask,
954                                       ringbuffer_.start(), &literal_cost_[0]);
955     } else {
956       EstimateBitCostsForLiterals(input_pos_, input_size,
957                                   kRingBufferMask, kRingBufferMask,
958                                   ringbuffer_.start(), &literal_cost_[0]);
959     }
960     CreateBackwardReferences(
961         input_size, input_pos_,
962         ringbuffer_.start(),
963         &literal_cost_[0],
964         kRingBufferMask, kMaxBackwardDistance,
965         hashers_.get(),
966         hash_type_,
967         &commands);
968     ComputeDistanceShortCodes(&commands, input_pos_, kMaxBackwardDistance,
969                               dist_ringbuffer_,
970                               &dist_ringbuffer_idx_);
971   }
972   EncodingParams params;
973   params.num_direct_distance_codes =
974       params_.mode == BrotliParams::MODE_FONT ? 12 : 0;
975   params.distance_postfix_bits =
976       params_.mode == BrotliParams::MODE_FONT ? 1 : 0;
977   params.literal_context_mode = CONTEXT_SIGNED;
978   const int storage_ix0 = storage_ix_;
979   MetaBlock mb;
980   BuildMetaBlock(params, commands, ringbuffer_.start(), input_pos_,
981                  kRingBufferMask, &mb);
982   StoreMetaBlock(mb, is_last, ringbuffer_.start(), kRingBufferMask,
983                  &input_pos_, &storage_ix_, storage_);
984   size_t output_size = is_last ? ((storage_ix_ + 7) >> 3) : (storage_ix_ >> 3);
985   output_size -= (storage_ix0 >> 3);
986   if (input_size + 4 < output_size) {
987     storage_ix_ = storage_ix0;
988     storage_[storage_ix_ >> 3] &= (1 << (storage_ix_ & 7)) - 1;
989     EncodeMetaBlockLength(input_size, false, true, &storage_ix_, storage_);
990     size_t hdr_size = (storage_ix_ + 7) >> 3;
991     memcpy(encoded_buffer, storage_, hdr_size);
992     memcpy(encoded_buffer + hdr_size, input_buffer, input_size);
993     *encoded_size = hdr_size + input_size;
994     if (is_last) {
995       encoded_buffer[*encoded_size] = 0x3;  // ISLAST, ISEMPTY
996       ++(*encoded_size);
997     }
998     storage_ix_ = 0;
999     storage_[0] = 0;
1000   } else {
1001     memcpy(encoded_buffer, storage_, output_size);
1002     *encoded_size = output_size;
1003     if (is_last) {
1004       storage_ix_ = 0;
1005       storage_[0] = 0;
1006     } else {
1007       storage_ix_ -= output_size << 3;
1008       storage_[storage_ix_ >> 3] = storage_[output_size];
1009     }
1010   }
1011 }
1012 
FinishStream(size_t * encoded_size,uint8_t * encoded_buffer)1013 void BrotliCompressor::FinishStream(
1014     size_t* encoded_size, uint8_t* encoded_buffer) {
1015   WriteMetaBlock(0, NULL, true, encoded_size, encoded_buffer);
1016 }
1017 
1018 
BrotliCompressBuffer(BrotliParams params,size_t input_size,const uint8_t * input_buffer,size_t * encoded_size,uint8_t * encoded_buffer)1019 int BrotliCompressBuffer(BrotliParams params,
1020                          size_t input_size,
1021                          const uint8_t* input_buffer,
1022                          size_t* encoded_size,
1023                          uint8_t* encoded_buffer) {
1024   if (input_size == 0) {
1025     encoded_buffer[0] = 6;
1026     *encoded_size = 1;
1027     return 1;
1028   }
1029 
1030   BrotliCompressor compressor(params);
1031   compressor.WriteStreamHeader();
1032 
1033   const int max_block_size = 1 << kMetaBlockSizeBits;
1034   size_t max_output_size = *encoded_size;
1035   const uint8_t* input_end = input_buffer + input_size;
1036   *encoded_size = 0;
1037 
1038   while (input_buffer < input_end) {
1039     int block_size = max_block_size;
1040     bool is_last = false;
1041     if (block_size >= input_end - input_buffer) {
1042       block_size = input_end - input_buffer;
1043       is_last = true;
1044     }
1045     size_t output_size = max_output_size;
1046     compressor.WriteMetaBlock(block_size, input_buffer, is_last,
1047                               &output_size, &encoded_buffer[*encoded_size]);
1048     input_buffer += block_size;
1049     *encoded_size += output_size;
1050     max_output_size -= output_size;
1051   }
1052 
1053   return 1;
1054 }
1055 
1056 }  // namespace brotli
1057