1 // Copyright 2013 Google Inc. All Rights Reserved.
2 //
3 // Licensed under the Apache License, Version 2.0 (the "License");
4 // you may not use this file except in compliance with the License.
5 // You may obtain a copy of the License at
6 //
7 // http://www.apache.org/licenses/LICENSE-2.0
8 //
9 // Unless required by applicable law or agreed to in writing, software
10 // distributed under the License is distributed on an "AS IS" BASIS,
11 // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12 // See the License for the specific language governing permissions and
13 // limitations under the License.
14 //
15 // Implementation of Brotli compressor.
16
17 #include "./encode.h"
18
19 #include <algorithm>
20 #include <limits>
21
22 #include "./backward_references.h"
23 #include "./bit_cost.h"
24 #include "./block_splitter.h"
25 #include "./cluster.h"
26 #include "./context.h"
27 #include "./transform.h"
28 #include "./entropy_encode.h"
29 #include "./fast_log.h"
30 #include "./hash.h"
31 #include "./histogram.h"
32 #include "./literal_cost.h"
33 #include "./prefix.h"
34 #include "./write_bits.h"
35
36 namespace brotli {
37
38 static const int kWindowBits = 22;
39 // To make decoding faster, we allow the decoder to write 16 bytes ahead in
40 // its ringbuffer, therefore the encoder has to decrease max distance by this
41 // amount.
42 static const int kDecoderRingBufferWriteAheadSlack = 16;
43 static const int kMaxBackwardDistance =
44 (1 << kWindowBits) - kDecoderRingBufferWriteAheadSlack;
45
46 static const int kMetaBlockSizeBits = 21;
47 static const int kRingBufferBits = 23;
48 static const int kRingBufferMask = (1 << kRingBufferBits) - 1;
49
50 template<int kSize>
Entropy(const std::vector<Histogram<kSize>> & histograms)51 double Entropy(const std::vector<Histogram<kSize> >& histograms) {
52 double retval = 0;
53 for (int i = 0; i < histograms.size(); ++i) {
54 retval += histograms[i].EntropyBitCost();
55 }
56 return retval;
57 }
58
59 template<int kSize>
TotalBitCost(const std::vector<Histogram<kSize>> & histograms)60 double TotalBitCost(const std::vector<Histogram<kSize> >& histograms) {
61 double retval = 0;
62 for (int i = 0; i < histograms.size(); ++i) {
63 retval += PopulationCost(histograms[i]);
64 }
65 return retval;
66 }
67
EncodeVarLenUint8(int n,int * storage_ix,uint8_t * storage)68 void EncodeVarLenUint8(int n, int* storage_ix, uint8_t* storage) {
69 if (n == 0) {
70 WriteBits(1, 0, storage_ix, storage);
71 } else {
72 WriteBits(1, 1, storage_ix, storage);
73 int nbits = Log2Floor(n);
74 WriteBits(3, nbits, storage_ix, storage);
75 if (nbits > 0) {
76 WriteBits(nbits, n - (1 << nbits), storage_ix, storage);
77 }
78 }
79 }
80
ParseAsUTF8(int * symbol,const uint8_t * input,int size)81 int ParseAsUTF8(int* symbol, const uint8_t* input, int size) {
82 // ASCII
83 if ((input[0] & 0x80) == 0) {
84 *symbol = input[0];
85 if (*symbol > 0) {
86 return 1;
87 }
88 }
89 // 2-byte UTF8
90 if (size > 1 &&
91 (input[0] & 0xe0) == 0xc0 &&
92 (input[1] & 0xc0) == 0x80) {
93 *symbol = (((input[0] & 0x1f) << 6) |
94 (input[1] & 0x3f));
95 if (*symbol > 0x7f) {
96 return 2;
97 }
98 }
99 // 3-byte UFT8
100 if (size > 2 &&
101 (input[0] & 0xf0) == 0xe0 &&
102 (input[1] & 0xc0) == 0x80 &&
103 (input[2] & 0xc0) == 0x80) {
104 *symbol = (((input[0] & 0x0f) << 12) |
105 ((input[1] & 0x3f) << 6) |
106 (input[2] & 0x3f));
107 if (*symbol > 0x7ff) {
108 return 3;
109 }
110 }
111 // 4-byte UFT8
112 if (size > 3 &&
113 (input[0] & 0xf8) == 0xf0 &&
114 (input[1] & 0xc0) == 0x80 &&
115 (input[2] & 0xc0) == 0x80 &&
116 (input[3] & 0xc0) == 0x80) {
117 *symbol = (((input[0] & 0x07) << 18) |
118 ((input[1] & 0x3f) << 12) |
119 ((input[2] & 0x3f) << 6) |
120 (input[3] & 0x3f));
121 if (*symbol > 0xffff && *symbol <= 0x10ffff) {
122 return 4;
123 }
124 }
125 // Not UTF8, emit a special symbol above the UTF8-code space
126 *symbol = 0x110000 | input[0];
127 return 1;
128 }
129
130 // Returns true if at least min_fraction of the data is UTF8-encoded.
IsMostlyUTF8(const uint8_t * data,size_t length,double min_fraction)131 bool IsMostlyUTF8(const uint8_t* data, size_t length, double min_fraction) {
132 size_t size_utf8 = 0;
133 size_t pos = 0;
134 while (pos < length) {
135 int symbol;
136 int bytes_read = ParseAsUTF8(&symbol, data + pos, length - pos);
137 pos += bytes_read;
138 if (symbol < 0x110000) size_utf8 += bytes_read;
139 }
140 return size_utf8 > min_fraction * length;
141 }
142
EncodeMetaBlockLength(size_t meta_block_size,bool is_last,bool is_uncompressed,int * storage_ix,uint8_t * storage)143 void EncodeMetaBlockLength(size_t meta_block_size,
144 bool is_last,
145 bool is_uncompressed,
146 int* storage_ix, uint8_t* storage) {
147 WriteBits(1, is_last, storage_ix, storage);
148 if (is_last) {
149 if (meta_block_size == 0) {
150 WriteBits(1, 1, storage_ix, storage);
151 return;
152 }
153 WriteBits(1, 0, storage_ix, storage);
154 }
155 --meta_block_size;
156 int num_bits = Log2Floor(meta_block_size) + 1;
157 if (num_bits < 16) {
158 num_bits = 16;
159 }
160 WriteBits(2, (num_bits - 13) >> 2, storage_ix, storage);
161 while (num_bits > 0) {
162 WriteBits(4, meta_block_size & 0xf, storage_ix, storage);
163 meta_block_size >>= 4;
164 num_bits -= 4;
165 }
166 if (!is_last) {
167 WriteBits(1, is_uncompressed, storage_ix, storage);
168 }
169 }
170
StoreHuffmanTreeOfHuffmanTreeToBitMask(const uint8_t * code_length_bitdepth,int * storage_ix,uint8_t * storage)171 void StoreHuffmanTreeOfHuffmanTreeToBitMask(
172 const uint8_t* code_length_bitdepth,
173 int* storage_ix, uint8_t* storage) {
174 static const uint8_t kStorageOrder[kCodeLengthCodes] = {
175 1, 2, 3, 4, 0, 5, 17, 6, 16, 7, 8, 9, 10, 11, 12, 13, 14, 15,
176 };
177 // Throw away trailing zeros:
178 int codes_to_store = kCodeLengthCodes;
179 for (; codes_to_store > 0; --codes_to_store) {
180 if (code_length_bitdepth[kStorageOrder[codes_to_store - 1]] != 0) {
181 break;
182 }
183 }
184 int num_codes = 0;
185 for (int i = 0; i < codes_to_store; ++i) {
186 if (code_length_bitdepth[kStorageOrder[i]] != 0) {
187 ++num_codes;
188 }
189 }
190 if (num_codes == 1) {
191 codes_to_store = kCodeLengthCodes;
192 }
193 int skip_some = 0; // skips none.
194 if (code_length_bitdepth[kStorageOrder[0]] == 0 &&
195 code_length_bitdepth[kStorageOrder[1]] == 0) {
196 skip_some = 2; // skips two.
197 if (code_length_bitdepth[kStorageOrder[2]] == 0) {
198 skip_some = 3; // skips three.
199 }
200 }
201 WriteBits(2, skip_some, storage_ix, storage);
202 for (int i = skip_some; i < codes_to_store; ++i) {
203 uint8_t len[] = { 2, 4, 3, 2, 2, 4 };
204 uint8_t bits[] = { 0, 7, 3, 2, 1, 15 };
205 int v = code_length_bitdepth[kStorageOrder[i]];
206 WriteBits(len[v], bits[v], storage_ix, storage);
207 }
208 }
209
StoreHuffmanTreeToBitMask(const uint8_t * huffman_tree,const uint8_t * huffman_tree_extra_bits,const int huffman_tree_size,const EntropyCode<kCodeLengthCodes> & entropy,int * storage_ix,uint8_t * storage)210 void StoreHuffmanTreeToBitMask(
211 const uint8_t* huffman_tree,
212 const uint8_t* huffman_tree_extra_bits,
213 const int huffman_tree_size,
214 const EntropyCode<kCodeLengthCodes>& entropy,
215 int* storage_ix, uint8_t* storage) {
216 for (int i = 0; i < huffman_tree_size; ++i) {
217 const int ix = huffman_tree[i];
218 const int extra_bits = huffman_tree_extra_bits[i];
219 if (entropy.count_ > 1) {
220 WriteBits(entropy.depth_[ix], entropy.bits_[ix], storage_ix, storage);
221 }
222 switch (ix) {
223 case 16:
224 WriteBits(2, extra_bits, storage_ix, storage);
225 break;
226 case 17:
227 WriteBits(3, extra_bits, storage_ix, storage);
228 break;
229 }
230 }
231 }
232
233 template<int kSize>
StoreHuffmanCodeSimple(const EntropyCode<kSize> & code,int alphabet_size,int max_bits,int * storage_ix,uint8_t * storage)234 void StoreHuffmanCodeSimple(
235 const EntropyCode<kSize>& code, int alphabet_size,
236 int max_bits, int* storage_ix, uint8_t* storage) {
237 const uint8_t *depth = &code.depth_[0];
238 int symbols[4];
239 // Quadratic sort.
240 int k, j;
241 for (k = 0; k < code.count_; ++k) {
242 symbols[k] = code.symbols_[k];
243 }
244 for (k = 0; k < code.count_; ++k) {
245 for (j = k + 1; j < code.count_; ++j) {
246 if (depth[symbols[j]] < depth[symbols[k]]) {
247 int t = symbols[k];
248 symbols[k] = symbols[j];
249 symbols[j] = t;
250 }
251 }
252 }
253 // Small tree marker to encode 1-4 symbols.
254 WriteBits(2, 1, storage_ix, storage);
255 WriteBits(2, code.count_ - 1, storage_ix, storage);
256 for (int i = 0; i < code.count_; ++i) {
257 WriteBits(max_bits, symbols[i], storage_ix, storage);
258 }
259 if (code.count_ == 4) {
260 if (depth[symbols[0]] == 2 &&
261 depth[symbols[1]] == 2 &&
262 depth[symbols[2]] == 2 &&
263 depth[symbols[3]] == 2) {
264 WriteBits(1, 0, storage_ix, storage);
265 } else {
266 WriteBits(1, 1, storage_ix, storage);
267 }
268 }
269 }
270
271 template<int kSize>
StoreHuffmanCodeComplex(const EntropyCode<kSize> & code,int alphabet_size,int * storage_ix,uint8_t * storage)272 void StoreHuffmanCodeComplex(
273 const EntropyCode<kSize>& code, int alphabet_size,
274 int* storage_ix, uint8_t* storage) {
275 const uint8_t *depth = &code.depth_[0];
276 uint8_t huffman_tree[kSize];
277 uint8_t huffman_tree_extra_bits[kSize];
278 int huffman_tree_size = 0;
279 WriteHuffmanTree(depth,
280 alphabet_size,
281 &huffman_tree[0],
282 &huffman_tree_extra_bits[0],
283 &huffman_tree_size);
284 Histogram<kCodeLengthCodes> huffman_tree_histogram;
285 memset(huffman_tree_histogram.data_, 0, sizeof(huffman_tree_histogram.data_));
286 for (int i = 0; i < huffman_tree_size; ++i) {
287 huffman_tree_histogram.Add(huffman_tree[i]);
288 }
289 EntropyCode<kCodeLengthCodes> huffman_tree_entropy;
290 BuildEntropyCode(huffman_tree_histogram, 5, kCodeLengthCodes,
291 &huffman_tree_entropy);
292 StoreHuffmanTreeOfHuffmanTreeToBitMask(
293 &huffman_tree_entropy.depth_[0], storage_ix, storage);
294 StoreHuffmanTreeToBitMask(&huffman_tree[0], &huffman_tree_extra_bits[0],
295 huffman_tree_size, huffman_tree_entropy,
296 storage_ix, storage);
297 }
298
299 template<int kSize>
BuildAndStoreEntropyCode(const Histogram<kSize> & histogram,const int tree_limit,const int alphabet_size,EntropyCode<kSize> * code,int * storage_ix,uint8_t * storage)300 void BuildAndStoreEntropyCode(const Histogram<kSize>& histogram,
301 const int tree_limit,
302 const int alphabet_size,
303 EntropyCode<kSize>* code,
304 int* storage_ix, uint8_t* storage) {
305 memset(code->depth_, 0, sizeof(code->depth_));
306 memset(code->bits_, 0, sizeof(code->bits_));
307 memset(code->symbols_, 0, sizeof(code->symbols_));
308 code->count_ = 0;
309
310 int max_bits_counter = alphabet_size - 1;
311 int max_bits = 0;
312 while (max_bits_counter) {
313 max_bits_counter >>= 1;
314 ++max_bits;
315 }
316
317 for (size_t i = 0; i < alphabet_size; i++) {
318 if (histogram.data_[i] > 0) {
319 if (code->count_ < 4) code->symbols_[code->count_] = i;
320 ++code->count_;
321 }
322 }
323
324 if (code->count_ <= 1) {
325 WriteBits(2, 1, storage_ix, storage);
326 WriteBits(2, 0, storage_ix, storage);
327 WriteBits(max_bits, code->symbols_[0], storage_ix, storage);
328 return;
329 }
330
331 if (alphabet_size >= 50 && code->count_ >= 16) {
332 std::vector<int> counts(alphabet_size);
333 memcpy(&counts[0], histogram.data_, sizeof(counts[0]) * alphabet_size);
334 OptimizeHuffmanCountsForRle(alphabet_size, &counts[0]);
335 CreateHuffmanTree(&counts[0], alphabet_size, tree_limit, code->depth_);
336 } else {
337 CreateHuffmanTree(histogram.data_, alphabet_size, tree_limit, code->depth_);
338 }
339 ConvertBitDepthsToSymbols(code->depth_, alphabet_size, code->bits_);
340
341 if (code->count_ <= 4) {
342 StoreHuffmanCodeSimple(*code, alphabet_size, max_bits, storage_ix, storage);
343 } else {
344 StoreHuffmanCodeComplex(*code, alphabet_size, storage_ix, storage);
345 }
346 }
347
348 template<int kSize>
BuildAndStoreEntropyCodes(const std::vector<Histogram<kSize>> & histograms,int alphabet_size,std::vector<EntropyCode<kSize>> * entropy_codes,int * storage_ix,uint8_t * storage)349 void BuildAndStoreEntropyCodes(
350 const std::vector<Histogram<kSize> >& histograms,
351 int alphabet_size,
352 std::vector<EntropyCode<kSize> >* entropy_codes,
353 int* storage_ix, uint8_t* storage) {
354 entropy_codes->resize(histograms.size());
355 for (int i = 0; i < histograms.size(); ++i) {
356 BuildAndStoreEntropyCode(histograms[i], 15, alphabet_size,
357 &(*entropy_codes)[i],
358 storage_ix, storage);
359 }
360 }
361
EncodeCommand(const Command & cmd,const EntropyCodeCommand & entropy,int * storage_ix,uint8_t * storage)362 void EncodeCommand(const Command& cmd,
363 const EntropyCodeCommand& entropy,
364 int* storage_ix, uint8_t* storage) {
365 int code = cmd.command_prefix_;
366 WriteBits(entropy.depth_[code], entropy.bits_[code], storage_ix, storage);
367 if (code >= 128) {
368 code -= 128;
369 }
370 int insert_extra_bits = InsertLengthExtraBits(code);
371 uint64_t insert_extra_bits_val =
372 cmd.insert_length_ - InsertLengthOffset(code);
373 int copy_extra_bits = CopyLengthExtraBits(code);
374 uint64_t copy_extra_bits_val = cmd.copy_length_code_ - CopyLengthOffset(code);
375 if (insert_extra_bits > 0) {
376 WriteBits(insert_extra_bits, insert_extra_bits_val, storage_ix, storage);
377 }
378 if (copy_extra_bits > 0) {
379 WriteBits(copy_extra_bits, copy_extra_bits_val, storage_ix, storage);
380 }
381 }
382
EncodeCopyDistance(const Command & cmd,const EntropyCodeDistance & entropy,int * storage_ix,uint8_t * storage)383 void EncodeCopyDistance(const Command& cmd, const EntropyCodeDistance& entropy,
384 int* storage_ix, uint8_t* storage) {
385 int code = cmd.distance_prefix_;
386 int extra_bits = cmd.distance_extra_bits_;
387 uint64_t extra_bits_val = cmd.distance_extra_bits_value_;
388 WriteBits(entropy.depth_[code], entropy.bits_[code], storage_ix, storage);
389 if (extra_bits > 0) {
390 WriteBits(extra_bits, extra_bits_val, storage_ix, storage);
391 }
392 }
393
ComputeDistanceShortCodes(std::vector<Command> * cmds,size_t pos,const size_t max_backward,int * dist_ringbuffer,size_t * ringbuffer_idx)394 void ComputeDistanceShortCodes(std::vector<Command>* cmds,
395 size_t pos,
396 const size_t max_backward,
397 int* dist_ringbuffer,
398 size_t* ringbuffer_idx) {
399 static const int kIndexOffset[16] = {
400 3, 2, 1, 0, 3, 3, 3, 3, 3, 3, 2, 2, 2, 2, 2, 2
401 };
402 static const int kValueOffset[16] = {
403 0, 0, 0, 0, -1, 1, -2, 2, -3, 3, -1, 1, -2, 2, -3, 3
404 };
405 for (int i = 0; i < cmds->size(); ++i) {
406 pos += (*cmds)[i].insert_length_;
407 size_t max_distance = std::min(pos, max_backward);
408 int cur_dist = (*cmds)[i].copy_distance_;
409 int dist_code = cur_dist + 16;
410 if (cur_dist <= max_distance) {
411 if (cur_dist == 0) break;
412 int limits[16] = { 0, 0, 0, 0,
413 6, 6, 11, 11,
414 11, 11, 11, 11,
415 12, 12, 12, 12 };
416 for (int k = 0; k < 16; ++k) {
417 // Only accept more popular choices.
418 if (cur_dist < limits[k]) {
419 // Typically unpopular ranges, don't replace a short distance
420 // with them.
421 continue;
422 }
423 int comp = (dist_ringbuffer[(*ringbuffer_idx + kIndexOffset[k]) & 3] +
424 kValueOffset[k]);
425 if (cur_dist == comp) {
426 dist_code = k + 1;
427 break;
428 }
429 }
430 if (dist_code > 1) {
431 dist_ringbuffer[*ringbuffer_idx & 3] = cur_dist;
432 ++(*ringbuffer_idx);
433 }
434 pos += (*cmds)[i].copy_length_;
435 } else {
436 int word_idx = cur_dist - max_distance - 1;
437 const std::string word =
438 GetTransformedDictionaryWord((*cmds)[i].copy_length_code_, word_idx);
439 pos += word.size();
440 }
441 (*cmds)[i].distance_code_ = dist_code;
442 }
443 }
444
ComputeCommandPrefixes(std::vector<Command> * cmds,int num_direct_distance_codes,int distance_postfix_bits)445 void ComputeCommandPrefixes(std::vector<Command>* cmds,
446 int num_direct_distance_codes,
447 int distance_postfix_bits) {
448 for (int i = 0; i < cmds->size(); ++i) {
449 Command* cmd = &(*cmds)[i];
450 cmd->command_prefix_ = CommandPrefix(cmd->insert_length_,
451 cmd->copy_length_code_);
452 if (cmd->copy_length_code_ > 0) {
453 PrefixEncodeCopyDistance(cmd->distance_code_,
454 num_direct_distance_codes,
455 distance_postfix_bits,
456 &cmd->distance_prefix_,
457 &cmd->distance_extra_bits_,
458 &cmd->distance_extra_bits_value_);
459 }
460 if (cmd->command_prefix_ < 128 && cmd->distance_prefix_ == 0) {
461 cmd->distance_prefix_ = 0xffff;
462 } else {
463 cmd->command_prefix_ += 128;
464 }
465 }
466 }
467
IndexOf(const std::vector<int> & v,int value)468 int IndexOf(const std::vector<int>& v, int value) {
469 for (int i = 0; i < v.size(); ++i) {
470 if (v[i] == value) return i;
471 }
472 return -1;
473 }
474
MoveToFront(std::vector<int> * v,int index)475 void MoveToFront(std::vector<int>* v, int index) {
476 int value = (*v)[index];
477 for (int i = index; i > 0; --i) {
478 (*v)[i] = (*v)[i - 1];
479 }
480 (*v)[0] = value;
481 }
482
MoveToFrontTransform(const std::vector<int> & v)483 std::vector<int> MoveToFrontTransform(const std::vector<int>& v) {
484 if (v.empty()) return v;
485 std::vector<int> mtf(*max_element(v.begin(), v.end()) + 1);
486 for (int i = 0; i < mtf.size(); ++i) mtf[i] = i;
487 std::vector<int> result(v.size());
488 for (int i = 0; i < v.size(); ++i) {
489 int index = IndexOf(mtf, v[i]);
490 result[i] = index;
491 MoveToFront(&mtf, index);
492 }
493 return result;
494 }
495
496 // Finds runs of zeros in v_in and replaces them with a prefix code of the run
497 // length plus extra bits in *v_out and *extra_bits. Non-zero values in v_in are
498 // shifted by *max_length_prefix. Will not create prefix codes bigger than the
499 // initial value of *max_run_length_prefix. The prefix code of run length L is
500 // simply Log2Floor(L) and the number of extra bits is the same as the prefix
501 // code.
RunLengthCodeZeros(const std::vector<int> & v_in,int * max_run_length_prefix,std::vector<int> * v_out,std::vector<int> * extra_bits)502 void RunLengthCodeZeros(const std::vector<int>& v_in,
503 int* max_run_length_prefix,
504 std::vector<int>* v_out,
505 std::vector<int>* extra_bits) {
506 int max_reps = 0;
507 for (int i = 0; i < v_in.size();) {
508 for (; i < v_in.size() && v_in[i] != 0; ++i) ;
509 int reps = 0;
510 for (; i < v_in.size() && v_in[i] == 0; ++i) {
511 ++reps;
512 }
513 max_reps = std::max(reps, max_reps);
514 }
515 int max_prefix = max_reps > 0 ? Log2Floor(max_reps) : 0;
516 *max_run_length_prefix = std::min(max_prefix, *max_run_length_prefix);
517 for (int i = 0; i < v_in.size();) {
518 if (v_in[i] != 0) {
519 v_out->push_back(v_in[i] + *max_run_length_prefix);
520 extra_bits->push_back(0);
521 ++i;
522 } else {
523 int reps = 1;
524 for (uint32_t k = i + 1; k < v_in.size() && v_in[k] == 0; ++k) {
525 ++reps;
526 }
527 i += reps;
528 while (reps) {
529 if (reps < (2 << *max_run_length_prefix)) {
530 int run_length_prefix = Log2Floor(reps);
531 v_out->push_back(run_length_prefix);
532 extra_bits->push_back(reps - (1 << run_length_prefix));
533 break;
534 } else {
535 v_out->push_back(*max_run_length_prefix);
536 extra_bits->push_back((1 << *max_run_length_prefix) - 1);
537 reps -= (2 << *max_run_length_prefix) - 1;
538 }
539 }
540 }
541 }
542 }
543
544 // Returns a maximum zero-run-length-prefix value such that run-length coding
545 // zeros in v with this maximum prefix value and then encoding the resulting
546 // histogram and entropy-coding v produces the least amount of bits.
BestMaxZeroRunLengthPrefix(const std::vector<int> & v)547 int BestMaxZeroRunLengthPrefix(const std::vector<int>& v) {
548 int min_cost = std::numeric_limits<int>::max();
549 int best_max_prefix = 0;
550 for (int max_prefix = 0; max_prefix <= 16; ++max_prefix) {
551 std::vector<int> rle_symbols;
552 std::vector<int> extra_bits;
553 int max_run_length_prefix = max_prefix;
554 RunLengthCodeZeros(v, &max_run_length_prefix, &rle_symbols, &extra_bits);
555 if (max_run_length_prefix < max_prefix) break;
556 HistogramContextMap histogram;
557 for (int i = 0; i < rle_symbols.size(); ++i) {
558 histogram.Add(rle_symbols[i]);
559 }
560 int bit_cost = PopulationCost(histogram);
561 if (max_prefix > 0) {
562 bit_cost += 4;
563 }
564 for (int i = 1; i <= max_prefix; ++i) {
565 bit_cost += histogram.data_[i] * i; // extra bits
566 }
567 if (bit_cost < min_cost) {
568 min_cost = bit_cost;
569 best_max_prefix = max_prefix;
570 }
571 }
572 return best_max_prefix;
573 }
574
EncodeContextMap(const std::vector<int> & context_map,int num_clusters,int * storage_ix,uint8_t * storage)575 void EncodeContextMap(const std::vector<int>& context_map,
576 int num_clusters,
577 int* storage_ix, uint8_t* storage) {
578 EncodeVarLenUint8(num_clusters - 1, storage_ix, storage);
579
580 if (num_clusters == 1) {
581 return;
582 }
583
584 std::vector<int> transformed_symbols = MoveToFrontTransform(context_map);
585 std::vector<int> rle_symbols;
586 std::vector<int> extra_bits;
587 int max_run_length_prefix = BestMaxZeroRunLengthPrefix(transformed_symbols);
588 RunLengthCodeZeros(transformed_symbols, &max_run_length_prefix,
589 &rle_symbols, &extra_bits);
590 HistogramContextMap symbol_histogram;
591 for (int i = 0; i < rle_symbols.size(); ++i) {
592 symbol_histogram.Add(rle_symbols[i]);
593 }
594 bool use_rle = max_run_length_prefix > 0;
595 WriteBits(1, use_rle, storage_ix, storage);
596 if (use_rle) {
597 WriteBits(4, max_run_length_prefix - 1, storage_ix, storage);
598 }
599 EntropyCodeContextMap symbol_code;
600 BuildAndStoreEntropyCode(symbol_histogram, 15,
601 num_clusters + max_run_length_prefix,
602 &symbol_code,
603 storage_ix, storage);
604 for (int i = 0; i < rle_symbols.size(); ++i) {
605 WriteBits(symbol_code.depth_[rle_symbols[i]],
606 symbol_code.bits_[rle_symbols[i]],
607 storage_ix, storage);
608 if (rle_symbols[i] > 0 && rle_symbols[i] <= max_run_length_prefix) {
609 WriteBits(rle_symbols[i], extra_bits[i], storage_ix, storage);
610 }
611 }
612 WriteBits(1, 1, storage_ix, storage); // use move-to-front
613 }
614
615 struct BlockSplitCode {
616 EntropyCodeBlockType block_type_code;
617 EntropyCodeBlockLength block_len_code;
618 };
619
EncodeBlockLength(const EntropyCodeBlockLength & entropy,int length,int * storage_ix,uint8_t * storage)620 void EncodeBlockLength(const EntropyCodeBlockLength& entropy,
621 int length,
622 int* storage_ix, uint8_t* storage) {
623 int len_code = BlockLengthPrefix(length);
624 int extra_bits = BlockLengthExtraBits(len_code);
625 int extra_bits_value = length - BlockLengthOffset(len_code);
626 WriteBits(entropy.depth_[len_code], entropy.bits_[len_code],
627 storage_ix, storage);
628 if (extra_bits > 0) {
629 WriteBits(extra_bits, extra_bits_value, storage_ix, storage);
630 }
631 }
632
ComputeBlockTypeShortCodes(BlockSplit * split)633 void ComputeBlockTypeShortCodes(BlockSplit* split) {
634 if (split->num_types_ <= 1) {
635 split->num_types_ = 1;
636 return;
637 }
638 int ringbuffer[2] = { 0, 1 };
639 size_t index = 0;
640 for (int i = 0; i < split->types_.size(); ++i) {
641 int type = split->types_[i];
642 int type_code;
643 if (type == ringbuffer[index & 1]) {
644 type_code = 0;
645 } else if (type == ringbuffer[(index - 1) & 1] + 1) {
646 type_code = 1;
647 } else {
648 type_code = type + 2;
649 }
650 ringbuffer[index & 1] = type;
651 ++index;
652 split->type_codes_.push_back(type_code);
653 }
654 }
655
BuildAndEncodeBlockSplitCode(const BlockSplit & split,BlockSplitCode * code,int * storage_ix,uint8_t * storage)656 void BuildAndEncodeBlockSplitCode(const BlockSplit& split,
657 BlockSplitCode* code,
658 int* storage_ix, uint8_t* storage) {
659 EncodeVarLenUint8(split.num_types_ - 1, storage_ix, storage);
660
661 if (split.num_types_ == 1) {
662 return;
663 }
664
665 HistogramBlockType type_histo;
666 for (int i = 1; i < split.type_codes_.size(); ++i) {
667 type_histo.Add(split.type_codes_[i]);
668 }
669 HistogramBlockLength length_histo;
670 for (int i = 0; i < split.lengths_.size(); ++i) {
671 length_histo.Add(BlockLengthPrefix(split.lengths_[i]));
672 }
673 BuildAndStoreEntropyCode(type_histo, 15, split.num_types_ + 2,
674 &code->block_type_code,
675 storage_ix, storage);
676 BuildAndStoreEntropyCode(length_histo, 15, kNumBlockLenPrefixes,
677 &code->block_len_code,
678 storage_ix, storage);
679 EncodeBlockLength(code->block_len_code, split.lengths_[0],
680 storage_ix, storage);
681 }
682
MoveAndEncode(const BlockSplitCode & code,BlockSplitIterator * it,int * storage_ix,uint8_t * storage)683 void MoveAndEncode(const BlockSplitCode& code,
684 BlockSplitIterator* it,
685 int* storage_ix, uint8_t* storage) {
686 if (it->length_ == 0) {
687 ++it->idx_;
688 it->type_ = it->split_.types_[it->idx_];
689 it->length_ = it->split_.lengths_[it->idx_];
690 int type_code = it->split_.type_codes_[it->idx_];
691 WriteBits(code.block_type_code.depth_[type_code],
692 code.block_type_code.bits_[type_code],
693 storage_ix, storage);
694 EncodeBlockLength(code.block_len_code, it->length_, storage_ix, storage);
695 }
696 --it->length_;
697 }
698
699 struct EncodingParams {
700 int num_direct_distance_codes;
701 int distance_postfix_bits;
702 int literal_context_mode;
703 };
704
705 struct MetaBlock {
706 std::vector<Command> cmds;
707 EncodingParams params;
708 BlockSplit literal_split;
709 BlockSplit command_split;
710 BlockSplit distance_split;
711 std::vector<int> literal_context_modes;
712 std::vector<int> literal_context_map;
713 std::vector<int> distance_context_map;
714 std::vector<HistogramLiteral> literal_histograms;
715 std::vector<HistogramCommand> command_histograms;
716 std::vector<HistogramDistance> distance_histograms;
717 };
718
BuildMetaBlock(const EncodingParams & params,const std::vector<Command> & cmds,const uint8_t * ringbuffer,const size_t pos,const size_t mask,MetaBlock * mb)719 void BuildMetaBlock(const EncodingParams& params,
720 const std::vector<Command>& cmds,
721 const uint8_t* ringbuffer,
722 const size_t pos,
723 const size_t mask,
724 MetaBlock* mb) {
725 mb->cmds = cmds;
726 mb->params = params;
727 if (cmds.empty()) {
728 return;
729 }
730 ComputeCommandPrefixes(&mb->cmds,
731 mb->params.num_direct_distance_codes,
732 mb->params.distance_postfix_bits);
733 SplitBlock(mb->cmds,
734 &ringbuffer[pos & mask],
735 &mb->literal_split,
736 &mb->command_split,
737 &mb->distance_split);
738 ComputeBlockTypeShortCodes(&mb->literal_split);
739 ComputeBlockTypeShortCodes(&mb->command_split);
740 ComputeBlockTypeShortCodes(&mb->distance_split);
741
742 mb->literal_context_modes.resize(mb->literal_split.num_types_,
743 mb->params.literal_context_mode);
744
745
746 int num_literal_contexts =
747 mb->literal_split.num_types_ << kLiteralContextBits;
748 int num_distance_contexts =
749 mb->distance_split.num_types_ << kDistanceContextBits;
750 std::vector<HistogramLiteral> literal_histograms(num_literal_contexts);
751 mb->command_histograms.resize(mb->command_split.num_types_);
752 std::vector<HistogramDistance> distance_histograms(num_distance_contexts);
753 BuildHistograms(mb->cmds,
754 mb->literal_split,
755 mb->command_split,
756 mb->distance_split,
757 ringbuffer,
758 pos,
759 mask,
760 mb->literal_context_modes,
761 &literal_histograms,
762 &mb->command_histograms,
763 &distance_histograms);
764
765 // Histogram ids need to fit in one byte.
766 static const int kMaxNumberOfHistograms = 256;
767
768 mb->literal_histograms = literal_histograms;
769 ClusterHistograms(literal_histograms,
770 1 << kLiteralContextBits,
771 mb->literal_split.num_types_,
772 kMaxNumberOfHistograms,
773 &mb->literal_histograms,
774 &mb->literal_context_map);
775
776 mb->distance_histograms = distance_histograms;
777 ClusterHistograms(distance_histograms,
778 1 << kDistanceContextBits,
779 mb->distance_split.num_types_,
780 kMaxNumberOfHistograms,
781 &mb->distance_histograms,
782 &mb->distance_context_map);
783 }
784
MetaBlockLength(const std::vector<Command> & cmds)785 size_t MetaBlockLength(const std::vector<Command>& cmds) {
786 size_t length = 0;
787 for (int i = 0; i < cmds.size(); ++i) {
788 const Command& cmd = cmds[i];
789 length += cmd.insert_length_ + cmd.copy_length_;
790 }
791 return length;
792 }
793
StoreMetaBlock(const MetaBlock & mb,const bool is_last,const uint8_t * ringbuffer,const size_t mask,size_t * pos,int * storage_ix,uint8_t * storage)794 void StoreMetaBlock(const MetaBlock& mb,
795 const bool is_last,
796 const uint8_t* ringbuffer,
797 const size_t mask,
798 size_t* pos,
799 int* storage_ix, uint8_t* storage) {
800 size_t length = MetaBlockLength(mb.cmds);
801 const size_t end_pos = *pos + length;
802 EncodeMetaBlockLength(length, is_last, false, storage_ix, storage);
803
804 if (length == 0) {
805 return;
806 }
807 BlockSplitCode literal_split_code;
808 BlockSplitCode command_split_code;
809 BlockSplitCode distance_split_code;
810 BuildAndEncodeBlockSplitCode(mb.literal_split, &literal_split_code,
811 storage_ix, storage);
812 BuildAndEncodeBlockSplitCode(mb.command_split, &command_split_code,
813 storage_ix, storage);
814 BuildAndEncodeBlockSplitCode(mb.distance_split, &distance_split_code,
815 storage_ix, storage);
816 WriteBits(2, mb.params.distance_postfix_bits, storage_ix, storage);
817 WriteBits(4,
818 mb.params.num_direct_distance_codes >>
819 mb.params.distance_postfix_bits,
820 storage_ix, storage);
821 int num_distance_codes =
822 kNumDistanceShortCodes + mb.params.num_direct_distance_codes +
823 (48 << mb.params.distance_postfix_bits);
824 for (int i = 0; i < mb.literal_split.num_types_; ++i) {
825 WriteBits(2, mb.literal_context_modes[i], storage_ix, storage);
826 }
827 EncodeContextMap(mb.literal_context_map, mb.literal_histograms.size(),
828 storage_ix, storage);
829 EncodeContextMap(mb.distance_context_map, mb.distance_histograms.size(),
830 storage_ix, storage);
831 std::vector<EntropyCodeLiteral> literal_codes;
832 std::vector<EntropyCodeCommand> command_codes;
833 std::vector<EntropyCodeDistance> distance_codes;
834 BuildAndStoreEntropyCodes(mb.literal_histograms, 256, &literal_codes,
835 storage_ix, storage);
836 BuildAndStoreEntropyCodes(mb.command_histograms, kNumCommandPrefixes,
837 &command_codes, storage_ix, storage);
838 BuildAndStoreEntropyCodes(mb.distance_histograms, num_distance_codes,
839 &distance_codes, storage_ix, storage);
840 BlockSplitIterator literal_it(mb.literal_split);
841 BlockSplitIterator command_it(mb.command_split);
842 BlockSplitIterator distance_it(mb.distance_split);
843 for (int i = 0; i < mb.cmds.size(); ++i) {
844 const Command& cmd = mb.cmds[i];
845 MoveAndEncode(command_split_code, &command_it, storage_ix, storage);
846 EncodeCommand(cmd, command_codes[command_it.type_], storage_ix, storage);
847 for (int j = 0; j < cmd.insert_length_; ++j) {
848 MoveAndEncode(literal_split_code, &literal_it, storage_ix, storage);
849 int histogram_idx = literal_it.type_;
850 uint8_t prev_byte = *pos > 0 ? ringbuffer[(*pos - 1) & mask] : 0;
851 uint8_t prev_byte2 = *pos > 1 ? ringbuffer[(*pos - 2) & mask] : 0;
852 int context = ((literal_it.type_ << kLiteralContextBits) +
853 Context(prev_byte, prev_byte2,
854 mb.literal_context_modes[literal_it.type_]));
855 histogram_idx = mb.literal_context_map[context];
856 int literal = ringbuffer[*pos & mask];
857 WriteBits(literal_codes[histogram_idx].depth_[literal],
858 literal_codes[histogram_idx].bits_[literal],
859 storage_ix, storage);
860 ++(*pos);
861 }
862 if (*pos < end_pos && cmd.distance_prefix_ != 0xffff) {
863 MoveAndEncode(distance_split_code, &distance_it, storage_ix, storage);
864 int context = (distance_it.type_ << 2) +
865 ((cmd.copy_length_code_ > 4) ? 3 : cmd.copy_length_code_ - 2);
866 int histogram_index = mb.distance_context_map[context];
867 size_t max_distance = std::min(*pos, (size_t)kMaxBackwardDistance);
868 EncodeCopyDistance(cmd, distance_codes[histogram_index],
869 storage_ix, storage);
870 }
871 *pos += cmd.copy_length_;
872 }
873 }
874
BrotliCompressor(BrotliParams params)875 BrotliCompressor::BrotliCompressor(BrotliParams params)
876 : params_(params),
877 window_bits_(kWindowBits),
878 hashers_(new Hashers()),
879 dist_ringbuffer_idx_(0),
880 input_pos_(0),
881 ringbuffer_(kRingBufferBits, kMetaBlockSizeBits),
882 literal_cost_(1 << kRingBufferBits),
883 storage_ix_(0),
884 storage_(new uint8_t[2 << kMetaBlockSizeBits]) {
885 dist_ringbuffer_[0] = 16;
886 dist_ringbuffer_[1] = 15;
887 dist_ringbuffer_[2] = 11;
888 dist_ringbuffer_[3] = 4;
889 storage_[0] = 0;
890 switch (params.mode) {
891 case BrotliParams::MODE_TEXT: hash_type_ = Hashers::HASH_15_8_4; break;
892 case BrotliParams::MODE_FONT: hash_type_ = Hashers::HASH_15_8_2; break;
893 default: break;
894 }
895 hashers_->Init(hash_type_);
896 if (params.mode == BrotliParams::MODE_TEXT) {
897 StoreDictionaryWordHashes();
898 }
899 }
900
~BrotliCompressor()901 BrotliCompressor::~BrotliCompressor() {
902 delete[] storage_;
903 }
904
905 StaticDictionary *BrotliCompressor::static_dictionary_ = NULL;
906
StoreDictionaryWordHashes()907 void BrotliCompressor::StoreDictionaryWordHashes() {
908 const int num_transforms = kNumTransforms;
909 if (static_dictionary_ == NULL) {
910 static_dictionary_ = new StaticDictionary;
911 for (int t = num_transforms - 1; t >= 0; --t) {
912 for (int i = kMaxDictionaryWordLength;
913 i >= kMinDictionaryWordLength; --i) {
914 const int num_words = 1 << kBrotliDictionarySizeBitsByLength[i];
915 for (int j = num_words - 1; j >= 0; --j) {
916 int word_id = t * num_words + j;
917 std::string word = GetTransformedDictionaryWord(i, word_id);
918 if (word.size() >= 4) {
919 static_dictionary_->Insert(word, i, word_id);
920 }
921 }
922 }
923 }
924 }
925 hashers_->SetStaticDictionary(static_dictionary_);
926 }
927
WriteStreamHeader()928 void BrotliCompressor::WriteStreamHeader() {
929 // Encode window size.
930 if (window_bits_ == 16) {
931 WriteBits(1, 0, &storage_ix_, storage_);
932 } else {
933 WriteBits(1, 1, &storage_ix_, storage_);
934 WriteBits(3, window_bits_ - 17, &storage_ix_, storage_);
935 }
936 }
937
WriteMetaBlock(const size_t input_size,const uint8_t * input_buffer,const bool is_last,size_t * encoded_size,uint8_t * encoded_buffer)938 void BrotliCompressor::WriteMetaBlock(const size_t input_size,
939 const uint8_t* input_buffer,
940 const bool is_last,
941 size_t* encoded_size,
942 uint8_t* encoded_buffer) {
943 static const double kMinUTF8Ratio = 0.75;
944 bool utf8_mode = false;
945 std::vector<Command> commands;
946 if (input_size > 0) {
947 ringbuffer_.Write(input_buffer, input_size);
948 utf8_mode = IsMostlyUTF8(
949 &ringbuffer_.start()[input_pos_ & kRingBufferMask],
950 input_size, kMinUTF8Ratio);
951 if (utf8_mode) {
952 EstimateBitCostsForLiteralsUTF8(input_pos_, input_size,
953 kRingBufferMask, kRingBufferMask,
954 ringbuffer_.start(), &literal_cost_[0]);
955 } else {
956 EstimateBitCostsForLiterals(input_pos_, input_size,
957 kRingBufferMask, kRingBufferMask,
958 ringbuffer_.start(), &literal_cost_[0]);
959 }
960 CreateBackwardReferences(
961 input_size, input_pos_,
962 ringbuffer_.start(),
963 &literal_cost_[0],
964 kRingBufferMask, kMaxBackwardDistance,
965 hashers_.get(),
966 hash_type_,
967 &commands);
968 ComputeDistanceShortCodes(&commands, input_pos_, kMaxBackwardDistance,
969 dist_ringbuffer_,
970 &dist_ringbuffer_idx_);
971 }
972 EncodingParams params;
973 params.num_direct_distance_codes =
974 params_.mode == BrotliParams::MODE_FONT ? 12 : 0;
975 params.distance_postfix_bits =
976 params_.mode == BrotliParams::MODE_FONT ? 1 : 0;
977 params.literal_context_mode = CONTEXT_SIGNED;
978 const int storage_ix0 = storage_ix_;
979 MetaBlock mb;
980 BuildMetaBlock(params, commands, ringbuffer_.start(), input_pos_,
981 kRingBufferMask, &mb);
982 StoreMetaBlock(mb, is_last, ringbuffer_.start(), kRingBufferMask,
983 &input_pos_, &storage_ix_, storage_);
984 size_t output_size = is_last ? ((storage_ix_ + 7) >> 3) : (storage_ix_ >> 3);
985 output_size -= (storage_ix0 >> 3);
986 if (input_size + 4 < output_size) {
987 storage_ix_ = storage_ix0;
988 storage_[storage_ix_ >> 3] &= (1 << (storage_ix_ & 7)) - 1;
989 EncodeMetaBlockLength(input_size, false, true, &storage_ix_, storage_);
990 size_t hdr_size = (storage_ix_ + 7) >> 3;
991 memcpy(encoded_buffer, storage_, hdr_size);
992 memcpy(encoded_buffer + hdr_size, input_buffer, input_size);
993 *encoded_size = hdr_size + input_size;
994 if (is_last) {
995 encoded_buffer[*encoded_size] = 0x3; // ISLAST, ISEMPTY
996 ++(*encoded_size);
997 }
998 storage_ix_ = 0;
999 storage_[0] = 0;
1000 } else {
1001 memcpy(encoded_buffer, storage_, output_size);
1002 *encoded_size = output_size;
1003 if (is_last) {
1004 storage_ix_ = 0;
1005 storage_[0] = 0;
1006 } else {
1007 storage_ix_ -= output_size << 3;
1008 storage_[storage_ix_ >> 3] = storage_[output_size];
1009 }
1010 }
1011 }
1012
FinishStream(size_t * encoded_size,uint8_t * encoded_buffer)1013 void BrotliCompressor::FinishStream(
1014 size_t* encoded_size, uint8_t* encoded_buffer) {
1015 WriteMetaBlock(0, NULL, true, encoded_size, encoded_buffer);
1016 }
1017
1018
BrotliCompressBuffer(BrotliParams params,size_t input_size,const uint8_t * input_buffer,size_t * encoded_size,uint8_t * encoded_buffer)1019 int BrotliCompressBuffer(BrotliParams params,
1020 size_t input_size,
1021 const uint8_t* input_buffer,
1022 size_t* encoded_size,
1023 uint8_t* encoded_buffer) {
1024 if (input_size == 0) {
1025 encoded_buffer[0] = 6;
1026 *encoded_size = 1;
1027 return 1;
1028 }
1029
1030 BrotliCompressor compressor(params);
1031 compressor.WriteStreamHeader();
1032
1033 const int max_block_size = 1 << kMetaBlockSizeBits;
1034 size_t max_output_size = *encoded_size;
1035 const uint8_t* input_end = input_buffer + input_size;
1036 *encoded_size = 0;
1037
1038 while (input_buffer < input_end) {
1039 int block_size = max_block_size;
1040 bool is_last = false;
1041 if (block_size >= input_end - input_buffer) {
1042 block_size = input_end - input_buffer;
1043 is_last = true;
1044 }
1045 size_t output_size = max_output_size;
1046 compressor.WriteMetaBlock(block_size, input_buffer, is_last,
1047 &output_size, &encoded_buffer[*encoded_size]);
1048 input_buffer += block_size;
1049 *encoded_size += output_size;
1050 max_output_size -= output_size;
1051 }
1052
1053 return 1;
1054 }
1055
1056 } // namespace brotli
1057