• Home
  • Line#
  • Scopes#
  • Navigate#
  • Raw
  • Download
1 /* Copyright 2013 Google Inc. All Rights Reserved.
2 
3    Distributed under MIT license.
4    See file LICENSE for detail or copy at https://opensource.org/licenses/MIT
5 */
6 
7 /* Literal cost model to allow backward reference replacement to be efficient.
8 */
9 
10 #include "literal_cost.h"
11 
12 #include <string.h>  /* memset */
13 
14 #include "../common/platform.h"
15 #include <brotli/types.h>
16 #include "fast_log.h"
17 #include "utf8_util.h"
18 
19 #if defined(__cplusplus) || defined(c_plusplus)
20 extern "C" {
21 #endif
22 
UTF8Position(size_t last,size_t c,size_t clamp)23 static size_t UTF8Position(size_t last, size_t c, size_t clamp) {
24   if (c < 128) {
25     return 0;  /* Next one is the 'Byte 1' again. */
26   } else if (c >= 192) {  /* Next one is the 'Byte 2' of utf-8 encoding. */
27     return BROTLI_MIN(size_t, 1, clamp);
28   } else {
29     /* Let's decide over the last byte if this ends the sequence. */
30     if (last < 0xE0) {
31       return 0;  /* Completed two or three byte coding. */
32     } else {  /* Next one is the 'Byte 3' of utf-8 encoding. */
33       return BROTLI_MIN(size_t, 2, clamp);
34     }
35   }
36 }
37 
DecideMultiByteStatsLevel(size_t pos,size_t len,size_t mask,const uint8_t * data)38 static size_t DecideMultiByteStatsLevel(size_t pos, size_t len, size_t mask,
39                                         const uint8_t* data) {
40   size_t counts[3] = { 0 };
41   size_t max_utf8 = 1;  /* should be 2, but 1 compresses better. */
42   size_t last_c = 0;
43   size_t i;
44   for (i = 0; i < len; ++i) {
45     size_t c = data[(pos + i) & mask];
46     ++counts[UTF8Position(last_c, c, 2)];
47     last_c = c;
48   }
49   if (counts[2] < 500) {
50     max_utf8 = 1;
51   }
52   if (counts[1] + counts[2] < 25) {
53     max_utf8 = 0;
54   }
55   return max_utf8;
56 }
57 
EstimateBitCostsForLiteralsUTF8(size_t pos,size_t len,size_t mask,const uint8_t * data,size_t * histogram,float * cost)58 static void EstimateBitCostsForLiteralsUTF8(size_t pos, size_t len, size_t mask,
59                                             const uint8_t* data,
60                                             size_t* histogram, float* cost) {
61   /* max_utf8 is 0 (normal ASCII single byte modeling),
62      1 (for 2-byte UTF-8 modeling), or 2 (for 3-byte UTF-8 modeling). */
63   const size_t max_utf8 = DecideMultiByteStatsLevel(pos, len, mask, data);
64   size_t window_half = 495;
65   size_t in_window = BROTLI_MIN(size_t, window_half, len);
66   size_t in_window_utf8[3] = { 0 };
67   size_t i;
68   memset(histogram, 0, 3 * 256 * sizeof(histogram[0]));
69 
70   {  /* Bootstrap histograms. */
71     size_t last_c = 0;
72     size_t utf8_pos = 0;
73     for (i = 0; i < in_window; ++i) {
74       size_t c = data[(pos + i) & mask];
75       ++histogram[256 * utf8_pos + c];
76       ++in_window_utf8[utf8_pos];
77       utf8_pos = UTF8Position(last_c, c, max_utf8);
78       last_c = c;
79     }
80   }
81 
82   /* Compute bit costs with sliding window. */
83   for (i = 0; i < len; ++i) {
84     if (i >= window_half) {
85       /* Remove a byte in the past. */
86       size_t c =
87           i < window_half + 1 ? 0 : data[(pos + i - window_half - 1) & mask];
88       size_t last_c =
89           i < window_half + 2 ? 0 : data[(pos + i - window_half - 2) & mask];
90       size_t utf8_pos2 = UTF8Position(last_c, c, max_utf8);
91       --histogram[256 * utf8_pos2 + data[(pos + i - window_half) & mask]];
92       --in_window_utf8[utf8_pos2];
93     }
94     if (i + window_half < len) {
95       /* Add a byte in the future. */
96       size_t c = data[(pos + i + window_half - 1) & mask];
97       size_t last_c = data[(pos + i + window_half - 2) & mask];
98       size_t utf8_pos2 = UTF8Position(last_c, c, max_utf8);
99       ++histogram[256 * utf8_pos2 + data[(pos + i + window_half) & mask]];
100       ++in_window_utf8[utf8_pos2];
101     }
102     {
103       size_t c = i < 1 ? 0 : data[(pos + i - 1) & mask];
104       size_t last_c = i < 2 ? 0 : data[(pos + i - 2) & mask];
105       size_t utf8_pos = UTF8Position(last_c, c, max_utf8);
106       size_t masked_pos = (pos + i) & mask;
107       size_t histo = histogram[256 * utf8_pos + data[masked_pos]];
108       double lit_cost;
109       if (histo == 0) {
110         histo = 1;
111       }
112       lit_cost = FastLog2(in_window_utf8[utf8_pos]) - FastLog2(histo);
113       lit_cost += 0.02905;
114       if (lit_cost < 1.0) {
115         lit_cost *= 0.5;
116         lit_cost += 0.5;
117       }
118       /* Make the first bytes more expensive -- seems to help, not sure why.
119          Perhaps because the entropy source is changing its properties
120          rapidly in the beginning of the file, perhaps because the beginning
121          of the data is a statistical "anomaly". */
122       if (i < 2000) {
123         lit_cost += 0.7 - ((double)(2000 - i) / 2000.0 * 0.35);
124       }
125       cost[i] = (float)lit_cost;
126     }
127   }
128 }
129 
BrotliEstimateBitCostsForLiterals(size_t pos,size_t len,size_t mask,const uint8_t * data,size_t * histogram,float * cost)130 void BrotliEstimateBitCostsForLiterals(size_t pos, size_t len, size_t mask,
131                                        const uint8_t* data,
132                                        size_t* histogram, float* cost) {
133   if (BrotliIsMostlyUTF8(data, pos, mask, len, kMinUTF8Ratio)) {
134     EstimateBitCostsForLiteralsUTF8(pos, len, mask, data, histogram, cost);
135     return;
136   } else {
137     size_t window_half = 2000;
138     size_t in_window = BROTLI_MIN(size_t, window_half, len);
139     size_t i;
140     memset(histogram, 0, 256 * sizeof(histogram[0]));
141 
142     /* Bootstrap histogram. */
143     for (i = 0; i < in_window; ++i) {
144       ++histogram[data[(pos + i) & mask]];
145     }
146 
147     /* Compute bit costs with sliding window. */
148     for (i = 0; i < len; ++i) {
149       size_t histo;
150       if (i >= window_half) {
151         /* Remove a byte in the past. */
152         --histogram[data[(pos + i - window_half) & mask]];
153         --in_window;
154       }
155       if (i + window_half < len) {
156         /* Add a byte in the future. */
157         ++histogram[data[(pos + i + window_half) & mask]];
158         ++in_window;
159       }
160       histo = histogram[data[(pos + i) & mask]];
161       if (histo == 0) {
162         histo = 1;
163       }
164       {
165         double lit_cost = FastLog2(in_window) - FastLog2(histo);
166         lit_cost += 0.029;
167         if (lit_cost < 1.0) {
168           lit_cost *= 0.5;
169           lit_cost += 0.5;
170         }
171         cost[i] = (float)lit_cost;
172       }
173     }
174   }
175 }
176 
177 #if defined(__cplusplus) || defined(c_plusplus)
178 }  /* extern "C" */
179 #endif
180