• Home
  • Line#
  • Scopes#
  • Navigate#
  • Raw
  • Download
1 /* Copyright 2013 Google Inc. All Rights Reserved.
2 
3    Distributed under MIT license.
4    See file LICENSE for detail or copy at https://opensource.org/licenses/MIT
5 */
6 
7 /* Literal cost model to allow backward reference replacement to be efficient.
8 */
9 
10 #include "literal_cost.h"
11 
12 #include <string.h>  /* memset */
13 
14 #include <brotli/types.h>
15 
16 #include "../common/platform.h"
17 #include "fast_log.h"
18 #include "utf8_util.h"
19 
20 #if defined(__cplusplus) || defined(c_plusplus)
21 extern "C" {
22 #endif
23 
UTF8Position(size_t last,size_t c,size_t clamp)24 static size_t UTF8Position(size_t last, size_t c, size_t clamp) {
25   if (c < 128) {
26     return 0;  /* Next one is the 'Byte 1' again. */
27   } else if (c >= 192) {  /* Next one is the 'Byte 2' of utf-8 encoding. */
28     return BROTLI_MIN(size_t, 1, clamp);
29   } else {
30     /* Let's decide over the last byte if this ends the sequence. */
31     if (last < 0xE0) {
32       return 0;  /* Completed two or three byte coding. */
33     } else {  /* Next one is the 'Byte 3' of utf-8 encoding. */
34       return BROTLI_MIN(size_t, 2, clamp);
35     }
36   }
37 }
38 
DecideMultiByteStatsLevel(size_t pos,size_t len,size_t mask,const uint8_t * data)39 static size_t DecideMultiByteStatsLevel(size_t pos, size_t len, size_t mask,
40                                         const uint8_t* data) {
41   size_t counts[3] = { 0 };
42   size_t max_utf8 = 1;  /* should be 2, but 1 compresses better. */
43   size_t last_c = 0;
44   size_t i;
45   for (i = 0; i < len; ++i) {
46     size_t c = data[(pos + i) & mask];
47     ++counts[UTF8Position(last_c, c, 2)];
48     last_c = c;
49   }
50   if (counts[2] < 500) {
51     max_utf8 = 1;
52   }
53   if (counts[1] + counts[2] < 25) {
54     max_utf8 = 0;
55   }
56   return max_utf8;
57 }
58 
EstimateBitCostsForLiteralsUTF8(size_t pos,size_t len,size_t mask,const uint8_t * data,size_t * histogram,float * cost)59 static void EstimateBitCostsForLiteralsUTF8(size_t pos, size_t len, size_t mask,
60                                             const uint8_t* data,
61                                             size_t* histogram, float* cost) {
62   /* max_utf8 is 0 (normal ASCII single byte modeling),
63      1 (for 2-byte UTF-8 modeling), or 2 (for 3-byte UTF-8 modeling). */
64   const size_t max_utf8 = DecideMultiByteStatsLevel(pos, len, mask, data);
65   size_t window_half = 495;
66   size_t in_window = BROTLI_MIN(size_t, window_half, len);
67   size_t in_window_utf8[3] = { 0 };
68   size_t i;
69   memset(histogram, 0, 3 * 256 * sizeof(histogram[0]));
70 
71   {  /* Bootstrap histograms. */
72     size_t last_c = 0;
73     size_t utf8_pos = 0;
74     for (i = 0; i < in_window; ++i) {
75       size_t c = data[(pos + i) & mask];
76       ++histogram[256 * utf8_pos + c];
77       ++in_window_utf8[utf8_pos];
78       utf8_pos = UTF8Position(last_c, c, max_utf8);
79       last_c = c;
80     }
81   }
82 
83   /* Compute bit costs with sliding window. */
84   for (i = 0; i < len; ++i) {
85     if (i >= window_half) {
86       /* Remove a byte in the past. */
87       size_t c =
88           i < window_half + 1 ? 0 : data[(pos + i - window_half - 1) & mask];
89       size_t last_c =
90           i < window_half + 2 ? 0 : data[(pos + i - window_half - 2) & mask];
91       size_t utf8_pos2 = UTF8Position(last_c, c, max_utf8);
92       --histogram[256 * utf8_pos2 + data[(pos + i - window_half) & mask]];
93       --in_window_utf8[utf8_pos2];
94     }
95     if (i + window_half < len) {
96       /* Add a byte in the future. */
97       size_t c = data[(pos + i + window_half - 1) & mask];
98       size_t last_c = data[(pos + i + window_half - 2) & mask];
99       size_t utf8_pos2 = UTF8Position(last_c, c, max_utf8);
100       ++histogram[256 * utf8_pos2 + data[(pos + i + window_half) & mask]];
101       ++in_window_utf8[utf8_pos2];
102     }
103     {
104       size_t c = i < 1 ? 0 : data[(pos + i - 1) & mask];
105       size_t last_c = i < 2 ? 0 : data[(pos + i - 2) & mask];
106       size_t utf8_pos = UTF8Position(last_c, c, max_utf8);
107       size_t masked_pos = (pos + i) & mask;
108       size_t histo = histogram[256 * utf8_pos + data[masked_pos]];
109       double lit_cost;
110       if (histo == 0) {
111         histo = 1;
112       }
113       lit_cost = FastLog2(in_window_utf8[utf8_pos]) - FastLog2(histo);
114       lit_cost += 0.02905;
115       if (lit_cost < 1.0) {
116         lit_cost *= 0.5;
117         lit_cost += 0.5;
118       }
119       /* Make the first bytes more expensive -- seems to help, not sure why.
120          Perhaps because the entropy source is changing its properties
121          rapidly in the beginning of the file, perhaps because the beginning
122          of the data is a statistical "anomaly". */
123       if (i < 2000) {
124         lit_cost += 0.7 - ((double)(2000 - i) / 2000.0 * 0.35);
125       }
126       cost[i] = (float)lit_cost;
127     }
128   }
129 }
130 
BrotliEstimateBitCostsForLiterals(size_t pos,size_t len,size_t mask,const uint8_t * data,size_t * histogram,float * cost)131 void BrotliEstimateBitCostsForLiterals(size_t pos, size_t len, size_t mask,
132                                        const uint8_t* data,
133                                        size_t* histogram, float* cost) {
134   if (BrotliIsMostlyUTF8(data, pos, mask, len, kMinUTF8Ratio)) {
135     EstimateBitCostsForLiteralsUTF8(pos, len, mask, data, histogram, cost);
136     return;
137   } else {
138     size_t window_half = 2000;
139     size_t in_window = BROTLI_MIN(size_t, window_half, len);
140     size_t i;
141     memset(histogram, 0, 256 * sizeof(histogram[0]));
142 
143     /* Bootstrap histogram. */
144     for (i = 0; i < in_window; ++i) {
145       ++histogram[data[(pos + i) & mask]];
146     }
147 
148     /* Compute bit costs with sliding window. */
149     for (i = 0; i < len; ++i) {
150       size_t histo;
151       if (i >= window_half) {
152         /* Remove a byte in the past. */
153         --histogram[data[(pos + i - window_half) & mask]];
154         --in_window;
155       }
156       if (i + window_half < len) {
157         /* Add a byte in the future. */
158         ++histogram[data[(pos + i + window_half) & mask]];
159         ++in_window;
160       }
161       histo = histogram[data[(pos + i) & mask]];
162       if (histo == 0) {
163         histo = 1;
164       }
165       {
166         double lit_cost = FastLog2(in_window) - FastLog2(histo);
167         lit_cost += 0.029;
168         if (lit_cost < 1.0) {
169           lit_cost *= 0.5;
170           lit_cost += 0.5;
171         }
172         cost[i] = (float)lit_cost;
173       }
174     }
175   }
176 }
177 
178 #if defined(__cplusplus) || defined(c_plusplus)
179 }  /* extern "C" */
180 #endif
181