1 /* Copyright 2013 Google Inc. All Rights Reserved.
2
3 Distributed under MIT license.
4 See file LICENSE for detail or copy at https://opensource.org/licenses/MIT
5 */
6
7 #include "./static_dict.h"
8
9 #include "../common/dictionary.h"
10 #include "./find_match_length.h"
11 #include "./port.h"
12 #include "./static_dict_lut.h"
13
14 #if defined(__cplusplus) || defined(c_plusplus)
15 extern "C" {
16 #endif
17
18 static const uint8_t kUppercaseFirst = 10;
19 static const uint8_t kOmitLastNTransforms[10] = {
20 0, 12, 27, 23, 42, 63, 56, 48, 59, 64,
21 };
22
Hash(const uint8_t * data)23 static BROTLI_INLINE uint32_t Hash(const uint8_t *data) {
24 uint32_t h = BROTLI_UNALIGNED_LOAD32(data) * kDictHashMul32;
25 /* The higher bits contain more mixture from the multiplication,
26 so we take our results from there. */
27 return h >> (32 - kDictNumBits);
28 }
29
AddMatch(size_t distance,size_t len,size_t len_code,uint32_t * matches)30 static BROTLI_INLINE void AddMatch(size_t distance, size_t len, size_t len_code,
31 uint32_t* matches) {
32 uint32_t match = (uint32_t)((distance << 5) + len_code);
33 matches[len] = BROTLI_MIN(uint32_t, matches[len], match);
34 }
35
DictMatchLength(const BrotliDictionary * dictionary,const uint8_t * data,size_t id,size_t len,size_t maxlen)36 static BROTLI_INLINE size_t DictMatchLength(const BrotliDictionary* dictionary,
37 const uint8_t* data,
38 size_t id,
39 size_t len,
40 size_t maxlen) {
41 const size_t offset = dictionary->offsets_by_length[len] + len * id;
42 return FindMatchLengthWithLimit(&dictionary->data[offset], data,
43 BROTLI_MIN(size_t, len, maxlen));
44 }
45
IsMatch(const BrotliDictionary * dictionary,DictWord w,const uint8_t * data,size_t max_length)46 static BROTLI_INLINE BROTLI_BOOL IsMatch(const BrotliDictionary* dictionary,
47 DictWord w, const uint8_t* data, size_t max_length) {
48 if (w.len > max_length) {
49 return BROTLI_FALSE;
50 } else {
51 const size_t offset = dictionary->offsets_by_length[w.len] +
52 (size_t)w.len * (size_t)w.idx;
53 const uint8_t* dict = &dictionary->data[offset];
54 if (w.transform == 0) {
55 /* Match against base dictionary word. */
56 return
57 TO_BROTLI_BOOL(FindMatchLengthWithLimit(dict, data, w.len) == w.len);
58 } else if (w.transform == 10) {
59 /* Match against uppercase first transform.
60 Note that there are only ASCII uppercase words in the lookup table. */
61 return TO_BROTLI_BOOL(dict[0] >= 'a' && dict[0] <= 'z' &&
62 (dict[0] ^ 32) == data[0] &&
63 FindMatchLengthWithLimit(&dict[1], &data[1], w.len - 1u) ==
64 w.len - 1u);
65 } else {
66 /* Match against uppercase all transform.
67 Note that there are only ASCII uppercase words in the lookup table. */
68 size_t i;
69 for (i = 0; i < w.len; ++i) {
70 if (dict[i] >= 'a' && dict[i] <= 'z') {
71 if ((dict[i] ^ 32) != data[i]) return BROTLI_FALSE;
72 } else {
73 if (dict[i] != data[i]) return BROTLI_FALSE;
74 }
75 }
76 return BROTLI_TRUE;
77 }
78 }
79 }
80
BrotliFindAllStaticDictionaryMatches(const BrotliDictionary * dictionary,const uint8_t * data,size_t min_length,size_t max_length,uint32_t * matches)81 BROTLI_BOOL BrotliFindAllStaticDictionaryMatches(
82 const BrotliDictionary* dictionary, const uint8_t* data, size_t min_length,
83 size_t max_length, uint32_t* matches) {
84 BROTLI_BOOL has_found_match = BROTLI_FALSE;
85 {
86 size_t offset = kStaticDictionaryBuckets[Hash(data)];
87 BROTLI_BOOL end = !offset;
88 while (!end) {
89 DictWord w = kStaticDictionaryWords[offset++];
90 const size_t l = w.len & 0x1F;
91 const size_t n = (size_t)1 << dictionary->size_bits_by_length[l];
92 const size_t id = w.idx;
93 end = !!(w.len & 0x80);
94 w.len = (uint8_t)l;
95 if (w.transform == 0) {
96 const size_t matchlen =
97 DictMatchLength(dictionary, data, id, l, max_length);
98 const uint8_t* s;
99 size_t minlen;
100 size_t maxlen;
101 size_t len;
102 /* Transform "" + kIdentity + "" */
103 if (matchlen == l) {
104 AddMatch(id, l, l, matches);
105 has_found_match = BROTLI_TRUE;
106 }
107 /* Transforms "" + kOmitLast1 + "" and "" + kOmitLast1 + "ing " */
108 if (matchlen >= l - 1) {
109 AddMatch(id + 12 * n, l - 1, l, matches);
110 if (l + 2 < max_length &&
111 data[l - 1] == 'i' && data[l] == 'n' && data[l + 1] == 'g' &&
112 data[l + 2] == ' ') {
113 AddMatch(id + 49 * n, l + 3, l, matches);
114 }
115 has_found_match = BROTLI_TRUE;
116 }
117 /* Transform "" + kOmitLastN + "" (N = 2 .. 9) */
118 minlen = min_length;
119 if (l > 9) minlen = BROTLI_MAX(size_t, minlen, l - 9);
120 maxlen = BROTLI_MIN(size_t, matchlen, l - 2);
121 for (len = minlen; len <= maxlen; ++len) {
122 AddMatch(id + kOmitLastNTransforms[l - len] * n, len, l, matches);
123 has_found_match = BROTLI_TRUE;
124 }
125 if (matchlen < l || l + 6 >= max_length) {
126 continue;
127 }
128 s = &data[l];
129 /* Transforms "" + kIdentity + <suffix> */
130 if (s[0] == ' ') {
131 AddMatch(id + n, l + 1, l, matches);
132 if (s[1] == 'a') {
133 if (s[2] == ' ') {
134 AddMatch(id + 28 * n, l + 3, l, matches);
135 } else if (s[2] == 's') {
136 if (s[3] == ' ') AddMatch(id + 46 * n, l + 4, l, matches);
137 } else if (s[2] == 't') {
138 if (s[3] == ' ') AddMatch(id + 60 * n, l + 4, l, matches);
139 } else if (s[2] == 'n') {
140 if (s[3] == 'd' && s[4] == ' ') {
141 AddMatch(id + 10 * n, l + 5, l, matches);
142 }
143 }
144 } else if (s[1] == 'b') {
145 if (s[2] == 'y' && s[3] == ' ') {
146 AddMatch(id + 38 * n, l + 4, l, matches);
147 }
148 } else if (s[1] == 'i') {
149 if (s[2] == 'n') {
150 if (s[3] == ' ') AddMatch(id + 16 * n, l + 4, l, matches);
151 } else if (s[2] == 's') {
152 if (s[3] == ' ') AddMatch(id + 47 * n, l + 4, l, matches);
153 }
154 } else if (s[1] == 'f') {
155 if (s[2] == 'o') {
156 if (s[3] == 'r' && s[4] == ' ') {
157 AddMatch(id + 25 * n, l + 5, l, matches);
158 }
159 } else if (s[2] == 'r') {
160 if (s[3] == 'o' && s[4] == 'm' && s[5] == ' ') {
161 AddMatch(id + 37 * n, l + 6, l, matches);
162 }
163 }
164 } else if (s[1] == 'o') {
165 if (s[2] == 'f') {
166 if (s[3] == ' ') AddMatch(id + 8 * n, l + 4, l, matches);
167 } else if (s[2] == 'n') {
168 if (s[3] == ' ') AddMatch(id + 45 * n, l + 4, l, matches);
169 }
170 } else if (s[1] == 'n') {
171 if (s[2] == 'o' && s[3] == 't' && s[4] == ' ') {
172 AddMatch(id + 80 * n, l + 5, l, matches);
173 }
174 } else if (s[1] == 't') {
175 if (s[2] == 'h') {
176 if (s[3] == 'e') {
177 if (s[4] == ' ') AddMatch(id + 5 * n, l + 5, l, matches);
178 } else if (s[3] == 'a') {
179 if (s[4] == 't' && s[5] == ' ') {
180 AddMatch(id + 29 * n, l + 6, l, matches);
181 }
182 }
183 } else if (s[2] == 'o') {
184 if (s[3] == ' ') AddMatch(id + 17 * n, l + 4, l, matches);
185 }
186 } else if (s[1] == 'w') {
187 if (s[2] == 'i' && s[3] == 't' && s[4] == 'h' && s[5] == ' ') {
188 AddMatch(id + 35 * n, l + 6, l, matches);
189 }
190 }
191 } else if (s[0] == '"') {
192 AddMatch(id + 19 * n, l + 1, l, matches);
193 if (s[1] == '>') {
194 AddMatch(id + 21 * n, l + 2, l, matches);
195 }
196 } else if (s[0] == '.') {
197 AddMatch(id + 20 * n, l + 1, l, matches);
198 if (s[1] == ' ') {
199 AddMatch(id + 31 * n, l + 2, l, matches);
200 if (s[2] == 'T' && s[3] == 'h') {
201 if (s[4] == 'e') {
202 if (s[5] == ' ') AddMatch(id + 43 * n, l + 6, l, matches);
203 } else if (s[4] == 'i') {
204 if (s[5] == 's' && s[6] == ' ') {
205 AddMatch(id + 75 * n, l + 7, l, matches);
206 }
207 }
208 }
209 }
210 } else if (s[0] == ',') {
211 AddMatch(id + 76 * n, l + 1, l, matches);
212 if (s[1] == ' ') {
213 AddMatch(id + 14 * n, l + 2, l, matches);
214 }
215 } else if (s[0] == '\n') {
216 AddMatch(id + 22 * n, l + 1, l, matches);
217 if (s[1] == '\t') {
218 AddMatch(id + 50 * n, l + 2, l, matches);
219 }
220 } else if (s[0] == ']') {
221 AddMatch(id + 24 * n, l + 1, l, matches);
222 } else if (s[0] == '\'') {
223 AddMatch(id + 36 * n, l + 1, l, matches);
224 } else if (s[0] == ':') {
225 AddMatch(id + 51 * n, l + 1, l, matches);
226 } else if (s[0] == '(') {
227 AddMatch(id + 57 * n, l + 1, l, matches);
228 } else if (s[0] == '=') {
229 if (s[1] == '"') {
230 AddMatch(id + 70 * n, l + 2, l, matches);
231 } else if (s[1] == '\'') {
232 AddMatch(id + 86 * n, l + 2, l, matches);
233 }
234 } else if (s[0] == 'a') {
235 if (s[1] == 'l' && s[2] == ' ') {
236 AddMatch(id + 84 * n, l + 3, l, matches);
237 }
238 } else if (s[0] == 'e') {
239 if (s[1] == 'd') {
240 if (s[2] == ' ') AddMatch(id + 53 * n, l + 3, l, matches);
241 } else if (s[1] == 'r') {
242 if (s[2] == ' ') AddMatch(id + 82 * n, l + 3, l, matches);
243 } else if (s[1] == 's') {
244 if (s[2] == 't' && s[3] == ' ') {
245 AddMatch(id + 95 * n, l + 4, l, matches);
246 }
247 }
248 } else if (s[0] == 'f') {
249 if (s[1] == 'u' && s[2] == 'l' && s[3] == ' ') {
250 AddMatch(id + 90 * n, l + 4, l, matches);
251 }
252 } else if (s[0] == 'i') {
253 if (s[1] == 'v') {
254 if (s[2] == 'e' && s[3] == ' ') {
255 AddMatch(id + 92 * n, l + 4, l, matches);
256 }
257 } else if (s[1] == 'z') {
258 if (s[2] == 'e' && s[3] == ' ') {
259 AddMatch(id + 100 * n, l + 4, l, matches);
260 }
261 }
262 } else if (s[0] == 'l') {
263 if (s[1] == 'e') {
264 if (s[2] == 's' && s[3] == 's' && s[4] == ' ') {
265 AddMatch(id + 93 * n, l + 5, l, matches);
266 }
267 } else if (s[1] == 'y') {
268 if (s[2] == ' ') AddMatch(id + 61 * n, l + 3, l, matches);
269 }
270 } else if (s[0] == 'o') {
271 if (s[1] == 'u' && s[2] == 's' && s[3] == ' ') {
272 AddMatch(id + 106 * n, l + 4, l, matches);
273 }
274 }
275 } else {
276 /* Set is_all_caps=0 for kUppercaseFirst and
277 is_all_caps=1 otherwise (kUppercaseAll) transform. */
278 const BROTLI_BOOL is_all_caps =
279 TO_BROTLI_BOOL(w.transform != kUppercaseFirst);
280 const uint8_t* s;
281 if (!IsMatch(dictionary, w, data, max_length)) {
282 continue;
283 }
284 /* Transform "" + kUppercase{First,All} + "" */
285 AddMatch(id + (is_all_caps ? 44 : 9) * n, l, l, matches);
286 has_found_match = BROTLI_TRUE;
287 if (l + 1 >= max_length) {
288 continue;
289 }
290 /* Transforms "" + kUppercase{First,All} + <suffix> */
291 s = &data[l];
292 if (s[0] == ' ') {
293 AddMatch(id + (is_all_caps ? 68 : 4) * n, l + 1, l, matches);
294 } else if (s[0] == '"') {
295 AddMatch(id + (is_all_caps ? 87 : 66) * n, l + 1, l, matches);
296 if (s[1] == '>') {
297 AddMatch(id + (is_all_caps ? 97 : 69) * n, l + 2, l, matches);
298 }
299 } else if (s[0] == '.') {
300 AddMatch(id + (is_all_caps ? 101 : 79) * n, l + 1, l, matches);
301 if (s[1] == ' ') {
302 AddMatch(id + (is_all_caps ? 114 : 88) * n, l + 2, l, matches);
303 }
304 } else if (s[0] == ',') {
305 AddMatch(id + (is_all_caps ? 112 : 99) * n, l + 1, l, matches);
306 if (s[1] == ' ') {
307 AddMatch(id + (is_all_caps ? 107 : 58) * n, l + 2, l, matches);
308 }
309 } else if (s[0] == '\'') {
310 AddMatch(id + (is_all_caps ? 94 : 74) * n, l + 1, l, matches);
311 } else if (s[0] == '(') {
312 AddMatch(id + (is_all_caps ? 113 : 78) * n, l + 1, l, matches);
313 } else if (s[0] == '=') {
314 if (s[1] == '"') {
315 AddMatch(id + (is_all_caps ? 105 : 104) * n, l + 2, l, matches);
316 } else if (s[1] == '\'') {
317 AddMatch(id + (is_all_caps ? 116 : 108) * n, l + 2, l, matches);
318 }
319 }
320 }
321 }
322 }
323 /* Transforms with prefixes " " and "." */
324 if (max_length >= 5 && (data[0] == ' ' || data[0] == '.')) {
325 BROTLI_BOOL is_space = TO_BROTLI_BOOL(data[0] == ' ');
326 size_t offset = kStaticDictionaryBuckets[Hash(&data[1])];
327 BROTLI_BOOL end = !offset;
328 while (!end) {
329 DictWord w = kStaticDictionaryWords[offset++];
330 const size_t l = w.len & 0x1F;
331 const size_t n = (size_t)1 << dictionary->size_bits_by_length[l];
332 const size_t id = w.idx;
333 end = !!(w.len & 0x80);
334 w.len = (uint8_t)l;
335 if (w.transform == 0) {
336 const uint8_t* s;
337 if (!IsMatch(dictionary, w, &data[1], max_length - 1)) {
338 continue;
339 }
340 /* Transforms " " + kIdentity + "" and "." + kIdentity + "" */
341 AddMatch(id + (is_space ? 6 : 32) * n, l + 1, l, matches);
342 has_found_match = BROTLI_TRUE;
343 if (l + 2 >= max_length) {
344 continue;
345 }
346 /* Transforms " " + kIdentity + <suffix> and "." + kIdentity + <suffix>
347 */
348 s = &data[l + 1];
349 if (s[0] == ' ') {
350 AddMatch(id + (is_space ? 2 : 77) * n, l + 2, l, matches);
351 } else if (s[0] == '(') {
352 AddMatch(id + (is_space ? 89 : 67) * n, l + 2, l, matches);
353 } else if (is_space) {
354 if (s[0] == ',') {
355 AddMatch(id + 103 * n, l + 2, l, matches);
356 if (s[1] == ' ') {
357 AddMatch(id + 33 * n, l + 3, l, matches);
358 }
359 } else if (s[0] == '.') {
360 AddMatch(id + 71 * n, l + 2, l, matches);
361 if (s[1] == ' ') {
362 AddMatch(id + 52 * n, l + 3, l, matches);
363 }
364 } else if (s[0] == '=') {
365 if (s[1] == '"') {
366 AddMatch(id + 81 * n, l + 3, l, matches);
367 } else if (s[1] == '\'') {
368 AddMatch(id + 98 * n, l + 3, l, matches);
369 }
370 }
371 }
372 } else if (is_space) {
373 /* Set is_all_caps=0 for kUppercaseFirst and
374 is_all_caps=1 otherwise (kUppercaseAll) transform. */
375 const BROTLI_BOOL is_all_caps =
376 TO_BROTLI_BOOL(w.transform != kUppercaseFirst);
377 const uint8_t* s;
378 if (!IsMatch(dictionary, w, &data[1], max_length - 1)) {
379 continue;
380 }
381 /* Transforms " " + kUppercase{First,All} + "" */
382 AddMatch(id + (is_all_caps ? 85 : 30) * n, l + 1, l, matches);
383 has_found_match = BROTLI_TRUE;
384 if (l + 2 >= max_length) {
385 continue;
386 }
387 /* Transforms " " + kUppercase{First,All} + <suffix> */
388 s = &data[l + 1];
389 if (s[0] == ' ') {
390 AddMatch(id + (is_all_caps ? 83 : 15) * n, l + 2, l, matches);
391 } else if (s[0] == ',') {
392 if (!is_all_caps) {
393 AddMatch(id + 109 * n, l + 2, l, matches);
394 }
395 if (s[1] == ' ') {
396 AddMatch(id + (is_all_caps ? 111 : 65) * n, l + 3, l, matches);
397 }
398 } else if (s[0] == '.') {
399 AddMatch(id + (is_all_caps ? 115 : 96) * n, l + 2, l, matches);
400 if (s[1] == ' ') {
401 AddMatch(id + (is_all_caps ? 117 : 91) * n, l + 3, l, matches);
402 }
403 } else if (s[0] == '=') {
404 if (s[1] == '"') {
405 AddMatch(id + (is_all_caps ? 110 : 118) * n, l + 3, l, matches);
406 } else if (s[1] == '\'') {
407 AddMatch(id + (is_all_caps ? 119 : 120) * n, l + 3, l, matches);
408 }
409 }
410 }
411 }
412 }
413 if (max_length >= 6) {
414 /* Transforms with prefixes "e ", "s ", ", " and "\xc2\xa0" */
415 if ((data[1] == ' ' &&
416 (data[0] == 'e' || data[0] == 's' || data[0] == ',')) ||
417 (data[0] == 0xc2 && data[1] == 0xa0)) {
418 size_t offset = kStaticDictionaryBuckets[Hash(&data[2])];
419 BROTLI_BOOL end = !offset;
420 while (!end) {
421 DictWord w = kStaticDictionaryWords[offset++];
422 const size_t l = w.len & 0x1F;
423 const size_t n = (size_t)1 << dictionary->size_bits_by_length[l];
424 const size_t id = w.idx;
425 end = !!(w.len & 0x80);
426 w.len = (uint8_t)l;
427 if (w.transform == 0 &&
428 IsMatch(dictionary, w, &data[2], max_length - 2)) {
429 if (data[0] == 0xc2) {
430 AddMatch(id + 102 * n, l + 2, l, matches);
431 has_found_match = BROTLI_TRUE;
432 } else if (l + 2 < max_length && data[l + 2] == ' ') {
433 size_t t = data[0] == 'e' ? 18 : (data[0] == 's' ? 7 : 13);
434 AddMatch(id + t * n, l + 3, l, matches);
435 has_found_match = BROTLI_TRUE;
436 }
437 }
438 }
439 }
440 }
441 if (max_length >= 9) {
442 /* Transforms with prefixes " the " and ".com/" */
443 if ((data[0] == ' ' && data[1] == 't' && data[2] == 'h' &&
444 data[3] == 'e' && data[4] == ' ') ||
445 (data[0] == '.' && data[1] == 'c' && data[2] == 'o' &&
446 data[3] == 'm' && data[4] == '/')) {
447 size_t offset = kStaticDictionaryBuckets[Hash(&data[5])];
448 BROTLI_BOOL end = !offset;
449 while (!end) {
450 DictWord w = kStaticDictionaryWords[offset++];
451 const size_t l = w.len & 0x1F;
452 const size_t n = (size_t)1 << dictionary->size_bits_by_length[l];
453 const size_t id = w.idx;
454 end = !!(w.len & 0x80);
455 w.len = (uint8_t)l;
456 if (w.transform == 0 &&
457 IsMatch(dictionary, w, &data[5], max_length - 5)) {
458 AddMatch(id + (data[0] == ' ' ? 41 : 72) * n, l + 5, l, matches);
459 has_found_match = BROTLI_TRUE;
460 if (l + 5 < max_length) {
461 const uint8_t* s = &data[l + 5];
462 if (data[0] == ' ') {
463 if (l + 8 < max_length &&
464 s[0] == ' ' && s[1] == 'o' && s[2] == 'f' && s[3] == ' ') {
465 AddMatch(id + 62 * n, l + 9, l, matches);
466 if (l + 12 < max_length &&
467 s[4] == 't' && s[5] == 'h' && s[6] == 'e' && s[7] == ' ') {
468 AddMatch(id + 73 * n, l + 13, l, matches);
469 }
470 }
471 }
472 }
473 }
474 }
475 }
476 }
477 return has_found_match;
478 }
479
480 #if defined(__cplusplus) || defined(c_plusplus)
481 } /* extern "C" */
482 #endif
483