1 /* Copyright 2013 Google Inc. All Rights Reserved.
2
3 Distributed under MIT license.
4 See file LICENSE for detail or copy at https://opensource.org/licenses/MIT
5 */
6
7 /* Heuristics for deciding about the UTF8-ness of strings. */
8
9 #include "./utf8_util.h"
10
11 #include <brotli/types.h>
12
13 #if defined(__cplusplus) || defined(c_plusplus)
14 extern "C" {
15 #endif
16
/* Decodes one UTF-8 sequence located at the start of |input|.

   |size| is the number of bytes available for multi-byte sequences. Note that
   input[0] is read regardless of |size|, so at least one byte must be
   readable.

   On success the decoded code point is stored in |*symbol| and the length of
   the sequence (1..4) is returned. The following are NOT accepted:
     - the NUL byte;
     - sequences that do not fit into |size| bytes;
     - sequences with malformed continuation bytes;
     - overlong encodings;
     - values above 0x10FFFF.
   Code points in the surrogate range 0xD800..0xDFFF are not filtered out.

   When the input is not accepted, |*symbol| is set to 0x110000 | input[0],
   i.e. a value above the UTF-8 code space, and 1 is returned. */
static size_t BrotliParseAsUTF8(
    int* symbol, const uint8_t* input, size_t size) {
  const uint8_t lead = input[0];
  size_t seq_len = 0;  /* 0 means "lead byte does not start a sequence". */
  int code = 0;
  int min_code = 0;    /* Smallest value that is not an overlong encoding. */

  if ((lead & 0x80) == 0) {
    /* ASCII; zero byte is intentionally treated as non-UTF8. */
    if (lead != 0) {
      *symbol = lead;
      return 1;
    }
  } else if ((lead & 0xe0) == 0xc0) {
    /* 2-byte UTF8 */
    seq_len = 2;
    code = lead & 0x1f;
    min_code = 0x80;
  } else if ((lead & 0xf0) == 0xe0) {
    /* 3-byte UTF8 */
    seq_len = 3;
    code = lead & 0x0f;
    min_code = 0x800;
  } else if ((lead & 0xf8) == 0xf0) {
    /* 4-byte UTF8 */
    seq_len = 4;
    code = lead & 0x07;
    min_code = 0x10000;
  }

  if (seq_len != 0 && size >= seq_len) {
    size_t k;
    int well_formed = 1;
    /* Every trailing byte must look like 10xxxxxx and adds 6 payload bits. */
    for (k = 1; k < seq_len; ++k) {
      if ((input[k] & 0xc0) != 0x80) {
        well_formed = 0;
        break;
      }
      code = (code << 6) | (input[k] & 0x3f);
    }
    if (well_formed && code >= min_code && code <= 0x10ffff) {
      *symbol = code;
      return seq_len;
    }
  }

  /* Not UTF8, emit a special symbol above the UTF8-code space */
  *symbol = 0x110000 | lead;
  return 1;
}
66
67 /* Returns 1 if at least min_fraction of the data is UTF8-encoded.*/
BrotliIsMostlyUTF8(const uint8_t * data,const size_t pos,const size_t mask,const size_t length,const double min_fraction)68 BROTLI_BOOL BrotliIsMostlyUTF8(
69 const uint8_t* data, const size_t pos, const size_t mask,
70 const size_t length, const double min_fraction) {
71 size_t size_utf8 = 0;
72 size_t i = 0;
73 while (i < length) {
74 int symbol;
75 size_t bytes_read =
76 BrotliParseAsUTF8(&symbol, &data[(pos + i) & mask], length - i);
77 i += bytes_read;
78 if (symbol < 0x110000) size_utf8 += bytes_read;
79 }
80 return TO_BROTLI_BOOL(size_utf8 > min_fraction * (double)length);
81 }
82
83 #if defined(__cplusplus) || defined(c_plusplus)
84 } /* extern "C" */
85 #endif
86