1 /* Copyright 2015-2016 The Chromium Authors. All rights reserved.
2 * Use of this source code is governed by a BSD-style license that can be
3 * found in the LICENSE.chromium file.
4 *
5 * Converted to C89 2015 by Tim Rühsen
6 */
7
8 #include <stddef.h>
9
10 #if defined(__GNUC__) && defined(__GNUC_MINOR__)
11 # define _GCC_VERSION_AT_LEAST(major, minor) ((__GNUC__ > (major)) || (__GNUC__ == (major) && __GNUC_MINOR__ >= (minor)))
12 #else
13 # define _GCC_VERSION_AT_LEAST(major, minor) 0
14 #endif
15
16 #define CHECK_LT(a, b) if ((a) >= b) return 0
17
18 static const char multibyte_length_table[16] = {
19 0, 0, 0, 0, /* 0x00-0x3F */
20 0, 0, 0, 0, /* 0x40-0x7F */
21 0, 0, 0, 0, /* 0x80-0xBF */
22 2, 2, 3, 4, /* 0xC0-0xFF */
23 };
24
25
26 /*
27 * Get length of multibyte character sequence starting at a given byte.
28 * Returns zero if the byte is not a valid leading byte in UTF-8.
29 */
GetMultibyteLength(char c)30 static int GetMultibyteLength(char c) {
31 return multibyte_length_table[((unsigned char)c) >> 4];
32 }
33
34 /*
35 * Moves pointers one byte forward.
36 */
NextPos(const unsigned char ** pos,const char ** key,const char ** multibyte_start)37 static void NextPos(const unsigned char** pos,
38 const char** key,
39 const char** multibyte_start)
40 {
41 ++*pos;
42 if (*multibyte_start) {
43 /* Advance key to next byte in multibyte sequence. */
44 ++*key;
45 /* Reset multibyte_start if last byte in multibyte sequence was consumed. */
46 if (*key - *multibyte_start == GetMultibyteLength(**multibyte_start))
47 *multibyte_start = 0;
48 } else {
49 if (GetMultibyteLength(**key)) {
50 /* Multibyte prefix was matched in the dafsa, start matching multibyte
51 * content in next round. */
52 *multibyte_start = *key;
53 } else {
54 /* Advance key as a single byte character was matched. */
55 ++*key;
56 }
57 }
58 }
59
60 /*
61 * Read next offset from pos.
62 * Returns true if an offset could be read, false otherwise.
63 */
64
GetNextOffset(const unsigned char ** pos,const unsigned char * end,const unsigned char ** offset)65 static int GetNextOffset(const unsigned char** pos,
66 const unsigned char* end,
67 const unsigned char** offset)
68 {
69 size_t bytes_consumed;
70
71 if (*pos == end)
72 return 0;
73
74 /* When reading an offset the byte array must always contain at least
75 * three more bytes to consume. First the offset to read, then a node
76 * to skip over and finally a destination node. No object can be smaller
77 * than one byte. */
78 CHECK_LT(*pos + 2, end);
79 switch (**pos & 0x60) {
80 case 0x60: /* Read three byte offset */
81 *offset += (((*pos)[0] & 0x1F) << 16) | ((*pos)[1] << 8) | (*pos)[2];
82 bytes_consumed = 3;
83 break;
84 case 0x40: /* Read two byte offset */
85 *offset += (((*pos)[0] & 0x1F) << 8) | (*pos)[1];
86 bytes_consumed = 2;
87 break;
88 default:
89 *offset += (*pos)[0] & 0x3F;
90 bytes_consumed = 1;
91 }
92 if ((**pos & 0x80) != 0) {
93 *pos = end;
94 } else {
95 *pos += bytes_consumed;
96 }
97 return 1;
98 }
99
100 /*
101 * Check if byte at offset is last in label.
102 */
103
IsEOL(const unsigned char * offset,const unsigned char * end)104 static int IsEOL(const unsigned char* offset, const unsigned char* end)
105 {
106 CHECK_LT(offset, end);
107 return(*offset & 0x80) != 0;
108 }
109
110 /*
111 * Check if byte at offset matches first character in key.
112 * This version assumes a range check was already performed by the caller.
113 */
114
IsMatchUnchecked(const unsigned char matcher,const char * key,const char * multibyte_start)115 static int IsMatchUnchecked(const unsigned char matcher,
116 const char* key,
117 const char* multibyte_start)
118 {
119 if (multibyte_start) {
120 /* Multibyte matching mode. */
121 if (multibyte_start == key) {
122 /* Match leading byte, which will also match the sequence length. */
123 return (matcher ^ 0x80) == (const unsigned char)*key;
124 } else {
125 /* Match following bytes. */
126 return (matcher ^ 0xC0) == (const unsigned char)*key;
127 }
128 }
129 /* If key points at a leading byte in a multibyte sequence, but we are not yet
130 * in multibyte mode, then the dafsa should contain a special byte to indicate
131 * a mode switch. */
132 if (GetMultibyteLength(*key)) {
133 return matcher == 0x1F;
134 }
135 /* Normal matching of a single byte character. */
136 return matcher == (const unsigned char)*key;
137 }
138
139 /*
140 * Check if byte at offset matches first character in key.
141 * This version matches characters not last in label.
142 */
143
IsMatch(const unsigned char * offset,const unsigned char * end,const char * key,const char * multibyte_start)144 static int IsMatch(const unsigned char* offset,
145 const unsigned char* end,
146 const char* key,
147 const char* multibyte_start)
148 {
149 CHECK_LT(offset, end);
150 return IsMatchUnchecked(*offset, key, multibyte_start);
151 }
152
153 /*
154 * Check if byte at offset matches first character in key.
155 * This version matches characters last in label.
156 */
157
IsEndCharMatch(const unsigned char * offset,const unsigned char * end,const char * key,const char * multibyte_start)158 static int IsEndCharMatch(const unsigned char* offset,
159 const unsigned char* end,
160 const char* key,
161 const char* multibyte_start)
162 {
163 CHECK_LT(offset, end);
164 return IsMatchUnchecked(*offset ^ 0x80, key, multibyte_start);
165 }
166
167 /*
168 * Read return value at offset.
169 * Returns true if a return value could be read, false otherwise.
170 */
171
GetReturnValue(const unsigned char * offset,const unsigned char * end,const char * multibyte_start,int * return_value)172 static int GetReturnValue(const unsigned char* offset,
173 const unsigned char* end,
174 const char* multibyte_start,
175 int* return_value)
176 {
177 CHECK_LT(offset, end);
178 if (!multibyte_start && (*offset & 0xE0) == 0x80) {
179 *return_value = *offset & 0x0F;
180 return 1;
181 }
182 return 0;
183 }
184
185 /*
186 * Looks up the string |key| with length |key_length| in a fixed set of
187 * strings. The set of strings must be known at compile time. It is converted to
188 * a graph structure named a DAFSA (Deterministic Acyclic Finite State
189 * Automaton) by the script psl-make-dafsa during compilation. This permits
190 * efficient (in time and space) lookup. The graph generated by psl-make-dafsa
191 * takes the form of a constant byte array which should be supplied via the
192 * |graph| and |length| parameters. The return value is kDafsaNotFound,
193 * kDafsaFound, or a bitmap consisting of one or more of kDafsaExceptionRule,
194 * kDafsaWildcardRule and kDafsaPrivateRule ORed together.
195 *
196 * Lookup a domain key in a byte array generated by psl-make-dafsa.
197 */
198
199 /* prototype to skip warning with -Wmissing-prototypes */
200 int LookupStringInFixedSet(const unsigned char*, size_t,const char*, size_t);
201
LookupStringInFixedSet(const unsigned char * graph,size_t length,const char * key,size_t key_length)202 int LookupStringInFixedSet(const unsigned char* graph,
203 size_t length,
204 const char* key,
205 size_t key_length)
206 {
207 const unsigned char* pos = graph;
208 const unsigned char* end = graph + length;
209 const unsigned char* offset = pos;
210 const char* key_end = key + key_length;
211 const char* multibyte_start = 0;
212
213 while (GetNextOffset(&pos, end, &offset)) {
214 /*char <char>+ end_char offsets
215 * char <char>+ return value
216 * char end_char offsets
217 * char return value
218 * end_char offsets
219 * return_value
220 */
221 int did_consume = 0;
222
223 if (key != key_end && !IsEOL(offset, end)) {
224 /* Leading <char> is not a match. Don't dive into this child */
225 if (!IsMatch(offset, end, key, multibyte_start))
226 continue;
227 did_consume = 1;
228 NextPos(&offset, &key, &multibyte_start);
229 /* Possible matches at this point:
230 * <char>+ end_char offsets
231 * <char>+ return value
232 * end_char offsets
233 * return value
234 */
235
236 /* Remove all remaining <char> nodes possible */
237 while (!IsEOL(offset, end) && key != key_end) {
238 if (!IsMatch(offset, end, key, multibyte_start))
239 return -1;
240 NextPos(&offset, &key, &multibyte_start);
241 }
242 }
243 /* Possible matches at this point:
244 * end_char offsets
245 * return_value
246 * If one or more <char> elements were consumed, a failure
247 * to match is terminal. Otherwise, try the next node.
248 */
249 if (key == key_end) {
250 int return_value;
251
252 if (GetReturnValue(offset, end, multibyte_start, &return_value))
253 return return_value;
254 /* The DAFSA guarantees that if the first char is a match, all
255 * remaining char elements MUST match if the key is truly present.
256 */
257 if (did_consume)
258 return -1;
259 continue;
260 }
261 if (!IsEndCharMatch(offset, end, key, multibyte_start)) {
262 if (did_consume)
263 return -1; /* Unexpected */
264 continue;
265 }
266 NextPos(&offset, &key, &multibyte_start);
267 pos = offset; /* Dive into child */
268 }
269
270 return -1; /* No match */
271 }
272
273 /* prototype to skip warning with -Wmissing-prototypes */
274 int GetUtfMode(const unsigned char *graph, size_t length);
275
GetUtfMode(const unsigned char * graph,size_t length)276 int GetUtfMode(const unsigned char *graph, size_t length)
277 {
278 return length > 0 && graph[length - 1] < 0x80;
279 }
280