• Home
  • Line#
  • Scopes#
  • Navigate#
  • Raw
  • Download
1 /* Copyright 2015-2016 The Chromium Authors. All rights reserved.
2  * Use of this source code is governed by a BSD-style license that can be
3  * found in the LICENSE.chromium file.
4  *
5  * Converted to C89 2015 by Tim Rühsen
6  */
7 
8 #include <stddef.h>
9 
10 #if defined(__GNUC__) && defined(__GNUC_MINOR__)
11 #       define _GCC_VERSION_AT_LEAST(major, minor) ((__GNUC__ > (major)) || (__GNUC__ == (major) && __GNUC_MINOR__ >= (minor)))
12 #else
13 #       define _GCC_VERSION_AT_LEAST(major, minor) 0
14 #endif
15 
16 #define CHECK_LT(a, b) if ((a) >= b) return 0
17 
18 static const char multibyte_length_table[16] = {
19 	0, 0, 0, 0,	 /* 0x00-0x3F */
20 	0, 0, 0, 0,	 /* 0x40-0x7F */
21 	0, 0, 0, 0,	 /* 0x80-0xBF */
22 	2, 2, 3, 4,	 /* 0xC0-0xFF */
23 };
24 
25 
26 /*
27  * Get length of multibyte character sequence starting at a given byte.
28  * Returns zero if the byte is not a valid leading byte in UTF-8.
29  */
GetMultibyteLength(char c)30 static int GetMultibyteLength(char c) {
31 	return multibyte_length_table[((unsigned char)c) >> 4];
32 }
33 
34 /*
35  * Moves pointers one byte forward.
36  */
NextPos(const unsigned char ** pos,const char ** key,const char ** multibyte_start)37 static void NextPos(const unsigned char** pos,
38 	const char** key,
39 	const char** multibyte_start)
40 {
41 	++*pos;
42 	if (*multibyte_start) {
43 		/* Advance key to next byte in multibyte sequence. */
44 		++*key;
45 		/* Reset multibyte_start if last byte in multibyte sequence was consumed. */
46 		if (*key - *multibyte_start == GetMultibyteLength(**multibyte_start))
47 			*multibyte_start = 0;
48 	} else {
49 		if (GetMultibyteLength(**key)) {
50 			/* Multibyte prefix was matched in the dafsa, start matching multibyte
51 			 * content in next round. */
52 			*multibyte_start = *key;
53 		} else {
54 			/* Advance key as a single byte character was matched. */
55 			++*key;
56 		}
57 	}
58 }
59 
60 /*
61  * Read next offset from pos.
62  * Returns true if an offset could be read, false otherwise.
63  */
64 
GetNextOffset(const unsigned char ** pos,const unsigned char * end,const unsigned char ** offset)65 static int GetNextOffset(const unsigned char** pos,
66 	const unsigned char* end,
67 	const unsigned char** offset)
68 {
69 	size_t bytes_consumed;
70 
71 	if (*pos == end)
72 		return 0;
73 
74 	/* When reading an offset the byte array must always contain at least
75 	 * three more bytes to consume. First the offset to read, then a node
76 	 * to skip over and finally a destination node. No object can be smaller
77 	 * than one byte. */
78 	CHECK_LT(*pos + 2, end);
79 	switch (**pos & 0x60) {
80 	case 0x60: /* Read three byte offset */
81 		*offset += (((*pos)[0] & 0x1F) << 16) | ((*pos)[1] << 8) | (*pos)[2];
82 		bytes_consumed = 3;
83 		break;
84 	case 0x40: /* Read two byte offset */
85 		*offset += (((*pos)[0] & 0x1F) << 8) | (*pos)[1];
86 		bytes_consumed = 2;
87 		break;
88 	default:
89 		*offset += (*pos)[0] & 0x3F;
90 		bytes_consumed = 1;
91 	}
92 	if ((**pos & 0x80) != 0) {
93 		*pos = end;
94 	} else {
95 		*pos += bytes_consumed;
96 	}
97 	return 1;
98 }
99 
100 /*
101  * Check if byte at offset is last in label.
102  */
103 
IsEOL(const unsigned char * offset,const unsigned char * end)104 static int IsEOL(const unsigned char* offset, const unsigned char* end)
105 {
106 	CHECK_LT(offset, end);
107 	return(*offset & 0x80) != 0;
108 }
109 
110 /*
111  * Check if byte at offset matches first character in key.
112  * This version assumes a range check was already performed by the caller.
113  */
114 
IsMatchUnchecked(const unsigned char matcher,const char * key,const char * multibyte_start)115 static int IsMatchUnchecked(const unsigned char matcher,
116 	const char* key,
117 	const char* multibyte_start)
118 {
119 	if (multibyte_start) {
120 		/* Multibyte matching mode. */
121 		if (multibyte_start == key) {
122 			/* Match leading byte, which will also match the sequence length. */
123 			return (matcher ^ 0x80) == (const unsigned char)*key;
124 		} else {
125 			/* Match following bytes. */
126 			return (matcher ^ 0xC0) == (const unsigned char)*key;
127 		}
128 	}
129 	/* If key points at a leading byte in a multibyte sequence, but we are not yet
130 	 * in multibyte mode, then the dafsa should contain a special byte to indicate
131 	 * a mode switch. */
132 	if (GetMultibyteLength(*key)) {
133 		return matcher == 0x1F;
134 	}
135 	/* Normal matching of a single byte character. */
136 	return matcher == (const unsigned char)*key;
137 }
138 
139 /*
140  * Check if byte at offset matches first character in key.
141  * This version matches characters not last in label.
142  */
143 
IsMatch(const unsigned char * offset,const unsigned char * end,const char * key,const char * multibyte_start)144 static int IsMatch(const unsigned char* offset,
145 	const unsigned char* end,
146 	const char* key,
147 	const char* multibyte_start)
148 {
149 	CHECK_LT(offset, end);
150 	return IsMatchUnchecked(*offset, key, multibyte_start);
151 }
152 
153 /*
154  * Check if byte at offset matches first character in key.
155  * This version matches characters last in label.
156  */
157 
IsEndCharMatch(const unsigned char * offset,const unsigned char * end,const char * key,const char * multibyte_start)158 static int IsEndCharMatch(const unsigned char* offset,
159 	const unsigned char* end,
160 	const char* key,
161 	const char* multibyte_start)
162 {
163 	CHECK_LT(offset, end);
164 	return IsMatchUnchecked(*offset ^ 0x80, key, multibyte_start);
165 }
166 
167 /*
168  * Read return value at offset.
169  * Returns true if a return value could be read, false otherwise.
170  */
171 
GetReturnValue(const unsigned char * offset,const unsigned char * end,const char * multibyte_start,int * return_value)172 static int GetReturnValue(const unsigned char* offset,
173 	const unsigned char* end,
174 	const char* multibyte_start,
175 	int* return_value)
176 {
177 	CHECK_LT(offset, end);
178 	if (!multibyte_start && (*offset & 0xE0) == 0x80) {
179 		*return_value = *offset & 0x0F;
180 		return 1;
181 	}
182 	return 0;
183 }
184 
185 /*
186  *  Looks up the string |key| with length |key_length| in a fixed set of
187  * strings. The set of strings must be known at compile time. It is converted to
188  * a graph structure named a DAFSA (Deterministic Acyclic Finite State
189  * Automaton) by the script psl-make-dafsa during compilation. This permits
190  * efficient (in time and space) lookup. The graph generated by psl-make-dafsa
191  * takes the form of a constant byte array which should be supplied via the
192  * |graph| and |length| parameters.  The return value is kDafsaNotFound,
193  * kDafsaFound, or a bitmap consisting of one or more of kDafsaExceptionRule,
194  * kDafsaWildcardRule and kDafsaPrivateRule ORed together.
195  *
196  * Lookup a domain key in a byte array generated by psl-make-dafsa.
197  */
198 
199 /* prototype to skip warning with -Wmissing-prototypes */
200 int LookupStringInFixedSet(const unsigned char*, size_t,const char*, size_t);
201 
LookupStringInFixedSet(const unsigned char * graph,size_t length,const char * key,size_t key_length)202 int LookupStringInFixedSet(const unsigned char* graph,
203 	size_t length,
204 	const char* key,
205 	size_t key_length)
206 {
207 	const unsigned char* pos = graph;
208 	const unsigned char* end = graph + length;
209 	const unsigned char* offset = pos;
210 	const char* key_end = key + key_length;
211 	const char* multibyte_start = 0;
212 
213 	while (GetNextOffset(&pos, end, &offset)) {
214 		/*char <char>+ end_char offsets
215 		 * char <char>+ return value
216 		 * char end_char offsets
217 		 * char return value
218 		 * end_char offsets
219 		 * return_value
220 		 */
221 		int did_consume = 0;
222 
223 		if (key != key_end && !IsEOL(offset, end)) {
224 			/* Leading <char> is not a match. Don't dive into this child */
225 			if (!IsMatch(offset, end, key, multibyte_start))
226 				continue;
227 			did_consume = 1;
228 			NextPos(&offset, &key, &multibyte_start);
229 			/* Possible matches at this point:
230 			 * <char>+ end_char offsets
231 			 * <char>+ return value
232 			 * end_char offsets
233 			 * return value
234 			 */
235 
236 			/* Remove all remaining <char> nodes possible */
237 			while (!IsEOL(offset, end) && key != key_end) {
238 				if (!IsMatch(offset, end, key, multibyte_start))
239 					return -1;
240 				NextPos(&offset, &key, &multibyte_start);
241 			}
242 		}
243 		/* Possible matches at this point:
244 		 * end_char offsets
245 		 * return_value
246 		 * If one or more <char> elements were consumed, a failure
247 		 * to match is terminal. Otherwise, try the next node.
248 		 */
249 		if (key == key_end) {
250 			int return_value;
251 
252 			if (GetReturnValue(offset, end, multibyte_start, &return_value))
253 				return return_value;
254 			/* The DAFSA guarantees that if the first char is a match, all
255 			 * remaining char elements MUST match if the key is truly present.
256 			 */
257 			if (did_consume)
258 				return -1;
259 			continue;
260 		}
261 		if (!IsEndCharMatch(offset, end, key, multibyte_start)) {
262 			if (did_consume)
263 				return -1; /* Unexpected */
264 			continue;
265 		}
266 		NextPos(&offset, &key, &multibyte_start);
267 		pos = offset; /* Dive into child */
268 	}
269 
270 	return -1; /* No match */
271 }
272 
273 /* prototype to skip warning with -Wmissing-prototypes */
274 int GetUtfMode(const unsigned char *graph, size_t length);
275 
GetUtfMode(const unsigned char * graph,size_t length)276 int GetUtfMode(const unsigned char *graph, size_t length)
277 {
278 	return length > 0 && graph[length - 1] < 0x80;
279 }
280