1 /* GLIB - Library of useful routines for C programming
2 * Copyright (C) 2001 Matthias Clasen <matthiasc@poet.de>
3 *
4 * This library is free software; you can redistribute it and/or
5 * modify it under the terms of the GNU Lesser General Public
6 * License as published by the Free Software Foundation; either
7 * version 2 of the License, or (at your option) any later version.
8 *
9 * This library is distributed in the hope that it will be useful,
10 * but WITHOUT ANY WARRANTY; without even the implied warranty of
11 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
12 * Lesser General Public License for more details.
13 *
14 * You should have received a copy of the GNU Lesser General Public
15 * License along with this library; if not, write to the
16 * Free Software Foundation, Inc., 59 Temple Place - Suite 330,
17 * Boston, MA 02111-1307, USA.
18 */
19
20 #include "glib.h"
21
/* Evaluates to non-zero when Char is below 0x110000, lies outside
 * 0xD800..0xDFFF, lies outside 0xFDD0..0xFDEF, and has low 16 bits
 * that are neither 0xFFFE nor 0xFFFF.
 *
 * Char is expanded several times, so the argument must not have side
 * effects.
 */
#define UNICODE_VALID(Char)                       \
  ((Char) <= 0x10FFFF &&                          \
   ((Char) & 0xFFFFF800) != 0xD800 &&             \
   !((Char) >= 0xFDD0 && (Char) <= 0xFDEF) &&     \
   ((Char) & 0xFFFE) != 0xFFFE)
27
28
29
/* Set to TRUE by do_test() when a case fails; main() derives the exit status from it. */
static gboolean any_failed = FALSE;
31
32 struct {
33 const gchar *text;
34 gint max_len;
35 gint offset;
36 gboolean valid;
37 } test[] = {
38 /* some tests to check max_len handling */
39 /* length 1 */
40 { "abcde", -1, 5, TRUE },
41 { "abcde", 3, 3, TRUE },
42 { "abcde", 5, 5, TRUE },
43 { "abcde", 7, 5, FALSE },
44 /* length 2 */
45 { "\xc2\xa9\xc2\xa9\xc2\xa9", -1, 6, TRUE },
46 { "\xc2\xa9\xc2\xa9\xc2\xa9", 1, 0, FALSE },
47 { "\xc2\xa9\xc2\xa9\xc2\xa9", 2, 2, TRUE },
48 { "\xc2\xa9\xc2\xa9\xc2\xa9", 3, 2, FALSE },
49 { "\xc2\xa9\xc2\xa9\xc2\xa9", 4, 4, TRUE },
50 { "\xc2\xa9\xc2\xa9\xc2\xa9", 5, 4, FALSE },
51 { "\xc2\xa9\xc2\xa9\xc2\xa9", 6, 6, TRUE },
52 { "\xc2\xa9\xc2\xa9\xc2\xa9", 7, 6, FALSE },
53 /* length 3 */
54 { "\xe2\x89\xa0\xe2\x89\xa0", -1, 6, TRUE },
55 { "\xe2\x89\xa0\xe2\x89\xa0", 1, 0, FALSE },
56 { "\xe2\x89\xa0\xe2\x89\xa0", 2, 0, FALSE },
57 { "\xe2\x89\xa0\xe2\x89\xa0", 3, 3, TRUE },
58 { "\xe2\x89\xa0\xe2\x89\xa0", 4, 3, FALSE },
59 { "\xe2\x89\xa0\xe2\x89\xa0", 5, 3, FALSE },
60 { "\xe2\x89\xa0\xe2\x89\xa0", 6, 6, TRUE },
61 { "\xe2\x89\xa0\xe2\x89\xa0", 7, 6, FALSE },
62
63 /* examples from http://www.cl.cam.ac.uk/~mgk25/ucs/examples/UTF-8-test.txt */
64 /* greek 'kosme' */
65 { "\xce\xba\xe1\xbd\xb9\xcf\x83\xce\xbc\xce\xb5", -1, 11, TRUE },
66 /* first sequence of each length */
67 { "\x00", -1, 0, TRUE },
68 { "\xc2\x80", -1, 2, TRUE },
69 { "\xe0\xa0\x80", -1, 3, TRUE },
70 { "\xf0\x90\x80\x80", -1, 4, TRUE },
71 { "\xf8\x88\x80\x80\x80", -1, 0, FALSE },
72 { "\xfc\x84\x80\x80\x80\x80", -1, 0, FALSE },
73 /* last sequence of each length */
74 { "\x7f", -1, 1, TRUE },
75 { "\xdf\xbf", -1, 2, TRUE },
76 { "\xef\xbf\xbf", -1, 0, FALSE },
77 { "\xf7\xbf\xbf\xbf", -1, 0, FALSE },
78 { "\xfb\xbf\xbf\xbf\xbf", -1, 0, FALSE },
79 { "\xfd\xbf\xbf\xbf\xbf\xbf", -1, 0, FALSE },
80 /* other boundary conditions */
81 { "\xed\x9f\xbf", -1, 3, TRUE },
82 { "\xee\x80\x80", -1, 3, TRUE },
83 { "\xef\xbf\xbd", -1, 3, TRUE },
84 { "\xf4\x8f\xbf\xbf", -1, 0, FALSE },
85 { "\xf4\x90\x80\x80", -1, 0, FALSE },
86 /* malformed sequences */
87 /* continuation bytes */
88 { "\x80", -1, 0, FALSE },
89 { "\xbf", -1, 0, FALSE },
90 { "\x80\xbf", -1, 0, FALSE },
91 { "\x80\xbf\x80", -1, 0, FALSE },
92 { "\x80\xbf\x80\xbf", -1, 0, FALSE },
93 { "\x80\xbf\x80\xbf\x80", -1, 0, FALSE },
94 { "\x80\xbf\x80\xbf\x80\xbf", -1, 0, FALSE },
95 { "\x80\xbf\x80\xbf\x80\xbf\x80", -1, 0, FALSE },
96
97 /* all possible continuation byte */
98 { "\x80", -1, 0, FALSE },
99 { "\x81", -1, 0, FALSE },
100 { "\x82", -1, 0, FALSE },
101 { "\x83", -1, 0, FALSE },
102 { "\x84", -1, 0, FALSE },
103 { "\x85", -1, 0, FALSE },
104 { "\x86", -1, 0, FALSE },
105 { "\x87", -1, 0, FALSE },
106 { "\x88", -1, 0, FALSE },
107 { "\x89", -1, 0, FALSE },
108 { "\x8a", -1, 0, FALSE },
109 { "\x8b", -1, 0, FALSE },
110 { "\x8c", -1, 0, FALSE },
111 { "\x8d", -1, 0, FALSE },
112 { "\x8e", -1, 0, FALSE },
113 { "\x8f", -1, 0, FALSE },
114 { "\x90", -1, 0, FALSE },
115 { "\x91", -1, 0, FALSE },
116 { "\x92", -1, 0, FALSE },
117 { "\x93", -1, 0, FALSE },
118 { "\x94", -1, 0, FALSE },
119 { "\x95", -1, 0, FALSE },
120 { "\x96", -1, 0, FALSE },
121 { "\x97", -1, 0, FALSE },
122 { "\x98", -1, 0, FALSE },
123 { "\x99", -1, 0, FALSE },
124 { "\x9a", -1, 0, FALSE },
125 { "\x9b", -1, 0, FALSE },
126 { "\x9c", -1, 0, FALSE },
127 { "\x9d", -1, 0, FALSE },
128 { "\x9e", -1, 0, FALSE },
129 { "\x9f", -1, 0, FALSE },
130 { "\xa0", -1, 0, FALSE },
131 { "\xa1", -1, 0, FALSE },
132 { "\xa2", -1, 0, FALSE },
133 { "\xa3", -1, 0, FALSE },
134 { "\xa4", -1, 0, FALSE },
135 { "\xa5", -1, 0, FALSE },
136 { "\xa6", -1, 0, FALSE },
137 { "\xa7", -1, 0, FALSE },
138 { "\xa8", -1, 0, FALSE },
139 { "\xa9", -1, 0, FALSE },
140 { "\xaa", -1, 0, FALSE },
141 { "\xab", -1, 0, FALSE },
142 { "\xac", -1, 0, FALSE },
143 { "\xad", -1, 0, FALSE },
144 { "\xae", -1, 0, FALSE },
145 { "\xaf", -1, 0, FALSE },
146 { "\xb0", -1, 0, FALSE },
147 { "\xb1", -1, 0, FALSE },
148 { "\xb2", -1, 0, FALSE },
149 { "\xb3", -1, 0, FALSE },
150 { "\xb4", -1, 0, FALSE },
151 { "\xb5", -1, 0, FALSE },
152 { "\xb6", -1, 0, FALSE },
153 { "\xb7", -1, 0, FALSE },
154 { "\xb8", -1, 0, FALSE },
155 { "\xb9", -1, 0, FALSE },
156 { "\xba", -1, 0, FALSE },
157 { "\xbb", -1, 0, FALSE },
158 { "\xbc", -1, 0, FALSE },
159 { "\xbd", -1, 0, FALSE },
160 { "\xbe", -1, 0, FALSE },
161 { "\xbf", -1, 0, FALSE },
162 /* lone start characters */
163 { "\xc0\x20", -1, 0, FALSE },
164 { "\xc1\x20", -1, 0, FALSE },
165 { "\xc2\x20", -1, 0, FALSE },
166 { "\xc3\x20", -1, 0, FALSE },
167 { "\xc4\x20", -1, 0, FALSE },
168 { "\xc5\x20", -1, 0, FALSE },
169 { "\xc6\x20", -1, 0, FALSE },
170 { "\xc7\x20", -1, 0, FALSE },
171 { "\xc8\x20", -1, 0, FALSE },
172 { "\xc9\x20", -1, 0, FALSE },
173 { "\xca\x20", -1, 0, FALSE },
174 { "\xcb\x20", -1, 0, FALSE },
175 { "\xcc\x20", -1, 0, FALSE },
176 { "\xcd\x20", -1, 0, FALSE },
177 { "\xce\x20", -1, 0, FALSE },
178 { "\xcf\x20", -1, 0, FALSE },
179 { "\xd0\x20", -1, 0, FALSE },
180 { "\xd1\x20", -1, 0, FALSE },
181 { "\xd2\x20", -1, 0, FALSE },
182 { "\xd3\x20", -1, 0, FALSE },
183 { "\xd4\x20", -1, 0, FALSE },
184 { "\xd5\x20", -1, 0, FALSE },
185 { "\xd6\x20", -1, 0, FALSE },
186 { "\xd7\x20", -1, 0, FALSE },
187 { "\xd8\x20", -1, 0, FALSE },
188 { "\xd9\x20", -1, 0, FALSE },
189 { "\xda\x20", -1, 0, FALSE },
190 { "\xdb\x20", -1, 0, FALSE },
191 { "\xdc\x20", -1, 0, FALSE },
192 { "\xdd\x20", -1, 0, FALSE },
193 { "\xde\x20", -1, 0, FALSE },
194 { "\xdf\x20", -1, 0, FALSE },
195 { "\xe0\x20", -1, 0, FALSE },
196 { "\xe1\x20", -1, 0, FALSE },
197 { "\xe2\x20", -1, 0, FALSE },
198 { "\xe3\x20", -1, 0, FALSE },
199 { "\xe4\x20", -1, 0, FALSE },
200 { "\xe5\x20", -1, 0, FALSE },
201 { "\xe6\x20", -1, 0, FALSE },
202 { "\xe7\x20", -1, 0, FALSE },
203 { "\xe8\x20", -1, 0, FALSE },
204 { "\xe9\x20", -1, 0, FALSE },
205 { "\xea\x20", -1, 0, FALSE },
206 { "\xeb\x20", -1, 0, FALSE },
207 { "\xec\x20", -1, 0, FALSE },
208 { "\xed\x20", -1, 0, FALSE },
209 { "\xee\x20", -1, 0, FALSE },
210 { "\xef\x20", -1, 0, FALSE },
211 { "\xf0\x20", -1, 0, FALSE },
212 { "\xf1\x20", -1, 0, FALSE },
213 { "\xf2\x20", -1, 0, FALSE },
214 { "\xf3\x20", -1, 0, FALSE },
215 { "\xf4\x20", -1, 0, FALSE },
216 { "\xf5\x20", -1, 0, FALSE },
217 { "\xf6\x20", -1, 0, FALSE },
218 { "\xf7\x20", -1, 0, FALSE },
219 { "\xf8\x20", -1, 0, FALSE },
220 { "\xf9\x20", -1, 0, FALSE },
221 { "\xfa\x20", -1, 0, FALSE },
222 { "\xfb\x20", -1, 0, FALSE },
223 { "\xfc\x20", -1, 0, FALSE },
224 { "\xfd\x20", -1, 0, FALSE },
225 /* missing continuation bytes */
226 { "\x20\xc0", -1, 1, FALSE },
227 { "\x20\xe0\x80", -1, 1, FALSE },
228 { "\x20\xf0\x80\x80", -1, 1, FALSE },
229 { "\x20\xf8\x80\x80\x80", -1, 1, FALSE },
230 { "\x20\xfc\x80\x80\x80\x80", -1, 1, FALSE },
231 { "\x20\xdf", -1, 1, FALSE },
232 { "\x20\xef\xbf", -1, 1, FALSE },
233 { "\x20\xf7\xbf\xbf", -1, 1, FALSE },
234 { "\x20\xfb\xbf\xbf\xbf", -1, 1, FALSE },
235 { "\x20\xfd\xbf\xbf\xbf\xbf", -1, 1, FALSE },
236 /* impossible bytes */
237 { "\x20\xfe\x20", -1, 1, FALSE },
238 { "\x20\xff\x20", -1, 1, FALSE },
239 /* overlong sequences */
240 { "\x20\xc0\xaf\x20", -1, 1, FALSE },
241 { "\x20\xe0\x80\xaf\x20", -1, 1, FALSE },
242 { "\x20\xf0\x80\x80\xaf\x20", -1, 1, FALSE },
243 { "\x20\xf8\x80\x80\x80\xaf\x20", -1, 1, FALSE },
244 { "\x20\xfc\x80\x80\x80\x80\xaf\x20", -1, 1, FALSE },
245 { "\x20\xc1\xbf\x20", -1, 1, FALSE },
246 { "\x20\xe0\x9f\xbf\x20", -1, 1, FALSE },
247 { "\x20\xf0\x8f\xbf\xbf\x20", -1, 1, FALSE },
248 { "\x20\xf8\x87\xbf\xbf\xbf\x20", -1, 1, FALSE },
249 { "\x20\xfc\x83\xbf\xbf\xbf\xbf\x20", -1, 1, FALSE },
250 { "\x20\xc0\x80\x20", -1, 1, FALSE },
251 { "\x20\xe0\x80\x80\x20", -1, 1, FALSE },
252 { "\x20\xf0\x80\x80\x80\x20", -1, 1, FALSE },
253 { "\x20\xf8\x80\x80\x80\x80\x20", -1, 1, FALSE },
254 { "\x20\xfc\x80\x80\x80\x80\x80\x20", -1, 1, FALSE },
255 /* illegal code positions */
256 { "\x20\xed\xa0\x80\x20", -1, 1, FALSE },
257 { "\x20\xed\xad\xbf\x20", -1, 1, FALSE },
258 { "\x20\xed\xae\x80\x20", -1, 1, FALSE },
259 { "\x20\xed\xaf\xbf\x20", -1, 1, FALSE },
260 { "\x20\xed\xb0\x80\x20", -1, 1, FALSE },
261 { "\x20\xed\xbe\x80\x20", -1, 1, FALSE },
262 { "\x20\xed\xbf\xbf\x20", -1, 1, FALSE },
263 { "\x20\xed\xa0\x80\xed\xb0\x80\x20", -1, 1, FALSE },
264 { "\x20\xed\xa0\x80\xed\xbf\xbf\x20", -1, 1, FALSE },
265 { "\x20\xed\xad\xbf\xed\xb0\x80\x20", -1, 1, FALSE },
266 { "\x20\xed\xad\xbf\xed\xbf\xbf\x20", -1, 1, FALSE },
267 { "\x20\xed\xae\x80\xed\xb0\x80\x20", -1, 1, FALSE },
268 { "\x20\xed\xae\x80\xed\xbf\xbf\x20", -1, 1, FALSE },
269 { "\x20\xed\xaf\xbf\xed\xb0\x80\x20", -1, 1, FALSE },
270 { "\x20\xed\xaf\xbf\xed\xbf\xbf\x20", -1, 1, FALSE },
271 { "\x20\xef\xbf\xbe\x20", -1, 1, FALSE },
272 { "\x20\xef\xbf\xbf\x20", -1, 1, FALSE },
273
274 { NULL, }
275 };
276
277 static void
do_test(gint index,const gchar * text,gint max_len,gint offset,gboolean valid)278 do_test (gint index,
279 const gchar *text,
280 gint max_len,
281 gint offset,
282 gboolean valid)
283 {
284 const gchar *end;
285 gboolean result;
286
287 result = g_utf8_validate (text, max_len, &end);
288
289 if (result != valid || end - text != offset)
290 {
291 GString *str;
292 const gchar *p;
293
294 any_failed = TRUE;
295
296 str = g_string_new (0);
297 for (p = text; *p; p++)
298 g_string_append_printf (str, "\\x%02hhx", *p);
299 g_print ("%d: g_utf8_validate (\"%s\", %d) failed, "
300 "expected %s %d, got %s %d\n",
301 index,
302 str->str, max_len,
303 valid ? "TRUE" : "FALSE", offset,
304 result ? "TRUE" : "FALSE", (gint) (end - text));
305 g_string_free (str, FALSE);
306 }
307 }
308
309 int
main(int argc,char * argv[])310 main (int argc, char *argv[])
311 {
312 gint i;
313
314 for (i = 0; test[i].text; i++)
315 do_test (i, test[i].text, test[i].max_len,
316 test[i].offset, test[i].valid);
317
318 return any_failed ? 1 : 0;
319 }
320