1 #undef G_DISABLE_ASSERT
2 #undef G_LOG_DOMAIN
3
4 #include <stdarg.h>
5 #include <stdio.h>
6 #include <stdlib.h>
7 #include <string.h>
8 #include <glib.h>
9
/* Process exit code returned by main(); fail() ORs 1 into it, so it
 * becomes non-zero as soon as any check has failed. */
static gint exit_status = 0;
11
12 G_GNUC_PRINTF (1, 2)
13 static void
croak(char * format,...)14 croak (char *format, ...)
15 {
16 va_list va;
17
18 va_start (va, format);
19 vfprintf (stderr, format, va);
20 va_end (va);
21
22 exit (1);
23 }
24
25 G_GNUC_PRINTF (1, 2)
26 static void
fail(char * format,...)27 fail (char *format, ...)
28 {
29 va_list va;
30
31 va_start (va, format);
32 vfprintf (stderr, format, va);
33 va_end (va);
34
35 exit_status |= 1;
36 }
37
/* Expected classification of a UTF-8 test string, as named by the
 * status line of each record in the test data file. */
typedef enum
{
  VALID = 0,    /* g_utf8_validate() must accept; all round-trips are checked */
  INCOMPLETE,   /* must be rejected; conversion must report partial input */
  NOTUNICODE,   /* must be rejected; conversion to UCS-4 is still checked */
  OVERLONG,     /* must be rejected by g_utf8_validate() */
  MALFORMED     /* must be rejected by g_utf8_validate() */
} Status;
46
47 static gboolean
ucs4_equal(gunichar * a,gunichar * b)48 ucs4_equal (gunichar *a, gunichar *b)
49 {
50 while (*a && *b && (*a == *b))
51 {
52 a++;
53 b++;
54 }
55
56 return (*a == *b);
57 }
58
59 static gboolean
utf16_equal(gunichar2 * a,gunichar2 * b)60 utf16_equal (gunichar2 *a, gunichar2 *b)
61 {
62 while (*a && *b && (*a == *b))
63 {
64 a++;
65 b++;
66 }
67
68 return (*a == *b);
69 }
70
71 static gint
utf16_count(gunichar2 * a)72 utf16_count (gunichar2 *a)
73 {
74 gint result = 0;
75
76 while (a[result])
77 result++;
78
79 return result;
80 }
81
82 static void
print_ucs4(const gchar * prefix,gunichar * ucs4,gint ucs4_len)83 print_ucs4 (const gchar *prefix, gunichar *ucs4, gint ucs4_len)
84 {
85 gint i;
86 g_print ("%s ", prefix);
87 for (i = 0; i < ucs4_len; i++)
88 g_print ("%x ", ucs4[i]);
89 g_print ("\n");
90 }
91
92 static void
process(gint line,gchar * utf8,Status status,gunichar * ucs4,gint ucs4_len)93 process (gint line,
94 gchar *utf8,
95 Status status,
96 gunichar *ucs4,
97 gint ucs4_len)
98 {
99 const gchar *end;
100 gboolean is_valid = g_utf8_validate (utf8, -1, &end);
101 GError *error = NULL;
102 glong items_read, items_written;
103
104 switch (status)
105 {
106 case VALID:
107 if (!is_valid)
108 {
109 fail ("line %d: valid but g_utf8_validate returned FALSE\n", line);
110 return;
111 }
112 break;
113 case NOTUNICODE:
114 case INCOMPLETE:
115 case OVERLONG:
116 case MALFORMED:
117 if (is_valid)
118 {
119 fail ("line %d: invalid but g_utf8_validate returned TRUE\n", line);
120 return;
121 }
122 break;
123 }
124
125 if (status == INCOMPLETE)
126 {
127 gunichar *ucs4_result;
128
129 ucs4_result = g_utf8_to_ucs4 (utf8, -1, NULL, NULL, &error);
130
131 if (!error || !g_error_matches (error, G_CONVERT_ERROR, G_CONVERT_ERROR_PARTIAL_INPUT))
132 {
133 fail ("line %d: incomplete input not properly detected\n", line);
134 return;
135 }
136 g_clear_error (&error);
137
138 ucs4_result = g_utf8_to_ucs4 (utf8, -1, &items_read, NULL, &error);
139
140 if (!ucs4_result || items_read == strlen (utf8))
141 {
142 fail ("line %d: incomplete input not properly detected\n", line);
143 return;
144 }
145
146 g_free (ucs4_result);
147 }
148
149 if (status == VALID || status == NOTUNICODE)
150 {
151 gunichar *ucs4_result;
152
153 ucs4_result = g_utf8_to_ucs4 (utf8, -1, &items_read, &items_written, &error);
154 if (!ucs4_result)
155 {
156 fail ("line %d: conversion with status %d to ucs4 failed: %s\n", line, status, error->message);
157 return;
158 }
159
160 if (!ucs4_equal (ucs4_result, ucs4) ||
161 items_read != strlen (utf8) ||
162 items_written != ucs4_len)
163 {
164 fail ("line %d: results of conversion with status %d to ucs4 do not match expected.\n", line, status);
165 print_ucs4 ("expected: ", ucs4, ucs4_len);
166 print_ucs4 ("received: ", ucs4_result, items_written);
167 return;
168 }
169
170 g_free (ucs4_result);
171 }
172
173 if (status == VALID)
174 {
175 gunichar *ucs4_result;
176 gchar *utf8_result;
177
178 ucs4_result = g_utf8_to_ucs4_fast (utf8, -1, &items_written);
179
180 if (!ucs4_equal (ucs4_result, ucs4) ||
181 items_written != ucs4_len)
182 {
183 fail ("line %d: results of fast conversion with status %d to ucs4 do not match expected.\n", line, status);
184 print_ucs4 ("expected: ", ucs4, ucs4_len);
185 print_ucs4 ("received: ", ucs4_result, items_written);
186 return;
187 }
188
189 utf8_result = g_ucs4_to_utf8 (ucs4_result, -1, &items_read, &items_written, &error);
190 if (!utf8_result)
191 {
192 fail ("line %d: conversion back to utf8 failed: %s", line, error->message);
193 return;
194 }
195
196 if (strcmp (utf8_result, utf8) != 0 ||
197 items_read != ucs4_len ||
198 items_written != strlen (utf8))
199 {
200 fail ("line %d: conversion back to utf8 did not match original\n", line);
201 return;
202 }
203
204 g_free (utf8_result);
205 g_free (ucs4_result);
206 }
207
208 if (status == VALID)
209 {
210 gunichar2 *utf16_expected_tmp;
211 gunichar2 *utf16_expected;
212 gunichar2 *utf16_from_utf8;
213 gunichar2 *utf16_from_ucs4;
214 gunichar *ucs4_result;
215 gsize bytes_written;
216 gint n_chars;
217 gchar *utf8_result;
218
219 #if G_BYTE_ORDER == G_LITTLE_ENDIAN
220 #define TARGET "UTF-16LE"
221 #else
222 #define TARGET "UTF-16"
223 #endif
224
225 if (!(utf16_expected_tmp = (gunichar2 *)g_convert (utf8, -1, TARGET, "UTF-8",
226 NULL, &bytes_written, NULL)))
227 {
228 fail ("line %d: could not convert to UTF-16 via g_convert\n", line);
229 return;
230 }
231
232 /* zero-terminate and remove BOM
233 */
234 n_chars = bytes_written / 2;
235 if (utf16_expected_tmp[0] == 0xfeff) /* BOM */
236 {
237 n_chars--;
238 utf16_expected = g_new (gunichar2, n_chars + 1);
239 memcpy (utf16_expected, utf16_expected_tmp + 1, sizeof(gunichar2) * n_chars);
240 }
241 else if (utf16_expected_tmp[0] == 0xfffe) /* ANTI-BOM */
242 {
243 fail ("line %d: conversion via iconv to \"UTF-16\" is not native-endian\n", line);
244 return;
245 }
246 else
247 {
248 utf16_expected = g_new (gunichar2, n_chars + 1);
249 memcpy (utf16_expected, utf16_expected_tmp, sizeof(gunichar2) * n_chars);
250 }
251
252 utf16_expected[n_chars] = '\0';
253
254 if (!(utf16_from_utf8 = g_utf8_to_utf16 (utf8, -1, &items_read, &items_written, &error)))
255 {
256 fail ("line %d: conversion to ucs16 failed: %s\n", line, error->message);
257 return;
258 }
259
260 if (items_read != strlen (utf8) ||
261 utf16_count (utf16_from_utf8) != items_written)
262 {
263 fail ("line %d: length error in conversion to ucs16\n", line);
264 return;
265 }
266
267 if (!(utf16_from_ucs4 = g_ucs4_to_utf16 (ucs4, -1, &items_read, &items_written, &error)))
268 {
269 fail ("line %d: conversion to ucs16 failed: %s\n", line, error->message);
270 return;
271 }
272
273 if (items_read != ucs4_len ||
274 utf16_count (utf16_from_ucs4) != items_written)
275 {
276 fail ("line %d: length error in conversion to ucs16\n", line);
277 return;
278 }
279
280 if (!utf16_equal (utf16_from_utf8, utf16_expected) ||
281 !utf16_equal (utf16_from_ucs4, utf16_expected))
282 {
283 fail ("line %d: results of conversion to ucs16 do not match\n", line);
284 return;
285 }
286
287 if (!(utf8_result = g_utf16_to_utf8 (utf16_from_utf8, -1, &items_read, &items_written, &error)))
288 {
289 fail ("line %d: conversion back to utf8 failed: %s\n", line, error->message);
290 return;
291 }
292
293 if (items_read != utf16_count (utf16_from_utf8) ||
294 items_written != strlen (utf8))
295 {
296 fail ("line %d: length error in conversion from ucs16 to utf8\n", line);
297 return;
298 }
299
300 if (!(ucs4_result = g_utf16_to_ucs4 (utf16_from_ucs4, -1, &items_read, &items_written, &error)))
301 {
302 fail ("line %d: conversion back to utf8/ucs4 failed\n", line);
303 return;
304 }
305
306 if (items_read != utf16_count (utf16_from_utf8) ||
307 items_written != ucs4_len)
308 {
309 fail ("line %d: length error in conversion from ucs16 to ucs4\n", line);
310 return;
311 }
312
313 if (strcmp (utf8, utf8_result) != 0 ||
314 !ucs4_equal (ucs4, ucs4_result))
315 {
316 fail ("line %d: conversion back to utf8/ucs4 did not match original\n", line);
317 return;
318 }
319
320 g_free (utf16_expected_tmp);
321 g_free (utf16_expected);
322 g_free (utf16_from_utf8);
323 g_free (utf16_from_ucs4);
324 g_free (utf8_result);
325 g_free (ucs4_result);
326 }
327 }
328
329 int
main(int argc,char ** argv)330 main (int argc, char **argv)
331 {
332 gchar *testfile;
333 gchar *contents;
334 GError *error = NULL;
335 gchar *p, *end;
336 char *tmp;
337 gint state = 0;
338 gint line = 1;
339 gint start_line = 0; /* Quiet GCC */
340 gchar *utf8 = NULL; /* Quiet GCC */
341 GArray *ucs4;
342 Status status = VALID; /* Quiet GCC */
343
344 g_test_init (&argc, &argv, NULL);
345
346 testfile = g_test_build_filename (G_TEST_DIST, "utf8.txt", NULL);
347
348 g_file_get_contents (testfile, &contents, NULL, &error);
349 if (error)
350 croak ("Cannot open utf8.txt: %s", error->message);
351
352 ucs4 = g_array_new (TRUE, FALSE, sizeof(gunichar));
353
354 p = contents;
355
356 /* Loop over lines */
357 while (*p)
358 {
359 while (*p && (*p == ' ' || *p == '\t'))
360 p++;
361
362 end = p;
363 while (*end && (*end != '\r' && *end != '\n'))
364 end++;
365
366 if (!*p || *p == '#' || *p == '\r' || *p == '\n')
367 goto next_line;
368
369 tmp = g_strstrip (g_strndup (p, end - p));
370
371 switch (state)
372 {
373 case 0:
374 /* UTF-8 string */
375 start_line = line;
376 utf8 = tmp;
377 tmp = NULL;
378 break;
379
380 case 1:
381 /* Status */
382 if (!strcmp (tmp, "VALID"))
383 status = VALID;
384 else if (!strcmp (tmp, "INCOMPLETE"))
385 status = INCOMPLETE;
386 else if (!strcmp (tmp, "NOTUNICODE"))
387 status = NOTUNICODE;
388 else if (!strcmp (tmp, "OVERLONG"))
389 status = OVERLONG;
390 else if (!strcmp (tmp, "MALFORMED"))
391 status = MALFORMED;
392 else
393 croak ("Invalid status on line %d\n", line);
394
395 if (status != VALID && status != NOTUNICODE)
396 state++; /* No UCS-4 data */
397
398 break;
399
400 case 2:
401 /* UCS-4 version */
402
403 p = strtok (tmp, " \t");
404 while (p)
405 {
406 gchar *endptr;
407
408 gunichar ch = strtoul (p, &endptr, 16);
409 if (*endptr != '\0')
410 croak ("Invalid UCS-4 character on line %d\n", line);
411
412 g_array_append_val (ucs4, ch);
413
414 p = strtok (NULL, " \t");
415 }
416
417 break;
418 }
419
420 g_free (tmp);
421 state = (state + 1) % 3;
422
423 if (state == 0)
424 {
425 process (start_line, utf8, status, (gunichar *)ucs4->data, ucs4->len);
426 g_array_set_size (ucs4, 0);
427 g_free (utf8);
428 }
429
430 next_line:
431 p = end;
432 if (*p && *p == '\r')
433 p++;
434 if (*p && *p == '\n')
435 p++;
436
437 line++;
438 }
439
440 g_free (testfile);
441 g_array_free (ucs4, TRUE);
442 g_free (contents);
443 return exit_status;
444 }
445