1 /*
2 * Copyright 2001-2004 Unicode, Inc.
3 *
4 * Disclaimer
5 *
6 * This source code is provided as is by Unicode, Inc. No claims are
7 * made as to fitness for any particular purpose. No warranties of any
8 * kind are expressed or implied. The recipient agrees to determine
9 * applicability of information provided. If this file has been
10 * purchased on magnetic or optical media from Unicode, Inc., the
11 * sole remedy for any claim will be exchange of defective media
12 * within 90 days of receipt.
13 *
14 * Limitations on Rights to Redistribute This Code
15 *
16 * Unicode, Inc. hereby grants the right to freely use the information
17 * supplied in this file in the creation of products supporting the
18 * Unicode Standard, and to make copies of this file in any form
19 * for internal or external distribution as long as this notice
20 * remains attached.
21 *
22 * harness.c
23 *
24 * This is a test harness for "ConvertUTF.c". Compile this
25 * and run without arguments. It will exhaustively test
26 * the conversion routines, and print a few lines of diagnostic
27 * output. You don't need to compile ConvertUTF.c itself,
28 * since it gets #included here along with the header.
29 * Example of a compile line:
30 *
31 * $ gcc -g harness.c -o harness
32 *
33 * Rev History: Rick McGowan, new file April 2001.
34 * Sept 19, 2002: Corrected error on line 234: utf16_buf[2] becomes utf16_result[2]
35 * per report from Iain Murray.
36 * July 3, 2003: Updated printout message.
37 * Oct 19, 2004: Updated isLegalUTF8 test data and corrected switch statements to catch
38 * illegal surrogate use in UTF-8, per report from Frank Tang.
39 *
40 */
41
42 #define CVTUTF_DEBUG 1
43
#include <stdio.h>
#include <stdlib.h>
#include "ConvertUTF.c"
46
47 /* ---------------------------------------------------------------------
48 test01 - Spot check a few legal & illegal UTF-8 values only.
49 This is not an exhaustive test, just a brief one that was
50 used to develop the "isLegalUTF8" routine.
51
52 Legal UTF-8 sequences are:
53
54 1st---- 2nd---- 3rd---- 4th---- Codepoints---
55
56 00-7F 0000- 007F
57 C2-DF 80-BF 0080- 07FF
58 E0 A0-BF 80-BF 0800- 0FFF
59 E1-EC 80-BF 80-BF 1000- CFFF
60 ED 80-9F 80-BF D000- D7FF
61 EE-EF 80-BF 80-BF E000- FFFF
62 F0 90-BF 80-BF 80-BF 10000- 3FFFF
63 F1-F3 80-BF 80-BF 80-BF 40000- FFFFF
64 F4 80-8F 80-BF 80-BF 100000-10FFFF
65
66 --------------------------------------------------------------------- */
67
68
/* One row of the UTF-8 legality spot-check table consumed by test01. */
struct utf8_test {
    Boolean utf8_legal;	/* is legal sequence? (the result test01 expects from isLegalUTF8) */
    int utf8_len;		/* length of sequence passed to isLegalUTF8; 0 terminates the table */
    unsigned char utf8_seq[5];	/* the sequence; all five bytes are printed by test01 on a mismatch */
};
74
75 struct utf8_test utf8_testData[] = {
76 { 1, 1, { 0x7A, 0x00, 0x00, 0x00, 0x00 }}, /* 0 */
77 { 1, 2, { 0xC2, 0xAC, 0x00, 0x00, 0x00 }}, /* 1 */
78 { 1, 2, { 0xDF, 0xB2, 0x00, 0x00, 0x00 }}, /* 2 */
79 { 1, 3, { 0xE0, 0xA1, 0x81, 0x00, 0x00 }}, /* 3 */
80 { 1, 3, { 0xE1, 0xAC, 0x90, 0x00, 0x00 }}, /* 4 */
81 { 1, 3, { 0xF0, 0x93, 0xB2, 0xA1, 0x00 }}, /* 5 */
82 { 1, 4, { 0xF1, 0x87, 0x9A, 0xB0, 0x00 }}, /* 6 */
83 { 1, 4, { 0xF3, 0x88, 0x9B, 0xAD, 0x00 }}, /* 7 */
84 { 1, 4, { 0xF4, 0x82, 0x89, 0x8F, 0x00 }}, /* 8 */
85
86 { 0, 3, { 0x82, 0x00, 0x00, 0x00, 0x00 }}, /* 9 */
87 { 0, 2, { 0xF8, 0xAC, 0x00, 0x00, 0x00 }}, /* 10 */
88 { 0, 2, { 0xE1, 0xFC, 0xFF, 0x00, 0x00 }}, /* 11 */
89 { 0, 3, { 0xC2, 0xFC, 0x00, 0x00, 0x00 }}, /* 12 */
90 { 0, 3, { 0xE1, 0xC2, 0x81, 0x00, 0x00 }}, /* 13 */
91 { 0, 2, { 0xC2, 0xC1, 0x00, 0x00, 0x00 }}, /* 14 */
92 { 0, 2, { 0xC0, 0xAF, 0x00, 0x00, 0x00 }}, /* 15 */
93 { 0, 3, { 0xE0, 0x9F, 0x80, 0x00, 0x00 }}, /* 16 */
94 { 0, 4, { 0xF0, 0x93, 0xB2, 0xC1, 0x00 }}, /* 17 */
95
96 { 1, 3, { 0xED, 0x9F, 0xBF, 0x00, 0x00 }}, /* 18 */
97 { 1, 3, { 0xEE, 0x80, 0x80, 0x00, 0x00 }}, /* 19 */
98 { 0, 3, { 0xED, 0xA0, 0x80, 0x00, 0x00 }}, /* 20 */
99 { 0, 3, { 0xED, 0xBF, 0xBF, 0x00, 0x00 }}, /* 21 */
100
101 /* for all > 21 use "short" buffer lengths to detect over-run */
102 { 0, 4, { 0xF0, 0x93, 0xB2, 0xC3, 0x00 }}, /* 18 use short buflen */
103 { 0, 0, { 0x00, 0x00, 0x00, 0x00, 0x00 }},
104
105 };
106
test01()107 int test01() {
108 int i;
109 int rval, wantVal1, wantVal2, gotVal1, gotVal2, len2;
110
111 printf("Begin Test01\n"); fflush(stdout);
112
113 rval = 0;
114 for (i = 0; utf8_testData[i].utf8_len; i++) {
115 wantVal1 = wantVal2 = utf8_testData[i].utf8_legal;
116 gotVal1 = isLegalUTF8(&(utf8_testData[i].utf8_seq[0]), utf8_testData[i].utf8_len);
117 /* use truncated length for tests over 21 */
118 if (i <= 21) { len2 = 4; } else { len2 = utf8_testData[i].utf8_len-1; wantVal2 = 0; }
119 gotVal2 = isLegalUTF8Sequence(&(utf8_testData[i].utf8_seq[0]), &(utf8_testData[i].utf8_seq[0])+len2);
120 if ((gotVal1 != wantVal1) || (gotVal2 != wantVal2)) {
121 printf("Test01 error: seq %d is %d & %d (should be %d & %d) for bytes (%x,%x,%x,%x,%x,) & len %d\n",
122 i, gotVal1, gotVal2, wantVal1, wantVal2, utf8_testData[i].utf8_seq[0],
123 utf8_testData[i].utf8_seq[1], utf8_testData[i].utf8_seq[2],
124 utf8_testData[i].utf8_seq[3], utf8_testData[i].utf8_seq[4],
125 utf8_testData[i].utf8_len);
126 ++rval;
127 }
128 }
129
130 return (rval ? 0 : 1);
131 }
132
133
134 /* ---------------------------------------------------------------------
135 test02 - Test round trip UTF32 -> UTF16 -> UTF8 -> UTF16 -> UTF32
136
137 This is an exhaustive test of values 0 through 0x10FFFF. It
	takes each integer value and converts from UTF32 through the
	other encoding forms, and back to UTF32, checking the results
140 along the way.
141
	It does not check unpaired surrogates, except for the first
	high surrogate (0xD800). It expects that one conversion to be
	reported as illegal, prints a message, and continues with tests.
145
146 --------------------------------------------------------------------- */
147
test02()148 int test02() {
149 int i, n;
150 ConversionResult result;
151 UTF32 utf32_buf[2], utf32_result[2];
152 UTF16 utf16_buf[3], utf16_result[3];
153 UTF8 utf8_buf[8];
154 UTF32 *utf32SourceStart, *utf32TargetStart;
155 UTF16 *utf16SourceStart, *utf16TargetStart;
156 UTF8 *utf8SourceStart, *utf8TargetStart;
157
158 printf("Begin Test02\n"); fflush(stdout);
159
160 for (i = 0; i <= 0x10FFFF; i++) {
161 utf32_buf[0] = i; utf32_buf[1] = 0;
162 utf32_result[0] = utf32_result[1] = 0;
163 utf16_buf[0] = utf16_buf[1] = utf16_buf[2] = 0;
164 utf16_result[0] = utf16_result[1] = utf16_result[2] = 0;
165 for (n = 0; n < 8; n++) utf8_buf[n] = 0;
166
167 utf32SourceStart = utf32_buf; utf32TargetStart = utf32_result;
168 utf16TargetStart = utf16SourceStart = utf16_buf;
169 utf8TargetStart = utf8SourceStart = utf8_buf;
170
171 /*
172 * Test UTF32 -> UTF16
173 */
174 result = ConvertUTF32toUTF16((const UTF32 **) &utf32SourceStart, &(utf32_buf[1]), &utf16TargetStart, &(utf16_buf[2]), strictConversion);
175 if (i < UNI_SUR_HIGH_START || i > UNI_SUR_LOW_END) {
176 /* skip result checking for all but 0000d800, which we know to be illegal */
177 switch (result) {
178 default: fprintf(stderr, "Test02A fatal error: result %d for input %08x\n", result, utf32_buf[0]); exit(1);
179 case conversionOK: break;
180 case sourceExhausted: printf("sourceExhausted\t"); break;
181 case targetExhausted: printf("targetExhausted\t"); break;
182 case sourceIllegal: printf("sourceIllegal\t"); break;
183 }
184 }
185 if (result != conversionOK) {
186 if (i <= UNI_SUR_HIGH_START || i > UNI_SUR_LOW_END) {
187 printf("Test02A for %d, input %08x, output %04x,%04x, result %d\n",
188 i, utf32_buf[0], utf16_buf[0], utf16_buf[1], result);
189 if ((i != UNI_SUR_HIGH_START) || (result != sourceIllegal)) {
190 return 0;
191 } else {
192 printf("!!! Test02A: note expected illegal result for 0x0000D800\n");
193 }
194 }
195 }
196 if (i > UNI_SUR_HIGH_START && i <= UNI_SUR_LOW_END) continue;
197
198 /*
199 * Test UTF16 -> UTF8, with legality check on. We check for everything except
200 * for unpaired low surrogates. We do make one check that the lowest low
201 * surrogate, when unpaired, is illegal.
202 */
203 result = ConvertUTF16toUTF8((const UTF16 **) &utf16SourceStart, &(utf16_buf[2]), &utf8TargetStart, &(utf8_buf[7]), strictConversion);
204 switch (result) {
205 default: fprintf(stderr, "Test02B fatal error: result %d for input %08x\n", result, utf32_buf[0]); exit(1);
206 case conversionOK: break;
207 case sourceExhausted: printf("sourceExhausted\t"); break;
208 case targetExhausted: printf("targetExhausted\t"); break;
209 case sourceIllegal: printf("sourceIllegal\t"); break;
210 }
211 if (result != conversionOK) {
212 printf("Test02B for %d (0x%x), input %04x,%04x; output %s; result %d\n",
213 i, utf32_buf[0], utf16_buf[0], utf16_buf[1], utf8_buf, result);
214 if ((i != UNI_SUR_LOW_START) && (i != UNI_SUR_HIGH_START)) {
215 return 0;
216 } else {
217 /* Note: This illegal result only happens if we remove the surrogate
218 check in Test02A. So it shouldn't be seen unless that check and
219 the "continue" are removed in the test above.
220 */
221 if (i == UNI_SUR_LOW_START)
222 printf("!!! Test02B: note expected illegal result for 0xDC00,0000\n");
223 else if (i == UNI_SUR_HIGH_START)
224 printf("!!! Test02B: note expected illegal result for 0xD800,0000\n");
225 }
226 }
227 if ((i == UNI_SUR_LOW_START) && result != sourceIllegal) {
228 printf("Test02B for %d (0x%x), input %04x,%04x; output %s; result %d\n",
229 i, utf32_buf[0], utf16_buf[0], utf16_buf[1], utf8_buf, result);
230 printf("Test02B: expected illegal result for 0xDC00,0000 was not flagged illegal.\n");
231 return 0;
232 }
233
234 if ((i >= UNI_SUR_HIGH_START) & (i <= UNI_SUR_LOW_END)) continue;
235
236 /*
237 * Reset some result buffer pointers for the trip back.
238 */
239 utf32SourceStart = utf32_buf; utf32TargetStart = utf32_result;
240 utf16TargetStart = utf16SourceStart = utf16_result;
241 utf8TargetStart = utf8SourceStart = utf8_buf;
242
243 /*
244 * Test UTF8 -> UTF16, with legality check on.
245 */
246 result = ConvertUTF8toUTF16((const UTF8 **) &utf8SourceStart, &(utf8_buf[trailingBytesForUTF8[utf8_buf[0]]+1]), &utf16TargetStart, &(utf16_result[2]), strictConversion);
247 switch (result) {
248 default: fprintf(stderr, "Test02C fatal error: result %d for input %08x\n", result, utf32_buf[0]); exit(1);
249 case conversionOK: break;
250 case sourceExhausted: printf("sourceExhausted\t"); break;
251 case targetExhausted: printf("targetExhausted\t"); break;
252 case sourceIllegal: printf("sourceIllegal\t"); break;
253 }
254 if (result != conversionOK) {
255 printf("Test02C for %d (0x%x), input %s; output %04x,%04x; result %d\n",
256 i, utf32_buf[0], utf8_buf, utf16_buf[0], utf16_buf[1], result);
257 return 0;
258 }
259 for (n = 0; n < 3; n++) { /* check that the utf16 result is the same as what went in. */
260 if (utf16_buf[n] != utf16_result[n]) {
261 printf("Test02C error: input = 0x%08x; utf16_buf = 0x%04x,0x%04x; utf16_result = 0x%04x,0x%04x\n",
262 utf32_buf[0], utf16_buf[0], utf16_buf[1], utf16_result[0], utf16_result[1]);
263 return 0;
264 }
265 }
266
267 /*
268 * Test UTF16 -> UTF32, with legality check on. If the result of our previous
269 * conversion gave us a "surrogate pair", then we need to convert 2 entities
270 * back to UTF32.
271 */
272 if (utf16_result[0] >= UNI_SUR_HIGH_START && utf16_result[0] <= UNI_SUR_HIGH_END) {
273 result = ConvertUTF16toUTF32((const UTF16 **) &utf16SourceStart, &(utf16_result[2]), &utf32TargetStart, &(utf32_result[1]), strictConversion);
274 } else {
275 result = ConvertUTF16toUTF32((const UTF16 **) &utf16SourceStart, &(utf16_result[1]), &utf32TargetStart, &(utf32_result[1]), strictConversion);
276 }
277 switch (result) {
278 default: fprintf(stderr, "Test02D fatal error: result %d for input %08x\n", result, utf32_buf[0]); exit(1);
279 case conversionOK: break;
280 case sourceExhausted: printf("sourceExhausted\t"); break;
281 case targetExhausted: printf("targetExhausted\t"); break;
282 case sourceIllegal: printf("sourceIllegal\t"); break;
283 }
284 if (result != conversionOK) {
285 printf("Test02D for %d (0x%x), input %04x,%04x; output %08x; result %d\n",
286 i, utf32_buf[0], utf16_buf[0], utf16_buf[1], utf32_result[0], result);
287 return 0;
288 }
289
290 /*
291 * Now, check the final round-trip value.
292 */
293 if (utf32_buf[0] != utf32_result[0]) {
294 printf("Test02E for %d: utf32 input %08x; trip output %08x (utf_16buf is %04x,%04x)\n", i, utf32_buf[0], utf32_result[0], utf16_buf[0], utf16_buf[1]);
295 return 0;
296 }
297 }
298 return 1;
299 }
300
301 /* ---------------------------------------------------------------------
302 test03 - Test round trip UTF32 -> UTF8 -> UTF32
303
304 This tests the functions that were not tested by test02 above.
305 For each UTF32 value 0 through 0x10FFFF, it tests the conversion
306 to UTF-8 and back. The test is exhaustive.
307
308 --------------------------------------------------------------------- */
309
test03()310 int test03() {
311 int i, n;
312 ConversionResult result;
313 UTF32 utf32_buf[2], utf32_result[2];
314 UTF8 utf8_buf[8];
315 UTF32 *utf32SourceStart, *utf32TargetStart;
316 UTF8 *utf8SourceStart, *utf8TargetStart;
317
318 printf("Begin Test03\n"); fflush(stdout);
319
320 for (i = 0; i <= 0x10FFFF; i++) {
321 /* Skip all surrogates except UNI_SUR_HIGH_START, which we test for illegality. */
322 if (i > UNI_SUR_HIGH_START && i <= UNI_SUR_LOW_END) continue;
323
324 utf32_buf[0] = i; utf32_buf[1] = 0;
325 utf32_result[0] = utf32_result[1] = 0;
326 for (n = 0; n < 8; n++) utf8_buf[n] = 0;
327
328 utf32SourceStart = utf32_buf; utf32TargetStart = utf32_result;
329 utf8TargetStart = utf8SourceStart = utf8_buf;
330
331 /*
332 * Test UTF32 -> UTF8, with legality check on.
333 */
334 result = ConvertUTF32toUTF8((const UTF32 **) &utf32SourceStart, &(utf32_buf[1]), & utf8TargetStart, &(utf8_buf[7]), strictConversion);
335 switch (result) {
336 default: fprintf(stderr, "Test03A fatal error: result %d for input %08x\n", result, utf32_buf[0]); exit(1);
337 case conversionOK: break;
338 case sourceExhausted: printf("sourceExhausted\t"); break;
339 case targetExhausted: printf("targetExhausted\t"); break;
340 case sourceIllegal: printf("sourceIllegal\t"); break;
341 }
342 if (result != conversionOK) {
343 printf("Test03A for %d (0x%x); output %s; result %d\n",
344 i, utf32_buf[0], utf8_buf, result);
345 if (i != UNI_SUR_HIGH_START) {
346 return 0;
347 } else {
348 printf("!!! Test03A: note expected illegal result for 0x0000D800\n");
349 }
350 }
351 if ((i == UNI_SUR_HIGH_START) && result != sourceIllegal) {
352 printf("Test03A for %d (0x%x); output %s; result %d\n",
353 i, utf32_buf[0], utf8_buf, result);
354 printf("Test03A: expected illegal result for 0x0000D800 was not flagged illegal.\n");
355 return 0;
356 }
357
358 if ((i >= UNI_SUR_HIGH_START) & (i <= UNI_SUR_LOW_END)) continue;
359
360 /*
361 * Reset some result buffer pointers for the trip back.
362 */
363 utf32SourceStart = utf32_buf; utf32TargetStart = utf32_result;
364 utf8TargetStart = utf8SourceStart = utf8_buf;
365
366 /*
367 * Test UTF8 -> UTF32, with legality check on.
368 */
369 result = ConvertUTF8toUTF32((const UTF8 **) &utf8SourceStart, &(utf8_buf[trailingBytesForUTF8[utf8_buf[0]]+1]), &utf32TargetStart, &(utf32_result[1]), strictConversion);
370 switch (result) {
371 default: fprintf(stderr, "Test03B fatal error: result %d for input %08x\n", result, utf32_buf[0]); exit(1);
372 case conversionOK: break;
373 case sourceExhausted: printf("sourceExhausted\t"); break;
374 case targetExhausted: printf("targetExhausted\t"); break;
375 case sourceIllegal: printf("sourceIllegal\t"); break;
376 }
377 if (result != conversionOK) {
378 printf("Test03B for %d (0x%x), input %s; output 0x%08x; result %d\n",
379 i, utf32_buf[0], utf8_buf, utf32_result[0], result);
380 return 0;
381 }
382
383 /*
384 * Now, check the final round-trip value.
385 */
386 if (utf32_buf[0] != utf32_result[0]) {
387 printf("Test03C for %d: utf32 input %08x; utf8 buf %s; trip output %08x\n", i, utf32_buf[0], utf8_buf, utf32_result[0]);
388 return 0;
389 }
390 }
391 return 1;
392 }
393
394 /* ---------------------------------------------------------------------
395 test04 - Test an illegal UTF-32 value > 10FFFF conversion to UTF-8.
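	Expect the conversion to be flagged sourceIllegal. (The original
	note expected the value to be turned into UNI_REPLACEMENT_CHAR;
	test04 checks only the result code, not the bytes written.)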
397
398 --------------------------------------------------------------------- */
399
test04()400 int test04() {
401 int i, n;
402 ConversionResult result;
403 UTF32 utf32_buf[2];
404 UTF8 utf8_buf[8];
405 UTF32 *utf32SourceStart, *utf32TargetStart;
406 UTF8 *utf8SourceStart, *utf8TargetStart;
407
408 printf("Begin Test04\n"); fflush(stdout);
409
410 i = 0x10FFFF + 21; /* an arbitrary value > legal */
411
412 utf32_buf[0] = i; utf32_buf[1] = 0;
413 for (n = 0; n < 8; n++) utf8_buf[n] = 0;
414
415 utf32SourceStart = utf32_buf;
416 utf8TargetStart = utf8_buf;
417
418 /*
419 * Test UTF32 -> UTF8, with legality check on.
420 */
421 result = ConvertUTF32toUTF8((const UTF32 **) &utf32SourceStart, &(utf32_buf[1]), & utf8TargetStart, &(utf8_buf[7]), strictConversion);
422 if (result != sourceIllegal) {
423 fprintf(stderr, "Test04A fatal error: result %d for input %08x\n", result, utf32_buf[0]); exit(1);
424 }
425
426 return 1;
427 }
428
429 /* --------------------------------------------------------------------- */
430
/*
 * Run test01 through test04 in order, printing a success or failure banner
 * for each.  Returns 0 when every test reported success and 1 when at
 * least one test returned failure, so callers (scripts, make) can detect
 * a failing run from the exit status.
 */
int main(void) {
    int failures = 0;

    printf("Three tests of round-trip conversions will be performed.\n");
    printf("One test of illegal UTF-32 will be performed.\n");
    printf("Two illegal result messages are expected; one in test 02A; one in test 03A.\n");
    printf("These are for tests of Surrogate conversion.\n\n");
    fflush(stdout);

    if (test01()) { printf("******** Test01 succeeded without error. ********\n\n"); }
    else { printf("-------- Test01 failed. --------\n\n"); ++failures; }
    if (test02()) { printf("******** Test02 succeeded without error. ********\n\n"); }
    else { printf("-------- Test02 failed. --------\n\n"); ++failures; }
    if (test03()) { printf("******** Test03 succeeded without error. ********\n\n"); }
    else { printf("-------- Test03 failed. --------\n\n"); ++failures; }
    if (test04()) { printf("******** Test04 succeeded without error. ********\n\n"); }
    else { printf("-------- Test04 failed. --------\n\n"); ++failures; }

    return failures ? 1 : 0;
}
447