1 /**
2 * Test the UTF-8 decoding routines
3 *
4 * author: Daniel Veillard
5 * copy: see Copyright for the status of this software.
6 */
7
8 #include <stdio.h>
9 #include <string.h>
10 #include <libxml/parser.h>
11 #include <libxml/parserInternals.h>
12
13 #include "buf.h"
14
/*
 * Code of the first error recorded by the structured error handler since
 * the value was last reset to 0; 0 means no error has been recorded.
 */
int lastError;
16
/**
 * errorHandler:
 * @unused:  user data of the callback; the error is only recorded when NULL
 * @err:  the reported error, may be NULL
 *
 * Structured error callback. Stores the code of the first error seen
 * in lastError; later errors are dropped until lastError is cleared to 0.
 */
static void errorHandler(void *unused, xmlErrorPtr err) {
    if ((unused != NULL) || (err == NULL))
        return;

    /* keep only the first error of a run */
    if (lastError != 0)
        return;

    lastError = err->code;
}
22
/*
 * Document templates for the in-context tests. The XXXX run is the four
 * byte area the tests overwrite: it starts at document1[5] (element content)
 * and at document2[10] (attribute value).
 */
char document1[100] = "<doc>XXXX</doc>";
char document2[100] = "<doc foo='XXXX'/>";
25
/**
 * testDocumentRangeByte1:
 * @ctxt:  parser context, reset before every parse
 * @document:  start of the buffer handed to the parser
 * @len:  number of bytes of @document to parse
 * @data:  location whose first byte receives each candidate value
 *         (the callers pass a pointer inside @document)
 * @forbid1:  byte value that must be rejected at this position, or -1
 * @forbid2:  second byte value that must be rejected, or -1
 *
 * Writes every value 0x00..0xFF into data[0], parses the document and
 * reports on stderr each byte whose outcome is not the expected one.
 */
static void testDocumentRangeByte1(xmlParserCtxtPtr ctxt, char *document,
                  int len, char *data, int forbid1, int forbid2) {
    int byte;

    for (byte = 0; byte <= 0xFF; byte++) {
        xmlDocPtr doc;
        int rejected;

        lastError = 0;
        xmlCtxtReset(ctxt);

        data[0] = (char) byte;

        doc = xmlReadMemory(document, len, "test", NULL, 0);

        /* a clean rejection is an error report AND no resulting tree */
        rejected = (lastError != 0) && (doc == NULL);

        if ((byte == forbid1) || (byte == forbid2)) {
            /* caller says this byte cannot appear at this position */
            if (!rejected)
                fprintf(stderr,
                    "Failed to detect invalid char for Byte 0x%02X: %c\n",
                        byte, byte);
        } else if ((byte == '<') || (byte == '&')) {
            /* markup delimiters are illegal as raw content */
            if (!rejected)
                fprintf(stderr,
                    "Failed to detect illegal char %c for Byte 0x%02X\n",
                        byte, byte);
        } else if ((byte >= 0x80) ||
                   ((byte < 0x20) && (byte != 0x9) && (byte != 0xA) &&
                    (byte != 0xD))) {
            /*
             * control chars other than TAB, LF, CR and lone high bytes:
             * only complain when a tree came back without the
             * invalid char error being raised
             */
            if ((doc != NULL) && (lastError != XML_ERR_INVALID_CHAR))
                fprintf(stderr,
                    "Failed to detect invalid char for Byte 0x%02X\n", byte);
        } else if (doc == NULL) {
            /* everything else is a valid character and must parse */
            fprintf(stderr,
                "Failed to parse valid char for Byte 0x%02X : %c\n",
                    byte, byte);
        }

        if (doc != NULL)
            xmlFreeDoc(doc);
    }
}
65
/**
 * testDocumentRangeByte2:
 * @ctxt:  parser context, reset before every parse
 * @document:  start of the buffer handed to the parser
 * @len:  number of bytes of @document to parse
 * @data:  location whose first two bytes receive each candidate pair
 *         (the callers pass a pointer inside @document)
 *
 * Writes every pair (0x80..0xFF, 0x00..0xFF) into data[0], data[1],
 * parses the document and reports on stderr each pair whose outcome is
 * not the expected one.
 */
static void testDocumentRangeByte2(xmlParserCtxtPtr ctxt, char *document,
                  int len, char *data) {
    int lead, trail;

    for (lead = 0x80; lead <= 0xFF; lead++) {
        for (trail = 0; trail <= 0xFF; trail++) {
            xmlDocPtr doc;
            int rejected;
            int badLead, badTrail, overlong;

            lastError = 0;
            xmlCtxtReset(ctxt);

            data[0] = (char) lead;
            data[1] = (char) trail;

            doc = xmlReadMemory(document, len, "test", NULL, 0);

            /* a clean rejection is an error report AND no resulting tree */
            rejected = (lastError != 0) && (doc == NULL);

            /* if first bit of first char is set, then second bit must too */
            badLead = (lead & 0x80) && ((lead & 0x40) == 0);

            /*
             * if first bit of first char is set, then second char first
             * bits must be 10
             */
            badTrail = (lead & 0x80) && ((trail & 0xC0) != 0x80);

            /*
             * if using a 2 byte encoding then the value must be greater
             * than 0x80, i.e. one of bits 5 to 1 of the first byte must
             * be set
             */
            overlong = (lead & 0x80) && ((lead & 0x1E) == 0);

            if (badLead || badTrail || overlong) {
                if (!rejected)
                    fprintf(stderr,
                "Failed to detect invalid char for Bytes 0x%02X 0x%02X\n",
                            lead, trail);
            } else if ((lead & 0xE0) == 0xE0) {
                /*
                 * if third bit of first char is set, then the sequence
                 * would need at least 3 bytes, but we give only 2 !
                 */
                if (!rejected)
                    fprintf(stderr,
            "Failed to detect invalid char for Bytes 0x%02X 0x%02X 0x00\n",
                            lead, trail);
            } else if ((lastError != 0) || (doc == NULL)) {
                /* We should see no error in remaining cases */
                fprintf(stderr,
                    "Failed to parse document for Bytes 0x%02X 0x%02X\n",
                        lead, trail);
            }

            if (doc != NULL)
                xmlFreeDoc(doc);
        }
    }
}
134
135 /**
136 * testDocumentRanges:
137 *
138 * Test the correct UTF8 character parsing in context of XML documents
139 * Those are in-context injection tests checking the parser behaviour on
140 * edge case values at different point in content, beginning and end of
141 * CDATA in text or in attribute values.
142 */
143
testDocumentRanges(void)144 static void testDocumentRanges(void) {
145 xmlParserCtxtPtr ctxt;
146 char *data;
147
148 /*
149 * Set up a parsing context using the first document as
150 * the current input source.
151 */
152 ctxt = xmlNewParserCtxt();
153 if (ctxt == NULL) {
154 fprintf(stderr, "Failed to allocate parser context\n");
155 return;
156 }
157
158 printf("testing 1 byte char in document: 1");
159 fflush(stdout);
160 data = &document1[5];
161 data[0] = ' ';
162 data[1] = ' ';
163 data[2] = ' ';
164 data[3] = ' ';
165 /* test 1 byte injection at beginning of area */
166 testDocumentRangeByte1(ctxt, &document1[0], strlen(document1),
167 data, -1, -1);
168 printf(" 2");
169 fflush(stdout);
170 data[0] = ' ';
171 data[1] = ' ';
172 data[2] = ' ';
173 data[3] = ' ';
174 /* test 1 byte injection at end of area */
175 testDocumentRangeByte1(ctxt, &document1[0], strlen(document1),
176 data + 3, -1, -1);
177
178 printf(" 3");
179 fflush(stdout);
180 data = &document2[10];
181 data[0] = ' ';
182 data[1] = ' ';
183 data[2] = ' ';
184 data[3] = ' ';
185 /* test 1 byte injection at beginning of area */
186 testDocumentRangeByte1(ctxt, &document2[0], strlen(document2),
187 data, '\'', -1);
188 printf(" 4");
189 fflush(stdout);
190 data[0] = ' ';
191 data[1] = ' ';
192 data[2] = ' ';
193 data[3] = ' ';
194 /* test 1 byte injection at end of area */
195 testDocumentRangeByte1(ctxt, &document2[0], strlen(document2),
196 data + 3, '\'', -1);
197 printf(" done\n");
198
199 printf("testing 2 byte char in document: 1");
200 fflush(stdout);
201 data = &document1[5];
202 data[0] = ' ';
203 data[1] = ' ';
204 data[2] = ' ';
205 data[3] = ' ';
206 /* test 2 byte injection at beginning of area */
207 testDocumentRangeByte2(ctxt, &document1[0], strlen(document1),
208 data);
209 printf(" 2");
210 fflush(stdout);
211 data[0] = ' ';
212 data[1] = ' ';
213 data[2] = ' ';
214 data[3] = ' ';
215 /* test 2 byte injection at end of area */
216 testDocumentRangeByte2(ctxt, &document1[0], strlen(document1),
217 data + 2);
218
219 printf(" 3");
220 fflush(stdout);
221 data = &document2[10];
222 data[0] = ' ';
223 data[1] = ' ';
224 data[2] = ' ';
225 data[3] = ' ';
226 /* test 2 byte injection at beginning of area */
227 testDocumentRangeByte2(ctxt, &document2[0], strlen(document2),
228 data);
229 printf(" 4");
230 fflush(stdout);
231 data[0] = ' ';
232 data[1] = ' ';
233 data[2] = ' ';
234 data[3] = ' ';
235 /* test 2 byte injection at end of area */
236 testDocumentRangeByte2(ctxt, &document2[0], strlen(document2),
237 data + 2);
238 printf(" done\n");
239
240 xmlFreeParserCtxt(ctxt);
241 }
242
/**
 * testCharRangeByte1:
 * @ctxt:  parser context queried with xmlCurrentChar(); its current input
 *         is expected to read from @data (set up by the caller)
 * @data:  buffer of at least 4 bytes receiving the byte under test
 *
 * Stores every value 0x00..0xFF in data[0], decodes the current char and
 * reports on stderr each byte whose result is not the expected one.
 */
static void testCharRangeByte1(xmlParserCtxtPtr ctxt, char *data) {
    int byte;
    int width, cur;

    /* only data[0] varies, the bytes after it stay zero */
    data[1] = data[2] = data[3] = 0;

    for (byte = 0; byte <= 0xFF; byte++) {
        data[0] = (char) byte;
        ctxt->charset = XML_CHAR_ENCODING_UTF8;

        lastError = 0;
        cur = xmlCurrentChar(ctxt, &width);

        if ((byte == 0) || (byte >= 0x80)) {
            /* NUL and lone high bytes must raise the invalid char error */
            if (lastError != XML_ERR_INVALID_CHAR)
                fprintf(stderr,
                    "Failed to detect invalid char for Byte 0x%02X\n", byte);
            continue;
        }

        if (byte == 0xD) {
            /* the test expects 0xD to come back as 0xA */
            if ((cur != 0xA) || (width != 1))
                fprintf(stderr, "Failed to convert char for Byte 0x%02X\n",
                        byte);
            continue;
        }

        /* any other byte decodes to itself with a length of one */
        if ((cur != byte) || (width != 1))
            fprintf(stderr, "Failed to parse char for Byte 0x%02X\n", byte);
    }
}
269
/**
 * testCharRangeByte2:
 * @ctxt:  parser context queried with xmlCurrentChar(); its current input
 *         is expected to read from @data (set up by the caller)
 * @data:  buffer of at least 4 bytes receiving the pair under test
 *
 * Stores every pair (0x80..0xFF, 0x00..0xFF) in data[0], data[1], decodes
 * the current char and reports on stderr each pair whose result is not
 * the expected one.
 */
static void testCharRangeByte2(xmlParserCtxtPtr ctxt, char *data) {
    int lead, trail;
    int width, cur;
    int expected;

    /* the bytes after the pair stay zero */
    data[2] = 0;
    data[3] = 0;

    for (lead = 0x80; lead <= 0xFF; lead++) {
        for (trail = 0; trail <= 0xFF; trail++) {
            data[0] = (char) lead;
            data[1] = (char) trail;
            ctxt->charset = XML_CHAR_ENCODING_UTF8;

            lastError = 0;
            cur = xmlCurrentChar(ctxt, &width);

            /* code point carried by a well formed 2 byte sequence */
            expected = (trail & 0x3F) + ((lead & 0x1F) << 6);

            if ((lead & 0x80) && ((lead & 0x40) == 0)) {
                /*
                 * if first bit of first char is set, then second bit
                 * must too
                 */
                if (lastError != XML_ERR_INVALID_CHAR)
                    fprintf(stderr,
                "Failed to detect invalid char for Bytes 0x%02X 0x%02X\n",
                            lead, trail);
            } else if ((lead & 0x80) &&
                       (((trail & 0xC0) != 0x80) || ((lead & 0x1E) == 0))) {
                /*
                 * either the second char does not start with bits 10,
                 * or the value is below 0x80 and would not need a
                 * 2 byte encoding (none of bits 5 to 1 of the first
                 * byte set)
                 */
                if (lastError != XML_ERR_INVALID_CHAR)
                    fprintf(stderr,
            "Failed to detect invalid char for Bytes 0x%02X 0x%02X: %d\n",
                            lead, trail, cur);
            } else if ((lead & 0xE0) == 0xE0) {
                /*
                 * if third bit of first char is set, then the sequence
                 * would need at least 3 bytes, but we give only 2 !
                 */
                if (lastError != XML_ERR_INVALID_CHAR)
                    fprintf(stderr,
            "Failed to detect invalid char for Bytes 0x%02X 0x%02X 0x00\n",
                            lead, trail);
            } else if ((lastError != 0) || (width != 2)) {
                /* We should see no error in remaining cases */
                fprintf(stderr,
                    "Failed to parse char for Bytes 0x%02X 0x%02X\n",
                        lead, trail);
            } else if (cur != expected) {
                /* Finally check the value is right */
                fprintf(stderr,
        "Failed to parse char for Bytes 0x%02X 0x%02X: expect %d got %d\n",
                        lead, trail, expected, cur);
            }
        }
    }
}
345
/**
 * testCharRangeByte3:
 * @ctxt:  parser context queried with xmlCurrentChar(); its current input
 *         is expected to read from @data (set up by the caller)
 * @data:  buffer of at least 4 bytes receiving the sequence under test
 *
 * Exercises 3 byte sequences: every first byte 0xE0..0xFF, every second
 * byte 0x00..0xFF and a sample of six third bytes. Each mismatch between
 * the decoder result and the expected outcome is reported on stderr.
 */
static void testCharRangeByte3(xmlParserCtxtPtr ctxt, char *data) {
    int i, j, k, K;
    int len, c;
    /* sampled values for the last byte; a full sweep would be too slow */
    unsigned char lows[6] = {0, 0x80, 0x81, 0xC1, 0xFF, 0xBF};
    int value;

    data[3] = 0;
    for (i = 0xE0;i <= 0xFF;i++) {
        for (j = 0;j <= 0xFF;j++) {
            for (k = 0;k < 6;k++) {
                data[0] = (char) i;
                data[1] = (char) j;
                K = lows[k];
                data[2] = (char) K;
                /* code point carried by a well formed 3 byte sequence */
                value = (K & 0x3F) + ((j & 0x3F) << 6) + ((i & 0xF) << 12);
                ctxt->charset = XML_CHAR_ENCODING_UTF8;

                lastError = 0;
                c = xmlCurrentChar(ctxt, &len);

                /*
                 * if fourth bit of first char is set, then the sequence
                 * would need at least 4 bytes, but we give only 3 !
                 */
                if ((i & 0xF0) == 0xF0) {
                    /*
                     * data[] holds plain char: convert to unsigned char
                     * before printing so the value is never sign-extended
                     */
                    if (lastError != XML_ERR_INVALID_CHAR)
                        fprintf(stderr,
        "Failed to detect invalid char for Bytes 0x%02X 0x%02X 0x%02X 0x%02X\n",
                                i, j, K, (unsigned char) data[3]);
                }

                /*
                 * The second and the third bytes must start with 10
                 */
                else if (((j & 0xC0) != 0x80) || ((K & 0xC0) != 0x80)) {
                    if (lastError != XML_ERR_INVALID_CHAR)
                        fprintf(stderr,
        "Failed to detect invalid char for Bytes 0x%02X 0x%02X 0x%02X\n",
                                i, j, K);
                }

                /*
                 * if using a 3 byte encoding then the value must be greater
                 * than 0x800, i.e. one of bits 4 to 0 of i must be set or
                 * the 6th bit of data[1] must be set
                 */
                else if (((i & 0xF) == 0) && ((j & 0x20) == 0)) {
                    if (lastError != XML_ERR_INVALID_CHAR)
                        fprintf(stderr,
        "Failed to detect invalid char for Bytes 0x%02X 0x%02X 0x%02X\n",
                                i, j, K);
                }

                /*
                 * There are values in that range that are not allowed
                 * in XML-1.0
                 */
                else if (((value > 0xD7FF) && (value <0xE000)) ||
                         ((value > 0xFFFD) && (value <0x10000))) {
                    if (lastError != XML_ERR_INVALID_CHAR)
                        fprintf(stderr,
    "Failed to detect invalid char 0x%04X for Bytes 0x%02X 0x%02X 0x%02X\n",
                                value, i, j, K);
                }

                /*
                 * We should see no error in remaining cases
                 */
                else if ((lastError != 0) || (len != 3)) {
                    fprintf(stderr,
                        "Failed to parse char for Bytes 0x%02X 0x%02X 0x%02X\n",
                            i, j, K);
                }

                /*
                 * Finally check the value is right
                 */
                else if (c != value) {
                    /* print K, not data[2], which may be a negative char */
                    fprintf(stderr,
    "Failed to parse char for Bytes 0x%02X 0x%02X 0x%02X: expect %d got %d\n",
                            i, j, K, value, c);
                }
            }
        }
    }
}
431
/**
 * testCharRangeByte4:
 * @ctxt:  parser context queried with xmlCurrentChar(); its current input
 *         is expected to read from @data (set up by the caller)
 * @data:  buffer of at least 5 bytes receiving the sequence under test
 *
 * Exercises 4 byte sequences: every first byte 0xF0..0xFF, every second
 * byte 0x00..0xFF and a sample of six values for each of the third and
 * fourth bytes. Each mismatch between the decoder result and the expected
 * outcome is reported on stderr.
 */
static void testCharRangeByte4(xmlParserCtxtPtr ctxt, char *data) {
    int i, j, k, K, l, L;
    int len, c;
    /* sampled values for the last two bytes; a full sweep would be too slow */
    unsigned char lows[6] = {0, 0x80, 0x81, 0xC1, 0xFF, 0xBF};
    int value;

    data[4] = 0;
    for (i = 0xF0;i <= 0xFF;i++) {
        for (j = 0;j <= 0xFF;j++) {
            for (k = 0;k < 6;k++) {
                for (l = 0;l < 6;l++) {
                    data[0] = (char) i;
                    data[1] = (char) j;
                    K = lows[k];
                    data[2] = (char) K;
                    L = lows[l];
                    data[3] = (char) L;
                    /* code point carried by a well formed 4 byte sequence */
                    value = (L & 0x3F) + ((K & 0x3F) << 6) +
                            ((j & 0x3F) << 12) + ((i & 0x7) << 18);
                    ctxt->charset = XML_CHAR_ENCODING_UTF8;

                    lastError = 0;
                    c = xmlCurrentChar(ctxt, &len);

                    /*
                     * if fifth bit of first char is set, then the sequence
                     * would need at least 5 bytes, but we give only 4 !
                     */
                    if ((i & 0xF8) == 0xF8) {
                        /* print L, not data[3], which may be a negative char */
                        if (lastError != XML_ERR_INVALID_CHAR)
                            fprintf(stderr,
        "Failed to detect invalid char for Bytes 0x%02X 0x%02X 0x%02X 0x%02X\n",
                                    i, j, K, L);
                    }

                    /*
                     * The second, third and fourth bytes must start with 10
                     */
                    else if (((j & 0xC0) != 0x80) || ((K & 0xC0) != 0x80) ||
                             ((L & 0xC0) != 0x80)) {
                        if (lastError != XML_ERR_INVALID_CHAR)
                            fprintf(stderr,
        "Failed to detect invalid char for Bytes 0x%02X 0x%02X 0x%02X 0x%02X\n",
                                    i, j, K, L);
                    }

                    /*
                     * if using a 4 byte encoding then the value must be
                     * greater than 0x10000, i.e. one of bits 3 to 0 of i
                     * must be set or the 6th or 5th bit of j must be set
                     */
                    else if (((i & 0x7) == 0) && ((j & 0x30) == 0)) {
                        if (lastError != XML_ERR_INVALID_CHAR)
                            fprintf(stderr,
        "Failed to detect invalid char for Bytes 0x%02X 0x%02X 0x%02X 0x%02X\n",
                                    i, j, K, L);
                    }

                    /*
                     * There are values in that range that are not allowed
                     * in XML-1.0
                     */
                    else if (((value > 0xD7FF) && (value <0xE000)) ||
                             ((value > 0xFFFD) && (value <0x10000)) ||
                             (value > 0x10FFFF)) {
                        if (lastError != XML_ERR_INVALID_CHAR)
                            fprintf(stderr,
"Failed to detect invalid char 0x%04X for Bytes 0x%02X 0x%02X 0x%02X 0x%02X\n",
                                    value, i, j, K, L);
                    }

                    /*
                     * We should see no error in remaining cases
                     */
                    else if ((lastError != 0) || (len != 4)) {
                        /* report all four bytes of the failing sequence */
                        fprintf(stderr,
        "Failed to parse char for Bytes 0x%02X 0x%02X 0x%02X 0x%02X\n",
                                i, j, K, L);
                    }

                    /*
                     * Finally check the value is right
                     */
                    else if (c != value) {
                        /* report all four bytes, from the int copies */
                        fprintf(stderr,
"Failed to parse char for Bytes 0x%02X 0x%02X 0x%02X 0x%02X: expect %d got %d\n",
                                i, j, K, L, value, c);
                    }
                }
            }
        }
    }
}
524
525 /**
526 * testCharRanges:
527 *
528 * Test the correct UTF8 character parsing in isolation i.e.
529 * not when parsing a full document, this is less expensive and we can
530 * cover the full range of UTF-8 chars accepted by XML-1.0
531 */
532
testCharRanges(void)533 static void testCharRanges(void) {
534 char data[5];
535 xmlParserCtxtPtr ctxt;
536 xmlParserInputBufferPtr buf;
537 xmlParserInputPtr input;
538
539 memset(data, 0, 5);
540
541 /*
542 * Set up a parsing context using the above data buffer as
543 * the current input source.
544 */
545 ctxt = xmlNewParserCtxt();
546 if (ctxt == NULL) {
547 fprintf(stderr, "Failed to allocate parser context\n");
548 return;
549 }
550 buf = xmlParserInputBufferCreateStatic(data, sizeof(data),
551 XML_CHAR_ENCODING_NONE);
552 if (buf == NULL) {
553 fprintf(stderr, "Failed to allocate input buffer\n");
554 goto error;
555 }
556 input = xmlNewInputStream(ctxt);
557 if (input == NULL) {
558 xmlFreeParserInputBuffer(buf);
559 goto error;
560 }
561 input->filename = NULL;
562 input->buf = buf;
563 input->cur =
564 input->base = xmlBufContent(input->buf->buffer);
565 input->end = input->base + 4;
566 inputPush(ctxt, input);
567
568 printf("testing char range: 1");
569 fflush(stdout);
570 testCharRangeByte1(ctxt, data);
571 printf(" 2");
572 fflush(stdout);
573 testCharRangeByte2(ctxt, data);
574 printf(" 3");
575 fflush(stdout);
576 testCharRangeByte3(ctxt, data);
577 printf(" 4");
578 fflush(stdout);
579 testCharRangeByte4(ctxt, data);
580 printf(" done\n");
581 fflush(stdout);
582
583 error:
584 xmlFreeParserCtxt(ctxt);
585 }
586
main(void)587 int main(void) {
588
589 /*
590 * this initialize the library and check potential ABI mismatches
591 * between the version it was compiled for and the actual shared
592 * library used.
593 */
594 LIBXML_TEST_VERSION
595
596 /*
597 * Catch errors separately
598 */
599
600 xmlSetStructuredErrorFunc(NULL, errorHandler);
601
602 /*
603 * Run the tests
604 */
605 testCharRanges();
606 testDocumentRanges();
607
608 /*
609 * Cleanup function for the XML library.
610 */
611 xmlCleanupParser();
612 /*
613 * this is to debug memory for regression tests
614 */
615 xmlMemoryDump();
616 return(0);
617 }
618