From 6a5510bc6b7efe743356296724e0b38300f05379 Mon Sep 17 00:00:00 2001
From: Sebastian Pipping <sebastian@pipping.org>
Date: Tue, 8 Feb 2022 04:06:21 +0100
Subject: [PATCH] tests: Cover missing validation of encoding
 (CVE-2022-25235)

---
Notes (not part of the commit message; text between "---" and the first
"diff --git" line is ignored when the patch is applied):

* test_utf8_in_start_tags builds "<NAME><!--" and "<aNAME><!--" for each
  table entry, so every byte sequence is tried both as the first character
  of an element name and as a following character.  A parse must either
  succeed or fail with XML_ERROR_INVALID_TOKEN, as predicted by the entry's
  goodNameStart / goodName flag; every mismatch is printed to stderr and
  fail() is called once at the end if any mismatch was counted.
* test_bad_doctype_utf8 hands expect_failure() a document whose DOCTYPE
  name begins with the malformed 2-byte sequence \xDB\x25, together with
  XML_ERROR_INVALID_TOKEN.
* Both tests are registered on tc_basic in make_suite().

 tests/runtests.c | 109 +++++++++++++++++++++++++++++++++++++++++++++++++
 1 file changed, 109 insertions(+)

diff --git a/tests/runtests.c b/tests/runtests.c
index bc5344b..9b155b8 100644
--- a/tests/runtests.c
+++ b/tests/runtests.c
@@ -5998,6 +5998,105 @@ START_TEST(test_utf8_in_cdata_section_2) {
 }
 END_TEST
 
+START_TEST(test_utf8_in_start_tags) {
+  struct test_case {
+    bool goodName;
+    bool goodNameStart;
+    const char *tagName;
+  };
+
+  // The idea with the tests below is this:
+  // We want to cover 1-, 2- and 3-byte sequences, 4-byte sequences
+  // go to isNever and are hence not a concern.
+  //
+  // We start with a character that is a valid name character
+  // (or even name-start character, see XML 1.0r4 spec) and then we flip
+  // single bits at places where (1) the result leaves the UTF-8 encoding space
+  // and (2) we stay in the same n-byte sequence family.
+  //
+  // The flipped bits are highlighted in angle brackets in comments,
+  // e.g. "[<1>011 1001]" means we had [0011 1001] but we now flipped
+  // the most significant bit to 1 to leave UTF-8 encoding space.
+  struct test_case cases[] = {
+      // 1-byte UTF-8: [0xxx xxxx]
+      {true, true, "\x3A"},   // [0011 1010] = ASCII colon ':'
+      {false, false, "\xBA"}, // [<1>011 1010]
+      {true, false, "\x39"},  // [0011 1001] = ASCII nine '9'
+      {false, false, "\xB9"}, // [<1>011 1001]
+
+      // 2-byte UTF-8: [110x xxxx] [10xx xxxx]
+      {true, true, "\xDB\xA5"},   // [1101 1011] [1010 0101] =
+                                  // Arabic small waw U+06E5
+      {false, false, "\x9B\xA5"}, // [1<0>01 1011] [1010 0101]
+      {false, false, "\xDB\x25"}, // [1101 1011] [<0>010 0101]
+      {false, false, "\xDB\xE5"}, // [1101 1011] [1<1>10 0101]
+      {true, false, "\xCC\x81"},  // [1100 1100] [1000 0001] =
+                                  // combining char U+0301
+      {false, false, "\x8C\x81"}, // [1<0>00 1100] [1000 0001]
+      {false, false, "\xCC\x01"}, // [1100 1100] [<0>000 0001]
+      {false, false, "\xCC\xC1"}, // [1100 1100] [1<1>00 0001]
+
+      // 3-byte UTF-8: [1110 xxxx] [10xx xxxx] [10xxxxxx]
+      {true, true, "\xE0\xA4\x85"},   // [1110 0000] [1010 0100] [1000 0101] =
+                                      // Devanagari Letter A U+0905
+      {false, false, "\xA0\xA4\x85"}, // [1<0>10 0000] [1010 0100] [1000 0101]
+      {false, false, "\xE0\x24\x85"}, // [1110 0000] [<0>010 0100] [1000 0101]
+      {false, false, "\xE0\xE4\x85"}, // [1110 0000] [1<1>10 0100] [1000 0101]
+      {false, false, "\xE0\xA4\x05"}, // [1110 0000] [1010 0100] [<0>000 0101]
+      {false, false, "\xE0\xA4\xC5"}, // [1110 0000] [1010 0100] [1<1>00 0101]
+      {true, false, "\xE0\xA4\x81"},  // [1110 0000] [1010 0100] [1000 0001] =
+                                      // combining char U+0901
+      {false, false, "\xA0\xA4\x81"}, // [1<0>10 0000] [1010 0100] [1000 0001]
+      {false, false, "\xE0\x24\x81"}, // [1110 0000] [<0>010 0100] [1000 0001]
+      {false, false, "\xE0\xE4\x81"}, // [1110 0000] [1<1>10 0100] [1000 0001]
+      {false, false, "\xE0\xA4\x01"}, // [1110 0000] [1010 0100] [<0>000 0001]
+      {false, false, "\xE0\xA4\xC1"}, // [1110 0000] [1010 0100] [1<1>00 0001]
+  };
+  const bool atNameStart[] = {true, false};
+
+  size_t i = 0;
+  char doc[1024];
+  size_t failCount = 0;
+
+  for (; i < sizeof(cases) / sizeof(cases[0]); i++) {
+    size_t j = 0;
+    for (; j < sizeof(atNameStart) / sizeof(atNameStart[0]); j++) {
+      const bool expectedSuccess
+          = atNameStart[j] ? cases[i].goodNameStart : cases[i].goodName;
+      sprintf(doc, "<%s%s><!--", atNameStart[j] ? "" : "a", cases[i].tagName);
+      XML_Parser parser = XML_ParserCreate(NULL);
+
+      const enum XML_Status status
+          = XML_Parse(parser, doc, (int)strlen(doc), /*isFinal=*/XML_FALSE);
+
+      bool success = true;
+      if ((status == XML_STATUS_OK) != expectedSuccess) {
+        success = false;
+      }
+      if ((status == XML_STATUS_ERROR)
+          && (XML_GetErrorCode(parser) != XML_ERROR_INVALID_TOKEN)) {
+        success = false;
+      }
+
+      if (! success) {
+        fprintf(
+            stderr,
+            "FAIL case %2u (%sat name start, %u-byte sequence, error code %d)\n",
+            (unsigned)i + 1u, atNameStart[j] ? "    " : "not ",
+            (unsigned)strlen(cases[i].tagName), XML_GetErrorCode(parser));
+        failCount++;
+      }
+
+      XML_ParserFree(parser);
+    }
+  }
+
+  if (failCount > 0) {
+    fail("UTF-8 regression detected");
+  }
+}
+END_TEST
+
 /* Test trailing spaces in elements are accepted */
 static void XMLCALL
 record_element_end_handler(void *userData, const XML_Char *name) {
@@ -6175,6 +6274,14 @@ START_TEST(test_bad_doctype) {
 }
 END_TEST
 
+START_TEST(test_bad_doctype_utf8) {
+  const char *text = "<!DOCTYPE \xDB\x25"
+                     "doc><doc/>"; // [1101 1011] [<0>010 0101]
+  expect_failure(text, XML_ERROR_INVALID_TOKEN,
+                 "Invalid UTF-8 in DOCTYPE not faulted");
+}
+END_TEST
+
 START_TEST(test_bad_doctype_utf16) {
   const char text[] =
       /* <!DOCTYPE doc [ \x06f2 ]><doc/>
@@ -11870,6 +11977,7 @@ make_suite(void) {
   tcase_add_test(tc_basic, test_ext_entity_utf8_non_bom);
   tcase_add_test(tc_basic, test_utf8_in_cdata_section);
   tcase_add_test(tc_basic, test_utf8_in_cdata_section_2);
+  tcase_add_test(tc_basic, test_utf8_in_start_tags);
   tcase_add_test(tc_basic, test_trailing_spaces_in_elements);
   tcase_add_test(tc_basic, test_utf16_attribute);
   tcase_add_test(tc_basic, test_utf16_second_attr);
@@ -11878,6 +11986,7 @@ make_suite(void) {
   tcase_add_test(tc_basic, test_bad_attr_desc_keyword);
   tcase_add_test(tc_basic, test_bad_attr_desc_keyword_utf16);
   tcase_add_test(tc_basic, test_bad_doctype);
+  tcase_add_test(tc_basic, test_bad_doctype_utf8);
   tcase_add_test(tc_basic, test_bad_doctype_utf16);
   tcase_add_test(tc_basic, test_bad_doctype_plus);
   tcase_add_test(tc_basic, test_bad_doctype_star);
-- 
1.8.3.1
