• Home
  • Line#
  • Scopes#
  • Navigate#
  • Raw
  • Download
1From 6a5510bc6b7efe743356296724e0b38300f05379 Mon Sep 17 00:00:00 2001
2From: Sebastian Pipping <sebastian@pipping.org>
3Date: Tue, 8 Feb 2022 04:06:21 +0100
4Subject: [PATCH] tests: Cover missing validation of encoding
5 (CVE-2022-25235)
6
7---
8 tests/runtests.c | 109 +++++++++++++++++++++++++++++++++++++++++++++++++
9 1 file changed, 109 insertions(+)
10
11diff --git a/tests/runtests.c b/tests/runtests.c
12index bc5344b..9b155b8 100644
13--- a/tests/runtests.c
14+++ b/tests/runtests.c
15@@ -5998,6 +5998,105 @@ START_TEST(test_utf8_in_cdata_section_2) {
16 }
17 END_TEST
18
19+START_TEST(test_utf8_in_start_tags) {
20+  struct test_case {
21+    bool goodName;       // expect success when placed after the first name char
22+    bool goodNameStart;  // expect success when placed as the first name char
23+    const char *tagName; // byte sequence under test, spliced into the tag name
24+  };
25+  // Tag names: valid UTF-8 must parse, malformed UTF-8 must be an invalid token.
26+  // The idea with the tests below is this:
27+  // We want to cover 1-, 2- and 3-byte sequences; 4-byte sequences
28+  // go to isNever and are hence not a concern.
29+  //
30+  // We start with a character that is a valid name character
31+  // (or even name-start character, see XML 1.0r4 spec) and then we flip
32+  // single bits at places where (1) the result leaves the UTF-8 encoding space
33+  // and (2) we stay in the same n-byte sequence family.
34+  //
35+  // The flipped bits are highlighted in angle brackets in comments,
36+  // e.g. "[<1>011 1001]" means we had [0011 1001] but we now flipped
37+  // the most significant bit to 1 to leave UTF-8 encoding space.
38+  struct test_case cases[] = {
39+      // 1-byte UTF-8: [0xxx xxxx]
40+      {true, true, "\x3A"},   // [0011 1010] = ASCII colon ':'
41+      {false, false, "\xBA"}, // [<1>011 1010]
42+      {true, false, "\x39"},  // [0011 1001] = ASCII nine '9'
43+      {false, false, "\xB9"}, // [<1>011 1001]
44+
45+      // 2-byte UTF-8: [110x xxxx] [10xx xxxx]
46+      {true, true, "\xDB\xA5"},   // [1101 1011] [1010 0101] =
47+                                  // Arabic small waw U+06E5
48+      {false, false, "\x9B\xA5"}, // [1<0>01 1011] [1010 0101]
49+      {false, false, "\xDB\x25"}, // [1101 1011] [<0>010 0101]
50+      {false, false, "\xDB\xE5"}, // [1101 1011] [1<1>10 0101]
51+      {true, false, "\xCC\x81"},  // [1100 1100] [1000 0001] =
52+                                  // combining char U+0301
53+      {false, false, "\x8C\x81"}, // [1<0>00 1100] [1000 0001]
54+      {false, false, "\xCC\x01"}, // [1100 1100] [<0>000 0001]
55+      {false, false, "\xCC\xC1"}, // [1100 1100] [1<1>00 0001]
56+
57+      // 3-byte UTF-8: [1110 xxxx] [10xx xxxx] [10xxxxxx]
58+      {true, true, "\xE0\xA4\x85"},   // [1110 0000] [1010 0100] [1000 0101] =
59+                                      // Devanagari Letter A U+0905
60+      {false, false, "\xA0\xA4\x85"}, // [1<0>10 0000] [1010 0100] [1000 0101]
61+      {false, false, "\xE0\x24\x85"}, // [1110 0000] [<0>010 0100] [1000 0101]
62+      {false, false, "\xE0\xE4\x85"}, // [1110 0000] [1<1>10 0100] [1000 0101]
63+      {false, false, "\xE0\xA4\x05"}, // [1110 0000] [1010 0100] [<0>000 0101]
64+      {false, false, "\xE0\xA4\xC5"}, // [1110 0000] [1010 0100] [1<1>00 0101]
65+      {true, false, "\xE0\xA4\x81"},  // [1110 0000] [1010 0100] [1000 0001] =
66+                                      // combining char U+0901
67+      {false, false, "\xA0\xA4\x81"}, // [1<0>10 0000] [1010 0100] [1000 0001]
68+      {false, false, "\xE0\x24\x81"}, // [1110 0000] [<0>010 0100] [1000 0001]
69+      {false, false, "\xE0\xE4\x81"}, // [1110 0000] [1<1>10 0100] [1000 0001]
70+      {false, false, "\xE0\xA4\x01"}, // [1110 0000] [1010 0100] [<0>000 0001]
71+      {false, false, "\xE0\xA4\xC1"}, // [1110 0000] [1010 0100] [1<1>00 0001]
72+  };
73+  const bool atNameStart[] = {true, false}; // positions each case is tried in
74+
75+  size_t i = 0;
76+  char doc[1024];
77+  size_t failCount = 0;
78+  // Run every case in both positions; count failures so all get reported.
79+  for (; i < sizeof(cases) / sizeof(cases[0]); i++) {
80+    size_t j = 0;
81+    for (; j < sizeof(atNameStart) / sizeof(atNameStart[0]); j++) {
82+      const bool expectedSuccess
83+          = atNameStart[j] ? cases[i].goodNameStart : cases[i].goodName;
84+      snprintf(doc, sizeof(doc), "<%s%s><!--", atNameStart[j] ? "" : "a",
85+               cases[i].tagName);
86+      XML_Parser parser = XML_ParserCreate(NULL);
87+      const enum XML_Status status
88+          = XML_Parse(parser, doc, (int)strlen(doc), /*isFinal=*/XML_FALSE);
89+      // Pass = expected outcome and, if an error, XML_ERROR_INVALID_TOKEN.
90+      bool success = true;
91+      if ((status == XML_STATUS_OK) != expectedSuccess) {
92+        success = false;
93+      }
94+      if ((status == XML_STATUS_ERROR)
95+          && (XML_GetErrorCode(parser) != XML_ERROR_INVALID_TOKEN)) {
96+        success = false;
97+      }
98+      // Report the mismatch now; the test itself fails after the loops.
99+      if (! success) {
100+        fprintf(
101+            stderr,
102+            "FAIL case %2u (%sat name start, %u-byte sequence, error code %d)\n",
103+            (unsigned)i + 1u, atNameStart[j] ? "    " : "not ",
104+            (unsigned)strlen(cases[i].tagName), (int)XML_GetErrorCode(parser));
105+        failCount++;
106+      }
107+
108+      XML_ParserFree(parser);
109+    }
110+  }
111+
112+  if (failCount > 0) {
113+    fail("UTF-8 regression detected");
114+  }
115+}
116+END_TEST
117+
118 /* Test trailing spaces in elements are accepted */
119 static void XMLCALL
120 record_element_end_handler(void *userData, const XML_Char *name) {
121@@ -6175,6 +6274,14 @@ START_TEST(test_bad_doctype) {
122 }
123 END_TEST
124
125+START_TEST(test_bad_doctype_utf8) { // malformed UTF-8 after "<!DOCTYPE " must fail
126+  const char *text = "<!DOCTYPE \xDB\x25" // split so \x25 does not absorb the 'd'
127+                     "doc><doc/>"; // [1101 1011] [<0>010 0101]: 2nd byte's top bit cleared
128+  expect_failure(text, XML_ERROR_INVALID_TOKEN,
129+                 "Invalid UTF-8 in DOCTYPE not faulted");
130+}
131+END_TEST
132+
133 START_TEST(test_bad_doctype_utf16) {
134   const char text[] =
135       /* <!DOCTYPE doc [ \x06f2 ]><doc/>
136@@ -11870,6 +11977,7 @@ make_suite(void) {
137   tcase_add_test(tc_basic, test_ext_entity_utf8_non_bom);
138   tcase_add_test(tc_basic, test_utf8_in_cdata_section);
139   tcase_add_test(tc_basic, test_utf8_in_cdata_section_2);
140+  tcase_add_test(tc_basic, test_utf8_in_start_tags);
141   tcase_add_test(tc_basic, test_trailing_spaces_in_elements);
142   tcase_add_test(tc_basic, test_utf16_attribute);
143   tcase_add_test(tc_basic, test_utf16_second_attr);
144@@ -11878,6 +11986,7 @@ make_suite(void) {
145   tcase_add_test(tc_basic, test_bad_attr_desc_keyword);
146   tcase_add_test(tc_basic, test_bad_attr_desc_keyword_utf16);
147   tcase_add_test(tc_basic, test_bad_doctype);
148+  tcase_add_test(tc_basic, test_bad_doctype_utf8);
149   tcase_add_test(tc_basic, test_bad_doctype_utf16);
150   tcase_add_test(tc_basic, test_bad_doctype_plus);
151   tcase_add_test(tc_basic, test_bad_doctype_star);
152--
1531.8.3.1
154
155