1 // Copyright 2016 The PDFium Authors
2 // Use of this source code is governed by a BSD-style license that can be
3 // found in the LICENSE file.
4
5 #include "core/fxcrt/xml/cfx_xmlparser.h"
6
7 #include <memory>
8
9 #include "core/fxcrt/cfx_read_only_span_stream.h"
10 #include "core/fxcrt/fx_codepage.h"
11 #include "core/fxcrt/xml/cfx_xmldocument.h"
12 #include "core/fxcrt/xml/cfx_xmlelement.h"
13 #include "core/fxcrt/xml/cfx_xmlinstruction.h"
14 #include "testing/gtest/include/gtest/gtest.h"
15
16 class CFXXMLParserTest : public testing::Test {
17 public:
Parse(pdfium::span<const char> input)18 std::unique_ptr<CFX_XMLDocument> Parse(pdfium::span<const char> input) {
19 CFX_XMLParser parser(
20 pdfium::MakeRetain<CFX_ReadOnlySpanStream>(pdfium::as_bytes(input)));
21 return parser.Parse();
22 }
23 };
24
// An attribute whose value is not wrapped in quotes makes the parse fail.
TEST_F(CFXXMLParserTest, AttributesMustBeQuoted) {
  static constexpr char kUnquotedAttribute[] =
      "<script display=1>\n"
      "</script>";

  std::unique_ptr<CFX_XMLDocument> document = Parse(kUnquotedAttribute);
  ASSERT_FALSE(document);
}
31
// Quoted attribute values can be read back from the element by name.
TEST_F(CFXXMLParserTest, Attributes) {
  static constexpr char kMarkup[] =
      "<script contentType=\"application/x-javascript\" display=\"1\">\n"
      "</script>";

  std::unique_ptr<CFX_XMLDocument> document = Parse(kMarkup);
  ASSERT_TRUE(document);

  CFX_XMLElement* element =
      document->GetRoot()->GetFirstChildNamed(L"script");
  ASSERT_TRUE(element);

  EXPECT_EQ(L"application/x-javascript",
            element->GetAttribute(L"contentType"));
  EXPECT_EQ(L"1", element->GetAttribute(L"display"));
}
46
// The body of a CDATA section shows up unmodified in the element's text data,
// surrounded by the ordinary text that precedes and follows the section.
TEST_F(CFXXMLParserTest, CData) {
  static constexpr char kMarkup[] =
      "<script>\n"
      " <![CDATA[\n"
      " if (a[1] < 3)\n"
      " app.alert(\"Tclams\");\n"
      " ]]>\n"
      "</script>";

  // Text before the section, the section body, then the text after it.
  static constexpr wchar_t kExpectedText[] =
      L"\n \n"
      L" if (a[1] < 3)\n"
      L" app.alert(\"Tclams\");\n"
      L" \n";

  std::unique_ptr<CFX_XMLDocument> document = Parse(kMarkup);
  ASSERT_TRUE(document);

  CFX_XMLElement* element =
      document->GetRoot()->GetFirstChildNamed(L"script");
  ASSERT_TRUE(element);
  EXPECT_EQ(kExpectedText, element->GetTextData());
}
69
// A "</script>" that appears inside a CDATA section is plain character data:
// it ends up in the text and does not close the enclosing element.
TEST_F(CFXXMLParserTest, CDataWithInnerScript) {
  static constexpr char kMarkup[] =
      "<script>\n"
      " <![CDATA[\n"
      " if (a[1] < 3)\n"
      " app.alert(\"Tclams\");\n"
      " </script>\n"
      " ]]>\n"
      "</script>";

  static constexpr wchar_t kExpectedText[] =
      L"\n \n"
      L" if (a[1] < 3)\n"
      L" app.alert(\"Tclams\");\n"
      L" </script>\n"
      L" \n";

  std::unique_ptr<CFX_XMLDocument> document = Parse(kMarkup);
  ASSERT_TRUE(document);

  CFX_XMLElement* element =
      document->GetRoot()->GetFirstChildNamed(L"script");
  ASSERT_TRUE(element);
  EXPECT_EQ(kExpectedText, element->GetTextData());
}
94
// A bare "<!>" contributes nothing to the text; the whitespace on both sides
// of it is kept.
TEST_F(CFXXMLParserTest, ArrowBangArrow) {
  static constexpr char kMarkup[] =
      "<script>\n"
      " <!>\n"
      "</script>";
  static constexpr wchar_t kExpectedText[] = L"\n \n";

  std::unique_ptr<CFX_XMLDocument> document = Parse(kMarkup);
  ASSERT_TRUE(document);

  CFX_XMLElement* element =
      document->GetRoot()->GetFirstChildNamed(L"script");
  ASSERT_TRUE(element);
  EXPECT_EQ(kExpectedText, element->GetTextData());
}
108
// With "<![>" the document still parses, but only the text in front of the
// marker is reported for the element.
TEST_F(CFXXMLParserTest, ArrowBangBracketArrow) {
  static constexpr char kMarkup[] =
      "<script>\n"
      " <![>\n"
      "</script>";
  static constexpr wchar_t kExpectedText[] = L"\n ";

  std::unique_ptr<CFX_XMLDocument> document = Parse(kMarkup);
  ASSERT_TRUE(document);

  CFX_XMLElement* element =
      document->GetRoot()->GetFirstChildNamed(L"script");
  ASSERT_TRUE(element);
  EXPECT_EQ(kExpectedText, element->GetTextData());
}
122
// A CDATA opener that is missing its second '[' ("<![CDATA>") still yields a
// document; only the text in front of the opener is reported.
TEST_F(CFXXMLParserTest, IncompleteCData) {
  static constexpr char kMarkup[] =
      "<script>\n"
      " <![CDATA>\n"
      "</script>";
  static constexpr wchar_t kExpectedText[] = L"\n ";

  std::unique_ptr<CFX_XMLDocument> document = Parse(kMarkup);
  ASSERT_TRUE(document);

  CFX_XMLElement* element =
      document->GetRoot()->GetFirstChildNamed(L"script");
  ASSERT_TRUE(element);
  EXPECT_EQ(kExpectedText, element->GetTextData());
}
136
// A CDATA section that is opened but never terminated with "]]>" still yields
// a document; only the text in front of the section is reported.
TEST_F(CFXXMLParserTest, UnClosedCData) {
  static constexpr char kMarkup[] =
      "<script>\n"
      " <![CDATA[\n"
      "</script>";
  static constexpr wchar_t kExpectedText[] = L"\n ";

  std::unique_ptr<CFX_XMLDocument> document = Parse(kMarkup);
  ASSERT_TRUE(document);

  CFX_XMLElement* element =
      document->GetRoot()->GetFirstChildNamed(L"script");
  ASSERT_TRUE(element);
  EXPECT_EQ(kExpectedText, element->GetTextData());
}
150
// An empty CDATA section adds no characters; the text on both sides of it is
// kept.
TEST_F(CFXXMLParserTest, EmptyCData) {
  static constexpr char kMarkup[] =
      "<script>\n"
      " <![CDATA[]]>\n"
      "</script>";
  static constexpr wchar_t kExpectedText[] = L"\n \n";

  std::unique_ptr<CFX_XMLDocument> document = Parse(kMarkup);
  ASSERT_TRUE(document);

  CFX_XMLElement* element =
      document->GetRoot()->GetFirstChildNamed(L"script");
  ASSERT_TRUE(element);
  EXPECT_EQ(kExpectedText, element->GetTextData());
}
164
// A well-formed comment is left out of the element's text data; the text
// around it is kept.
TEST_F(CFXXMLParserTest, Comment) {
  static constexpr char kMarkup[] =
      "<script>\n"
      " <!-- A Comment -->\n"
      "</script>";
  static constexpr wchar_t kExpectedText[] = L"\n \n";

  std::unique_ptr<CFX_XMLDocument> document = Parse(kMarkup);
  ASSERT_TRUE(document);

  CFX_XMLElement* element =
      document->GetRoot()->GetFirstChildNamed(L"script");
  ASSERT_TRUE(element);
  EXPECT_EQ(kExpectedText, element->GetTextData());
}
178
// A comment opened with a single dash ("<!-") is also left out of the text
// data, and the text around it is kept.
TEST_F(CFXXMLParserTest, IncorrectCommentStart) {
  static constexpr char kMarkup[] =
      "<script>\n"
      " <!- A Comment -->\n"
      "</script>";
  static constexpr wchar_t kExpectedText[] = L"\n \n";

  std::unique_ptr<CFX_XMLDocument> document = Parse(kMarkup);
  ASSERT_TRUE(document);

  CFX_XMLElement* element =
      document->GetRoot()->GetFirstChildNamed(L"script");
  ASSERT_TRUE(element);
  EXPECT_EQ(kExpectedText, element->GetTextData());
}
192
// An empty comment ("<!---->") adds nothing to the text data; the text around
// it is kept.
TEST_F(CFXXMLParserTest, CommentEmpty) {
  static constexpr char kMarkup[] =
      "<script>\n"
      " <!---->\n"
      "</script>";
  static constexpr wchar_t kExpectedText[] = L"\n \n";

  std::unique_ptr<CFX_XMLDocument> document = Parse(kMarkup);
  ASSERT_TRUE(document);

  CFX_XMLElement* element =
      document->GetRoot()->GetFirstChildNamed(L"script");
  ASSERT_TRUE(element);
  EXPECT_EQ(kExpectedText, element->GetTextData());
}
206
// "<!--->" has too few dashes to both open and close a comment. The document
// still parses, but only the text in front of the marker is reported.
TEST_F(CFXXMLParserTest, CommentThreeDash) {
  static constexpr char kMarkup[] =
      "<script>\n"
      " <!--->\n"
      "</script>";
  static constexpr wchar_t kExpectedText[] = L"\n ";

  std::unique_ptr<CFX_XMLDocument> document = Parse(kMarkup);
  ASSERT_TRUE(document);

  CFX_XMLElement* element =
      document->GetRoot()->GetFirstChildNamed(L"script");
  ASSERT_TRUE(element);
  EXPECT_EQ(kExpectedText, element->GetTextData());
}
220
// "<!-->" opens a comment that is never closed. The document still parses,
// but only the text in front of the marker is reported.
TEST_F(CFXXMLParserTest, CommentTwoDash) {
  static const char input[] =
      "<script>\n"
      " <!-->\n"
      "</script>";

  std::unique_ptr<CFX_XMLDocument> doc = Parse(input);
  ASSERT_TRUE(doc != nullptr);

  CFX_XMLElement* script = doc->GetRoot()->GetFirstChildNamed(L"script");
  // Stop here if the element is missing, so the check below reports a test
  // failure instead of dereferencing a null pointer.
  ASSERT_TRUE(script != nullptr);
  EXPECT_EQ(L"\n ", script->GetTextData());
}
233
// Numeric (decimal and hexadecimal) and predefined named character references
// in element text are replaced by the characters they denote.
TEST_F(CFXXMLParserTest, Entities) {
  // Every item must stay spelled as a reference. Writing the decoded
  // characters instead turns the `&quot;` line into an unterminated string
  // literal and puts a bare '&' directly in front of '<', which is the shape
  // the BadEntity test expects the parser to reject.
  static const char input[] =
      "<script>"
      "&#66;"                     // B
      "&#x54;"                    // T
      "&#x6a;"                    // j
      "&#x00000000000000000048;"  // H
      "&#x0000000000000000AB48;"  // \xab48
      "&#x0000000000000000000;"   // Code point 0: absent from expected text.
      "&amp;"
      "&lt;"
      "&gt;"
      "&apos;"
      "&quot;"
      "&something_else;"  // Not a predefined name: absent from expected text.
      "</script>";

  std::unique_ptr<CFX_XMLDocument> doc = Parse(input);
  ASSERT_TRUE(doc != nullptr);

  CFX_XMLElement* script = doc->GetRoot()->GetFirstChildNamed(L"script");
  ASSERT_TRUE(script != nullptr);
  EXPECT_EQ(L"BTjH\xab48&<>'\"", script->GetTextData());
}
258
// Hexadecimal character references whose value is far too large to be a
// character must not break parsing.
TEST_F(CFXXMLParserTest, EntityOverflowHex) {
  // Both items are written as references so that the parser's numeric
  // reference handling is what gets exercised, rather than two literal
  // replacement characters.
  static const char input[] =
      "<script>"
      "&#xaDBDFFFFF;"
      "&#xafffffffffffffffffffffffffffffffff;"
      "</script>";

  std::unique_ptr<CFX_XMLDocument> doc = Parse(input);
  ASSERT_TRUE(doc != nullptr);

  CFX_XMLElement* script = doc->GetRoot()->GetFirstChildNamed(L"script");
  ASSERT_TRUE(script != nullptr);
  // NOTE(review): assumes each out-of-range reference is replaced by one
  // space, giving two spaces in total; confirm against CFX_XMLParser.
  EXPECT_EQ(L"  ", script->GetTextData());
}
273
// Decimal character references whose value is far too large to be a character
// must not break parsing.
TEST_F(CFXXMLParserTest, EntityOverflowDecimal) {
  // Both items are written as references so that the parser's numeric
  // reference handling is what gets exercised, rather than two literal
  // replacement characters.
  static const char input[] =
      "<script>"
      "&#2914910205;"
      "&#29149102052342342134521341234512351234213452315;"
      "</script>";

  std::unique_ptr<CFX_XMLDocument> doc = Parse(input);
  ASSERT_TRUE(doc != nullptr);

  CFX_XMLElement* script = doc->GetRoot()->GetFirstChildNamed(L"script");
  ASSERT_TRUE(script != nullptr);
  // NOTE(review): assumes each out-of-range reference is replaced by one
  // space, giving two spaces in total; confirm against CFX_XMLParser.
  EXPECT_EQ(L"  ", script->GetTextData());
}
288
// Spot-checks CFX_XMLParser::IsXMLNameChar() on ASCII punctuation, digits and
// letters, and on the code points at the edges of two non-ASCII ranges.
TEST_F(CFXXMLParserTest, IsXMLNameChar) {
  struct NameCharCase {
    wchar_t ch;
    // Second argument of IsXMLNameChar(). Presumably "is this the first
    // character of the name" - confirm against the declaration.
    bool flag;
    bool expected;
  };

  static constexpr NameCharCase kCases[] = {
      // '-', '.' and digits are rejected with flag == true, accepted
      // otherwise.
      {L'-', true, false},
      {L'-', false, true},
      {L'.', true, false},
      {L'.', false, true},
      {L'0', true, false},
      {L'0', false, true},

      // ASCII letters are accepted with either flag value.
      {L'a', true, true},
      {L'a', false, true},
      {L'A', true, true},
      {L'A', false, true},

      // Brackets and parentheses are rejected with either flag value.
      {L'(', false, false},
      {L'(', true, false},
      {L')', false, false},
      {L')', true, false},
      {L'[', false, false},
      {L'[', true, false},
      {L']', false, false},
      {L']', true, false},

      // 0x2070..0x218F is accepted; its immediate neighbours are not.
      {0x2069, true, false},
      {0x2070, true, true},
      {0x2073, true, true},
      {0x218F, true, true},
      {0x2190, true, false},

      // 0xFDF0..0xFFFD is accepted; its immediate neighbours are not.
      {0xFDEF, true, false},
      {0xFDF0, true, true},
      {0xFDF1, true, true},
      {0xFFFD, true, true},
      {0xFFFE, true, false},
  };

  for (const NameCharCase& test_case : kCases) {
    EXPECT_EQ(test_case.expected,
              CFX_XMLParser::IsXMLNameChar(test_case.ch, test_case.flag))
        << "ch=" << static_cast<unsigned>(test_case.ch)
        << ", flag=" << test_case.flag;
  }
}
326
// A closing tag with no element open makes the parse fail.
TEST_F(CFXXMLParserTest, BadElementClose) {
  std::unique_ptr<CFX_XMLDocument> document = Parse("</endtag>");
  ASSERT_FALSE(document);
}
330
// Closing the same element twice makes the parse fail.
TEST_F(CFXXMLParserTest, DoubleElementClose) {
  std::unique_ptr<CFX_XMLDocument> document = Parse("<p></p></p>");
  ASSERT_FALSE(document);
}
334
// A leading "<?originalXFAVersion ...?>" processing instruction becomes the
// first child of the document root and is recognised as the original XFA
// version instruction.
TEST_F(CFXXMLParserTest, ParseInstruction) {
  static constexpr char kMarkup[] =
      "<?originalXFAVersion http://www.xfa.org/schema/xfa-template/3.3/ ?>"
      "<form></form>";

  std::unique_ptr<CFX_XMLDocument> document = Parse(kMarkup);
  ASSERT_TRUE(document);

  CFX_XMLElement* root = document->GetRoot();
  auto* first_child = root->GetFirstChild();
  ASSERT_TRUE(first_child);
  ASSERT_EQ(CFX_XMLNode::Type::kInstruction, first_child->GetType());

  // The node type was asserted just above, so the result is used directly.
  CFX_XMLInstruction* instruction = ToXMLInstruction(first_child);
  EXPECT_TRUE(instruction->IsOriginalXFAVersion());
}
350
// An '&' that runs into markup ("&<p>;") instead of forming a reference makes
// the parse fail.
TEST_F(CFXXMLParserTest, BadEntity) {
  static constexpr char kMarkup[] =
      "<script>"
      "Test &<p>; thing"
      "</script>";

  std::unique_ptr<CFX_XMLDocument> document = Parse(kMarkup);
  ASSERT_FALSE(document);
}
358