1 // Copyright 2016 The PDFium Authors
2 // Use of this source code is governed by a BSD-style license that can be
3 // found in the LICENSE file.
4
5 // Original code copyright 2014 Foxit Software Inc. http://www.foxitsoftware.com
6
7 #include "core/fxcrt/xml/cfx_xmlparser.h"
8
9 #include <stdint.h>
10
11 #include <algorithm>
12 #include <iterator>
13 #include <stack>
14 #include <utility>
15
16 #include "core/fxcrt/autorestorer.h"
17 #include "core/fxcrt/cfx_seekablestreamproxy.h"
18 #include "core/fxcrt/check.h"
19 #include "core/fxcrt/data_vector.h"
20 #include "core/fxcrt/fx_codepage.h"
21 #include "core/fxcrt/fx_extension.h"
22 #include "core/fxcrt/fx_safe_types.h"
23 #include "core/fxcrt/notreached.h"
24 #include "core/fxcrt/xml/cfx_xmlchardata.h"
25 #include "core/fxcrt/xml/cfx_xmldocument.h"
26 #include "core/fxcrt/xml/cfx_xmlelement.h"
27 #include "core/fxcrt/xml/cfx_xmlinstruction.h"
28 #include "core/fxcrt/xml/cfx_xmlnode.h"
29 #include "core/fxcrt/xml/cfx_xmltext.h"
30
namespace {

// Capacity reserved for the parser's pending-text buffer each time it is
// (re)initialised.
constexpr size_t kCurrentTextReserve = 128;

// Largest value accepted from a numeric character reference; larger values
// are replaced with a space by the parser.
constexpr uint32_t kMaxCharRange = 0x10ffff;

// Returns true when |ch| is a space, tab, line feed or carriage return.
bool IsXMLWhiteSpace(wchar_t ch) {
  switch (ch) {
    case L' ':
    case 0x09:
    case 0x0A:
    case 0x0D:
      return true;
    default:
      return false;
  }
}

// One inclusive range [wStart, wEnd] of characters permitted in a name.
struct FX_XMLNAMECHAR {
  uint16_t wStart;
  uint16_t wEnd;
  // True when characters of this range may also appear as the first
  // character of a name.
  bool bStartChar;
};

// Name-character ranges, kept in ascending, non-overlapping order because
// CFX_XMLParser::IsXMLNameChar() binary-searches this table on |wEnd|.
constexpr FX_XMLNAMECHAR kXMLNameChars[] = {
    {L'-', L'.', false},
    {L'0', L'9', false},
    {L':', L':', false},
    {L'A', L'Z', true},
    {L'_', L'_', true},
    {L'a', L'z', true},
    {0xB7, 0xB7, false},
    {0xC0, 0xD6, true},
    {0xD8, 0xF6, true},
    {0xF8, 0x02FF, true},
    {0x0300, 0x036F, false},
    {0x0370, 0x037D, true},
    {0x037F, 0x1FFF, true},
    {0x200C, 0x200D, true},
    {0x203F, 0x2040, false},
    {0x2070, 0x218F, true},
    {0x2C00, 0x2FEF, true},
    {0x3001, 0xD7FF, true},
    {0xF900, 0xFDCF, true},
    {0xFDF0, 0xFFFD, true},
};

}  // namespace
57
58 // static
IsXMLNameChar(wchar_t ch,bool bFirstChar)59 bool CFX_XMLParser::IsXMLNameChar(wchar_t ch, bool bFirstChar) {
60 auto* it = std::lower_bound(
61 std::begin(kXMLNameChars), std::end(kXMLNameChars), ch,
62 [](const FX_XMLNAMECHAR& arg, wchar_t ch) { return arg.wEnd < ch; });
63 return it != std::end(kXMLNameChars) && ch >= it->wStart &&
64 (!bFirstChar || it->bStartChar);
65 }
66
CFX_XMLParser(const RetainPtr<IFX_SeekableReadStream> & pStream)67 CFX_XMLParser::CFX_XMLParser(const RetainPtr<IFX_SeekableReadStream>& pStream) {
68 DCHECK(pStream);
69
70 auto proxy = pdfium::MakeRetain<CFX_SeekableStreamProxy>(pStream);
71 FX_CodePage wCodePage = proxy->GetCodePage();
72 if (wCodePage != FX_CodePage::kUTF16LE &&
73 wCodePage != FX_CodePage::kUTF16BE && wCodePage != FX_CodePage::kUTF8) {
74 proxy->SetCodePage(FX_CodePage::kUTF8);
75 }
76 stream_ = proxy;
77
78 xml_plane_size_ = std::min(xml_plane_size_,
79 pdfium::checked_cast<size_t>(stream_->GetSize()));
80
81 current_text_.Reserve(kCurrentTextReserve);
82 }
83
84 CFX_XMLParser::~CFX_XMLParser() = default;
85
Parse()86 std::unique_ptr<CFX_XMLDocument> CFX_XMLParser::Parse() {
87 auto doc = std::make_unique<CFX_XMLDocument>();
88 AutoRestorer<UnownedPtr<CFX_XMLNode>> restorer(¤t_node_);
89 current_node_ = doc->GetRoot();
90 if (!DoSyntaxParse(doc.get())) {
91 return nullptr;
92 }
93 return doc;
94 }
95
DoSyntaxParse(CFX_XMLDocument * doc)96 bool CFX_XMLParser::DoSyntaxParse(CFX_XMLDocument* doc) {
97 if (xml_plane_size_ <= 0)
98 return false;
99
100 FX_SAFE_SIZE_T alloc_size_safe = xml_plane_size_;
101 alloc_size_safe += 1; // For NUL.
102 if (!alloc_size_safe.IsValid())
103 return false;
104
105 size_t current_buffer_idx = 0;
106 size_t buffer_size = 0;
107
108 DataVector<wchar_t> buffer;
109 buffer.resize(alloc_size_safe.ValueOrDie());
110
111 std::stack<wchar_t> character_to_skip_too_stack;
112 std::stack<CFX_XMLNode::Type> node_type_stack;
113 WideString current_attribute_name;
114 FDE_XmlSyntaxState current_parser_state = FDE_XmlSyntaxState::Text;
115 wchar_t current_quote_character = 0;
116 wchar_t current_character_to_skip_to = 0;
117
118 while (true) {
119 if (current_buffer_idx >= buffer_size) {
120 if (stream_->IsEOF())
121 return true;
122
123 size_t buffer_chars =
124 stream_->ReadBlock(pdfium::make_span(buffer).first(xml_plane_size_));
125 if (buffer_chars == 0)
126 return true;
127
128 current_buffer_idx = 0;
129 buffer_size = buffer_chars;
130 }
131
132 while (current_buffer_idx < buffer_size) {
133 wchar_t ch = buffer[current_buffer_idx];
134 switch (current_parser_state) {
135 case FDE_XmlSyntaxState::Text:
136 if (ch == L'<') {
137 if (!current_text_.IsEmpty()) {
138 current_node_->AppendLastChild(
139 doc->CreateNode<CFX_XMLText>(GetTextData()));
140 } else {
141 current_buffer_idx++;
142 current_parser_state = FDE_XmlSyntaxState::Node;
143 }
144 } else {
145 // Fail if there is text outside of the root element, ignore
146 // whitespace/null.
147 if (node_type_stack.empty() && ch && !FXSYS_iswspace(ch))
148 return false;
149 ProcessTextChar(ch);
150 current_buffer_idx++;
151 }
152 break;
153 case FDE_XmlSyntaxState::Node:
154 if (ch == L'!') {
155 current_buffer_idx++;
156 current_parser_state = FDE_XmlSyntaxState::SkipCommentOrDecl;
157 } else if (ch == L'/') {
158 current_buffer_idx++;
159 current_parser_state = FDE_XmlSyntaxState::CloseElement;
160 } else if (ch == L'?') {
161 node_type_stack.push(CFX_XMLNode::Type::kInstruction);
162 current_buffer_idx++;
163 current_parser_state = FDE_XmlSyntaxState::Target;
164 } else {
165 node_type_stack.push(CFX_XMLNode::Type::kElement);
166 current_parser_state = FDE_XmlSyntaxState::Tag;
167 }
168 break;
169 case FDE_XmlSyntaxState::Target:
170 if (!IsXMLNameChar(ch, current_text_.IsEmpty())) {
171 if (current_text_.IsEmpty()) {
172 return false;
173 }
174
175 current_parser_state = FDE_XmlSyntaxState::TargetData;
176
177 WideString target_name = GetTextData();
178 if (target_name.EqualsASCII("originalXFAVersion") ||
179 target_name.EqualsASCII("acrobat")) {
180 auto* node = doc->CreateNode<CFX_XMLInstruction>(target_name);
181 current_node_->AppendLastChild(node);
182 current_node_ = node;
183 }
184 } else {
185 current_text_ += ch;
186 current_buffer_idx++;
187 }
188 break;
189 case FDE_XmlSyntaxState::Tag:
190 if (!IsXMLNameChar(ch, current_text_.IsEmpty())) {
191 if (current_text_.IsEmpty()) {
192 return false;
193 }
194
195 current_parser_state = FDE_XmlSyntaxState::AttriName;
196
197 auto* child = doc->CreateNode<CFX_XMLElement>(GetTextData());
198 current_node_->AppendLastChild(child);
199 current_node_ = child;
200 } else {
201 current_text_ += ch;
202 current_buffer_idx++;
203 }
204 break;
205 case FDE_XmlSyntaxState::AttriName:
206 if (current_text_.IsEmpty() && IsXMLWhiteSpace(ch)) {
207 current_buffer_idx++;
208 break;
209 }
210 if (!IsXMLNameChar(ch, current_text_.IsEmpty())) {
211 if (current_text_.IsEmpty()) {
212 if (node_type_stack.top() == CFX_XMLNode::Type::kElement) {
213 if (ch == L'>' || ch == L'/') {
214 current_parser_state = FDE_XmlSyntaxState::BreakElement;
215 break;
216 }
217 } else if (node_type_stack.top() ==
218 CFX_XMLNode::Type::kInstruction) {
219 if (ch == L'?') {
220 current_parser_state = FDE_XmlSyntaxState::CloseInstruction;
221 current_buffer_idx++;
222 } else {
223 current_parser_state = FDE_XmlSyntaxState::TargetData;
224 }
225 break;
226 }
227 return false;
228 } else {
229 if (node_type_stack.top() == CFX_XMLNode::Type::kInstruction) {
230 if (ch != '=' && !IsXMLWhiteSpace(ch)) {
231 current_parser_state = FDE_XmlSyntaxState::TargetData;
232 break;
233 }
234 }
235 current_parser_state = FDE_XmlSyntaxState::AttriEqualSign;
236 current_attribute_name = GetTextData();
237 }
238 } else {
239 current_text_ += ch;
240 current_buffer_idx++;
241 }
242 break;
243 case FDE_XmlSyntaxState::AttriEqualSign:
244 if (IsXMLWhiteSpace(ch)) {
245 current_buffer_idx++;
246 break;
247 }
248 if (ch != L'=') {
249 if (node_type_stack.top() == CFX_XMLNode::Type::kInstruction) {
250 current_parser_state = FDE_XmlSyntaxState::TargetData;
251 break;
252 }
253 return false;
254 } else {
255 current_parser_state = FDE_XmlSyntaxState::AttriQuotation;
256 current_buffer_idx++;
257 }
258 break;
259 case FDE_XmlSyntaxState::AttriQuotation:
260 if (IsXMLWhiteSpace(ch)) {
261 current_buffer_idx++;
262 break;
263 }
264 if (ch != L'\"' && ch != L'\'') {
265 return false;
266 }
267
268 current_quote_character = ch;
269 current_parser_state = FDE_XmlSyntaxState::AttriValue;
270 current_buffer_idx++;
271 break;
272 case FDE_XmlSyntaxState::AttriValue:
273 if (ch == current_quote_character) {
274 if (entity_start_.has_value())
275 return false;
276
277 current_quote_character = 0;
278 current_buffer_idx++;
279 current_parser_state = FDE_XmlSyntaxState::AttriName;
280
281 CFX_XMLElement* elem = ToXMLElement(current_node_);
282 if (elem)
283 elem->SetAttribute(current_attribute_name, GetTextData());
284
285 current_attribute_name.clear();
286 } else {
287 ProcessTextChar(ch);
288 current_buffer_idx++;
289 }
290 break;
291 case FDE_XmlSyntaxState::CloseInstruction:
292 if (ch != L'>') {
293 current_text_ += ch;
294 current_parser_state = FDE_XmlSyntaxState::TargetData;
295 } else if (!current_text_.IsEmpty()) {
296 ProcessTargetData();
297 } else {
298 current_buffer_idx++;
299 if (node_type_stack.empty())
300 return false;
301
302 node_type_stack.pop();
303 current_parser_state = FDE_XmlSyntaxState::Text;
304
305 if (current_node_ &&
306 current_node_->GetType() == CFX_XMLNode::Type::kInstruction)
307 current_node_ = current_node_->GetParent();
308 }
309 break;
310 case FDE_XmlSyntaxState::BreakElement:
311 if (ch == L'>') {
312 current_parser_state = FDE_XmlSyntaxState::Text;
313 } else if (ch == L'/') {
314 current_parser_state = FDE_XmlSyntaxState::CloseElement;
315 } else {
316 return false;
317 }
318 current_buffer_idx++;
319 break;
320 case FDE_XmlSyntaxState::CloseElement:
321 if (!IsXMLNameChar(ch, current_text_.IsEmpty())) {
322 if (ch == L'>') {
323 if (node_type_stack.empty())
324 return false;
325
326 node_type_stack.pop();
327 current_parser_state = FDE_XmlSyntaxState::Text;
328
329 CFX_XMLElement* element = ToXMLElement(current_node_);
330 if (!element)
331 return false;
332
333 WideString element_name = GetTextData();
334 if (element_name.GetLength() > 0 &&
335 element_name != element->GetName()) {
336 return false;
337 }
338
339 current_node_ = current_node_->GetParent();
340 } else if (!IsXMLWhiteSpace(ch)) {
341 return false;
342 }
343 } else {
344 current_text_ += ch;
345 }
346 current_buffer_idx++;
347 break;
348 case FDE_XmlSyntaxState::SkipCommentOrDecl: {
349 auto current_view = WideStringView(
350 pdfium::make_span(buffer).subspan(current_buffer_idx));
351 if (current_view.First(2).EqualsASCII("--")) {
352 current_buffer_idx += 2;
353 current_parser_state = FDE_XmlSyntaxState::SkipComment;
354 } else if (current_view.First(7).EqualsASCIINoCase("[CDATA[")) {
355 current_buffer_idx += 7;
356 current_parser_state = FDE_XmlSyntaxState::SkipCData;
357 } else {
358 current_parser_state = FDE_XmlSyntaxState::SkipDeclNode;
359 current_character_to_skip_to = L'>';
360 character_to_skip_too_stack.push(L'>');
361 }
362 break;
363 }
364 case FDE_XmlSyntaxState::SkipCData: {
365 auto current_view = WideStringView(
366 pdfium::make_span(buffer).subspan(current_buffer_idx));
367 if (current_view.First(3).EqualsASCII("]]>")) {
368 current_buffer_idx += 3;
369 current_parser_state = FDE_XmlSyntaxState::Text;
370 current_node_->AppendLastChild(
371 doc->CreateNode<CFX_XMLCharData>(GetTextData()));
372 } else {
373 current_text_ += ch;
374 current_buffer_idx++;
375 }
376 break;
377 }
378 case FDE_XmlSyntaxState::SkipDeclNode:
379 if (current_character_to_skip_to == L'\'' ||
380 current_character_to_skip_to == L'\"') {
381 current_buffer_idx++;
382 if (ch != current_character_to_skip_to)
383 break;
384
385 character_to_skip_too_stack.pop();
386 if (character_to_skip_too_stack.empty())
387 current_parser_state = FDE_XmlSyntaxState::Text;
388 else
389 current_character_to_skip_to = character_to_skip_too_stack.top();
390 } else {
391 switch (ch) {
392 case L'<':
393 current_character_to_skip_to = L'>';
394 character_to_skip_too_stack.push(L'>');
395 break;
396 case L'[':
397 current_character_to_skip_to = L']';
398 character_to_skip_too_stack.push(L']');
399 break;
400 case L'(':
401 current_character_to_skip_to = L')';
402 character_to_skip_too_stack.push(L')');
403 break;
404 case L'\'':
405 current_character_to_skip_to = L'\'';
406 character_to_skip_too_stack.push(L'\'');
407 break;
408 case L'\"':
409 current_character_to_skip_to = L'\"';
410 character_to_skip_too_stack.push(L'\"');
411 break;
412 default:
413 if (ch == current_character_to_skip_to) {
414 character_to_skip_too_stack.pop();
415 if (character_to_skip_too_stack.empty()) {
416 current_parser_state = FDE_XmlSyntaxState::Text;
417 } else {
418 current_character_to_skip_to =
419 character_to_skip_too_stack.top();
420 }
421 }
422 break;
423 }
424 current_buffer_idx++;
425 }
426 break;
427 case FDE_XmlSyntaxState::SkipComment: {
428 auto current_view = WideStringView(
429 pdfium::make_span(buffer).subspan(current_buffer_idx));
430 if (current_view.First(3).EqualsASCII("-->")) {
431 current_buffer_idx += 2;
432 current_parser_state = FDE_XmlSyntaxState::Text;
433 }
434 current_buffer_idx++;
435 break;
436 }
437 case FDE_XmlSyntaxState::TargetData:
438 if (IsXMLWhiteSpace(ch)) {
439 if (current_text_.IsEmpty()) {
440 current_buffer_idx++;
441 break;
442 }
443 if (current_quote_character == 0) {
444 current_buffer_idx++;
445 ProcessTargetData();
446 break;
447 }
448 }
449 if (ch == '?') {
450 current_parser_state = FDE_XmlSyntaxState::CloseInstruction;
451 current_buffer_idx++;
452 } else if (ch == '\"') {
453 if (current_quote_character == 0) {
454 current_quote_character = ch;
455 current_buffer_idx++;
456 } else if (ch == current_quote_character) {
457 current_quote_character = 0;
458 current_buffer_idx++;
459 ProcessTargetData();
460 } else {
461 return false;
462 }
463 } else {
464 current_text_ += ch;
465 current_buffer_idx++;
466 }
467 break;
468 }
469 }
470 }
471
472 NOTREACHED_NORETURN();
473 }
474
ProcessTextChar(wchar_t character)475 void CFX_XMLParser::ProcessTextChar(wchar_t character) {
476 current_text_ += character;
477
478 if (entity_start_.has_value() && character == L';') {
479 // Copy the entity out into a string and remove from the current text. When
480 // we copy the entity we don't want to copy out the & or the ; so we start
481 // shifted by one and want to copy 2 less characters in total.
482 WideString csEntity = current_text_.Substr(
483 entity_start_.value() + 1,
484 current_text_.GetLength() - entity_start_.value() - 2);
485
486 current_text_.Delete(entity_start_.value(),
487 current_text_.GetLength() - entity_start_.value());
488
489 size_t iLen = csEntity.GetLength();
490 if (iLen > 0) {
491 if (csEntity[0] == L'#') {
492 uint32_t ch = 0;
493 if (iLen > 1 && csEntity[1] == L'x') {
494 for (size_t i = 2; i < iLen; i++) {
495 if (!FXSYS_IsHexDigit(csEntity[i]))
496 break;
497 ch = (ch << 4) + FXSYS_HexCharToInt(csEntity[i]);
498 }
499 } else {
500 for (size_t i = 1; i < iLen; i++) {
501 if (!FXSYS_IsDecimalDigit(csEntity[i]))
502 break;
503 ch = ch * 10 + FXSYS_DecimalCharToInt(csEntity[i]);
504 }
505 }
506 if (ch > kMaxCharRange)
507 ch = ' ';
508
509 character = static_cast<wchar_t>(ch);
510 if (character != 0)
511 current_text_ += character;
512 } else {
513 if (csEntity.EqualsASCII("amp")) {
514 current_text_ += L'&';
515 } else if (csEntity.EqualsASCII("lt")) {
516 current_text_ += L'<';
517 } else if (csEntity.EqualsASCII("gt")) {
518 current_text_ += L'>';
519 } else if (csEntity.EqualsASCII("apos")) {
520 current_text_ += L'\'';
521 } else if (csEntity.EqualsASCII("quot")) {
522 current_text_ += L'"';
523 }
524 }
525 }
526 entity_start_ = std::nullopt;
527 } else if (!entity_start_.has_value() && character == L'&') {
528 entity_start_ = current_text_.GetLength() - 1;
529 }
530 }
531
ProcessTargetData()532 void CFX_XMLParser::ProcessTargetData() {
533 WideString target_data = GetTextData();
534 if (target_data.IsEmpty())
535 return;
536
537 CFX_XMLInstruction* instruction = ToXMLInstruction(current_node_);
538 if (instruction)
539 instruction->AppendData(target_data);
540 }
541
GetTextData()542 WideString CFX_XMLParser::GetTextData() {
543 WideString ret = std::move(current_text_);
544 current_text_.Reserve(kCurrentTextReserve);
545 entity_start_ = std::nullopt;
546 return ret;
547 }
548