1 // Copyright 2016 The PDFium Authors
2 // Use of this source code is governed by a BSD-style license that can be
3 // found in the LICENSE file.
4
5 // Original code copyright 2014 Foxit Software Inc. http://www.foxitsoftware.com
6
7 #include "core/fxcrt/xml/cfx_xmlparser.h"
8
9 #include <stdint.h>
10
11 #include <algorithm>
12 #include <iterator>
13 #include <stack>
14 #include <utility>
15
16 #include "core/fxcrt/cfx_seekablestreamproxy.h"
17 #include "core/fxcrt/data_vector.h"
18 #include "core/fxcrt/fx_codepage.h"
19 #include "core/fxcrt/fx_extension.h"
20 #include "core/fxcrt/fx_safe_types.h"
21 #include "core/fxcrt/xml/cfx_xmlchardata.h"
22 #include "core/fxcrt/xml/cfx_xmldocument.h"
23 #include "core/fxcrt/xml/cfx_xmlelement.h"
24 #include "core/fxcrt/xml/cfx_xmlinstruction.h"
25 #include "core/fxcrt/xml/cfx_xmlnode.h"
26 #include "core/fxcrt/xml/cfx_xmltext.h"
27 #include "third_party/base/check.h"
28 #include "third_party/base/notreached.h"
29
30 namespace {
31
32 constexpr size_t kCurrentTextReserve = 128;
33 constexpr uint32_t kMaxCharRange = 0x10ffff;
34
// Returns true when |ch| is a space, tab, line feed or carriage return.
bool IsXMLWhiteSpace(wchar_t ch) {
  switch (ch) {
    case L' ':
    case L'\t':
    case L'\n':
    case L'\r':
      return true;
    default:
      return false;
  }
}
38
// One inclusive range [wStart, wEnd] of characters accepted inside an XML
// name. |bStartChar| marks ranges whose characters are also accepted as the
// first character of a name.
struct FX_XMLNAMECHAR {
  uint16_t wStart;
  uint16_t wEnd;
  bool bStartChar;
};

// Name-character ranges, listed in ascending order without overlap so the
// table can be searched with a binary search keyed on |wEnd|.
constexpr FX_XMLNAMECHAR kXMLNameChars[] = {
    {L'-', L'.', false},
    {L'0', L'9', false},
    {L':', L':', false},
    {L'A', L'Z', true},
    {L'_', L'_', true},
    {L'a', L'z', true},
    {0xB7, 0xB7, false},
    {0xC0, 0xD6, true},
    {0xD8, 0xF6, true},
    {0xF8, 0x02FF, true},
    {0x0300, 0x036F, false},
    {0x0370, 0x037D, true},
    {0x037F, 0x1FFF, true},
    {0x200C, 0x200D, true},
    {0x203F, 0x2040, false},
    {0x2070, 0x218F, true},
    {0x2C00, 0x2FEF, true},
    {0x3001, 0xD7FF, true},
    {0xF900, 0xFDCF, true},
    {0xFDF0, 0xFFFD, true},
};
54
55 } // namespace
56
57 // static
IsXMLNameChar(wchar_t ch,bool bFirstChar)58 bool CFX_XMLParser::IsXMLNameChar(wchar_t ch, bool bFirstChar) {
59 auto* it = std::lower_bound(
60 std::begin(kXMLNameChars), std::end(kXMLNameChars), ch,
61 [](const FX_XMLNAMECHAR& arg, wchar_t ch) { return arg.wEnd < ch; });
62 return it != std::end(kXMLNameChars) && ch >= it->wStart &&
63 (!bFirstChar || it->bStartChar);
64 }
65
CFX_XMLParser(const RetainPtr<IFX_SeekableReadStream> & pStream)66 CFX_XMLParser::CFX_XMLParser(const RetainPtr<IFX_SeekableReadStream>& pStream) {
67 DCHECK(pStream);
68
69 auto proxy = pdfium::MakeRetain<CFX_SeekableStreamProxy>(pStream);
70 FX_CodePage wCodePage = proxy->GetCodePage();
71 if (wCodePage != FX_CodePage::kUTF16LE &&
72 wCodePage != FX_CodePage::kUTF16BE && wCodePage != FX_CodePage::kUTF8) {
73 proxy->SetCodePage(FX_CodePage::kUTF8);
74 }
75 stream_ = proxy;
76
77 xml_plane_size_ = std::min(
78 xml_plane_size_, pdfium::base::checked_cast<size_t>(stream_->GetSize()));
79
80 current_text_.reserve(kCurrentTextReserve);
81 }
82
83 CFX_XMLParser::~CFX_XMLParser() = default;
84
Parse()85 std::unique_ptr<CFX_XMLDocument> CFX_XMLParser::Parse() {
86 auto doc = std::make_unique<CFX_XMLDocument>();
87 current_node_ = doc->GetRoot();
88
89 return DoSyntaxParse(doc.get()) ? std::move(doc) : nullptr;
90 }
91
DoSyntaxParse(CFX_XMLDocument * doc)92 bool CFX_XMLParser::DoSyntaxParse(CFX_XMLDocument* doc) {
93 if (xml_plane_size_ <= 0)
94 return false;
95
96 FX_SAFE_SIZE_T alloc_size_safe = xml_plane_size_;
97 alloc_size_safe += 1; // For NUL.
98 if (!alloc_size_safe.IsValid())
99 return false;
100
101 size_t current_buffer_idx = 0;
102 size_t buffer_size = 0;
103
104 DataVector<wchar_t> buffer;
105 buffer.resize(alloc_size_safe.ValueOrDie());
106
107 std::stack<wchar_t> character_to_skip_too_stack;
108 std::stack<CFX_XMLNode::Type> node_type_stack;
109 WideString current_attribute_name;
110 FDE_XmlSyntaxState current_parser_state = FDE_XmlSyntaxState::Text;
111 wchar_t current_quote_character = 0;
112 wchar_t current_character_to_skip_to = 0;
113
114 while (true) {
115 if (current_buffer_idx >= buffer_size) {
116 if (stream_->IsEOF())
117 return true;
118
119 size_t buffer_chars = stream_->ReadBlock(buffer.data(), xml_plane_size_);
120 if (buffer_chars == 0)
121 return true;
122
123 current_buffer_idx = 0;
124 buffer_size = buffer_chars;
125 }
126
127 while (current_buffer_idx < buffer_size) {
128 wchar_t ch = buffer[current_buffer_idx];
129 switch (current_parser_state) {
130 case FDE_XmlSyntaxState::Text:
131 if (ch == L'<') {
132 if (!current_text_.empty()) {
133 current_node_->AppendLastChild(
134 doc->CreateNode<CFX_XMLText>(GetTextData()));
135 } else {
136 current_buffer_idx++;
137 current_parser_state = FDE_XmlSyntaxState::Node;
138 }
139 } else {
140 // Fail if there is text outside of the root element, ignore
141 // whitespace/null.
142 if (node_type_stack.empty() && ch && !FXSYS_iswspace(ch))
143 return false;
144 ProcessTextChar(ch);
145 current_buffer_idx++;
146 }
147 break;
148 case FDE_XmlSyntaxState::Node:
149 if (ch == L'!') {
150 current_buffer_idx++;
151 current_parser_state = FDE_XmlSyntaxState::SkipCommentOrDecl;
152 } else if (ch == L'/') {
153 current_buffer_idx++;
154 current_parser_state = FDE_XmlSyntaxState::CloseElement;
155 } else if (ch == L'?') {
156 node_type_stack.push(CFX_XMLNode::Type::kInstruction);
157 current_buffer_idx++;
158 current_parser_state = FDE_XmlSyntaxState::Target;
159 } else {
160 node_type_stack.push(CFX_XMLNode::Type::kElement);
161 current_parser_state = FDE_XmlSyntaxState::Tag;
162 }
163 break;
164 case FDE_XmlSyntaxState::Target:
165 if (!IsXMLNameChar(ch, current_text_.empty())) {
166 if (current_text_.empty())
167 return false;
168
169 current_parser_state = FDE_XmlSyntaxState::TargetData;
170
171 WideString target_name = GetTextData();
172 if (target_name.EqualsASCII("originalXFAVersion") ||
173 target_name.EqualsASCII("acrobat")) {
174 auto* node = doc->CreateNode<CFX_XMLInstruction>(target_name);
175 current_node_->AppendLastChild(node);
176 current_node_ = node;
177 }
178 } else {
179 current_text_.push_back(ch);
180 current_buffer_idx++;
181 }
182 break;
183 case FDE_XmlSyntaxState::Tag:
184 if (!IsXMLNameChar(ch, current_text_.empty())) {
185 if (current_text_.empty())
186 return false;
187
188 current_parser_state = FDE_XmlSyntaxState::AttriName;
189
190 auto* child = doc->CreateNode<CFX_XMLElement>(GetTextData());
191 current_node_->AppendLastChild(child);
192 current_node_ = child;
193 } else {
194 current_text_.push_back(ch);
195 current_buffer_idx++;
196 }
197 break;
198 case FDE_XmlSyntaxState::AttriName:
199 if (current_text_.empty() && IsXMLWhiteSpace(ch)) {
200 current_buffer_idx++;
201 break;
202 }
203 if (!IsXMLNameChar(ch, current_text_.empty())) {
204 if (current_text_.empty()) {
205 if (node_type_stack.top() == CFX_XMLNode::Type::kElement) {
206 if (ch == L'>' || ch == L'/') {
207 current_parser_state = FDE_XmlSyntaxState::BreakElement;
208 break;
209 }
210 } else if (node_type_stack.top() ==
211 CFX_XMLNode::Type::kInstruction) {
212 if (ch == L'?') {
213 current_parser_state = FDE_XmlSyntaxState::CloseInstruction;
214 current_buffer_idx++;
215 } else {
216 current_parser_state = FDE_XmlSyntaxState::TargetData;
217 }
218 break;
219 }
220 return false;
221 } else {
222 if (node_type_stack.top() == CFX_XMLNode::Type::kInstruction) {
223 if (ch != '=' && !IsXMLWhiteSpace(ch)) {
224 current_parser_state = FDE_XmlSyntaxState::TargetData;
225 break;
226 }
227 }
228 current_parser_state = FDE_XmlSyntaxState::AttriEqualSign;
229 current_attribute_name = GetTextData();
230 }
231 } else {
232 current_text_.push_back(ch);
233 current_buffer_idx++;
234 }
235 break;
236 case FDE_XmlSyntaxState::AttriEqualSign:
237 if (IsXMLWhiteSpace(ch)) {
238 current_buffer_idx++;
239 break;
240 }
241 if (ch != L'=') {
242 if (node_type_stack.top() == CFX_XMLNode::Type::kInstruction) {
243 current_parser_state = FDE_XmlSyntaxState::TargetData;
244 break;
245 }
246 return false;
247 } else {
248 current_parser_state = FDE_XmlSyntaxState::AttriQuotation;
249 current_buffer_idx++;
250 }
251 break;
252 case FDE_XmlSyntaxState::AttriQuotation:
253 if (IsXMLWhiteSpace(ch)) {
254 current_buffer_idx++;
255 break;
256 }
257 if (ch != L'\"' && ch != L'\'') {
258 return false;
259 }
260
261 current_quote_character = ch;
262 current_parser_state = FDE_XmlSyntaxState::AttriValue;
263 current_buffer_idx++;
264 break;
265 case FDE_XmlSyntaxState::AttriValue:
266 if (ch == current_quote_character) {
267 if (entity_start_.has_value())
268 return false;
269
270 current_quote_character = 0;
271 current_buffer_idx++;
272 current_parser_state = FDE_XmlSyntaxState::AttriName;
273
274 CFX_XMLElement* elem = ToXMLElement(current_node_);
275 if (elem)
276 elem->SetAttribute(current_attribute_name, GetTextData());
277
278 current_attribute_name.clear();
279 } else {
280 ProcessTextChar(ch);
281 current_buffer_idx++;
282 }
283 break;
284 case FDE_XmlSyntaxState::CloseInstruction:
285 if (ch != L'>') {
286 current_text_.push_back(ch);
287 current_parser_state = FDE_XmlSyntaxState::TargetData;
288 } else if (!current_text_.empty()) {
289 ProcessTargetData();
290 } else {
291 current_buffer_idx++;
292 if (node_type_stack.empty())
293 return false;
294
295 node_type_stack.pop();
296 current_parser_state = FDE_XmlSyntaxState::Text;
297
298 if (current_node_ &&
299 current_node_->GetType() == CFX_XMLNode::Type::kInstruction)
300 current_node_ = current_node_->GetParent();
301 }
302 break;
303 case FDE_XmlSyntaxState::BreakElement:
304 if (ch == L'>') {
305 current_parser_state = FDE_XmlSyntaxState::Text;
306 } else if (ch == L'/') {
307 current_parser_state = FDE_XmlSyntaxState::CloseElement;
308 } else {
309 return false;
310 }
311 current_buffer_idx++;
312 break;
313 case FDE_XmlSyntaxState::CloseElement:
314 if (!IsXMLNameChar(ch, current_text_.empty())) {
315 if (ch == L'>') {
316 if (node_type_stack.empty())
317 return false;
318
319 node_type_stack.pop();
320 current_parser_state = FDE_XmlSyntaxState::Text;
321
322 CFX_XMLElement* element = ToXMLElement(current_node_);
323 if (!element)
324 return false;
325
326 WideString element_name = GetTextData();
327 if (element_name.GetLength() > 0 &&
328 element_name != element->GetName()) {
329 return false;
330 }
331
332 current_node_ = current_node_->GetParent();
333 } else if (!IsXMLWhiteSpace(ch)) {
334 return false;
335 }
336 } else {
337 current_text_.push_back(ch);
338 }
339 current_buffer_idx++;
340 break;
341 case FDE_XmlSyntaxState::SkipCommentOrDecl: {
342 auto current_span =
343 pdfium::make_span(buffer).subspan(current_buffer_idx);
344 if (FXSYS_wcsnicmp(current_span.data(), L"--", 2) == 0) {
345 current_buffer_idx += 2;
346 current_parser_state = FDE_XmlSyntaxState::SkipComment;
347 } else if (FXSYS_wcsnicmp(current_span.data(), L"[CDATA[", 7) == 0) {
348 current_buffer_idx += 7;
349 current_parser_state = FDE_XmlSyntaxState::SkipCData;
350 } else {
351 current_parser_state = FDE_XmlSyntaxState::SkipDeclNode;
352 current_character_to_skip_to = L'>';
353 character_to_skip_too_stack.push(L'>');
354 }
355 break;
356 }
357 case FDE_XmlSyntaxState::SkipCData: {
358 auto current_span =
359 pdfium::make_span(buffer).subspan(current_buffer_idx);
360 if (FXSYS_wcsnicmp(current_span.data(), L"]]>", 3) == 0) {
361 current_buffer_idx += 3;
362 current_parser_state = FDE_XmlSyntaxState::Text;
363 current_node_->AppendLastChild(
364 doc->CreateNode<CFX_XMLCharData>(GetTextData()));
365 } else {
366 current_text_.push_back(ch);
367 current_buffer_idx++;
368 }
369 break;
370 }
371 case FDE_XmlSyntaxState::SkipDeclNode:
372 if (current_character_to_skip_to == L'\'' ||
373 current_character_to_skip_to == L'\"') {
374 current_buffer_idx++;
375 if (ch != current_character_to_skip_to)
376 break;
377
378 character_to_skip_too_stack.pop();
379 if (character_to_skip_too_stack.empty())
380 current_parser_state = FDE_XmlSyntaxState::Text;
381 else
382 current_character_to_skip_to = character_to_skip_too_stack.top();
383 } else {
384 switch (ch) {
385 case L'<':
386 current_character_to_skip_to = L'>';
387 character_to_skip_too_stack.push(L'>');
388 break;
389 case L'[':
390 current_character_to_skip_to = L']';
391 character_to_skip_too_stack.push(L']');
392 break;
393 case L'(':
394 current_character_to_skip_to = L')';
395 character_to_skip_too_stack.push(L')');
396 break;
397 case L'\'':
398 current_character_to_skip_to = L'\'';
399 character_to_skip_too_stack.push(L'\'');
400 break;
401 case L'\"':
402 current_character_to_skip_to = L'\"';
403 character_to_skip_too_stack.push(L'\"');
404 break;
405 default:
406 if (ch == current_character_to_skip_to) {
407 character_to_skip_too_stack.pop();
408 if (character_to_skip_too_stack.empty()) {
409 current_parser_state = FDE_XmlSyntaxState::Text;
410 } else {
411 current_character_to_skip_to =
412 character_to_skip_too_stack.top();
413 }
414 }
415 break;
416 }
417 current_buffer_idx++;
418 }
419 break;
420 case FDE_XmlSyntaxState::SkipComment: {
421 auto current_span =
422 pdfium::make_span(buffer).subspan(current_buffer_idx);
423 if (FXSYS_wcsnicmp(current_span.data(), L"-->", 3) == 0) {
424 current_buffer_idx += 2;
425 current_parser_state = FDE_XmlSyntaxState::Text;
426 }
427 current_buffer_idx++;
428 break;
429 }
430 case FDE_XmlSyntaxState::TargetData:
431 if (IsXMLWhiteSpace(ch)) {
432 if (current_text_.empty()) {
433 current_buffer_idx++;
434 break;
435 }
436 if (current_quote_character == 0) {
437 current_buffer_idx++;
438 ProcessTargetData();
439 break;
440 }
441 }
442 if (ch == '?') {
443 current_parser_state = FDE_XmlSyntaxState::CloseInstruction;
444 current_buffer_idx++;
445 } else if (ch == '\"') {
446 if (current_quote_character == 0) {
447 current_quote_character = ch;
448 current_buffer_idx++;
449 } else if (ch == current_quote_character) {
450 current_quote_character = 0;
451 current_buffer_idx++;
452 ProcessTargetData();
453 } else {
454 return false;
455 }
456 } else {
457 current_text_.push_back(ch);
458 current_buffer_idx++;
459 }
460 break;
461 default:
462 break;
463 }
464 }
465 }
466
467 NOTREACHED();
468 return false;
469 }
470
ProcessTextChar(wchar_t character)471 void CFX_XMLParser::ProcessTextChar(wchar_t character) {
472 current_text_.push_back(character);
473
474 if (entity_start_.has_value() && character == L';') {
475 // Copy the entity out into a string and remove from the vector. When we
476 // copy the entity we don't want to copy out the & or the ; so we start
477 // shifted by one and want to copy 2 less characters in total.
478 WideString csEntity(current_text_.data() + entity_start_.value() + 1,
479 current_text_.size() - entity_start_.value() - 2);
480 current_text_.erase(current_text_.begin() + entity_start_.value(),
481 current_text_.end());
482
483 size_t iLen = csEntity.GetLength();
484 if (iLen > 0) {
485 if (csEntity[0] == L'#') {
486 uint32_t ch = 0;
487 if (iLen > 1 && csEntity[1] == L'x') {
488 for (size_t i = 2; i < iLen; i++) {
489 if (!FXSYS_IsHexDigit(csEntity[i]))
490 break;
491 ch = (ch << 4) + FXSYS_HexCharToInt(csEntity[i]);
492 }
493 } else {
494 for (size_t i = 1; i < iLen; i++) {
495 if (!FXSYS_IsDecimalDigit(csEntity[i]))
496 break;
497 ch = ch * 10 + FXSYS_DecimalCharToInt(csEntity[i]);
498 }
499 }
500 if (ch > kMaxCharRange)
501 ch = ' ';
502
503 character = static_cast<wchar_t>(ch);
504 if (character != 0)
505 current_text_.push_back(character);
506 } else {
507 if (csEntity == L"amp") {
508 current_text_.push_back(L'&');
509 } else if (csEntity == L"lt") {
510 current_text_.push_back(L'<');
511 } else if (csEntity == L"gt") {
512 current_text_.push_back(L'>');
513 } else if (csEntity == L"apos") {
514 current_text_.push_back(L'\'');
515 } else if (csEntity == L"quot") {
516 current_text_.push_back(L'"');
517 }
518 }
519 }
520 entity_start_ = absl::nullopt;
521 } else if (!entity_start_.has_value() && character == L'&') {
522 entity_start_ = current_text_.size() - 1;
523 }
524 }
525
ProcessTargetData()526 void CFX_XMLParser::ProcessTargetData() {
527 WideString target_data = GetTextData();
528 if (target_data.IsEmpty())
529 return;
530
531 CFX_XMLInstruction* instruction = ToXMLInstruction(current_node_);
532 if (instruction)
533 instruction->AppendData(target_data);
534 }
535
GetTextData()536 WideString CFX_XMLParser::GetTextData() {
537 WideString ret(current_text_.data(), current_text_.size());
538 entity_start_ = absl::nullopt;
539 current_text_.clear();
540 current_text_.reserve(kCurrentTextReserve);
541 return ret;
542 }
543