1 // Copyright 2016 PDFium Authors. All rights reserved.
2 // Use of this source code is governed by a BSD-style license that can be
3 // found in the LICENSE file.
4
5 // Original code copyright 2014 Foxit Software Inc. http://www.foxitsoftware.com
6
7 #include "core/fxcrt/xml/cfx_xmlparser.h"
8
9 #include <algorithm>
10 #include <cwctype>
11 #include <iterator>
12 #include <stack>
13 #include <utility>
14
15 #include "core/fxcrt/cfx_seekablestreamproxy.h"
16 #include "core/fxcrt/fx_codepage.h"
17 #include "core/fxcrt/fx_extension.h"
18 #include "core/fxcrt/fx_safe_types.h"
19 #include "core/fxcrt/xml/cfx_xmlchardata.h"
20 #include "core/fxcrt/xml/cfx_xmldocument.h"
21 #include "core/fxcrt/xml/cfx_xmlelement.h"
22 #include "core/fxcrt/xml/cfx_xmlinstruction.h"
23 #include "core/fxcrt/xml/cfx_xmlnode.h"
24 #include "core/fxcrt/xml/cfx_xmltext.h"
25 #include "third_party/base/ptr_util.h"
26
namespace {

// Capacity requested for the pending-text buffer.
constexpr size_t kCurrentTextReserve = 128;

// Largest value accepted for a numeric character reference.
constexpr uint32_t kMaxCharRange = 0x10ffff;

// Returns true when |ch| is a space, line feed, carriage return or tab.
bool IsXMLWhiteSpace(wchar_t ch) {
  switch (ch) {
    case L' ':
    case L'\n':
    case L'\r':
    case L'\t':
      return true;
    default:
      return false;
  }
}

// One inclusive range [wStart, wEnd] of characters allowed in a name.
// |bStartChar| tells whether the range may also supply the first character.
struct FX_XMLNAMECHAR {
  uint16_t wStart;
  uint16_t wEnd;
  bool bStartChar;
};

// Name character ranges. The entries are listed in ascending, non-overlapping
// order because CFX_XMLParser::IsXMLNameChar() binary-searches this table.
constexpr FX_XMLNAMECHAR g_XMLNameChars[] = {
    {L'-', L'.', false},
    {L'0', L'9', false},
    {L':', L':', false},
    {L'A', L'Z', true},
    {L'_', L'_', true},
    {L'a', L'z', true},
    {0xB7, 0xB7, false},
    {0xC0, 0xD6, true},
    {0xD8, 0xF6, true},
    {0xF8, 0x02FF, true},
    {0x0300, 0x036F, false},
    {0x0370, 0x037D, true},
    {0x037F, 0x1FFF, true},
    {0x200C, 0x200D, true},
    {0x203F, 0x2040, false},
    {0x2070, 0x218F, true},
    {0x2C00, 0x2FEF, true},
    {0x3001, 0xD7FF, true},
    {0xF900, 0xFDCF, true},
    {0xFDF0, 0xFFFD, true},
};

}  // namespace
53
54 // static
IsXMLNameChar(wchar_t ch,bool bFirstChar)55 bool CFX_XMLParser::IsXMLNameChar(wchar_t ch, bool bFirstChar) {
56 auto* it = std::lower_bound(
57 std::begin(g_XMLNameChars), std::end(g_XMLNameChars), ch,
58 [](const FX_XMLNAMECHAR& arg, wchar_t ch) { return arg.wEnd < ch; });
59 return it != std::end(g_XMLNameChars) && ch >= it->wStart &&
60 (!bFirstChar || it->bStartChar);
61 }
62
CFX_XMLParser(const RetainPtr<IFX_SeekableReadStream> & pStream)63 CFX_XMLParser::CFX_XMLParser(const RetainPtr<IFX_SeekableReadStream>& pStream) {
64 ASSERT(pStream);
65
66 auto proxy = pdfium::MakeRetain<CFX_SeekableStreamProxy>(pStream);
67 uint16_t wCodePage = proxy->GetCodePage();
68 if (wCodePage != FX_CODEPAGE_UTF16LE && wCodePage != FX_CODEPAGE_UTF16BE &&
69 wCodePage != FX_CODEPAGE_UTF8) {
70 proxy->SetCodePage(FX_CODEPAGE_UTF8);
71 }
72 stream_ = proxy;
73
74 xml_plane_size_ = std::min(
75 xml_plane_size_, pdfium::base::checked_cast<size_t>(stream_->GetSize()));
76
77 current_text_.reserve(kCurrentTextReserve);
78 }
79
80 CFX_XMLParser::~CFX_XMLParser() = default;
81
Parse()82 std::unique_ptr<CFX_XMLDocument> CFX_XMLParser::Parse() {
83 auto doc = pdfium::MakeUnique<CFX_XMLDocument>();
84 current_node_ = doc->GetRoot();
85
86 return DoSyntaxParse(doc.get()) ? std::move(doc) : nullptr;
87 }
88
DoSyntaxParse(CFX_XMLDocument * doc)89 bool CFX_XMLParser::DoSyntaxParse(CFX_XMLDocument* doc) {
90 if (xml_plane_size_ <= 0)
91 return false;
92
93 FX_SAFE_SIZE_T alloc_size_safe = xml_plane_size_;
94 alloc_size_safe += 1; // For NUL.
95 if (!alloc_size_safe.IsValid())
96 return false;
97
98 FX_FILESIZE current_buffer_idx = 0;
99 FX_FILESIZE buffer_size = 0;
100
101 std::vector<wchar_t, FxAllocAllocator<wchar_t>> buffer;
102 buffer.resize(alloc_size_safe.ValueOrDie());
103
104 std::stack<wchar_t> character_to_skip_too_stack;
105 std::stack<CFX_XMLNode::Type> node_type_stack;
106 WideString current_attribute_name;
107 FDE_XmlSyntaxState current_parser_state = FDE_XmlSyntaxState::Text;
108 int32_t iCount = 0;
109 wchar_t current_quote_character = 0;
110 wchar_t current_character_to_skip_to = 0;
111
112 while (true) {
113 if (current_buffer_idx >= buffer_size) {
114 if (stream_->IsEOF())
115 return true;
116
117 size_t buffer_chars = stream_->ReadBlock(buffer.data(), xml_plane_size_);
118 if (buffer_chars == 0)
119 return true;
120
121 current_buffer_idx = 0;
122 buffer_size = buffer_chars;
123 }
124
125 while (current_buffer_idx < buffer_size) {
126 wchar_t ch = buffer[current_buffer_idx];
127 switch (current_parser_state) {
128 case FDE_XmlSyntaxState::Text:
129 if (ch == L'<') {
130 if (!current_text_.empty()) {
131 current_node_->AppendLastChild(
132 doc->CreateNode<CFX_XMLText>(GetTextData()));
133 } else {
134 current_buffer_idx++;
135 current_parser_state = FDE_XmlSyntaxState::Node;
136 }
137 } else {
138 // Fail if there is text outside of the root element, ignore
139 // whitespace/null.
140 if (node_type_stack.empty() && ch && !FXSYS_iswspace(ch))
141 return false;
142 ProcessTextChar(ch);
143 current_buffer_idx++;
144 }
145 break;
146 case FDE_XmlSyntaxState::Node:
147 if (ch == L'!') {
148 current_buffer_idx++;
149 current_parser_state = FDE_XmlSyntaxState::SkipCommentOrDecl;
150 } else if (ch == L'/') {
151 current_buffer_idx++;
152 current_parser_state = FDE_XmlSyntaxState::CloseElement;
153 } else if (ch == L'?') {
154 node_type_stack.push(CFX_XMLNode::Type::kInstruction);
155 current_buffer_idx++;
156 current_parser_state = FDE_XmlSyntaxState::Target;
157 } else {
158 node_type_stack.push(CFX_XMLNode::Type::kElement);
159 current_parser_state = FDE_XmlSyntaxState::Tag;
160 }
161 break;
162 case FDE_XmlSyntaxState::Target:
163 if (!IsXMLNameChar(ch, current_text_.empty())) {
164 if (current_text_.empty())
165 return false;
166
167 current_parser_state = FDE_XmlSyntaxState::TargetData;
168
169 WideString target_name = GetTextData();
170 if (target_name.EqualsASCII("originalXFAVersion") ||
171 target_name.EqualsASCII("acrobat")) {
172 auto* node = doc->CreateNode<CFX_XMLInstruction>(target_name);
173 current_node_->AppendLastChild(node);
174 current_node_ = node;
175 }
176 } else {
177 current_text_.push_back(ch);
178 current_buffer_idx++;
179 }
180 break;
181 case FDE_XmlSyntaxState::Tag:
182 if (!IsXMLNameChar(ch, current_text_.empty())) {
183 if (current_text_.empty())
184 return false;
185
186 current_parser_state = FDE_XmlSyntaxState::AttriName;
187
188 auto* child = doc->CreateNode<CFX_XMLElement>(GetTextData());
189 current_node_->AppendLastChild(child);
190 current_node_ = child;
191 } else {
192 current_text_.push_back(ch);
193 current_buffer_idx++;
194 }
195 break;
196 case FDE_XmlSyntaxState::AttriName:
197 if (current_text_.empty() && IsXMLWhiteSpace(ch)) {
198 current_buffer_idx++;
199 break;
200 }
201 if (!IsXMLNameChar(ch, current_text_.empty())) {
202 if (current_text_.empty()) {
203 if (node_type_stack.top() == CFX_XMLNode::Type::kElement) {
204 if (ch == L'>' || ch == L'/') {
205 current_parser_state = FDE_XmlSyntaxState::BreakElement;
206 break;
207 }
208 } else if (node_type_stack.top() ==
209 CFX_XMLNode::Type::kInstruction) {
210 if (ch == L'?') {
211 current_parser_state = FDE_XmlSyntaxState::CloseInstruction;
212 current_buffer_idx++;
213 } else {
214 current_parser_state = FDE_XmlSyntaxState::TargetData;
215 }
216 break;
217 }
218 return false;
219 } else {
220 if (node_type_stack.top() == CFX_XMLNode::Type::kInstruction) {
221 if (ch != '=' && !IsXMLWhiteSpace(ch)) {
222 current_parser_state = FDE_XmlSyntaxState::TargetData;
223 break;
224 }
225 }
226 current_parser_state = FDE_XmlSyntaxState::AttriEqualSign;
227 current_attribute_name = GetTextData();
228 }
229 } else {
230 current_text_.push_back(ch);
231 current_buffer_idx++;
232 }
233 break;
234 case FDE_XmlSyntaxState::AttriEqualSign:
235 if (IsXMLWhiteSpace(ch)) {
236 current_buffer_idx++;
237 break;
238 }
239 if (ch != L'=') {
240 if (node_type_stack.top() == CFX_XMLNode::Type::kInstruction) {
241 current_parser_state = FDE_XmlSyntaxState::TargetData;
242 break;
243 }
244 return false;
245 } else {
246 current_parser_state = FDE_XmlSyntaxState::AttriQuotation;
247 current_buffer_idx++;
248 }
249 break;
250 case FDE_XmlSyntaxState::AttriQuotation:
251 if (IsXMLWhiteSpace(ch)) {
252 current_buffer_idx++;
253 break;
254 }
255 if (ch != L'\"' && ch != L'\'') {
256 return false;
257 }
258
259 current_quote_character = ch;
260 current_parser_state = FDE_XmlSyntaxState::AttriValue;
261 current_buffer_idx++;
262 break;
263 case FDE_XmlSyntaxState::AttriValue:
264 if (ch == current_quote_character) {
265 if (entity_start_ > -1)
266 return false;
267
268 current_quote_character = 0;
269 current_buffer_idx++;
270 current_parser_state = FDE_XmlSyntaxState::AttriName;
271
272 CFX_XMLElement* elem = ToXMLElement(current_node_);
273 if (elem)
274 elem->SetAttribute(current_attribute_name, GetTextData());
275
276 current_attribute_name.clear();
277 } else {
278 ProcessTextChar(ch);
279 current_buffer_idx++;
280 }
281 break;
282 case FDE_XmlSyntaxState::CloseInstruction:
283 if (ch != L'>') {
284 current_text_.push_back(ch);
285 current_parser_state = FDE_XmlSyntaxState::TargetData;
286 } else if (!current_text_.empty()) {
287 ProcessTargetData();
288 } else {
289 current_buffer_idx++;
290 if (node_type_stack.empty())
291 return false;
292
293 node_type_stack.pop();
294 current_parser_state = FDE_XmlSyntaxState::Text;
295
296 if (current_node_ &&
297 current_node_->GetType() == CFX_XMLNode::Type::kInstruction)
298 current_node_ = current_node_->GetParent();
299 }
300 break;
301 case FDE_XmlSyntaxState::BreakElement:
302 if (ch == L'>') {
303 current_parser_state = FDE_XmlSyntaxState::Text;
304 } else if (ch == L'/') {
305 current_parser_state = FDE_XmlSyntaxState::CloseElement;
306 } else {
307 return false;
308 }
309 current_buffer_idx++;
310 break;
311 case FDE_XmlSyntaxState::CloseElement:
312 if (!IsXMLNameChar(ch, current_text_.empty())) {
313 if (ch == L'>') {
314 if (node_type_stack.empty())
315 return false;
316
317 node_type_stack.pop();
318 current_parser_state = FDE_XmlSyntaxState::Text;
319
320 CFX_XMLElement* element = ToXMLElement(current_node_);
321 if (!element)
322 return false;
323
324 WideString element_name = GetTextData();
325 if (element_name.GetLength() > 0 &&
326 element_name != element->GetName()) {
327 return false;
328 }
329
330 current_node_ = current_node_->GetParent();
331 iCount++;
332 } else if (!IsXMLWhiteSpace(ch)) {
333 return false;
334 }
335 } else {
336 current_text_.push_back(ch);
337 }
338 current_buffer_idx++;
339 break;
340 case FDE_XmlSyntaxState::SkipCommentOrDecl: {
341 auto current_span =
342 pdfium::make_span(buffer).subspan(current_buffer_idx);
343 if (FXSYS_wcsnicmp(current_span.data(), L"--", 2) == 0) {
344 current_buffer_idx += 2;
345 current_parser_state = FDE_XmlSyntaxState::SkipComment;
346 } else if (FXSYS_wcsnicmp(current_span.data(), L"[CDATA[", 7) == 0) {
347 current_buffer_idx += 7;
348 current_parser_state = FDE_XmlSyntaxState::SkipCData;
349 } else {
350 current_parser_state = FDE_XmlSyntaxState::SkipDeclNode;
351 current_character_to_skip_to = L'>';
352 character_to_skip_too_stack.push(L'>');
353 }
354 break;
355 }
356 case FDE_XmlSyntaxState::SkipCData: {
357 auto current_span =
358 pdfium::make_span(buffer).subspan(current_buffer_idx);
359 if (FXSYS_wcsnicmp(current_span.data(), L"]]>", 3) == 0) {
360 current_buffer_idx += 3;
361 current_parser_state = FDE_XmlSyntaxState::Text;
362 current_node_->AppendLastChild(
363 doc->CreateNode<CFX_XMLCharData>(GetTextData()));
364 } else {
365 current_text_.push_back(ch);
366 current_buffer_idx++;
367 }
368 break;
369 }
370 case FDE_XmlSyntaxState::SkipDeclNode:
371 if (current_character_to_skip_to == L'\'' ||
372 current_character_to_skip_to == L'\"') {
373 current_buffer_idx++;
374 if (ch != current_character_to_skip_to)
375 break;
376
377 character_to_skip_too_stack.pop();
378 if (character_to_skip_too_stack.empty())
379 current_parser_state = FDE_XmlSyntaxState::Text;
380 else
381 current_character_to_skip_to = character_to_skip_too_stack.top();
382 } else {
383 switch (ch) {
384 case L'<':
385 current_character_to_skip_to = L'>';
386 character_to_skip_too_stack.push(L'>');
387 break;
388 case L'[':
389 current_character_to_skip_to = L']';
390 character_to_skip_too_stack.push(L']');
391 break;
392 case L'(':
393 current_character_to_skip_to = L')';
394 character_to_skip_too_stack.push(L')');
395 break;
396 case L'\'':
397 current_character_to_skip_to = L'\'';
398 character_to_skip_too_stack.push(L'\'');
399 break;
400 case L'\"':
401 current_character_to_skip_to = L'\"';
402 character_to_skip_too_stack.push(L'\"');
403 break;
404 default:
405 if (ch == current_character_to_skip_to) {
406 character_to_skip_too_stack.pop();
407 if (character_to_skip_too_stack.empty()) {
408 current_parser_state = FDE_XmlSyntaxState::Text;
409 } else {
410 current_character_to_skip_to =
411 character_to_skip_too_stack.top();
412 }
413 }
414 break;
415 }
416 current_buffer_idx++;
417 }
418 break;
419 case FDE_XmlSyntaxState::SkipComment: {
420 auto current_span =
421 pdfium::make_span(buffer).subspan(current_buffer_idx);
422 if (FXSYS_wcsnicmp(current_span.data(), L"-->", 3) == 0) {
423 current_buffer_idx += 2;
424 current_parser_state = FDE_XmlSyntaxState::Text;
425 }
426 current_buffer_idx++;
427 break;
428 }
429 case FDE_XmlSyntaxState::TargetData:
430 if (IsXMLWhiteSpace(ch)) {
431 if (current_text_.empty()) {
432 current_buffer_idx++;
433 break;
434 }
435 if (current_quote_character == 0) {
436 current_buffer_idx++;
437 ProcessTargetData();
438 break;
439 }
440 }
441 if (ch == '?') {
442 current_parser_state = FDE_XmlSyntaxState::CloseInstruction;
443 current_buffer_idx++;
444 } else if (ch == '\"') {
445 if (current_quote_character == 0) {
446 current_quote_character = ch;
447 current_buffer_idx++;
448 } else if (ch == current_quote_character) {
449 current_quote_character = 0;
450 current_buffer_idx++;
451 ProcessTargetData();
452 } else {
453 return false;
454 }
455 } else {
456 current_text_.push_back(ch);
457 current_buffer_idx++;
458 }
459 break;
460 default:
461 break;
462 }
463 }
464 }
465
466 NOTREACHED();
467 return false;
468 }
469
ProcessTextChar(wchar_t character)470 void CFX_XMLParser::ProcessTextChar(wchar_t character) {
471 current_text_.push_back(character);
472
473 if (entity_start_ > -1 && character == L';') {
474 // Copy the entity out into a string and remove from the vector. When we
475 // copy the entity we don't want to copy out the & or the ; so we start
476 // shifted by one and want to copy 2 less characters in total.
477 WideString csEntity(current_text_.data() + entity_start_ + 1,
478 current_text_.size() - entity_start_ - 2);
479 current_text_.erase(current_text_.begin() + entity_start_,
480 current_text_.end());
481
482 int32_t iLen = csEntity.GetLength();
483 if (iLen > 0) {
484 if (csEntity[0] == L'#') {
485 uint32_t ch = 0;
486 if (iLen > 1 && csEntity[1] == L'x') {
487 for (int32_t i = 2; i < iLen; i++) {
488 if (!FXSYS_IsHexDigit(csEntity[i]))
489 break;
490 ch = (ch << 4) + FXSYS_HexCharToInt(csEntity[i]);
491 }
492 } else {
493 for (int32_t i = 1; i < iLen; i++) {
494 if (!FXSYS_IsDecimalDigit(csEntity[i]))
495 break;
496 ch = ch * 10 + FXSYS_DecimalCharToInt(csEntity[i]);
497 }
498 }
499 if (ch > kMaxCharRange)
500 ch = ' ';
501
502 character = static_cast<wchar_t>(ch);
503 if (character != 0)
504 current_text_.push_back(character);
505 } else {
506 if (csEntity.Compare(L"amp") == 0) {
507 current_text_.push_back(L'&');
508 } else if (csEntity.Compare(L"lt") == 0) {
509 current_text_.push_back(L'<');
510 } else if (csEntity.Compare(L"gt") == 0) {
511 current_text_.push_back(L'>');
512 } else if (csEntity.Compare(L"apos") == 0) {
513 current_text_.push_back(L'\'');
514 } else if (csEntity.Compare(L"quot") == 0) {
515 current_text_.push_back(L'"');
516 }
517 }
518 }
519
520 entity_start_ = -1;
521 } else if (entity_start_ < 0 && character == L'&') {
522 entity_start_ = current_text_.size() - 1;
523 }
524 }
525
ProcessTargetData()526 void CFX_XMLParser::ProcessTargetData() {
527 WideString target_data = GetTextData();
528 if (target_data.IsEmpty())
529 return;
530
531 CFX_XMLInstruction* instruction = ToXMLInstruction(current_node_);
532 if (instruction)
533 instruction->AppendData(target_data);
534 }
535
GetTextData()536 WideString CFX_XMLParser::GetTextData() {
537 WideString ret(current_text_.data(), current_text_.size());
538 entity_start_ = -1;
539 current_text_.clear();
540 current_text_.reserve(kCurrentTextReserve);
541 return ret;
542 }
543