• Home
  • Line#
  • Scopes#
  • Navigate#
  • Raw
  • Download
1 // Copyright 2016 The PDFium Authors
2 // Use of this source code is governed by a BSD-style license that can be
3 // found in the LICENSE file.
4 
5 // Original code copyright 2014 Foxit Software Inc. http://www.foxitsoftware.com
6 
7 #include "core/fxcrt/xml/cfx_xmlparser.h"
8 
9 #include <stdint.h>
10 
11 #include <algorithm>
12 #include <iterator>
13 #include <stack>
14 #include <utility>
15 
16 #include "core/fxcrt/cfx_seekablestreamproxy.h"
17 #include "core/fxcrt/data_vector.h"
18 #include "core/fxcrt/fx_codepage.h"
19 #include "core/fxcrt/fx_extension.h"
20 #include "core/fxcrt/fx_safe_types.h"
21 #include "core/fxcrt/xml/cfx_xmlchardata.h"
22 #include "core/fxcrt/xml/cfx_xmldocument.h"
23 #include "core/fxcrt/xml/cfx_xmlelement.h"
24 #include "core/fxcrt/xml/cfx_xmlinstruction.h"
25 #include "core/fxcrt/xml/cfx_xmlnode.h"
26 #include "core/fxcrt/xml/cfx_xmltext.h"
27 #include "third_party/base/check.h"
28 #include "third_party/base/notreached.h"
29 
namespace {

// Capacity reserved for the parser's pending-text buffer whenever it is
// (re)initialised.
constexpr size_t kCurrentTextReserve = 128;

// Largest value accepted for a numeric character reference (the highest
// Unicode code point).
constexpr uint32_t kMaxCharRange = 0x10ffff;

// Returns true for the four characters treated as XML white space here:
// space, tab, line feed and carriage return.
bool IsXMLWhiteSpace(wchar_t ch) {
  switch (ch) {
    case L' ':
    case L'\t':
    case L'\n':
    case L'\r':
      return true;
    default:
      return false;
  }
}

// One inclusive range [wStart, wEnd] of characters allowed in an XML name.
struct FX_XMLNAMECHAR {
  uint16_t wStart;
  uint16_t wEnd;
  // True when characters in the range may also be the first character of a
  // name.
  bool bStartChar;
};

// Name-character ranges. The entries are non-overlapping and ordered by
// ascending |wEnd|; CFX_XMLParser::IsXMLNameChar() binary-searches this table
// and relies on that ordering.
constexpr FX_XMLNAMECHAR kXMLNameChars[] = {
    {L'-', L'.', false},
    {L'0', L'9', false},
    {L':', L':', false},
    {L'A', L'Z', true},
    {L'_', L'_', true},
    {L'a', L'z', true},
    {0xB7, 0xB7, false},
    {0xC0, 0xD6, true},
    {0xD8, 0xF6, true},
    {0xF8, 0x02FF, true},
    {0x0300, 0x036F, false},
    {0x0370, 0x037D, true},
    {0x037F, 0x1FFF, true},
    {0x200C, 0x200D, true},
    {0x203F, 0x2040, false},
    {0x2070, 0x218F, true},
    {0x2C00, 0x2FEF, true},
    {0x3001, 0xD7FF, true},
    {0xF900, 0xFDCF, true},
    {0xFDF0, 0xFFFD, true},
};

}  // namespace
56 
57 // static
IsXMLNameChar(wchar_t ch,bool bFirstChar)58 bool CFX_XMLParser::IsXMLNameChar(wchar_t ch, bool bFirstChar) {
59   auto* it = std::lower_bound(
60       std::begin(kXMLNameChars), std::end(kXMLNameChars), ch,
61       [](const FX_XMLNAMECHAR& arg, wchar_t ch) { return arg.wEnd < ch; });
62   return it != std::end(kXMLNameChars) && ch >= it->wStart &&
63          (!bFirstChar || it->bStartChar);
64 }
65 
CFX_XMLParser(const RetainPtr<IFX_SeekableReadStream> & pStream)66 CFX_XMLParser::CFX_XMLParser(const RetainPtr<IFX_SeekableReadStream>& pStream) {
67   DCHECK(pStream);
68 
69   auto proxy = pdfium::MakeRetain<CFX_SeekableStreamProxy>(pStream);
70   FX_CodePage wCodePage = proxy->GetCodePage();
71   if (wCodePage != FX_CodePage::kUTF16LE &&
72       wCodePage != FX_CodePage::kUTF16BE && wCodePage != FX_CodePage::kUTF8) {
73     proxy->SetCodePage(FX_CodePage::kUTF8);
74   }
75   stream_ = proxy;
76 
77   xml_plane_size_ = std::min(
78       xml_plane_size_, pdfium::base::checked_cast<size_t>(stream_->GetSize()));
79 
80   current_text_.reserve(kCurrentTextReserve);
81 }
82 
83 CFX_XMLParser::~CFX_XMLParser() = default;
84 
Parse()85 std::unique_ptr<CFX_XMLDocument> CFX_XMLParser::Parse() {
86   auto doc = std::make_unique<CFX_XMLDocument>();
87   current_node_ = doc->GetRoot();
88 
89   return DoSyntaxParse(doc.get()) ? std::move(doc) : nullptr;
90 }
91 
DoSyntaxParse(CFX_XMLDocument * doc)92 bool CFX_XMLParser::DoSyntaxParse(CFX_XMLDocument* doc) {
93   if (xml_plane_size_ <= 0)
94     return false;
95 
96   FX_SAFE_SIZE_T alloc_size_safe = xml_plane_size_;
97   alloc_size_safe += 1;  // For NUL.
98   if (!alloc_size_safe.IsValid())
99     return false;
100 
101   size_t current_buffer_idx = 0;
102   size_t buffer_size = 0;
103 
104   DataVector<wchar_t> buffer;
105   buffer.resize(alloc_size_safe.ValueOrDie());
106 
107   std::stack<wchar_t> character_to_skip_too_stack;
108   std::stack<CFX_XMLNode::Type> node_type_stack;
109   WideString current_attribute_name;
110   FDE_XmlSyntaxState current_parser_state = FDE_XmlSyntaxState::Text;
111   wchar_t current_quote_character = 0;
112   wchar_t current_character_to_skip_to = 0;
113 
114   while (true) {
115     if (current_buffer_idx >= buffer_size) {
116       if (stream_->IsEOF())
117         return true;
118 
119       size_t buffer_chars = stream_->ReadBlock(buffer.data(), xml_plane_size_);
120       if (buffer_chars == 0)
121         return true;
122 
123       current_buffer_idx = 0;
124       buffer_size = buffer_chars;
125     }
126 
127     while (current_buffer_idx < buffer_size) {
128       wchar_t ch = buffer[current_buffer_idx];
129       switch (current_parser_state) {
130         case FDE_XmlSyntaxState::Text:
131           if (ch == L'<') {
132             if (!current_text_.empty()) {
133               current_node_->AppendLastChild(
134                   doc->CreateNode<CFX_XMLText>(GetTextData()));
135             } else {
136               current_buffer_idx++;
137               current_parser_state = FDE_XmlSyntaxState::Node;
138             }
139           } else {
140             // Fail if there is text outside of the root element, ignore
141             // whitespace/null.
142             if (node_type_stack.empty() && ch && !FXSYS_iswspace(ch))
143               return false;
144             ProcessTextChar(ch);
145             current_buffer_idx++;
146           }
147           break;
148         case FDE_XmlSyntaxState::Node:
149           if (ch == L'!') {
150             current_buffer_idx++;
151             current_parser_state = FDE_XmlSyntaxState::SkipCommentOrDecl;
152           } else if (ch == L'/') {
153             current_buffer_idx++;
154             current_parser_state = FDE_XmlSyntaxState::CloseElement;
155           } else if (ch == L'?') {
156             node_type_stack.push(CFX_XMLNode::Type::kInstruction);
157             current_buffer_idx++;
158             current_parser_state = FDE_XmlSyntaxState::Target;
159           } else {
160             node_type_stack.push(CFX_XMLNode::Type::kElement);
161             current_parser_state = FDE_XmlSyntaxState::Tag;
162           }
163           break;
164         case FDE_XmlSyntaxState::Target:
165           if (!IsXMLNameChar(ch, current_text_.empty())) {
166             if (current_text_.empty())
167               return false;
168 
169             current_parser_state = FDE_XmlSyntaxState::TargetData;
170 
171             WideString target_name = GetTextData();
172             if (target_name.EqualsASCII("originalXFAVersion") ||
173                 target_name.EqualsASCII("acrobat")) {
174               auto* node = doc->CreateNode<CFX_XMLInstruction>(target_name);
175               current_node_->AppendLastChild(node);
176               current_node_ = node;
177             }
178           } else {
179             current_text_.push_back(ch);
180             current_buffer_idx++;
181           }
182           break;
183         case FDE_XmlSyntaxState::Tag:
184           if (!IsXMLNameChar(ch, current_text_.empty())) {
185             if (current_text_.empty())
186               return false;
187 
188             current_parser_state = FDE_XmlSyntaxState::AttriName;
189 
190             auto* child = doc->CreateNode<CFX_XMLElement>(GetTextData());
191             current_node_->AppendLastChild(child);
192             current_node_ = child;
193           } else {
194             current_text_.push_back(ch);
195             current_buffer_idx++;
196           }
197           break;
198         case FDE_XmlSyntaxState::AttriName:
199           if (current_text_.empty() && IsXMLWhiteSpace(ch)) {
200             current_buffer_idx++;
201             break;
202           }
203           if (!IsXMLNameChar(ch, current_text_.empty())) {
204             if (current_text_.empty()) {
205               if (node_type_stack.top() == CFX_XMLNode::Type::kElement) {
206                 if (ch == L'>' || ch == L'/') {
207                   current_parser_state = FDE_XmlSyntaxState::BreakElement;
208                   break;
209                 }
210               } else if (node_type_stack.top() ==
211                          CFX_XMLNode::Type::kInstruction) {
212                 if (ch == L'?') {
213                   current_parser_state = FDE_XmlSyntaxState::CloseInstruction;
214                   current_buffer_idx++;
215                 } else {
216                   current_parser_state = FDE_XmlSyntaxState::TargetData;
217                 }
218                 break;
219               }
220               return false;
221             } else {
222               if (node_type_stack.top() == CFX_XMLNode::Type::kInstruction) {
223                 if (ch != '=' && !IsXMLWhiteSpace(ch)) {
224                   current_parser_state = FDE_XmlSyntaxState::TargetData;
225                   break;
226                 }
227               }
228               current_parser_state = FDE_XmlSyntaxState::AttriEqualSign;
229               current_attribute_name = GetTextData();
230             }
231           } else {
232             current_text_.push_back(ch);
233             current_buffer_idx++;
234           }
235           break;
236         case FDE_XmlSyntaxState::AttriEqualSign:
237           if (IsXMLWhiteSpace(ch)) {
238             current_buffer_idx++;
239             break;
240           }
241           if (ch != L'=') {
242             if (node_type_stack.top() == CFX_XMLNode::Type::kInstruction) {
243               current_parser_state = FDE_XmlSyntaxState::TargetData;
244               break;
245             }
246             return false;
247           } else {
248             current_parser_state = FDE_XmlSyntaxState::AttriQuotation;
249             current_buffer_idx++;
250           }
251           break;
252         case FDE_XmlSyntaxState::AttriQuotation:
253           if (IsXMLWhiteSpace(ch)) {
254             current_buffer_idx++;
255             break;
256           }
257           if (ch != L'\"' && ch != L'\'') {
258             return false;
259           }
260 
261           current_quote_character = ch;
262           current_parser_state = FDE_XmlSyntaxState::AttriValue;
263           current_buffer_idx++;
264           break;
265         case FDE_XmlSyntaxState::AttriValue:
266           if (ch == current_quote_character) {
267             if (entity_start_.has_value())
268               return false;
269 
270             current_quote_character = 0;
271             current_buffer_idx++;
272             current_parser_state = FDE_XmlSyntaxState::AttriName;
273 
274             CFX_XMLElement* elem = ToXMLElement(current_node_);
275             if (elem)
276               elem->SetAttribute(current_attribute_name, GetTextData());
277 
278             current_attribute_name.clear();
279           } else {
280             ProcessTextChar(ch);
281             current_buffer_idx++;
282           }
283           break;
284         case FDE_XmlSyntaxState::CloseInstruction:
285           if (ch != L'>') {
286             current_text_.push_back(ch);
287             current_parser_state = FDE_XmlSyntaxState::TargetData;
288           } else if (!current_text_.empty()) {
289             ProcessTargetData();
290           } else {
291             current_buffer_idx++;
292             if (node_type_stack.empty())
293               return false;
294 
295             node_type_stack.pop();
296             current_parser_state = FDE_XmlSyntaxState::Text;
297 
298             if (current_node_ &&
299                 current_node_->GetType() == CFX_XMLNode::Type::kInstruction)
300               current_node_ = current_node_->GetParent();
301           }
302           break;
303         case FDE_XmlSyntaxState::BreakElement:
304           if (ch == L'>') {
305             current_parser_state = FDE_XmlSyntaxState::Text;
306           } else if (ch == L'/') {
307             current_parser_state = FDE_XmlSyntaxState::CloseElement;
308           } else {
309             return false;
310           }
311           current_buffer_idx++;
312           break;
313         case FDE_XmlSyntaxState::CloseElement:
314           if (!IsXMLNameChar(ch, current_text_.empty())) {
315             if (ch == L'>') {
316               if (node_type_stack.empty())
317                 return false;
318 
319               node_type_stack.pop();
320               current_parser_state = FDE_XmlSyntaxState::Text;
321 
322               CFX_XMLElement* element = ToXMLElement(current_node_);
323               if (!element)
324                 return false;
325 
326               WideString element_name = GetTextData();
327               if (element_name.GetLength() > 0 &&
328                   element_name != element->GetName()) {
329                 return false;
330               }
331 
332               current_node_ = current_node_->GetParent();
333             } else if (!IsXMLWhiteSpace(ch)) {
334               return false;
335             }
336           } else {
337             current_text_.push_back(ch);
338           }
339           current_buffer_idx++;
340           break;
341         case FDE_XmlSyntaxState::SkipCommentOrDecl: {
342           auto current_span =
343               pdfium::make_span(buffer).subspan(current_buffer_idx);
344           if (FXSYS_wcsnicmp(current_span.data(), L"--", 2) == 0) {
345             current_buffer_idx += 2;
346             current_parser_state = FDE_XmlSyntaxState::SkipComment;
347           } else if (FXSYS_wcsnicmp(current_span.data(), L"[CDATA[", 7) == 0) {
348             current_buffer_idx += 7;
349             current_parser_state = FDE_XmlSyntaxState::SkipCData;
350           } else {
351             current_parser_state = FDE_XmlSyntaxState::SkipDeclNode;
352             current_character_to_skip_to = L'>';
353             character_to_skip_too_stack.push(L'>');
354           }
355           break;
356         }
357         case FDE_XmlSyntaxState::SkipCData: {
358           auto current_span =
359               pdfium::make_span(buffer).subspan(current_buffer_idx);
360           if (FXSYS_wcsnicmp(current_span.data(), L"]]>", 3) == 0) {
361             current_buffer_idx += 3;
362             current_parser_state = FDE_XmlSyntaxState::Text;
363             current_node_->AppendLastChild(
364                 doc->CreateNode<CFX_XMLCharData>(GetTextData()));
365           } else {
366             current_text_.push_back(ch);
367             current_buffer_idx++;
368           }
369           break;
370         }
371         case FDE_XmlSyntaxState::SkipDeclNode:
372           if (current_character_to_skip_to == L'\'' ||
373               current_character_to_skip_to == L'\"') {
374             current_buffer_idx++;
375             if (ch != current_character_to_skip_to)
376               break;
377 
378             character_to_skip_too_stack.pop();
379             if (character_to_skip_too_stack.empty())
380               current_parser_state = FDE_XmlSyntaxState::Text;
381             else
382               current_character_to_skip_to = character_to_skip_too_stack.top();
383           } else {
384             switch (ch) {
385               case L'<':
386                 current_character_to_skip_to = L'>';
387                 character_to_skip_too_stack.push(L'>');
388                 break;
389               case L'[':
390                 current_character_to_skip_to = L']';
391                 character_to_skip_too_stack.push(L']');
392                 break;
393               case L'(':
394                 current_character_to_skip_to = L')';
395                 character_to_skip_too_stack.push(L')');
396                 break;
397               case L'\'':
398                 current_character_to_skip_to = L'\'';
399                 character_to_skip_too_stack.push(L'\'');
400                 break;
401               case L'\"':
402                 current_character_to_skip_to = L'\"';
403                 character_to_skip_too_stack.push(L'\"');
404                 break;
405               default:
406                 if (ch == current_character_to_skip_to) {
407                   character_to_skip_too_stack.pop();
408                   if (character_to_skip_too_stack.empty()) {
409                     current_parser_state = FDE_XmlSyntaxState::Text;
410                   } else {
411                     current_character_to_skip_to =
412                         character_to_skip_too_stack.top();
413                   }
414                 }
415                 break;
416             }
417             current_buffer_idx++;
418           }
419           break;
420         case FDE_XmlSyntaxState::SkipComment: {
421           auto current_span =
422               pdfium::make_span(buffer).subspan(current_buffer_idx);
423           if (FXSYS_wcsnicmp(current_span.data(), L"-->", 3) == 0) {
424             current_buffer_idx += 2;
425             current_parser_state = FDE_XmlSyntaxState::Text;
426           }
427           current_buffer_idx++;
428           break;
429         }
430         case FDE_XmlSyntaxState::TargetData:
431           if (IsXMLWhiteSpace(ch)) {
432             if (current_text_.empty()) {
433               current_buffer_idx++;
434               break;
435             }
436             if (current_quote_character == 0) {
437               current_buffer_idx++;
438               ProcessTargetData();
439               break;
440             }
441           }
442           if (ch == '?') {
443             current_parser_state = FDE_XmlSyntaxState::CloseInstruction;
444             current_buffer_idx++;
445           } else if (ch == '\"') {
446             if (current_quote_character == 0) {
447               current_quote_character = ch;
448               current_buffer_idx++;
449             } else if (ch == current_quote_character) {
450               current_quote_character = 0;
451               current_buffer_idx++;
452               ProcessTargetData();
453             } else {
454               return false;
455             }
456           } else {
457             current_text_.push_back(ch);
458             current_buffer_idx++;
459           }
460           break;
461         default:
462           break;
463       }
464     }
465   }
466 
467   NOTREACHED();
468   return false;
469 }
470 
ProcessTextChar(wchar_t character)471 void CFX_XMLParser::ProcessTextChar(wchar_t character) {
472   current_text_.push_back(character);
473 
474   if (entity_start_.has_value() && character == L';') {
475     // Copy the entity out into a string and remove from the vector. When we
476     // copy the entity we don't want to copy out the & or the ; so we start
477     // shifted by one and want to copy 2 less characters in total.
478     WideString csEntity(current_text_.data() + entity_start_.value() + 1,
479                         current_text_.size() - entity_start_.value() - 2);
480     current_text_.erase(current_text_.begin() + entity_start_.value(),
481                         current_text_.end());
482 
483     size_t iLen = csEntity.GetLength();
484     if (iLen > 0) {
485       if (csEntity[0] == L'#') {
486         uint32_t ch = 0;
487         if (iLen > 1 && csEntity[1] == L'x') {
488           for (size_t i = 2; i < iLen; i++) {
489             if (!FXSYS_IsHexDigit(csEntity[i]))
490               break;
491             ch = (ch << 4) + FXSYS_HexCharToInt(csEntity[i]);
492           }
493         } else {
494           for (size_t i = 1; i < iLen; i++) {
495             if (!FXSYS_IsDecimalDigit(csEntity[i]))
496               break;
497             ch = ch * 10 + FXSYS_DecimalCharToInt(csEntity[i]);
498           }
499         }
500         if (ch > kMaxCharRange)
501           ch = ' ';
502 
503         character = static_cast<wchar_t>(ch);
504         if (character != 0)
505           current_text_.push_back(character);
506       } else {
507         if (csEntity == L"amp") {
508           current_text_.push_back(L'&');
509         } else if (csEntity == L"lt") {
510           current_text_.push_back(L'<');
511         } else if (csEntity == L"gt") {
512           current_text_.push_back(L'>');
513         } else if (csEntity == L"apos") {
514           current_text_.push_back(L'\'');
515         } else if (csEntity == L"quot") {
516           current_text_.push_back(L'"');
517         }
518       }
519     }
520     entity_start_ = absl::nullopt;
521   } else if (!entity_start_.has_value() && character == L'&') {
522     entity_start_ = current_text_.size() - 1;
523   }
524 }
525 
ProcessTargetData()526 void CFX_XMLParser::ProcessTargetData() {
527   WideString target_data = GetTextData();
528   if (target_data.IsEmpty())
529     return;
530 
531   CFX_XMLInstruction* instruction = ToXMLInstruction(current_node_);
532   if (instruction)
533     instruction->AppendData(target_data);
534 }
535 
GetTextData()536 WideString CFX_XMLParser::GetTextData() {
537   WideString ret(current_text_.data(), current_text_.size());
538   entity_start_ = absl::nullopt;
539   current_text_.clear();
540   current_text_.reserve(kCurrentTextReserve);
541   return ret;
542 }
543