• Home
  • Line#
  • Scopes#
  • Navigate#
  • Raw
  • Download
1 // Copyright 2016 The PDFium Authors
2 // Use of this source code is governed by a BSD-style license that can be
3 // found in the LICENSE file.
4 
5 // Original code copyright 2014 Foxit Software Inc. http://www.foxitsoftware.com
6 
7 #include "core/fxcrt/xml/cfx_xmlparser.h"
8 
9 #include <stdint.h>
10 
11 #include <algorithm>
12 #include <iterator>
13 #include <stack>
14 #include <utility>
15 
16 #include "core/fxcrt/autorestorer.h"
17 #include "core/fxcrt/cfx_seekablestreamproxy.h"
18 #include "core/fxcrt/check.h"
19 #include "core/fxcrt/data_vector.h"
20 #include "core/fxcrt/fx_codepage.h"
21 #include "core/fxcrt/fx_extension.h"
22 #include "core/fxcrt/fx_safe_types.h"
23 #include "core/fxcrt/notreached.h"
24 #include "core/fxcrt/xml/cfx_xmlchardata.h"
25 #include "core/fxcrt/xml/cfx_xmldocument.h"
26 #include "core/fxcrt/xml/cfx_xmlelement.h"
27 #include "core/fxcrt/xml/cfx_xmlinstruction.h"
28 #include "core/fxcrt/xml/cfx_xmlnode.h"
29 #include "core/fxcrt/xml/cfx_xmltext.h"
30 
31 namespace {
32 
// Capacity (in characters) reserved for the pending-text accumulator each time
// it is (re)initialized.
constexpr size_t kCurrentTextReserve = 128;
// Largest value accepted from a numeric character reference; larger values are
// replaced by a space in ProcessTextChar().
constexpr uint32_t kMaxCharRange = 0x10ffff;
35 
// Returns true when `ch` is one of the four characters this parser treats as
// whitespace: space, horizontal tab, line feed or carriage return.
bool IsXMLWhiteSpace(wchar_t ch) {
  switch (ch) {
    case L' ':
    case 0x09:
    case 0x0A:
    case 0x0D:
      return true;
    default:
      return false;
  }
}
39 
// One inclusive range of characters that may appear in a name, as consumed by
// CFX_XMLParser::IsXMLNameChar().
struct FX_XMLNAMECHAR {
  uint16_t wStart;  // First character of the range (inclusive).
  uint16_t wEnd;    // Last character of the range (inclusive).
  bool bStartChar;  // True if characters in the range may also begin a name.
};
45 
46 constexpr FX_XMLNAMECHAR kXMLNameChars[] = {
47     {L'-', L'.', false},    {L'0', L'9', false},     {L':', L':', false},
48     {L'A', L'Z', true},     {L'_', L'_', true},      {L'a', L'z', true},
49     {0xB7, 0xB7, false},    {0xC0, 0xD6, true},      {0xD8, 0xF6, true},
50     {0xF8, 0x02FF, true},   {0x0300, 0x036F, false}, {0x0370, 0x037D, true},
51     {0x037F, 0x1FFF, true}, {0x200C, 0x200D, true},  {0x203F, 0x2040, false},
52     {0x2070, 0x218F, true}, {0x2C00, 0x2FEF, true},  {0x3001, 0xD7FF, true},
53     {0xF900, 0xFDCF, true}, {0xFDF0, 0xFFFD, true},
54 };
55 
56 }  // namespace
57 
58 // static
IsXMLNameChar(wchar_t ch,bool bFirstChar)59 bool CFX_XMLParser::IsXMLNameChar(wchar_t ch, bool bFirstChar) {
60   auto* it = std::lower_bound(
61       std::begin(kXMLNameChars), std::end(kXMLNameChars), ch,
62       [](const FX_XMLNAMECHAR& arg, wchar_t ch) { return arg.wEnd < ch; });
63   return it != std::end(kXMLNameChars) && ch >= it->wStart &&
64          (!bFirstChar || it->bStartChar);
65 }
66 
CFX_XMLParser(const RetainPtr<IFX_SeekableReadStream> & pStream)67 CFX_XMLParser::CFX_XMLParser(const RetainPtr<IFX_SeekableReadStream>& pStream) {
68   DCHECK(pStream);
69 
70   auto proxy = pdfium::MakeRetain<CFX_SeekableStreamProxy>(pStream);
71   FX_CodePage wCodePage = proxy->GetCodePage();
72   if (wCodePage != FX_CodePage::kUTF16LE &&
73       wCodePage != FX_CodePage::kUTF16BE && wCodePage != FX_CodePage::kUTF8) {
74     proxy->SetCodePage(FX_CodePage::kUTF8);
75   }
76   stream_ = proxy;
77 
78   xml_plane_size_ = std::min(xml_plane_size_,
79                              pdfium::checked_cast<size_t>(stream_->GetSize()));
80 
81   current_text_.Reserve(kCurrentTextReserve);
82 }
83 
// Compiler-generated destructor, defined out of line in this file.
CFX_XMLParser::~CFX_XMLParser() = default;
85 
Parse()86 std::unique_ptr<CFX_XMLDocument> CFX_XMLParser::Parse() {
87   auto doc = std::make_unique<CFX_XMLDocument>();
88   AutoRestorer<UnownedPtr<CFX_XMLNode>> restorer(&current_node_);
89   current_node_ = doc->GetRoot();
90   if (!DoSyntaxParse(doc.get())) {
91     return nullptr;
92   }
93   return doc;
94 }
95 
DoSyntaxParse(CFX_XMLDocument * doc)96 bool CFX_XMLParser::DoSyntaxParse(CFX_XMLDocument* doc) {
97   if (xml_plane_size_ <= 0)
98     return false;
99 
100   FX_SAFE_SIZE_T alloc_size_safe = xml_plane_size_;
101   alloc_size_safe += 1;  // For NUL.
102   if (!alloc_size_safe.IsValid())
103     return false;
104 
105   size_t current_buffer_idx = 0;
106   size_t buffer_size = 0;
107 
108   DataVector<wchar_t> buffer;
109   buffer.resize(alloc_size_safe.ValueOrDie());
110 
111   std::stack<wchar_t> character_to_skip_too_stack;
112   std::stack<CFX_XMLNode::Type> node_type_stack;
113   WideString current_attribute_name;
114   FDE_XmlSyntaxState current_parser_state = FDE_XmlSyntaxState::Text;
115   wchar_t current_quote_character = 0;
116   wchar_t current_character_to_skip_to = 0;
117 
118   while (true) {
119     if (current_buffer_idx >= buffer_size) {
120       if (stream_->IsEOF())
121         return true;
122 
123       size_t buffer_chars =
124           stream_->ReadBlock(pdfium::make_span(buffer).first(xml_plane_size_));
125       if (buffer_chars == 0)
126         return true;
127 
128       current_buffer_idx = 0;
129       buffer_size = buffer_chars;
130     }
131 
132     while (current_buffer_idx < buffer_size) {
133       wchar_t ch = buffer[current_buffer_idx];
134       switch (current_parser_state) {
135         case FDE_XmlSyntaxState::Text:
136           if (ch == L'<') {
137             if (!current_text_.IsEmpty()) {
138               current_node_->AppendLastChild(
139                   doc->CreateNode<CFX_XMLText>(GetTextData()));
140             } else {
141               current_buffer_idx++;
142               current_parser_state = FDE_XmlSyntaxState::Node;
143             }
144           } else {
145             // Fail if there is text outside of the root element, ignore
146             // whitespace/null.
147             if (node_type_stack.empty() && ch && !FXSYS_iswspace(ch))
148               return false;
149             ProcessTextChar(ch);
150             current_buffer_idx++;
151           }
152           break;
153         case FDE_XmlSyntaxState::Node:
154           if (ch == L'!') {
155             current_buffer_idx++;
156             current_parser_state = FDE_XmlSyntaxState::SkipCommentOrDecl;
157           } else if (ch == L'/') {
158             current_buffer_idx++;
159             current_parser_state = FDE_XmlSyntaxState::CloseElement;
160           } else if (ch == L'?') {
161             node_type_stack.push(CFX_XMLNode::Type::kInstruction);
162             current_buffer_idx++;
163             current_parser_state = FDE_XmlSyntaxState::Target;
164           } else {
165             node_type_stack.push(CFX_XMLNode::Type::kElement);
166             current_parser_state = FDE_XmlSyntaxState::Tag;
167           }
168           break;
169         case FDE_XmlSyntaxState::Target:
170           if (!IsXMLNameChar(ch, current_text_.IsEmpty())) {
171             if (current_text_.IsEmpty()) {
172               return false;
173             }
174 
175             current_parser_state = FDE_XmlSyntaxState::TargetData;
176 
177             WideString target_name = GetTextData();
178             if (target_name.EqualsASCII("originalXFAVersion") ||
179                 target_name.EqualsASCII("acrobat")) {
180               auto* node = doc->CreateNode<CFX_XMLInstruction>(target_name);
181               current_node_->AppendLastChild(node);
182               current_node_ = node;
183             }
184           } else {
185             current_text_ += ch;
186             current_buffer_idx++;
187           }
188           break;
189         case FDE_XmlSyntaxState::Tag:
190           if (!IsXMLNameChar(ch, current_text_.IsEmpty())) {
191             if (current_text_.IsEmpty()) {
192               return false;
193             }
194 
195             current_parser_state = FDE_XmlSyntaxState::AttriName;
196 
197             auto* child = doc->CreateNode<CFX_XMLElement>(GetTextData());
198             current_node_->AppendLastChild(child);
199             current_node_ = child;
200           } else {
201             current_text_ += ch;
202             current_buffer_idx++;
203           }
204           break;
205         case FDE_XmlSyntaxState::AttriName:
206           if (current_text_.IsEmpty() && IsXMLWhiteSpace(ch)) {
207             current_buffer_idx++;
208             break;
209           }
210           if (!IsXMLNameChar(ch, current_text_.IsEmpty())) {
211             if (current_text_.IsEmpty()) {
212               if (node_type_stack.top() == CFX_XMLNode::Type::kElement) {
213                 if (ch == L'>' || ch == L'/') {
214                   current_parser_state = FDE_XmlSyntaxState::BreakElement;
215                   break;
216                 }
217               } else if (node_type_stack.top() ==
218                          CFX_XMLNode::Type::kInstruction) {
219                 if (ch == L'?') {
220                   current_parser_state = FDE_XmlSyntaxState::CloseInstruction;
221                   current_buffer_idx++;
222                 } else {
223                   current_parser_state = FDE_XmlSyntaxState::TargetData;
224                 }
225                 break;
226               }
227               return false;
228             } else {
229               if (node_type_stack.top() == CFX_XMLNode::Type::kInstruction) {
230                 if (ch != '=' && !IsXMLWhiteSpace(ch)) {
231                   current_parser_state = FDE_XmlSyntaxState::TargetData;
232                   break;
233                 }
234               }
235               current_parser_state = FDE_XmlSyntaxState::AttriEqualSign;
236               current_attribute_name = GetTextData();
237             }
238           } else {
239             current_text_ += ch;
240             current_buffer_idx++;
241           }
242           break;
243         case FDE_XmlSyntaxState::AttriEqualSign:
244           if (IsXMLWhiteSpace(ch)) {
245             current_buffer_idx++;
246             break;
247           }
248           if (ch != L'=') {
249             if (node_type_stack.top() == CFX_XMLNode::Type::kInstruction) {
250               current_parser_state = FDE_XmlSyntaxState::TargetData;
251               break;
252             }
253             return false;
254           } else {
255             current_parser_state = FDE_XmlSyntaxState::AttriQuotation;
256             current_buffer_idx++;
257           }
258           break;
259         case FDE_XmlSyntaxState::AttriQuotation:
260           if (IsXMLWhiteSpace(ch)) {
261             current_buffer_idx++;
262             break;
263           }
264           if (ch != L'\"' && ch != L'\'') {
265             return false;
266           }
267 
268           current_quote_character = ch;
269           current_parser_state = FDE_XmlSyntaxState::AttriValue;
270           current_buffer_idx++;
271           break;
272         case FDE_XmlSyntaxState::AttriValue:
273           if (ch == current_quote_character) {
274             if (entity_start_.has_value())
275               return false;
276 
277             current_quote_character = 0;
278             current_buffer_idx++;
279             current_parser_state = FDE_XmlSyntaxState::AttriName;
280 
281             CFX_XMLElement* elem = ToXMLElement(current_node_);
282             if (elem)
283               elem->SetAttribute(current_attribute_name, GetTextData());
284 
285             current_attribute_name.clear();
286           } else {
287             ProcessTextChar(ch);
288             current_buffer_idx++;
289           }
290           break;
291         case FDE_XmlSyntaxState::CloseInstruction:
292           if (ch != L'>') {
293             current_text_ += ch;
294             current_parser_state = FDE_XmlSyntaxState::TargetData;
295           } else if (!current_text_.IsEmpty()) {
296             ProcessTargetData();
297           } else {
298             current_buffer_idx++;
299             if (node_type_stack.empty())
300               return false;
301 
302             node_type_stack.pop();
303             current_parser_state = FDE_XmlSyntaxState::Text;
304 
305             if (current_node_ &&
306                 current_node_->GetType() == CFX_XMLNode::Type::kInstruction)
307               current_node_ = current_node_->GetParent();
308           }
309           break;
310         case FDE_XmlSyntaxState::BreakElement:
311           if (ch == L'>') {
312             current_parser_state = FDE_XmlSyntaxState::Text;
313           } else if (ch == L'/') {
314             current_parser_state = FDE_XmlSyntaxState::CloseElement;
315           } else {
316             return false;
317           }
318           current_buffer_idx++;
319           break;
320         case FDE_XmlSyntaxState::CloseElement:
321           if (!IsXMLNameChar(ch, current_text_.IsEmpty())) {
322             if (ch == L'>') {
323               if (node_type_stack.empty())
324                 return false;
325 
326               node_type_stack.pop();
327               current_parser_state = FDE_XmlSyntaxState::Text;
328 
329               CFX_XMLElement* element = ToXMLElement(current_node_);
330               if (!element)
331                 return false;
332 
333               WideString element_name = GetTextData();
334               if (element_name.GetLength() > 0 &&
335                   element_name != element->GetName()) {
336                 return false;
337               }
338 
339               current_node_ = current_node_->GetParent();
340             } else if (!IsXMLWhiteSpace(ch)) {
341               return false;
342             }
343           } else {
344             current_text_ += ch;
345           }
346           current_buffer_idx++;
347           break;
348         case FDE_XmlSyntaxState::SkipCommentOrDecl: {
349           auto current_view = WideStringView(
350               pdfium::make_span(buffer).subspan(current_buffer_idx));
351           if (current_view.First(2).EqualsASCII("--")) {
352             current_buffer_idx += 2;
353             current_parser_state = FDE_XmlSyntaxState::SkipComment;
354           } else if (current_view.First(7).EqualsASCIINoCase("[CDATA[")) {
355             current_buffer_idx += 7;
356             current_parser_state = FDE_XmlSyntaxState::SkipCData;
357           } else {
358             current_parser_state = FDE_XmlSyntaxState::SkipDeclNode;
359             current_character_to_skip_to = L'>';
360             character_to_skip_too_stack.push(L'>');
361           }
362           break;
363         }
364         case FDE_XmlSyntaxState::SkipCData: {
365           auto current_view = WideStringView(
366               pdfium::make_span(buffer).subspan(current_buffer_idx));
367           if (current_view.First(3).EqualsASCII("]]>")) {
368             current_buffer_idx += 3;
369             current_parser_state = FDE_XmlSyntaxState::Text;
370             current_node_->AppendLastChild(
371                 doc->CreateNode<CFX_XMLCharData>(GetTextData()));
372           } else {
373             current_text_ += ch;
374             current_buffer_idx++;
375           }
376           break;
377         }
378         case FDE_XmlSyntaxState::SkipDeclNode:
379           if (current_character_to_skip_to == L'\'' ||
380               current_character_to_skip_to == L'\"') {
381             current_buffer_idx++;
382             if (ch != current_character_to_skip_to)
383               break;
384 
385             character_to_skip_too_stack.pop();
386             if (character_to_skip_too_stack.empty())
387               current_parser_state = FDE_XmlSyntaxState::Text;
388             else
389               current_character_to_skip_to = character_to_skip_too_stack.top();
390           } else {
391             switch (ch) {
392               case L'<':
393                 current_character_to_skip_to = L'>';
394                 character_to_skip_too_stack.push(L'>');
395                 break;
396               case L'[':
397                 current_character_to_skip_to = L']';
398                 character_to_skip_too_stack.push(L']');
399                 break;
400               case L'(':
401                 current_character_to_skip_to = L')';
402                 character_to_skip_too_stack.push(L')');
403                 break;
404               case L'\'':
405                 current_character_to_skip_to = L'\'';
406                 character_to_skip_too_stack.push(L'\'');
407                 break;
408               case L'\"':
409                 current_character_to_skip_to = L'\"';
410                 character_to_skip_too_stack.push(L'\"');
411                 break;
412               default:
413                 if (ch == current_character_to_skip_to) {
414                   character_to_skip_too_stack.pop();
415                   if (character_to_skip_too_stack.empty()) {
416                     current_parser_state = FDE_XmlSyntaxState::Text;
417                   } else {
418                     current_character_to_skip_to =
419                         character_to_skip_too_stack.top();
420                   }
421                 }
422                 break;
423             }
424             current_buffer_idx++;
425           }
426           break;
427         case FDE_XmlSyntaxState::SkipComment: {
428           auto current_view = WideStringView(
429               pdfium::make_span(buffer).subspan(current_buffer_idx));
430           if (current_view.First(3).EqualsASCII("-->")) {
431             current_buffer_idx += 2;
432             current_parser_state = FDE_XmlSyntaxState::Text;
433           }
434           current_buffer_idx++;
435           break;
436         }
437         case FDE_XmlSyntaxState::TargetData:
438           if (IsXMLWhiteSpace(ch)) {
439             if (current_text_.IsEmpty()) {
440               current_buffer_idx++;
441               break;
442             }
443             if (current_quote_character == 0) {
444               current_buffer_idx++;
445               ProcessTargetData();
446               break;
447             }
448           }
449           if (ch == '?') {
450             current_parser_state = FDE_XmlSyntaxState::CloseInstruction;
451             current_buffer_idx++;
452           } else if (ch == '\"') {
453             if (current_quote_character == 0) {
454               current_quote_character = ch;
455               current_buffer_idx++;
456             } else if (ch == current_quote_character) {
457               current_quote_character = 0;
458               current_buffer_idx++;
459               ProcessTargetData();
460             } else {
461               return false;
462             }
463           } else {
464             current_text_ += ch;
465             current_buffer_idx++;
466           }
467           break;
468       }
469     }
470   }
471 
472   NOTREACHED_NORETURN();
473 }
474 
ProcessTextChar(wchar_t character)475 void CFX_XMLParser::ProcessTextChar(wchar_t character) {
476   current_text_ += character;
477 
478   if (entity_start_.has_value() && character == L';') {
479     // Copy the entity out into a string and remove from the current text. When
480     // we copy the entity we don't want to copy out the & or the ; so we start
481     // shifted by one and want to copy 2 less characters in total.
482     WideString csEntity = current_text_.Substr(
483         entity_start_.value() + 1,
484         current_text_.GetLength() - entity_start_.value() - 2);
485 
486     current_text_.Delete(entity_start_.value(),
487                          current_text_.GetLength() - entity_start_.value());
488 
489     size_t iLen = csEntity.GetLength();
490     if (iLen > 0) {
491       if (csEntity[0] == L'#') {
492         uint32_t ch = 0;
493         if (iLen > 1 && csEntity[1] == L'x') {
494           for (size_t i = 2; i < iLen; i++) {
495             if (!FXSYS_IsHexDigit(csEntity[i]))
496               break;
497             ch = (ch << 4) + FXSYS_HexCharToInt(csEntity[i]);
498           }
499         } else {
500           for (size_t i = 1; i < iLen; i++) {
501             if (!FXSYS_IsDecimalDigit(csEntity[i]))
502               break;
503             ch = ch * 10 + FXSYS_DecimalCharToInt(csEntity[i]);
504           }
505         }
506         if (ch > kMaxCharRange)
507           ch = ' ';
508 
509         character = static_cast<wchar_t>(ch);
510         if (character != 0)
511           current_text_ += character;
512       } else {
513         if (csEntity.EqualsASCII("amp")) {
514           current_text_ += L'&';
515         } else if (csEntity.EqualsASCII("lt")) {
516           current_text_ += L'<';
517         } else if (csEntity.EqualsASCII("gt")) {
518           current_text_ += L'>';
519         } else if (csEntity.EqualsASCII("apos")) {
520           current_text_ += L'\'';
521         } else if (csEntity.EqualsASCII("quot")) {
522           current_text_ += L'"';
523         }
524       }
525     }
526     entity_start_ = std::nullopt;
527   } else if (!entity_start_.has_value() && character == L'&') {
528     entity_start_ = current_text_.GetLength() - 1;
529   }
530 }
531 
ProcessTargetData()532 void CFX_XMLParser::ProcessTargetData() {
533   WideString target_data = GetTextData();
534   if (target_data.IsEmpty())
535     return;
536 
537   CFX_XMLInstruction* instruction = ToXMLInstruction(current_node_);
538   if (instruction)
539     instruction->AppendData(target_data);
540 }
541 
GetTextData()542 WideString CFX_XMLParser::GetTextData() {
543   WideString ret = std::move(current_text_);
544   current_text_.Reserve(kCurrentTextReserve);
545   entity_start_ = std::nullopt;
546   return ret;
547 }
548