• Home
  • Line#
  • Scopes#
  • Navigate#
  • Raw
  • Download
// Copyright 2016 PDFium Authors. All rights reserved.
// Use of this source code is governed by a BSD-style license that can be
// found in the LICENSE file.

// Original code copyright 2014 Foxit Software Inc. http://www.foxitsoftware.com
6 
7 #include "core/fxcrt/xml/cfx_xmlparser.h"
8 
9 #include <algorithm>
10 #include <cwctype>
11 #include <iterator>
12 #include <stack>
13 #include <utility>
14 
15 #include "core/fxcrt/cfx_seekablestreamproxy.h"
16 #include "core/fxcrt/fx_codepage.h"
17 #include "core/fxcrt/fx_extension.h"
18 #include "core/fxcrt/fx_safe_types.h"
19 #include "core/fxcrt/xml/cfx_xmlchardata.h"
20 #include "core/fxcrt/xml/cfx_xmldocument.h"
21 #include "core/fxcrt/xml/cfx_xmlelement.h"
22 #include "core/fxcrt/xml/cfx_xmlinstruction.h"
23 #include "core/fxcrt/xml/cfx_xmlnode.h"
24 #include "core/fxcrt/xml/cfx_xmltext.h"
25 #include "third_party/base/ptr_util.h"
26 
namespace {

// Capacity reserved for the parser's pending-text buffer whenever it is
// created or reset.
constexpr size_t kCurrentTextReserve = 128;

// Upper bound accepted for numeric character references; larger values are
// replaced by the parser.
constexpr uint32_t kMaxCharRange = 0x10ffff;

// Returns true for the four characters treated as XML white space here:
// space, line feed, carriage return and horizontal tab.
bool IsXMLWhiteSpace(wchar_t ch) {
  return ch == L' ' || ch == L'\n' || ch == L'\r' || ch == L'\t';
}

// One inclusive range [wStart, wEnd] of characters permitted in an XML name.
// |bStartChar| is true when characters of the range may also begin a name.
struct FX_XMLNAMECHAR {
  uint16_t wStart;
  uint16_t wEnd;
  bool bStartChar;
};

// Name-character ranges. The entries must stay sorted by |wEnd| and must not
// overlap, because the table is searched with std::lower_bound().
constexpr FX_XMLNAMECHAR g_XMLNameChars[] = {
    {L'-', L'.', false},    {L'0', L'9', false},     {L':', L':', false},
    {L'A', L'Z', true},     {L'_', L'_', true},      {L'a', L'z', true},
    {0xB7, 0xB7, false},    {0xC0, 0xD6, true},      {0xD8, 0xF6, true},
    {0xF8, 0x02FF, true},   {0x0300, 0x036F, false}, {0x0370, 0x037D, true},
    {0x037F, 0x1FFF, true}, {0x200C, 0x200D, true},  {0x203F, 0x2040, false},
    {0x2070, 0x218F, true}, {0x2C00, 0x2FEF, true},  {0x3001, 0xD7FF, true},
    {0xF900, 0xFDCF, true}, {0xFDF0, 0xFFFD, true},
};

}  // namespace
53 
54 // static
IsXMLNameChar(wchar_t ch,bool bFirstChar)55 bool CFX_XMLParser::IsXMLNameChar(wchar_t ch, bool bFirstChar) {
56   auto* it = std::lower_bound(
57       std::begin(g_XMLNameChars), std::end(g_XMLNameChars), ch,
58       [](const FX_XMLNAMECHAR& arg, wchar_t ch) { return arg.wEnd < ch; });
59   return it != std::end(g_XMLNameChars) && ch >= it->wStart &&
60          (!bFirstChar || it->bStartChar);
61 }
62 
CFX_XMLParser(const RetainPtr<IFX_SeekableReadStream> & pStream)63 CFX_XMLParser::CFX_XMLParser(const RetainPtr<IFX_SeekableReadStream>& pStream) {
64   ASSERT(pStream);
65 
66   auto proxy = pdfium::MakeRetain<CFX_SeekableStreamProxy>(pStream);
67   uint16_t wCodePage = proxy->GetCodePage();
68   if (wCodePage != FX_CODEPAGE_UTF16LE && wCodePage != FX_CODEPAGE_UTF16BE &&
69       wCodePage != FX_CODEPAGE_UTF8) {
70     proxy->SetCodePage(FX_CODEPAGE_UTF8);
71   }
72   stream_ = proxy;
73 
74   xml_plane_size_ = std::min(
75       xml_plane_size_, pdfium::base::checked_cast<size_t>(stream_->GetSize()));
76 
77   current_text_.reserve(kCurrentTextReserve);
78 }
79 
80 CFX_XMLParser::~CFX_XMLParser() = default;
81 
Parse()82 std::unique_ptr<CFX_XMLDocument> CFX_XMLParser::Parse() {
83   auto doc = pdfium::MakeUnique<CFX_XMLDocument>();
84   current_node_ = doc->GetRoot();
85 
86   return DoSyntaxParse(doc.get()) ? std::move(doc) : nullptr;
87 }
88 
DoSyntaxParse(CFX_XMLDocument * doc)89 bool CFX_XMLParser::DoSyntaxParse(CFX_XMLDocument* doc) {
90   if (xml_plane_size_ <= 0)
91     return false;
92 
93   FX_SAFE_SIZE_T alloc_size_safe = xml_plane_size_;
94   alloc_size_safe += 1;  // For NUL.
95   if (!alloc_size_safe.IsValid())
96     return false;
97 
98   FX_FILESIZE current_buffer_idx = 0;
99   FX_FILESIZE buffer_size = 0;
100 
101   std::vector<wchar_t, FxAllocAllocator<wchar_t>> buffer;
102   buffer.resize(alloc_size_safe.ValueOrDie());
103 
104   std::stack<wchar_t> character_to_skip_too_stack;
105   std::stack<CFX_XMLNode::Type> node_type_stack;
106   WideString current_attribute_name;
107   FDE_XmlSyntaxState current_parser_state = FDE_XmlSyntaxState::Text;
108   int32_t iCount = 0;
109   wchar_t current_quote_character = 0;
110   wchar_t current_character_to_skip_to = 0;
111 
112   while (true) {
113     if (current_buffer_idx >= buffer_size) {
114       if (stream_->IsEOF())
115         return true;
116 
117       size_t buffer_chars = stream_->ReadBlock(buffer.data(), xml_plane_size_);
118       if (buffer_chars == 0)
119         return true;
120 
121       current_buffer_idx = 0;
122       buffer_size = buffer_chars;
123     }
124 
125     while (current_buffer_idx < buffer_size) {
126       wchar_t ch = buffer[current_buffer_idx];
127       switch (current_parser_state) {
128         case FDE_XmlSyntaxState::Text:
129           if (ch == L'<') {
130             if (!current_text_.empty()) {
131               current_node_->AppendLastChild(
132                   doc->CreateNode<CFX_XMLText>(GetTextData()));
133             } else {
134               current_buffer_idx++;
135               current_parser_state = FDE_XmlSyntaxState::Node;
136             }
137           } else {
138             // Fail if there is text outside of the root element, ignore
139             // whitespace/null.
140             if (node_type_stack.empty() && ch && !FXSYS_iswspace(ch))
141               return false;
142             ProcessTextChar(ch);
143             current_buffer_idx++;
144           }
145           break;
146         case FDE_XmlSyntaxState::Node:
147           if (ch == L'!') {
148             current_buffer_idx++;
149             current_parser_state = FDE_XmlSyntaxState::SkipCommentOrDecl;
150           } else if (ch == L'/') {
151             current_buffer_idx++;
152             current_parser_state = FDE_XmlSyntaxState::CloseElement;
153           } else if (ch == L'?') {
154             node_type_stack.push(CFX_XMLNode::Type::kInstruction);
155             current_buffer_idx++;
156             current_parser_state = FDE_XmlSyntaxState::Target;
157           } else {
158             node_type_stack.push(CFX_XMLNode::Type::kElement);
159             current_parser_state = FDE_XmlSyntaxState::Tag;
160           }
161           break;
162         case FDE_XmlSyntaxState::Target:
163           if (!IsXMLNameChar(ch, current_text_.empty())) {
164             if (current_text_.empty())
165               return false;
166 
167             current_parser_state = FDE_XmlSyntaxState::TargetData;
168 
169             WideString target_name = GetTextData();
170             if (target_name.EqualsASCII("originalXFAVersion") ||
171                 target_name.EqualsASCII("acrobat")) {
172               auto* node = doc->CreateNode<CFX_XMLInstruction>(target_name);
173               current_node_->AppendLastChild(node);
174               current_node_ = node;
175             }
176           } else {
177             current_text_.push_back(ch);
178             current_buffer_idx++;
179           }
180           break;
181         case FDE_XmlSyntaxState::Tag:
182           if (!IsXMLNameChar(ch, current_text_.empty())) {
183             if (current_text_.empty())
184               return false;
185 
186             current_parser_state = FDE_XmlSyntaxState::AttriName;
187 
188             auto* child = doc->CreateNode<CFX_XMLElement>(GetTextData());
189             current_node_->AppendLastChild(child);
190             current_node_ = child;
191           } else {
192             current_text_.push_back(ch);
193             current_buffer_idx++;
194           }
195           break;
196         case FDE_XmlSyntaxState::AttriName:
197           if (current_text_.empty() && IsXMLWhiteSpace(ch)) {
198             current_buffer_idx++;
199             break;
200           }
201           if (!IsXMLNameChar(ch, current_text_.empty())) {
202             if (current_text_.empty()) {
203               if (node_type_stack.top() == CFX_XMLNode::Type::kElement) {
204                 if (ch == L'>' || ch == L'/') {
205                   current_parser_state = FDE_XmlSyntaxState::BreakElement;
206                   break;
207                 }
208               } else if (node_type_stack.top() ==
209                          CFX_XMLNode::Type::kInstruction) {
210                 if (ch == L'?') {
211                   current_parser_state = FDE_XmlSyntaxState::CloseInstruction;
212                   current_buffer_idx++;
213                 } else {
214                   current_parser_state = FDE_XmlSyntaxState::TargetData;
215                 }
216                 break;
217               }
218               return false;
219             } else {
220               if (node_type_stack.top() == CFX_XMLNode::Type::kInstruction) {
221                 if (ch != '=' && !IsXMLWhiteSpace(ch)) {
222                   current_parser_state = FDE_XmlSyntaxState::TargetData;
223                   break;
224                 }
225               }
226               current_parser_state = FDE_XmlSyntaxState::AttriEqualSign;
227               current_attribute_name = GetTextData();
228             }
229           } else {
230             current_text_.push_back(ch);
231             current_buffer_idx++;
232           }
233           break;
234         case FDE_XmlSyntaxState::AttriEqualSign:
235           if (IsXMLWhiteSpace(ch)) {
236             current_buffer_idx++;
237             break;
238           }
239           if (ch != L'=') {
240             if (node_type_stack.top() == CFX_XMLNode::Type::kInstruction) {
241               current_parser_state = FDE_XmlSyntaxState::TargetData;
242               break;
243             }
244             return false;
245           } else {
246             current_parser_state = FDE_XmlSyntaxState::AttriQuotation;
247             current_buffer_idx++;
248           }
249           break;
250         case FDE_XmlSyntaxState::AttriQuotation:
251           if (IsXMLWhiteSpace(ch)) {
252             current_buffer_idx++;
253             break;
254           }
255           if (ch != L'\"' && ch != L'\'') {
256             return false;
257           }
258 
259           current_quote_character = ch;
260           current_parser_state = FDE_XmlSyntaxState::AttriValue;
261           current_buffer_idx++;
262           break;
263         case FDE_XmlSyntaxState::AttriValue:
264           if (ch == current_quote_character) {
265             if (entity_start_ > -1)
266               return false;
267 
268             current_quote_character = 0;
269             current_buffer_idx++;
270             current_parser_state = FDE_XmlSyntaxState::AttriName;
271 
272             CFX_XMLElement* elem = ToXMLElement(current_node_);
273             if (elem)
274               elem->SetAttribute(current_attribute_name, GetTextData());
275 
276             current_attribute_name.clear();
277           } else {
278             ProcessTextChar(ch);
279             current_buffer_idx++;
280           }
281           break;
282         case FDE_XmlSyntaxState::CloseInstruction:
283           if (ch != L'>') {
284             current_text_.push_back(ch);
285             current_parser_state = FDE_XmlSyntaxState::TargetData;
286           } else if (!current_text_.empty()) {
287             ProcessTargetData();
288           } else {
289             current_buffer_idx++;
290             if (node_type_stack.empty())
291               return false;
292 
293             node_type_stack.pop();
294             current_parser_state = FDE_XmlSyntaxState::Text;
295 
296             if (current_node_ &&
297                 current_node_->GetType() == CFX_XMLNode::Type::kInstruction)
298               current_node_ = current_node_->GetParent();
299           }
300           break;
301         case FDE_XmlSyntaxState::BreakElement:
302           if (ch == L'>') {
303             current_parser_state = FDE_XmlSyntaxState::Text;
304           } else if (ch == L'/') {
305             current_parser_state = FDE_XmlSyntaxState::CloseElement;
306           } else {
307             return false;
308           }
309           current_buffer_idx++;
310           break;
311         case FDE_XmlSyntaxState::CloseElement:
312           if (!IsXMLNameChar(ch, current_text_.empty())) {
313             if (ch == L'>') {
314               if (node_type_stack.empty())
315                 return false;
316 
317               node_type_stack.pop();
318               current_parser_state = FDE_XmlSyntaxState::Text;
319 
320               CFX_XMLElement* element = ToXMLElement(current_node_);
321               if (!element)
322                 return false;
323 
324               WideString element_name = GetTextData();
325               if (element_name.GetLength() > 0 &&
326                   element_name != element->GetName()) {
327                 return false;
328               }
329 
330               current_node_ = current_node_->GetParent();
331               iCount++;
332             } else if (!IsXMLWhiteSpace(ch)) {
333               return false;
334             }
335           } else {
336             current_text_.push_back(ch);
337           }
338           current_buffer_idx++;
339           break;
340         case FDE_XmlSyntaxState::SkipCommentOrDecl: {
341           auto current_span =
342               pdfium::make_span(buffer).subspan(current_buffer_idx);
343           if (FXSYS_wcsnicmp(current_span.data(), L"--", 2) == 0) {
344             current_buffer_idx += 2;
345             current_parser_state = FDE_XmlSyntaxState::SkipComment;
346           } else if (FXSYS_wcsnicmp(current_span.data(), L"[CDATA[", 7) == 0) {
347             current_buffer_idx += 7;
348             current_parser_state = FDE_XmlSyntaxState::SkipCData;
349           } else {
350             current_parser_state = FDE_XmlSyntaxState::SkipDeclNode;
351             current_character_to_skip_to = L'>';
352             character_to_skip_too_stack.push(L'>');
353           }
354           break;
355         }
356         case FDE_XmlSyntaxState::SkipCData: {
357           auto current_span =
358               pdfium::make_span(buffer).subspan(current_buffer_idx);
359           if (FXSYS_wcsnicmp(current_span.data(), L"]]>", 3) == 0) {
360             current_buffer_idx += 3;
361             current_parser_state = FDE_XmlSyntaxState::Text;
362             current_node_->AppendLastChild(
363                 doc->CreateNode<CFX_XMLCharData>(GetTextData()));
364           } else {
365             current_text_.push_back(ch);
366             current_buffer_idx++;
367           }
368           break;
369         }
370         case FDE_XmlSyntaxState::SkipDeclNode:
371           if (current_character_to_skip_to == L'\'' ||
372               current_character_to_skip_to == L'\"') {
373             current_buffer_idx++;
374             if (ch != current_character_to_skip_to)
375               break;
376 
377             character_to_skip_too_stack.pop();
378             if (character_to_skip_too_stack.empty())
379               current_parser_state = FDE_XmlSyntaxState::Text;
380             else
381               current_character_to_skip_to = character_to_skip_too_stack.top();
382           } else {
383             switch (ch) {
384               case L'<':
385                 current_character_to_skip_to = L'>';
386                 character_to_skip_too_stack.push(L'>');
387                 break;
388               case L'[':
389                 current_character_to_skip_to = L']';
390                 character_to_skip_too_stack.push(L']');
391                 break;
392               case L'(':
393                 current_character_to_skip_to = L')';
394                 character_to_skip_too_stack.push(L')');
395                 break;
396               case L'\'':
397                 current_character_to_skip_to = L'\'';
398                 character_to_skip_too_stack.push(L'\'');
399                 break;
400               case L'\"':
401                 current_character_to_skip_to = L'\"';
402                 character_to_skip_too_stack.push(L'\"');
403                 break;
404               default:
405                 if (ch == current_character_to_skip_to) {
406                   character_to_skip_too_stack.pop();
407                   if (character_to_skip_too_stack.empty()) {
408                     current_parser_state = FDE_XmlSyntaxState::Text;
409                   } else {
410                     current_character_to_skip_to =
411                         character_to_skip_too_stack.top();
412                   }
413                 }
414                 break;
415             }
416             current_buffer_idx++;
417           }
418           break;
419         case FDE_XmlSyntaxState::SkipComment: {
420           auto current_span =
421               pdfium::make_span(buffer).subspan(current_buffer_idx);
422           if (FXSYS_wcsnicmp(current_span.data(), L"-->", 3) == 0) {
423             current_buffer_idx += 2;
424             current_parser_state = FDE_XmlSyntaxState::Text;
425           }
426           current_buffer_idx++;
427           break;
428         }
429         case FDE_XmlSyntaxState::TargetData:
430           if (IsXMLWhiteSpace(ch)) {
431             if (current_text_.empty()) {
432               current_buffer_idx++;
433               break;
434             }
435             if (current_quote_character == 0) {
436               current_buffer_idx++;
437               ProcessTargetData();
438               break;
439             }
440           }
441           if (ch == '?') {
442             current_parser_state = FDE_XmlSyntaxState::CloseInstruction;
443             current_buffer_idx++;
444           } else if (ch == '\"') {
445             if (current_quote_character == 0) {
446               current_quote_character = ch;
447               current_buffer_idx++;
448             } else if (ch == current_quote_character) {
449               current_quote_character = 0;
450               current_buffer_idx++;
451               ProcessTargetData();
452             } else {
453               return false;
454             }
455           } else {
456             current_text_.push_back(ch);
457             current_buffer_idx++;
458           }
459           break;
460         default:
461           break;
462       }
463     }
464   }
465 
466   NOTREACHED();
467   return false;
468 }
469 
ProcessTextChar(wchar_t character)470 void CFX_XMLParser::ProcessTextChar(wchar_t character) {
471   current_text_.push_back(character);
472 
473   if (entity_start_ > -1 && character == L';') {
474     // Copy the entity out into a string and remove from the vector. When we
475     // copy the entity we don't want to copy out the & or the ; so we start
476     // shifted by one and want to copy 2 less characters in total.
477     WideString csEntity(current_text_.data() + entity_start_ + 1,
478                         current_text_.size() - entity_start_ - 2);
479     current_text_.erase(current_text_.begin() + entity_start_,
480                         current_text_.end());
481 
482     int32_t iLen = csEntity.GetLength();
483     if (iLen > 0) {
484       if (csEntity[0] == L'#') {
485         uint32_t ch = 0;
486         if (iLen > 1 && csEntity[1] == L'x') {
487           for (int32_t i = 2; i < iLen; i++) {
488             if (!FXSYS_IsHexDigit(csEntity[i]))
489               break;
490             ch = (ch << 4) + FXSYS_HexCharToInt(csEntity[i]);
491           }
492         } else {
493           for (int32_t i = 1; i < iLen; i++) {
494             if (!FXSYS_IsDecimalDigit(csEntity[i]))
495               break;
496             ch = ch * 10 + FXSYS_DecimalCharToInt(csEntity[i]);
497           }
498         }
499         if (ch > kMaxCharRange)
500           ch = ' ';
501 
502         character = static_cast<wchar_t>(ch);
503         if (character != 0)
504           current_text_.push_back(character);
505       } else {
506         if (csEntity.Compare(L"amp") == 0) {
507           current_text_.push_back(L'&');
508         } else if (csEntity.Compare(L"lt") == 0) {
509           current_text_.push_back(L'<');
510         } else if (csEntity.Compare(L"gt") == 0) {
511           current_text_.push_back(L'>');
512         } else if (csEntity.Compare(L"apos") == 0) {
513           current_text_.push_back(L'\'');
514         } else if (csEntity.Compare(L"quot") == 0) {
515           current_text_.push_back(L'"');
516         }
517       }
518     }
519 
520     entity_start_ = -1;
521   } else if (entity_start_ < 0 && character == L'&') {
522     entity_start_ = current_text_.size() - 1;
523   }
524 }
525 
ProcessTargetData()526 void CFX_XMLParser::ProcessTargetData() {
527   WideString target_data = GetTextData();
528   if (target_data.IsEmpty())
529     return;
530 
531   CFX_XMLInstruction* instruction = ToXMLInstruction(current_node_);
532   if (instruction)
533     instruction->AppendData(target_data);
534 }
535 
GetTextData()536 WideString CFX_XMLParser::GetTextData() {
537   WideString ret(current_text_.data(), current_text_.size());
538   entity_start_ = -1;
539   current_text_.clear();
540   current_text_.reserve(kCurrentTextReserve);
541   return ret;
542 }
543