1 // Copyright 2014 PDFium Authors. All rights reserved.
2 // Use of this source code is governed by a BSD-style license that can be
3 // found in the LICENSE file.
4
5 // Original code copyright 2014 Foxit Software Inc. http://www.foxitsoftware.com
6
7 #include <algorithm>
8 #include <memory>
9 #include <sstream>
10 #include <string>
11 #include <utility>
12 #include <vector>
13
14 #include "core/fxcrt/cfx_utf8decoder.h"
15 #include "core/fxcrt/cfx_widetextbuf.h"
16 #include "core/fxcrt/fx_extension.h"
17 #include "core/fxcrt/xml/cxml_content.h"
18 #include "core/fxcrt/xml/cxml_element.h"
19 #include "core/fxcrt/xml/cxml_parser.h"
20 #include "third_party/base/ptr_util.h"
21 #include "third_party/base/stl_util.h"
22
23 namespace {
24
// Character-class bit flags stored per byte in g_FXCRT_XML_ByteTypes.
// Single-bit classes:
#define FXCRTM_XML_CHARTYPE_Normal 0x00             // no class bit set
#define FXCRTM_XML_CHARTYPE_SpaceChar (1 << 0)      // 0x01
#define FXCRTM_XML_CHARTYPE_Letter (1 << 1)         // 0x02
#define FXCRTM_XML_CHARTYPE_Digital (1 << 2)        // 0x04
#define FXCRTM_XML_CHARTYPE_NameIntro (1 << 3)      // 0x08
#define FXCRTM_XML_CHARTYPE_NameChar (1 << 4)       // 0x10
#define FXCRTM_XML_CHARTYPE_HexDigital (1 << 5)     // 0x20
#define FXCRTM_XML_CHARTYPE_HexLowerLetter (1 << 6) // 0x40
// Two-bit hex field (0x60). After masking a table entry with HexChar the
// result is HexDigital, HexLowerLetter, or HexUpperLetter (both bits set).
#define FXCRTM_XML_CHARTYPE_HexUpperLetter \
  (FXCRTM_XML_CHARTYPE_HexDigital | FXCRTM_XML_CHARTYPE_HexLowerLetter)
#define FXCRTM_XML_CHARTYPE_HexChar \
  (FXCRTM_XML_CHARTYPE_HexDigital | FXCRTM_XML_CHARTYPE_HexLowerLetter)
35
// Per-byte classification table, indexed by the raw byte value. Each entry is
// an OR of FXCRTM_XML_CHARTYPE_* flags. Laid out 16 entries per row; the
// comment on each row gives the index range it covers.
const uint8_t g_FXCRT_XML_ByteTypes[256] = {
    // 0x00 - 0x0F
    0x01, 0x01, 0x01, 0x01, 0x01, 0x01, 0x01, 0x01,
    0x01, 0x01, 0x01, 0x01, 0x01, 0x01, 0x01, 0x01,
    // 0x10 - 0x1F
    0x01, 0x01, 0x01, 0x01, 0x01, 0x01, 0x01, 0x01,
    0x01, 0x01, 0x01, 0x01, 0x01, 0x01, 0x01, 0x01,
    // 0x20 - 0x2F: ' ' then punctuation; '-' and '.' are name characters
    0x01, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
    0x00, 0x00, 0x00, 0x00, 0x00, 0x10, 0x10, 0x00,
    // 0x30 - 0x3F: '0'-'9', then ':' (name intro only)
    0x34, 0x34, 0x34, 0x34, 0x34, 0x34, 0x34, 0x34,
    0x34, 0x34, 0x08, 0x00, 0x00, 0x00, 0x00, 0x00,
    // 0x40 - 0x4F: '@', 'A'-'F' (upper-case hex letters), 'G'-'O'
    0x00, 0x7A, 0x7A, 0x7A, 0x7A, 0x7A, 0x7A, 0x1A,
    0x1A, 0x1A, 0x1A, 0x1A, 0x1A, 0x1A, 0x1A, 0x1A,
    // 0x50 - 0x5F: 'P'-'Z', punctuation, '_'
    0x1A, 0x1A, 0x1A, 0x1A, 0x1A, 0x1A, 0x1A, 0x1A,
    0x1A, 0x1A, 0x1A, 0x00, 0x00, 0x00, 0x00, 0x18,
    // 0x60 - 0x6F: '`', 'a'-'f' (lower-case hex letters), 'g'-'o'
    0x00, 0x5A, 0x5A, 0x5A, 0x5A, 0x5A, 0x5A, 0x1A,
    0x1A, 0x1A, 0x1A, 0x1A, 0x1A, 0x1A, 0x1A, 0x1A,
    // 0x70 - 0x7F: 'p'-'z', punctuation, DEL
    0x1A, 0x1A, 0x1A, 0x1A, 0x1A, 0x1A, 0x1A, 0x1A,
    0x1A, 0x1A, 0x1A, 0x00, 0x00, 0x00, 0x00, 0x00,
    // 0x80 - 0x8F
    0x1A, 0x1A, 0x1A, 0x1A, 0x1A, 0x1A, 0x1A, 0x1A,
    0x1A, 0x1A, 0x1A, 0x1A, 0x1A, 0x1A, 0x1A, 0x1A,
    // 0x90 - 0x9F
    0x1A, 0x1A, 0x1A, 0x1A, 0x1A, 0x1A, 0x1A, 0x1A,
    0x1A, 0x1A, 0x1A, 0x1A, 0x1A, 0x1A, 0x1A, 0x1A,
    // 0xA0 - 0xAF
    0x1A, 0x1A, 0x1A, 0x1A, 0x1A, 0x1A, 0x1A, 0x1A,
    0x1A, 0x1A, 0x1A, 0x1A, 0x1A, 0x1A, 0x1A, 0x1A,
    // 0xB0 - 0xBF
    0x1A, 0x1A, 0x1A, 0x1A, 0x1A, 0x1A, 0x1A, 0x1A,
    0x1A, 0x1A, 0x1A, 0x1A, 0x1A, 0x1A, 0x1A, 0x1A,
    // 0xC0 - 0xCF
    0x1A, 0x1A, 0x1A, 0x1A, 0x1A, 0x1A, 0x1A, 0x1A,
    0x1A, 0x1A, 0x1A, 0x1A, 0x1A, 0x1A, 0x1A, 0x1A,
    // 0xD0 - 0xDF
    0x1A, 0x1A, 0x1A, 0x1A, 0x1A, 0x1A, 0x1A, 0x1A,
    0x1A, 0x1A, 0x1A, 0x1A, 0x1A, 0x1A, 0x1A, 0x1A,
    // 0xE0 - 0xEF
    0x1A, 0x1A, 0x1A, 0x1A, 0x1A, 0x1A, 0x1A, 0x1A,
    0x1A, 0x1A, 0x1A, 0x1A, 0x1A, 0x1A, 0x1A, 0x1A,
    // 0xF0 - 0xFF: the last two bytes are classed as whitespace
    0x1A, 0x1A, 0x1A, 0x1A, 0x1A, 0x1A, 0x1A, 0x1A,
    0x1A, 0x1A, 0x1A, 0x1A, 0x1A, 0x1A, 0x01, 0x01,
};

// Deepest element nesting ParseElementInternal() will descend into before it
// gives up and returns null.
constexpr int kMaxDepth = 1024;
62
g_FXCRT_XML_IsWhiteSpace(uint8_t ch)63 bool g_FXCRT_XML_IsWhiteSpace(uint8_t ch) {
64 return !!(g_FXCRT_XML_ByteTypes[ch] & FXCRTM_XML_CHARTYPE_SpaceChar);
65 }
66
g_FXCRT_XML_IsDigital(uint8_t ch)67 bool g_FXCRT_XML_IsDigital(uint8_t ch) {
68 return !!(g_FXCRT_XML_ByteTypes[ch] & FXCRTM_XML_CHARTYPE_Digital);
69 }
70
g_FXCRT_XML_IsNameIntro(uint8_t ch)71 bool g_FXCRT_XML_IsNameIntro(uint8_t ch) {
72 return !!(g_FXCRT_XML_ByteTypes[ch] & FXCRTM_XML_CHARTYPE_NameIntro);
73 }
74
g_FXCRT_XML_IsNameChar(uint8_t ch)75 bool g_FXCRT_XML_IsNameChar(uint8_t ch) {
76 return !!(g_FXCRT_XML_ByteTypes[ch] & FXCRTM_XML_CHARTYPE_NameChar);
77 }
78
79 } // namespace
80
CXML_Parser()81 CXML_Parser::CXML_Parser()
82 : m_nOffset(0),
83 m_pBuffer(nullptr),
84 m_dwBufferSize(0),
85 m_nBufferOffset(0),
86 m_dwIndex(0) {}
87
~CXML_Parser()88 CXML_Parser::~CXML_Parser() {}
89
Init(const uint8_t * pBuffer,size_t size)90 bool CXML_Parser::Init(const uint8_t* pBuffer, size_t size) {
91 m_pDataAcc = pdfium::MakeUnique<CXML_DataBufAcc>(pBuffer, size);
92 m_nOffset = 0;
93 return ReadNextBlock();
94 }
95
ReadNextBlock()96 bool CXML_Parser::ReadNextBlock() {
97 if (!m_pDataAcc->ReadNextBlock())
98 return false;
99
100 m_pBuffer = m_pDataAcc->GetBlockBuffer();
101 m_dwBufferSize = m_pDataAcc->GetBlockSize();
102 m_nBufferOffset = 0;
103 m_dwIndex = 0;
104 return m_dwBufferSize > 0;
105 }
106
IsEOF()107 bool CXML_Parser::IsEOF() {
108 return m_pDataAcc->IsEOF() && m_dwIndex >= m_dwBufferSize;
109 }
110
SkipWhiteSpaces()111 void CXML_Parser::SkipWhiteSpaces() {
112 m_nOffset = m_nBufferOffset + static_cast<FX_FILESIZE>(m_dwIndex);
113 if (IsEOF())
114 return;
115
116 do {
117 while (m_dwIndex < m_dwBufferSize &&
118 g_FXCRT_XML_IsWhiteSpace(m_pBuffer[m_dwIndex])) {
119 m_dwIndex++;
120 }
121 m_nOffset = m_nBufferOffset + static_cast<FX_FILESIZE>(m_dwIndex);
122 if (m_dwIndex < m_dwBufferSize || IsEOF())
123 break;
124 } while (ReadNextBlock());
125 }
126
GetName(ByteString * space,ByteString * name)127 void CXML_Parser::GetName(ByteString* space, ByteString* name) {
128 m_nOffset = m_nBufferOffset + static_cast<FX_FILESIZE>(m_dwIndex);
129 if (IsEOF())
130 return;
131
132 std::ostringstream buf;
133 do {
134 while (m_dwIndex < m_dwBufferSize) {
135 uint8_t ch = m_pBuffer[m_dwIndex];
136 if (ch == ':') {
137 *space = ByteString(buf);
138 buf.str("");
139 } else if (g_FXCRT_XML_IsNameChar(ch)) {
140 buf << static_cast<char>(ch);
141 } else {
142 break;
143 }
144 m_dwIndex++;
145 }
146 m_nOffset = m_nBufferOffset + static_cast<FX_FILESIZE>(m_dwIndex);
147 if (m_dwIndex < m_dwBufferSize || IsEOF())
148 break;
149 } while (ReadNextBlock());
150 *name = ByteString(buf);
151 }
152
SkipLiterals(const ByteStringView & str)153 void CXML_Parser::SkipLiterals(const ByteStringView& str) {
154 m_nOffset = m_nBufferOffset + static_cast<FX_FILESIZE>(m_dwIndex);
155 if (IsEOF()) {
156 return;
157 }
158 int32_t i = 0, iLen = str.GetLength();
159 do {
160 while (m_dwIndex < m_dwBufferSize) {
161 if (str[i] != m_pBuffer[m_dwIndex++]) {
162 i = 0;
163 continue;
164 }
165 i++;
166 if (i == iLen)
167 break;
168 }
169 m_nOffset = m_nBufferOffset + static_cast<FX_FILESIZE>(m_dwIndex);
170 if (i == iLen)
171 return;
172
173 if (m_dwIndex < m_dwBufferSize || IsEOF())
174 break;
175 } while (ReadNextBlock());
176 while (!m_pDataAcc->IsEOF()) {
177 ReadNextBlock();
178 m_nOffset = m_nBufferOffset + static_cast<FX_FILESIZE>(m_dwBufferSize);
179 }
180 m_dwIndex = m_dwBufferSize;
181 }
182
GetCharRef()183 uint32_t CXML_Parser::GetCharRef() {
184 m_nOffset = m_nBufferOffset + static_cast<FX_FILESIZE>(m_dwIndex);
185 if (IsEOF())
186 return 0;
187
188 uint8_t ch;
189 int32_t iState = 0;
190 std::ostringstream buf;
191 uint32_t code = 0;
192 do {
193 while (m_dwIndex < m_dwBufferSize) {
194 ch = m_pBuffer[m_dwIndex];
195 switch (iState) {
196 case 0:
197 if (ch == '#') {
198 m_dwIndex++;
199 iState = 2;
200 break;
201 }
202 iState = 1;
203 case 1:
204 m_dwIndex++;
205 if (ch == ';') {
206 std::string ref = buf.str();
207 if (ref == "gt")
208 code = '>';
209 else if (ref == "lt")
210 code = '<';
211 else if (ref == "amp")
212 code = '&';
213 else if (ref == "apos")
214 code = '\'';
215 else if (ref == "quot")
216 code = '"';
217 iState = 10;
218 break;
219 }
220 buf << static_cast<char>(ch);
221 break;
222 case 2:
223 if (ch == 'x') {
224 m_dwIndex++;
225 iState = 4;
226 break;
227 }
228 iState = 3;
229 case 3:
230 m_dwIndex++;
231 if (ch == ';') {
232 iState = 10;
233 break;
234 }
235 if (g_FXCRT_XML_IsDigital(ch))
236 code = code * 10 + FXSYS_DecimalCharToInt(static_cast<wchar_t>(ch));
237 break;
238 case 4:
239 m_dwIndex++;
240 if (ch == ';') {
241 iState = 10;
242 break;
243 }
244 uint8_t nHex =
245 g_FXCRT_XML_ByteTypes[ch] & FXCRTM_XML_CHARTYPE_HexChar;
246 if (nHex) {
247 if (nHex == FXCRTM_XML_CHARTYPE_HexDigital) {
248 code = (code << 4) +
249 FXSYS_DecimalCharToInt(static_cast<wchar_t>(ch));
250 } else if (nHex == FXCRTM_XML_CHARTYPE_HexLowerLetter) {
251 code = (code << 4) + ch - 87;
252 } else {
253 code = (code << 4) + ch - 55;
254 }
255 }
256 break;
257 }
258 if (iState == 10)
259 break;
260 }
261 m_nOffset = m_nBufferOffset + static_cast<FX_FILESIZE>(m_dwIndex);
262 if (iState == 10 || m_dwIndex < m_dwBufferSize || IsEOF()) {
263 break;
264 }
265 } while (ReadNextBlock());
266 return code;
267 }
268
GetAttrValue()269 WideString CXML_Parser::GetAttrValue() {
270 m_nOffset = m_nBufferOffset + static_cast<FX_FILESIZE>(m_dwIndex);
271 if (IsEOF())
272 return WideString();
273
274 CFX_UTF8Decoder decoder;
275 uint8_t mark = 0;
276 uint8_t ch = 0;
277 do {
278 while (m_dwIndex < m_dwBufferSize) {
279 ch = m_pBuffer[m_dwIndex];
280 if (mark == 0) {
281 if (ch != '\'' && ch != '"')
282 return WideString();
283
284 mark = ch;
285 m_dwIndex++;
286 ch = 0;
287 continue;
288 }
289 m_dwIndex++;
290 if (ch == mark)
291 break;
292
293 if (ch == '&') {
294 decoder.AppendCodePoint(GetCharRef());
295 if (IsEOF())
296 return WideString(decoder.GetResult());
297 } else {
298 decoder.Input(ch);
299 }
300 }
301 m_nOffset = m_nBufferOffset + static_cast<FX_FILESIZE>(m_dwIndex);
302 if (ch == mark || m_dwIndex < m_dwBufferSize || IsEOF())
303 break;
304 } while (ReadNextBlock());
305 return WideString(decoder.GetResult());
306 }
307
GetTagName(bool bStartTag,bool * bEndTag,ByteString * space,ByteString * name)308 void CXML_Parser::GetTagName(bool bStartTag,
309 bool* bEndTag,
310 ByteString* space,
311 ByteString* name) {
312 m_nOffset = m_nBufferOffset + static_cast<FX_FILESIZE>(m_dwIndex);
313 if (IsEOF())
314 return;
315
316 *bEndTag = false;
317 uint8_t ch;
318 int32_t iState = bStartTag ? 1 : 0;
319 do {
320 while (m_dwIndex < m_dwBufferSize) {
321 ch = m_pBuffer[m_dwIndex];
322 switch (iState) {
323 case 0:
324 m_dwIndex++;
325 if (ch != '<')
326 break;
327
328 iState = 1;
329 break;
330 case 1:
331 if (ch == '?') {
332 m_dwIndex++;
333 SkipLiterals("?>");
334 iState = 0;
335 break;
336 }
337 if (ch == '!') {
338 m_dwIndex++;
339 SkipLiterals("-->");
340 iState = 0;
341 break;
342 }
343 if (ch == '/') {
344 m_dwIndex++;
345 GetName(space, name);
346 *bEndTag = true;
347 } else {
348 GetName(space, name);
349 *bEndTag = false;
350 }
351 return;
352 }
353 }
354 m_nOffset = m_nBufferOffset + static_cast<FX_FILESIZE>(m_dwIndex);
355 if (m_dwIndex < m_dwBufferSize || IsEOF())
356 break;
357 } while (ReadNextBlock());
358 }
359
ParseElement(CXML_Element * pParent,bool bStartTag)360 std::unique_ptr<CXML_Element> CXML_Parser::ParseElement(CXML_Element* pParent,
361 bool bStartTag) {
362 return ParseElementInternal(pParent, bStartTag, 0);
363 }
364
ParseElementInternal(CXML_Element * pParent,bool bStartTag,int nDepth)365 std::unique_ptr<CXML_Element> CXML_Parser::ParseElementInternal(
366 CXML_Element* pParent,
367 bool bStartTag,
368 int nDepth) {
369 if (nDepth > kMaxDepth)
370 return nullptr;
371
372 m_nOffset = m_nBufferOffset + static_cast<FX_FILESIZE>(m_dwIndex);
373 if (IsEOF())
374 return nullptr;
375
376 ByteString tag_name;
377 ByteString tag_space;
378 bool bEndTag;
379 GetTagName(bStartTag, &bEndTag, &tag_space, &tag_name);
380 if (tag_name.IsEmpty() || bEndTag)
381 return nullptr;
382
383 auto pElement = pdfium::MakeUnique<CXML_Element>(
384 pParent, tag_space.AsStringView(), tag_name.AsStringView());
385 do {
386 ByteString attr_space;
387 ByteString attr_name;
388 while (m_dwIndex < m_dwBufferSize) {
389 SkipWhiteSpaces();
390 if (IsEOF())
391 break;
392
393 if (!g_FXCRT_XML_IsNameIntro(m_pBuffer[m_dwIndex]))
394 break;
395
396 GetName(&attr_space, &attr_name);
397 SkipWhiteSpaces();
398 if (IsEOF())
399 break;
400
401 if (m_pBuffer[m_dwIndex] != '=')
402 break;
403
404 m_dwIndex++;
405 SkipWhiteSpaces();
406 if (IsEOF())
407 break;
408
409 WideString attr_value = GetAttrValue();
410 pElement->SetAttribute(attr_space, attr_name, attr_value);
411 }
412 m_nOffset = m_nBufferOffset + static_cast<FX_FILESIZE>(m_dwIndex);
413 if (m_dwIndex < m_dwBufferSize || IsEOF())
414 break;
415 } while (ReadNextBlock());
416 SkipWhiteSpaces();
417 if (IsEOF())
418 return pElement;
419
420 uint8_t ch = m_pBuffer[m_dwIndex++];
421 if (ch == '/') {
422 m_dwIndex++;
423 m_nOffset = m_nBufferOffset + static_cast<FX_FILESIZE>(m_dwIndex);
424 return pElement;
425 }
426 if (ch != '>') {
427 m_nOffset = m_nBufferOffset + static_cast<FX_FILESIZE>(m_dwIndex);
428 return nullptr;
429 }
430 SkipWhiteSpaces();
431 if (IsEOF())
432 return pElement;
433
434 CFX_UTF8Decoder decoder;
435 CFX_WideTextBuf content;
436 bool bCDATA = false;
437 int32_t iState = 0;
438 do {
439 while (m_dwIndex < m_dwBufferSize) {
440 ch = m_pBuffer[m_dwIndex++];
441 switch (iState) {
442 case 0:
443 if (ch == '<') {
444 iState = 1;
445 } else if (ch == '&') {
446 decoder.ClearStatus();
447 decoder.AppendCodePoint(GetCharRef());
448 } else {
449 decoder.Input(ch);
450 }
451 break;
452 case 1:
453 if (ch == '!') {
454 iState = 2;
455 } else if (ch == '?') {
456 SkipLiterals("?>");
457 SkipWhiteSpaces();
458 iState = 0;
459 } else if (ch == '/') {
460 ByteString space;
461 ByteString name;
462 GetName(&space, &name);
463 SkipWhiteSpaces();
464 m_dwIndex++;
465 iState = 10;
466 } else {
467 content << decoder.GetResult();
468 WideString dataStr = content.MakeString();
469 if (!bCDATA)
470 dataStr.TrimRight(L" \t\r\n");
471
472 InsertContentSegment(bCDATA, dataStr.AsStringView(),
473 pElement.get());
474 content.Clear();
475 decoder.Clear();
476 bCDATA = false;
477 iState = 0;
478 m_dwIndex--;
479 std::unique_ptr<CXML_Element> pSubElement =
480 ParseElementInternal(pElement.get(), true, nDepth + 1);
481 if (!pSubElement)
482 break;
483
484 pElement->AppendChild(std::move(pSubElement));
485 SkipWhiteSpaces();
486 }
487 break;
488 case 2:
489 if (ch == '[') {
490 SkipLiterals("]]>");
491 } else if (ch == '-') {
492 m_dwIndex++;
493 SkipLiterals("-->");
494 } else {
495 SkipLiterals(">");
496 }
497 decoder.Clear();
498 SkipWhiteSpaces();
499 iState = 0;
500 break;
501 }
502 if (iState == 10) {
503 break;
504 }
505 }
506 m_nOffset = m_nBufferOffset + static_cast<FX_FILESIZE>(m_dwIndex);
507 if (iState == 10 || m_dwIndex < m_dwBufferSize || IsEOF())
508 break;
509 } while (ReadNextBlock());
510 content << decoder.GetResult();
511 WideString dataStr = content.MakeString();
512 dataStr.TrimRight(L" \t\r\n");
513
514 InsertContentSegment(bCDATA, dataStr.AsStringView(), pElement.get());
515 content.Clear();
516 decoder.Clear();
517 bCDATA = false;
518 return pElement;
519 }
520
InsertContentSegment(bool bCDATA,const WideStringView & content,CXML_Element * pElement)521 void CXML_Parser::InsertContentSegment(bool bCDATA,
522 const WideStringView& content,
523 CXML_Element* pElement) {
524 if (content.IsEmpty())
525 return;
526
527 pElement->AppendChild(pdfium::MakeUnique<CXML_Content>(bCDATA, content));
528 }
529