1 // Copyright 2016 The PDFium Authors
2 // Use of this source code is governed by a BSD-style license that can be
3 // found in the LICENSE file.
4
5 // Original code copyright 2014 Foxit Software Inc. http://www.foxitsoftware.com
6
7 #include "core/fpdfapi/parser/cpdf_syntax_parser.h"
8
9 #include <ctype.h>
10
11 #include <algorithm>
12 #include <utility>
13
14 #include "core/fpdfapi/parser/cpdf_array.h"
15 #include "core/fpdfapi/parser/cpdf_boolean.h"
16 #include "core/fpdfapi/parser/cpdf_crypto_handler.h"
17 #include "core/fpdfapi/parser/cpdf_dictionary.h"
18 #include "core/fpdfapi/parser/cpdf_name.h"
19 #include "core/fpdfapi/parser/cpdf_null.h"
20 #include "core/fpdfapi/parser/cpdf_number.h"
21 #include "core/fpdfapi/parser/cpdf_read_validator.h"
22 #include "core/fpdfapi/parser/cpdf_reference.h"
23 #include "core/fpdfapi/parser/cpdf_stream.h"
24 #include "core/fpdfapi/parser/cpdf_string.h"
25 #include "core/fpdfapi/parser/fpdf_parser_utility.h"
26 #include "core/fxcrt/autorestorer.h"
27 #include "core/fxcrt/cfx_read_only_vector_stream.h"
28 #include "core/fxcrt/check.h"
29 #include "core/fxcrt/check_op.h"
30 #include "core/fxcrt/data_vector.h"
31 #include "core/fxcrt/fixed_size_data_vector.h"
32 #include "core/fxcrt/fx_extension.h"
33 #include "core/fxcrt/fx_memcpy_wrappers.h"
34 #include "core/fxcrt/fx_safe_types.h"
35 #include "core/fxcrt/stl_util.h"
36
37 namespace {
38
// States of the escape-handling state machine that
// CPDF_SyntaxParser::ReadString() runs over the bytes of a literal string.
enum class ReadStatus {
  // Ordinary bytes: parenthesis nesting is tracked and a backslash switches
  // to kBackslash.
  kNormal,
  // The previous byte was a backslash.
  kBackslash,
  // One octal digit of a backslash escape has been read.
  kOctal,
  // Two octal digits of a backslash escape have been read.
  kFinishOctal,
  // A backslash followed by '\r' has been read.
  kCarriageReturn
};
46
47 class ReadableSubStream final : public IFX_SeekableReadStream {
48 public:
ReadableSubStream(RetainPtr<IFX_SeekableReadStream> pFileRead,FX_FILESIZE part_offset,FX_FILESIZE part_size)49 ReadableSubStream(RetainPtr<IFX_SeekableReadStream> pFileRead,
50 FX_FILESIZE part_offset,
51 FX_FILESIZE part_size)
52 : m_pFileRead(std::move(pFileRead)),
53 m_PartOffset(part_offset),
54 m_PartSize(part_size) {}
55
56 ~ReadableSubStream() override = default;
57
58 // IFX_SeekableReadStream overrides:
ReadBlockAtOffset(pdfium::span<uint8_t> buffer,FX_FILESIZE offset)59 bool ReadBlockAtOffset(pdfium::span<uint8_t> buffer,
60 FX_FILESIZE offset) override {
61 FX_SAFE_FILESIZE safe_end = offset;
62 safe_end += buffer.size();
63 // Check that requested range is valid, to prevent calling of ReadBlock
64 // of original m_pFileRead with incorrect params.
65 if (!safe_end.IsValid() || safe_end.ValueOrDie() > m_PartSize)
66 return false;
67
68 return m_pFileRead->ReadBlockAtOffset(buffer, m_PartOffset + offset);
69 }
70
GetSize()71 FX_FILESIZE GetSize() override { return m_PartSize; }
72
73 private:
74 RetainPtr<IFX_SeekableReadStream> m_pFileRead;
75 FX_FILESIZE m_PartOffset;
76 FX_FILESIZE m_PartSize;
77 };
78
79 } // namespace
80
// static
// Incremented on entry to GetObjectBodyInternal(), which returns nullptr once
// the counter exceeds kParserMaxRecursionDepth.
int CPDF_SyntaxParser::s_CurrentRecursionDepth = 0;
83
84 // static
CreateForTesting(RetainPtr<IFX_SeekableReadStream> pFileAccess,FX_FILESIZE HeaderOffset)85 std::unique_ptr<CPDF_SyntaxParser> CPDF_SyntaxParser::CreateForTesting(
86 RetainPtr<IFX_SeekableReadStream> pFileAccess,
87 FX_FILESIZE HeaderOffset) {
88 return std::make_unique<CPDF_SyntaxParser>(
89 pdfium::MakeRetain<CPDF_ReadValidator>(std::move(pFileAccess), nullptr),
90 HeaderOffset);
91 }
92
// Convenience constructor: wraps `pFileAccess` in a CPDF_ReadValidator (with
// nullptr as its second argument) and delegates to the validator-based
// constructor with a header offset of 0.
CPDF_SyntaxParser::CPDF_SyntaxParser(
    RetainPtr<IFX_SeekableReadStream> pFileAccess)
    : CPDF_SyntaxParser(
          pdfium::MakeRetain<CPDF_ReadValidator>(std::move(pFileAccess),
                                                 nullptr),
          0) {}
99
CPDF_SyntaxParser(RetainPtr<CPDF_ReadValidator> validator,FX_FILESIZE HeaderOffset)100 CPDF_SyntaxParser::CPDF_SyntaxParser(RetainPtr<CPDF_ReadValidator> validator,
101 FX_FILESIZE HeaderOffset)
102 : m_pFileAccess(std::move(validator)),
103 m_HeaderOffset(HeaderOffset),
104 m_FileLen(m_pFileAccess->GetSize()) {
105 DCHECK(m_HeaderOffset <= m_FileLen);
106 }
107
// Defaulted: no explicit teardown is performed here.
CPDF_SyntaxParser::~CPDF_SyntaxParser() = default;
109
GetCharAt(FX_FILESIZE pos,uint8_t & ch)110 bool CPDF_SyntaxParser::GetCharAt(FX_FILESIZE pos, uint8_t& ch) {
111 AutoRestorer<FX_FILESIZE> save_pos(&m_Pos);
112 m_Pos = pos;
113 return GetNextChar(ch);
114 }
115
ReadBlockAt(FX_FILESIZE read_pos)116 bool CPDF_SyntaxParser::ReadBlockAt(FX_FILESIZE read_pos) {
117 if (read_pos >= m_FileLen)
118 return false;
119 size_t read_size = m_ReadBufferSize;
120 FX_SAFE_FILESIZE safe_end = read_pos;
121 safe_end += read_size;
122 if (!safe_end.IsValid() || safe_end.ValueOrDie() > m_FileLen)
123 read_size = m_FileLen - read_pos;
124
125 m_pFileBuf.resize(read_size);
126 if (!m_pFileAccess->ReadBlockAtOffset(m_pFileBuf, read_pos)) {
127 m_pFileBuf.clear();
128 return false;
129 }
130
131 m_BufOffset = read_pos;
132 return true;
133 }
134
GetNextChar(uint8_t & ch)135 bool CPDF_SyntaxParser::GetNextChar(uint8_t& ch) {
136 FX_FILESIZE pos = m_Pos + m_HeaderOffset;
137 if (pos >= m_FileLen)
138 return false;
139
140 if (!IsPositionRead(pos) && !ReadBlockAt(pos))
141 return false;
142
143 ch = m_pFileBuf[pos - m_BufOffset];
144 m_Pos++;
145 return true;
146 }
147
GetDocumentSize() const148 FX_FILESIZE CPDF_SyntaxParser::GetDocumentSize() const {
149 return m_FileLen - m_HeaderOffset;
150 }
151
GetCharAtBackward(FX_FILESIZE pos,uint8_t * ch)152 bool CPDF_SyntaxParser::GetCharAtBackward(FX_FILESIZE pos, uint8_t* ch) {
153 pos += m_HeaderOffset;
154 if (pos >= m_FileLen)
155 return false;
156
157 if (!IsPositionRead(pos)) {
158 FX_FILESIZE block_start = 0;
159 if (pos >= CPDF_Stream::kFileBufSize)
160 block_start = pos - CPDF_Stream::kFileBufSize + 1;
161 if (!ReadBlockAt(block_start) || !IsPositionRead(pos))
162 return false;
163 }
164 *ch = m_pFileBuf[pos - m_BufOffset];
165 return true;
166 }
167
ReadBlock(pdfium::span<uint8_t> buffer)168 bool CPDF_SyntaxParser::ReadBlock(pdfium::span<uint8_t> buffer) {
169 if (!m_pFileAccess->ReadBlockAtOffset(buffer, m_Pos + m_HeaderOffset))
170 return false;
171 m_Pos += buffer.size();
172 return true;
173 }
174
GetNextWordInternal()175 CPDF_SyntaxParser::WordType CPDF_SyntaxParser::GetNextWordInternal() {
176 m_WordSize = 0;
177 WordType word_type = WordType::kNumber;
178
179 ToNextWord();
180 uint8_t ch;
181 if (!GetNextChar(ch))
182 return word_type;
183
184 if (PDFCharIsDelimiter(ch)) {
185 word_type = WordType::kWord;
186
187 m_WordBuffer[m_WordSize++] = ch;
188 if (ch == '/') {
189 while (true) {
190 if (!GetNextChar(ch))
191 return word_type;
192
193 if (!PDFCharIsOther(ch) && !PDFCharIsNumeric(ch)) {
194 m_Pos--;
195 return word_type;
196 }
197
198 if (m_WordSize < sizeof(m_WordBuffer) - 1)
199 m_WordBuffer[m_WordSize++] = ch;
200 }
201 } else if (ch == '<') {
202 if (!GetNextChar(ch))
203 return word_type;
204
205 if (ch == '<')
206 m_WordBuffer[m_WordSize++] = ch;
207 else
208 m_Pos--;
209 } else if (ch == '>') {
210 if (!GetNextChar(ch))
211 return word_type;
212
213 if (ch == '>')
214 m_WordBuffer[m_WordSize++] = ch;
215 else
216 m_Pos--;
217 }
218 return word_type;
219 }
220
221 while (true) {
222 if (m_WordSize < sizeof(m_WordBuffer) - 1)
223 m_WordBuffer[m_WordSize++] = ch;
224
225 if (!PDFCharIsNumeric(ch))
226 word_type = WordType::kWord;
227
228 if (!GetNextChar(ch))
229 return word_type;
230
231 if (PDFCharIsDelimiter(ch) || PDFCharIsWhitespace(ch)) {
232 m_Pos--;
233 break;
234 }
235 }
236 return word_type;
237 }
238
ReadString()239 ByteString CPDF_SyntaxParser::ReadString() {
240 uint8_t ch;
241 if (!GetNextChar(ch))
242 return ByteString();
243
244 ByteString buf;
245 int32_t parlevel = 0;
246 ReadStatus status = ReadStatus::kNormal;
247 int32_t iEscCode = 0;
248 while (true) {
249 switch (status) {
250 case ReadStatus::kNormal:
251 if (ch == ')') {
252 if (parlevel == 0)
253 return ByteString(buf);
254 parlevel--;
255 } else if (ch == '(') {
256 parlevel++;
257 }
258 if (ch == '\\')
259 status = ReadStatus::kBackslash;
260 else
261 buf += static_cast<char>(ch);
262 break;
263 case ReadStatus::kBackslash:
264 if (FXSYS_IsOctalDigit(ch)) {
265 iEscCode = FXSYS_DecimalCharToInt(static_cast<wchar_t>(ch));
266 status = ReadStatus::kOctal;
267 break;
268 }
269 if (ch == '\r') {
270 status = ReadStatus::kCarriageReturn;
271 break;
272 }
273 if (ch == 'n') {
274 buf += '\n';
275 } else if (ch == 'r') {
276 buf += '\r';
277 } else if (ch == 't') {
278 buf += '\t';
279 } else if (ch == 'b') {
280 buf += '\b';
281 } else if (ch == 'f') {
282 buf += '\f';
283 } else if (ch != '\n') {
284 buf += static_cast<char>(ch);
285 }
286 status = ReadStatus::kNormal;
287 break;
288 case ReadStatus::kOctal:
289 if (FXSYS_IsOctalDigit(ch)) {
290 iEscCode =
291 iEscCode * 8 + FXSYS_DecimalCharToInt(static_cast<wchar_t>(ch));
292 status = ReadStatus::kFinishOctal;
293 } else {
294 buf += static_cast<char>(iEscCode);
295 status = ReadStatus::kNormal;
296 continue;
297 }
298 break;
299 case ReadStatus::kFinishOctal:
300 status = ReadStatus::kNormal;
301 if (FXSYS_IsOctalDigit(ch)) {
302 iEscCode =
303 iEscCode * 8 + FXSYS_DecimalCharToInt(static_cast<wchar_t>(ch));
304 buf += static_cast<char>(iEscCode);
305 } else {
306 buf += static_cast<char>(iEscCode);
307 continue;
308 }
309 break;
310 case ReadStatus::kCarriageReturn:
311 status = ReadStatus::kNormal;
312 if (ch != '\n')
313 continue;
314 break;
315 }
316
317 if (!GetNextChar(ch))
318 break;
319 }
320
321 GetNextChar(ch);
322 return buf;
323 }
324
ReadHexString()325 DataVector<uint8_t> CPDF_SyntaxParser::ReadHexString() {
326 uint8_t ch;
327 if (!GetNextChar(ch)) {
328 return DataVector<uint8_t>();
329 }
330
331 DataVector<uint8_t> buf;
332 bool bFirst = true;
333 uint8_t code = 0;
334 while (true) {
335 if (ch == '>')
336 break;
337
338 if (isxdigit(ch)) {
339 int val = FXSYS_HexCharToInt(ch);
340 if (bFirst) {
341 code = val * 16;
342 } else {
343 code += val;
344 buf.push_back(code);
345 }
346 bFirst = !bFirst;
347 }
348
349 if (!GetNextChar(ch)) {
350 break;
351 }
352 }
353 if (!bFirst) {
354 buf.push_back(code);
355 }
356
357 return buf;
358 }
359
ToNextLine()360 void CPDF_SyntaxParser::ToNextLine() {
361 uint8_t ch;
362 while (GetNextChar(ch)) {
363 if (ch == '\n')
364 break;
365
366 if (ch == '\r') {
367 GetNextChar(ch);
368 if (ch != '\n')
369 --m_Pos;
370 break;
371 }
372 }
373 }
374
ToNextWord()375 void CPDF_SyntaxParser::ToNextWord() {
376 if (m_TrailerEnds) {
377 RecordingToNextWord();
378 return;
379 }
380
381 uint8_t ch;
382 if (!GetNextChar(ch))
383 return;
384
385 while (true) {
386 while (PDFCharIsWhitespace(ch)) {
387 if (!GetNextChar(ch))
388 return;
389 }
390
391 if (ch != '%')
392 break;
393
394 while (true) {
395 if (!GetNextChar(ch))
396 return;
397 if (PDFCharIsLineEnding(ch))
398 break;
399 }
400 }
401 m_Pos--;
402 }
403
// A state machine which goes % -> E -> O -> F -> line ending.
// Used by CPDF_SyntaxParser::RecordingToNextWord(); the state goes back to
// kInitial after every line-ending byte.
enum class EofState {
  // At the start of a line; only whitespace has been seen so far.
  kInitial = 0,
  // The first non-whitespace byte was not '%': the next word starts here.
  kNonPercent,
  // Inside a comment that so far consists of '%' bytes only.
  kPercent,
  // "%E" has been seen.
  kE,
  // "%EO" has been seen.
  kO,
  // "%EOF" has been seen; the next byte decides whether it gets recorded.
  kF,
  // Inside a comment that is of no further interest on this line.
  kInvalid,
};
414
RecordingToNextWord()415 void CPDF_SyntaxParser::RecordingToNextWord() {
416 DCHECK(m_TrailerEnds);
417
418 EofState eof_state = EofState::kInitial;
419 // Find the first character which is neither whitespace, nor part of a
420 // comment.
421 while (true) {
422 uint8_t ch;
423 if (!GetNextChar(ch))
424 return;
425 switch (eof_state) {
426 case EofState::kInitial:
427 if (!PDFCharIsWhitespace(ch))
428 eof_state = ch == '%' ? EofState::kPercent : EofState::kNonPercent;
429 break;
430 case EofState::kNonPercent:
431 break;
432 case EofState::kPercent:
433 if (ch == 'E')
434 eof_state = EofState::kE;
435 else if (ch != '%')
436 eof_state = EofState::kInvalid;
437 break;
438 case EofState::kE:
439 eof_state = ch == 'O' ? EofState::kO : EofState::kInvalid;
440 break;
441 case EofState::kO:
442 eof_state = ch == 'F' ? EofState::kF : EofState::kInvalid;
443 break;
444 case EofState::kF:
445 if (ch == '\r') {
446 // See if \r has to be combined with a \n that follows it
447 // immediately.
448 if (GetNextChar(ch) && ch != '\n') {
449 ch = '\r';
450 m_Pos--;
451 }
452 }
453 // If we now have a \r, that's not followed by a \n, so both are OK.
454 if (ch == '\r' || ch == '\n')
455 m_TrailerEnds->push_back(m_Pos);
456 eof_state = EofState::kInvalid;
457 break;
458 case EofState::kInvalid:
459 break;
460 }
461 if (PDFCharIsLineEnding(ch))
462 eof_state = EofState::kInitial;
463 if (eof_state == EofState::kNonPercent)
464 break;
465 }
466 m_Pos--;
467 }
468
GetNextWord()469 CPDF_SyntaxParser::WordResult CPDF_SyntaxParser::GetNextWord() {
470 CPDF_ReadValidator::ScopedSession read_session(GetValidator());
471 WordType word_type = GetNextWordInternal();
472 ByteStringView word;
473 if (!GetValidator()->has_read_problems()) {
474 word = ByteStringView(pdfium::make_span(m_WordBuffer).first(m_WordSize));
475 }
476 return {ByteString(word), word_type == WordType::kNumber};
477 }
478
PeekNextWord()479 ByteString CPDF_SyntaxParser::PeekNextWord() {
480 AutoRestorer<FX_FILESIZE> save_pos(&m_Pos);
481 return GetNextWord().word;
482 }
483
GetKeyword()484 ByteString CPDF_SyntaxParser::GetKeyword() {
485 return GetNextWord().word;
486 }
487
SetPos(FX_FILESIZE pos)488 void CPDF_SyntaxParser::SetPos(FX_FILESIZE pos) {
489 DCHECK_GE(pos, 0);
490 m_Pos = std::min(pos, m_FileLen);
491 }
492
GetObjectBody(CPDF_IndirectObjectHolder * pObjList)493 RetainPtr<CPDF_Object> CPDF_SyntaxParser::GetObjectBody(
494 CPDF_IndirectObjectHolder* pObjList) {
495 CPDF_ReadValidator::ScopedSession read_session(GetValidator());
496 auto result = GetObjectBodyInternal(pObjList, ParseType::kLoose);
497 if (GetValidator()->has_read_problems())
498 return nullptr;
499 return result;
500 }
501
GetObjectBodyInternal(CPDF_IndirectObjectHolder * pObjList,ParseType parse_type)502 RetainPtr<CPDF_Object> CPDF_SyntaxParser::GetObjectBodyInternal(
503 CPDF_IndirectObjectHolder* pObjList,
504 ParseType parse_type) {
505 AutoRestorer<int> depth_restorer(&s_CurrentRecursionDepth);
506 if (++s_CurrentRecursionDepth > kParserMaxRecursionDepth)
507 return nullptr;
508
509 FX_FILESIZE SavedObjPos = m_Pos;
510 WordResult word_result = GetNextWord();
511 const ByteString& word = word_result.word;
512 if (word.IsEmpty())
513 return nullptr;
514
515 if (word_result.is_number) {
516 AutoRestorer<FX_FILESIZE> pos_restorer(&m_Pos);
517 WordResult nextword = GetNextWord();
518 if (!nextword.is_number)
519 return pdfium::MakeRetain<CPDF_Number>(word.AsStringView());
520
521 WordResult nextword2 = GetNextWord();
522 if (nextword2.word != "R")
523 return pdfium::MakeRetain<CPDF_Number>(word.AsStringView());
524
525 pos_restorer.AbandonRestoration();
526 uint32_t refnum = FXSYS_atoui(word.c_str());
527 if (refnum == CPDF_Object::kInvalidObjNum)
528 return nullptr;
529
530 return pdfium::MakeRetain<CPDF_Reference>(pObjList, refnum);
531 }
532
533 if (word == "true" || word == "false")
534 return pdfium::MakeRetain<CPDF_Boolean>(word == "true");
535
536 if (word == "null")
537 return pdfium::MakeRetain<CPDF_Null>();
538
539 if (word == "(") {
540 return pdfium::MakeRetain<CPDF_String>(m_pPool, ReadString());
541 }
542 if (word == "<") {
543 return pdfium::MakeRetain<CPDF_String>(m_pPool, ReadHexString(),
544 CPDF_String::DataType::kIsHex);
545 }
546 if (word == "[") {
547 auto pArray = pdfium::MakeRetain<CPDF_Array>();
548 while (RetainPtr<CPDF_Object> pObj =
549 GetObjectBodyInternal(pObjList, ParseType::kLoose)) {
550 // `pObj` cannot be a stream, per ISO 32000-1:2008 section 7.3.8.1.
551 if (!pObj->IsStream()) {
552 pArray->Append(std::move(pObj));
553 }
554 }
555 return (parse_type == ParseType::kLoose || m_WordBuffer[0] == ']')
556 ? std::move(pArray)
557 : nullptr;
558 }
559 if (word[0] == '/') {
560 auto word_span = pdfium::make_span(m_WordBuffer).first(m_WordSize);
561 return pdfium::MakeRetain<CPDF_Name>(
562 m_pPool, PDF_NameDecode(ByteStringView(word_span).Substr(1)));
563 }
564 if (word == "<<") {
565 RetainPtr<CPDF_Dictionary> pDict =
566 pdfium::MakeRetain<CPDF_Dictionary>(m_pPool);
567 while (true) {
568 WordResult inner_word_result = GetNextWord();
569 const ByteString& inner_word = inner_word_result.word;
570 if (inner_word.IsEmpty())
571 return nullptr;
572
573 FX_FILESIZE SavedPos = m_Pos - inner_word.GetLength();
574 if (inner_word == ">>")
575 break;
576
577 if (inner_word == "endobj") {
578 m_Pos = SavedPos;
579 break;
580 }
581 if (inner_word[0] != '/')
582 continue;
583
584 ByteString key = PDF_NameDecode(inner_word.AsStringView());
585 if (key.IsEmpty() && parse_type == ParseType::kLoose)
586 continue;
587
588 RetainPtr<CPDF_Object> pObj =
589 GetObjectBodyInternal(pObjList, ParseType::kLoose);
590 if (!pObj) {
591 if (parse_type == ParseType::kLoose)
592 continue;
593
594 ToNextLine();
595 return nullptr;
596 }
597
598 // `key` has to be "/X" at the minimum.
599 // `pObj` cannot be a stream, per ISO 32000-1:2008 section 7.3.8.1.
600 if (key.GetLength() > 1 && !pObj->IsStream()) {
601 pDict->SetFor(key.Substr(1), std::move(pObj));
602 }
603 }
604
605 AutoRestorer<FX_FILESIZE> pos_restorer(&m_Pos);
606 if (GetNextWord().word != "stream")
607 return pDict;
608 pos_restorer.AbandonRestoration();
609 return ReadStream(std::move(pDict));
610 }
611 if (word == ">>")
612 m_Pos = SavedObjPos;
613
614 return nullptr;
615 }
616
GetIndirectObject(CPDF_IndirectObjectHolder * pObjList,ParseType parse_type)617 RetainPtr<CPDF_Object> CPDF_SyntaxParser::GetIndirectObject(
618 CPDF_IndirectObjectHolder* pObjList,
619 ParseType parse_type) {
620 CPDF_ReadValidator::ScopedSession read_session(GetValidator());
621 const FX_FILESIZE saved_pos = GetPos();
622
623 WordResult objnum_word_result = GetNextWord();
624 if (!objnum_word_result.is_number || objnum_word_result.word.IsEmpty()) {
625 SetPos(saved_pos);
626 return nullptr;
627 }
628 const uint32_t parser_objnum = FXSYS_atoui(objnum_word_result.word.c_str());
629
630 WordResult gennum_word_result = GetNextWord();
631 const ByteString& gennum_word = gennum_word_result.word;
632 if (!gennum_word_result.is_number || gennum_word.IsEmpty()) {
633 SetPos(saved_pos);
634 return nullptr;
635 }
636 const uint32_t parser_gennum = FXSYS_atoui(gennum_word.c_str());
637
638 if (GetKeyword() != "obj") {
639 SetPos(saved_pos);
640 return nullptr;
641 }
642
643 RetainPtr<CPDF_Object> pObj = GetObjectBodyInternal(pObjList, parse_type);
644 if (pObj) {
645 pObj->SetObjNum(parser_objnum);
646 pObj->SetGenNum(parser_gennum);
647 }
648
649 return GetValidator()->has_read_problems() ? nullptr : pObj;
650 }
651
ReadEOLMarkers(FX_FILESIZE pos)652 unsigned int CPDF_SyntaxParser::ReadEOLMarkers(FX_FILESIZE pos) {
653 unsigned char byte1 = 0;
654 unsigned char byte2 = 0;
655
656 GetCharAt(pos, byte1);
657 GetCharAt(pos + 1, byte2);
658
659 if (byte1 == '\r' && byte2 == '\n')
660 return 2;
661
662 if (byte1 == '\r' || byte1 == '\n')
663 return 1;
664
665 return 0;
666 }
667
FindWordPos(ByteStringView word)668 FX_FILESIZE CPDF_SyntaxParser::FindWordPos(ByteStringView word) {
669 AutoRestorer<FX_FILESIZE> pos_restorer(&m_Pos);
670 FX_FILESIZE end_offset = FindTag(word);
671 while (end_offset >= 0) {
672 // Stop searching when word is found.
673 if (IsWholeWord(GetPos() - word.GetLength(), m_FileLen, word, true))
674 return GetPos() - word.GetLength();
675
676 end_offset = FindTag(word);
677 }
678 return -1;
679 }
680
FindStreamEndPos()681 FX_FILESIZE CPDF_SyntaxParser::FindStreamEndPos() {
682 const ByteStringView kEndStreamStr("endstream");
683 const ByteStringView kEndObjStr("endobj");
684
685 FX_FILESIZE endStreamWordOffset = FindWordPos(kEndStreamStr);
686 FX_FILESIZE endObjWordOffset = FindWordPos(kEndObjStr);
687
688 // Can't find "endstream" or "endobj".
689 if (endStreamWordOffset < 0 && endObjWordOffset < 0) {
690 return -1;
691 }
692
693 if (endStreamWordOffset < 0 && endObjWordOffset >= 0) {
694 // Correct the position of end stream.
695 endStreamWordOffset = endObjWordOffset;
696 } else if (endStreamWordOffset >= 0 && endObjWordOffset < 0) {
697 // Correct the position of end obj.
698 endObjWordOffset = endStreamWordOffset;
699 } else if (endStreamWordOffset > endObjWordOffset) {
700 endStreamWordOffset = endObjWordOffset;
701 }
702
703 int numMarkers = ReadEOLMarkers(endStreamWordOffset - 2);
704 if (numMarkers == 2) {
705 endStreamWordOffset -= 2;
706 } else {
707 numMarkers = ReadEOLMarkers(endStreamWordOffset - 1);
708 if (numMarkers == 1) {
709 endStreamWordOffset -= 1;
710 }
711 }
712 if (endStreamWordOffset < GetPos()) {
713 return -1;
714 }
715 return endStreamWordOffset;
716 }
717
ReadStream(RetainPtr<CPDF_Dictionary> pDict)718 RetainPtr<CPDF_Stream> CPDF_SyntaxParser::ReadStream(
719 RetainPtr<CPDF_Dictionary> pDict) {
720 RetainPtr<const CPDF_Number> pLenObj =
721 ToNumber(pDict->GetDirectObjectFor("Length"));
722 FX_FILESIZE len = pLenObj ? pLenObj->GetInteger() : -1;
723
724 // Locate the start of stream.
725 ToNextLine();
726 const FX_FILESIZE streamStartPos = GetPos();
727
728 if (len > 0) {
729 FX_SAFE_FILESIZE pos = GetPos();
730 pos += len;
731 if (!pos.IsValid() || pos.ValueOrDie() >= m_FileLen)
732 len = -1;
733 }
734
735 RetainPtr<IFX_SeekableReadStream> substream;
736 if (len > 0) {
737 // Check data availability first to allow the Validator to request data
738 // smoothly, without jumps.
739 if (!GetValidator()->CheckDataRangeAndRequestIfUnavailable(
740 m_HeaderOffset + GetPos(), len)) {
741 return nullptr;
742 }
743
744 substream = pdfium::MakeRetain<ReadableSubStream>(
745 GetValidator(), m_HeaderOffset + GetPos(), len);
746 SetPos(GetPos() + len);
747 }
748
749 const ByteStringView kEndStreamStr("endstream");
750 const ByteStringView kEndObjStr("endobj");
751
752 // Note, we allow zero length streams as we need to pass them through when we
753 // are importing pages into a new document.
754 if (len >= 0) {
755 CPDF_ReadValidator::ScopedSession read_session(GetValidator());
756 m_Pos += ReadEOLMarkers(GetPos());
757 const size_t zap_length = kEndStreamStr.GetLength() + 1;
758 fxcrt::Fill(pdfium::make_span(m_WordBuffer).first(zap_length), 0);
759 GetNextWordInternal();
760 if (GetValidator()->has_read_problems())
761 return nullptr;
762
763 // Earlier version of PDF specification doesn't require EOL marker before
764 // 'endstream' keyword. If keyword 'endstream' follows the bytes in
765 // specified length, it signals the end of stream.
766 if (memcmp(m_WordBuffer.data(), kEndStreamStr.unterminated_unsigned_str(),
767 kEndStreamStr.GetLength()) != 0) {
768 substream.Reset();
769 len = -1;
770 SetPos(streamStartPos);
771 }
772 }
773
774 if (len < 0) {
775 // If len is not available or incorrect, len needs to be calculated
776 // by searching the keywords "endstream" or "endobj".
777 const FX_FILESIZE streamEndPos = FindStreamEndPos();
778 if (streamEndPos < 0)
779 return nullptr;
780
781 len = streamEndPos - streamStartPos;
782 DCHECK_GE(len, 0);
783 if (len > 0) {
784 SetPos(streamStartPos);
785 // Check data availability first to allow the Validator to request data
786 // smoothly, without jumps.
787 if (!GetValidator()->CheckDataRangeAndRequestIfUnavailable(
788 m_HeaderOffset + GetPos(), len)) {
789 return nullptr;
790 }
791
792 substream = pdfium::MakeRetain<ReadableSubStream>(
793 GetValidator(), m_HeaderOffset + GetPos(), len);
794 SetPos(GetPos() + len);
795 }
796 }
797
798 RetainPtr<CPDF_Stream> stream;
799 if (substream) {
800 // It is unclear from CPDF_SyntaxParser's perspective what object
801 // `substream` is ultimately holding references to. To avoid unexpectedly
802 // changing object lifetimes by handing `substream` to `stream`, make a
803 // copy of the data here.
804 auto data = FixedSizeDataVector<uint8_t>::Uninit(substream->GetSize());
805 bool did_read = substream->ReadBlockAtOffset(data.span(), 0);
806 CHECK(did_read);
807 auto data_as_stream =
808 pdfium::MakeRetain<CFX_ReadOnlyVectorStream>(std::move(data));
809
810 stream = pdfium::MakeRetain<CPDF_Stream>(std::move(data_as_stream),
811 std::move(pDict));
812 } else {
813 DCHECK(!len);
814 stream = pdfium::MakeRetain<CPDF_Stream>(std::move(pDict));
815 }
816 const FX_FILESIZE end_stream_offset = GetPos();
817 const size_t zap_length = kEndObjStr.GetLength() + 1;
818 fxcrt::Fill(pdfium::make_span(m_WordBuffer).first(zap_length), 0);
819 GetNextWordInternal();
820
821 // Allow whitespace after endstream and before a newline.
822 unsigned char ch = 0;
823 while (GetNextChar(ch)) {
824 if (!PDFCharIsWhitespace(ch) || PDFCharIsLineEnding(ch))
825 break;
826 }
827 SetPos(GetPos() - 1);
828
829 int numMarkers = ReadEOLMarkers(GetPos());
830 if (m_WordSize == static_cast<unsigned int>(kEndObjStr.GetLength()) &&
831 numMarkers != 0 &&
832 memcmp(m_WordBuffer.data(), kEndObjStr.unterminated_unsigned_str(),
833 kEndObjStr.GetLength()) == 0) {
834 SetPos(end_stream_offset);
835 }
836 return stream;
837 }
838
GetDirectNum()839 uint32_t CPDF_SyntaxParser::GetDirectNum() {
840 if (GetNextWordInternal() != WordType::kNumber)
841 return 0;
842
843 m_WordBuffer[m_WordSize] = 0;
844 return FXSYS_atoui(pdfium::as_chars(pdfium::make_span(m_WordBuffer)).data());
845 }
846
// Returns the read validator this parser reads the file through.
RetainPtr<CPDF_ReadValidator> CPDF_SyntaxParser::GetValidator() const {
  return m_pFileAccess;
}
850
IsWholeWord(FX_FILESIZE startpos,FX_FILESIZE limit,ByteStringView tag,bool checkKeyword)851 bool CPDF_SyntaxParser::IsWholeWord(FX_FILESIZE startpos,
852 FX_FILESIZE limit,
853 ByteStringView tag,
854 bool checkKeyword) {
855 const uint32_t taglen = tag.GetLength();
856
857 bool bCheckLeft = !PDFCharIsDelimiter(tag[0]) && !PDFCharIsWhitespace(tag[0]);
858 bool bCheckRight = !PDFCharIsDelimiter(tag[taglen - 1]) &&
859 !PDFCharIsWhitespace(tag[taglen - 1]);
860
861 uint8_t ch;
862 if (bCheckRight && startpos + static_cast<int32_t>(taglen) <= limit &&
863 GetCharAt(startpos + static_cast<int32_t>(taglen), ch)) {
864 if (PDFCharIsNumeric(ch) || PDFCharIsOther(ch) ||
865 (checkKeyword && PDFCharIsDelimiter(ch))) {
866 return false;
867 }
868 }
869
870 if (bCheckLeft && startpos > 0 && GetCharAt(startpos - 1, ch)) {
871 if (PDFCharIsNumeric(ch) || PDFCharIsOther(ch) ||
872 (checkKeyword && PDFCharIsDelimiter(ch))) {
873 return false;
874 }
875 }
876 return true;
877 }
878
BackwardsSearchToWord(ByteStringView word,FX_FILESIZE limit)879 bool CPDF_SyntaxParser::BackwardsSearchToWord(ByteStringView word,
880 FX_FILESIZE limit) {
881 int32_t taglen = word.GetLength();
882 if (taglen == 0)
883 return false;
884
885 FX_FILESIZE pos = m_Pos;
886 int32_t offset = taglen - 1;
887 while (true) {
888 if (limit && pos <= m_Pos - limit)
889 return false;
890
891 uint8_t byte;
892 if (!GetCharAtBackward(pos, &byte))
893 return false;
894
895 if (byte == word[offset]) {
896 offset--;
897 if (offset >= 0) {
898 pos--;
899 continue;
900 }
901 if (IsWholeWord(pos, limit, word, false)) {
902 m_Pos = pos;
903 return true;
904 }
905 }
906 offset = byte == word[taglen - 1] ? taglen - 2 : taglen - 1;
907 pos--;
908 if (pos < 0)
909 return false;
910 }
911 }
912
FindTag(ByteStringView tag)913 FX_FILESIZE CPDF_SyntaxParser::FindTag(ByteStringView tag) {
914 const FX_FILESIZE startpos = GetPos();
915 const int32_t taglen = tag.GetLength();
916 DCHECK_GT(taglen, 0);
917
918 int32_t match = 0;
919 while (true) {
920 uint8_t ch;
921 if (!GetNextChar(ch))
922 return -1;
923
924 if (ch == tag[match]) {
925 match++;
926 if (match == taglen)
927 return GetPos() - startpos - taglen;
928 } else {
929 match = ch == tag[0] ? 1 : 0;
930 }
931 }
932 }
933
IsPositionRead(FX_FILESIZE pos) const934 bool CPDF_SyntaxParser::IsPositionRead(FX_FILESIZE pos) const {
935 return m_BufOffset <= pos &&
936 pos < static_cast<FX_FILESIZE>(m_BufOffset + m_pFileBuf.size());
937 }
938