1 // Copyright 2016 The PDFium Authors
2 // Use of this source code is governed by a BSD-style license that can be
3 // found in the LICENSE file.
4
5 // Original code copyright 2014 Foxit Software Inc. http://www.foxitsoftware.com
6
7 #include "core/fpdfapi/parser/cpdf_syntax_parser.h"
8
9 #include <ctype.h>
10
11 #include <algorithm>
12 #include <utility>
13
14 #include "core/fpdfapi/parser/cpdf_array.h"
15 #include "core/fpdfapi/parser/cpdf_boolean.h"
16 #include "core/fpdfapi/parser/cpdf_crypto_handler.h"
17 #include "core/fpdfapi/parser/cpdf_dictionary.h"
18 #include "core/fpdfapi/parser/cpdf_name.h"
19 #include "core/fpdfapi/parser/cpdf_null.h"
20 #include "core/fpdfapi/parser/cpdf_number.h"
21 #include "core/fpdfapi/parser/cpdf_read_validator.h"
22 #include "core/fpdfapi/parser/cpdf_reference.h"
23 #include "core/fpdfapi/parser/cpdf_stream.h"
24 #include "core/fpdfapi/parser/cpdf_string.h"
25 #include "core/fpdfapi/parser/fpdf_parser_utility.h"
26 #include "core/fxcrt/autorestorer.h"
27 #include "core/fxcrt/cfx_read_only_vector_stream.h"
28 #include "core/fxcrt/fixed_uninit_data_vector.h"
29 #include "core/fxcrt/fx_extension.h"
30 #include "core/fxcrt/fx_safe_types.h"
31 #include "third_party/base/check.h"
32 #include "third_party/base/check_op.h"
33 #include "third_party/base/numerics/safe_math.h"
34
35 namespace {
36
// Decoder states used by CPDF_SyntaxParser::ReadString() while it walks the
// bytes of a literal string.
enum class ReadStatus {
  kNormal,         // Ordinary character handling.
  kBackslash,      // The previous character was a backslash.
  kOctal,          // One octal digit of an escape code has been read.
  kFinishOctal,    // Two octal digits of an escape code have been read.
  kCarriageReturn  // A backslash followed by '\r' has been read.
};
44
45 class ReadableSubStream final : public IFX_SeekableReadStream {
46 public:
ReadableSubStream(RetainPtr<IFX_SeekableReadStream> pFileRead,FX_FILESIZE part_offset,FX_FILESIZE part_size)47 ReadableSubStream(RetainPtr<IFX_SeekableReadStream> pFileRead,
48 FX_FILESIZE part_offset,
49 FX_FILESIZE part_size)
50 : m_pFileRead(std::move(pFileRead)),
51 m_PartOffset(part_offset),
52 m_PartSize(part_size) {}
53
54 ~ReadableSubStream() override = default;
55
56 // IFX_SeekableReadStream overrides:
ReadBlockAtOffset(pdfium::span<uint8_t> buffer,FX_FILESIZE offset)57 bool ReadBlockAtOffset(pdfium::span<uint8_t> buffer,
58 FX_FILESIZE offset) override {
59 FX_SAFE_FILESIZE safe_end = offset;
60 safe_end += buffer.size();
61 // Check that requested range is valid, to prevent calling of ReadBlock
62 // of original m_pFileRead with incorrect params.
63 if (!safe_end.IsValid() || safe_end.ValueOrDie() > m_PartSize)
64 return false;
65
66 return m_pFileRead->ReadBlockAtOffset(buffer, m_PartOffset + offset);
67 }
68
GetSize()69 FX_FILESIZE GetSize() override { return m_PartSize; }
70
71 private:
72 RetainPtr<IFX_SeekableReadStream> m_pFileRead;
73 FX_FILESIZE m_PartOffset;
74 FX_FILESIZE m_PartSize;
75 };
76
77 } // namespace
78
// static
// Nesting depth of active GetObjectBodyInternal() calls. That function
// increments it on entry, restores it on exit, and refuses to parse once it
// exceeds kParserMaxRecursionDepth.
int CPDF_SyntaxParser::s_CurrentRecursionDepth = 0;
81
82 // static
CreateForTesting(RetainPtr<IFX_SeekableReadStream> pFileAccess,FX_FILESIZE HeaderOffset)83 std::unique_ptr<CPDF_SyntaxParser> CPDF_SyntaxParser::CreateForTesting(
84 RetainPtr<IFX_SeekableReadStream> pFileAccess,
85 FX_FILESIZE HeaderOffset) {
86 return std::make_unique<CPDF_SyntaxParser>(
87 pdfium::MakeRetain<CPDF_ReadValidator>(std::move(pFileAccess), nullptr),
88 HeaderOffset);
89 }
90
// Convenience constructor: wraps `pFileAccess` in a CPDF_ReadValidator
// (created with a null second argument) and delegates to the validator-based
// constructor with a header offset of 0.
CPDF_SyntaxParser::CPDF_SyntaxParser(
    RetainPtr<IFX_SeekableReadStream> pFileAccess)
    : CPDF_SyntaxParser(
          pdfium::MakeRetain<CPDF_ReadValidator>(std::move(pFileAccess),
                                                 nullptr),
          0) {}
97
// Constructs a parser that reads through `validator`. `HeaderOffset` is added
// to every parser position to obtain the offset in the underlying stream, and
// the stream's total size is cached in m_FileLen.
// NOTE(review): m_FileLen's initializer reads m_pFileAccess, so it relies on
// m_pFileAccess being declared before m_FileLen in the class - confirm in the
// header.
CPDF_SyntaxParser::CPDF_SyntaxParser(RetainPtr<CPDF_ReadValidator> validator,
                                     FX_FILESIZE HeaderOffset)
    : m_pFileAccess(std::move(validator)),
      m_HeaderOffset(HeaderOffset),
      m_FileLen(m_pFileAccess->GetSize()) {
  // The header offset must lie within the stream.
  DCHECK(m_HeaderOffset <= m_FileLen);
}
105
// Defaulted out of line.
CPDF_SyntaxParser::~CPDF_SyntaxParser() = default;
107
GetCharAt(FX_FILESIZE pos,uint8_t & ch)108 bool CPDF_SyntaxParser::GetCharAt(FX_FILESIZE pos, uint8_t& ch) {
109 AutoRestorer<FX_FILESIZE> save_pos(&m_Pos);
110 m_Pos = pos;
111 return GetNextChar(ch);
112 }
113
ReadBlockAt(FX_FILESIZE read_pos)114 bool CPDF_SyntaxParser::ReadBlockAt(FX_FILESIZE read_pos) {
115 if (read_pos >= m_FileLen)
116 return false;
117 size_t read_size = m_ReadBufferSize;
118 FX_SAFE_FILESIZE safe_end = read_pos;
119 safe_end += read_size;
120 if (!safe_end.IsValid() || safe_end.ValueOrDie() > m_FileLen)
121 read_size = m_FileLen - read_pos;
122
123 m_pFileBuf.resize(read_size);
124 if (!m_pFileAccess->ReadBlockAtOffset(m_pFileBuf, read_pos)) {
125 m_pFileBuf.clear();
126 return false;
127 }
128
129 m_BufOffset = read_pos;
130 return true;
131 }
132
GetNextChar(uint8_t & ch)133 bool CPDF_SyntaxParser::GetNextChar(uint8_t& ch) {
134 FX_FILESIZE pos = m_Pos + m_HeaderOffset;
135 if (pos >= m_FileLen)
136 return false;
137
138 if (!IsPositionRead(pos) && !ReadBlockAt(pos))
139 return false;
140
141 ch = m_pFileBuf[pos - m_BufOffset];
142 m_Pos++;
143 return true;
144 }
145
GetDocumentSize() const146 FX_FILESIZE CPDF_SyntaxParser::GetDocumentSize() const {
147 return m_FileLen - m_HeaderOffset;
148 }
149
GetCharAtBackward(FX_FILESIZE pos,uint8_t * ch)150 bool CPDF_SyntaxParser::GetCharAtBackward(FX_FILESIZE pos, uint8_t* ch) {
151 pos += m_HeaderOffset;
152 if (pos >= m_FileLen)
153 return false;
154
155 if (!IsPositionRead(pos)) {
156 FX_FILESIZE block_start = 0;
157 if (pos >= CPDF_Stream::kFileBufSize)
158 block_start = pos - CPDF_Stream::kFileBufSize + 1;
159 if (!ReadBlockAt(block_start) || !IsPositionRead(pos))
160 return false;
161 }
162 *ch = m_pFileBuf[pos - m_BufOffset];
163 return true;
164 }
165
ReadBlock(pdfium::span<uint8_t> buffer)166 bool CPDF_SyntaxParser::ReadBlock(pdfium::span<uint8_t> buffer) {
167 if (!m_pFileAccess->ReadBlockAtOffset(buffer, m_Pos + m_HeaderOffset))
168 return false;
169 m_Pos += buffer.size();
170 return true;
171 }
172
// Reads the next token into m_WordBuffer / m_WordSize after skipping leading
// whitespace and comments with ToNextWord().
//
// Returns WordType::kNumber when every character of the token satisfies
// PDFCharIsNumeric(), otherwise WordType::kWord. If no character can be read
// at all, m_WordSize stays 0 and the initial value WordType::kNumber is
// returned, so callers must also look at the word length.
CPDF_SyntaxParser::WordType CPDF_SyntaxParser::GetNextWordInternal() {
  m_WordSize = 0;
  WordType word_type = WordType::kNumber;

  ToNextWord();
  uint8_t ch;
  if (!GetNextChar(ch))
    return word_type;

  if (PDFCharIsDelimiter(ch)) {
    // A token that starts with a delimiter is never a number.
    word_type = WordType::kWord;

    m_WordBuffer[m_WordSize++] = ch;
    if (ch == '/') {
      // Name token: the '/' followed by a run of "other" or numeric
      // characters.
      while (true) {
        if (!GetNextChar(ch))
          return word_type;

        if (!PDFCharIsOther(ch) && !PDFCharIsNumeric(ch)) {
          // Un-read the character that ended the name.
          m_Pos--;
          return word_type;
        }

        // Characters that do not fit in the buffer are consumed but dropped;
        // one slot is always left free at the end.
        if (m_WordSize < sizeof(m_WordBuffer) - 1)
          m_WordBuffer[m_WordSize++] = ch;
      }
    } else if (ch == '<') {
      // Either a lone "<" or the two-character token "<<".
      if (!GetNextChar(ch))
        return word_type;

      if (ch == '<')
        m_WordBuffer[m_WordSize++] = ch;
      else
        m_Pos--;
    } else if (ch == '>') {
      // Either a lone ">" or the two-character token ">>".
      if (!GetNextChar(ch))
        return word_type;

      if (ch == '>')
        m_WordBuffer[m_WordSize++] = ch;
      else
        m_Pos--;
    }
    // Any other delimiter is a one-character token.
    return word_type;
  }

  // Regular token: accumulate until a delimiter, whitespace or end of input.
  while (true) {
    if (m_WordSize < sizeof(m_WordBuffer) - 1)
      m_WordBuffer[m_WordSize++] = ch;

    // A single non-numeric character turns the whole token into a word.
    if (!PDFCharIsNumeric(ch))
      word_type = WordType::kWord;

    if (!GetNextChar(ch))
      return word_type;

    if (PDFCharIsDelimiter(ch) || PDFCharIsWhitespace(ch)) {
      // Leave the terminating character for the next read.
      m_Pos--;
      break;
    }
  }
  return word_type;
}
236
// Decodes a literal string whose opening '(' has already been consumed and
// returns its bytes. Parsing stops at the ')' that balances the opening
// parenthesis, or at end of input, in which case whatever was decoded so far
// is returned.
ByteString CPDF_SyntaxParser::ReadString() {
  uint8_t ch;
  if (!GetNextChar(ch))
    return ByteString();

  ByteString buf;
  // Nesting depth of unescaped '(' seen inside the string.
  int32_t parlevel = 0;
  ReadStatus status = ReadStatus::kNormal;
  // Accumulated value of the octal escape being decoded.
  int32_t iEscCode = 0;
  while (true) {
    switch (status) {
      case ReadStatus::kNormal:
        if (ch == ')') {
          // The ')' matching the opening '(' ends the string.
          if (parlevel == 0)
            return ByteString(buf);
          parlevel--;
        } else if (ch == '(') {
          parlevel++;
        }
        // Nested parentheses are kept in the output; a backslash starts an
        // escape sequence and is not copied.
        if (ch == '\\')
          status = ReadStatus::kBackslash;
        else
          buf += static_cast<char>(ch);
        break;
      case ReadStatus::kBackslash:
        if (FXSYS_IsOctalDigit(ch)) {
          // First digit of an octal escape.
          iEscCode = FXSYS_DecimalCharToInt(static_cast<wchar_t>(ch));
          status = ReadStatus::kOctal;
          break;
        }
        if (ch == '\r') {
          // Backslash + CR: decide in the next state whether an LF follows.
          status = ReadStatus::kCarriageReturn;
          break;
        }
        if (ch == 'n') {
          buf += '\n';
        } else if (ch == 'r') {
          buf += '\r';
        } else if (ch == 't') {
          buf += '\t';
        } else if (ch == 'b') {
          buf += '\b';
        } else if (ch == 'f') {
          buf += '\f';
        } else if (ch != '\n') {
          // Any other escaped character stands for itself; backslash + LF
          // contributes nothing.
          buf += static_cast<char>(ch);
        }
        status = ReadStatus::kNormal;
        break;
      case ReadStatus::kOctal:
        if (FXSYS_IsOctalDigit(ch)) {
          // Second digit of the octal escape.
          iEscCode =
              iEscCode * 8 + FXSYS_DecimalCharToInt(static_cast<wchar_t>(ch));
          status = ReadStatus::kFinishOctal;
        } else {
          // One-digit escape: emit it, then re-process `ch` in the normal
          // state without reading a new character.
          buf += static_cast<char>(iEscCode);
          status = ReadStatus::kNormal;
          continue;
        }
        break;
      case ReadStatus::kFinishOctal:
        status = ReadStatus::kNormal;
        if (FXSYS_IsOctalDigit(ch)) {
          // Third and last digit of the octal escape.
          iEscCode =
              iEscCode * 8 + FXSYS_DecimalCharToInt(static_cast<wchar_t>(ch));
          buf += static_cast<char>(iEscCode);
        } else {
          // Two-digit escape: emit it, then re-process `ch` in the normal
          // state.
          buf += static_cast<char>(iEscCode);
          continue;
        }
        break;
      case ReadStatus::kCarriageReturn:
        status = ReadStatus::kNormal;
        // An LF right after the escaped CR is swallowed; anything else is
        // re-processed in the normal state.
        if (ch != '\n')
          continue;
        break;
    }

    if (!GetNextChar(ch))
      break;
  }

  // Only reached when the input ran out before the closing ')'.
  GetNextChar(ch);
  return buf;
}
322
ReadHexString()323 ByteString CPDF_SyntaxParser::ReadHexString() {
324 uint8_t ch;
325 if (!GetNextChar(ch))
326 return ByteString();
327
328 ByteString buf;
329 bool bFirst = true;
330 uint8_t code = 0;
331 while (true) {
332 if (ch == '>')
333 break;
334
335 if (isxdigit(ch)) {
336 int val = FXSYS_HexCharToInt(ch);
337 if (bFirst) {
338 code = val * 16;
339 } else {
340 code += val;
341 buf += static_cast<char>(code);
342 }
343 bFirst = !bFirst;
344 }
345
346 if (!GetNextChar(ch))
347 break;
348 }
349 if (!bFirst)
350 buf += static_cast<char>(code);
351
352 return buf;
353 }
354
ToNextLine()355 void CPDF_SyntaxParser::ToNextLine() {
356 uint8_t ch;
357 while (GetNextChar(ch)) {
358 if (ch == '\n')
359 break;
360
361 if (ch == '\r') {
362 GetNextChar(ch);
363 if (ch != '\n')
364 --m_Pos;
365 break;
366 }
367 }
368 }
369
ToNextWord()370 void CPDF_SyntaxParser::ToNextWord() {
371 if (m_TrailerEnds) {
372 RecordingToNextWord();
373 return;
374 }
375
376 uint8_t ch;
377 if (!GetNextChar(ch))
378 return;
379
380 while (true) {
381 while (PDFCharIsWhitespace(ch)) {
382 if (!GetNextChar(ch))
383 return;
384 }
385
386 if (ch != '%')
387 break;
388
389 while (true) {
390 if (!GetNextChar(ch))
391 return;
392 if (PDFCharIsLineEnding(ch))
393 break;
394 }
395 }
396 m_Pos--;
397 }
398
// A state machine which goes % -> E -> O -> F -> line ending.
// Used by CPDF_SyntaxParser::RecordingToNextWord() to recognise "%%EOF"
// comment lines while skipping to the next token.
enum class EofState {
  kInitial = 0,  // At the start of a line; only whitespace seen so far.
  kNonPercent,   // First non-whitespace character is not '%': a token starts.
  kPercent,      // Inside a comment; only '%' characters seen so far.
  kE,            // The '%' run was followed by 'E'.
  kO,            // ... followed by 'O'.
  kF,            // ... followed by 'F'; a line ending is expected next.
  kInvalid,      // A comment that is not an EOF marker; skipped to line end.
};
409
// Variant of ToNextWord() used while m_TrailerEnds is set. It skips
// whitespace and comments like ToNextWord(), and additionally appends to
// *m_TrailerEnds the position just after every comment of the form
// '%'... "EOF" that is immediately followed by a line ending.
void CPDF_SyntaxParser::RecordingToNextWord() {
  DCHECK(m_TrailerEnds);

  EofState eof_state = EofState::kInitial;
  // Find the first character which is neither whitespace, nor part of a
  // comment.
  while (true) {
    uint8_t ch;
    if (!GetNextChar(ch))
      return;
    switch (eof_state) {
      case EofState::kInitial:
        // Leading whitespace keeps the state; the first other character
        // decides between "comment" and "token".
        if (!PDFCharIsWhitespace(ch))
          eof_state = ch == '%' ? EofState::kPercent : EofState::kNonPercent;
        break;
      case EofState::kNonPercent:
        break;
      case EofState::kPercent:
        // Any number of additional '%' characters is accepted before the 'E'.
        if (ch == 'E')
          eof_state = EofState::kE;
        else if (ch != '%')
          eof_state = EofState::kInvalid;
        break;
      case EofState::kE:
        eof_state = ch == 'O' ? EofState::kO : EofState::kInvalid;
        break;
      case EofState::kO:
        eof_state = ch == 'F' ? EofState::kF : EofState::kInvalid;
        break;
      case EofState::kF:
        if (ch == '\r') {
          // See if \r has to be combined with a \n that follows it
          // immediately.
          if (GetNextChar(ch) && ch != '\n') {
            ch = '\r';
            m_Pos--;
          }
        }
        // If we now have a \r, that's not followed by a \n, so both are OK.
        if (ch == '\r' || ch == '\n')
          m_TrailerEnds->push_back(m_Pos);
        eof_state = EofState::kInvalid;
        break;
      case EofState::kInvalid:
        break;
    }
    // Every line ending restarts the recognition on the following line.
    if (PDFCharIsLineEnding(ch))
      eof_state = EofState::kInitial;
    if (eof_state == EofState::kNonPercent)
      break;
  }
  // Un-read the first character of the token that ended the scan.
  m_Pos--;
}
463
GetNextWord()464 CPDF_SyntaxParser::WordResult CPDF_SyntaxParser::GetNextWord() {
465 CPDF_ReadValidator::ScopedSession read_session(GetValidator());
466 WordType word_type = GetNextWordInternal();
467 ByteString word;
468 if (!GetValidator()->has_read_problems())
469 word = ByteString(m_WordBuffer, m_WordSize);
470 return {word, word_type == WordType::kNumber};
471 }
472
PeekNextWord()473 ByteString CPDF_SyntaxParser::PeekNextWord() {
474 AutoRestorer<FX_FILESIZE> save_pos(&m_Pos);
475 return GetNextWord().word;
476 }
477
// Reads the next token with GetNextWord() and returns its text, discarding
// the number/word classification.
ByteString CPDF_SyntaxParser::GetKeyword() {
  return GetNextWord().word;
}
481
SetPos(FX_FILESIZE pos)482 void CPDF_SyntaxParser::SetPos(FX_FILESIZE pos) {
483 DCHECK_GE(pos, 0);
484 m_Pos = std::min(pos, m_FileLen);
485 }
486
GetObjectBody(CPDF_IndirectObjectHolder * pObjList)487 RetainPtr<CPDF_Object> CPDF_SyntaxParser::GetObjectBody(
488 CPDF_IndirectObjectHolder* pObjList) {
489 CPDF_ReadValidator::ScopedSession read_session(GetValidator());
490 auto result = GetObjectBodyInternal(pObjList, ParseType::kLoose);
491 if (GetValidator()->has_read_problems())
492 return nullptr;
493 return result;
494 }
495
// Recursive worker that parses one object at the current position.
//
// `pObjList` is handed to any CPDF_Reference that gets created. `parse_type`
// controls how malformed arrays and dictionaries are treated at this level;
// nested values are always parsed with ParseType::kLoose.
//
// Returns null when no object could be parsed, including when the recursion
// depth limit is exceeded.
RetainPtr<CPDF_Object> CPDF_SyntaxParser::GetObjectBodyInternal(
    CPDF_IndirectObjectHolder* pObjList,
    ParseType parse_type) {
  // Track the nesting depth; the restorer undoes the increment on every exit
  // path.
  AutoRestorer<int> depth_restorer(&s_CurrentRecursionDepth);
  if (++s_CurrentRecursionDepth > kParserMaxRecursionDepth)
    return nullptr;

  FX_FILESIZE SavedObjPos = m_Pos;
  WordResult word_result = GetNextWord();
  const ByteString& word = word_result.word;
  if (word.IsEmpty())
    return nullptr;

  if (word_result.is_number) {
    // A number is either a plain number or the start of "<num> <num> R".
    // Look ahead for the reference form; unless it matches, the restorer
    // rewinds to just after the first number.
    AutoRestorer<FX_FILESIZE> pos_restorer(&m_Pos);
    WordResult nextword = GetNextWord();
    if (!nextword.is_number)
      return pdfium::MakeRetain<CPDF_Number>(word.AsStringView());

    WordResult nextword2 = GetNextWord();
    if (nextword2.word != "R")
      return pdfium::MakeRetain<CPDF_Number>(word.AsStringView());

    // It is a reference: keep the position after the "R".
    pos_restorer.AbandonRestoration();
    uint32_t refnum = FXSYS_atoui(word.c_str());
    if (refnum == CPDF_Object::kInvalidObjNum)
      return nullptr;

    return pdfium::MakeRetain<CPDF_Reference>(pObjList, refnum);
  }

  if (word == "true" || word == "false")
    return pdfium::MakeRetain<CPDF_Boolean>(word == "true");

  if (word == "null")
    return pdfium::MakeRetain<CPDF_Null>();

  if (word == "(") {
    ByteString str = ReadString();
    return pdfium::MakeRetain<CPDF_String>(m_pPool, str, false);
  }
  if (word == "<") {
    ByteString str = ReadHexString();
    return pdfium::MakeRetain<CPDF_String>(m_pPool, str, true);
  }
  if (word == "[") {
    // Collect elements until an element fails to parse.
    auto pArray = pdfium::MakeRetain<CPDF_Array>();
    while (RetainPtr<CPDF_Object> pObj =
               GetObjectBodyInternal(pObjList, ParseType::kLoose)) {
      pArray->Append(std::move(pObj));
    }
    // Outside loose mode the array is only accepted if the word that ended
    // the loop starts with ']'.
    return (parse_type == ParseType::kLoose || m_WordBuffer[0] == ']')
               ? std::move(pArray)
               : nullptr;
  }
  if (word[0] == '/') {
    // Name: decode everything after the leading '/'.
    return pdfium::MakeRetain<CPDF_Name>(
        m_pPool,
        PDF_NameDecode(ByteStringView(m_WordBuffer + 1, m_WordSize - 1)));
  }
  if (word == "<<") {
    RetainPtr<CPDF_Dictionary> pDict =
        pdfium::MakeRetain<CPDF_Dictionary>(m_pPool);
    while (true) {
      WordResult inner_word_result = GetNextWord();
      const ByteString& inner_word = inner_word_result.word;
      if (inner_word.IsEmpty())
        return nullptr;

      // Position at which `inner_word` started.
      FX_FILESIZE SavedPos = m_Pos - inner_word.GetLength();
      if (inner_word == ">>")
        break;

      if (inner_word == "endobj") {
        // Unterminated dictionary: un-read "endobj" and stop.
        m_Pos = SavedPos;
        break;
      }
      // Words that are not names cannot be keys; skip them.
      if (inner_word[0] != '/')
        continue;

      ByteString key = PDF_NameDecode(inner_word.AsStringView());
      if (key.IsEmpty() && parse_type == ParseType::kLoose)
        continue;

      RetainPtr<CPDF_Object> pObj =
          GetObjectBodyInternal(pObjList, ParseType::kLoose);
      if (!pObj) {
        // A key without a parsable value is skipped in loose mode and fails
        // the whole dictionary otherwise.
        if (parse_type == ParseType::kLoose)
          continue;

        ToNextLine();
        return nullptr;
      }

      // `key` has to be "/X" at the minimum.
      if (key.GetLength() > 1) {
        pDict->SetFor(key.Substr(1), std::move(pObj));
      }
    }

    // A dictionary directly followed by "stream" is a stream object;
    // otherwise the look-ahead is undone and the dictionary is returned.
    AutoRestorer<FX_FILESIZE> pos_restorer(&m_Pos);
    if (GetNextWord().word != "stream")
      return pDict;
    pos_restorer.AbandonRestoration();
    return ReadStream(std::move(pDict));
  }
  // Un-read a stray ">>" so that the caller gets to see it.
  if (word == ">>")
    m_Pos = SavedObjPos;

  return nullptr;
}
607
GetIndirectObject(CPDF_IndirectObjectHolder * pObjList,ParseType parse_type)608 RetainPtr<CPDF_Object> CPDF_SyntaxParser::GetIndirectObject(
609 CPDF_IndirectObjectHolder* pObjList,
610 ParseType parse_type) {
611 CPDF_ReadValidator::ScopedSession read_session(GetValidator());
612 const FX_FILESIZE saved_pos = GetPos();
613
614 WordResult objnum_word_result = GetNextWord();
615 if (!objnum_word_result.is_number || objnum_word_result.word.IsEmpty()) {
616 SetPos(saved_pos);
617 return nullptr;
618 }
619 const uint32_t parser_objnum = FXSYS_atoui(objnum_word_result.word.c_str());
620
621 WordResult gennum_word_result = GetNextWord();
622 const ByteString& gennum_word = gennum_word_result.word;
623 if (!gennum_word_result.is_number || gennum_word.IsEmpty()) {
624 SetPos(saved_pos);
625 return nullptr;
626 }
627 const uint32_t parser_gennum = FXSYS_atoui(gennum_word.c_str());
628
629 if (GetKeyword() != "obj") {
630 SetPos(saved_pos);
631 return nullptr;
632 }
633
634 RetainPtr<CPDF_Object> pObj = GetObjectBodyInternal(pObjList, parse_type);
635 if (pObj) {
636 pObj->SetObjNum(parser_objnum);
637 pObj->SetGenNum(parser_gennum);
638 }
639
640 return GetValidator()->has_read_problems() ? nullptr : std::move(pObj);
641 }
642
ReadEOLMarkers(FX_FILESIZE pos)643 unsigned int CPDF_SyntaxParser::ReadEOLMarkers(FX_FILESIZE pos) {
644 unsigned char byte1 = 0;
645 unsigned char byte2 = 0;
646
647 GetCharAt(pos, byte1);
648 GetCharAt(pos + 1, byte2);
649
650 if (byte1 == '\r' && byte2 == '\n')
651 return 2;
652
653 if (byte1 == '\r' || byte1 == '\n')
654 return 1;
655
656 return 0;
657 }
658
FindWordPos(ByteStringView word)659 FX_FILESIZE CPDF_SyntaxParser::FindWordPos(ByteStringView word) {
660 AutoRestorer<FX_FILESIZE> pos_restorer(&m_Pos);
661 FX_FILESIZE end_offset = FindTag(word);
662 while (end_offset >= 0) {
663 // Stop searching when word is found.
664 if (IsWholeWord(GetPos() - word.GetLength(), m_FileLen, word, true))
665 return GetPos() - word.GetLength();
666
667 end_offset = FindTag(word);
668 }
669 return -1;
670 }
671
FindStreamEndPos()672 FX_FILESIZE CPDF_SyntaxParser::FindStreamEndPos() {
673 const ByteStringView kEndStreamStr("endstream");
674 const ByteStringView kEndObjStr("endobj");
675
676 FX_FILESIZE endStreamWordOffset = FindWordPos(kEndStreamStr);
677 FX_FILESIZE endObjWordOffset = FindWordPos(kEndObjStr);
678
679 // Can't find "endstream" or "endobj".
680 if (endStreamWordOffset < 0 && endObjWordOffset < 0) {
681 return -1;
682 }
683
684 if (endStreamWordOffset < 0 && endObjWordOffset >= 0) {
685 // Correct the position of end stream.
686 endStreamWordOffset = endObjWordOffset;
687 } else if (endStreamWordOffset >= 0 && endObjWordOffset < 0) {
688 // Correct the position of end obj.
689 endObjWordOffset = endStreamWordOffset;
690 } else if (endStreamWordOffset > endObjWordOffset) {
691 endStreamWordOffset = endObjWordOffset;
692 }
693
694 int numMarkers = ReadEOLMarkers(endStreamWordOffset - 2);
695 if (numMarkers == 2) {
696 endStreamWordOffset -= 2;
697 } else {
698 numMarkers = ReadEOLMarkers(endStreamWordOffset - 1);
699 if (numMarkers == 1) {
700 endStreamWordOffset -= 1;
701 }
702 }
703 if (endStreamWordOffset < GetPos()) {
704 return -1;
705 }
706 return endStreamWordOffset;
707 }
708
// Reads the data of a stream object whose dictionary `pDict` and "stream"
// keyword have already been consumed, and returns the resulting CPDF_Stream.
//
// The "Length" entry of `pDict` is tried first. If it is missing,
// non-positive in a way that cannot be verified, points past the end of the
// file, or is not followed by "endstream", the length is instead derived by
// searching for "endstream" / "endobj". Returns null if the data is not
// available, the validator reports read problems, or no end can be found.
RetainPtr<CPDF_Stream> CPDF_SyntaxParser::ReadStream(
    RetainPtr<CPDF_Dictionary> pDict) {
  RetainPtr<const CPDF_Number> pLenObj =
      ToNumber(pDict->GetDirectObjectFor("Length"));
  // -1 means "length unknown".
  FX_FILESIZE len = pLenObj ? pLenObj->GetInteger() : -1;

  // Locate the start of stream.
  ToNextLine();
  const FX_FILESIZE streamStartPos = GetPos();

  if (len > 0) {
    // Distrust a length that overflows or reaches the end of the file.
    FX_SAFE_FILESIZE pos = GetPos();
    pos += len;
    if (!pos.IsValid() || pos.ValueOrDie() >= m_FileLen)
      len = -1;
  }

  RetainPtr<IFX_SeekableReadStream> substream;
  if (len > 0) {
    // Check data availability first to allow the Validator to request data
    // smoothly, without jumps.
    if (!GetValidator()->CheckDataRangeAndRequestIfUnavailable(
            m_HeaderOffset + GetPos(), len)) {
      return nullptr;
    }

    substream = pdfium::MakeRetain<ReadableSubStream>(
        GetValidator(), m_HeaderOffset + GetPos(), len);
    // Skip over the stream data.
    SetPos(GetPos() + len);
  }

  const ByteStringView kEndStreamStr("endstream");
  const ByteStringView kEndObjStr("endobj");

  // Note, we allow zero length streams as we need to pass them through when we
  // are importing pages into a new document.
  if (len >= 0) {
    CPDF_ReadValidator::ScopedSession read_session(GetValidator());
    // Skip an optional line ending after the data, then read the next word.
    // The buffer is cleared first so that stale content cannot match below.
    m_Pos += ReadEOLMarkers(GetPos());
    memset(m_WordBuffer, 0, kEndStreamStr.GetLength() + 1);
    GetNextWordInternal();
    if (GetValidator()->has_read_problems())
      return nullptr;

    // Earlier version of PDF specification doesn't require EOL marker before
    // 'endstream' keyword. If keyword 'endstream' follows the bytes in
    // specified length, it signals the end of stream.
    if (memcmp(m_WordBuffer, kEndStreamStr.raw_str(),
               kEndStreamStr.GetLength()) != 0) {
      // The declared length is wrong: discard it and start over from the
      // beginning of the data.
      substream.Reset();
      len = -1;
      SetPos(streamStartPos);
    }
  }

  if (len < 0) {
    // If len is not available or incorrect, len needs to be calculated
    // by searching the keywords "endstream" or "endobj".
    const FX_FILESIZE streamEndPos = FindStreamEndPos();
    if (streamEndPos < 0)
      return nullptr;

    len = streamEndPos - streamStartPos;
    DCHECK_GE(len, 0);
    if (len > 0) {
      SetPos(streamStartPos);
      // Check data availability first to allow the Validator to request data
      // smoothly, without jumps.
      if (!GetValidator()->CheckDataRangeAndRequestIfUnavailable(
              m_HeaderOffset + GetPos(), len)) {
        return nullptr;
      }

      substream = pdfium::MakeRetain<ReadableSubStream>(
          GetValidator(), m_HeaderOffset + GetPos(), len);
      SetPos(GetPos() + len);
    }
  }

  RetainPtr<CPDF_Stream> pStream;
  if (substream) {
    // It is unclear from CPDF_SyntaxParser's perspective what object
    // `substream` is ultimately holding references to. To avoid unexpectedly
    // changing object lifetimes by handing `substream` to `pStream`, make a
    // copy of the data here.
    FixedUninitDataVector<uint8_t> data(substream->GetSize());
    bool did_read = substream->ReadBlockAtOffset(data.writable_span(), 0);
    CHECK(did_read);
    auto data_as_stream =
        pdfium::MakeRetain<CFX_ReadOnlyVectorStream>(std::move(data));

    pStream = pdfium::MakeRetain<CPDF_Stream>();
    pStream->InitStreamFromFile(std::move(data_as_stream), std::move(pDict));
  } else {
    // No sub-stream was created, so the stream has no data.
    DCHECK(!len);
    pStream = pdfium::MakeRetain<CPDF_Stream>(std::move(pDict));
  }
  // Remember where the data ended, then look at the word that follows it.
  const FX_FILESIZE end_stream_offset = GetPos();
  memset(m_WordBuffer, 0, kEndObjStr.GetLength() + 1);
  GetNextWordInternal();

  // Allow whitespace after endstream and before a newline.
  unsigned char ch = 0;
  while (GetNextChar(ch)) {
    if (!PDFCharIsWhitespace(ch) || PDFCharIsLineEnding(ch))
      break;
  }
  // Un-read the character that ended the loop.
  // NOTE(review): this also steps back when the loop ended because
  // GetNextChar() failed and nothing was consumed - confirm this is intended.
  SetPos(GetPos() - 1);

  // If the word after the data is "endobj" followed by a line ending (i.e.
  // there was no "endstream"), rewind so that "endobj" is not consumed here.
  int numMarkers = ReadEOLMarkers(GetPos());
  if (m_WordSize == static_cast<unsigned int>(kEndObjStr.GetLength()) &&
      numMarkers != 0 &&
      memcmp(m_WordBuffer, kEndObjStr.raw_str(), kEndObjStr.GetLength()) == 0) {
    SetPos(end_stream_offset);
  }
  return pStream;
}
826
GetDirectNum()827 uint32_t CPDF_SyntaxParser::GetDirectNum() {
828 if (GetNextWordInternal() != WordType::kNumber)
829 return 0;
830
831 m_WordBuffer[m_WordSize] = 0;
832 return FXSYS_atoui(reinterpret_cast<const char*>(m_WordBuffer));
833 }
834
// Returns the read validator through which this parser accesses the file.
RetainPtr<CPDF_ReadValidator> CPDF_SyntaxParser::GetValidator() const {
  return m_pFileAccess;
}
838
IsWholeWord(FX_FILESIZE startpos,FX_FILESIZE limit,ByteStringView tag,bool checkKeyword)839 bool CPDF_SyntaxParser::IsWholeWord(FX_FILESIZE startpos,
840 FX_FILESIZE limit,
841 ByteStringView tag,
842 bool checkKeyword) {
843 const uint32_t taglen = tag.GetLength();
844
845 bool bCheckLeft = !PDFCharIsDelimiter(tag[0]) && !PDFCharIsWhitespace(tag[0]);
846 bool bCheckRight = !PDFCharIsDelimiter(tag[taglen - 1]) &&
847 !PDFCharIsWhitespace(tag[taglen - 1]);
848
849 uint8_t ch;
850 if (bCheckRight && startpos + static_cast<int32_t>(taglen) <= limit &&
851 GetCharAt(startpos + static_cast<int32_t>(taglen), ch)) {
852 if (PDFCharIsNumeric(ch) || PDFCharIsOther(ch) ||
853 (checkKeyword && PDFCharIsDelimiter(ch))) {
854 return false;
855 }
856 }
857
858 if (bCheckLeft && startpos > 0 && GetCharAt(startpos - 1, ch)) {
859 if (PDFCharIsNumeric(ch) || PDFCharIsOther(ch) ||
860 (checkKeyword && PDFCharIsDelimiter(ch))) {
861 return false;
862 }
863 }
864 return true;
865 }
866
// Searches backwards from the current position for an occurrence of `word`
// that IsWholeWord() accepts. On success the current position is moved to the
// first character of that occurrence and true is returned; otherwise the
// position is left unchanged and false is returned.
//
// A non-zero `limit` bounds how far back from the current position the search
// may go; 0 means no bound. An empty `word` is never found.
bool CPDF_SyntaxParser::BackwardsSearchToWord(ByteStringView word,
                                              FX_FILESIZE limit) {
  int32_t taglen = word.GetLength();
  if (taglen == 0)
    return false;

  FX_FILESIZE pos = m_Pos;
  // Index into `word` of the character expected next; matching runs from the
  // last character of the word towards the first.
  int32_t offset = taglen - 1;
  while (true) {
    if (limit && pos <= m_Pos - limit)
      return false;

    uint8_t byte;
    if (!GetCharAtBackward(pos, &byte))
      return false;

    if (byte == word[offset]) {
      offset--;
      if (offset >= 0) {
        // More of the word remains to be matched.
        pos--;
        continue;
      }
      // All characters matched; `pos` is now the start of the occurrence.
      if (IsWholeWord(pos, limit, word, false)) {
        m_Pos = pos;
        return true;
      }
    }
    // Mismatch, or a match that is not a whole word: restart the matching. If
    // the current byte equals the word's last character it already counts as
    // the first matched character of a new attempt.
    offset = byte == word[taglen - 1] ? taglen - 2 : taglen - 1;
    pos--;
    if (pos < 0)
      return false;
  }
}
900
FindTag(ByteStringView tag)901 FX_FILESIZE CPDF_SyntaxParser::FindTag(ByteStringView tag) {
902 const FX_FILESIZE startpos = GetPos();
903 const int32_t taglen = tag.GetLength();
904 DCHECK_GT(taglen, 0);
905
906 int32_t match = 0;
907 while (true) {
908 uint8_t ch;
909 if (!GetNextChar(ch))
910 return -1;
911
912 if (ch == tag[match]) {
913 match++;
914 if (match == taglen)
915 return GetPos() - startpos - taglen;
916 } else {
917 match = ch == tag[0] ? 1 : 0;
918 }
919 }
920 }
921
IsPositionRead(FX_FILESIZE pos) const922 bool CPDF_SyntaxParser::IsPositionRead(FX_FILESIZE pos) const {
923 return m_BufOffset <= pos &&
924 pos < static_cast<FX_FILESIZE>(m_BufOffset + m_pFileBuf.size());
925 }
926