• Home
  • Line#
  • Scopes#
  • Navigate#
  • Raw
  • Download
1 // Copyright 2016 The PDFium Authors
2 // Use of this source code is governed by a BSD-style license that can be
3 // found in the LICENSE file.
4 
5 // Original code copyright 2014 Foxit Software Inc. http://www.foxitsoftware.com
6 
7 #include "core/fpdfapi/parser/cpdf_syntax_parser.h"
8 
9 #include <ctype.h>
10 
11 #include <algorithm>
12 #include <utility>
13 
14 #include "core/fpdfapi/parser/cpdf_array.h"
15 #include "core/fpdfapi/parser/cpdf_boolean.h"
16 #include "core/fpdfapi/parser/cpdf_crypto_handler.h"
17 #include "core/fpdfapi/parser/cpdf_dictionary.h"
18 #include "core/fpdfapi/parser/cpdf_name.h"
19 #include "core/fpdfapi/parser/cpdf_null.h"
20 #include "core/fpdfapi/parser/cpdf_number.h"
21 #include "core/fpdfapi/parser/cpdf_read_validator.h"
22 #include "core/fpdfapi/parser/cpdf_reference.h"
23 #include "core/fpdfapi/parser/cpdf_stream.h"
24 #include "core/fpdfapi/parser/cpdf_string.h"
25 #include "core/fpdfapi/parser/fpdf_parser_utility.h"
26 #include "core/fxcrt/autorestorer.h"
27 #include "core/fxcrt/cfx_read_only_vector_stream.h"
28 #include "core/fxcrt/fixed_uninit_data_vector.h"
29 #include "core/fxcrt/fx_extension.h"
30 #include "core/fxcrt/fx_safe_types.h"
31 #include "third_party/base/check.h"
32 #include "third_party/base/check_op.h"
33 #include "third_party/base/numerics/safe_math.h"
34 
35 namespace {
36 
// States of the escape-handling state machine that
// CPDF_SyntaxParser::ReadString() runs over the bytes of a literal string.
enum class ReadStatus {
  kNormal,          // Not inside an escape sequence.
  kBackslash,       // The previous byte was a backslash.
  kOctal,           // One octal digit of an escape has been read.
  kFinishOctal,     // Two octal digits of an escape have been read.
  kCarriageReturn,  // A backslash followed by '\r' has been read.
};
44 
45 class ReadableSubStream final : public IFX_SeekableReadStream {
46  public:
ReadableSubStream(RetainPtr<IFX_SeekableReadStream> pFileRead,FX_FILESIZE part_offset,FX_FILESIZE part_size)47   ReadableSubStream(RetainPtr<IFX_SeekableReadStream> pFileRead,
48                     FX_FILESIZE part_offset,
49                     FX_FILESIZE part_size)
50       : m_pFileRead(std::move(pFileRead)),
51         m_PartOffset(part_offset),
52         m_PartSize(part_size) {}
53 
54   ~ReadableSubStream() override = default;
55 
56   // IFX_SeekableReadStream overrides:
ReadBlockAtOffset(pdfium::span<uint8_t> buffer,FX_FILESIZE offset)57   bool ReadBlockAtOffset(pdfium::span<uint8_t> buffer,
58                          FX_FILESIZE offset) override {
59     FX_SAFE_FILESIZE safe_end = offset;
60     safe_end += buffer.size();
61     // Check that requested range is valid, to prevent calling of ReadBlock
62     // of original m_pFileRead with incorrect params.
63     if (!safe_end.IsValid() || safe_end.ValueOrDie() > m_PartSize)
64       return false;
65 
66     return m_pFileRead->ReadBlockAtOffset(buffer, m_PartOffset + offset);
67   }
68 
GetSize()69   FX_FILESIZE GetSize() override { return m_PartSize; }
70 
71  private:
72   RetainPtr<IFX_SeekableReadStream> m_pFileRead;
73   FX_FILESIZE m_PartOffset;
74   FX_FILESIZE m_PartSize;
75 };
76 
77 }  // namespace
78 
// static
// Current nesting depth of GetObjectBodyInternal() calls. That function
// increments it on entry, restores it on exit, and refuses to parse once it
// exceeds kParserMaxRecursionDepth.
int CPDF_SyntaxParser::s_CurrentRecursionDepth = 0;
81 
82 // static
CreateForTesting(RetainPtr<IFX_SeekableReadStream> pFileAccess,FX_FILESIZE HeaderOffset)83 std::unique_ptr<CPDF_SyntaxParser> CPDF_SyntaxParser::CreateForTesting(
84     RetainPtr<IFX_SeekableReadStream> pFileAccess,
85     FX_FILESIZE HeaderOffset) {
86   return std::make_unique<CPDF_SyntaxParser>(
87       pdfium::MakeRetain<CPDF_ReadValidator>(std::move(pFileAccess), nullptr),
88       HeaderOffset);
89 }
90 
// Convenience constructor: wraps `pFileAccess` in a CPDF_ReadValidator
// (created with a null second argument) and delegates to the two-argument
// constructor with a header offset of 0.
CPDF_SyntaxParser::CPDF_SyntaxParser(
    RetainPtr<IFX_SeekableReadStream> pFileAccess)
    : CPDF_SyntaxParser(
          pdfium::MakeRetain<CPDF_ReadValidator>(std::move(pFileAccess),
                                                 nullptr),
          0) {}
97 
// Main constructor. `validator` becomes the data source for all reads and
// `HeaderOffset` is added to the parser position whenever a file offset is
// computed (see GetNextChar()). The file length is cached from the validator.
// NOTE(review): m_FileLen is initialized from m_pFileAccess, which relies on
// m_pFileAccess being declared before m_FileLen in the class - confirm in the
// header.
CPDF_SyntaxParser::CPDF_SyntaxParser(RetainPtr<CPDF_ReadValidator> validator,
                                     FX_FILESIZE HeaderOffset)
    : m_pFileAccess(std::move(validator)),
      m_HeaderOffset(HeaderOffset),
      m_FileLen(m_pFileAccess->GetSize()) {
  // The header cannot start past the end of the file.
  DCHECK(m_HeaderOffset <= m_FileLen);
}
105 
// Defined out of line; no cleanup beyond the members' own destructors.
CPDF_SyntaxParser::~CPDF_SyntaxParser() = default;
107 
GetCharAt(FX_FILESIZE pos,uint8_t & ch)108 bool CPDF_SyntaxParser::GetCharAt(FX_FILESIZE pos, uint8_t& ch) {
109   AutoRestorer<FX_FILESIZE> save_pos(&m_Pos);
110   m_Pos = pos;
111   return GetNextChar(ch);
112 }
113 
ReadBlockAt(FX_FILESIZE read_pos)114 bool CPDF_SyntaxParser::ReadBlockAt(FX_FILESIZE read_pos) {
115   if (read_pos >= m_FileLen)
116     return false;
117   size_t read_size = m_ReadBufferSize;
118   FX_SAFE_FILESIZE safe_end = read_pos;
119   safe_end += read_size;
120   if (!safe_end.IsValid() || safe_end.ValueOrDie() > m_FileLen)
121     read_size = m_FileLen - read_pos;
122 
123   m_pFileBuf.resize(read_size);
124   if (!m_pFileAccess->ReadBlockAtOffset(m_pFileBuf, read_pos)) {
125     m_pFileBuf.clear();
126     return false;
127   }
128 
129   m_BufOffset = read_pos;
130   return true;
131 }
132 
GetNextChar(uint8_t & ch)133 bool CPDF_SyntaxParser::GetNextChar(uint8_t& ch) {
134   FX_FILESIZE pos = m_Pos + m_HeaderOffset;
135   if (pos >= m_FileLen)
136     return false;
137 
138   if (!IsPositionRead(pos) && !ReadBlockAt(pos))
139     return false;
140 
141   ch = m_pFileBuf[pos - m_BufOffset];
142   m_Pos++;
143   return true;
144 }
145 
GetDocumentSize() const146 FX_FILESIZE CPDF_SyntaxParser::GetDocumentSize() const {
147   return m_FileLen - m_HeaderOffset;
148 }
149 
GetCharAtBackward(FX_FILESIZE pos,uint8_t * ch)150 bool CPDF_SyntaxParser::GetCharAtBackward(FX_FILESIZE pos, uint8_t* ch) {
151   pos += m_HeaderOffset;
152   if (pos >= m_FileLen)
153     return false;
154 
155   if (!IsPositionRead(pos)) {
156     FX_FILESIZE block_start = 0;
157     if (pos >= CPDF_Stream::kFileBufSize)
158       block_start = pos - CPDF_Stream::kFileBufSize + 1;
159     if (!ReadBlockAt(block_start) || !IsPositionRead(pos))
160       return false;
161   }
162   *ch = m_pFileBuf[pos - m_BufOffset];
163   return true;
164 }
165 
ReadBlock(pdfium::span<uint8_t> buffer)166 bool CPDF_SyntaxParser::ReadBlock(pdfium::span<uint8_t> buffer) {
167   if (!m_pFileAccess->ReadBlockAtOffset(buffer, m_Pos + m_HeaderOffset))
168     return false;
169   m_Pos += buffer.size();
170   return true;
171 }
172 
// Reads the next token into m_WordBuffer / m_WordSize, after skipping leading
// whitespace and comments with ToNextWord().
//
// Returns WordType::kNumber when every byte of the token satisfies
// PDFCharIsNumeric() - and also when nothing could be read at all, in which
// case m_WordSize is 0. Returns WordType::kWord otherwise.
//
// Bytes that do not fit in m_WordBuffer are still consumed from the input but
// are not stored.
CPDF_SyntaxParser::WordType CPDF_SyntaxParser::GetNextWordInternal() {
  m_WordSize = 0;
  WordType word_type = WordType::kNumber;

  ToNextWord();
  uint8_t ch;
  if (!GetNextChar(ch))
    return word_type;

  if (PDFCharIsDelimiter(ch)) {
    // A delimiter always starts a (short) non-numeric token.
    word_type = WordType::kWord;

    m_WordBuffer[m_WordSize++] = ch;
    if (ch == '/') {
      // Name token: '/' followed by a run of "other" or numeric characters.
      while (true) {
        if (!GetNextChar(ch))
          return word_type;

        if (!PDFCharIsOther(ch) && !PDFCharIsNumeric(ch)) {
          // Un-read the byte that ended the name.
          m_Pos--;
          return word_type;
        }

        if (m_WordSize < sizeof(m_WordBuffer) - 1)
          m_WordBuffer[m_WordSize++] = ch;
      }
    } else if (ch == '<') {
      // Either "<<" (two-byte token) or a lone "<".
      if (!GetNextChar(ch))
        return word_type;

      if (ch == '<')
        m_WordBuffer[m_WordSize++] = ch;
      else
        m_Pos--;
    } else if (ch == '>') {
      // Either ">>" (two-byte token) or a lone ">".
      if (!GetNextChar(ch))
        return word_type;

      if (ch == '>')
        m_WordBuffer[m_WordSize++] = ch;
      else
        m_Pos--;
    }
    // Any other delimiter is a one-byte token.
    return word_type;
  }

  // Regular token: runs until the next delimiter or whitespace byte.
  while (true) {
    if (m_WordSize < sizeof(m_WordBuffer) - 1)
      m_WordBuffer[m_WordSize++] = ch;

    // A single non-numeric byte demotes the whole token to a plain word.
    if (!PDFCharIsNumeric(ch))
      word_type = WordType::kWord;

    if (!GetNextChar(ch))
      return word_type;

    if (PDFCharIsDelimiter(ch) || PDFCharIsWhitespace(ch)) {
      // Leave the terminating byte for the next read.
      m_Pos--;
      break;
    }
  }
  return word_type;
}
236 
// Reads the body of a literal string. GetObjectBodyInternal() calls this
// right after consuming the opening "(" token, so the first byte read here is
// the first byte of the string contents.
//
// Handles balanced nested parentheses (which are kept in the result), the
// escapes \n \r \t \b \f, octal escapes of up to three digits, and a
// backslash followed by a line ending, which contributes nothing to the
// result. Any other escaped byte is copied through as-is.
//
// Returns the decoded bytes when the matching ")" is found, or whatever was
// decoded so far if the input ends first.
ByteString CPDF_SyntaxParser::ReadString() {
  uint8_t ch;
  if (!GetNextChar(ch))
    return ByteString();

  ByteString buf;
  int32_t parlevel = 0;  // Depth of unescaped nested '(' seen so far.
  ReadStatus status = ReadStatus::kNormal;
  int32_t iEscCode = 0;  // Value of the octal escape being accumulated.
  // Each iteration handles `ch` in the current state. A `continue` inside the
  // switch skips the read at the bottom of the loop, so the same byte is
  // handled again in the new state.
  while (true) {
    switch (status) {
      case ReadStatus::kNormal:
        if (ch == ')') {
          if (parlevel == 0)
            return ByteString(buf);
          parlevel--;
        } else if (ch == '(') {
          parlevel++;
        }
        // Nested parentheses fall through to here and are kept in the output.
        if (ch == '\\')
          status = ReadStatus::kBackslash;
        else
          buf += static_cast<char>(ch);
        break;
      case ReadStatus::kBackslash:
        if (FXSYS_IsOctalDigit(ch)) {
          iEscCode = FXSYS_DecimalCharToInt(static_cast<wchar_t>(ch));
          status = ReadStatus::kOctal;
          break;
        }
        if (ch == '\r') {
          // Backslash + CR: need to look at the next byte for an LF.
          status = ReadStatus::kCarriageReturn;
          break;
        }
        if (ch == 'n') {
          buf += '\n';
        } else if (ch == 'r') {
          buf += '\r';
        } else if (ch == 't') {
          buf += '\t';
        } else if (ch == 'b') {
          buf += '\b';
        } else if (ch == 'f') {
          buf += '\f';
        } else if (ch != '\n') {
          // Unknown escape: keep the byte itself. Backslash + LF adds nothing.
          buf += static_cast<char>(ch);
        }
        status = ReadStatus::kNormal;
        break;
      case ReadStatus::kOctal:
        if (FXSYS_IsOctalDigit(ch)) {
          iEscCode =
              iEscCode * 8 + FXSYS_DecimalCharToInt(static_cast<wchar_t>(ch));
          status = ReadStatus::kFinishOctal;
        } else {
          // One-digit escape ended; emit it and re-handle `ch` as normal text.
          buf += static_cast<char>(iEscCode);
          status = ReadStatus::kNormal;
          continue;
        }
        break;
      case ReadStatus::kFinishOctal:
        status = ReadStatus::kNormal;
        if (FXSYS_IsOctalDigit(ch)) {
          // Third digit. The cast to char keeps only the low bits of values
          // that do not fit.
          iEscCode =
              iEscCode * 8 + FXSYS_DecimalCharToInt(static_cast<wchar_t>(ch));
          buf += static_cast<char>(iEscCode);
        } else {
          // Two-digit escape ended; emit it and re-handle `ch`.
          buf += static_cast<char>(iEscCode);
          continue;
        }
        break;
      case ReadStatus::kCarriageReturn:
        status = ReadStatus::kNormal;
        // Swallow the LF of a CR LF pair; anything else is re-handled.
        if (ch != '\n')
          continue;
        break;
    }

    if (!GetNextChar(ch))
      break;
  }

  // NOTE(review): this point is reached only after GetNextChar() has already
  // failed, so this extra call looks like it has no effect - confirm before
  // removing.
  GetNextChar(ch);
  return buf;
}
322 
ReadHexString()323 ByteString CPDF_SyntaxParser::ReadHexString() {
324   uint8_t ch;
325   if (!GetNextChar(ch))
326     return ByteString();
327 
328   ByteString buf;
329   bool bFirst = true;
330   uint8_t code = 0;
331   while (true) {
332     if (ch == '>')
333       break;
334 
335     if (isxdigit(ch)) {
336       int val = FXSYS_HexCharToInt(ch);
337       if (bFirst) {
338         code = val * 16;
339       } else {
340         code += val;
341         buf += static_cast<char>(code);
342       }
343       bFirst = !bFirst;
344     }
345 
346     if (!GetNextChar(ch))
347       break;
348   }
349   if (!bFirst)
350     buf += static_cast<char>(code);
351 
352   return buf;
353 }
354 
ToNextLine()355 void CPDF_SyntaxParser::ToNextLine() {
356   uint8_t ch;
357   while (GetNextChar(ch)) {
358     if (ch == '\n')
359       break;
360 
361     if (ch == '\r') {
362       GetNextChar(ch);
363       if (ch != '\n')
364         --m_Pos;
365       break;
366     }
367   }
368 }
369 
ToNextWord()370 void CPDF_SyntaxParser::ToNextWord() {
371   if (m_TrailerEnds) {
372     RecordingToNextWord();
373     return;
374   }
375 
376   uint8_t ch;
377   if (!GetNextChar(ch))
378     return;
379 
380   while (true) {
381     while (PDFCharIsWhitespace(ch)) {
382       if (!GetNextChar(ch))
383         return;
384     }
385 
386     if (ch != '%')
387       break;
388 
389     while (true) {
390       if (!GetNextChar(ch))
391         return;
392       if (PDFCharIsLineEnding(ch))
393         break;
394     }
395   }
396   m_Pos--;
397 }
398 
// States used by CPDF_SyntaxParser::RecordingToNextWord() to recognise an
// end-of-file comment: '%' -> 'E' -> 'O' -> 'F' -> line ending.
enum class EofState {
  kInitial = 0,  // At the start of a line, or only whitespace seen so far.
  kNonPercent,   // First non-whitespace byte is not '%': a word starts here.
  kPercent,      // Inside a comment; only '%' bytes seen so far.
  kE,            // 'E' seen after the '%'.
  kO,            // 'O' seen after the 'E'.
  kF,            // 'F' seen after the 'O'; a line ending completes the match.
  kInvalid,      // Inside a comment that is not an end-of-file marker.
};
409 
// Variant of ToNextWord() used while m_TrailerEnds is set. Skips whitespace
// and comments like ToNextWord() does, and additionally appends to
// *m_TrailerEnds the parser position that follows every comment of the form
// '%'... "EOF" <line ending> it passes over.
void CPDF_SyntaxParser::RecordingToNextWord() {
  DCHECK(m_TrailerEnds);

  EofState eof_state = EofState::kInitial;
  // Find the first character which is neither whitespace, nor part of a
  // comment.
  while (true) {
    uint8_t ch;
    if (!GetNextChar(ch))
      return;
    switch (eof_state) {
      case EofState::kInitial:
        // The first non-whitespace byte decides: comment or start of a word.
        if (!PDFCharIsWhitespace(ch))
          eof_state = ch == '%' ? EofState::kPercent : EofState::kNonPercent;
        break;
      case EofState::kNonPercent:
        break;
      case EofState::kPercent:
        // Any number of additional '%' bytes may precede the 'E'.
        if (ch == 'E')
          eof_state = EofState::kE;
        else if (ch != '%')
          eof_state = EofState::kInvalid;
        break;
      case EofState::kE:
        eof_state = ch == 'O' ? EofState::kO : EofState::kInvalid;
        break;
      case EofState::kO:
        eof_state = ch == 'F' ? EofState::kF : EofState::kInvalid;
        break;
      case EofState::kF:
        if (ch == '\r') {
          // See if \r has to be combined with a \n that follows it
          // immediately.
          if (GetNextChar(ch) && ch != '\n') {
            ch = '\r';
            m_Pos--;
          }
        }
        // If we now have a \r, that's not followed by a \n, so both are OK.
        if (ch == '\r' || ch == '\n')
          m_TrailerEnds->push_back(m_Pos);
        eof_state = EofState::kInvalid;
        break;
      case EofState::kInvalid:
        break;
    }
    // A line ending terminates any comment and restarts the scan.
    if (PDFCharIsLineEnding(ch))
      eof_state = EofState::kInitial;
    if (eof_state == EofState::kNonPercent)
      break;
  }
  // Un-read the byte that starts the next word.
  m_Pos--;
}
463 
GetNextWord()464 CPDF_SyntaxParser::WordResult CPDF_SyntaxParser::GetNextWord() {
465   CPDF_ReadValidator::ScopedSession read_session(GetValidator());
466   WordType word_type = GetNextWordInternal();
467   ByteString word;
468   if (!GetValidator()->has_read_problems())
469     word = ByteString(m_WordBuffer, m_WordSize);
470   return {word, word_type == WordType::kNumber};
471 }
472 
PeekNextWord()473 ByteString CPDF_SyntaxParser::PeekNextWord() {
474   AutoRestorer<FX_FILESIZE> save_pos(&m_Pos);
475   return GetNextWord().word;
476 }
477 
GetKeyword()478 ByteString CPDF_SyntaxParser::GetKeyword() {
479   return GetNextWord().word;
480 }
481 
SetPos(FX_FILESIZE pos)482 void CPDF_SyntaxParser::SetPos(FX_FILESIZE pos) {
483   DCHECK_GE(pos, 0);
484   m_Pos = std::min(pos, m_FileLen);
485 }
486 
GetObjectBody(CPDF_IndirectObjectHolder * pObjList)487 RetainPtr<CPDF_Object> CPDF_SyntaxParser::GetObjectBody(
488     CPDF_IndirectObjectHolder* pObjList) {
489   CPDF_ReadValidator::ScopedSession read_session(GetValidator());
490   auto result = GetObjectBodyInternal(pObjList, ParseType::kLoose);
491   if (GetValidator()->has_read_problems())
492     return nullptr;
493   return result;
494 }
495 
// Parses one object starting at the current position and returns it, or null
// if no object could be parsed.
//
// `pObjList` is passed to any CPDF_Reference created and to recursive calls.
// `parse_type` only affects this level: nested array elements and dictionary
// values are always parsed with ParseType::kLoose. In non-loose mode an array
// must be terminated by "]" and a dictionary entry whose value fails to parse
// aborts the dictionary.
//
// Recursion is bounded by kParserMaxRecursionDepth via
// s_CurrentRecursionDepth.
RetainPtr<CPDF_Object> CPDF_SyntaxParser::GetObjectBodyInternal(
    CPDF_IndirectObjectHolder* pObjList,
    ParseType parse_type) {
  // Restores the depth counter on every exit path.
  AutoRestorer<int> depth_restorer(&s_CurrentRecursionDepth);
  if (++s_CurrentRecursionDepth > kParserMaxRecursionDepth)
    return nullptr;

  FX_FILESIZE SavedObjPos = m_Pos;
  WordResult word_result = GetNextWord();
  const ByteString& word = word_result.word;
  if (word.IsEmpty())
    return nullptr;

  if (word_result.is_number) {
    // Either a plain number or the start of a reference "N G R". Look ahead
    // two words; unless they complete a reference, the position is restored
    // to just after the first number.
    AutoRestorer<FX_FILESIZE> pos_restorer(&m_Pos);
    WordResult nextword = GetNextWord();
    if (!nextword.is_number)
      return pdfium::MakeRetain<CPDF_Number>(word.AsStringView());

    WordResult nextword2 = GetNextWord();
    if (nextword2.word != "R")
      return pdfium::MakeRetain<CPDF_Number>(word.AsStringView());

    // It is a reference: keep the position after the "R".
    pos_restorer.AbandonRestoration();
    uint32_t refnum = FXSYS_atoui(word.c_str());
    if (refnum == CPDF_Object::kInvalidObjNum)
      return nullptr;

    return pdfium::MakeRetain<CPDF_Reference>(pObjList, refnum);
  }

  if (word == "true" || word == "false")
    return pdfium::MakeRetain<CPDF_Boolean>(word == "true");

  if (word == "null")
    return pdfium::MakeRetain<CPDF_Null>();

  if (word == "(") {
    ByteString str = ReadString();
    return pdfium::MakeRetain<CPDF_String>(m_pPool, str, false);
  }
  if (word == "<") {
    ByteString str = ReadHexString();
    return pdfium::MakeRetain<CPDF_String>(m_pPool, str, true);
  }
  if (word == "[") {
    // Collect elements until one fails to parse.
    auto pArray = pdfium::MakeRetain<CPDF_Array>();
    while (RetainPtr<CPDF_Object> pObj =
               GetObjectBodyInternal(pObjList, ParseType::kLoose)) {
      pArray->Append(std::move(pObj));
    }
    // m_WordBuffer still holds the token that ended the loop; in non-loose
    // mode it has to be the closing bracket.
    return (parse_type == ParseType::kLoose || m_WordBuffer[0] == ']')
               ? std::move(pArray)
               : nullptr;
  }
  if (word[0] == '/') {
    // Name: decode everything after the leading '/'.
    return pdfium::MakeRetain<CPDF_Name>(
        m_pPool,
        PDF_NameDecode(ByteStringView(m_WordBuffer + 1, m_WordSize - 1)));
  }
  if (word == "<<") {
    RetainPtr<CPDF_Dictionary> pDict =
        pdfium::MakeRetain<CPDF_Dictionary>(m_pPool);
    while (true) {
      WordResult inner_word_result = GetNextWord();
      const ByteString& inner_word = inner_word_result.word;
      if (inner_word.IsEmpty())
        return nullptr;

      // Position of the first byte of `inner_word`.
      FX_FILESIZE SavedPos = m_Pos - inner_word.GetLength();
      if (inner_word == ">>")
        break;

      if (inner_word == "endobj") {
        // Dictionary was never closed: stop here and leave "endobj" unread.
        m_Pos = SavedPos;
        break;
      }
      // Skip anything that is not a key.
      if (inner_word[0] != '/')
        continue;

      ByteString key = PDF_NameDecode(inner_word.AsStringView());
      if (key.IsEmpty() && parse_type == ParseType::kLoose)
        continue;

      RetainPtr<CPDF_Object> pObj =
          GetObjectBodyInternal(pObjList, ParseType::kLoose);
      if (!pObj) {
        if (parse_type == ParseType::kLoose)
          continue;

        ToNextLine();
        return nullptr;
      }

      // `key` has to be "/X" at the minimum.
      if (key.GetLength() > 1) {
        pDict->SetFor(key.Substr(1), std::move(pObj));
      }
    }

    // A dictionary followed by the "stream" keyword is a stream object;
    // otherwise un-read the look-ahead word and return the dictionary.
    AutoRestorer<FX_FILESIZE> pos_restorer(&m_Pos);
    if (GetNextWord().word != "stream")
      return pDict;
    pos_restorer.AbandonRestoration();
    return ReadStream(std::move(pDict));
  }
  // A stray ">>" is left unread (position rewound to before it).
  if (word == ">>")
    m_Pos = SavedObjPos;

  return nullptr;
}
607 
GetIndirectObject(CPDF_IndirectObjectHolder * pObjList,ParseType parse_type)608 RetainPtr<CPDF_Object> CPDF_SyntaxParser::GetIndirectObject(
609     CPDF_IndirectObjectHolder* pObjList,
610     ParseType parse_type) {
611   CPDF_ReadValidator::ScopedSession read_session(GetValidator());
612   const FX_FILESIZE saved_pos = GetPos();
613 
614   WordResult objnum_word_result = GetNextWord();
615   if (!objnum_word_result.is_number || objnum_word_result.word.IsEmpty()) {
616     SetPos(saved_pos);
617     return nullptr;
618   }
619   const uint32_t parser_objnum = FXSYS_atoui(objnum_word_result.word.c_str());
620 
621   WordResult gennum_word_result = GetNextWord();
622   const ByteString& gennum_word = gennum_word_result.word;
623   if (!gennum_word_result.is_number || gennum_word.IsEmpty()) {
624     SetPos(saved_pos);
625     return nullptr;
626   }
627   const uint32_t parser_gennum = FXSYS_atoui(gennum_word.c_str());
628 
629   if (GetKeyword() != "obj") {
630     SetPos(saved_pos);
631     return nullptr;
632   }
633 
634   RetainPtr<CPDF_Object> pObj = GetObjectBodyInternal(pObjList, parse_type);
635   if (pObj) {
636     pObj->SetObjNum(parser_objnum);
637     pObj->SetGenNum(parser_gennum);
638   }
639 
640   return GetValidator()->has_read_problems() ? nullptr : std::move(pObj);
641 }
642 
ReadEOLMarkers(FX_FILESIZE pos)643 unsigned int CPDF_SyntaxParser::ReadEOLMarkers(FX_FILESIZE pos) {
644   unsigned char byte1 = 0;
645   unsigned char byte2 = 0;
646 
647   GetCharAt(pos, byte1);
648   GetCharAt(pos + 1, byte2);
649 
650   if (byte1 == '\r' && byte2 == '\n')
651     return 2;
652 
653   if (byte1 == '\r' || byte1 == '\n')
654     return 1;
655 
656   return 0;
657 }
658 
FindWordPos(ByteStringView word)659 FX_FILESIZE CPDF_SyntaxParser::FindWordPos(ByteStringView word) {
660   AutoRestorer<FX_FILESIZE> pos_restorer(&m_Pos);
661   FX_FILESIZE end_offset = FindTag(word);
662   while (end_offset >= 0) {
663     // Stop searching when word is found.
664     if (IsWholeWord(GetPos() - word.GetLength(), m_FileLen, word, true))
665       return GetPos() - word.GetLength();
666 
667     end_offset = FindTag(word);
668   }
669   return -1;
670 }
671 
FindStreamEndPos()672 FX_FILESIZE CPDF_SyntaxParser::FindStreamEndPos() {
673   const ByteStringView kEndStreamStr("endstream");
674   const ByteStringView kEndObjStr("endobj");
675 
676   FX_FILESIZE endStreamWordOffset = FindWordPos(kEndStreamStr);
677   FX_FILESIZE endObjWordOffset = FindWordPos(kEndObjStr);
678 
679   // Can't find "endstream" or "endobj".
680   if (endStreamWordOffset < 0 && endObjWordOffset < 0) {
681     return -1;
682   }
683 
684   if (endStreamWordOffset < 0 && endObjWordOffset >= 0) {
685     // Correct the position of end stream.
686     endStreamWordOffset = endObjWordOffset;
687   } else if (endStreamWordOffset >= 0 && endObjWordOffset < 0) {
688     // Correct the position of end obj.
689     endObjWordOffset = endStreamWordOffset;
690   } else if (endStreamWordOffset > endObjWordOffset) {
691     endStreamWordOffset = endObjWordOffset;
692   }
693 
694   int numMarkers = ReadEOLMarkers(endStreamWordOffset - 2);
695   if (numMarkers == 2) {
696     endStreamWordOffset -= 2;
697   } else {
698     numMarkers = ReadEOLMarkers(endStreamWordOffset - 1);
699     if (numMarkers == 1) {
700       endStreamWordOffset -= 1;
701     }
702   }
703   if (endStreamWordOffset < GetPos()) {
704     return -1;
705   }
706   return endStreamWordOffset;
707 }
708 
// Reads the data of a stream object whose dictionary `pDict` has already been
// parsed and whose "stream" keyword has just been consumed.
//
// The data length is taken from the dictionary's direct "Length" entry when
// that yields a range inside the file that is followed by "endstream".
// Otherwise the length is found by searching for "endstream" / "endobj".
// Returns null if the data is not available, the validator reports read
// problems, or no end of stream can be found.
RetainPtr<CPDF_Stream> CPDF_SyntaxParser::ReadStream(
    RetainPtr<CPDF_Dictionary> pDict) {
  RetainPtr<const CPDF_Number> pLenObj =
      ToNumber(pDict->GetDirectObjectFor("Length"));
  // -1 means "length unknown".
  FX_FILESIZE len = pLenObj ? pLenObj->GetInteger() : -1;

  // Locate the start of stream.
  ToNextLine();
  const FX_FILESIZE streamStartPos = GetPos();

  if (len > 0) {
    // Distrust a length that overflows or reaches the end of the file.
    FX_SAFE_FILESIZE pos = GetPos();
    pos += len;
    if (!pos.IsValid() || pos.ValueOrDie() >= m_FileLen)
      len = -1;
  }

  RetainPtr<IFX_SeekableReadStream> substream;
  if (len > 0) {
    // Check data availability first to allow the Validator to request data
    // smoothly, without jumps.
    if (!GetValidator()->CheckDataRangeAndRequestIfUnavailable(
            m_HeaderOffset + GetPos(), len)) {
      return nullptr;
    }

    substream = pdfium::MakeRetain<ReadableSubStream>(
        GetValidator(), m_HeaderOffset + GetPos(), len);
    SetPos(GetPos() + len);
  }

  const ByteStringView kEndStreamStr("endstream");
  const ByteStringView kEndObjStr("endobj");

  // Note, we allow zero length streams as we need to pass them through when we
  // are importing pages into a new document.
  if (len >= 0) {
    CPDF_ReadValidator::ScopedSession read_session(GetValidator());
    // Skip an optional end-of-line marker after the data, then read the next
    // word into a word buffer whose leading bytes were zeroed first.
    m_Pos += ReadEOLMarkers(GetPos());
    memset(m_WordBuffer, 0, kEndStreamStr.GetLength() + 1);
    GetNextWordInternal();
    if (GetValidator()->has_read_problems())
      return nullptr;

    // Earlier version of PDF specification doesn't require EOL marker before
    // 'endstream' keyword. If keyword 'endstream' follows the bytes in
    // specified length, it signals the end of stream.
    if (memcmp(m_WordBuffer, kEndStreamStr.raw_str(),
               kEndStreamStr.GetLength()) != 0) {
      // The declared length is wrong: discard it and fall back to searching.
      substream.Reset();
      len = -1;
      SetPos(streamStartPos);
    }
  }

  if (len < 0) {
    // If len is not available or incorrect, len needs to be calculated
    // by searching the keywords "endstream" or "endobj".
    const FX_FILESIZE streamEndPos = FindStreamEndPos();
    if (streamEndPos < 0)
      return nullptr;

    len = streamEndPos - streamStartPos;
    DCHECK_GE(len, 0);
    if (len > 0) {
      SetPos(streamStartPos);
      // Check data availability first to allow the Validator to request data
      // smoothly, without jumps.
      if (!GetValidator()->CheckDataRangeAndRequestIfUnavailable(
              m_HeaderOffset + GetPos(), len)) {
        return nullptr;
      }

      substream = pdfium::MakeRetain<ReadableSubStream>(
          GetValidator(), m_HeaderOffset + GetPos(), len);
      SetPos(GetPos() + len);
    }
  }

  RetainPtr<CPDF_Stream> pStream;
  if (substream) {
    // It is unclear from CPDF_SyntaxParser's perspective what object
    // `substream` is ultimately holding references to. To avoid unexpectedly
    // changing object lifetimes by handing `substream` to `pStream`, make a
    // copy of the data here.
    FixedUninitDataVector<uint8_t> data(substream->GetSize());
    bool did_read = substream->ReadBlockAtOffset(data.writable_span(), 0);
    CHECK(did_read);
    auto data_as_stream =
        pdfium::MakeRetain<CFX_ReadOnlyVectorStream>(std::move(data));

    pStream = pdfium::MakeRetain<CPDF_Stream>();
    pStream->InitStreamFromFile(std::move(data_as_stream), std::move(pDict));
  } else {
    // No data: a zero-length stream that only carries its dictionary.
    DCHECK(!len);
    pStream = pdfium::MakeRetain<CPDF_Stream>(std::move(pDict));
  }
  const FX_FILESIZE end_stream_offset = GetPos();
  memset(m_WordBuffer, 0, kEndObjStr.GetLength() + 1);
  GetNextWordInternal();

  // Allow whitespace after endstream and before a newline.
  unsigned char ch = 0;
  while (GetNextChar(ch)) {
    if (!PDFCharIsWhitespace(ch) || PDFCharIsLineEnding(ch))
      break;
  }
  SetPos(GetPos() - 1);

  // If the word just read is exactly "endobj" and is followed by an
  // end-of-line marker, rewind to `end_stream_offset` so that the keyword is
  // left unconsumed.
  int numMarkers = ReadEOLMarkers(GetPos());
  if (m_WordSize == static_cast<unsigned int>(kEndObjStr.GetLength()) &&
      numMarkers != 0 &&
      memcmp(m_WordBuffer, kEndObjStr.raw_str(), kEndObjStr.GetLength()) == 0) {
    SetPos(end_stream_offset);
  }
  return pStream;
}
826 
GetDirectNum()827 uint32_t CPDF_SyntaxParser::GetDirectNum() {
828   if (GetNextWordInternal() != WordType::kNumber)
829     return 0;
830 
831   m_WordBuffer[m_WordSize] = 0;
832   return FXSYS_atoui(reinterpret_cast<const char*>(m_WordBuffer));
833 }
834 
GetValidator() const835 RetainPtr<CPDF_ReadValidator> CPDF_SyntaxParser::GetValidator() const {
836   return m_pFileAccess;
837 }
838 
IsWholeWord(FX_FILESIZE startpos,FX_FILESIZE limit,ByteStringView tag,bool checkKeyword)839 bool CPDF_SyntaxParser::IsWholeWord(FX_FILESIZE startpos,
840                                     FX_FILESIZE limit,
841                                     ByteStringView tag,
842                                     bool checkKeyword) {
843   const uint32_t taglen = tag.GetLength();
844 
845   bool bCheckLeft = !PDFCharIsDelimiter(tag[0]) && !PDFCharIsWhitespace(tag[0]);
846   bool bCheckRight = !PDFCharIsDelimiter(tag[taglen - 1]) &&
847                      !PDFCharIsWhitespace(tag[taglen - 1]);
848 
849   uint8_t ch;
850   if (bCheckRight && startpos + static_cast<int32_t>(taglen) <= limit &&
851       GetCharAt(startpos + static_cast<int32_t>(taglen), ch)) {
852     if (PDFCharIsNumeric(ch) || PDFCharIsOther(ch) ||
853         (checkKeyword && PDFCharIsDelimiter(ch))) {
854       return false;
855     }
856   }
857 
858   if (bCheckLeft && startpos > 0 && GetCharAt(startpos - 1, ch)) {
859     if (PDFCharIsNumeric(ch) || PDFCharIsOther(ch) ||
860         (checkKeyword && PDFCharIsDelimiter(ch))) {
861       return false;
862     }
863   }
864   return true;
865 }
866 
BackwardsSearchToWord(ByteStringView word,FX_FILESIZE limit)867 bool CPDF_SyntaxParser::BackwardsSearchToWord(ByteStringView word,
868                                               FX_FILESIZE limit) {
869   int32_t taglen = word.GetLength();
870   if (taglen == 0)
871     return false;
872 
873   FX_FILESIZE pos = m_Pos;
874   int32_t offset = taglen - 1;
875   while (true) {
876     if (limit && pos <= m_Pos - limit)
877       return false;
878 
879     uint8_t byte;
880     if (!GetCharAtBackward(pos, &byte))
881       return false;
882 
883     if (byte == word[offset]) {
884       offset--;
885       if (offset >= 0) {
886         pos--;
887         continue;
888       }
889       if (IsWholeWord(pos, limit, word, false)) {
890         m_Pos = pos;
891         return true;
892       }
893     }
894     offset = byte == word[taglen - 1] ? taglen - 2 : taglen - 1;
895     pos--;
896     if (pos < 0)
897       return false;
898   }
899 }
900 
FindTag(ByteStringView tag)901 FX_FILESIZE CPDF_SyntaxParser::FindTag(ByteStringView tag) {
902   const FX_FILESIZE startpos = GetPos();
903   const int32_t taglen = tag.GetLength();
904   DCHECK_GT(taglen, 0);
905 
906   int32_t match = 0;
907   while (true) {
908     uint8_t ch;
909     if (!GetNextChar(ch))
910       return -1;
911 
912     if (ch == tag[match]) {
913       match++;
914       if (match == taglen)
915         return GetPos() - startpos - taglen;
916     } else {
917       match = ch == tag[0] ? 1 : 0;
918     }
919   }
920 }
921 
IsPositionRead(FX_FILESIZE pos) const922 bool CPDF_SyntaxParser::IsPositionRead(FX_FILESIZE pos) const {
923   return m_BufOffset <= pos &&
924          pos < static_cast<FX_FILESIZE>(m_BufOffset + m_pFileBuf.size());
925 }
926