• Home
  • Line#
  • Scopes#
  • Navigate#
  • Raw
  • Download
1 // Copyright 2016 The PDFium Authors
2 // Use of this source code is governed by a BSD-style license that can be
3 // found in the LICENSE file.
4 
5 // Original code copyright 2014 Foxit Software Inc. http://www.foxitsoftware.com
6 
7 #include "core/fpdfapi/parser/cpdf_syntax_parser.h"
8 
9 #include <ctype.h>
10 
11 #include <algorithm>
12 #include <utility>
13 
14 #include "core/fpdfapi/parser/cpdf_array.h"
15 #include "core/fpdfapi/parser/cpdf_boolean.h"
16 #include "core/fpdfapi/parser/cpdf_crypto_handler.h"
17 #include "core/fpdfapi/parser/cpdf_dictionary.h"
18 #include "core/fpdfapi/parser/cpdf_name.h"
19 #include "core/fpdfapi/parser/cpdf_null.h"
20 #include "core/fpdfapi/parser/cpdf_number.h"
21 #include "core/fpdfapi/parser/cpdf_read_validator.h"
22 #include "core/fpdfapi/parser/cpdf_reference.h"
23 #include "core/fpdfapi/parser/cpdf_stream.h"
24 #include "core/fpdfapi/parser/cpdf_string.h"
25 #include "core/fpdfapi/parser/fpdf_parser_utility.h"
26 #include "core/fxcrt/autorestorer.h"
27 #include "core/fxcrt/cfx_read_only_vector_stream.h"
28 #include "core/fxcrt/check.h"
29 #include "core/fxcrt/check_op.h"
30 #include "core/fxcrt/data_vector.h"
31 #include "core/fxcrt/fixed_size_data_vector.h"
32 #include "core/fxcrt/fx_extension.h"
33 #include "core/fxcrt/fx_memcpy_wrappers.h"
34 #include "core/fxcrt/fx_safe_types.h"
35 #include "core/fxcrt/stl_util.h"
36 
37 namespace {
38 
// States of the escape-sequence scanner that ReadString() runs over the bytes
// of a literal string.
enum class ReadStatus {
  // No escape sequence is in progress.
  kNormal,
  // The previous byte was a backslash.
  kBackslash,
  // One octal digit of an escape code has been consumed.
  kOctal,
  // Two octal digits of an escape code have been consumed.
  kFinishOctal,
  // A backslash followed by a carriage return has been consumed.
  kCarriageReturn,
};
46 
47 class ReadableSubStream final : public IFX_SeekableReadStream {
48  public:
ReadableSubStream(RetainPtr<IFX_SeekableReadStream> pFileRead,FX_FILESIZE part_offset,FX_FILESIZE part_size)49   ReadableSubStream(RetainPtr<IFX_SeekableReadStream> pFileRead,
50                     FX_FILESIZE part_offset,
51                     FX_FILESIZE part_size)
52       : m_pFileRead(std::move(pFileRead)),
53         m_PartOffset(part_offset),
54         m_PartSize(part_size) {}
55 
56   ~ReadableSubStream() override = default;
57 
58   // IFX_SeekableReadStream overrides:
ReadBlockAtOffset(pdfium::span<uint8_t> buffer,FX_FILESIZE offset)59   bool ReadBlockAtOffset(pdfium::span<uint8_t> buffer,
60                          FX_FILESIZE offset) override {
61     FX_SAFE_FILESIZE safe_end = offset;
62     safe_end += buffer.size();
63     // Check that requested range is valid, to prevent calling of ReadBlock
64     // of original m_pFileRead with incorrect params.
65     if (!safe_end.IsValid() || safe_end.ValueOrDie() > m_PartSize)
66       return false;
67 
68     return m_pFileRead->ReadBlockAtOffset(buffer, m_PartOffset + offset);
69   }
70 
GetSize()71   FX_FILESIZE GetSize() override { return m_PartSize; }
72 
73  private:
74   RetainPtr<IFX_SeekableReadStream> m_pFileRead;
75   FX_FILESIZE m_PartOffset;
76   FX_FILESIZE m_PartSize;
77 };
78 
79 }  // namespace
80 
81 // static
82 int CPDF_SyntaxParser::s_CurrentRecursionDepth = 0;
83 
84 // static
CreateForTesting(RetainPtr<IFX_SeekableReadStream> pFileAccess,FX_FILESIZE HeaderOffset)85 std::unique_ptr<CPDF_SyntaxParser> CPDF_SyntaxParser::CreateForTesting(
86     RetainPtr<IFX_SeekableReadStream> pFileAccess,
87     FX_FILESIZE HeaderOffset) {
88   return std::make_unique<CPDF_SyntaxParser>(
89       pdfium::MakeRetain<CPDF_ReadValidator>(std::move(pFileAccess), nullptr),
90       HeaderOffset);
91 }
92 
CPDF_SyntaxParser(RetainPtr<IFX_SeekableReadStream> pFileAccess)93 CPDF_SyntaxParser::CPDF_SyntaxParser(
94     RetainPtr<IFX_SeekableReadStream> pFileAccess)
95     : CPDF_SyntaxParser(
96           pdfium::MakeRetain<CPDF_ReadValidator>(std::move(pFileAccess),
97                                                  nullptr),
98           0) {}
99 
CPDF_SyntaxParser(RetainPtr<CPDF_ReadValidator> validator,FX_FILESIZE HeaderOffset)100 CPDF_SyntaxParser::CPDF_SyntaxParser(RetainPtr<CPDF_ReadValidator> validator,
101                                      FX_FILESIZE HeaderOffset)
102     : m_pFileAccess(std::move(validator)),
103       m_HeaderOffset(HeaderOffset),
104       m_FileLen(m_pFileAccess->GetSize()) {
105   DCHECK(m_HeaderOffset <= m_FileLen);
106 }
107 
108 CPDF_SyntaxParser::~CPDF_SyntaxParser() = default;
109 
GetCharAt(FX_FILESIZE pos,uint8_t & ch)110 bool CPDF_SyntaxParser::GetCharAt(FX_FILESIZE pos, uint8_t& ch) {
111   AutoRestorer<FX_FILESIZE> save_pos(&m_Pos);
112   m_Pos = pos;
113   return GetNextChar(ch);
114 }
115 
ReadBlockAt(FX_FILESIZE read_pos)116 bool CPDF_SyntaxParser::ReadBlockAt(FX_FILESIZE read_pos) {
117   if (read_pos >= m_FileLen)
118     return false;
119   size_t read_size = m_ReadBufferSize;
120   FX_SAFE_FILESIZE safe_end = read_pos;
121   safe_end += read_size;
122   if (!safe_end.IsValid() || safe_end.ValueOrDie() > m_FileLen)
123     read_size = m_FileLen - read_pos;
124 
125   m_pFileBuf.resize(read_size);
126   if (!m_pFileAccess->ReadBlockAtOffset(m_pFileBuf, read_pos)) {
127     m_pFileBuf.clear();
128     return false;
129   }
130 
131   m_BufOffset = read_pos;
132   return true;
133 }
134 
GetNextChar(uint8_t & ch)135 bool CPDF_SyntaxParser::GetNextChar(uint8_t& ch) {
136   FX_FILESIZE pos = m_Pos + m_HeaderOffset;
137   if (pos >= m_FileLen)
138     return false;
139 
140   if (!IsPositionRead(pos) && !ReadBlockAt(pos))
141     return false;
142 
143   ch = m_pFileBuf[pos - m_BufOffset];
144   m_Pos++;
145   return true;
146 }
147 
GetDocumentSize() const148 FX_FILESIZE CPDF_SyntaxParser::GetDocumentSize() const {
149   return m_FileLen - m_HeaderOffset;
150 }
151 
GetCharAtBackward(FX_FILESIZE pos,uint8_t * ch)152 bool CPDF_SyntaxParser::GetCharAtBackward(FX_FILESIZE pos, uint8_t* ch) {
153   pos += m_HeaderOffset;
154   if (pos >= m_FileLen)
155     return false;
156 
157   if (!IsPositionRead(pos)) {
158     FX_FILESIZE block_start = 0;
159     if (pos >= CPDF_Stream::kFileBufSize)
160       block_start = pos - CPDF_Stream::kFileBufSize + 1;
161     if (!ReadBlockAt(block_start) || !IsPositionRead(pos))
162       return false;
163   }
164   *ch = m_pFileBuf[pos - m_BufOffset];
165   return true;
166 }
167 
ReadBlock(pdfium::span<uint8_t> buffer)168 bool CPDF_SyntaxParser::ReadBlock(pdfium::span<uint8_t> buffer) {
169   if (!m_pFileAccess->ReadBlockAtOffset(buffer, m_Pos + m_HeaderOffset))
170     return false;
171   m_Pos += buffer.size();
172   return true;
173 }
174 
GetNextWordInternal()175 CPDF_SyntaxParser::WordType CPDF_SyntaxParser::GetNextWordInternal() {
176   m_WordSize = 0;
177   WordType word_type = WordType::kNumber;
178 
179   ToNextWord();
180   uint8_t ch;
181   if (!GetNextChar(ch))
182     return word_type;
183 
184   if (PDFCharIsDelimiter(ch)) {
185     word_type = WordType::kWord;
186 
187     m_WordBuffer[m_WordSize++] = ch;
188     if (ch == '/') {
189       while (true) {
190         if (!GetNextChar(ch))
191           return word_type;
192 
193         if (!PDFCharIsOther(ch) && !PDFCharIsNumeric(ch)) {
194           m_Pos--;
195           return word_type;
196         }
197 
198         if (m_WordSize < sizeof(m_WordBuffer) - 1)
199           m_WordBuffer[m_WordSize++] = ch;
200       }
201     } else if (ch == '<') {
202       if (!GetNextChar(ch))
203         return word_type;
204 
205       if (ch == '<')
206         m_WordBuffer[m_WordSize++] = ch;
207       else
208         m_Pos--;
209     } else if (ch == '>') {
210       if (!GetNextChar(ch))
211         return word_type;
212 
213       if (ch == '>')
214         m_WordBuffer[m_WordSize++] = ch;
215       else
216         m_Pos--;
217     }
218     return word_type;
219   }
220 
221   while (true) {
222     if (m_WordSize < sizeof(m_WordBuffer) - 1)
223       m_WordBuffer[m_WordSize++] = ch;
224 
225     if (!PDFCharIsNumeric(ch))
226       word_type = WordType::kWord;
227 
228     if (!GetNextChar(ch))
229       return word_type;
230 
231     if (PDFCharIsDelimiter(ch) || PDFCharIsWhitespace(ch)) {
232       m_Pos--;
233       break;
234     }
235   }
236   return word_type;
237 }
238 
ReadString()239 ByteString CPDF_SyntaxParser::ReadString() {
240   uint8_t ch;
241   if (!GetNextChar(ch))
242     return ByteString();
243 
244   ByteString buf;
245   int32_t parlevel = 0;
246   ReadStatus status = ReadStatus::kNormal;
247   int32_t iEscCode = 0;
248   while (true) {
249     switch (status) {
250       case ReadStatus::kNormal:
251         if (ch == ')') {
252           if (parlevel == 0)
253             return ByteString(buf);
254           parlevel--;
255         } else if (ch == '(') {
256           parlevel++;
257         }
258         if (ch == '\\')
259           status = ReadStatus::kBackslash;
260         else
261           buf += static_cast<char>(ch);
262         break;
263       case ReadStatus::kBackslash:
264         if (FXSYS_IsOctalDigit(ch)) {
265           iEscCode = FXSYS_DecimalCharToInt(static_cast<wchar_t>(ch));
266           status = ReadStatus::kOctal;
267           break;
268         }
269         if (ch == '\r') {
270           status = ReadStatus::kCarriageReturn;
271           break;
272         }
273         if (ch == 'n') {
274           buf += '\n';
275         } else if (ch == 'r') {
276           buf += '\r';
277         } else if (ch == 't') {
278           buf += '\t';
279         } else if (ch == 'b') {
280           buf += '\b';
281         } else if (ch == 'f') {
282           buf += '\f';
283         } else if (ch != '\n') {
284           buf += static_cast<char>(ch);
285         }
286         status = ReadStatus::kNormal;
287         break;
288       case ReadStatus::kOctal:
289         if (FXSYS_IsOctalDigit(ch)) {
290           iEscCode =
291               iEscCode * 8 + FXSYS_DecimalCharToInt(static_cast<wchar_t>(ch));
292           status = ReadStatus::kFinishOctal;
293         } else {
294           buf += static_cast<char>(iEscCode);
295           status = ReadStatus::kNormal;
296           continue;
297         }
298         break;
299       case ReadStatus::kFinishOctal:
300         status = ReadStatus::kNormal;
301         if (FXSYS_IsOctalDigit(ch)) {
302           iEscCode =
303               iEscCode * 8 + FXSYS_DecimalCharToInt(static_cast<wchar_t>(ch));
304           buf += static_cast<char>(iEscCode);
305         } else {
306           buf += static_cast<char>(iEscCode);
307           continue;
308         }
309         break;
310       case ReadStatus::kCarriageReturn:
311         status = ReadStatus::kNormal;
312         if (ch != '\n')
313           continue;
314         break;
315     }
316 
317     if (!GetNextChar(ch))
318       break;
319   }
320 
321   GetNextChar(ch);
322   return buf;
323 }
324 
ReadHexString()325 DataVector<uint8_t> CPDF_SyntaxParser::ReadHexString() {
326   uint8_t ch;
327   if (!GetNextChar(ch)) {
328     return DataVector<uint8_t>();
329   }
330 
331   DataVector<uint8_t> buf;
332   bool bFirst = true;
333   uint8_t code = 0;
334   while (true) {
335     if (ch == '>')
336       break;
337 
338     if (isxdigit(ch)) {
339       int val = FXSYS_HexCharToInt(ch);
340       if (bFirst) {
341         code = val * 16;
342       } else {
343         code += val;
344         buf.push_back(code);
345       }
346       bFirst = !bFirst;
347     }
348 
349     if (!GetNextChar(ch)) {
350       break;
351     }
352   }
353   if (!bFirst) {
354     buf.push_back(code);
355   }
356 
357   return buf;
358 }
359 
ToNextLine()360 void CPDF_SyntaxParser::ToNextLine() {
361   uint8_t ch;
362   while (GetNextChar(ch)) {
363     if (ch == '\n')
364       break;
365 
366     if (ch == '\r') {
367       GetNextChar(ch);
368       if (ch != '\n')
369         --m_Pos;
370       break;
371     }
372   }
373 }
374 
ToNextWord()375 void CPDF_SyntaxParser::ToNextWord() {
376   if (m_TrailerEnds) {
377     RecordingToNextWord();
378     return;
379   }
380 
381   uint8_t ch;
382   if (!GetNextChar(ch))
383     return;
384 
385   while (true) {
386     while (PDFCharIsWhitespace(ch)) {
387       if (!GetNextChar(ch))
388         return;
389     }
390 
391     if (ch != '%')
392       break;
393 
394     while (true) {
395       if (!GetNextChar(ch))
396         return;
397       if (PDFCharIsLineEnding(ch))
398         break;
399     }
400   }
401   m_Pos--;
402 }
403 
// Per-line state machine used while skipping whitespace and comments to spot
// an end-of-file marker: % -> E -> O -> F -> line ending.
enum class EofState {
  // Start of a line; only whitespace has been seen so far.
  kInitial = 0,
  // The first non-whitespace character was not '%'.
  kNonPercent,
  // Inside a run of one or more '%' characters.
  kPercent,
  // "%...E" has been seen.
  kE,
  // "%...EO" has been seen.
  kO,
  // "%...EOF" has been seen.
  kF,
  // The line cannot be an EOF marker; wait for the next line ending.
  kInvalid,
};
414 
RecordingToNextWord()415 void CPDF_SyntaxParser::RecordingToNextWord() {
416   DCHECK(m_TrailerEnds);
417 
418   EofState eof_state = EofState::kInitial;
419   // Find the first character which is neither whitespace, nor part of a
420   // comment.
421   while (true) {
422     uint8_t ch;
423     if (!GetNextChar(ch))
424       return;
425     switch (eof_state) {
426       case EofState::kInitial:
427         if (!PDFCharIsWhitespace(ch))
428           eof_state = ch == '%' ? EofState::kPercent : EofState::kNonPercent;
429         break;
430       case EofState::kNonPercent:
431         break;
432       case EofState::kPercent:
433         if (ch == 'E')
434           eof_state = EofState::kE;
435         else if (ch != '%')
436           eof_state = EofState::kInvalid;
437         break;
438       case EofState::kE:
439         eof_state = ch == 'O' ? EofState::kO : EofState::kInvalid;
440         break;
441       case EofState::kO:
442         eof_state = ch == 'F' ? EofState::kF : EofState::kInvalid;
443         break;
444       case EofState::kF:
445         if (ch == '\r') {
446           // See if \r has to be combined with a \n that follows it
447           // immediately.
448           if (GetNextChar(ch) && ch != '\n') {
449             ch = '\r';
450             m_Pos--;
451           }
452         }
453         // If we now have a \r, that's not followed by a \n, so both are OK.
454         if (ch == '\r' || ch == '\n')
455           m_TrailerEnds->push_back(m_Pos);
456         eof_state = EofState::kInvalid;
457         break;
458       case EofState::kInvalid:
459         break;
460     }
461     if (PDFCharIsLineEnding(ch))
462       eof_state = EofState::kInitial;
463     if (eof_state == EofState::kNonPercent)
464       break;
465   }
466   m_Pos--;
467 }
468 
GetNextWord()469 CPDF_SyntaxParser::WordResult CPDF_SyntaxParser::GetNextWord() {
470   CPDF_ReadValidator::ScopedSession read_session(GetValidator());
471   WordType word_type = GetNextWordInternal();
472   ByteStringView word;
473   if (!GetValidator()->has_read_problems()) {
474     word = ByteStringView(pdfium::make_span(m_WordBuffer).first(m_WordSize));
475   }
476   return {ByteString(word), word_type == WordType::kNumber};
477 }
478 
PeekNextWord()479 ByteString CPDF_SyntaxParser::PeekNextWord() {
480   AutoRestorer<FX_FILESIZE> save_pos(&m_Pos);
481   return GetNextWord().word;
482 }
483 
GetKeyword()484 ByteString CPDF_SyntaxParser::GetKeyword() {
485   return GetNextWord().word;
486 }
487 
SetPos(FX_FILESIZE pos)488 void CPDF_SyntaxParser::SetPos(FX_FILESIZE pos) {
489   DCHECK_GE(pos, 0);
490   m_Pos = std::min(pos, m_FileLen);
491 }
492 
GetObjectBody(CPDF_IndirectObjectHolder * pObjList)493 RetainPtr<CPDF_Object> CPDF_SyntaxParser::GetObjectBody(
494     CPDF_IndirectObjectHolder* pObjList) {
495   CPDF_ReadValidator::ScopedSession read_session(GetValidator());
496   auto result = GetObjectBodyInternal(pObjList, ParseType::kLoose);
497   if (GetValidator()->has_read_problems())
498     return nullptr;
499   return result;
500 }
501 
GetObjectBodyInternal(CPDF_IndirectObjectHolder * pObjList,ParseType parse_type)502 RetainPtr<CPDF_Object> CPDF_SyntaxParser::GetObjectBodyInternal(
503     CPDF_IndirectObjectHolder* pObjList,
504     ParseType parse_type) {
505   AutoRestorer<int> depth_restorer(&s_CurrentRecursionDepth);
506   if (++s_CurrentRecursionDepth > kParserMaxRecursionDepth)
507     return nullptr;
508 
509   FX_FILESIZE SavedObjPos = m_Pos;
510   WordResult word_result = GetNextWord();
511   const ByteString& word = word_result.word;
512   if (word.IsEmpty())
513     return nullptr;
514 
515   if (word_result.is_number) {
516     AutoRestorer<FX_FILESIZE> pos_restorer(&m_Pos);
517     WordResult nextword = GetNextWord();
518     if (!nextword.is_number)
519       return pdfium::MakeRetain<CPDF_Number>(word.AsStringView());
520 
521     WordResult nextword2 = GetNextWord();
522     if (nextword2.word != "R")
523       return pdfium::MakeRetain<CPDF_Number>(word.AsStringView());
524 
525     pos_restorer.AbandonRestoration();
526     uint32_t refnum = FXSYS_atoui(word.c_str());
527     if (refnum == CPDF_Object::kInvalidObjNum)
528       return nullptr;
529 
530     return pdfium::MakeRetain<CPDF_Reference>(pObjList, refnum);
531   }
532 
533   if (word == "true" || word == "false")
534     return pdfium::MakeRetain<CPDF_Boolean>(word == "true");
535 
536   if (word == "null")
537     return pdfium::MakeRetain<CPDF_Null>();
538 
539   if (word == "(") {
540     return pdfium::MakeRetain<CPDF_String>(m_pPool, ReadString());
541   }
542   if (word == "<") {
543     return pdfium::MakeRetain<CPDF_String>(m_pPool, ReadHexString(),
544                                            CPDF_String::DataType::kIsHex);
545   }
546   if (word == "[") {
547     auto pArray = pdfium::MakeRetain<CPDF_Array>();
548     while (RetainPtr<CPDF_Object> pObj =
549                GetObjectBodyInternal(pObjList, ParseType::kLoose)) {
550       // `pObj` cannot be a stream, per ISO 32000-1:2008 section 7.3.8.1.
551       if (!pObj->IsStream()) {
552         pArray->Append(std::move(pObj));
553       }
554     }
555     return (parse_type == ParseType::kLoose || m_WordBuffer[0] == ']')
556                ? std::move(pArray)
557                : nullptr;
558   }
559   if (word[0] == '/') {
560     auto word_span = pdfium::make_span(m_WordBuffer).first(m_WordSize);
561     return pdfium::MakeRetain<CPDF_Name>(
562         m_pPool, PDF_NameDecode(ByteStringView(word_span).Substr(1)));
563   }
564   if (word == "<<") {
565     RetainPtr<CPDF_Dictionary> pDict =
566         pdfium::MakeRetain<CPDF_Dictionary>(m_pPool);
567     while (true) {
568       WordResult inner_word_result = GetNextWord();
569       const ByteString& inner_word = inner_word_result.word;
570       if (inner_word.IsEmpty())
571         return nullptr;
572 
573       FX_FILESIZE SavedPos = m_Pos - inner_word.GetLength();
574       if (inner_word == ">>")
575         break;
576 
577       if (inner_word == "endobj") {
578         m_Pos = SavedPos;
579         break;
580       }
581       if (inner_word[0] != '/')
582         continue;
583 
584       ByteString key = PDF_NameDecode(inner_word.AsStringView());
585       if (key.IsEmpty() && parse_type == ParseType::kLoose)
586         continue;
587 
588       RetainPtr<CPDF_Object> pObj =
589           GetObjectBodyInternal(pObjList, ParseType::kLoose);
590       if (!pObj) {
591         if (parse_type == ParseType::kLoose)
592           continue;
593 
594         ToNextLine();
595         return nullptr;
596       }
597 
598       // `key` has to be "/X" at the minimum.
599       // `pObj` cannot be a stream, per ISO 32000-1:2008 section 7.3.8.1.
600       if (key.GetLength() > 1 && !pObj->IsStream()) {
601         pDict->SetFor(key.Substr(1), std::move(pObj));
602       }
603     }
604 
605     AutoRestorer<FX_FILESIZE> pos_restorer(&m_Pos);
606     if (GetNextWord().word != "stream")
607       return pDict;
608     pos_restorer.AbandonRestoration();
609     return ReadStream(std::move(pDict));
610   }
611   if (word == ">>")
612     m_Pos = SavedObjPos;
613 
614   return nullptr;
615 }
616 
GetIndirectObject(CPDF_IndirectObjectHolder * pObjList,ParseType parse_type)617 RetainPtr<CPDF_Object> CPDF_SyntaxParser::GetIndirectObject(
618     CPDF_IndirectObjectHolder* pObjList,
619     ParseType parse_type) {
620   CPDF_ReadValidator::ScopedSession read_session(GetValidator());
621   const FX_FILESIZE saved_pos = GetPos();
622 
623   WordResult objnum_word_result = GetNextWord();
624   if (!objnum_word_result.is_number || objnum_word_result.word.IsEmpty()) {
625     SetPos(saved_pos);
626     return nullptr;
627   }
628   const uint32_t parser_objnum = FXSYS_atoui(objnum_word_result.word.c_str());
629 
630   WordResult gennum_word_result = GetNextWord();
631   const ByteString& gennum_word = gennum_word_result.word;
632   if (!gennum_word_result.is_number || gennum_word.IsEmpty()) {
633     SetPos(saved_pos);
634     return nullptr;
635   }
636   const uint32_t parser_gennum = FXSYS_atoui(gennum_word.c_str());
637 
638   if (GetKeyword() != "obj") {
639     SetPos(saved_pos);
640     return nullptr;
641   }
642 
643   RetainPtr<CPDF_Object> pObj = GetObjectBodyInternal(pObjList, parse_type);
644   if (pObj) {
645     pObj->SetObjNum(parser_objnum);
646     pObj->SetGenNum(parser_gennum);
647   }
648 
649   return GetValidator()->has_read_problems() ? nullptr : pObj;
650 }
651 
ReadEOLMarkers(FX_FILESIZE pos)652 unsigned int CPDF_SyntaxParser::ReadEOLMarkers(FX_FILESIZE pos) {
653   unsigned char byte1 = 0;
654   unsigned char byte2 = 0;
655 
656   GetCharAt(pos, byte1);
657   GetCharAt(pos + 1, byte2);
658 
659   if (byte1 == '\r' && byte2 == '\n')
660     return 2;
661 
662   if (byte1 == '\r' || byte1 == '\n')
663     return 1;
664 
665   return 0;
666 }
667 
FindWordPos(ByteStringView word)668 FX_FILESIZE CPDF_SyntaxParser::FindWordPos(ByteStringView word) {
669   AutoRestorer<FX_FILESIZE> pos_restorer(&m_Pos);
670   FX_FILESIZE end_offset = FindTag(word);
671   while (end_offset >= 0) {
672     // Stop searching when word is found.
673     if (IsWholeWord(GetPos() - word.GetLength(), m_FileLen, word, true))
674       return GetPos() - word.GetLength();
675 
676     end_offset = FindTag(word);
677   }
678   return -1;
679 }
680 
FindStreamEndPos()681 FX_FILESIZE CPDF_SyntaxParser::FindStreamEndPos() {
682   const ByteStringView kEndStreamStr("endstream");
683   const ByteStringView kEndObjStr("endobj");
684 
685   FX_FILESIZE endStreamWordOffset = FindWordPos(kEndStreamStr);
686   FX_FILESIZE endObjWordOffset = FindWordPos(kEndObjStr);
687 
688   // Can't find "endstream" or "endobj".
689   if (endStreamWordOffset < 0 && endObjWordOffset < 0) {
690     return -1;
691   }
692 
693   if (endStreamWordOffset < 0 && endObjWordOffset >= 0) {
694     // Correct the position of end stream.
695     endStreamWordOffset = endObjWordOffset;
696   } else if (endStreamWordOffset >= 0 && endObjWordOffset < 0) {
697     // Correct the position of end obj.
698     endObjWordOffset = endStreamWordOffset;
699   } else if (endStreamWordOffset > endObjWordOffset) {
700     endStreamWordOffset = endObjWordOffset;
701   }
702 
703   int numMarkers = ReadEOLMarkers(endStreamWordOffset - 2);
704   if (numMarkers == 2) {
705     endStreamWordOffset -= 2;
706   } else {
707     numMarkers = ReadEOLMarkers(endStreamWordOffset - 1);
708     if (numMarkers == 1) {
709       endStreamWordOffset -= 1;
710     }
711   }
712   if (endStreamWordOffset < GetPos()) {
713     return -1;
714   }
715   return endStreamWordOffset;
716 }
717 
ReadStream(RetainPtr<CPDF_Dictionary> pDict)718 RetainPtr<CPDF_Stream> CPDF_SyntaxParser::ReadStream(
719     RetainPtr<CPDF_Dictionary> pDict) {
720   RetainPtr<const CPDF_Number> pLenObj =
721       ToNumber(pDict->GetDirectObjectFor("Length"));
722   FX_FILESIZE len = pLenObj ? pLenObj->GetInteger() : -1;
723 
724   // Locate the start of stream.
725   ToNextLine();
726   const FX_FILESIZE streamStartPos = GetPos();
727 
728   if (len > 0) {
729     FX_SAFE_FILESIZE pos = GetPos();
730     pos += len;
731     if (!pos.IsValid() || pos.ValueOrDie() >= m_FileLen)
732       len = -1;
733   }
734 
735   RetainPtr<IFX_SeekableReadStream> substream;
736   if (len > 0) {
737     // Check data availability first to allow the Validator to request data
738     // smoothly, without jumps.
739     if (!GetValidator()->CheckDataRangeAndRequestIfUnavailable(
740             m_HeaderOffset + GetPos(), len)) {
741       return nullptr;
742     }
743 
744     substream = pdfium::MakeRetain<ReadableSubStream>(
745         GetValidator(), m_HeaderOffset + GetPos(), len);
746     SetPos(GetPos() + len);
747   }
748 
749   const ByteStringView kEndStreamStr("endstream");
750   const ByteStringView kEndObjStr("endobj");
751 
752   // Note, we allow zero length streams as we need to pass them through when we
753   // are importing pages into a new document.
754   if (len >= 0) {
755     CPDF_ReadValidator::ScopedSession read_session(GetValidator());
756     m_Pos += ReadEOLMarkers(GetPos());
757     const size_t zap_length = kEndStreamStr.GetLength() + 1;
758     fxcrt::Fill(pdfium::make_span(m_WordBuffer).first(zap_length), 0);
759     GetNextWordInternal();
760     if (GetValidator()->has_read_problems())
761       return nullptr;
762 
763     // Earlier version of PDF specification doesn't require EOL marker before
764     // 'endstream' keyword. If keyword 'endstream' follows the bytes in
765     // specified length, it signals the end of stream.
766     if (memcmp(m_WordBuffer.data(), kEndStreamStr.unterminated_unsigned_str(),
767                kEndStreamStr.GetLength()) != 0) {
768       substream.Reset();
769       len = -1;
770       SetPos(streamStartPos);
771     }
772   }
773 
774   if (len < 0) {
775     // If len is not available or incorrect, len needs to be calculated
776     // by searching the keywords "endstream" or "endobj".
777     const FX_FILESIZE streamEndPos = FindStreamEndPos();
778     if (streamEndPos < 0)
779       return nullptr;
780 
781     len = streamEndPos - streamStartPos;
782     DCHECK_GE(len, 0);
783     if (len > 0) {
784       SetPos(streamStartPos);
785       // Check data availability first to allow the Validator to request data
786       // smoothly, without jumps.
787       if (!GetValidator()->CheckDataRangeAndRequestIfUnavailable(
788               m_HeaderOffset + GetPos(), len)) {
789         return nullptr;
790       }
791 
792       substream = pdfium::MakeRetain<ReadableSubStream>(
793           GetValidator(), m_HeaderOffset + GetPos(), len);
794       SetPos(GetPos() + len);
795     }
796   }
797 
798   RetainPtr<CPDF_Stream> stream;
799   if (substream) {
800     // It is unclear from CPDF_SyntaxParser's perspective what object
801     // `substream` is ultimately holding references to. To avoid unexpectedly
802     // changing object lifetimes by handing `substream` to `stream`, make a
803     // copy of the data here.
804     auto data = FixedSizeDataVector<uint8_t>::Uninit(substream->GetSize());
805     bool did_read = substream->ReadBlockAtOffset(data.span(), 0);
806     CHECK(did_read);
807     auto data_as_stream =
808         pdfium::MakeRetain<CFX_ReadOnlyVectorStream>(std::move(data));
809 
810     stream = pdfium::MakeRetain<CPDF_Stream>(std::move(data_as_stream),
811                                              std::move(pDict));
812   } else {
813     DCHECK(!len);
814     stream = pdfium::MakeRetain<CPDF_Stream>(std::move(pDict));
815   }
816   const FX_FILESIZE end_stream_offset = GetPos();
817   const size_t zap_length = kEndObjStr.GetLength() + 1;
818   fxcrt::Fill(pdfium::make_span(m_WordBuffer).first(zap_length), 0);
819   GetNextWordInternal();
820 
821   // Allow whitespace after endstream and before a newline.
822   unsigned char ch = 0;
823   while (GetNextChar(ch)) {
824     if (!PDFCharIsWhitespace(ch) || PDFCharIsLineEnding(ch))
825       break;
826   }
827   SetPos(GetPos() - 1);
828 
829   int numMarkers = ReadEOLMarkers(GetPos());
830   if (m_WordSize == static_cast<unsigned int>(kEndObjStr.GetLength()) &&
831       numMarkers != 0 &&
832       memcmp(m_WordBuffer.data(), kEndObjStr.unterminated_unsigned_str(),
833              kEndObjStr.GetLength()) == 0) {
834     SetPos(end_stream_offset);
835   }
836   return stream;
837 }
838 
GetDirectNum()839 uint32_t CPDF_SyntaxParser::GetDirectNum() {
840   if (GetNextWordInternal() != WordType::kNumber)
841     return 0;
842 
843   m_WordBuffer[m_WordSize] = 0;
844   return FXSYS_atoui(pdfium::as_chars(pdfium::make_span(m_WordBuffer)).data());
845 }
846 
GetValidator() const847 RetainPtr<CPDF_ReadValidator> CPDF_SyntaxParser::GetValidator() const {
848   return m_pFileAccess;
849 }
850 
IsWholeWord(FX_FILESIZE startpos,FX_FILESIZE limit,ByteStringView tag,bool checkKeyword)851 bool CPDF_SyntaxParser::IsWholeWord(FX_FILESIZE startpos,
852                                     FX_FILESIZE limit,
853                                     ByteStringView tag,
854                                     bool checkKeyword) {
855   const uint32_t taglen = tag.GetLength();
856 
857   bool bCheckLeft = !PDFCharIsDelimiter(tag[0]) && !PDFCharIsWhitespace(tag[0]);
858   bool bCheckRight = !PDFCharIsDelimiter(tag[taglen - 1]) &&
859                      !PDFCharIsWhitespace(tag[taglen - 1]);
860 
861   uint8_t ch;
862   if (bCheckRight && startpos + static_cast<int32_t>(taglen) <= limit &&
863       GetCharAt(startpos + static_cast<int32_t>(taglen), ch)) {
864     if (PDFCharIsNumeric(ch) || PDFCharIsOther(ch) ||
865         (checkKeyword && PDFCharIsDelimiter(ch))) {
866       return false;
867     }
868   }
869 
870   if (bCheckLeft && startpos > 0 && GetCharAt(startpos - 1, ch)) {
871     if (PDFCharIsNumeric(ch) || PDFCharIsOther(ch) ||
872         (checkKeyword && PDFCharIsDelimiter(ch))) {
873       return false;
874     }
875   }
876   return true;
877 }
878 
BackwardsSearchToWord(ByteStringView word,FX_FILESIZE limit)879 bool CPDF_SyntaxParser::BackwardsSearchToWord(ByteStringView word,
880                                               FX_FILESIZE limit) {
881   int32_t taglen = word.GetLength();
882   if (taglen == 0)
883     return false;
884 
885   FX_FILESIZE pos = m_Pos;
886   int32_t offset = taglen - 1;
887   while (true) {
888     if (limit && pos <= m_Pos - limit)
889       return false;
890 
891     uint8_t byte;
892     if (!GetCharAtBackward(pos, &byte))
893       return false;
894 
895     if (byte == word[offset]) {
896       offset--;
897       if (offset >= 0) {
898         pos--;
899         continue;
900       }
901       if (IsWholeWord(pos, limit, word, false)) {
902         m_Pos = pos;
903         return true;
904       }
905     }
906     offset = byte == word[taglen - 1] ? taglen - 2 : taglen - 1;
907     pos--;
908     if (pos < 0)
909       return false;
910   }
911 }
912 
FindTag(ByteStringView tag)913 FX_FILESIZE CPDF_SyntaxParser::FindTag(ByteStringView tag) {
914   const FX_FILESIZE startpos = GetPos();
915   const int32_t taglen = tag.GetLength();
916   DCHECK_GT(taglen, 0);
917 
918   int32_t match = 0;
919   while (true) {
920     uint8_t ch;
921     if (!GetNextChar(ch))
922       return -1;
923 
924     if (ch == tag[match]) {
925       match++;
926       if (match == taglen)
927         return GetPos() - startpos - taglen;
928     } else {
929       match = ch == tag[0] ? 1 : 0;
930     }
931   }
932 }
933 
IsPositionRead(FX_FILESIZE pos) const934 bool CPDF_SyntaxParser::IsPositionRead(FX_FILESIZE pos) const {
935   return m_BufOffset <= pos &&
936          pos < static_cast<FX_FILESIZE>(m_BufOffset + m_pFileBuf.size());
937 }
938