// Copyright 2016 The PDFium Authors // Use of this source code is governed by a BSD-style license that can be // found in the LICENSE file. // Original code copyright 2014 Foxit Software Inc. http://www.foxitsoftware.com #ifndef CORE_FPDFAPI_PARSER_CPDF_SYNTAX_PARSER_H_ #define CORE_FPDFAPI_PARSER_CPDF_SYNTAX_PARSER_H_ #include #include #include #include #include "core/fpdfapi/parser/cpdf_stream.h" #include "core/fxcrt/data_vector.h" #include "core/fxcrt/fx_types.h" #include "core/fxcrt/retain_ptr.h" #include "core/fxcrt/span.h" #include "core/fxcrt/string_pool_template.h" #include "core/fxcrt/unowned_ptr.h" #include "core/fxcrt/weak_ptr.h" class CPDF_Dictionary; class CPDF_IndirectObjectHolder; class CPDF_Object; class CPDF_ReadValidator; class CPDF_Stream; class IFX_SeekableReadStream; class CPDF_SyntaxParser { public: enum class ParseType : bool { kStrict, kLoose }; struct WordResult { ByteString word; bool is_number; }; static std::unique_ptr CreateForTesting( RetainPtr pFileAccess, FX_FILESIZE HeaderOffset); explicit CPDF_SyntaxParser(RetainPtr pFileAccess); CPDF_SyntaxParser(RetainPtr pValidator, FX_FILESIZE HeaderOffset); ~CPDF_SyntaxParser(); void SetReadBufferSize(uint32_t read_buffer_size) { m_ReadBufferSize = read_buffer_size; } FX_FILESIZE GetPos() const { return m_Pos; } void SetPos(FX_FILESIZE pos); RetainPtr GetObjectBody(CPDF_IndirectObjectHolder* pObjList); RetainPtr GetIndirectObject(CPDF_IndirectObjectHolder* pObjList, ParseType parse_type); ByteString GetKeyword(); void ToNextLine(); void ToNextWord(); void RecordingToNextWord(); bool BackwardsSearchToWord(ByteStringView word, FX_FILESIZE limit); FX_FILESIZE FindTag(ByteStringView tag); bool ReadBlock(pdfium::span buffer); bool GetCharAt(FX_FILESIZE pos, uint8_t& ch); WordResult GetNextWord(); ByteString PeekNextWord(); RetainPtr GetValidator() const; uint32_t GetDirectNum(); bool GetNextChar(uint8_t& ch); // The document size may be smaller than the file size. // The syntax parser use position relative to document // offset (|m_HeaderOffset|). // The document size will be FileSize - "Header offset". // All offsets was readed from document, should not be great than document // size. Use it for checks instead of real file size. FX_FILESIZE GetDocumentSize() const; ByteString ReadString(); DataVector ReadHexString(); void SetTrailerEnds(std::vector* trailer_ends) { m_TrailerEnds = trailer_ends; } private: enum class WordType : bool { kWord, kNumber }; friend class CPDF_DataAvail; friend class cpdf_syntax_parser_ReadHexString_Test; static constexpr int kParserMaxRecursionDepth = 64; static int s_CurrentRecursionDepth; bool ReadBlockAt(FX_FILESIZE read_pos); bool GetCharAtBackward(FX_FILESIZE pos, uint8_t* ch); WordType GetNextWordInternal(); bool IsWholeWord(FX_FILESIZE startpos, FX_FILESIZE limit, ByteStringView tag, bool checkKeyword); unsigned int ReadEOLMarkers(FX_FILESIZE pos); FX_FILESIZE FindWordPos(ByteStringView word); FX_FILESIZE FindStreamEndPos(); RetainPtr ReadStream(RetainPtr pDict); bool IsPositionRead(FX_FILESIZE pos) const; RetainPtr GetObjectBodyInternal( CPDF_IndirectObjectHolder* pObjList, ParseType parse_type); RetainPtr m_pFileAccess; // The syntax parser use position relative to header offset. // The header contains at file start, and can follow after some stuff. We // ignore this stuff. const FX_FILESIZE m_HeaderOffset; const FX_FILESIZE m_FileLen; FX_FILESIZE m_Pos = 0; WeakPtr m_pPool; DataVector m_pFileBuf; FX_FILESIZE m_BufOffset = 0; uint32_t m_WordSize = 0; uint32_t m_ReadBufferSize = CPDF_Stream::kFileBufSize; std::array m_WordBuffer = {}; // The syntax parser records traversed trailer end byte offsets here. UnownedPtr> m_TrailerEnds; }; #endif // CORE_FPDFAPI_PARSER_CPDF_SYNTAX_PARSER_H_