1 // Copyright 2016 PDFium Authors. All rights reserved. 2 // Use of this source code is governed by a BSD-style license that can be 3 // found in the LICENSE file. 4 5 // Original code copyright 2014 Foxit Software Inc. http://www.foxitsoftware.com 6 7 #ifndef CORE_FPDFAPI_PARSER_CPDF_PARSER_H_ 8 #define CORE_FPDFAPI_PARSER_CPDF_PARSER_H_ 9 10 #include <limits> 11 #include <map> 12 #include <memory> 13 #include <set> 14 #include <vector> 15 16 #include "core/fpdfapi/parser/cpdf_syntax_parser.h" 17 #include "core/fxcrt/fx_string.h" 18 #include "core/fxcrt/fx_system.h" 19 #include "core/fxcrt/retain_ptr.h" 20 #include "core/fxcrt/unowned_ptr.h" 21 22 class CPDF_Array; 23 class CPDF_CryptoHandler; 24 class CPDF_Dictionary; 25 class CPDF_Document; 26 class CPDF_IndirectObjectHolder; 27 class CPDF_LinearizedHeader; 28 class CPDF_Object; 29 class CPDF_SecurityHandler; 30 class CPDF_StreamAcc; 31 class CPDF_SyntaxParser; 32 class IFX_SeekableReadStream; 33 34 class CPDF_Parser { 35 public: 36 enum Error { 37 SUCCESS = 0, 38 FILE_ERROR, 39 FORMAT_ERROR, 40 PASSWORD_ERROR, 41 HANDLER_ERROR 42 }; 43 44 // A limit on the maximum object number in the xref table. Theoretical limits 45 // are higher, but this may be large enough in practice. 46 static const uint32_t kMaxObjectNumber = 1048576; 47 48 static const size_t kInvalidPos = std::numeric_limits<size_t>::max(); 49 50 CPDF_Parser(); 51 ~CPDF_Parser(); 52 53 Error StartParse(const RetainPtr<IFX_SeekableReadStream>& pFile, 54 CPDF_Document* pDocument); 55 Error StartLinearizedParse(const RetainPtr<IFX_SeekableReadStream>& pFile, 56 CPDF_Document* pDocument); 57 SetPassword(const char * password)58 void SetPassword(const char* password) { m_Password = password; } GetPassword()59 ByteString GetPassword() { return m_Password; } 60 61 CPDF_Dictionary* GetTrailer() const; 62 63 // Returns a new trailer which combines the last read trailer with the /Root 64 // and /Info from previous ones. 65 std::unique_ptr<CPDF_Dictionary> GetCombinedTrailer() const; 66 GetLastXRefOffset()67 FX_FILESIZE GetLastXRefOffset() const { return m_LastXRefOffset; } 68 69 uint32_t GetPermissions() const; 70 uint32_t GetRootObjNum(); 71 uint32_t GetInfoObjNum(); 72 const CPDF_Array* GetIDArray() const; 73 GetEncryptDict()74 CPDF_Dictionary* GetEncryptDict() const { return m_pEncryptDict.Get(); } 75 76 std::unique_ptr<CPDF_Object> ParseIndirectObject( 77 CPDF_IndirectObjectHolder* pObjList, 78 uint32_t objnum); 79 80 uint32_t GetLastObjNum() const; 81 bool IsValidObjectNumber(uint32_t objnum) const; 82 FX_FILESIZE GetObjectPositionOrZero(uint32_t objnum) const; 83 uint16_t GetObjectGenNum(uint32_t objnum) const; 84 bool IsObjectFreeOrNull(uint32_t objnum) const; GetSecurityHandler()85 CPDF_SecurityHandler* GetSecurityHandler() const { 86 return m_pSecurityHandler.get(); 87 } 88 RetainPtr<IFX_SeekableReadStream> GetFileAccess() const; 89 bool IsObjectFree(uint32_t objnum) const; 90 91 FX_FILESIZE GetObjectOffset(uint32_t objnum) const; 92 GetFileVersion()93 int GetFileVersion() const { return m_FileVersion; } IsXRefStream()94 bool IsXRefStream() const { return m_bXRefStream; } 95 96 std::unique_ptr<CPDF_Object> ParseIndirectObjectAt( 97 CPDF_IndirectObjectHolder* pObjList, 98 FX_FILESIZE pos, 99 uint32_t objnum); 100 101 std::unique_ptr<CPDF_Object> ParseIndirectObjectAtByStrict( 102 CPDF_IndirectObjectHolder* pObjList, 103 FX_FILESIZE pos, 104 uint32_t objnum, 105 FX_FILESIZE* pResultPos); 106 107 uint32_t GetFirstPageNo() const; 108 109 protected: 110 enum class ObjectType : uint8_t { 111 kFree = 0x00, 112 kNotCompressed = 0x01, 113 kCompressed = 0x02, 114 kNull = 0xFF, 115 }; 116 117 struct ObjectInfo { ObjectInfoObjectInfo118 ObjectInfo() : pos(0), type(ObjectType::kFree), gennum(0) {} 119 // if type is ObjectType::kCompressed the archive_obj_num should be used. 120 // if type is ObjectType::kNotCompressed the pos should be used. 121 // In other cases its are unused. 122 union { 123 FX_FILESIZE pos; 124 FX_FILESIZE archive_obj_num; 125 }; 126 ObjectType type; 127 uint16_t gennum; 128 }; 129 130 std::unique_ptr<CPDF_SyntaxParser> m_pSyntax; 131 std::map<uint32_t, ObjectInfo> m_ObjectInfo; 132 133 bool LoadCrossRefV4(FX_FILESIZE pos, bool bSkip); 134 bool RebuildCrossRef(); 135 136 private: 137 friend class CPDF_DataAvail; 138 139 class TrailerData; 140 141 enum class ParserState { 142 kDefault, 143 kComment, 144 kWhitespace, 145 kString, 146 kHexString, 147 kEscapedString, 148 kXref, 149 kObjNum, 150 kPostObjNum, 151 kGenNum, 152 kPostGenNum, 153 kTrailer, 154 kBeginObj, 155 kEndObj 156 }; 157 158 struct CrossRefObjData { 159 uint32_t obj_num = 0; 160 ObjectInfo info; 161 }; 162 163 Error StartParseInternal(CPDF_Document* pDocument); 164 FX_FILESIZE ParseStartXRef(); 165 bool LoadAllCrossRefV4(FX_FILESIZE pos); 166 bool LoadAllCrossRefV5(FX_FILESIZE pos); 167 bool LoadCrossRefV5(FX_FILESIZE* pos, bool bMainXRef); 168 std::unique_ptr<CPDF_Dictionary> LoadTrailerV4(); 169 Error SetEncryptHandler(); 170 void ReleaseEncryptHandler(); 171 bool LoadLinearizedAllCrossRefV4(FX_FILESIZE pos); 172 bool LoadLinearizedAllCrossRefV5(FX_FILESIZE pos); 173 Error LoadLinearizedMainXRefTable(); 174 RetainPtr<CPDF_StreamAcc> GetObjectStream(uint32_t number); 175 std::unique_ptr<CPDF_LinearizedHeader> ParseLinearizedHeader(); 176 void SetEncryptDictionary(CPDF_Dictionary* pDict); 177 void ShrinkObjectMap(uint32_t size); 178 // A simple check whether the cross reference table matches with 179 // the objects. 180 bool VerifyCrossRefV4(); 181 182 // If out_objects is null, the parser position will be moved to end subsection 183 // without additional validation. 184 bool ParseAndAppendCrossRefSubsectionData( 185 uint32_t start_objnum, 186 uint32_t count, 187 std::vector<CrossRefObjData>* out_objects); 188 bool ParseCrossRefV4(std::vector<CrossRefObjData>* out_objects); 189 void MergeCrossRefObjectsData(const std::vector<CrossRefObjData>& objects); 190 191 std::unique_ptr<CPDF_Object> ParseIndirectObjectAtInternal( 192 CPDF_IndirectObjectHolder* pObjList, 193 FX_FILESIZE pos, 194 uint32_t objnum, 195 CPDF_SyntaxParser::ParseType parse_type, 196 FX_FILESIZE* pResultPos); 197 198 bool InitSyntaxParser(const RetainPtr<IFX_SeekableReadStream>& file_access); 199 bool ParseFileVersion(); 200 201 UnownedPtr<CPDF_Document> m_pDocument; 202 ObjectType GetObjectType(uint32_t objnum) const; 203 ObjectType GetObjectTypeFromCrossRefStreamType( 204 int cross_ref_stream_type) const; 205 206 bool m_bHasParsed; 207 bool m_bXRefStream; 208 int m_FileVersion; 209 // m_TrailerData must be destroyed after m_pSecurityHandler due to the 210 // ownership of the ID array data. 211 std::unique_ptr<TrailerData> m_TrailerData; 212 UnownedPtr<CPDF_Dictionary> m_pEncryptDict; 213 FX_FILESIZE m_LastXRefOffset; 214 std::unique_ptr<CPDF_SecurityHandler> m_pSecurityHandler; 215 ByteString m_Password; 216 std::unique_ptr<CPDF_LinearizedHeader> m_pLinearized; 217 218 // A map of object numbers to indirect streams. 219 std::map<uint32_t, RetainPtr<CPDF_StreamAcc>> m_ObjectStreamMap; 220 221 // Mapping of object numbers to offsets. The offsets are relative to the first 222 // object in the stream. 223 using StreamObjectCache = std::map<uint32_t, uint32_t>; 224 225 // Mapping of streams to their object caches. This is valid as long as the 226 // streams in |m_ObjectStreamMap| are valid. 227 std::map<RetainPtr<CPDF_StreamAcc>, StreamObjectCache> m_ObjCache; 228 229 // All indirect object numbers that are being parsed. 230 std::set<uint32_t> m_ParsingObjNums; 231 232 uint32_t m_MetadataObjnum = 0; 233 }; 234 235 #endif // CORE_FPDFAPI_PARSER_CPDF_PARSER_H_ 236