1 // Copyright 2016 The PDFium Authors 2 // Use of this source code is governed by a BSD-style license that can be 3 // found in the LICENSE file. 4 5 // Original code copyright 2014 Foxit Software Inc. http://www.foxitsoftware.com 6 7 #ifndef CORE_FPDFAPI_PARSER_CPDF_PARSER_H_ 8 #define CORE_FPDFAPI_PARSER_CPDF_PARSER_H_ 9 10 #include <stddef.h> 11 #include <stdint.h> 12 13 #include <limits> 14 #include <map> 15 #include <memory> 16 #include <set> 17 #include <vector> 18 19 #include "core/fpdfapi/parser/cpdf_cross_ref_table.h" 20 #include "core/fpdfapi/parser/cpdf_indirect_object_holder.h" 21 #include "core/fxcrt/bytestring.h" 22 #include "core/fxcrt/fx_types.h" 23 #include "core/fxcrt/retain_ptr.h" 24 #include "core/fxcrt/unowned_ptr.h" 25 26 class CPDF_Array; 27 class CPDF_Dictionary; 28 class CPDF_LinearizedHeader; 29 class CPDF_Object; 30 class CPDF_ObjectStream; 31 class CPDF_ReadValidator; 32 class CPDF_SecurityHandler; 33 class CPDF_SyntaxParser; 34 class IFX_ArchiveStream; 35 class IFX_SeekableReadStream; 36 37 class CPDF_Parser { 38 public: 39 using ObjectType = CPDF_CrossRefTable::ObjectType; 40 using ObjectInfo = CPDF_CrossRefTable::ObjectInfo; 41 42 class ParsedObjectsHolder : public CPDF_IndirectObjectHolder { 43 public: 44 virtual bool TryInit() = 0; 45 }; 46 47 enum Error { 48 SUCCESS = 0, 49 FILE_ERROR, 50 FORMAT_ERROR, 51 PASSWORD_ERROR, 52 HANDLER_ERROR 53 }; 54 55 // A limit on the maximum object number in the xref table. Theoretical limits 56 // are higher, but this may be large enough in practice. 57 // Note: This was 1M, but https://crbug.com/910009 encountered a PDF with 58 // object numbers in the 1.7M range. The PDF only has 10K objects, but they 59 // are non-consecutive. 60 static constexpr uint32_t kMaxObjectNumber = 4 * 1024 * 1024; 61 62 static constexpr size_t kInvalidPos = std::numeric_limits<size_t>::max(); 63 64 explicit CPDF_Parser(ParsedObjectsHolder* holder); 65 CPDF_Parser(); 66 ~CPDF_Parser(); 67 68 Error StartParse(RetainPtr<IFX_SeekableReadStream> pFile, 69 const ByteString& password); 70 Error StartLinearizedParse(RetainPtr<CPDF_ReadValidator> validator, 71 const ByteString& password); 72 SetPassword(const ByteString & password)73 void SetPassword(const ByteString& password) { m_Password = password; } GetPassword()74 ByteString GetPassword() const { return m_Password; } 75 76 // Take the GetPassword() value and encode it, if necessary, based on the 77 // password encoding conversion. 78 ByteString GetEncodedPassword() const; 79 80 const CPDF_Dictionary* GetTrailer() const; 81 CPDF_Dictionary* GetMutableTrailerForTesting(); 82 uint32_t GetTrailerObjectNumber() const; 83 84 // Returns a new trailer which combines the last read trailer with the /Root 85 // and /Info from previous ones. 86 RetainPtr<CPDF_Dictionary> GetCombinedTrailer() const; 87 GetLastXRefOffset()88 FX_FILESIZE GetLastXRefOffset() const { return m_LastXRefOffset; } 89 90 uint32_t GetPermissions() const; 91 uint32_t GetRootObjNum() const; 92 uint32_t GetInfoObjNum() const; 93 RetainPtr<const CPDF_Array> GetIDArray() const; 94 RetainPtr<const CPDF_Dictionary> GetRoot() const; 95 RetainPtr<const CPDF_Dictionary> GetEncryptDict() const; 96 97 RetainPtr<CPDF_Object> ParseIndirectObject(uint32_t objnum); 98 99 uint32_t GetLastObjNum() const; 100 bool IsValidObjectNumber(uint32_t objnum) const; 101 FX_FILESIZE GetObjectPositionOrZero(uint32_t objnum) const; 102 bool IsObjectFreeOrNull(uint32_t objnum) const; GetSecurityHandler()103 const RetainPtr<CPDF_SecurityHandler>& GetSecurityHandler() const { 104 return m_pSecurityHandler; 105 } 106 bool IsObjectFree(uint32_t objnum) const; 107 GetFileVersion()108 int GetFileVersion() const { return m_FileVersion; } IsXRefStream()109 bool IsXRefStream() const { return m_bXRefStream; } 110 111 RetainPtr<CPDF_Object> ParseIndirectObjectAt(FX_FILESIZE pos, 112 uint32_t objnum); 113 114 FX_FILESIZE GetDocumentSize() const; 115 uint32_t GetFirstPageNo() const; GetLinearizedHeader()116 const CPDF_LinearizedHeader* GetLinearizedHeader() const { 117 return m_pLinearized.get(); 118 } 119 GetCrossRefTable()120 const CPDF_CrossRefTable* GetCrossRefTable() const { 121 return m_CrossRefTable.get(); 122 } 123 xref_table_rebuilt()124 bool xref_table_rebuilt() const { return m_bXRefTableRebuilt; } 125 126 std::vector<unsigned int> GetTrailerEnds(); 127 bool WriteToArchive(IFX_ArchiveStream* archive, FX_FILESIZE src_size); 128 129 void SetLinearizedHeaderForTesting( 130 std::unique_ptr<CPDF_LinearizedHeader> pLinearized); 131 132 protected: 133 bool LoadCrossRefV4(FX_FILESIZE pos, bool bSkip); 134 bool RebuildCrossRef(); 135 Error StartParseInternal(); 136 FX_FILESIZE ParseStartXRef(); 137 std::unique_ptr<CPDF_LinearizedHeader> ParseLinearizedHeader(); 138 139 void SetSyntaxParserForTesting(std::unique_ptr<CPDF_SyntaxParser> parser); 140 141 private: 142 friend class CPDF_DataAvail; 143 144 struct CrossRefObjData { 145 uint32_t obj_num = 0; 146 ObjectInfo info; 147 }; 148 149 bool LoadAllCrossRefV4(FX_FILESIZE xref_offset); 150 bool LoadAllCrossRefV5(FX_FILESIZE xref_offset); 151 bool LoadCrossRefV5(FX_FILESIZE* pos, bool bMainXRef); 152 void ProcessCrossRefV5Entry(pdfium::span<const uint8_t> entry_span, 153 pdfium::span<const uint32_t> field_widths, 154 uint32_t obj_num); 155 RetainPtr<CPDF_Dictionary> LoadTrailerV4(); 156 Error SetEncryptHandler(); 157 void ReleaseEncryptHandler(); 158 bool LoadLinearizedAllCrossRefV4(FX_FILESIZE main_xref_offset); 159 bool LoadLinearizedAllCrossRefV5(FX_FILESIZE main_xref_offset); 160 Error LoadLinearizedMainXRefTable(); 161 const CPDF_ObjectStream* GetObjectStream(uint32_t object_number); 162 void ShrinkObjectMap(uint32_t size); 163 // A simple check whether the cross reference table matches with 164 // the objects. 165 bool VerifyCrossRefV4(); 166 167 // If out_objects is null, the parser position will be moved to end subsection 168 // without additional validation. 169 bool ParseAndAppendCrossRefSubsectionData( 170 uint32_t start_objnum, 171 uint32_t count, 172 std::vector<CrossRefObjData>* out_objects); 173 bool ParseCrossRefV4(std::vector<CrossRefObjData>* out_objects); 174 void MergeCrossRefObjectsData(const std::vector<CrossRefObjData>& objects); 175 176 bool InitSyntaxParser(RetainPtr<CPDF_ReadValidator> validator); 177 bool ParseFileVersion(); 178 179 ObjectType GetObjectType(uint32_t objnum) const; 180 181 std::unique_ptr<CPDF_SyntaxParser> m_pSyntax; 182 std::unique_ptr<ParsedObjectsHolder> m_pOwnedObjectsHolder; 183 UnownedPtr<ParsedObjectsHolder> m_pObjectsHolder; 184 185 bool m_bHasParsed = false; 186 bool m_bXRefStream = false; 187 bool m_bXRefTableRebuilt = false; 188 int m_FileVersion = 0; 189 uint32_t m_MetadataObjnum = 0; 190 // m_CrossRefTable must be destroyed after m_pSecurityHandler due to the 191 // ownership of the ID array data. 192 std::unique_ptr<CPDF_CrossRefTable> m_CrossRefTable; 193 FX_FILESIZE m_LastXRefOffset = 0; 194 ByteString m_Password; 195 std::unique_ptr<CPDF_LinearizedHeader> m_pLinearized; 196 197 // A map of object numbers to indirect streams. 198 std::map<uint32_t, std::unique_ptr<CPDF_ObjectStream>> m_ObjectStreamMap; 199 200 // All indirect object numbers that are being parsed. 201 std::set<uint32_t> m_ParsingObjNums; 202 203 RetainPtr<CPDF_SecurityHandler> m_pSecurityHandler; 204 }; 205 206 #endif // CORE_FPDFAPI_PARSER_CPDF_PARSER_H_ 207