1 // Copyright 2016 The PDFium Authors 2 // Use of this source code is governed by a BSD-style license that can be 3 // found in the LICENSE file. 4 5 // Original code copyright 2014 Foxit Software Inc. http://www.foxitsoftware.com 6 7 #ifndef CORE_FPDFAPI_PARSER_CPDF_PARSER_H_ 8 #define CORE_FPDFAPI_PARSER_CPDF_PARSER_H_ 9 10 #include <stddef.h> 11 #include <stdint.h> 12 13 #include <limits> 14 #include <map> 15 #include <memory> 16 #include <set> 17 #include <vector> 18 19 #include "core/fpdfapi/parser/cpdf_cross_ref_table.h" 20 #include "core/fpdfapi/parser/cpdf_indirect_object_holder.h" 21 #include "core/fxcrt/bytestring.h" 22 #include "core/fxcrt/fx_types.h" 23 #include "core/fxcrt/retain_ptr.h" 24 #include "core/fxcrt/unowned_ptr.h" 25 26 class CPDF_Array; 27 class CPDF_Dictionary; 28 class CPDF_LinearizedHeader; 29 class CPDF_Object; 30 class CPDF_ObjectStream; 31 class CPDF_ReadValidator; 32 class CPDF_SecurityHandler; 33 class CPDF_SyntaxParser; 34 class IFX_ArchiveStream; 35 class IFX_SeekableReadStream; 36 37 class CPDF_Parser { 38 public: 39 class ParsedObjectsHolder : public CPDF_IndirectObjectHolder { 40 public: 41 virtual bool TryInit() = 0; 42 }; 43 44 enum Error { 45 SUCCESS = 0, 46 FILE_ERROR, 47 FORMAT_ERROR, 48 PASSWORD_ERROR, 49 HANDLER_ERROR 50 }; 51 52 // A limit on the maximum object number in the xref table. Theoretical limits 53 // are higher, but this may be large enough in practice. 54 // Note: This was 1M, but https://crbug.com/910009 encountered a PDF with 55 // object numbers in the 1.7M range. The PDF only has 10K objects, but they 56 // are non-consecutive. 57 static constexpr uint32_t kMaxObjectNumber = 4 * 1024 * 1024; 58 59 static constexpr size_t kInvalidPos = std::numeric_limits<size_t>::max(); 60 61 explicit CPDF_Parser(ParsedObjectsHolder* holder); 62 CPDF_Parser(); 63 ~CPDF_Parser(); 64 65 Error StartParse(RetainPtr<IFX_SeekableReadStream> pFile, 66 const ByteString& password); 67 Error StartLinearizedParse(RetainPtr<CPDF_ReadValidator> validator, 68 const ByteString& password); 69 GetPassword()70 ByteString GetPassword() const { return m_Password; } 71 72 // Take the GetPassword() value and encode it, if necessary, based on the 73 // password encoding conversion. 74 ByteString GetEncodedPassword() const; 75 76 const CPDF_Dictionary* GetTrailer() const; 77 uint32_t GetTrailerObjectNumber() const; 78 79 // Returns a new trailer which combines the last read trailer with the /Root 80 // and /Info from previous ones. 81 RetainPtr<CPDF_Dictionary> GetCombinedTrailer() const; 82 GetLastXRefOffset()83 FX_FILESIZE GetLastXRefOffset() const { return m_LastXRefOffset; } 84 85 uint32_t GetPermissions(bool get_owner_perms) const; 86 uint32_t GetRootObjNum() const; 87 uint32_t GetInfoObjNum() const; 88 RetainPtr<const CPDF_Array> GetIDArray() const; 89 RetainPtr<const CPDF_Dictionary> GetEncryptDict() const; 90 91 RetainPtr<CPDF_Object> ParseIndirectObject(uint32_t objnum); 92 93 uint32_t GetLastObjNum() const; 94 bool IsValidObjectNumber(uint32_t objnum) const; 95 FX_FILESIZE GetObjectPositionOrZero(uint32_t objnum) const; GetSecurityHandler()96 const RetainPtr<CPDF_SecurityHandler>& GetSecurityHandler() const { 97 return m_pSecurityHandler; 98 } 99 bool IsObjectFree(uint32_t objnum) const; 100 GetFileVersion()101 int GetFileVersion() const { return m_FileVersion; } IsXRefStream()102 bool IsXRefStream() const { return m_bXRefStream; } 103 104 FX_FILESIZE GetDocumentSize() const; 105 uint32_t GetFirstPageNo() const; GetLinearizedHeader()106 const CPDF_LinearizedHeader* GetLinearizedHeader() const { 107 return m_pLinearized.get(); 108 } 109 xref_table_rebuilt()110 bool xref_table_rebuilt() const { return m_bXRefTableRebuilt; } 111 112 std::vector<unsigned int> GetTrailerEnds(); 113 bool WriteToArchive(IFX_ArchiveStream* archive, FX_FILESIZE src_size); 114 GetCrossRefTableForTesting()115 const CPDF_CrossRefTable* GetCrossRefTableForTesting() const { 116 return m_CrossRefTable.get(); 117 } 118 119 CPDF_Dictionary* GetMutableTrailerForTesting(); 120 ParseIndirectObjectAtForTesting(FX_FILESIZE pos)121 RetainPtr<CPDF_Object> ParseIndirectObjectAtForTesting(FX_FILESIZE pos) { 122 return ParseIndirectObjectAt(pos, 0); 123 } 124 125 void SetLinearizedHeaderForTesting( 126 std::unique_ptr<CPDF_LinearizedHeader> pLinearized); 127 128 protected: 129 bool LoadCrossRefTable(FX_FILESIZE pos, bool skip); 130 bool RebuildCrossRef(); 131 Error StartParseInternal(); 132 FX_FILESIZE ParseStartXRef(); 133 std::unique_ptr<CPDF_LinearizedHeader> ParseLinearizedHeader(); 134 135 void SetSyntaxParserForTesting(std::unique_ptr<CPDF_SyntaxParser> parser); 136 137 private: 138 friend class CPDF_DataAvail; 139 140 struct CrossRefObjData { 141 uint32_t obj_num = 0; 142 CPDF_CrossRefTable::ObjectInfo info; 143 }; 144 145 bool LoadAllCrossRefTablesAndStreams(FX_FILESIZE xref_offset); 146 bool FindAllCrossReferenceTablesAndStream( 147 FX_FILESIZE main_xref_offset, 148 std::vector<FX_FILESIZE>& xref_list, 149 std::vector<FX_FILESIZE>& xref_stream_list); 150 bool LoadCrossRefStream(FX_FILESIZE* pos, bool is_main_xref); 151 void ProcessCrossRefStreamEntry(pdfium::span<const uint8_t> entry_span, 152 pdfium::span<const uint32_t> field_widths, 153 uint32_t obj_num); 154 RetainPtr<CPDF_Dictionary> LoadTrailer(); 155 Error SetEncryptHandler(); 156 void ReleaseEncryptHandler(); 157 bool LoadLinearizedAllCrossRefTable(FX_FILESIZE main_xref_offset); 158 bool LoadLinearizedAllCrossRefStream(FX_FILESIZE main_xref_offset); 159 Error LoadLinearizedMainXRefTable(); 160 161 const CPDF_ObjectStream* GetObjectStream(uint32_t object_number); 162 RetainPtr<const CPDF_Dictionary> GetRoot() const; 163 164 // A simple check whether the cross reference table matches with 165 // the objects. 166 bool VerifyCrossRefTable(); 167 168 RetainPtr<CPDF_Object> ParseIndirectObjectAt(FX_FILESIZE pos, 169 uint32_t objnum); 170 171 // If out_objects is null, the parser position will be moved to end subsection 172 // without additional validation. 173 bool ParseAndAppendCrossRefSubsectionData( 174 uint32_t start_objnum, 175 uint32_t count, 176 std::vector<CrossRefObjData>* out_objects); 177 bool ParseCrossRefTable(std::vector<CrossRefObjData>* out_objects); 178 void MergeCrossRefObjectsData(const std::vector<CrossRefObjData>& objects); 179 180 bool InitSyntaxParser(RetainPtr<CPDF_ReadValidator> validator); 181 bool ParseFileVersion(); SetPassword(const ByteString & password)182 void SetPassword(const ByteString& password) { m_Password = password; } 183 184 std::unique_ptr<CPDF_SyntaxParser> m_pSyntax; 185 std::unique_ptr<ParsedObjectsHolder> m_pOwnedObjectsHolder; 186 UnownedPtr<ParsedObjectsHolder> m_pObjectsHolder; 187 188 bool m_bHasParsed = false; 189 bool m_bXRefStream = false; 190 bool m_bXRefTableRebuilt = false; 191 int m_FileVersion = 0; 192 uint32_t m_MetadataObjnum = 0; 193 // m_CrossRefTable must be destroyed after m_pSecurityHandler due to the 194 // ownership of the ID array data. 195 std::unique_ptr<CPDF_CrossRefTable> m_CrossRefTable; 196 FX_FILESIZE m_LastXRefOffset = 0; 197 ByteString m_Password; 198 std::unique_ptr<CPDF_LinearizedHeader> m_pLinearized; 199 200 // A map of object numbers to indirect streams. 201 std::map<uint32_t, std::unique_ptr<CPDF_ObjectStream>> m_ObjectStreamMap; 202 203 // All indirect object numbers that are being parsed. 204 std::set<uint32_t> m_ParsingObjNums; 205 206 RetainPtr<CPDF_SecurityHandler> m_pSecurityHandler; 207 }; 208 209 #endif // CORE_FPDFAPI_PARSER_CPDF_PARSER_H_ 210