// Copyright 2016 The PDFium Authors
// Use of this source code is governed by a BSD-style license that can be
// found in the LICENSE file.

// Original code copyright 2014 Foxit Software Inc. http://www.foxitsoftware.com

7 #ifndef CORE_FPDFAPI_PARSER_CPDF_PARSER_H_
8 #define CORE_FPDFAPI_PARSER_CPDF_PARSER_H_
9 
10 #include <stddef.h>
11 #include <stdint.h>
12 
13 #include <limits>
14 #include <map>
15 #include <memory>
16 #include <set>
17 #include <vector>
18 
19 #include "core/fpdfapi/parser/cpdf_cross_ref_table.h"
20 #include "core/fpdfapi/parser/cpdf_indirect_object_holder.h"
21 #include "core/fxcrt/bytestring.h"
22 #include "core/fxcrt/fx_types.h"
23 #include "core/fxcrt/retain_ptr.h"
24 #include "core/fxcrt/unowned_ptr.h"
25 
26 class CPDF_Array;
27 class CPDF_Dictionary;
28 class CPDF_LinearizedHeader;
29 class CPDF_Object;
30 class CPDF_ObjectStream;
31 class CPDF_ReadValidator;
32 class CPDF_SecurityHandler;
33 class CPDF_SyntaxParser;
34 class IFX_ArchiveStream;
35 class IFX_SeekableReadStream;
36 
37 class CPDF_Parser {
38  public:
39   class ParsedObjectsHolder : public CPDF_IndirectObjectHolder {
40    public:
41     virtual bool TryInit() = 0;
42   };
43 
44   enum Error {
45     SUCCESS = 0,
46     FILE_ERROR,
47     FORMAT_ERROR,
48     PASSWORD_ERROR,
49     HANDLER_ERROR
50   };
51 
52   // A limit on the maximum object number in the xref table. Theoretical limits
53   // are higher, but this may be large enough in practice.
54   // Note: This was 1M, but https://crbug.com/910009 encountered a PDF with
55   // object numbers in the 1.7M range. The PDF only has 10K objects, but they
56   // are non-consecutive.
57   static constexpr uint32_t kMaxObjectNumber = 4 * 1024 * 1024;
58 
59   static constexpr size_t kInvalidPos = std::numeric_limits<size_t>::max();
60 
61   explicit CPDF_Parser(ParsedObjectsHolder* holder);
62   CPDF_Parser();
63   ~CPDF_Parser();
64 
65   Error StartParse(RetainPtr<IFX_SeekableReadStream> pFile,
66                    const ByteString& password);
67   Error StartLinearizedParse(RetainPtr<CPDF_ReadValidator> validator,
68                              const ByteString& password);
69 
GetPassword()70   ByteString GetPassword() const { return m_Password; }
71 
72   // Take the GetPassword() value and encode it, if necessary, based on the
73   // password encoding conversion.
74   ByteString GetEncodedPassword() const;
75 
76   const CPDF_Dictionary* GetTrailer() const;
77   uint32_t GetTrailerObjectNumber() const;
78 
79   // Returns a new trailer which combines the last read trailer with the /Root
80   // and /Info from previous ones.
81   RetainPtr<CPDF_Dictionary> GetCombinedTrailer() const;
82 
GetLastXRefOffset()83   FX_FILESIZE GetLastXRefOffset() const { return m_LastXRefOffset; }
84 
85   uint32_t GetPermissions(bool get_owner_perms) const;
86   uint32_t GetRootObjNum() const;
87   uint32_t GetInfoObjNum() const;
88   RetainPtr<const CPDF_Array> GetIDArray() const;
89   RetainPtr<const CPDF_Dictionary> GetEncryptDict() const;
90 
91   RetainPtr<CPDF_Object> ParseIndirectObject(uint32_t objnum);
92 
93   uint32_t GetLastObjNum() const;
94   bool IsValidObjectNumber(uint32_t objnum) const;
95   FX_FILESIZE GetObjectPositionOrZero(uint32_t objnum) const;
GetSecurityHandler()96   const RetainPtr<CPDF_SecurityHandler>& GetSecurityHandler() const {
97     return m_pSecurityHandler;
98   }
99   bool IsObjectFree(uint32_t objnum) const;
100 
GetFileVersion()101   int GetFileVersion() const { return m_FileVersion; }
IsXRefStream()102   bool IsXRefStream() const { return m_bXRefStream; }
103 
104   FX_FILESIZE GetDocumentSize() const;
105   uint32_t GetFirstPageNo() const;
GetLinearizedHeader()106   const CPDF_LinearizedHeader* GetLinearizedHeader() const {
107     return m_pLinearized.get();
108   }
109 
xref_table_rebuilt()110   bool xref_table_rebuilt() const { return m_bXRefTableRebuilt; }
111 
112   std::vector<unsigned int> GetTrailerEnds();
113   bool WriteToArchive(IFX_ArchiveStream* archive, FX_FILESIZE src_size);
114 
GetCrossRefTableForTesting()115   const CPDF_CrossRefTable* GetCrossRefTableForTesting() const {
116     return m_CrossRefTable.get();
117   }
118 
119   CPDF_Dictionary* GetMutableTrailerForTesting();
120 
ParseIndirectObjectAtForTesting(FX_FILESIZE pos)121   RetainPtr<CPDF_Object> ParseIndirectObjectAtForTesting(FX_FILESIZE pos) {
122     return ParseIndirectObjectAt(pos, 0);
123   }
124 
125   void SetLinearizedHeaderForTesting(
126       std::unique_ptr<CPDF_LinearizedHeader> pLinearized);
127 
128  protected:
129   bool LoadCrossRefTable(FX_FILESIZE pos, bool skip);
130   bool RebuildCrossRef();
131   Error StartParseInternal();
132   FX_FILESIZE ParseStartXRef();
133   std::unique_ptr<CPDF_LinearizedHeader> ParseLinearizedHeader();
134 
135   void SetSyntaxParserForTesting(std::unique_ptr<CPDF_SyntaxParser> parser);
136 
137  private:
138   friend class CPDF_DataAvail;
139 
140   struct CrossRefObjData {
141     uint32_t obj_num = 0;
142     CPDF_CrossRefTable::ObjectInfo info;
143   };
144 
145   bool LoadAllCrossRefTablesAndStreams(FX_FILESIZE xref_offset);
146   bool FindAllCrossReferenceTablesAndStream(
147       FX_FILESIZE main_xref_offset,
148       std::vector<FX_FILESIZE>& xref_list,
149       std::vector<FX_FILESIZE>& xref_stream_list);
150   bool LoadCrossRefStream(FX_FILESIZE* pos, bool is_main_xref);
151   void ProcessCrossRefStreamEntry(pdfium::span<const uint8_t> entry_span,
152                                   pdfium::span<const uint32_t> field_widths,
153                                   uint32_t obj_num);
154   RetainPtr<CPDF_Dictionary> LoadTrailer();
155   Error SetEncryptHandler();
156   void ReleaseEncryptHandler();
157   bool LoadLinearizedAllCrossRefTable(FX_FILESIZE main_xref_offset);
158   bool LoadLinearizedAllCrossRefStream(FX_FILESIZE main_xref_offset);
159   Error LoadLinearizedMainXRefTable();
160 
161   const CPDF_ObjectStream* GetObjectStream(uint32_t object_number);
162   RetainPtr<const CPDF_Dictionary> GetRoot() const;
163 
164   // A simple check whether the cross reference table matches with
165   // the objects.
166   bool VerifyCrossRefTable();
167 
168   RetainPtr<CPDF_Object> ParseIndirectObjectAt(FX_FILESIZE pos,
169                                                uint32_t objnum);
170 
171   // If out_objects is null, the parser position will be moved to end subsection
172   // without additional validation.
173   bool ParseAndAppendCrossRefSubsectionData(
174       uint32_t start_objnum,
175       uint32_t count,
176       std::vector<CrossRefObjData>* out_objects);
177   bool ParseCrossRefTable(std::vector<CrossRefObjData>* out_objects);
178   void MergeCrossRefObjectsData(const std::vector<CrossRefObjData>& objects);
179 
180   bool InitSyntaxParser(RetainPtr<CPDF_ReadValidator> validator);
181   bool ParseFileVersion();
SetPassword(const ByteString & password)182   void SetPassword(const ByteString& password) { m_Password = password; }
183 
184   std::unique_ptr<CPDF_SyntaxParser> m_pSyntax;
185   std::unique_ptr<ParsedObjectsHolder> m_pOwnedObjectsHolder;
186   UnownedPtr<ParsedObjectsHolder> m_pObjectsHolder;
187 
188   bool m_bHasParsed = false;
189   bool m_bXRefStream = false;
190   bool m_bXRefTableRebuilt = false;
191   int m_FileVersion = 0;
192   uint32_t m_MetadataObjnum = 0;
193   // m_CrossRefTable must be destroyed after m_pSecurityHandler due to the
194   // ownership of the ID array data.
195   std::unique_ptr<CPDF_CrossRefTable> m_CrossRefTable;
196   FX_FILESIZE m_LastXRefOffset = 0;
197   ByteString m_Password;
198   std::unique_ptr<CPDF_LinearizedHeader> m_pLinearized;
199 
200   // A map of object numbers to indirect streams.
201   std::map<uint32_t, std::unique_ptr<CPDF_ObjectStream>> m_ObjectStreamMap;
202 
203   // All indirect object numbers that are being parsed.
204   std::set<uint32_t> m_ParsingObjNums;
205 
206   RetainPtr<CPDF_SecurityHandler> m_pSecurityHandler;
207 };
208 
209 #endif  // CORE_FPDFAPI_PARSER_CPDF_PARSER_H_
210