• Home
  • Line#
  • Scopes#
  • Navigate#
  • Raw
  • Download
1 // Copyright 2016 The PDFium Authors
2 // Use of this source code is governed by a BSD-style license that can be
3 // found in the LICENSE file.
4 
5 // Original code copyright 2014 Foxit Software Inc. http://www.foxitsoftware.com
6 
7 #ifndef CORE_FPDFAPI_PARSER_CPDF_PARSER_H_
8 #define CORE_FPDFAPI_PARSER_CPDF_PARSER_H_
9 
10 #include <stddef.h>
11 #include <stdint.h>
12 
13 #include <limits>
14 #include <map>
15 #include <memory>
16 #include <set>
17 #include <vector>
18 
19 #include "core/fpdfapi/parser/cpdf_cross_ref_table.h"
20 #include "core/fpdfapi/parser/cpdf_indirect_object_holder.h"
21 #include "core/fxcrt/bytestring.h"
22 #include "core/fxcrt/fx_types.h"
23 #include "core/fxcrt/retain_ptr.h"
24 #include "core/fxcrt/unowned_ptr.h"
25 
26 class CPDF_Array;
27 class CPDF_Dictionary;
28 class CPDF_LinearizedHeader;
29 class CPDF_Object;
30 class CPDF_ObjectStream;
31 class CPDF_ReadValidator;
32 class CPDF_SecurityHandler;
33 class CPDF_SyntaxParser;
34 class IFX_ArchiveStream;
35 class IFX_SeekableReadStream;
36 
// Declares the PDF parser interface: entry points that start a parse from a
// stream or a read validator, accessors for the trailer, cross-reference
// table, security handler and linearization header, and routines that parse
// indirect objects by object number or by file position.
37 class CPDF_Parser {
38  public:
  // Shorthand for the nested types of CPDF_CrossRefTable used throughout this
  // class.
39   using ObjectType = CPDF_CrossRefTable::ObjectType;
40   using ObjectInfo = CPDF_CrossRefTable::ObjectInfo;
41 
  // An indirect-object holder extended with a pure-virtual TryInit() hook that
  // implementers must provide.
  // NOTE(review): when and why the parser calls TryInit() is not visible in
  // this header; confirm against the .cpp.
42   class ParsedObjectsHolder : public CPDF_IndirectObjectHolder {
43    public:
44     virtual bool TryInit() = 0;
45   };
46 
  // Status codes returned by StartParse(), StartLinearizedParse() and several
  // of the protected/private helpers declared below. SUCCESS is 0; the
  // remaining enumerators take the consecutive values that follow.
47   enum Error {
48     SUCCESS = 0,
49     FILE_ERROR,
50     FORMAT_ERROR,
51     PASSWORD_ERROR,
52     HANDLER_ERROR
53   };
54 
55   // A limit on the maximum object number in the xref table. Theoretical limits
56   // are higher, but this may be large enough in practice.
57   // Note: This was 1M, but https://crbug.com/910009 encountered a PDF with
58   // object numbers in the 1.7M range. The PDF only has 10K objects, but they
59   // are non-consecutive.
60   static constexpr uint32_t kMaxObjectNumber = 4 * 1024 * 1024;
61 
  // Sentinel position: the largest value representable in size_t.
62   static constexpr size_t kInvalidPos = std::numeric_limits<size_t>::max();
63 
  // NOTE(review): presumably the one-argument form uses the caller-supplied
  // `holder`, while the default form creates its own (see m_pObjectsHolder and
  // m_pOwnedObjectsHolder below); confirm against the .cpp.
64   explicit CPDF_Parser(ParsedObjectsHolder* holder);
65   CPDF_Parser();
66   ~CPDF_Parser();
67 
  // Entry points that start a parse, either from a seekable read stream or
  // from a read validator. Both take the password as a ByteString and return
  // an Error code.
68   Error StartParse(RetainPtr<IFX_SeekableReadStream> pFile,
69                    const ByteString& password);
70   Error StartLinearizedParse(RetainPtr<CPDF_ReadValidator> validator,
71                              const ByteString& password);
72 
  // Store / return a copy of the password held in m_Password.
SetPassword(const ByteString & password)73   void SetPassword(const ByteString& password) { m_Password = password; }
GetPassword()74   ByteString GetPassword() const { return m_Password; }
75 
76   // Take the GetPassword() value and encode it, if necessary, based on the
77   // password encoding conversion.
78   ByteString GetEncodedPassword() const;
79 
80   const CPDF_Dictionary* GetTrailer() const;
81   CPDF_Dictionary* GetMutableTrailerForTesting();
82   uint32_t GetTrailerObjectNumber() const;
83 
84   // Returns a new trailer which combines the last read trailer with the /Root
85   // and /Info from previous ones.
86   RetainPtr<CPDF_Dictionary> GetCombinedTrailer() const;
87 
  // Returns the stored m_LastXRefOffset value.
GetLastXRefOffset()88   FX_FILESIZE GetLastXRefOffset() const { return m_LastXRefOffset; }
89 
90   uint32_t GetPermissions() const;
91   uint32_t GetRootObjNum() const;
92   uint32_t GetInfoObjNum() const;
93   RetainPtr<const CPDF_Array> GetIDArray() const;
94   RetainPtr<const CPDF_Dictionary> GetRoot() const;
95   RetainPtr<const CPDF_Dictionary> GetEncryptDict() const;
96 
97   RetainPtr<CPDF_Object> ParseIndirectObject(uint32_t objnum);
98 
99   uint32_t GetLastObjNum() const;
100   bool IsValidObjectNumber(uint32_t objnum) const;
101   FX_FILESIZE GetObjectPositionOrZero(uint32_t objnum) const;
102   bool IsObjectFreeOrNull(uint32_t objnum) const;
  // Returns a reference to the m_pSecurityHandler member itself, so the
  // reference must not be used after this parser has been destroyed.
GetSecurityHandler()103   const RetainPtr<CPDF_SecurityHandler>& GetSecurityHandler() const {
104     return m_pSecurityHandler;
105   }
106   bool IsObjectFree(uint32_t objnum) const;
107 
  // Plain accessors for m_FileVersion and m_bXRefStream.
GetFileVersion()108   int GetFileVersion() const { return m_FileVersion; }
IsXRefStream()109   bool IsXRefStream() const { return m_bXRefStream; }
110 
111   RetainPtr<CPDF_Object> ParseIndirectObjectAt(FX_FILESIZE pos,
112                                                uint32_t objnum);
113 
114   FX_FILESIZE GetDocumentSize() const;
115   uint32_t GetFirstPageNo() const;
  // Non-owning pointer to the linearization header; null when m_pLinearized
  // holds nothing.
GetLinearizedHeader()116   const CPDF_LinearizedHeader* GetLinearizedHeader() const {
117     return m_pLinearized.get();
118   }
119 
  // Non-owning pointer to the cross-reference table; null when
  // m_CrossRefTable holds nothing.
GetCrossRefTable()120   const CPDF_CrossRefTable* GetCrossRefTable() const {
121     return m_CrossRefTable.get();
122   }
123 
  // Returns the stored m_bXRefTableRebuilt flag.
xref_table_rebuilt()124   bool xref_table_rebuilt() const { return m_bXRefTableRebuilt; }
125 
126   std::vector<unsigned int> GetTrailerEnds();
127   bool WriteToArchive(IFX_ArchiveStream* archive, FX_FILESIZE src_size);
128 
129   void SetLinearizedHeaderForTesting(
130       std::unique_ptr<CPDF_LinearizedHeader> pLinearized);
131 
132  protected:
133   bool LoadCrossRefV4(FX_FILESIZE pos, bool bSkip);
134   bool RebuildCrossRef();
135   Error StartParseInternal();
136   FX_FILESIZE ParseStartXRef();
137   std::unique_ptr<CPDF_LinearizedHeader> ParseLinearizedHeader();
138 
139   void SetSyntaxParserForTesting(std::unique_ptr<CPDF_SyntaxParser> parser);
140 
141  private:
  // CPDF_DataAvail is granted access to this class's private members.
142   friend class CPDF_DataAvail;
143 
  // One cross-reference entry: an object number (0 by default) together with
  // its ObjectInfo.
144   struct CrossRefObjData {
145     uint32_t obj_num = 0;
146     ObjectInfo info;
147   };
148 
149   bool LoadAllCrossRefV4(FX_FILESIZE xref_offset);
150   bool LoadAllCrossRefV5(FX_FILESIZE xref_offset);
151   bool LoadCrossRefV5(FX_FILESIZE* pos, bool bMainXRef);
152   void ProcessCrossRefV5Entry(pdfium::span<const uint8_t> entry_span,
153                               pdfium::span<const uint32_t> field_widths,
154                               uint32_t obj_num);
155   RetainPtr<CPDF_Dictionary> LoadTrailerV4();
156   Error SetEncryptHandler();
157   void ReleaseEncryptHandler();
158   bool LoadLinearizedAllCrossRefV4(FX_FILESIZE main_xref_offset);
159   bool LoadLinearizedAllCrossRefV5(FX_FILESIZE main_xref_offset);
160   Error LoadLinearizedMainXRefTable();
161   const CPDF_ObjectStream* GetObjectStream(uint32_t object_number);
162   void ShrinkObjectMap(uint32_t size);
163   // A simple check whether the cross reference table matches with
164   // the objects.
165   bool VerifyCrossRefV4();
166 
167   // If out_objects is null, the parser position will be moved to end subsection
168   // without additional validation.
169   bool ParseAndAppendCrossRefSubsectionData(
170       uint32_t start_objnum,
171       uint32_t count,
172       std::vector<CrossRefObjData>* out_objects);
173   bool ParseCrossRefV4(std::vector<CrossRefObjData>* out_objects);
174   void MergeCrossRefObjectsData(const std::vector<CrossRefObjData>& objects);
175 
176   bool InitSyntaxParser(RetainPtr<CPDF_ReadValidator> validator);
177   bool ParseFileVersion();
178 
179   ObjectType GetObjectType(uint32_t objnum) const;
180 
  // m_pOwnedObjectsHolder is an owning pointer, while m_pObjectsHolder is a
  // non-owning (UnownedPtr) pointer to a holder.
  // NOTE(review): presumably m_pObjectsHolder refers to m_pOwnedObjectsHolder
  // when no external holder was supplied; confirm against the .cpp.
181   std::unique_ptr<CPDF_SyntaxParser> m_pSyntax;
182   std::unique_ptr<ParsedObjectsHolder> m_pOwnedObjectsHolder;
183   UnownedPtr<ParsedObjectsHolder> m_pObjectsHolder;
184 
185   bool m_bHasParsed = false;
186   bool m_bXRefStream = false;
187   bool m_bXRefTableRebuilt = false;
188   int m_FileVersion = 0;
189   uint32_t m_MetadataObjnum = 0;
190   // m_CrossRefTable must be destroyed after m_pSecurityHandler due to the
191   // ownership of the ID array data.
  // Non-static members are destroyed in reverse declaration order, so this
  // holds because m_pSecurityHandler is declared after m_CrossRefTable. Do not
  // reorder these two members.
192   std::unique_ptr<CPDF_CrossRefTable> m_CrossRefTable;
193   FX_FILESIZE m_LastXRefOffset = 0;
194   ByteString m_Password;
195   std::unique_ptr<CPDF_LinearizedHeader> m_pLinearized;
196 
197   // A map of object numbers to indirect streams.
198   std::map<uint32_t, std::unique_ptr<CPDF_ObjectStream>> m_ObjectStreamMap;
199 
200   // All indirect object numbers that are being parsed.
201   std::set<uint32_t> m_ParsingObjNums;
202 
203   RetainPtr<CPDF_SecurityHandler> m_pSecurityHandler;
204 };
205 
206 #endif  // CORE_FPDFAPI_PARSER_CPDF_PARSER_H_
207