• Home
  • Line#
  • Scopes#
  • Navigate#
  • Raw
  • Download
1 // Copyright 2016 The PDFium Authors
2 // Use of this source code is governed by a BSD-style license that can be
3 // found in the LICENSE file.
4 
5 // Original code copyright 2014 Foxit Software Inc. http://www.foxitsoftware.com
6 
7 #include "core/fpdfapi/parser/cpdf_parser.h"
8 
9 #include <ctype.h>
10 #include <stdint.h>
11 
12 #include <algorithm>
13 #include <optional>
14 #include <utility>
15 #include <vector>
16 
17 #include "core/fpdfapi/parser/cpdf_array.h"
18 #include "core/fpdfapi/parser/cpdf_crypto_handler.h"
19 #include "core/fpdfapi/parser/cpdf_dictionary.h"
20 #include "core/fpdfapi/parser/cpdf_document.h"
21 #include "core/fpdfapi/parser/cpdf_linearized_header.h"
22 #include "core/fpdfapi/parser/cpdf_number.h"
23 #include "core/fpdfapi/parser/cpdf_object_stream.h"
24 #include "core/fpdfapi/parser/cpdf_read_validator.h"
25 #include "core/fpdfapi/parser/cpdf_reference.h"
26 #include "core/fpdfapi/parser/cpdf_security_handler.h"
27 #include "core/fpdfapi/parser/cpdf_stream.h"
28 #include "core/fpdfapi/parser/cpdf_stream_acc.h"
29 #include "core/fpdfapi/parser/cpdf_syntax_parser.h"
30 #include "core/fpdfapi/parser/fpdf_parser_utility.h"
31 #include "core/fxcrt/autorestorer.h"
32 #include "core/fxcrt/check.h"
33 #include "core/fxcrt/check_op.h"
34 #include "core/fxcrt/containers/contains.h"
35 #include "core/fxcrt/data_vector.h"
36 #include "core/fxcrt/fx_extension.h"
37 #include "core/fxcrt/fx_safe_types.h"
38 #include "core/fxcrt/notreached.h"
39 #include "core/fxcrt/scoped_set_insertion.h"
40 #include "core/fxcrt/span.h"
41 
// File-wide shorthands for the nested CPDF_CrossRefTable types used below.
using ObjectType = CPDF_CrossRefTable::ObjectType;
using ObjectInfo = CPDF_CrossRefTable::ObjectInfo;
44 
45 namespace {
46 
// A limit on the size of the xref table. Theoretical limits are higher, but
// this may be large enough in practice. The max size should always be 1 more
// than the max object number.
constexpr int32_t kMaxXRefSize = CPDF_Parser::kMaxObjectNumber + 1;

// Length in bytes of a header line of the form "%PDF-1.7\n".
constexpr FX_FILESIZE kPDFHeaderSize = 9;

// The required number of fields in a /W array in a cross-reference stream
// dictionary.
constexpr size_t kMinFieldCount = 3;

// Object number recorded alongside a trailer that is inline, i.e. one that was
// read with LoadTrailer() rather than taken from a cross-reference stream
// object.
constexpr uint32_t kNoTrailerObjectNumber = 0;
61 
// One (start, count) pair describing a run of consecutive object numbers, as
// produced by GetCrossRefStreamIndices().
struct CrossRefStreamIndexEntry {
  // Object number of the first entry in the run.
  uint32_t start_obj_num;
  // Number of consecutive entries in the run.
  uint32_t obj_count;
};
66 
GetObjectTypeFromCrossRefStreamType(uint32_t cross_ref_stream_type)67 std::optional<ObjectType> GetObjectTypeFromCrossRefStreamType(
68     uint32_t cross_ref_stream_type) {
69   switch (cross_ref_stream_type) {
70     case 0:
71       return ObjectType::kFree;
72     case 1:
73       return ObjectType::kNormal;
74     case 2:
75       return ObjectType::kCompressed;
76     default:
77       return std::nullopt;
78   }
79 }
80 
81 // Use the Get*XRefStreamEntry() functions below, instead of calling this
82 // directly.
GetVarInt(pdfium::span<const uint8_t> input)83 uint32_t GetVarInt(pdfium::span<const uint8_t> input) {
84   uint32_t result = 0;
85   for (uint8_t c : input)
86     result = result * 256 + c;
87   return result;
88 }
89 
90 // The following 3 functions retrieve variable length entries from
91 // cross-reference streams, as described in ISO 32000-1:2008 table 18. There are
92 // only 3 fields for any given entry.
GetFirstXRefStreamEntry(pdfium::span<const uint8_t> entry_span,pdfium::span<const uint32_t> field_widths)93 uint32_t GetFirstXRefStreamEntry(pdfium::span<const uint8_t> entry_span,
94                                  pdfium::span<const uint32_t> field_widths) {
95   return GetVarInt(entry_span.first(field_widths[0]));
96 }
97 
GetSecondXRefStreamEntry(pdfium::span<const uint8_t> entry_span,pdfium::span<const uint32_t> field_widths)98 uint32_t GetSecondXRefStreamEntry(pdfium::span<const uint8_t> entry_span,
99                                   pdfium::span<const uint32_t> field_widths) {
100   return GetVarInt(entry_span.subspan(field_widths[0], field_widths[1]));
101 }
102 
GetThirdXRefStreamEntry(pdfium::span<const uint8_t> entry_span,pdfium::span<const uint32_t> field_widths)103 uint32_t GetThirdXRefStreamEntry(pdfium::span<const uint8_t> entry_span,
104                                  pdfium::span<const uint32_t> field_widths) {
105   return GetVarInt(
106       entry_span.subspan(field_widths[0] + field_widths[1], field_widths[2]));
107 }
108 
GetCrossRefStreamIndices(const CPDF_Array * array,uint32_t size)109 std::vector<CrossRefStreamIndexEntry> GetCrossRefStreamIndices(
110     const CPDF_Array* array,
111     uint32_t size) {
112   std::vector<CrossRefStreamIndexEntry> indices;
113   if (array) {
114     for (size_t i = 0; i < array->size() / 2; i++) {
115       RetainPtr<const CPDF_Number> pStartNumObj = array->GetNumberAt(i * 2);
116       if (!pStartNumObj)
117         continue;
118 
119       RetainPtr<const CPDF_Number> pCountObj = array->GetNumberAt(i * 2 + 1);
120       if (!pCountObj)
121         continue;
122 
123       int nStartNum = pStartNumObj->GetInteger();
124       int nCount = pCountObj->GetInteger();
125       if (nStartNum < 0 || nCount <= 0)
126         continue;
127 
128       indices.push_back(
129           {static_cast<uint32_t>(nStartNum), static_cast<uint32_t>(nCount)});
130     }
131   }
132 
133   if (indices.empty())
134     indices.push_back({0, size});
135   return indices;
136 }
137 
GetFieldWidths(const CPDF_Array * array)138 std::vector<uint32_t> GetFieldWidths(const CPDF_Array* array) {
139   std::vector<uint32_t> results;
140   if (!array)
141     return results;
142 
143   CPDF_ArrayLocker locker(array);
144   for (const auto& obj : locker)
145     results.push_back(obj->GetInteger());
146   return results;
147 }
148 
// Minimal ParsedObjectsHolder that CPDF_Parser creates and owns when it is
// constructed without an external holder. Its TryInit() always succeeds.
class ObjectsHolderStub final : public CPDF_Parser::ParsedObjectsHolder {
 public:
  ObjectsHolderStub() = default;
  ~ObjectsHolderStub() override = default;
  // Nothing to initialize; always reports success.
  bool TryInit() override { return true; }
};
155 
156 }  // namespace
157 
CPDF_Parser(ParsedObjectsHolder * holder)158 CPDF_Parser::CPDF_Parser(ParsedObjectsHolder* holder)
159     : m_pObjectsHolder(holder),
160       m_CrossRefTable(std::make_unique<CPDF_CrossRefTable>()) {
161   if (!holder) {
162     m_pOwnedObjectsHolder = std::make_unique<ObjectsHolderStub>();
163     m_pObjectsHolder = m_pOwnedObjectsHolder.get();
164   }
165 }
166 
// Default constructor: delegates with a null holder, so the parser creates and
// owns its own stub holder.
CPDF_Parser::CPDF_Parser() : CPDF_Parser(nullptr) {}
168 
// Defined out of line here; members clean up through their own destructors.
CPDF_Parser::~CPDF_Parser() = default;
170 
GetLastObjNum() const171 uint32_t CPDF_Parser::GetLastObjNum() const {
172   return m_CrossRefTable->objects_info().empty()
173              ? 0
174              : m_CrossRefTable->objects_info().rbegin()->first;
175 }
176 
IsValidObjectNumber(uint32_t objnum) const177 bool CPDF_Parser::IsValidObjectNumber(uint32_t objnum) const {
178   return objnum <= GetLastObjNum();
179 }
180 
GetObjectPositionOrZero(uint32_t objnum) const181 FX_FILESIZE CPDF_Parser::GetObjectPositionOrZero(uint32_t objnum) const {
182   const auto* info = m_CrossRefTable->GetObjectInfo(objnum);
183   return (info && info->type == ObjectType::kNormal) ? info->pos : 0;
184 }
185 
IsObjectFree(uint32_t objnum) const186 bool CPDF_Parser::IsObjectFree(uint32_t objnum) const {
187   DCHECK(IsValidObjectNumber(objnum));
188   const auto* info = m_CrossRefTable->GetObjectInfo(objnum);
189   return !info || info->type == ObjectType::kFree;
190 }
191 
InitSyntaxParser(RetainPtr<CPDF_ReadValidator> validator)192 bool CPDF_Parser::InitSyntaxParser(RetainPtr<CPDF_ReadValidator> validator) {
193   const std::optional<FX_FILESIZE> header_offset = GetHeaderOffset(validator);
194   if (!header_offset.has_value())
195     return false;
196   if (validator->GetSize() < header_offset.value() + kPDFHeaderSize)
197     return false;
198 
199   m_pSyntax = std::make_unique<CPDF_SyntaxParser>(std::move(validator),
200                                                   header_offset.value());
201   return ParseFileVersion();
202 }
203 
ParseFileVersion()204 bool CPDF_Parser::ParseFileVersion() {
205   m_FileVersion = 0;
206   uint8_t ch;
207   if (!m_pSyntax->GetCharAt(5, ch))
208     return false;
209 
210   if (isdigit(ch))
211     m_FileVersion = FXSYS_DecimalCharToInt(static_cast<wchar_t>(ch)) * 10;
212 
213   if (!m_pSyntax->GetCharAt(7, ch))
214     return false;
215 
216   if (isdigit(ch))
217     m_FileVersion += FXSYS_DecimalCharToInt(static_cast<wchar_t>(ch));
218   return true;
219 }
220 
StartParse(RetainPtr<IFX_SeekableReadStream> pFileAccess,const ByteString & password)221 CPDF_Parser::Error CPDF_Parser::StartParse(
222     RetainPtr<IFX_SeekableReadStream> pFileAccess,
223     const ByteString& password) {
224   if (!InitSyntaxParser(pdfium::MakeRetain<CPDF_ReadValidator>(
225           std::move(pFileAccess), nullptr)))
226     return FORMAT_ERROR;
227   SetPassword(password);
228   return StartParseInternal();
229 }
230 
// Performs the actual parse once the syntax parser exists.
//
// Loads the cross-reference data reachable from the "startxref" offset and
// falls back to RebuildCrossRef() when that offset is unusable or loading
// fails. Then installs the encryption handler and checks that a document root
// is available, rebuilding the cross-reference table once more if needed.
//
// Returns SUCCESS, FORMAT_ERROR when no usable cross-reference data or root
// can be obtained, or whatever error SetEncryptHandler() reports.
CPDF_Parser::Error CPDF_Parser::StartParseInternal() {
  DCHECK(!m_bHasParsed);
  DCHECK(!m_bXRefTableRebuilt);
  m_bHasParsed = true;
  m_bXRefStream = false;

  m_LastXRefOffset = ParseStartXRef();
  if (m_LastXRefOffset >= kPDFHeaderSize) {
    // The startxref offset is plausible; try the regular loading path first.
    if (!LoadAllCrossRefTablesAndStreams(m_LastXRefOffset)) {
      if (!RebuildCrossRef())
        return FORMAT_ERROR;

      m_bXRefTableRebuilt = true;
      m_LastXRefOffset = 0;
    }
  } else {
    // No usable startxref offset (ParseStartXRef() returns 0 on failure), so
    // the table has to be rebuilt.
    if (!RebuildCrossRef())
      return FORMAT_ERROR;

    m_bXRefTableRebuilt = true;
  }
  Error eRet = SetEncryptHandler();
  if (eRet != SUCCESS)
    return eRet;

  if (!GetRoot() || !m_pObjectsHolder->TryInit()) {
    // A table that was already rebuilt cannot be improved by rebuilding again.
    if (m_bXRefTableRebuilt)
      return FORMAT_ERROR;

    ReleaseEncryptHandler();
    if (!RebuildCrossRef())
      return FORMAT_ERROR;

    eRet = SetEncryptHandler();
    if (eRet != SUCCESS)
      return eRet;

    // The result of this TryInit() is not checked; only the presence of a root
    // decides success here.
    m_pObjectsHolder->TryInit();
    if (!GetRoot())
      return FORMAT_ERROR;
  }
  if (GetRootObjNum() == CPDF_Object::kInvalidObjNum) {
    // A root exists but has no valid object number: rebuild and re-check.
    ReleaseEncryptHandler();
    if (!RebuildCrossRef() || GetRootObjNum() == CPDF_Object::kInvalidObjNum)
      return FORMAT_ERROR;

    eRet = SetEncryptHandler();
    if (eRet != SUCCESS)
      return eRet;
  }
  // With a security handler that reports unencrypted metadata, remember the
  // object number that the root's /Metadata reference points at.
  if (m_pSecurityHandler && !m_pSecurityHandler->IsMetadataEncrypted()) {
    RetainPtr<const CPDF_Reference> pMetadata =
        ToReference(GetRoot()->GetObjectFor("Metadata"));
    if (pMetadata)
      m_MetadataObjnum = pMetadata->GetRefObjNum();
  }
  return SUCCESS;
}
289 
ParseStartXRef()290 FX_FILESIZE CPDF_Parser::ParseStartXRef() {
291   static constexpr char kStartXRefKeyword[] = "startxref";
292   m_pSyntax->SetPos(m_pSyntax->GetDocumentSize() - strlen(kStartXRefKeyword));
293   if (!m_pSyntax->BackwardsSearchToWord(kStartXRefKeyword, 4096))
294     return 0;
295 
296   // Skip "startxref" keyword.
297   m_pSyntax->GetKeyword();
298 
299   // Read XRef offset.
300   const CPDF_SyntaxParser::WordResult xref_offset_result =
301       m_pSyntax->GetNextWord();
302   if (!xref_offset_result.is_number || xref_offset_result.word.IsEmpty())
303     return 0;
304 
305   const FX_SAFE_FILESIZE result = FXSYS_atoi64(xref_offset_result.word.c_str());
306   if (!result.IsValid() || result.ValueOrDie() >= m_pSyntax->GetDocumentSize())
307     return 0;
308 
309   return result.ValueOrDie();
310 }
311 
SetEncryptHandler()312 CPDF_Parser::Error CPDF_Parser::SetEncryptHandler() {
313   ReleaseEncryptHandler();
314   if (!GetTrailer())
315     return FORMAT_ERROR;
316 
317   RetainPtr<const CPDF_Dictionary> pEncryptDict = GetEncryptDict();
318   if (!pEncryptDict)
319     return SUCCESS;
320 
321   if (pEncryptDict->GetNameFor("Filter") != "Standard")
322     return HANDLER_ERROR;
323 
324   auto pSecurityHandler = pdfium::MakeRetain<CPDF_SecurityHandler>();
325   if (!pSecurityHandler->OnInit(pEncryptDict, GetIDArray(), GetPassword()))
326     return PASSWORD_ERROR;
327 
328   m_pSecurityHandler = std::move(pSecurityHandler);
329   return SUCCESS;
330 }
331 
// Drops the parser's reference to the current security handler, if any.
void CPDF_Parser::ReleaseEncryptHandler() {
  m_pSecurityHandler.Reset();
}
335 
336 // Ideally, all the cross reference entries should be verified.
337 // In reality, we rarely see well-formed cross references don't match
338 // with the objects. crbug/602650 showed a case where object numbers
339 // in the cross reference table are all off by one.
VerifyCrossRefTable()340 bool CPDF_Parser::VerifyCrossRefTable() {
341   for (const auto& it : m_CrossRefTable->objects_info()) {
342     if (it.second.pos <= 0)
343       continue;
344     // Find the first non-zero position.
345     FX_FILESIZE SavedPos = m_pSyntax->GetPos();
346     m_pSyntax->SetPos(it.second.pos);
347     CPDF_SyntaxParser::WordResult word_result = m_pSyntax->GetNextWord();
348     m_pSyntax->SetPos(SavedPos);
349     if (!word_result.is_number || word_result.word.IsEmpty() ||
350         FXSYS_atoui(word_result.word.c_str()) != it.first) {
351       // If the object number read doesn't match the one stored,
352       // something is wrong with the cross reference table.
353       return false;
354     }
355     break;
356   }
357   return true;
358 }
359 
// Loads the cross-reference section at `xref_offset` and every earlier section
// reachable from it through /Prev.
//
// The section at `xref_offset` is first probed as a classic table; if that
// fails it is treated as a cross-reference stream. The offsets of all sections
// are then collected into two parallel lists (`xref_list` for tables,
// `xref_stream_list` for streams, with 0 meaning "none") and loaded in list
// order.
//
// Returns false if any required section or trailer fails to load, or if the
// first table fails VerifyCrossRefTable().
bool CPDF_Parser::LoadAllCrossRefTablesAndStreams(FX_FILESIZE xref_offset) {
  // Probe only (skip=true): no entries are stored by this call.
  const bool is_xref_stream = !LoadCrossRefTable(xref_offset, /*skip=*/true);
  if (is_xref_stream) {
    // Use a copy of `xref_offset`, as LoadCrossRefStream() may change it.
    FX_FILESIZE xref_offset_copy = xref_offset;
    if (!LoadCrossRefStream(&xref_offset_copy, /*is_main_xref=*/true)) {
      return false;
    }

    // LoadCrossRefStream() sets the trailer when `is_main_xref` is true.
    // Thus no SetTrailer() call like the else-block below. Similarly,
    // LoadCrossRefStream() also calls SetObjectMapSize() itself, so no need to
    // call it again here.
  } else {
    RetainPtr<CPDF_Dictionary> trailer = LoadTrailer();
    if (!trailer) {
      return false;
    }

    m_CrossRefTable->SetTrailer(std::move(trailer), kNoTrailerObjectNumber);

    // Only a /Size within (0, kMaxXRefSize] is applied; other values are
    // ignored rather than treated as an error.
    const int32_t xrefsize = GetTrailer()->GetDirectIntegerFor("Size");
    if (xrefsize > 0 && xrefsize <= kMaxXRefSize) {
      m_CrossRefTable->SetObjectMapSize(xrefsize);
    }
  }

  // Parallel lists: entry i holds the table offset and the stream offset of
  // one cross-reference section.
  std::vector<FX_FILESIZE> xref_list;
  std::vector<FX_FILESIZE> xref_stream_list;

  if (is_xref_stream) {
    xref_list.push_back(0);
    xref_stream_list.push_back(xref_offset);
  } else {
    xref_list.push_back(xref_offset);
    xref_stream_list.push_back(GetTrailer()->GetDirectIntegerFor("XRefStm"));
  }

  // Earlier sections are inserted at the front of both lists, so the section
  // at `xref_offset` ends up last.
  if (!FindAllCrossReferenceTablesAndStream(xref_offset, xref_list,
                                            xref_stream_list)) {
    return false;
  }

  if (xref_list.front() > 0) {
    if (!LoadCrossRefTable(xref_list.front(), /*skip=*/false)) {
      return false;
    }

    if (!VerifyCrossRefTable()) {
      return false;
    }
  }

  // Cross reference table entries take precedence over cross reference stream
  // entries. So process the stream entries first and then give the cross
  // reference tables a chance to overwrite them.
  //
  // XRefStm entries should only be used in update sections, so skip
  // `xref_stream_list.front()`.
  //
  // See details in ISO 32000-1:2008, section 7.5.8.4.
  for (size_t i = 1; i < xref_list.size(); ++i) {
    if (xref_stream_list[i] > 0 &&
        !LoadCrossRefStream(&xref_stream_list[i], /*is_main_xref=*/false)) {
      return false;
    }
    if (xref_list[i] > 0 && !LoadCrossRefTable(xref_list[i], /*skip=*/false)) {
      return false;
    }
  }

  if (is_xref_stream) {
    m_ObjectStreamMap.clear();
    m_bXRefStream = true;
  }

  return true;
}
438 
// Loads the main cross-reference table of a linearized document, located at
// `main_xref_offset`, together with every earlier section reachable from its
// trailer, and merges the main trailer with the first-page trailer that is
// already held by m_CrossRefTable.
//
// Returns false if the main table or its trailer cannot be loaded, if the
// first-page trailer's /Size is 0, or if any referenced section fails to load.
bool CPDF_Parser::LoadLinearizedAllCrossRefTable(FX_FILESIZE main_xref_offset) {
  if (!LoadCrossRefTable(main_xref_offset, /*skip=*/false)) {
    return false;
  }

  RetainPtr<CPDF_Dictionary> main_trailer = LoadTrailer();
  if (!main_trailer)
    return false;

  // GetTrailer() currently returns the first-page trailer.
  if (GetTrailer()->GetDirectIntegerFor("Size") == 0)
    return false;

  // Read /XRefStm from the first-page trailer. No need to read /Prev for the
  // first-page trailer, as the caller already did that and passed it in as
  // |main_xref_offset|.
  FX_FILESIZE xref_stm = GetTrailer()->GetDirectIntegerFor("XRefStm");
  // Parallel lists of table offsets and stream offsets; 0 means "none".
  std::vector<FX_FILESIZE> xref_list{main_xref_offset};
  std::vector<FX_FILESIZE> xref_stream_list{xref_stm};

  // Merge the trailers. Now GetTrailer() returns the merged trailer, where
  // /Prev is from the main-trailer.
  m_CrossRefTable = CPDF_CrossRefTable::MergeUp(
      std::make_unique<CPDF_CrossRefTable>(std::move(main_trailer),
                                           kNoTrailerObjectNumber),
      std::move(m_CrossRefTable));

  if (!FindAllCrossReferenceTablesAndStream(main_xref_offset, xref_list,
                                            xref_stream_list)) {
    return false;
  }

  // Unlike LoadAllCrossRefTablesAndStreams(), the first XRefStm entry in
  // `xref_stream_list` should be processed.
  if (xref_stream_list[0] > 0 &&
      !LoadCrossRefStream(&xref_stream_list[0], /*is_main_xref=*/false)) {
    return false;
  }

  // Cross reference table entries take precedence over cross reference stream
  // entries. So process the stream entries first and then give the cross
  // reference tables a chance to overwrite them.
  for (size_t i = 1; i < xref_list.size(); ++i) {
    if (xref_stream_list[i] > 0 &&
        !LoadCrossRefStream(&xref_stream_list[i], /*is_main_xref=*/false)) {
      return false;
    }
    if (xref_list[i] > 0 && !LoadCrossRefTable(xref_list[i], /*skip=*/false)) {
      return false;
    }
  }

  return true;
}
493 
ParseAndAppendCrossRefSubsectionData(uint32_t start_objnum,uint32_t count,std::vector<CrossRefObjData> * out_objects)494 bool CPDF_Parser::ParseAndAppendCrossRefSubsectionData(
495     uint32_t start_objnum,
496     uint32_t count,
497     std::vector<CrossRefObjData>* out_objects) {
498   if (!count)
499     return true;
500 
501   // Each entry shall be exactly 20 byte.
502   // A sample entry looks like:
503   // "0000000000 00007 f\r\n"
504   static constexpr int32_t kEntrySize = 20;
505 
506   if (!out_objects) {
507     FX_SAFE_FILESIZE pos = count;
508     pos *= kEntrySize;
509     pos += m_pSyntax->GetPos();
510     if (!pos.IsValid())
511       return false;
512     m_pSyntax->SetPos(pos.ValueOrDie());
513     return true;
514   }
515   const size_t start_obj_index = out_objects->size();
516   FX_SAFE_SIZE_T new_size = start_obj_index;
517   new_size += count;
518   if (!new_size.IsValid())
519     return false;
520 
521   if (new_size.ValueOrDie() > kMaxXRefSize)
522     return false;
523 
524   const size_t max_entries_in_file = m_pSyntax->GetDocumentSize() / kEntrySize;
525   if (new_size.ValueOrDie() > max_entries_in_file)
526     return false;
527 
528   out_objects->resize(new_size.ValueOrDie());
529 
530   DataVector<char> buf(1024 * kEntrySize + 1);
531   buf.back() = '\0';
532 
533   uint32_t entries_to_read = count;
534   while (entries_to_read > 0) {
535     const uint32_t entries_in_block = std::min(entries_to_read, 1024u);
536     const uint32_t bytes_to_read = entries_in_block * kEntrySize;
537     auto block_span = pdfium::make_span(buf).first(bytes_to_read);
538     if (!m_pSyntax->ReadBlock(pdfium::as_writable_bytes(block_span)))
539       return false;
540 
541     for (uint32_t i = 0; i < entries_in_block; i++) {
542       uint32_t iObjectIndex = count - entries_to_read + i;
543       CrossRefObjData& obj_data =
544           (*out_objects)[start_obj_index + iObjectIndex];
545       const uint32_t objnum = start_objnum + iObjectIndex;
546       obj_data.obj_num = objnum;
547       ObjectInfo& info = obj_data.info;
548 
549       pdfium::span<const char> pEntry =
550           pdfium::make_span(buf).subspan(i * kEntrySize);
551       if (pEntry[17] == 'f') {
552         info.pos = 0;
553         info.type = ObjectType::kFree;
554       } else {
555         const FX_SAFE_FILESIZE offset = FXSYS_atoi64(pEntry.data());
556         if (!offset.IsValid())
557           return false;
558 
559         if (offset.ValueOrDie() == 0) {
560           for (int32_t c = 0; c < 10; c++) {
561             if (!isdigit(pEntry[c]))
562               return false;
563           }
564         }
565 
566         info.pos = offset.ValueOrDie();
567 
568         // TODO(art-snake): The info.gennum is uint16_t, but version may be
569         // greated than max<uint16_t>. Needs solve this issue.
570         const int32_t version = FXSYS_atoi(pEntry.subspan(11).data());
571         info.gennum = version;
572         info.type = ObjectType::kNormal;
573       }
574     }
575     entries_to_read -= entries_in_block;
576   }
577   return true;
578 }
579 
ParseCrossRefTable(std::vector<CrossRefObjData> * out_objects)580 bool CPDF_Parser::ParseCrossRefTable(
581     std::vector<CrossRefObjData>* out_objects) {
582   if (out_objects)
583     out_objects->clear();
584 
585   if (m_pSyntax->GetKeyword() != "xref")
586     return false;
587   std::vector<CrossRefObjData> result_objects;
588   while (true) {
589     FX_FILESIZE saved_pos = m_pSyntax->GetPos();
590     CPDF_SyntaxParser::WordResult word_result = m_pSyntax->GetNextWord();
591     const ByteString& word = word_result.word;
592     if (word.IsEmpty())
593       return false;
594 
595     if (!word_result.is_number) {
596       m_pSyntax->SetPos(saved_pos);
597       break;
598     }
599 
600     uint32_t start_objnum = FXSYS_atoui(word.c_str());
601     if (start_objnum >= kMaxObjectNumber)
602       return false;
603 
604     uint32_t count = m_pSyntax->GetDirectNum();
605     m_pSyntax->ToNextWord();
606 
607     if (!ParseAndAppendCrossRefSubsectionData(
608             start_objnum, count, out_objects ? &result_objects : nullptr)) {
609       return false;
610     }
611   }
612   if (out_objects)
613     *out_objects = std::move(result_objects);
614   return true;
615 }
616 
LoadCrossRefTable(FX_FILESIZE pos,bool skip)617 bool CPDF_Parser::LoadCrossRefTable(FX_FILESIZE pos, bool skip) {
618   m_pSyntax->SetPos(pos);
619   std::vector<CrossRefObjData> objects;
620   if (!ParseCrossRefTable(skip ? nullptr : &objects)) {
621     return false;
622   }
623 
624   MergeCrossRefObjectsData(objects);
625   return true;
626 }
627 
MergeCrossRefObjectsData(const std::vector<CrossRefObjData> & objects)628 void CPDF_Parser::MergeCrossRefObjectsData(
629     const std::vector<CrossRefObjData>& objects) {
630   for (const auto& obj : objects) {
631     switch (obj.info.type) {
632       case ObjectType::kFree:
633         if (obj.info.gennum > 0)
634           m_CrossRefTable->SetFree(obj.obj_num, obj.info.gennum);
635         break;
636       case ObjectType::kNormal:
637         m_CrossRefTable->AddNormal(obj.obj_num, obj.info.gennum,
638                                    obj.info.is_object_stream_flag,
639                                    obj.info.pos);
640         break;
641       case ObjectType::kCompressed:
642         m_CrossRefTable->AddCompressed(obj.obj_num, obj.info.archive.obj_num,
643                                        obj.info.archive.obj_index);
644         break;
645     }
646   }
647 }
648 
// Follows the /Prev chain starting from the current trailer and records every
// earlier cross-reference section in the two parallel output lists.
//
// For each section found, one element is inserted at the FRONT of both lists,
// so earlier sections come first. `xref_list` receives the table offset (0 if
// the section is a cross-reference stream) and `xref_stream_list` receives the
// stream offset or the trailer's /XRefStm value. Each section's trailer is
// merged into m_CrossRefTable along the way.
//
// Returns false when an offset repeats (circular /Prev chain) or when a
// section is neither a loadable stream nor followed by a loadable trailer.
bool CPDF_Parser::FindAllCrossReferenceTablesAndStream(
    FX_FILESIZE main_xref_offset,
    std::vector<FX_FILESIZE>& xref_list,
    std::vector<FX_FILESIZE>& xref_stream_list) {
  std::set<FX_FILESIZE> seen_xref_offset{main_xref_offset};

  // When the trailer doesn't have Prev entry or Prev entry value is not
  // numerical, GetDirectInteger() returns 0. Loading will end.
  FX_FILESIZE xref_offset = GetTrailer()->GetDirectIntegerFor("Prev");
  while (xref_offset > 0) {
    // Check for circular references.
    if (pdfium::Contains(seen_xref_offset, xref_offset)) {
      return false;
    }

    seen_xref_offset.insert(xref_offset);

    // Use a copy of `xref_offset`, as LoadCrossRefStream() may change it.
    FX_FILESIZE xref_offset_copy = xref_offset;
    if (LoadCrossRefStream(&xref_offset_copy, /*is_main_xref=*/false)) {
      // Since `xref_offset` points to a cross reference stream, mark it
      // accordingly.
      xref_list.insert(xref_list.begin(), 0);
      xref_stream_list.insert(xref_stream_list.begin(), xref_offset);
      // LoadCrossRefStream() replaced the copy with the stream's /Prev value.
      xref_offset = xref_offset_copy;

      // On success, LoadCrossRefStream() called CPDF_CrossRefTable::MergeUp()
      // when `is_main_xref` is false. Thus no explicit call here.
    } else {
      // SLOW ...
      // The return value is deliberately not checked here; a failure is
      // presumably caught by the LoadTrailer() check below.
      // TODO confirm against LoadTrailer().
      LoadCrossRefTable(xref_offset, /*skip=*/true);

      RetainPtr<CPDF_Dictionary> trailer_dict = LoadTrailer();
      if (!trailer_dict) {
        return false;
      }

      // The trailer for cross reference tables may point to a cross reference
      // stream as well.
      // NOTE(review): /XRefStm is read with GetIntegerFor() here, while the
      // other /XRefStm reads in this file use GetDirectIntegerFor(). Confirm
      // whether the difference is intentional.
      xref_list.insert(xref_list.begin(), xref_offset);
      xref_stream_list.insert(xref_stream_list.begin(),
                              trailer_dict->GetIntegerFor("XRefStm"));
      xref_offset = trailer_dict->GetDirectIntegerFor("Prev");

      // SLOW ...
      m_CrossRefTable = CPDF_CrossRefTable::MergeUp(
          std::make_unique<CPDF_CrossRefTable>(std::move(trailer_dict),
                                               kNoTrailerObjectNumber),
          std::move(m_CrossRefTable));
    }
  }
  return true;
}
702 
// Reconstructs the cross-reference table by scanning the whole document from
// position 0, word by word.
//
// The scan tracks the last two numbers seen; when they are followed by "obj"
// they are taken as the object number and generation number of an indirect
// object. Trailers found after a "trailer" keyword, and dictionaries of
// streams whose /Type is "XRef", are merged in as trailers. The result is then
// merged with the existing m_CrossRefTable.
//
// Returns true when a trailer is available afterwards and at least one object
// was recorded.
bool CPDF_Parser::RebuildCrossRef() {
  auto cross_ref_table = std::make_unique<CPDF_CrossRefTable>();

  const uint32_t kBufferSize = 4096;
  m_pSyntax->SetReadBufferSize(kBufferSize);
  m_pSyntax->SetPos(0);

  // Sliding window of the most recent (value, file position) numbers; at most
  // two are kept.
  std::vector<std::pair<uint32_t, FX_FILESIZE>> numbers;
  for (CPDF_SyntaxParser::WordResult result = m_pSyntax->GetNextWord();
       !result.word.IsEmpty(); result = m_pSyntax->GetNextWord()) {
    const ByteString& word = result.word;
    if (result.is_number) {
      numbers.emplace_back(FXSYS_atoui(word.c_str()),
                           m_pSyntax->GetPos() - word.GetLength());
      if (numbers.size() > 2u)
        numbers.erase(numbers.begin());
      continue;
    }

    if (word == "(") {
      // Consume the string so its contents are not scanned as words.
      m_pSyntax->ReadString();
    } else if (word == "<") {
      // Likewise for hex strings.
      m_pSyntax->ReadHexString();
    } else if (word == "trailer") {
      RetainPtr<CPDF_Object> pTrailer = m_pSyntax->GetObjectBody(nullptr);
      if (pTrailer) {
        CPDF_Stream* stream_trailer = pTrailer->AsMutableStream();
        // Grab the object number from `pTrailer` before potentially calling
        // std::move(pTrailer) below.
        const uint32_t trailer_object_number = pTrailer->GetObjNum();
        RetainPtr<CPDF_Dictionary> trailer_dict =
            stream_trailer ? stream_trailer->GetMutableDict()
                           : ToDictionary(std::move(pTrailer));
        cross_ref_table = CPDF_CrossRefTable::MergeUp(
            std::move(cross_ref_table),
            std::make_unique<CPDF_CrossRefTable>(std::move(trailer_dict),
                                                 trailer_object_number));
      }
    } else if (word == "obj" && numbers.size() == 2u) {
      // "<obj_num> <gen_num> obj": the object starts where the first number
      // was read.
      const FX_FILESIZE obj_pos = numbers[0].second;
      const uint32_t obj_num = numbers[0].first;
      const uint32_t gen_num = numbers[1].first;

      m_pSyntax->SetPos(obj_pos);
      RetainPtr<CPDF_Stream> pStream = ToStream(m_pSyntax->GetIndirectObject(
          nullptr, CPDF_SyntaxParser::ParseType::kStrict));

      // A cross-reference stream's dictionary is merged in as a trailer.
      if (pStream && pStream->GetDict()->GetNameFor("Type") == "XRef") {
        cross_ref_table = CPDF_CrossRefTable::MergeUp(
            std::move(cross_ref_table),
            std::make_unique<CPDF_CrossRefTable>(
                ToDictionary(pStream->GetDict()->Clone()),
                pStream->GetObjNum()));
      }

      if (obj_num < kMaxObjectNumber) {
        cross_ref_table->AddNormal(obj_num, gen_num, /*is_object_stream=*/false,
                                   obj_pos);
        // If the object can be wrapped as an object stream, also register the
        // objects it contains as compressed entries.
        const auto object_stream =
            CPDF_ObjectStream::Create(std::move(pStream));
        if (object_stream) {
          const auto& object_info = object_stream->object_info();
          for (size_t i = 0; i < object_info.size(); ++i) {
            const auto& info = object_info[i];
            if (info.obj_num < kMaxObjectNumber)
              cross_ref_table->AddCompressed(info.obj_num, obj_num, i);
          }
        }
      }
    }
    // Any non-number word ends the current run of numbers.
    numbers.clear();
  }

  m_CrossRefTable = CPDF_CrossRefTable::MergeUp(std::move(m_CrossRefTable),
                                                std::move(cross_ref_table));
  // Restore default buffer size.
  m_pSyntax->SetReadBufferSize(CPDF_Stream::kFileBufSize);

  return GetTrailer() && !m_CrossRefTable->objects_info().empty();
}
783 
// Loads the cross-reference stream stored as an indirect object at `*pos`.
//
// On the way, `*pos` is overwritten with the stream dictionary's /Prev value.
// With `is_main_xref` set, m_CrossRefTable is replaced by a table built from
// this stream's dictionary and sized by /Size; otherwise the new table is
// merged with the existing one. The entries are then decoded according to /W
// and /Index.
//
// Returns false when the object is not a stream with an object number, when
// /Prev or /Size is negative, or when /W has fewer than kMinFieldCount fields
// or a total width that overflows. Note that `*pos` and m_CrossRefTable have
// already been updated by the time the /W checks can fail.
bool CPDF_Parser::LoadCrossRefStream(FX_FILESIZE* pos, bool is_main_xref) {
  RetainPtr<const CPDF_Stream> pStream =
      ToStream(ParseIndirectObjectAt(*pos, 0));
  if (!pStream || !pStream->GetObjNum()) {
    return false;
  }

  RetainPtr<const CPDF_Dictionary> pDict = pStream->GetDict();
  int32_t prev = pDict->GetIntegerFor("Prev");
  if (prev < 0)
    return false;

  int32_t size = pDict->GetIntegerFor("Size");
  if (size < 0)
    return false;

  // Hand the previous section's offset back to the caller.
  *pos = prev;

  auto new_cross_ref_table = std::make_unique<CPDF_CrossRefTable>(
      /*trailer=*/ToDictionary(pDict->Clone()),
      /*trailer_object_number=*/pStream->GetObjNum());
  if (is_main_xref) {
    m_CrossRefTable = std::move(new_cross_ref_table);
    m_CrossRefTable->SetObjectMapSize(size);
  } else {
    m_CrossRefTable = CPDF_CrossRefTable::MergeUp(
        std::move(new_cross_ref_table), std::move(m_CrossRefTable));
  }

  // Object number ranges covered by this stream; defaults to [0, size) when
  // /Index is missing or yields no usable pair.
  std::vector<CrossRefStreamIndexEntry> indices =
      GetCrossRefStreamIndices(pDict->GetArrayFor("Index").Get(), size);

  std::vector<uint32_t> field_widths =
      GetFieldWidths(pDict->GetArrayFor("W").Get());
  if (field_widths.size() < kMinFieldCount)
    return false;

  // Total byte width of one entry, computed with overflow checking.
  FX_SAFE_UINT32 dwAccWidth;
  for (uint32_t width : field_widths)
    dwAccWidth += width;
  if (!dwAccWidth.IsValid())
    return false;

  uint32_t total_width = dwAccWidth.ValueOrDie();
  auto pAcc = pdfium::MakeRetain<CPDF_StreamAcc>(pStream);
  pAcc->LoadAllDataFiltered();

  pdfium::span<const uint8_t> data_span = pAcc->GetSpan();
  // Index, in entries, of the next unread entry in the stream data.
  uint32_t segindex = 0;
  for (const auto& index : indices) {
    // Skip ranges that would run past the end of the stream data. `segindex`
    // is not advanced for a skipped range.
    FX_SAFE_UINT32 seg_end = segindex;
    seg_end += index.obj_count;
    seg_end *= total_width;
    if (!seg_end.IsValid() || seg_end.ValueOrDie() > data_span.size())
      continue;

    pdfium::span<const uint8_t> seg_span = data_span.subspan(
        segindex * total_width, index.obj_count * total_width);
    FX_SAFE_UINT32 safe_new_size = index.start_obj_num;
    safe_new_size += index.obj_count;
    if (!safe_new_size.IsValid()) {
      continue;
    }

    // Until SetObjectMapSize() below has been called by a prior loop iteration,
    // `current_size` is based on the /Size value parsed in
    // LoadCrossRefStream(). PDFs may not always have the correct /Size. In this
    // case, other PDF implementations ignore the incorrect size, and PDFium
    // also ignores incorrect size in trailers for cross reference tables.
    const uint32_t current_size =
        m_CrossRefTable->objects_info().empty() ? 0 : GetLastObjNum() + 1;
    // So allow `new_size` to be greater than `current_size`, but avoid going
    // over `kMaxXRefSize`. This works just fine because the loop below checks
    // against `kMaxObjectNumber`, and the two "max" constants are in sync.
    const uint32_t new_size =
        std::min<uint32_t>(safe_new_size.ValueOrDie(), kMaxXRefSize);
    if (new_size > current_size) {
      m_CrossRefTable->SetObjectMapSize(new_size);
    }

    for (uint32_t i = 0; i < index.obj_count; ++i) {
      const uint32_t obj_num = index.start_obj_num + i;
      // Object numbers only grow within a range, so stop at the first one
      // that is out of bounds.
      if (obj_num >= kMaxObjectNumber) {
        break;
      }

      ProcessCrossRefStreamEntry(seg_span.subspan(i * total_width, total_width),
                                 field_widths, obj_num);
    }

    segindex += index.obj_count;
  }
  return true;
}
878 
ProcessCrossRefStreamEntry(pdfium::span<const uint8_t> entry_span,pdfium::span<const uint32_t> field_widths,uint32_t obj_num)879 void CPDF_Parser::ProcessCrossRefStreamEntry(
880     pdfium::span<const uint8_t> entry_span,
881     pdfium::span<const uint32_t> field_widths,
882     uint32_t obj_num) {
883   DCHECK_GE(field_widths.size(), kMinFieldCount);
884   ObjectType type;
885   if (field_widths[0]) {
886     const uint32_t cross_ref_stream_obj_type =
887         GetFirstXRefStreamEntry(entry_span, field_widths);
888     std::optional<ObjectType> maybe_type =
889         GetObjectTypeFromCrossRefStreamType(cross_ref_stream_obj_type);
890     if (!maybe_type.has_value()) {
891       return;
892     }
893     type = maybe_type.value();
894   } else {
895     // Per ISO 32000-1:2008 table 17, use the default value of 1 for the xref
896     // stream entry when it is not specified. The `type` assignment is the
897     // equivalent to calling GetObjectTypeFromCrossRefStreamType(1).
898     type = ObjectType::kNormal;
899   }
900 
901   if (type == ObjectType::kFree) {
902     const uint32_t gen_num = GetThirdXRefStreamEntry(entry_span, field_widths);
903     if (pdfium::IsValueInRangeForNumericType<uint16_t>(gen_num)) {
904       m_CrossRefTable->SetFree(obj_num, gen_num);
905     }
906     return;
907   }
908 
909   if (type == ObjectType::kNormal) {
910     const uint32_t offset = GetSecondXRefStreamEntry(entry_span, field_widths);
911     const uint32_t gen_num = GetThirdXRefStreamEntry(entry_span, field_widths);
912     if (pdfium::IsValueInRangeForNumericType<FX_FILESIZE>(offset) &&
913         pdfium::IsValueInRangeForNumericType<uint16_t>(gen_num)) {
914       m_CrossRefTable->AddNormal(obj_num, gen_num, /*is_object_stream=*/false,
915                                  offset);
916     }
917     return;
918   }
919 
920   DCHECK_EQ(type, ObjectType::kCompressed);
921   const uint32_t archive_obj_num =
922       GetSecondXRefStreamEntry(entry_span, field_widths);
923   if (!IsValidObjectNumber(archive_obj_num)) {
924     return;
925   }
926 
927   const uint32_t archive_obj_index =
928       GetThirdXRefStreamEntry(entry_span, field_widths);
929   m_CrossRefTable->AddCompressed(obj_num, archive_obj_num, archive_obj_index);
930 }
931 
GetIDArray() const932 RetainPtr<const CPDF_Array> CPDF_Parser::GetIDArray() const {
933   return GetTrailer() ? GetTrailer()->GetArrayFor("ID") : nullptr;
934 }
935 
GetRoot() const936 RetainPtr<const CPDF_Dictionary> CPDF_Parser::GetRoot() const {
937   RetainPtr<CPDF_Object> obj =
938       m_pObjectsHolder->GetOrParseIndirectObject(GetRootObjNum());
939   return obj ? obj->GetDict() : nullptr;
940 }
941 
GetEncryptDict() const942 RetainPtr<const CPDF_Dictionary> CPDF_Parser::GetEncryptDict() const {
943   if (!GetTrailer())
944     return nullptr;
945 
946   RetainPtr<const CPDF_Object> pEncryptObj =
947       GetTrailer()->GetObjectFor("Encrypt");
948   if (!pEncryptObj)
949     return nullptr;
950 
951   if (pEncryptObj->IsDictionary())
952     return pdfium::WrapRetain(pEncryptObj->AsDictionary());
953 
954   if (pEncryptObj->IsReference()) {
955     return ToDictionary(m_pObjectsHolder->GetOrParseIndirectObject(
956         pEncryptObj->AsReference()->GetRefObjNum()));
957   }
958   return nullptr;
959 }
960 
GetEncodedPassword() const961 ByteString CPDF_Parser::GetEncodedPassword() const {
962   return GetSecurityHandler()->GetEncodedPassword(GetPassword().AsStringView());
963 }
964 
GetTrailer() const965 const CPDF_Dictionary* CPDF_Parser::GetTrailer() const {
966   return m_CrossRefTable->trailer();
967 }
968 
GetMutableTrailerForTesting()969 CPDF_Dictionary* CPDF_Parser::GetMutableTrailerForTesting() {
970   return m_CrossRefTable->GetMutableTrailerForTesting();
971 }
972 
GetTrailerObjectNumber() const973 uint32_t CPDF_Parser::GetTrailerObjectNumber() const {
974   return m_CrossRefTable->trailer_object_number();
975 }
976 
GetCombinedTrailer() const977 RetainPtr<CPDF_Dictionary> CPDF_Parser::GetCombinedTrailer() const {
978   return m_CrossRefTable->trailer()
979              ? ToDictionary(m_CrossRefTable->trailer()->Clone())
980              : RetainPtr<CPDF_Dictionary>();
981 }
982 
GetInfoObjNum() const983 uint32_t CPDF_Parser::GetInfoObjNum() const {
984   RetainPtr<const CPDF_Reference> pRef =
985       ToReference(m_CrossRefTable->trailer()
986                       ? m_CrossRefTable->trailer()->GetObjectFor("Info")
987                       : nullptr);
988   return pRef ? pRef->GetRefObjNum() : CPDF_Object::kInvalidObjNum;
989 }
990 
GetRootObjNum() const991 uint32_t CPDF_Parser::GetRootObjNum() const {
992   RetainPtr<const CPDF_Reference> pRef =
993       ToReference(m_CrossRefTable->trailer()
994                       ? m_CrossRefTable->trailer()->GetObjectFor("Root")
995                       : nullptr);
996   return pRef ? pRef->GetRefObjNum() : CPDF_Object::kInvalidObjNum;
997 }
998 
ParseIndirectObject(uint32_t objnum)999 RetainPtr<CPDF_Object> CPDF_Parser::ParseIndirectObject(uint32_t objnum) {
1000   if (!IsValidObjectNumber(objnum)) {
1001     return nullptr;
1002   }
1003 
1004   // Prevent circular parsing the same object.
1005   if (pdfium::Contains(m_ParsingObjNums, objnum)) {
1006     return nullptr;
1007   }
1008 
1009   ScopedSetInsertion<uint32_t> local_insert(&m_ParsingObjNums, objnum);
1010   const auto* info = m_CrossRefTable->GetObjectInfo(objnum);
1011   if (!info) {
1012     return nullptr;
1013   }
1014 
1015   switch (info->type) {
1016     case ObjectType::kFree: {
1017       return nullptr;
1018     }
1019     case ObjectType::kNormal: {
1020       if (info->pos <= 0) {
1021         return nullptr;
1022       }
1023       return ParseIndirectObjectAt(info->pos, objnum);
1024     }
1025     case ObjectType::kCompressed: {
1026       const auto* obj_stream = GetObjectStream(info->archive.obj_num);
1027       if (!obj_stream) {
1028         return nullptr;
1029       }
1030       return obj_stream->ParseObject(m_pObjectsHolder, objnum,
1031                                      info->archive.obj_index);
1032     }
1033   }
1034 }
1035 
GetObjectStream(uint32_t object_number)1036 const CPDF_ObjectStream* CPDF_Parser::GetObjectStream(uint32_t object_number) {
1037   // Prevent circular parsing the same object.
1038   if (pdfium::Contains(m_ParsingObjNums, object_number))
1039     return nullptr;
1040 
1041   auto it = m_ObjectStreamMap.find(object_number);
1042   if (it != m_ObjectStreamMap.end())
1043     return it->second.get();
1044 
1045   const auto* info = m_CrossRefTable->GetObjectInfo(object_number);
1046   if (!info || !info->is_object_stream_flag) {
1047     return nullptr;
1048   }
1049 
1050   const FX_FILESIZE object_pos = info->pos;
1051   if (object_pos <= 0)
1052     return nullptr;
1053 
1054   // Keep track of `object_number` before doing more parsing.
1055   ScopedSetInsertion<uint32_t> local_insert(&m_ParsingObjNums, object_number);
1056 
1057   RetainPtr<CPDF_Object> object =
1058       ParseIndirectObjectAt(object_pos, object_number);
1059   if (!object)
1060     return nullptr;
1061 
1062   std::unique_ptr<CPDF_ObjectStream> objs_stream =
1063       CPDF_ObjectStream::Create(ToStream(object));
1064   const CPDF_ObjectStream* result = objs_stream.get();
1065   m_ObjectStreamMap[object_number] = std::move(objs_stream);
1066 
1067   return result;
1068 }
1069 
ParseIndirectObjectAt(FX_FILESIZE pos,uint32_t objnum)1070 RetainPtr<CPDF_Object> CPDF_Parser::ParseIndirectObjectAt(FX_FILESIZE pos,
1071                                                           uint32_t objnum) {
1072   const FX_FILESIZE saved_pos = m_pSyntax->GetPos();
1073   m_pSyntax->SetPos(pos);
1074 
1075   auto result = m_pSyntax->GetIndirectObject(
1076       m_pObjectsHolder, CPDF_SyntaxParser::ParseType::kLoose);
1077   m_pSyntax->SetPos(saved_pos);
1078   if (result && objnum && result->GetObjNum() != objnum)
1079     return nullptr;
1080 
1081   const bool should_decrypt = m_pSecurityHandler &&
1082                               m_pSecurityHandler->GetCryptoHandler() &&
1083                               objnum != m_MetadataObjnum;
1084   if (should_decrypt &&
1085       !m_pSecurityHandler->GetCryptoHandler()->DecryptObjectTree(result)) {
1086     return nullptr;
1087   }
1088   return result;
1089 }
1090 
GetDocumentSize() const1091 FX_FILESIZE CPDF_Parser::GetDocumentSize() const {
1092   return m_pSyntax->GetDocumentSize();
1093 }
1094 
GetFirstPageNo() const1095 uint32_t CPDF_Parser::GetFirstPageNo() const {
1096   return m_pLinearized ? m_pLinearized->GetFirstPageNo() : 0;
1097 }
1098 
SetLinearizedHeaderForTesting(std::unique_ptr<CPDF_LinearizedHeader> pLinearized)1099 void CPDF_Parser::SetLinearizedHeaderForTesting(
1100     std::unique_ptr<CPDF_LinearizedHeader> pLinearized) {
1101   m_pLinearized = std::move(pLinearized);
1102 }
1103 
LoadTrailer()1104 RetainPtr<CPDF_Dictionary> CPDF_Parser::LoadTrailer() {
1105   if (m_pSyntax->GetKeyword() != "trailer")
1106     return nullptr;
1107 
1108   return ToDictionary(m_pSyntax->GetObjectBody(m_pObjectsHolder));
1109 }
1110 
GetPermissions(bool get_owner_perms) const1111 uint32_t CPDF_Parser::GetPermissions(bool get_owner_perms) const {
1112   return m_pSecurityHandler
1113              ? m_pSecurityHandler->GetPermissions(get_owner_perms)
1114              : 0xFFFFFFFF;
1115 }
1116 
ParseLinearizedHeader()1117 std::unique_ptr<CPDF_LinearizedHeader> CPDF_Parser::ParseLinearizedHeader() {
1118   return CPDF_LinearizedHeader::Parse(m_pSyntax.get());
1119 }
1120 
StartLinearizedParse(RetainPtr<CPDF_ReadValidator> validator,const ByteString & password)1121 CPDF_Parser::Error CPDF_Parser::StartLinearizedParse(
1122     RetainPtr<CPDF_ReadValidator> validator,
1123     const ByteString& password) {
1124   DCHECK(!m_bHasParsed);
1125   DCHECK(!m_bXRefTableRebuilt);
1126   SetPassword(password);
1127   m_bXRefStream = false;
1128   m_LastXRefOffset = 0;
1129 
1130   if (!InitSyntaxParser(std::move(validator)))
1131     return FORMAT_ERROR;
1132 
1133   m_pLinearized = ParseLinearizedHeader();
1134   if (!m_pLinearized)
1135     return StartParseInternal();
1136 
1137   m_bHasParsed = true;
1138 
1139   m_LastXRefOffset = m_pLinearized->GetLastXRefOffset();
1140   FX_FILESIZE dwFirstXRefOffset = m_LastXRefOffset;
1141   const bool loaded_xref_table =
1142       LoadCrossRefTable(dwFirstXRefOffset, /*skip=*/false);
1143   if (!loaded_xref_table &&
1144       !LoadCrossRefStream(&dwFirstXRefOffset, /*is_main_xref=*/true)) {
1145     if (!RebuildCrossRef())
1146       return FORMAT_ERROR;
1147 
1148     m_bXRefTableRebuilt = true;
1149     m_LastXRefOffset = 0;
1150   }
1151   if (loaded_xref_table) {
1152     RetainPtr<CPDF_Dictionary> trailer = LoadTrailer();
1153     if (!trailer)
1154       return SUCCESS;
1155 
1156     m_CrossRefTable->SetTrailer(std::move(trailer), kNoTrailerObjectNumber);
1157     const int32_t xrefsize = GetTrailer()->GetDirectIntegerFor("Size");
1158     if (xrefsize > 0) {
1159       // Check if `xrefsize` is correct. If it is incorrect, give up and rebuild
1160       // the xref table.
1161       const uint32_t expected_last_obj_num = xrefsize - 1;
1162       if (GetLastObjNum() != expected_last_obj_num && !RebuildCrossRef()) {
1163         return FORMAT_ERROR;
1164       }
1165     }
1166   }
1167 
1168   Error eRet = SetEncryptHandler();
1169   if (eRet != SUCCESS)
1170     return eRet;
1171 
1172   if (!GetRoot() || !m_pObjectsHolder->TryInit()) {
1173     if (m_bXRefTableRebuilt)
1174       return FORMAT_ERROR;
1175 
1176     ReleaseEncryptHandler();
1177     if (!RebuildCrossRef())
1178       return FORMAT_ERROR;
1179 
1180     eRet = SetEncryptHandler();
1181     if (eRet != SUCCESS)
1182       return eRet;
1183 
1184     m_pObjectsHolder->TryInit();
1185     if (!GetRoot())
1186       return FORMAT_ERROR;
1187   }
1188 
1189   if (GetRootObjNum() == CPDF_Object::kInvalidObjNum) {
1190     ReleaseEncryptHandler();
1191     if (!RebuildCrossRef() || GetRootObjNum() == CPDF_Object::kInvalidObjNum)
1192       return FORMAT_ERROR;
1193 
1194     eRet = SetEncryptHandler();
1195     if (eRet != SUCCESS)
1196       return eRet;
1197   }
1198 
1199   if (m_pSecurityHandler && m_pSecurityHandler->IsMetadataEncrypted()) {
1200     RetainPtr<const CPDF_Reference> pMetadata =
1201         ToReference(GetRoot()->GetObjectFor("Metadata"));
1202     if (pMetadata)
1203       m_MetadataObjnum = pMetadata->GetRefObjNum();
1204   }
1205   return SUCCESS;
1206 }
1207 
LoadLinearizedAllCrossRefStream(FX_FILESIZE main_xref_offset)1208 bool CPDF_Parser::LoadLinearizedAllCrossRefStream(
1209     FX_FILESIZE main_xref_offset) {
1210   FX_FILESIZE xref_offset = main_xref_offset;
1211   if (!LoadCrossRefStream(&xref_offset, /*is_main_xref=*/false)) {
1212     return false;
1213   }
1214 
1215   std::set<FX_FILESIZE> seen_xref_offset;
1216   while (xref_offset) {
1217     seen_xref_offset.insert(xref_offset);
1218     if (!LoadCrossRefStream(&xref_offset, /*is_main_xref=*/false)) {
1219       return false;
1220     }
1221 
1222     // Check for circular references.
1223     if (pdfium::Contains(seen_xref_offset, xref_offset))
1224       return false;
1225   }
1226   m_ObjectStreamMap.clear();
1227   m_bXRefStream = true;
1228   return true;
1229 }
1230 
LoadLinearizedMainXRefTable()1231 CPDF_Parser::Error CPDF_Parser::LoadLinearizedMainXRefTable() {
1232   const FX_SAFE_FILESIZE prev = GetTrailer()->GetIntegerFor("Prev");
1233   const FX_FILESIZE main_xref_offset = prev.ValueOrDefault(-1);
1234   if (main_xref_offset < 0)
1235     return FORMAT_ERROR;
1236 
1237   if (main_xref_offset == 0)
1238     return SUCCESS;
1239 
1240   const AutoRestorer<uint32_t> save_metadata_objnum(&m_MetadataObjnum);
1241   m_MetadataObjnum = 0;
1242   m_ObjectStreamMap.clear();
1243 
1244   if (!LoadLinearizedAllCrossRefTable(main_xref_offset) &&
1245       !LoadLinearizedAllCrossRefStream(main_xref_offset)) {
1246     m_LastXRefOffset = 0;
1247     return FORMAT_ERROR;
1248   }
1249 
1250   return SUCCESS;
1251 }
1252 
SetSyntaxParserForTesting(std::unique_ptr<CPDF_SyntaxParser> parser)1253 void CPDF_Parser::SetSyntaxParserForTesting(
1254     std::unique_ptr<CPDF_SyntaxParser> parser) {
1255   m_pSyntax = std::move(parser);
1256 }
1257 
GetTrailerEnds()1258 std::vector<unsigned int> CPDF_Parser::GetTrailerEnds() {
1259   std::vector<unsigned int> trailer_ends;
1260   m_pSyntax->SetTrailerEnds(&trailer_ends);
1261 
1262   // Traverse the document.
1263   m_pSyntax->SetPos(0);
1264   while (true) {
1265     CPDF_SyntaxParser::WordResult word_result = m_pSyntax->GetNextWord();
1266     if (word_result.is_number) {
1267       // The object number was read. Read the generation number.
1268       word_result = m_pSyntax->GetNextWord();
1269       if (!word_result.is_number)
1270         break;
1271 
1272       word_result = m_pSyntax->GetNextWord();
1273       if (word_result.word != "obj")
1274         break;
1275 
1276       m_pSyntax->GetObjectBody(nullptr);
1277 
1278       word_result = m_pSyntax->GetNextWord();
1279       if (word_result.word != "endobj")
1280         break;
1281     } else if (word_result.word == "trailer") {
1282       m_pSyntax->GetObjectBody(nullptr);
1283     } else if (word_result.word == "startxref") {
1284       m_pSyntax->GetNextWord();
1285     } else if (word_result.word == "xref") {
1286       while (true) {
1287         word_result = m_pSyntax->GetNextWord();
1288         if (word_result.word.IsEmpty() || word_result.word == "startxref")
1289           break;
1290       }
1291       m_pSyntax->GetNextWord();
1292     } else {
1293       break;
1294     }
1295   }
1296 
1297   // Stop recording trailer ends.
1298   m_pSyntax->SetTrailerEnds(nullptr);
1299   return trailer_ends;
1300 }
1301 
WriteToArchive(IFX_ArchiveStream * archive,FX_FILESIZE src_size)1302 bool CPDF_Parser::WriteToArchive(IFX_ArchiveStream* archive,
1303                                  FX_FILESIZE src_size) {
1304   static constexpr FX_FILESIZE kBufferSize = 4096;
1305   DataVector<uint8_t> buffer(kBufferSize);
1306   m_pSyntax->SetPos(0);
1307   while (src_size) {
1308     const uint32_t block_size =
1309         static_cast<uint32_t>(std::min(kBufferSize, src_size));
1310     auto block_span = pdfium::make_span(buffer).first(block_size);
1311     if (!m_pSyntax->ReadBlock(block_span))
1312       return false;
1313     if (!archive->WriteBlock(pdfium::make_span(buffer).first(block_size)))
1314       return false;
1315     src_size -= block_size;
1316   }
1317   return true;
1318 }
1319