• Home
  • Line#
  • Scopes#
  • Navigate#
  • Raw
  • Download
1 // Copyright 2023 The PDFium Authors
2 // Use of this source code is governed by a BSD-style license that can be
3 // found in the LICENSE file.
4 
5 #include "core/fpdfapi/parser/object_tree_traversal_util.h"
6 
7 #include <stdint.h>
8 
9 #include <map>
10 #include <queue>
11 #include <set>
12 #include <utility>
13 #include <vector>
14 
15 #include "core/fpdfapi/parser/cpdf_array.h"
16 #include "core/fpdfapi/parser/cpdf_dictionary.h"
17 #include "core/fpdfapi/parser/cpdf_document.h"
18 #include "core/fpdfapi/parser/cpdf_reference.h"
19 #include "core/fpdfapi/parser/cpdf_stream.h"
20 #include "third_party/base/check.h"
21 #include "third_party/base/containers/contains.h"
22 
23 namespace {
24 
25 class ObjectTreeTraverser {
26  public:
ObjectTreeTraverser(const CPDF_Document * document)27   explicit ObjectTreeTraverser(const CPDF_Document* document)
28       : document_(document) {
29     const CPDF_Parser* parser = document_->GetParser();
30     const CPDF_Dictionary* trailer = parser ? parser->GetTrailer() : nullptr;
31     const CPDF_Dictionary* root = trailer ? trailer : document_->GetRoot();
32     const uint32_t root_object_number =
33         trailer ? parser->GetTrailerObjectNumber() : root->GetObjNum();
34     // If `root` is a trailer, then it may not have an object number, as many
35     // trailers are inlined.
36     if (root_object_number) {
37       referenced_objects_[root_object_number] = 1;
38       object_number_map_[root] = root_object_number;
39     }
40 
41     object_queue_.push(pdfium::WrapRetain(root));
42     seen_objects_.insert(root);
43   }
44   ~ObjectTreeTraverser() = default;
45 
Traverse()46   void Traverse() { CalculateReferenceCounts(GetReferenceEntries()); }
47 
referenced_objects()48   const std::map<uint32_t, int>& referenced_objects() {
49     return referenced_objects_;
50   }
51 
52  private:
53   struct ReferenceEntry {
54     uint32_t ref_object_number;
55     uint32_t referenced_object_number;
56   };
57 
GetReferenceEntries()58   std::vector<ReferenceEntry> GetReferenceEntries() {
59     std::vector<ReferenceEntry> reference_entries;
60     while (!object_queue_.empty()) {
61       RetainPtr<const CPDF_Object> current_object = object_queue_.front();
62       object_queue_.pop();
63 
64       switch (current_object->GetType()) {
65         case CPDF_Object::kArray: {
66           CPDF_ArrayLocker locker(current_object->AsArray());
67           for (const auto& it : locker) {
68             PushNewObject(current_object, it);
69           }
70           break;
71         }
72         case CPDF_Object::kDictionary: {
73           CPDF_DictionaryLocker locker(current_object->AsDictionary());
74           for (const auto& it : locker) {
75             PushNewObject(current_object, it.second);
76           }
77           break;
78         }
79         case CPDF_Object::kReference: {
80           const CPDF_Reference* ref_object = current_object->AsReference();
81           const uint32_t ref_object_number = GetObjectNumber(ref_object);
82           const uint32_t referenced_object_number = ref_object->GetRefObjNum();
83           CHECK(referenced_object_number);
84 
85           RetainPtr<const CPDF_Object> referenced_object;
86           if (ref_object->HasIndirectObjectHolder()) {
87             // Calling GetIndirectObject() does not work for normal references.
88             referenced_object = ref_object->GetDirect();
89           } else {
90             // Calling GetDirect() does not work for references from trailers.
91             referenced_object =
92                 document_->GetIndirectObject(referenced_object_number);
93           }
94           // Unlike the other object types, CPDF_Reference can point at nullptr.
95           if (referenced_object) {
96             reference_entries.push_back(
97                 {ref_object_number, referenced_object_number});
98             PushNewObject(ref_object, referenced_object);
99           }
100           break;
101         }
102         case CPDF_Object::kStream: {
103           RetainPtr<const CPDF_Dictionary> dict =
104               current_object->AsStream()->GetDict();
105           CHECK(dict->IsInline());  // i.e. No object number.
106           CPDF_DictionaryLocker locker(dict);
107           for (const auto& it : locker) {
108             PushNewObject(current_object, it.second);
109           }
110           break;
111         }
112         default: {
113           break;
114         }
115       }
116     }
117     return reference_entries;
118   }
119 
CalculateReferenceCounts(const std::vector<ReferenceEntry> & reference_entries)120   void CalculateReferenceCounts(
121       const std::vector<ReferenceEntry>& reference_entries) {
122     // Tracks PDF objects that referenced other PDF objects, identified by their
123     // object numbers. Never 0.
124     std::set<uint32_t> seen_ref_objects;
125 
126     for (const ReferenceEntry& entry : reference_entries) {
127       // Make sure this is not a self-reference.
128       if (entry.referenced_object_number == entry.ref_object_number) {
129         continue;
130       }
131 
132       // Make sure this is not a circular reference.
133       if (pdfium::Contains(seen_ref_objects, entry.ref_object_number) &&
134           pdfium::Contains(seen_ref_objects, entry.referenced_object_number)) {
135         continue;
136       }
137 
138       ++referenced_objects_[entry.referenced_object_number];
139       if (entry.ref_object_number) {
140         seen_ref_objects.insert(entry.ref_object_number);
141       }
142     }
143   }
144 
PushNewObject(const CPDF_Object * parent_object,RetainPtr<const CPDF_Object> child_object)145   void PushNewObject(const CPDF_Object* parent_object,
146                      RetainPtr<const CPDF_Object> child_object) {
147     CHECK(parent_object);
148     CHECK(child_object);
149     const bool inserted = seen_objects_.insert(child_object).second;
150     if (!inserted) {
151       return;
152     }
153     const uint32_t child_object_number = child_object->GetObjNum();
154     if (child_object_number) {
155       object_number_map_[child_object] = child_object_number;
156     } else {
157       // This search can fail for inlined trailers.
158       auto it = object_number_map_.find(parent_object);
159       if (it != object_number_map_.end()) {
160         object_number_map_[child_object] = it->second;
161       }
162     }
163     object_queue_.push(std::move(child_object));
164   }
165 
166   // Returns 0 if not found.
GetObjectNumber(const CPDF_Object * object) const167   uint32_t GetObjectNumber(const CPDF_Object* object) const {
168     auto it = object_number_map_.find(object);
169     return it != object_number_map_.end() ? it->second : 0;
170   }
171 
172   const CPDF_Document* const document_;
173 
174   // Queue of objects to traverse.
175   // - Pointers in the queue are non-null.
176   // - The same pointer never enters the queue twice.
177   std::queue<RetainPtr<const CPDF_Object>> object_queue_;
178 
179   // Map of objects to "top-level" object numbers. For inline objects, this is
180   // the ancestor object with an object number. The keys are non-null and the
181   // values are never 0.
182   // This is used to prevent self-references, as a single PDF object, with
183   // inlined objects, is represented by multiple CPDF_Objects.
184   std::map<const CPDF_Object*, uint32_t> object_number_map_;
185 
186   // Tracks traversed objects to prevent duplicates from getting into
187   // `object_queue_` and `object_number_map_`.
188   std::set<const CPDF_Object*> seen_objects_;
189 
190   // Tracks which PDF objects are referenced.
191   // Key: object number
192   // Value: number of times referenced
193   std::map<uint32_t, int> referenced_objects_;
194 };
195 
196 }  // namespace
197 
GetObjectsWithReferences(const CPDF_Document * document)198 std::set<uint32_t> GetObjectsWithReferences(const CPDF_Document* document) {
199   ObjectTreeTraverser traverser(document);
200   traverser.Traverse();
201 
202   std::set<uint32_t> results;
203   for (const auto& it : traverser.referenced_objects()) {
204     results.insert(it.first);
205   }
206   return results;
207 }
208 
GetObjectsWithMultipleReferences(const CPDF_Document * document)209 std::set<uint32_t> GetObjectsWithMultipleReferences(
210     const CPDF_Document* document) {
211   ObjectTreeTraverser traverser(document);
212   traverser.Traverse();
213 
214   std::set<uint32_t> results;
215   for (const auto& it : traverser.referenced_objects()) {
216     if (it.second > 1) {
217       results.insert(it.first);
218     }
219   }
220   return results;
221 }
222