1 // Copyright 2023 The PDFium Authors
2 // Use of this source code is governed by a BSD-style license that can be
3 // found in the LICENSE file.
4
5 #include "core/fpdfapi/parser/object_tree_traversal_util.h"
6
7 #include <stdint.h>
8
9 #include <map>
10 #include <queue>
11 #include <set>
12 #include <utility>
13 #include <vector>
14
15 #include "core/fpdfapi/parser/cpdf_array.h"
16 #include "core/fpdfapi/parser/cpdf_dictionary.h"
17 #include "core/fpdfapi/parser/cpdf_document.h"
18 #include "core/fpdfapi/parser/cpdf_reference.h"
19 #include "core/fpdfapi/parser/cpdf_stream.h"
20 #include "third_party/base/check.h"
21 #include "third_party/base/containers/contains.h"
22
23 namespace {
24
25 class ObjectTreeTraverser {
26 public:
ObjectTreeTraverser(const CPDF_Document * document)27 explicit ObjectTreeTraverser(const CPDF_Document* document)
28 : document_(document) {
29 const CPDF_Parser* parser = document_->GetParser();
30 const CPDF_Dictionary* trailer = parser ? parser->GetTrailer() : nullptr;
31 const CPDF_Dictionary* root = trailer ? trailer : document_->GetRoot();
32 const uint32_t root_object_number =
33 trailer ? parser->GetTrailerObjectNumber() : root->GetObjNum();
34 // If `root` is a trailer, then it may not have an object number, as many
35 // trailers are inlined.
36 if (root_object_number) {
37 referenced_objects_[root_object_number] = 1;
38 object_number_map_[root] = root_object_number;
39 }
40
41 object_queue_.push(pdfium::WrapRetain(root));
42 seen_objects_.insert(root);
43 }
44 ~ObjectTreeTraverser() = default;
45
Traverse()46 void Traverse() { CalculateReferenceCounts(GetReferenceEntries()); }
47
referenced_objects()48 const std::map<uint32_t, int>& referenced_objects() {
49 return referenced_objects_;
50 }
51
52 private:
53 struct ReferenceEntry {
54 uint32_t ref_object_number;
55 uint32_t referenced_object_number;
56 };
57
GetReferenceEntries()58 std::vector<ReferenceEntry> GetReferenceEntries() {
59 std::vector<ReferenceEntry> reference_entries;
60 while (!object_queue_.empty()) {
61 RetainPtr<const CPDF_Object> current_object = object_queue_.front();
62 object_queue_.pop();
63
64 switch (current_object->GetType()) {
65 case CPDF_Object::kArray: {
66 CPDF_ArrayLocker locker(current_object->AsArray());
67 for (const auto& it : locker) {
68 PushNewObject(current_object, it);
69 }
70 break;
71 }
72 case CPDF_Object::kDictionary: {
73 CPDF_DictionaryLocker locker(current_object->AsDictionary());
74 for (const auto& it : locker) {
75 PushNewObject(current_object, it.second);
76 }
77 break;
78 }
79 case CPDF_Object::kReference: {
80 const CPDF_Reference* ref_object = current_object->AsReference();
81 const uint32_t ref_object_number = GetObjectNumber(ref_object);
82 const uint32_t referenced_object_number = ref_object->GetRefObjNum();
83 CHECK(referenced_object_number);
84
85 RetainPtr<const CPDF_Object> referenced_object;
86 if (ref_object->HasIndirectObjectHolder()) {
87 // Calling GetIndirectObject() does not work for normal references.
88 referenced_object = ref_object->GetDirect();
89 } else {
90 // Calling GetDirect() does not work for references from trailers.
91 referenced_object =
92 document_->GetIndirectObject(referenced_object_number);
93 }
94 // Unlike the other object types, CPDF_Reference can point at nullptr.
95 if (referenced_object) {
96 reference_entries.push_back(
97 {ref_object_number, referenced_object_number});
98 PushNewObject(ref_object, referenced_object);
99 }
100 break;
101 }
102 case CPDF_Object::kStream: {
103 RetainPtr<const CPDF_Dictionary> dict =
104 current_object->AsStream()->GetDict();
105 CHECK(dict->IsInline()); // i.e. No object number.
106 CPDF_DictionaryLocker locker(dict);
107 for (const auto& it : locker) {
108 PushNewObject(current_object, it.second);
109 }
110 break;
111 }
112 default: {
113 break;
114 }
115 }
116 }
117 return reference_entries;
118 }
119
CalculateReferenceCounts(const std::vector<ReferenceEntry> & reference_entries)120 void CalculateReferenceCounts(
121 const std::vector<ReferenceEntry>& reference_entries) {
122 // Tracks PDF objects that referenced other PDF objects, identified by their
123 // object numbers. Never 0.
124 std::set<uint32_t> seen_ref_objects;
125
126 for (const ReferenceEntry& entry : reference_entries) {
127 // Make sure this is not a self-reference.
128 if (entry.referenced_object_number == entry.ref_object_number) {
129 continue;
130 }
131
132 // Make sure this is not a circular reference.
133 if (pdfium::Contains(seen_ref_objects, entry.ref_object_number) &&
134 pdfium::Contains(seen_ref_objects, entry.referenced_object_number)) {
135 continue;
136 }
137
138 ++referenced_objects_[entry.referenced_object_number];
139 if (entry.ref_object_number) {
140 seen_ref_objects.insert(entry.ref_object_number);
141 }
142 }
143 }
144
PushNewObject(const CPDF_Object * parent_object,RetainPtr<const CPDF_Object> child_object)145 void PushNewObject(const CPDF_Object* parent_object,
146 RetainPtr<const CPDF_Object> child_object) {
147 CHECK(parent_object);
148 CHECK(child_object);
149 const bool inserted = seen_objects_.insert(child_object).second;
150 if (!inserted) {
151 return;
152 }
153 const uint32_t child_object_number = child_object->GetObjNum();
154 if (child_object_number) {
155 object_number_map_[child_object] = child_object_number;
156 } else {
157 // This search can fail for inlined trailers.
158 auto it = object_number_map_.find(parent_object);
159 if (it != object_number_map_.end()) {
160 object_number_map_[child_object] = it->second;
161 }
162 }
163 object_queue_.push(std::move(child_object));
164 }
165
166 // Returns 0 if not found.
GetObjectNumber(const CPDF_Object * object) const167 uint32_t GetObjectNumber(const CPDF_Object* object) const {
168 auto it = object_number_map_.find(object);
169 return it != object_number_map_.end() ? it->second : 0;
170 }
171
172 const CPDF_Document* const document_;
173
174 // Queue of objects to traverse.
175 // - Pointers in the queue are non-null.
176 // - The same pointer never enters the queue twice.
177 std::queue<RetainPtr<const CPDF_Object>> object_queue_;
178
179 // Map of objects to "top-level" object numbers. For inline objects, this is
180 // the ancestor object with an object number. The keys are non-null and the
181 // values are never 0.
182 // This is used to prevent self-references, as a single PDF object, with
183 // inlined objects, is represented by multiple CPDF_Objects.
184 std::map<const CPDF_Object*, uint32_t> object_number_map_;
185
186 // Tracks traversed objects to prevent duplicates from getting into
187 // `object_queue_` and `object_number_map_`.
188 std::set<const CPDF_Object*> seen_objects_;
189
190 // Tracks which PDF objects are referenced.
191 // Key: object number
192 // Value: number of times referenced
193 std::map<uint32_t, int> referenced_objects_;
194 };
195
196 } // namespace
197
GetObjectsWithReferences(const CPDF_Document * document)198 std::set<uint32_t> GetObjectsWithReferences(const CPDF_Document* document) {
199 ObjectTreeTraverser traverser(document);
200 traverser.Traverse();
201
202 std::set<uint32_t> results;
203 for (const auto& it : traverser.referenced_objects()) {
204 results.insert(it.first);
205 }
206 return results;
207 }
208
GetObjectsWithMultipleReferences(const CPDF_Document * document)209 std::set<uint32_t> GetObjectsWithMultipleReferences(
210 const CPDF_Document* document) {
211 ObjectTreeTraverser traverser(document);
212 traverser.Traverse();
213
214 std::set<uint32_t> results;
215 for (const auto& it : traverser.referenced_objects()) {
216 if (it.second > 1) {
217 results.insert(it.first);
218 }
219 }
220 return results;
221 }
222