1 /*
2 * Copyright 2018 Google Inc.
3 *
4 * Use of this source code is governed by a BSD-style license that can be
5 * found in the LICENSE file.
6 */
7
8 #include "src/pdf/SkPDFDocumentPriv.h"
9 #include "src/pdf/SkPDFTag.h"
10
// The struct parent tree holds one entry per page, followed by one entry
// for each struct tree node that corresponds to an annotation. Every
// entry is a key/value pair: an integer key mapped to an indirect
// reference value.
//
// Page entries use consecutive keys starting at 0. Annotations are
// processed before the total number of pages in the document is known,
// so annotation keys start at a large fixed number instead. That number
// is therefore effectively the maximum number of pages in a PDF we can
// handle.
static constexpr int kFirstAnnotationStructParentKey = 100000;
22
23 struct SkPDFTagNode {
24 // Structure element nodes need a unique alphanumeric ID,
25 // and we need to be able to output them sorted in lexicographic
26 // order. This helper function takes one of our node IDs and
27 // builds an ID string that zero-pads the digits so that lexicographic
28 // order matches numeric order.
nodeIdToStringSkPDFTagNode29 static SkString nodeIdToString(int nodeId) {
30 SkString idString;
31 idString.printf("node%08d", nodeId);
32 return idString;
33 }
34
35 SkPDFTagNode* fChildren = nullptr;
36 size_t fChildCount = 0;
37 struct MarkedContentInfo {
38 unsigned fPageIndex;
39 int fMarkId;
40 };
41 SkTArray<MarkedContentInfo> fMarkedContent;
42 int fNodeId;
43 SkString fTypeString;
44 SkString fAlt;
45 SkString fLang;
46 SkPDFIndirectReference fRef;
47 enum State {
48 kUnknown,
49 kYes,
50 kNo,
51 } fCanDiscard = kUnknown;
52 std::unique_ptr<SkPDFArray> fAttributes;
53 struct AnnotationInfo {
54 unsigned fPageIndex;
55 SkPDFIndirectReference fAnnotationRef;
56 };
57 std::vector<AnnotationInfo> fAnnotations;
58 };
59
// The constructor and destructor are defaulted here, out of line, rather
// than in the class definition.
// NOTE(review): presumably so that the header only needs a forward
// declaration of the attribute storage type -- confirm against the header.
SkPDF::AttributeList::AttributeList() = default;

SkPDF::AttributeList::~AttributeList() = default;
63
appendInt(const char * owner,const char * name,int value)64 void SkPDF::AttributeList::appendInt(
65 const char* owner, const char* name, int value) {
66 if (!fAttrs)
67 fAttrs = SkPDFMakeArray();
68 std::unique_ptr<SkPDFDict> attrDict = SkPDFMakeDict();
69 attrDict->insertName("O", owner);
70 attrDict->insertInt(name, value);
71 fAttrs->appendObject(std::move(attrDict));
72 }
73
appendFloat(const char * owner,const char * name,float value)74 void SkPDF::AttributeList::appendFloat(
75 const char* owner, const char* name, float value) {
76 if (!fAttrs)
77 fAttrs = SkPDFMakeArray();
78 std::unique_ptr<SkPDFDict> attrDict = SkPDFMakeDict();
79 attrDict->insertName("O", owner);
80 attrDict->insertScalar(name, value);
81 fAttrs->appendObject(std::move(attrDict));
82 }
83
appendName(const char * owner,const char * name,const char * value)84 void SkPDF::AttributeList::appendName(
85 const char* owner, const char* name, const char* value) {
86 if (!fAttrs)
87 fAttrs = SkPDFMakeArray();
88 std::unique_ptr<SkPDFDict> attrDict = SkPDFMakeDict();
89 attrDict->insertName("O", owner);
90 attrDict->insertName(name, value);
91 fAttrs->appendObject(std::move(attrDict));
92 }
93
appendFloatArray(const char * owner,const char * name,const std::vector<float> & value)94 void SkPDF::AttributeList::appendFloatArray(
95 const char* owner, const char* name, const std::vector<float>& value) {
96 if (!fAttrs)
97 fAttrs = SkPDFMakeArray();
98 std::unique_ptr<SkPDFDict> attrDict = SkPDFMakeDict();
99 attrDict->insertName("O", owner);
100 std::unique_ptr<SkPDFArray> pdfArray = SkPDFMakeArray();
101 for (float element : value) {
102 pdfArray->appendScalar(element);
103 }
104 attrDict->insertObject(name, std::move(pdfArray));
105 fAttrs->appendObject(std::move(attrDict));
106 }
107
appendNodeIdArray(const char * owner,const char * name,const std::vector<int> & nodeIds)108 void SkPDF::AttributeList::appendNodeIdArray(
109 const char* owner,
110 const char* name,
111 const std::vector<int>& nodeIds) {
112 if (!fAttrs)
113 fAttrs = SkPDFMakeArray();
114 std::unique_ptr<SkPDFDict> attrDict = SkPDFMakeDict();
115 attrDict->insertName("O", owner);
116 std::unique_ptr<SkPDFArray> pdfArray = SkPDFMakeArray();
117 for (int nodeId : nodeIds) {
118 SkString idString = SkPDFTagNode::nodeIdToString(nodeId);
119 pdfArray->appendByteString(idString);
120 }
121 attrDict->insertObject(name, std::move(pdfArray));
122 fAttrs->appendObject(std::move(attrDict));
123 }
124
// Constructs an empty tag tree. The arena is constructed with an argument
// equal to the size of four SkPDFTagNode objects.
// NOTE(review): presumably this is the arena's initial block size -- confirm
// against SkArenaAlloc.
SkPDFTagTree::SkPDFTagTree() : fArena(4 * sizeof(SkPDFTagNode)) {}

SkPDFTagTree::~SkPDFTagTree() = default;
128
129 // static
Copy(SkPDF::StructureElementNode & node,SkPDFTagNode * dst,SkArenaAlloc * arena,SkTHashMap<int,SkPDFTagNode * > * nodeMap)130 void SkPDFTagTree::Copy(SkPDF::StructureElementNode& node,
131 SkPDFTagNode* dst,
132 SkArenaAlloc* arena,
133 SkTHashMap<int, SkPDFTagNode*>* nodeMap) {
134 nodeMap->set(node.fNodeId, dst);
135 for (int nodeId : node.fAdditionalNodeIds) {
136 SkASSERT(!nodeMap->find(nodeId));
137 nodeMap->set(nodeId, dst);
138 }
139 dst->fNodeId = node.fNodeId;
140 dst->fTypeString = node.fTypeString;
141 dst->fAlt = node.fAlt;
142 dst->fLang = node.fLang;
143
144 size_t childCount = node.fChildVector.size();
145 SkPDFTagNode* children = arena->makeArray<SkPDFTagNode>(childCount);
146 dst->fChildCount = childCount;
147 dst->fChildren = children;
148 for (size_t i = 0; i < childCount; ++i) {
149 Copy(*node.fChildVector[i], &children[i], arena, nodeMap);
150 }
151
152 dst->fAttributes = std::move(node.fAttributes.fAttrs);
153 }
154
init(SkPDF::StructureElementNode * node)155 void SkPDFTagTree::init(SkPDF::StructureElementNode* node) {
156 if (node) {
157 fRoot = fArena.make<SkPDFTagNode>();
158 Copy(*node, fRoot, &fArena, &fNodeMap);
159 }
160 }
161
createMarkIdForNodeId(int nodeId,unsigned pageIndex)162 int SkPDFTagTree::createMarkIdForNodeId(int nodeId, unsigned pageIndex) {
163 if (!fRoot) {
164 return -1;
165 }
166 SkPDFTagNode** tagPtr = fNodeMap.find(nodeId);
167 if (!tagPtr) {
168 return -1;
169 }
170 SkPDFTagNode* tag = *tagPtr;
171 SkASSERT(tag);
172 while (SkToUInt(fMarksPerPage.size()) < pageIndex + 1) {
173 fMarksPerPage.push_back();
174 }
175 SkTArray<SkPDFTagNode*>& pageMarks = fMarksPerPage[pageIndex];
176 int markId = pageMarks.size();
177 tag->fMarkedContent.push_back({pageIndex, markId});
178 pageMarks.push_back(tag);
179 return markId;
180 }
181
createStructParentKeyForNodeId(int nodeId,unsigned pageIndex)182 int SkPDFTagTree::createStructParentKeyForNodeId(int nodeId, unsigned pageIndex) {
183 if (!fRoot) {
184 return -1;
185 }
186 SkPDFTagNode** tagPtr = fNodeMap.find(nodeId);
187 if (!tagPtr) {
188 return -1;
189 }
190 SkPDFTagNode* tag = *tagPtr;
191 SkASSERT(tag);
192
193 tag->fCanDiscard = SkPDFTagNode::kNo;
194
195 int nextStructParentKey = kFirstAnnotationStructParentKey +
196 static_cast<int>(fParentTreeAnnotationNodeIds.size());
197 fParentTreeAnnotationNodeIds.push_back(nodeId);
198 return nextStructParentKey;
199 }
200
can_discard(SkPDFTagNode * node)201 static bool can_discard(SkPDFTagNode* node) {
202 if (node->fCanDiscard == SkPDFTagNode::kYes) {
203 return true;
204 }
205 if (node->fCanDiscard == SkPDFTagNode::kNo) {
206 return false;
207 }
208 if (!node->fMarkedContent.empty()) {
209 node->fCanDiscard = SkPDFTagNode::kNo;
210 return false;
211 }
212 for (size_t i = 0; i < node->fChildCount; ++i) {
213 if (!can_discard(&node->fChildren[i])) {
214 node->fCanDiscard = SkPDFTagNode::kNo;
215 return false;
216 }
217 }
218 node->fCanDiscard = SkPDFTagNode::kYes;
219 return true;
220 }
221
PrepareTagTreeToEmit(SkPDFIndirectReference parent,SkPDFTagNode * node,SkPDFDocument * doc)222 SkPDFIndirectReference SkPDFTagTree::PrepareTagTreeToEmit(SkPDFIndirectReference parent,
223 SkPDFTagNode* node,
224 SkPDFDocument* doc) {
225 SkPDFIndirectReference ref = doc->reserveRef();
226 std::unique_ptr<SkPDFArray> kids = SkPDFMakeArray();
227 SkPDFTagNode* children = node->fChildren;
228 size_t childCount = node->fChildCount;
229 for (size_t i = 0; i < childCount; ++i) {
230 SkPDFTagNode* child = &children[i];
231 if (!(can_discard(child))) {
232 kids->appendRef(PrepareTagTreeToEmit(ref, child, doc));
233 }
234 }
235 for (const SkPDFTagNode::MarkedContentInfo& info : node->fMarkedContent) {
236 std::unique_ptr<SkPDFDict> mcr = SkPDFMakeDict("MCR");
237 mcr->insertRef("Pg", doc->getPage(info.fPageIndex));
238 mcr->insertInt("MCID", info.fMarkId);
239 kids->appendObject(std::move(mcr));
240 }
241 for (const SkPDFTagNode::AnnotationInfo& annotationInfo : node->fAnnotations) {
242 std::unique_ptr<SkPDFDict> annotationDict = SkPDFMakeDict("OBJR");
243 annotationDict->insertRef("Obj", annotationInfo.fAnnotationRef);
244 annotationDict->insertRef("Pg", doc->getPage(annotationInfo.fPageIndex));
245 kids->appendObject(std::move(annotationDict));
246 }
247 node->fRef = ref;
248 SkPDFDict dict("StructElem");
249 dict.insertName("S", node->fTypeString.isEmpty() ? "NonStruct" : node->fTypeString.c_str());
250 if (!node->fAlt.isEmpty()) {
251 dict.insertTextString("Alt", node->fAlt);
252 }
253 if (!node->fLang.isEmpty()) {
254 dict.insertTextString("Lang", node->fLang);
255 }
256 dict.insertRef("P", parent);
257 dict.insertObject("K", std::move(kids));
258 if (node->fAttributes) {
259 dict.insertObject("A", std::move(node->fAttributes));
260 }
261
262 // Each node has a unique ID that also needs to be referenced
263 // in a separate IDTree node, along with the lowest and highest
264 // unique ID string.
265 SkString idString = SkPDFTagNode::nodeIdToString(node->fNodeId);
266 dict.insertByteString("ID", idString.c_str());
267 IDTreeEntry idTreeEntry = {node->fNodeId, ref};
268 fIdTreeEntries.push_back(idTreeEntry);
269
270 return doc->emit(dict, ref);
271 }
272
addNodeAnnotation(int nodeId,SkPDFIndirectReference annotationRef,unsigned pageIndex)273 void SkPDFTagTree::addNodeAnnotation(int nodeId, SkPDFIndirectReference annotationRef, unsigned pageIndex) {
274 if (!fRoot) {
275 return;
276 }
277 SkPDFTagNode** tagPtr = fNodeMap.find(nodeId);
278 if (!tagPtr) {
279 return;
280 }
281 SkPDFTagNode* tag = *tagPtr;
282 SkASSERT(tag);
283
284 SkPDFTagNode::AnnotationInfo annotationInfo = {pageIndex, annotationRef};
285 tag->fAnnotations.push_back(annotationInfo);
286 }
287
makeStructTreeRoot(SkPDFDocument * doc)288 SkPDFIndirectReference SkPDFTagTree::makeStructTreeRoot(SkPDFDocument* doc) {
289 if (!fRoot || can_discard(fRoot)) {
290 return SkPDFIndirectReference();
291 }
292
293 SkPDFIndirectReference ref = doc->reserveRef();
294
295 unsigned pageCount = SkToUInt(doc->pageCount());
296
297 // Build the StructTreeRoot.
298 SkPDFDict structTreeRoot("StructTreeRoot");
299 structTreeRoot.insertRef("K", PrepareTagTreeToEmit(ref, fRoot, doc));
300 structTreeRoot.insertInt("ParentTreeNextKey", SkToInt(pageCount));
301
302 // Build the parent tree, which consists of two things:
303 // (1) For each page, a mapping from the marked content IDs on
304 // each page to their corresponding tags
305 // (2) For each annotation, an indirect reference to that
306 // annotation's struct tree element.
307 SkPDFDict parentTree("ParentTree");
308 auto parentTreeNums = SkPDFMakeArray();
309
310 // First, one entry per page.
311 SkASSERT(SkToUInt(fMarksPerPage.size()) <= pageCount);
312 for (int j = 0; j < fMarksPerPage.size(); ++j) {
313 const SkTArray<SkPDFTagNode*>& pageMarks = fMarksPerPage[j];
314 SkPDFArray markToTagArray;
315 for (SkPDFTagNode* mark : pageMarks) {
316 SkASSERT(mark->fRef);
317 markToTagArray.appendRef(mark->fRef);
318 }
319 parentTreeNums->appendInt(j);
320 parentTreeNums->appendRef(doc->emit(markToTagArray));
321 }
322
323 // Then, one entry per annotation.
324 for (size_t j = 0; j < fParentTreeAnnotationNodeIds.size(); ++j) {
325 int nodeId = fParentTreeAnnotationNodeIds[j];
326 int structParentKey = kFirstAnnotationStructParentKey + static_cast<int>(j);
327
328 SkPDFTagNode** tagPtr = fNodeMap.find(nodeId);
329 if (!tagPtr) {
330 continue;
331 }
332 SkPDFTagNode* tag = *tagPtr;
333 parentTreeNums->appendInt(structParentKey);
334 parentTreeNums->appendRef(tag->fRef);
335 }
336
337 parentTree.insertObject("Nums", std::move(parentTreeNums));
338 structTreeRoot.insertRef("ParentTree", doc->emit(parentTree));
339
340 // Build the IDTree, a mapping from every unique ID string to
341 // a reference to its corresponding structure element node.
342 if (!fIdTreeEntries.empty()) {
343 std::sort(fIdTreeEntries.begin(), fIdTreeEntries.end(),
344 [](const IDTreeEntry& a, const IDTreeEntry& b) {
345 return a.nodeId < b.nodeId;
346 });
347
348 SkPDFDict idTree;
349 SkPDFDict idTreeLeaf;
350 auto limits = SkPDFMakeArray();
351 SkString lowestNodeIdString = SkPDFTagNode::nodeIdToString(
352 fIdTreeEntries.begin()->nodeId);
353 limits->appendByteString(lowestNodeIdString);
354 SkString highestNodeIdString = SkPDFTagNode::nodeIdToString(
355 fIdTreeEntries.rbegin()->nodeId);
356 limits->appendByteString(highestNodeIdString);
357 idTreeLeaf.insertObject("Limits", std::move(limits));
358 auto names = SkPDFMakeArray();
359 for (const IDTreeEntry& entry : fIdTreeEntries) {
360 SkString idString = SkPDFTagNode::nodeIdToString(entry.nodeId);
361 names->appendByteString(idString);
362 names->appendRef(entry.ref);
363 }
364 idTreeLeaf.insertObject("Names", std::move(names));
365 auto idTreeKids = SkPDFMakeArray();
366 idTreeKids->appendRef(doc->emit(idTreeLeaf));
367 idTree.insertObject("Kids", std::move(idTreeKids));
368 structTreeRoot.insertRef("IDTree", doc->emit(idTree));
369 }
370
371 return doc->emit(structTreeRoot, ref);
372 }
373