1 /*
2 * Copyright 2018 Google Inc.
3 *
4 * Use of this source code is governed by a BSD-style license that can be
5 * found in the LICENSE file.
6 */
7
8 #include "src/pdf/SkPDFDocumentPriv.h"
9 #include "src/pdf/SkPDFTag.h"
10
// The struct parent tree consists of one entry per page, followed by
// entries for individual struct tree nodes corresponding to
// annotations. Each entry is a key/value pair with an integer key
// and an indirect reference value.
//
// The page entries get consecutive keys starting at 0. Since we don't
// know the total number of pages in the document at the time we start
// processing annotations, start the key for annotations with a large
// number, which effectively becomes the maximum number of pages in a
// PDF we can handle.
// First parent-tree key handed out to annotation entries; per-page entries
// use the keys below it, counting up from 0.
constexpr int kFirstAnnotationStructParentKey = 100000;
22
23 struct SkPDFTagNode {
24 // Structure element nodes need a unique alphanumeric ID,
25 // and we need to be able to output them sorted in lexicographic
26 // order. This helper function takes one of our node IDs and
27 // builds an ID string that zero-pads the digits so that lexicographic
28 // order matches numeric order.
nodeIdToStringSkPDFTagNode29 static SkString nodeIdToString(int nodeId) {
30 SkString idString;
31 idString.printf("node%08d", nodeId);
32 return idString;
33 }
34
35 SkPDFTagNode* fChildren = nullptr;
36 size_t fChildCount = 0;
37 struct MarkedContentInfo {
38 unsigned fPageIndex;
39 int fMarkId;
40 };
41 SkTArray<MarkedContentInfo> fMarkedContent;
42 int fNodeId;
43 SkString fTypeString;
44 SkString fAlt;
45 SkString fLang;
46 SkPDFIndirectReference fRef;
47 enum State {
48 kUnknown,
49 kYes,
50 kNo,
51 } fCanDiscard = kUnknown;
52 std::unique_ptr<SkPDFArray> fAttributes;
53 struct AnnotationInfo {
54 unsigned fPageIndex;
55 SkPDFIndirectReference fAnnotationRef;
56 };
57 std::vector<AnnotationInfo> fAnnotations;
58 };
59
60 SkPDF::AttributeList::AttributeList() = default;
61
62 SkPDF::AttributeList::~AttributeList() = default;
63
appendInt(const char * owner,const char * name,int value)64 void SkPDF::AttributeList::appendInt(
65 const char* owner, const char* name, int value) {
66 if (!fAttrs)
67 fAttrs = SkPDFMakeArray();
68 std::unique_ptr<SkPDFDict> attrDict = SkPDFMakeDict();
69 attrDict->insertName("O", owner);
70 attrDict->insertInt(name, value);
71 fAttrs->appendObject(std::move(attrDict));
72 }
73
appendFloat(const char * owner,const char * name,float value)74 void SkPDF::AttributeList::appendFloat(
75 const char* owner, const char* name, float value) {
76 if (!fAttrs)
77 fAttrs = SkPDFMakeArray();
78 std::unique_ptr<SkPDFDict> attrDict = SkPDFMakeDict();
79 attrDict->insertName("O", owner);
80 attrDict->insertScalar(name, value);
81 fAttrs->appendObject(std::move(attrDict));
82 }
83
appendName(const char * owner,const char * name,const char * value)84 void SkPDF::AttributeList::appendName(
85 const char* owner, const char* name, const char* value) {
86 if (!fAttrs)
87 fAttrs = SkPDFMakeArray();
88 std::unique_ptr<SkPDFDict> attrDict = SkPDFMakeDict();
89 attrDict->insertName("O", owner);
90 attrDict->insertName(name, value);
91 fAttrs->appendObject(std::move(attrDict));
92 }
93
appendString(const char * owner,const char * name,const char * value)94 void SkPDF::AttributeList::appendString(
95 const char* owner, const char* name, const char* value) {
96 if (!fAttrs)
97 fAttrs = SkPDFMakeArray();
98 std::unique_ptr<SkPDFDict> attrDict = SkPDFMakeDict();
99 attrDict->insertName("O", owner);
100 attrDict->insertString(name, value);
101 fAttrs->appendObject(std::move(attrDict));
102 }
103
appendFloatArray(const char * owner,const char * name,const std::vector<float> & value)104 void SkPDF::AttributeList::appendFloatArray(
105 const char* owner, const char* name, const std::vector<float>& value) {
106 if (!fAttrs)
107 fAttrs = SkPDFMakeArray();
108 std::unique_ptr<SkPDFDict> attrDict = SkPDFMakeDict();
109 attrDict->insertName("O", owner);
110 std::unique_ptr<SkPDFArray> pdfArray = SkPDFMakeArray();
111 for (float element : value) {
112 pdfArray->appendScalar(element);
113 }
114 attrDict->insertObject(name, std::move(pdfArray));
115 fAttrs->appendObject(std::move(attrDict));
116 }
117
118 // Deprecated.
appendStringArray(const char * owner,const char * name,const std::vector<SkString> & values)119 void SkPDF::AttributeList::appendStringArray(
120 const char* owner,
121 const char* name,
122 const std::vector<SkString>& values) {
123 if (!fAttrs)
124 fAttrs = SkPDFMakeArray();
125 std::unique_ptr<SkPDFDict> attrDict = SkPDFMakeDict();
126 attrDict->insertName("O", owner);
127 std::unique_ptr<SkPDFArray> pdfArray = SkPDFMakeArray();
128 for (const SkString& element : values) {
129 pdfArray->appendString(element);
130 }
131 attrDict->insertObject(name, std::move(pdfArray));
132 fAttrs->appendObject(std::move(attrDict));
133 }
134
135
appendNodeIdArray(const char * owner,const char * name,const std::vector<int> & nodeIds)136 void SkPDF::AttributeList::appendNodeIdArray(
137 const char* owner,
138 const char* name,
139 const std::vector<int>& nodeIds) {
140 if (!fAttrs)
141 fAttrs = SkPDFMakeArray();
142 std::unique_ptr<SkPDFDict> attrDict = SkPDFMakeDict();
143 attrDict->insertName("O", owner);
144 std::unique_ptr<SkPDFArray> pdfArray = SkPDFMakeArray();
145 for (int nodeId : nodeIds) {
146 SkString idString = SkPDFTagNode::nodeIdToString(nodeId);
147 pdfArray->appendString(idString);
148 }
149 attrDict->insertObject(name, std::move(pdfArray));
150 fAttrs->appendObject(std::move(attrDict));
151 }
152
SkPDFTagTree()153 SkPDFTagTree::SkPDFTagTree() : fArena(4 * sizeof(SkPDFTagNode)) {}
154
155 SkPDFTagTree::~SkPDFTagTree() = default;
156
157 // static
Copy(SkPDF::StructureElementNode & node,SkPDFTagNode * dst,SkArenaAlloc * arena,SkTHashMap<int,SkPDFTagNode * > * nodeMap)158 void SkPDFTagTree::Copy(SkPDF::StructureElementNode& node,
159 SkPDFTagNode* dst,
160 SkArenaAlloc* arena,
161 SkTHashMap<int, SkPDFTagNode*>* nodeMap) {
162 nodeMap->set(node.fNodeId, dst);
163 for (int nodeId : node.fAdditionalNodeIds) {
164 SkASSERT(!nodeMap->find(nodeId));
165 nodeMap->set(nodeId, dst);
166 }
167 dst->fNodeId = node.fNodeId;
168 dst->fTypeString = node.fTypeString;
169 dst->fAlt = node.fAlt;
170 dst->fLang = node.fLang;
171
172 size_t childCount = node.fChildVector.size();
173 SkPDFTagNode* children = arena->makeArray<SkPDFTagNode>(childCount);
174 dst->fChildCount = childCount;
175 dst->fChildren = children;
176 for (size_t i = 0; i < childCount; ++i) {
177 Copy(*node.fChildVector[i], &children[i], arena, nodeMap);
178 }
179
180 dst->fAttributes = std::move(node.fAttributes.fAttrs);
181 }
182
init(SkPDF::StructureElementNode * node)183 void SkPDFTagTree::init(SkPDF::StructureElementNode* node) {
184 if (node) {
185 fRoot = fArena.make<SkPDFTagNode>();
186 Copy(*node, fRoot, &fArena, &fNodeMap);
187 }
188 }
189
createMarkIdForNodeId(int nodeId,unsigned pageIndex)190 int SkPDFTagTree::createMarkIdForNodeId(int nodeId, unsigned pageIndex) {
191 if (!fRoot) {
192 return -1;
193 }
194 SkPDFTagNode** tagPtr = fNodeMap.find(nodeId);
195 if (!tagPtr) {
196 return -1;
197 }
198 SkPDFTagNode* tag = *tagPtr;
199 SkASSERT(tag);
200 while (fMarksPerPage.size() < pageIndex + 1) {
201 fMarksPerPage.push_back();
202 }
203 SkTArray<SkPDFTagNode*>& pageMarks = fMarksPerPage[pageIndex];
204 int markId = pageMarks.count();
205 tag->fMarkedContent.push_back({pageIndex, markId});
206 pageMarks.push_back(tag);
207 return markId;
208 }
209
createStructParentKeyForNodeId(int nodeId,unsigned pageIndex)210 int SkPDFTagTree::createStructParentKeyForNodeId(int nodeId, unsigned pageIndex) {
211 if (!fRoot) {
212 return -1;
213 }
214 SkPDFTagNode** tagPtr = fNodeMap.find(nodeId);
215 if (!tagPtr) {
216 return -1;
217 }
218 SkPDFTagNode* tag = *tagPtr;
219 SkASSERT(tag);
220
221 tag->fCanDiscard = SkPDFTagNode::kNo;
222
223 int nextStructParentKey = kFirstAnnotationStructParentKey +
224 static_cast<int>(fParentTreeAnnotationNodeIds.size());
225 fParentTreeAnnotationNodeIds.push_back(nodeId);
226 return nextStructParentKey;
227 }
228
can_discard(SkPDFTagNode * node)229 static bool can_discard(SkPDFTagNode* node) {
230 if (node->fCanDiscard == SkPDFTagNode::kYes) {
231 return true;
232 }
233 if (node->fCanDiscard == SkPDFTagNode::kNo) {
234 return false;
235 }
236 if (!node->fMarkedContent.empty()) {
237 node->fCanDiscard = SkPDFTagNode::kNo;
238 return false;
239 }
240 for (size_t i = 0; i < node->fChildCount; ++i) {
241 if (!can_discard(&node->fChildren[i])) {
242 node->fCanDiscard = SkPDFTagNode::kNo;
243 return false;
244 }
245 }
246 node->fCanDiscard = SkPDFTagNode::kYes;
247 return true;
248 }
249
PrepareTagTreeToEmit(SkPDFIndirectReference parent,SkPDFTagNode * node,SkPDFDocument * doc)250 SkPDFIndirectReference SkPDFTagTree::PrepareTagTreeToEmit(SkPDFIndirectReference parent,
251 SkPDFTagNode* node,
252 SkPDFDocument* doc) {
253 SkPDFIndirectReference ref = doc->reserveRef();
254 std::unique_ptr<SkPDFArray> kids = SkPDFMakeArray();
255 SkPDFTagNode* children = node->fChildren;
256 size_t childCount = node->fChildCount;
257 for (size_t i = 0; i < childCount; ++i) {
258 SkPDFTagNode* child = &children[i];
259 if (!(can_discard(child))) {
260 kids->appendRef(PrepareTagTreeToEmit(ref, child, doc));
261 }
262 }
263 for (const SkPDFTagNode::MarkedContentInfo& info : node->fMarkedContent) {
264 std::unique_ptr<SkPDFDict> mcr = SkPDFMakeDict("MCR");
265 mcr->insertRef("Pg", doc->getPage(info.fPageIndex));
266 mcr->insertInt("MCID", info.fMarkId);
267 kids->appendObject(std::move(mcr));
268 }
269 for (const SkPDFTagNode::AnnotationInfo& annotationInfo : node->fAnnotations) {
270 std::unique_ptr<SkPDFDict> annotationDict = SkPDFMakeDict("OBJR");
271 annotationDict->insertRef("Obj", annotationInfo.fAnnotationRef);
272 annotationDict->insertRef("Pg", doc->getPage(annotationInfo.fPageIndex));
273 kids->appendObject(std::move(annotationDict));
274 }
275 node->fRef = ref;
276 SkPDFDict dict("StructElem");
277 dict.insertName("S", node->fTypeString.isEmpty() ? "NonStruct" : node->fTypeString.c_str());
278 if (!node->fAlt.isEmpty()) {
279 dict.insertString("Alt", node->fAlt);
280 }
281 if (!node->fLang.isEmpty()) {
282 dict.insertString("Lang", node->fLang);
283 }
284 dict.insertRef("P", parent);
285 dict.insertObject("K", std::move(kids));
286 if (node->fAttributes) {
287 dict.insertObject("A", std::move(node->fAttributes));
288 }
289
290 // Each node has a unique ID that also needs to be referenced
291 // in a separate IDTree node, along with the lowest and highest
292 // unique ID string.
293 SkString idString = SkPDFTagNode::nodeIdToString(node->fNodeId);
294 dict.insertString("ID", idString.c_str());
295 IDTreeEntry idTreeEntry = {node->fNodeId, ref};
296 fIdTreeEntries.push_back(idTreeEntry);
297
298 return doc->emit(dict, ref);
299 }
300
addNodeAnnotation(int nodeId,SkPDFIndirectReference annotationRef,unsigned pageIndex)301 void SkPDFTagTree::addNodeAnnotation(int nodeId, SkPDFIndirectReference annotationRef, unsigned pageIndex) {
302 if (!fRoot) {
303 return;
304 }
305 SkPDFTagNode** tagPtr = fNodeMap.find(nodeId);
306 if (!tagPtr) {
307 return;
308 }
309 SkPDFTagNode* tag = *tagPtr;
310 SkASSERT(tag);
311
312 SkPDFTagNode::AnnotationInfo annotationInfo = {pageIndex, annotationRef};
313 tag->fAnnotations.push_back(annotationInfo);
314 }
315
makeStructTreeRoot(SkPDFDocument * doc)316 SkPDFIndirectReference SkPDFTagTree::makeStructTreeRoot(SkPDFDocument* doc) {
317 if (!fRoot || can_discard(fRoot)) {
318 return SkPDFIndirectReference();
319 }
320
321 SkPDFIndirectReference ref = doc->reserveRef();
322
323 unsigned pageCount = SkToUInt(doc->pageCount());
324
325 // Build the StructTreeRoot.
326 SkPDFDict structTreeRoot("StructTreeRoot");
327 structTreeRoot.insertRef("K", PrepareTagTreeToEmit(ref, fRoot, doc));
328 structTreeRoot.insertInt("ParentTreeNextKey", SkToInt(pageCount));
329
330 // Build the parent tree, which consists of two things:
331 // (1) For each page, a mapping from the marked content IDs on
332 // each page to their corresponding tags
333 // (2) For each annotation, an indirect reference to that
334 // annotation's struct tree element.
335 SkPDFDict parentTree("ParentTree");
336 auto parentTreeNums = SkPDFMakeArray();
337
338 // First, one entry per page.
339 SkASSERT(fMarksPerPage.size() <= pageCount);
340 for (size_t j = 0; j < fMarksPerPage.size(); ++j) {
341 const SkTArray<SkPDFTagNode*>& pageMarks = fMarksPerPage[j];
342 SkPDFArray markToTagArray;
343 for (SkPDFTagNode* mark : pageMarks) {
344 SkASSERT(mark->fRef);
345 markToTagArray.appendRef(mark->fRef);
346 }
347 parentTreeNums->appendInt(j);
348 parentTreeNums->appendRef(doc->emit(markToTagArray));
349 }
350
351 // Then, one entry per annotation.
352 for (size_t j = 0; j < fParentTreeAnnotationNodeIds.size(); ++j) {
353 int nodeId = fParentTreeAnnotationNodeIds[j];
354 int structParentKey = kFirstAnnotationStructParentKey + static_cast<int>(j);
355
356 SkPDFTagNode** tagPtr = fNodeMap.find(nodeId);
357 if (!tagPtr) {
358 continue;
359 }
360 SkPDFTagNode* tag = *tagPtr;
361 parentTreeNums->appendInt(structParentKey);
362 parentTreeNums->appendRef(tag->fRef);
363 }
364
365 parentTree.insertObject("Nums", std::move(parentTreeNums));
366 structTreeRoot.insertRef("ParentTree", doc->emit(parentTree));
367
368 // Build the IDTree, a mapping from every unique ID string to
369 // a reference to its corresponding structure element node.
370 if (!fIdTreeEntries.empty()) {
371 std::sort(fIdTreeEntries.begin(), fIdTreeEntries.end(),
372 [](const IDTreeEntry& a, const IDTreeEntry& b) {
373 return a.nodeId < b.nodeId;
374 });
375
376 SkPDFDict idTree;
377 SkPDFDict idTreeLeaf;
378 auto limits = SkPDFMakeArray();
379 SkString lowestNodeIdString = SkPDFTagNode::nodeIdToString(
380 fIdTreeEntries.begin()->nodeId);
381 limits->appendString(lowestNodeIdString);
382 SkString highestNodeIdString = SkPDFTagNode::nodeIdToString(
383 fIdTreeEntries.rbegin()->nodeId);
384 limits->appendString(highestNodeIdString);
385 idTreeLeaf.insertObject("Limits", std::move(limits));
386 auto names = SkPDFMakeArray();
387 for (const IDTreeEntry& entry : fIdTreeEntries) {
388 SkString idString = SkPDFTagNode::nodeIdToString(entry.nodeId);
389 names->appendString(idString);
390 names->appendRef(entry.ref);
391 }
392 idTreeLeaf.insertObject("Names", std::move(names));
393 auto idTreeKids = SkPDFMakeArray();
394 idTreeKids->appendRef(doc->emit(idTreeLeaf));
395 idTree.insertObject("Kids", std::move(idTreeKids));
396 structTreeRoot.insertRef("IDTree", doc->emit(idTree));
397 }
398
399 return doc->emit(structTreeRoot, ref);
400 }
401