• Home
  • Line#
  • Scopes#
  • Navigate#
  • Raw
  • Download
1 // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
2 // -*- Mode: C++ -*-
3 //
4 // Copyright (C) 2021 Google, Inc.
5 //
6 // Author: Giuliano Procida
7 
8 /// @file
9 ///
10 /// This file contains ABI XML manipulation routines and a main driver.
11 ///
12 /// The libxml Tree API is used. The XPath API is not used as it proved
13 /// to be many times slower than direct traversal but only slightly more
14 /// convenient.
15 
16 #include <fcntl.h>
17 #include <unistd.h>
18 
19 #include <algorithm>
20 #include <array>
21 #include <cassert>
22 #include <cctype>
23 #include <cstring>
24 #include <fstream>
25 #include <functional>
26 #include <ios>
27 #include <iostream>
28 #include <map>
29 #include <optional>
30 #include <set>
31 #include <sstream>
32 #include <string>
33 #include <unordered_map>
34 #include <unordered_set>
35 #include <vector>
36 
37 #include <libxml/globals.h>
38 #include <libxml/parser.h>
39 #include <libxml/tree.h>
40 
/// Convenience alias for a namespace scope, held as a sequence of
/// namespace names.
using namespace_scope = std::vector<std::string>;

/// Convenience alias for a set of symbols.
using symbol_set = std::unordered_set<std::string>;

/// Level of location information to preserve.
///
/// The enumerators are declared in order of decreasing detail; code
/// that compares levels with relational operators depends on this
/// declaration order.
enum class LocationInfo { COLUMN, LINE, FILE, NONE };

/// Textual names of the location information levels.
static const std::map<std::string, LocationInfo> LOCATION_INFO_NAME{
  {"column", LocationInfo::COLUMN},
  {"line", LocationInfo::LINE},
  {"file", LocationInfo::FILE},
  {"none", LocationInfo::NONE},
};

/// Type element names, each mapped to the canonical name used for an
/// anonymous type of that kind.
static const std::map<std::string, std::string> NAMED_TYPES{
  {"enum-decl", "__anonymous_enum__"},
  {"class-decl", "__anonymous_struct__"},
  {"union-decl", "__anonymous_union__"},
};
62 
63 
64 /// Cast a C string to a libxml string.
65 ///
66 /// @param str the C string (pointer)
67 ///
68 /// @return the same thing, as a type compatible with the libxml API
69 static const xmlChar*
to_libxml(const char * str)70 to_libxml(const char* str)
71 {
72   return reinterpret_cast<const xmlChar*>(str);
73 }
74 
75 /// Cast a libxml string to C string.
76 ///
77 /// @param str the libxml string (pointer)
78 ///
79 /// @return the same thing, as a type compatible with the C library API
80 static const char*
from_libxml(const xmlChar * str)81 from_libxml(const xmlChar* str)
82 {
83   return reinterpret_cast<const char*>(str);
84 }
85 
86 /// Remove a node from its document and free its storage.
87 ///
88 /// @param node the node to remove
89 static void
remove_node(xmlNodePtr node)90 remove_node(xmlNodePtr node)
91 {
92   xmlUnlinkNode(node);
93   xmlFreeNode(node);
94 }
95 
96 /// Remove an XML element and any immediately preceding comment.
97 ///
98 /// @param node the element to remove
99 static void
remove_element(xmlNodePtr node)100 remove_element(xmlNodePtr node)
101 {
102   xmlNodePtr previous_node = node->prev;
103   if (previous_node && previous_node->type == XML_COMMENT_NODE)
104     remove_node(previous_node);
105   remove_node(node);
106 }
107 
108 /// Move a node to an element.
109 ///
110 /// @param node the node to move
111 ///
112 /// @param destination the destination element
113 static void
move_node(xmlNodePtr node,xmlNodePtr destination)114 move_node(xmlNodePtr node, xmlNodePtr destination)
115 {
116   xmlUnlinkNode(node);
117   xmlAddChild(destination, node);
118 }
119 
120 /// Move an XML element and any immediately preceding comment to another
121 /// element.
122 ///
123 /// @param node the element to remove
124 ///
125 /// @param destination the destination element
126 static void
move_element(xmlNodePtr node,xmlNodePtr destination)127 move_element(xmlNodePtr node, xmlNodePtr destination)
128 {
129   xmlNodePtr previous_node = node->prev;
130   if (previous_node && previous_node->type == XML_COMMENT_NODE)
131     move_node(previous_node, destination);
132   move_node(node, destination);
133 }
134 
135 /// Get child nodes of given node.
136 ///
137 /// @param node the node whose children to fetch
138 ///
139 /// @return a vector of child nodes
140 static std::vector<xmlNodePtr>
get_children(xmlNodePtr node)141 get_children(xmlNodePtr node)
142 {
143   std::vector<xmlNodePtr> result;
144   for (xmlNodePtr child = node->children; child; child = child->next)
145     result.push_back(child);
146   return result;
147 }
148 
149 /// Fetch an attribute from a node.
150 ///
151 /// @param node the node
152 ///
153 /// @param name the attribute name
154 ///
155 /// @return the attribute value, if present
156 static std::optional<std::string>
get_attribute(xmlNodePtr node,const char * name)157 get_attribute(xmlNodePtr node, const char* name)
158 {
159   std::optional<std::string> result;
160   xmlChar* attribute = xmlGetProp(node, to_libxml(name));
161   if (attribute)
162     {
163       result = from_libxml(attribute);
164       xmlFree(attribute);
165     }
166   return result;
167 }
168 
169 /// Set an attribute value.
170 ///
171 /// @param node the node
172 ///
173 /// @param name the attribute name
174 ///
175 /// @param value the attribute value
176 static void
set_attribute(xmlNodePtr node,const char * name,const std::string & value)177 set_attribute(xmlNodePtr node, const char* name,
178               const std::string& value)
179 {
180   xmlSetProp(node, to_libxml(name), to_libxml(value.c_str()));
181 }
182 
183 /// Unset an attribute value.
184 ///
185 /// @param node the node
186 ///
187 /// @param name the attribute name
188 static void
unset_attribute(xmlNodePtr node,const char * name)189 unset_attribute(xmlNodePtr node, const char* name)
190 {
191   xmlUnsetProp(node, to_libxml(name));
192 }
193 
194 /// Remove text nodes, recursively.
195 ///
196 /// This simplifies subsequent analysis and manipulation. Removing and
197 /// moving elements will destroy formatting anyway. The only remaining
198 /// node types should be elements and comments.
199 ///
200 /// @param node the node to process
201 static void
strip_text(xmlNodePtr node)202 strip_text(xmlNodePtr node)
203 {
204   if (node->type == XML_TEXT_NODE)
205     remove_node(node);
206   else if (node->type == XML_ELEMENT_NODE)
207     for (xmlNodePtr child : get_children(node))
208       strip_text(child);
209 }
210 
211 /// Add text before / after a node.
212 ///
213 /// @param node the node
214 ///
215 /// @param after whether the next should go after
216 ///
217 /// @param text the text
218 static void
add_text(xmlNodePtr node,bool after,const std::string & text)219 add_text(xmlNodePtr node, bool after, const std::string& text)
220 {
221   xmlNodePtr text_node = xmlNewTextLen(to_libxml(text.data()), text.size());
222   if (after)
223     xmlAddNextSibling(node, text_node);
224   else
225     xmlAddPrevSibling(node, text_node);
226 }
227 
228 /// Format an XML element by adding internal indentation and newlines.
229 ///
230 /// This makes the XML readable.
231 ///
232 /// @param indentation what to add to the line indentation prefix
233 ///
234 /// @param prefix the current line indentation prefix
235 ///
236 /// @param node the node to format
237 static void
format_xml(const std::string & indentation,std::string prefix,xmlNodePtr node)238 format_xml(const std::string& indentation, std::string prefix, xmlNodePtr node)
239 {
240   std::vector<xmlNodePtr> children = get_children(node);
241   if (children.empty())
242     return;
243 
244   // The ordering of operations here is incidental. The outcomes we want
245   // are: 1. an extra newline after the opening tag and indentation of
246   // the closing tag to match, and 2. indentation and newline for each
247   // child.
248   add_text(children[0], false, "\n");
249   add_text(children[children.size() - 1], true, prefix);
250   prefix += indentation;
251   for (xmlNodePtr child : children)
252     {
253       add_text(child, false, prefix);
254       format_xml(indentation, prefix, child);
255       add_text(child, true, "\n");
256     }
257 }
258 
259 /// Rewrite attributes using single quotes.
260 ///
261 /// libxml uses double quotes but libabigail uses single quotes.
262 ///
263 /// Note that libabigail does not emit attributes *containing* single
264 /// quotes and if it did it would escape them as &quot; which libxml
265 /// would in turn preserve. However, the code here will handle all forms
266 /// of quotes, conservatively.
267 ///
268 /// Annotation comments can contain single quote characters so just
269 /// checking for any single quotes at all is insufficiently precise.
270 ///
271 /// @param start a pointer to the start of the XML text
272 ///
273 /// @param limit a pointer to just past the end of the XML text
274 static void
adjust_quotes(xmlChar * start,xmlChar * limit)275 adjust_quotes(xmlChar* start, xmlChar* limit)
276 {
277   const std::string open{"<!--"};
278   const std::string close{"-->"};
279   while (start < limit)
280     {
281       // Look for a '<'
282       start = std::find(start, limit, '<');
283       if (start == limit)
284         break;
285       if (start + open.size() < limit
286           && std::equal(open.begin(), open.end(), start))
287         {
288           // Have a comment, skip to the end.
289           start += open.size();
290           xmlChar* end = std::search(start, limit, close.begin(), close.end());
291           if (end == limit)
292             break;
293           start = end + close.size();
294         }
295       else
296         {
297           // Have some tag, search for the end.
298           start += 1;
299           xmlChar* end = std::find(start, limit, '>');
300           if (end == limit)
301             break;
302           // In general, inside a tag we could find either ' or " being
303           // used to quote attributes and the other quote character
304           // being used as part of the attribute data. However, libxml's
305           // xmlDocDump* functions use " to quote attributes and it's
306           // safe to substitute this quote character with ' so long as '
307           // does not appear within the attribute data.
308           if (std::find(start, end, '\'') == end)
309             for (xmlChar* c = start; c < end; ++c)
310               if (*c == '"')
311                 *c = '\'';
312           start = end + 1;
313         }
314     }
315 }
316 
/// Names of the elements that may be dropped when they have no
/// children.
static const std::set<std::string> DROP_IF_EMPTY{
  "elf-variable-symbols",
  "elf-function-symbols",
  "namespace-decl",
  "abi-instr",
  "abi-corpus",
  "abi-corpus-group",
};
325 
326 /// Drop empty elements, if safe to do so, recursively.
327 ///
328 /// @param node the element to process
329 static void
drop_empty(xmlNodePtr node)330 drop_empty(xmlNodePtr node)
331 {
332   if (node->type != XML_ELEMENT_NODE)
333     return;
334   for (xmlNodePtr child : get_children(node))
335     drop_empty(child);
336   // Do not drop the root element, even if empty.
337   if (node->parent->type == XML_DOCUMENT_NODE)
338     return;
339   if (!node->children && DROP_IF_EMPTY.count(from_libxml(node->name)))
340     remove_element(node);
341 }
342 
343 /// Get ELF symbol id.
344 ///
345 /// This is not an explicit attribute. It takes one of these forms:
346 ///
347 /// * name (if symbol is not versioned)
348 /// * name@version (if symbol is versioned but not the default version)
349 /// * name@@version (if symbol is versioned and the default version)
350 ///
351 /// @param node the elf-symbol element
352 ///
353 /// @return the ELF symbol id
354 static std::string
get_elf_symbol_id(xmlNodePtr node)355 get_elf_symbol_id(xmlNodePtr node)
356 {
357   const auto name = get_attribute(node, "name");
358   assert(name);
359   std::string result = name.value();
360   const auto version = get_attribute(node, "version");
361   if (version)
362     {
363       result += '@';
364       const auto is_default = get_attribute(node, "is-default-version");
365       if (is_default && is_default.value() == "yes")
366         result += '@';
367       result += version.value();
368     }
369   return result;
370 }
371 
/// Names of the elements whose location attributes are subject to
/// limiting.
static const std::set<std::string> HAS_LOCATION{
  "class-decl",
  "enum-decl",
  "function-decl",
  "parameter",
  "typedef-decl",
  "union-decl",
  "var-decl",
};
381 
382 /// Limit location information.
383 ///
384 /// @param location_info the level of location information to retain
385 ///
386 /// @param node the element to process
387 static void
limit_locations(LocationInfo location_info,xmlNodePtr node)388 limit_locations(LocationInfo location_info, xmlNodePtr node)
389 {
390   if (node->type != XML_ELEMENT_NODE)
391     return;
392   if (HAS_LOCATION.count(from_libxml(node->name)))
393     {
394       if (location_info > LocationInfo::COLUMN)
395         {
396           unset_attribute(node, "column");
397           if (location_info > LocationInfo::LINE)
398             {
399               unset_attribute(node, "line");
400               if (location_info > LocationInfo::FILE)
401                 unset_attribute(node, "filepath");
402             }
403         }
404     }
405   for (xmlNodePtr child : get_children(node))
406     limit_locations(location_info, child);
407 }
408 
409 /// Handle unreachable elements.
410 ///
411 /// Reachability is defined to be union of contains, containing and
412 /// refers-to relationships for types, declarations and symbols. The
413 /// roots for reachability are the ELF elements in the ABI.
414 ///
415 /// The subrange element requires special treatment. It has a useless
416 /// type id, but it is not a type and its type id aliases with that of
417 /// all other subranges of the same length. So don't treat it as a type.
418 ///
419 /// @param prune whether to prune unreachable elements
420 ///
421 /// @param report whether to report untyped symbols
422 ///
423 /// @param root the XML root element
424 ///
425 /// @return the number of untyped symbols
426 static size_t
handle_unreachable(bool prune,bool report,xmlNodePtr root)427 handle_unreachable(bool prune, bool report, xmlNodePtr root)
428 {
429   // ELF symbol ids.
430   std::set<std::string> elf_symbol_ids;
431 
432   // Simple way of allowing two kinds of nodes: false=>type,
433   // true=>symbol.
434   using vertex_t = std::pair<bool, std::string>;
435 
436   // Graph vertices.
437   std::set<vertex_t> vertices;
438   // Graph edges.
439   std::map<vertex_t, std::set<vertex_t>> edges;
440 
441   // Keep track of type / symbol nesting so we can identify contains,
442   // containing and refers-to relationships.
443   std::vector<vertex_t> stack;
444 
445   // Process an XML node, adding a vertex and possibly some edges.
446   std::function<void(xmlNodePtr)> process_node = [&](xmlNodePtr node) {
447     // We only care about elements and not comments, at this stage.
448     if (node->type != XML_ELEMENT_NODE)
449       return;
450 
451     const char* node_name = from_libxml(node->name);
452 
453     // Is this an ELF symbol?
454     if (strcmp(node_name, "elf-symbol") == 0)
455       {
456         elf_symbol_ids.insert(get_elf_symbol_id(node));
457         // Early return is safe, but not necessary.
458         return;
459       }
460 
461     // Is this a type? Note that the same id may appear multiple times.
462     const auto id = strcmp(node_name, "subrange") != 0
463                     ? get_attribute(node, "id")
464                     : std::optional<std::string>();
465     if (id)
466       {
467         vertex_t type_vertex{false, id.value()};
468         vertices.insert(type_vertex);
469         const auto naming_typedef_id = get_attribute(node, "naming-typedef-id");
470         if (naming_typedef_id)
471           {
472             // This is an odd one, there can be a backwards link from an
473             // anonymous type to a typedef that refers to it. The -t
474             // option will drop these, but if they are still present, we
475             // should model the link to avoid the risk of dangling
476             // references.
477             vertex_t naming_typedef_vertex{false, naming_typedef_id.value()};
478             edges[type_vertex].insert(naming_typedef_vertex);
479           }
480         if (!stack.empty())
481           {
482             // Parent<->child dependencies; record dependencies both
483             // ways to avoid holes in XML types and declarations.
484             const auto& parent = stack.back();
485             edges[parent].insert(type_vertex);
486             edges[type_vertex].insert(parent);
487           }
488         // Record the type.
489         stack.push_back(type_vertex);
490       }
491 
492     // Is this a (declaration expected to be linked to a) symbol?
493     const auto symbol = get_attribute(node, "elf-symbol-id");
494     if (symbol)
495       {
496         vertex_t symbol_vertex{true, symbol.value()};
497         vertices.insert(symbol_vertex);
498         if (!stack.empty())
499           {
500             // Parent<->child dependencies; record dependencies both
501             // ways to avoid making holes in XML types and declarations.
502             //
503             // Symbols exist outside of the type hierarchy, so choosing
504             // to make them depend on a containing type scope and vice
505             // versa is conservative and probably not necessary.
506             const auto& parent = stack.back();
507             edges[parent].insert(symbol_vertex);
508             edges[symbol_vertex].insert(parent);
509           }
510         // Record the symbol.
511         stack.push_back(symbol_vertex);
512         // In practice there will be at most one symbol on the stack; we could
513         // verify this here, but it wouldn't achieve anything.
514       }
515 
516     // Being both would make the stack ordering ambiguous.
517     if (id && symbol)
518       {
519         std::cerr << "cannot handle element which is both type and symbol\n";
520         exit(1);
521       }
522 
523     // Is there a reference to another type?
524     const auto type_id = get_attribute(node, "type-id");
525     if (type_id && !stack.empty())
526       {
527         // The enclosing type or symbol refers to another type.
528         const auto& parent = stack.back();
529         vertex_t type_id_vertex{false, type_id.value()};
530         edges[parent].insert(type_id_vertex);
531       }
532 
533     // Process recursively.
534     for (auto child : get_children(node))
535       process_node(child);
536 
537     // Restore the stack.
538     if (symbol)
539       stack.pop_back();
540     if (id)
541       stack.pop_back();
542   };
543 
544   // Traverse the whole root element and build a graph.
545   process_node(root);
546 
547   // Simple DFS.
548   std::set<vertex_t> seen;
549   std::function<void(vertex_t)> dfs = [&](vertex_t vertex) {
550     if (!seen.insert(vertex).second)
551       return;
552     auto it = edges.find(vertex);
553     if (it != edges.end())
554       for (auto to : it->second)
555         dfs(to);
556   };
557 
558   // Count of how many symbols are untyped.
559   size_t untyped = 0;
560 
561   // Traverse the graph, starting from the ELF symbols.
562   for (const auto& symbol_id : elf_symbol_ids)
563     {
564       vertex_t symbol_vertex{true, symbol_id};
565       if (vertices.count(symbol_vertex))
566         {
567           dfs(symbol_vertex);
568         }
569       else
570         {
571           if (report)
572             std::cerr << "no declaration found for ELF symbol with id "
573                       << symbol_id << '\n';
574           ++untyped;
575         }
576     }
577 
578   // This is a DFS with early stopping.
579   std::function<void(xmlNodePtr)> remove_unseen = [&](xmlNodePtr node) {
580     if (node->type != XML_ELEMENT_NODE)
581       return;
582 
583     const char* node_name = from_libxml(node->name);
584 
585     // Return if we know that this is a type to keep or drop in its
586     // entirety.
587     const auto id = strcmp(node_name, "subrange") != 0
588                     ? get_attribute(node, "id")
589                     : std::optional<std::string>();
590     if (id)
591       {
592         if (!seen.count(vertex_t{false, id.value()}))
593           remove_element(node);
594         return;
595       }
596 
597     // Return if we know that this is a declaration to keep or drop in
598     // its entirety. Note that var-decl and function-decl are the only
599     // elements that can have an elf-symbol-id attribute.
600     if (strcmp(node_name, "var-decl") == 0
601         || strcmp(node_name, "function-decl") == 0)
602       {
603         const auto symbol = get_attribute(node, "elf-symbol-id");
604         if (!(symbol && seen.count(vertex_t{true, symbol.value()})))
605           remove_element(node);
606         return;
607       }
608 
609     // Otherwise, this is not a type, declaration or part thereof, so
610     // process child elements.
611     for (auto child : get_children(node))
612       remove_unseen(child);
613   };
614 
615   if (prune)
616     // Traverse the XML, removing unseen elements.
617     remove_unseen(root);
618 
619   return untyped;
620 }
621 
622 /// Tidy anonymous types in various ways.
623 ///
624 /// 1. Normalise anonymous type names by removing the numerical suffix.
625 ///
626 /// Anonymous type names take the form __anonymous_foo__N where foo is
627 /// one of enum, struct or union and N is an optional numerical suffix.
628 /// The suffices are senstive to processing order and do not convey
629 /// useful ABI information. They can cause spurious harmless diffs and
630 /// make XML diffing and rebasing harder.
631 ///
632 /// It's best to remove the suffix.
633 ///
634 /// 2. Reanonymise anonymous types that have been given names.
635 ///
636 /// A recent change to abidw changed its behaviour for any anonymous
637 /// type that has a naming typedef. In addition to linking the typedef
638 /// and type in both directions, the code now gives (some) anonymous
639 /// types the same name as the typedef. This misrepresents the original
640 /// types.
641 ///
642 /// Such types should be anonymous.
643 ///
644 /// 3. Discard naming typedef backlinks.
645 ///
646 /// The attribute naming-typedef-id is a backwards link from an
647 /// anonymous type to the typedef that refers to it. It is ignored by
648 /// abidiff.
649 ///
650 /// Unfortunately, libabigail sometimes conflates multiple anonymous
651 /// types that have naming typedefs and only one of the typedefs can
652 /// "win". ABI XML is thus sensitive to processing order and can also
653 /// end up containing definitions of an anonymous type with differing
654 /// naming-typedef-id attributes.
655 ///
656 /// It's best to just drop the attribute.
657 ///
658 /// @param node the XML node to process
659 static void
handle_anonymous_types(bool normalise,bool reanonymise,bool discard_naming,xmlNodePtr node)660 handle_anonymous_types(bool normalise, bool reanonymise, bool discard_naming,
661                        xmlNodePtr node)
662 {
663   if (node->type != XML_ELEMENT_NODE)
664     return;
665 
666   const auto it = NAMED_TYPES.find(from_libxml(node->name));
667   if (it != NAMED_TYPES.end())
668     {
669       const auto& anon = it->second;
670       const auto name_attribute = get_attribute(node, "name");
671       const auto& name =
672           name_attribute ? name_attribute.value() : std::string();
673       const auto anon_attr = get_attribute(node, "is-anonymous");
674       const bool is_anon = anon_attr && anon_attr.value() == "yes";
675       const auto naming_attribute = get_attribute(node, "naming-typedef-id");
676       if (normalise && is_anon && name != anon) {
677         // __anonymous_foo__123 -> __anonymous_foo__
678         set_attribute(node, "name", anon);
679       }
680       if (reanonymise && !is_anon && naming_attribute) {
681         // bar with naming typedef -> __anonymous_foo__
682         set_attribute(node, "is-anonymous", "yes");
683         set_attribute(node, "name", anon);
684       }
685       if (discard_naming && naming_attribute)
686         unset_attribute(node, "naming-typedef-id");
687     }
688 
689   for (auto child : get_children(node))
690     handle_anonymous_types(normalise, reanonymise, discard_naming, child);
691 }
692 
693 /// Remove attributes emitted by abidw --load-all-types.
694 ///
695 /// With this invocation and if any user-defined types are deemed
696 /// unreachable, libabigail will output a tracking-non-reachable-types
697 /// attribute on top-level elements and a is-non-reachable attribute on
698 /// each such type element.
699 ///
700 /// abitidy has its own graph-theoretic notion of reachability and these
701 /// attributes have no ABI relevance.
702 ///
703 /// It's best to just drop them.
704 ///
705 /// @param node the XML node to process
706 void
clear_non_reachable(xmlNodePtr node)707 clear_non_reachable(xmlNodePtr node)
708 {
709   if (node->type != XML_ELEMENT_NODE)
710     return;
711 
712   const char* node_name = from_libxml(node->name);
713 
714   if (strcmp(node_name, "abi-corpus-group") == 0
715       || strcmp(node_name, "abi-corpus") == 0)
716     unset_attribute(node, "tracking-non-reachable-types");
717   else if (NAMED_TYPES.find(node_name) != NAMED_TYPES.end())
718     unset_attribute(node, "is-non-reachable");
719 
720   for (auto child : get_children(node))
721     clear_non_reachable(child);
722 }
723 
/// The set of attributes that should be excluded from consideration
/// when comparing XML elements.
///
/// Source location attributes (filepath, line, column) are omitted
/// with --no-show-locs without changing the meaning of the ABI. They
/// can also sometimes vary between duplicate type definitions.
///
/// The naming-typedef-id and is-non-reachable attributes, if not
/// already removed by another pass, are irrelevant to ABI semantics.
static const std::unordered_set<std::string> IRRELEVANT_ATTRIBUTES{
  "filepath",
  "line",
  "column",
  "naming-typedef-id",
  "is-non-reachable",
};
743 
744 /// Determine whether one XML element is a subtree of another.
745 ///
746 /// XML elements representing types are sometimes emitted multiple
747 /// times, identically. Also, member typedefs are sometimes emitted
748 /// separately from their types, resulting in duplicate XML fragments.
749 ///
750 /// Both these issues can be resolved by first detecting duplicate
751 /// occurrences of a given type id and then checking to see if there's
752 /// an instance that subsumes the others, which can then be eliminated.
753 ///
754 /// @param left the first element to compare
755 ///
756 /// @param right the second element to compare
757 ///
758 /// @return whether the first element is a subtree of the second
759 bool
sub_tree(xmlNodePtr left,xmlNodePtr right)760 sub_tree(xmlNodePtr left, xmlNodePtr right)
761 {
762   // Node names must match.
763   const char* left_name = from_libxml(left->name);
764   const char* right_name = from_libxml(right->name);
765   if (strcmp(left_name, right_name) != 0)
766     return false;
767   // Attributes may be missing on the left, but must match otherwise.
768   for (auto p = left->properties; p; p = p->next)
769   {
770     const char* attribute_name = from_libxml(p->name);
771     if (IRRELEVANT_ATTRIBUTES.count(attribute_name))
772       continue;
773     // EXCEPTION: libabigail emits the access specifier for the type
774     // it's trying to "emit in scope" rather than for what may be a
775     // containing type; so allow member-type attribute access to differ.
776     if (strcmp(left_name, "member-type") == 0
777         && strcmp(attribute_name, "access") == 0)
778       continue;
779     const auto left_value = get_attribute(left, attribute_name);
780     assert(left_value);
781     const auto right_value = get_attribute(right, attribute_name);
782     if (!right_value || left_value.value() != right_value.value())
783       return false;
784   }
785   // The left subelements must be a subsequence of the right ones.
786   xmlNodePtr left_child = xmlFirstElementChild(left);
787   xmlNodePtr right_child = xmlFirstElementChild(right);
788   while (left_child && right_child)
789     {
790       if (sub_tree(left_child, right_child))
791         left_child = xmlNextElementSibling(left_child);
792       right_child = xmlNextElementSibling(right_child);
793     }
794   return !left_child;
795 }
796 
797 /// Elminate non-conflicting / report conflicting type definitions.
798 ///
799 /// This function can eliminate exact type duplicates and duplicates
800 /// where there is at least one maximal definition. It can report the
801 /// remaining, conflicting duplicate definitions.
802 ///
803 /// If a type has duplicate definitions in multiple namespace scopes,
804 /// these should not be reordered. This function reports how many such
805 /// types it finds.
806 ///
807 /// @param eliminate whether to eliminate non-conflicting duplicates
808 ///
809 /// @param report whether to report conflicting duplicate definitions
810 ///
811 /// @param root the root XML element
812 ///
813 /// @return the number of types defined in multiple namespace scopes
handle_duplicate_types(bool eliminate,bool report,xmlNodePtr root)814 size_t handle_duplicate_types(bool eliminate, bool report, xmlNodePtr root)
815 {
816   // map of type-id to pair of set of namespace scopes and vector of
817   // xmlNodes
818   std::unordered_map<
819       std::string,
820       std::pair<
821           std::set<namespace_scope>,
822           std::vector<xmlNodePtr>>> types;
823   namespace_scope namespaces;
824 
825   // find all type occurrences
826   std::function<void(xmlNodePtr)> dfs = [&](xmlNodePtr node) {
827     if (node->type != XML_ELEMENT_NODE)
828       return;
829     const char* node_name = from_libxml(node->name);
830     std::optional<std::string> namespace_name;
831     if (strcmp(node_name, "namespace-decl") == 0)
832       namespace_name = get_attribute(node, "name");
833     if (namespace_name)
834       namespaces.push_back(namespace_name.value());
835     if (strcmp(node_name, "abi-corpus-group") == 0
836         || strcmp(node_name, "abi-corpus") == 0
837         || strcmp(node_name, "abi-instr") == 0
838         || namespace_name)
839       {
840         for (auto child : get_children(node))
841           dfs(child);
842       }
843     else
844       {
845         const auto id = get_attribute(node, "id");
846         if (id)
847           {
848             auto& info = types[id.value()];
849             info.first.insert(namespaces);
850             info.second.push_back(node);
851           }
852       }
853     if (namespace_name)
854       namespaces.pop_back();
855   };
856   dfs(root);
857 
858   size_t scope_conflicts = 0;
859   for (const auto& [id, scopes_and_definitions] : types)
860     {
861       const auto& [scopes, definitions] = scopes_and_definitions;
862 
863       if (scopes.size() > 1)
864         {
865           if (report)
866             std::cerr << "conflicting scopes found for type '" << id << "'\n";
867           ++scope_conflicts;
868           continue;
869         }
870 
871       const size_t count = definitions.size();
872       if (count <= 1)
873         continue;
874 
875       // Find a potentially maximal candidate by scanning through and
876       // retaining the new definition if it's a supertree of the current
877       // candidate.
878       std::vector<bool> ok(count);
879       size_t candidate = 0;
880       ok[candidate] = true;
881       for (size_t ix = 1; ix < count; ++ix)
882         if (sub_tree(definitions[candidate], definitions[ix]))
883           {
884             candidate = ix;
885             ok[candidate] = true;
886           }
887 
888       // Verify the candidate is indeed maximal by scanning the
889       // definitions not already known to be subtrees of it.
890       bool bad = false;
891       for (size_t ix = 0; ix < count; ++ix)
892         if (!ok[ix] && !sub_tree(definitions[ix], definitions[candidate]))
893           {
894             bad = true;
895             break;
896           }
897       if (bad)
898         {
899           if (report)
900             std::cerr << "conflicting definitions found for type '" << id
901                       << "'\n";
902           continue;
903         }
904 
905       if (eliminate)
906         // Remove all but the maximal definition.
907         for (size_t ix = 0; ix < count; ++ix)
908           if (ix != candidate)
909             remove_element(definitions[ix]);
910     }
911 
912   return scope_conflicts;
913 }
914 
/// Names of abi-instr attributes that are allowed to differ between the
/// abi-instr elements being merged during sorting.
///
/// When such an attribute has more than one distinct value, the merged
/// abi-instr records it as "various" instead of the merge being abandoned.
static const std::set<std::string> INSTR_VARIABLE_ATTRIBUTES{
  "path", "comp-dir-path", "language"
};
920 
921 /// Collect elements of abi-instr elements by namespace.
922 ///
923 /// Namespaces are not returned but are recursively traversed with the
924 /// namespace stack being maintained. Other elements are associated with
925 /// the current namespace.
926 ///
927 /// @param nodes the nodes to traverse
928 ///
929 /// @return child elements grouped by namespace scope
930 static std::map<namespace_scope, std::vector<xmlNodePtr>>
get_children_by_namespace(const std::vector<xmlNodePtr> & nodes)931 get_children_by_namespace(const std::vector<xmlNodePtr>& nodes)
932 {
933   std::map<namespace_scope, std::vector<xmlNodePtr>> result;
934   namespace_scope scope;
935 
936   std::function<void(xmlNodePtr)> process = [&](xmlNodePtr node) {
937     if (node->type != XML_ELEMENT_NODE)
938       return;
939     std::optional<std::string> namespace_name;
940     const char* node_name = from_libxml(node->name);
941     if (strcmp(node_name, "namespace-decl") == 0)
942       namespace_name = get_attribute(node, "name");
943     if (namespace_name)
944       {
945         scope.push_back(namespace_name.value());
946         for (auto child : get_children(node))
947           process(child);
948         scope.pop_back();
949       }
950     else
951       result[scope].push_back(node);
952   };
953 
954   for (auto node : nodes)
955     for (auto child : get_children(node))
956       process(child);
957   return result;
958 }
959 
960 /// Sort namespaces, types and declarations.
961 ///
962 /// This loses annotations (XML comments) on namespace-decl elements.
963 /// It would have been a fair amount of extra work to preserve them.
964 ///
965 /// @param root the XML root element
966 static void
sort_namespaces_types_and_declarations(xmlNodePtr root)967 sort_namespaces_types_and_declarations(xmlNodePtr root)
968 {
969   // There are (currently) 2 ABI formats we handle here.
970   //
971   // 1. An abi-corpus containing one or more abi-instr. In this case, we
972   // move all namespaces, types and declarations to a replacement
973   // abi-instr at the end of the abi-corpus. The existing abi-instr will
974   // then be confirmed as empty and removed.
975   //
976   // 2. An abi-corpus-group containing one or more abi-corpus each
977   // containing zero or more abi-instr (with at least one abi-instr
978   // altogether). In this case the replacement abi-instr is created
979   // within the first abi-corpus of the group.
980   //
981   // Anything else is left alone. For example, single abi-instr elements
982   // are present in some libabigail test suite files.
983 
984   // We first need to identify where to place the new abi-instr and
985   // collect all the abi-instr to process.
986   xmlNodePtr where = nullptr;
987   std::vector<xmlNodePtr> instrs;
988 
989   auto process_corpus = [&](xmlNodePtr corpus) {
990     if (!where)
991       where = corpus;
992     for (auto instr : get_children(corpus))
993       if (strcmp(from_libxml(instr->name), "abi-instr") == 0)
994         instrs.push_back(instr);
995   };
996 
997   const char* root_name = from_libxml(root->name);
998   if (strcmp(root_name, "abi-corpus-group") == 0)
999     {
1000       // Process all corpora in a corpus group together.
1001       for (auto corpus : get_children(root))
1002         if (strcmp(from_libxml(corpus->name), "abi-corpus") == 0)
1003           process_corpus(corpus);
1004     }
1005   else if (strcmp(root_name, "abi-corpus") == 0)
1006     {
1007       // We have a corpus to sort, just get its instrs.
1008       process_corpus(root);
1009     }
1010 
1011   if (instrs.empty())
1012     return;
1013 
1014   // Collect the attributes of all the instrs.
1015   std::map<std::string, std::set<std::string>> attributes;
1016   for (auto instr : instrs)
1017     for (auto p = instr->properties; p; p = p->next)
1018       {
1019         // This is horrible. There should be a better way of iterating.
1020         const char* attribute_name = from_libxml(p->name);
1021         const auto attribute_value = get_attribute(instr, attribute_name);
1022         assert(attribute_value);
1023         attributes[attribute_name].insert(attribute_value.value());
1024       }
1025 
1026   // Create and attach a replacement instr and populate its attributes.
1027   xmlNodePtr replacement =
1028       xmlAddChild(where, xmlNewNode(nullptr, to_libxml("abi-instr")));
1029   for (const auto& attribute : attributes)
1030     {
1031       const char* attribute_name = attribute.first.c_str();
1032       const auto& attribute_values = attribute.second;
1033       if (attribute_values.size() == 1)
1034         set_attribute(replacement, attribute_name, *attribute_values.begin());
1035       else if (INSTR_VARIABLE_ATTRIBUTES.count(attribute_name))
1036         set_attribute(replacement, attribute_name, "various");
1037       else
1038         {
1039           std::cerr << "unexpectedly variable abi-instr attribute '"
1040                     << attribute_name << "'\n";
1041           remove_node(replacement);
1042           return;
1043         }
1044     }
1045 
1046   // Order types before declarations, types by id, declarations by name
1047   // (and by mangled-name, if present).
1048   struct Compare {
1049     int
1050     cmp(xmlNodePtr a, xmlNodePtr b) const
1051     {
1052       // NOTE: This must not reorder type definitions with the same id.
1053       // In particular, we cannot do anything nice and easy like order
1054       // by element tag first.
1055       //
1056       // TODO: Replace compare and subtraction with <=>.
1057       int result;
1058 
1059       auto a_id = get_attribute(a, "id");
1060       auto b_id = get_attribute(b, "id");
1061       // types before non-types
1062       result = b_id.has_value() - a_id.has_value();
1063       if (result)
1064         return result;
1065       if (a_id)
1066         // sort types by id
1067         return a_id.value().compare(b_id.value());
1068 
1069       auto a_name = get_attribute(a, "name");
1070       auto b_name = get_attribute(b, "name");
1071       // declarations before non-declarations
1072       result = b_name.has_value() - a_name.has_value();
1073       if (result)
1074         return result;
1075       if (a_name)
1076         {
1077           // sort declarations by name
1078           result = a_name.value().compare(b_name.value());
1079           if (result)
1080             return result;
1081           auto a_mangled = get_attribute(a, "mangled-name");
1082           auto b_mangled = get_attribute(b, "mangled-name");
1083           // without mangled-name first
1084           result = a_mangled.has_value() - b_mangled.has_value();
1085           if (result)
1086             return result;
1087           // and by mangled-name if present
1088           return !a_mangled ? 0 : a_mangled.value().compare(b_mangled.value());
1089         }
1090 
1091       // a and b are not types or declarations; should not be reached
1092       return 0;
1093     }
1094 
1095     bool
1096     operator()(xmlNodePtr a, xmlNodePtr b) const
1097     {
1098       return cmp(a, b) < 0;
1099     }
1100   };
1101 
1102   // Collect the child elements of all the instrs, by namespace scope.
1103   auto scoped_children = get_children_by_namespace(instrs);
1104   for (auto& [scope, children] : scoped_children)
1105     // Sort the children, preserving order of duplicates.
1106     std::stable_sort(children.begin(), children.end(), Compare());
1107 
1108   // Create namespace elements on demand. The global namespace, with
1109   // empty scope, is just the replacement instr itself.
1110   std::map<namespace_scope, xmlNodePtr> namespace_elements{{{}, replacement}};
1111   std::function<xmlNodePtr(const namespace_scope&)> get_namespace_element =
1112       [&](const namespace_scope& scope) {
1113         auto insertion = namespace_elements.insert({scope, nullptr});
1114     if (insertion.second)
1115       {
1116         // Insertion succeeded, so the scope cannot be empty.
1117         namespace_scope truncated = scope;
1118         truncated.pop_back();
1119         xmlNodePtr parent = get_namespace_element(truncated);
1120         // We can now create an XML element in the right place.
1121         xmlNodePtr child = xmlNewNode(nullptr, to_libxml("namespace-decl"));
1122         set_attribute(child, "name", scope.back());
1123         xmlAddChild(parent, child);
1124         insertion.first->second = child;
1125       }
1126     return insertion.first->second;
1127   };
1128 
1129   // Move the children to the replacement instr or its subelements.
1130   for (const auto& [scope, elements] : scoped_children)
1131     {
1132       xmlNodePtr namespace_element = get_namespace_element(scope);
1133       for (auto element : elements)
1134         move_element(element, namespace_element);
1135     }
1136 
1137   // Check the original instrs are now all empty and remove them.
1138   for (auto instr : instrs)
1139     if (get_children_by_namespace({instr}).empty())
1140       remove_node(instr);
1141     else
1142       std::cerr << "original abi-instr has residual child elements\n";
1143 }
1144 
/// Section-name suffixes that mark a section of a symbol file as a list
/// of symbols. A section whose name ends with any of these is read.
static constexpr std::array<std::string_view, 2> SYMBOL_SECTION_SUFFICES{{
  std::string_view("symbol_list"),
  std::string_view("whitelist"),
}};
1149 
1150 /// Read symbols from a file.
1151 ///
1152 /// This aims to be compatible with the .ini format used by libabigail
1153 /// for suppression specifications and symbol lists. All symbol list
1154 /// sections in the given file are combined into a single set of
1155 /// symbols.
1156 ///
1157 /// @param filename the name of the file from which to read
1158 ///
1159 /// @return a set of symbols
1160 symbol_set
read_symbols(const char * filename)1161 read_symbols(const char* filename)
1162 {
1163   symbol_set symbols;
1164   std::ifstream file(filename);
1165   if (!file)
1166     {
1167       std::cerr << "error opening symbol file '" << filename << "'\n";
1168       exit(1);
1169     }
1170 
1171   bool in_symbol_section = false;
1172   std::string line;
1173   while (std::getline(file, line))
1174     {
1175       size_t start = 0;
1176       size_t limit = line.size();
1177       // Strip comments and leading / trailing whitespace.
1178       while (start < limit)
1179         {
1180           if (std::isspace(line[start]))
1181             ++start;
1182           else if (line[start] == '#')
1183             start = limit;
1184           else
1185             break;
1186         }
1187       while (start < limit)
1188         {
1189           if (std::isspace(line[limit - 1]))
1190             --limit;
1191           else
1192             break;
1193         }
1194       // Skip empty lines.
1195       if (start == limit)
1196         continue;
1197       // See if we are entering a symbol list section.
1198       if (line[start] == '[' && line[limit - 1] == ']')
1199         {
1200           std::string_view section(&line[start + 1], limit - start - 2);
1201           bool found = false;
1202           for (const auto& suffix : SYMBOL_SECTION_SUFFICES)
1203             if (section.size() >= suffix.size()
1204                 && section.substr(section.size() - suffix.size()) == suffix)
1205               {
1206                 found = true;
1207                 break;
1208               }
1209           in_symbol_section = found;
1210           continue;
1211         }
1212       // Add symbol.
1213       if (in_symbol_section)
1214         symbols.insert(std::string(&line[start], limit - start));
1215     }
1216   if (!file.eof())
1217     {
1218       std::cerr << "error reading symbol file '" << filename << "'\n";
1219       exit(1);
1220     }
1221   return symbols;
1222 }
1223 
1224 /// Remove unlisted ELF symbols.
1225 ///
1226 /// @param symbols the set of symbols
1227 ///
1228 /// @param node the XML node to process
1229 void
filter_symbols(const symbol_set & symbols,xmlNodePtr node)1230 filter_symbols(const symbol_set& symbols, xmlNodePtr node)
1231 {
1232   if (node->type != XML_ELEMENT_NODE)
1233     return;
1234   const char* node_name = from_libxml(node->name);
1235   if (strcmp(node_name, "abi-corpus-group") == 0
1236       || strcmp(node_name, "abi-corpus") == 0
1237       || strcmp(node_name, "elf-variable-symbols") == 0
1238       || strcmp(node_name, "elf-function-symbols") == 0)
1239     {
1240       // Process children.
1241       for (auto child : get_children(node))
1242         filter_symbols(symbols, child);
1243     }
1244   else if (strcmp(node_name, "elf-symbol") == 0)
1245     {
1246       const auto name = get_attribute(node, "name");
1247       if (name && !symbols.count(name.value()))
1248         remove_element(node);
1249     }
1250 }
1251 
1252 /// Main program.
1253 ///
1254 /// Read and write ABI XML, with optional processing passes.
1255 ///
1256 /// @param argc argument count
1257 ///
1258 /// @param argv argument vector
1259 ///
1260 /// @return exit status
1261 int
main(int argc,char * argv[])1262 main(int argc, char* argv[])
1263 {
1264   // Defaults.
1265   const char* opt_input = nullptr;
1266   const char* opt_output = nullptr;
1267   std::optional<symbol_set> opt_symbols;
1268   LocationInfo opt_locations = LocationInfo::COLUMN;
1269   int opt_indentation = 2;
1270   bool opt_normalise_anonymous = false;
1271   bool opt_reanonymise_anonymous = false;
1272   bool opt_discard_naming_typedefs = false;
1273   bool opt_prune_unreachable = false;
1274   bool opt_report_untyped = false;
1275   bool opt_abort_on_untyped = false;
1276   bool opt_clear_non_reachable = false;
1277   bool opt_eliminate_duplicates = false;
1278   bool opt_report_conflicts = false;
1279   bool opt_sort = false;
1280   bool opt_drop_empty = false;
1281 
1282   // Process command line.
1283   auto usage = [&]() -> int {
1284     std::cerr << "usage: " << argv[0] << '\n'
1285               << "  [-i|--input file]\n"
1286               << "  [-o|--output file]\n"
1287               << "  [-S|--symbols file]\n"
1288               << "  [-L|--locations {column|line|file|none}]\n"
1289               << "  [-I|--indentation n]\n"
1290               << "  [-a|--all] (implies -n -r -t -p -u -b -e -c -s -d)\n"
1291               << "  [-n|--[no-]normalise-anonymous]\n"
1292               << "  [-r|--[no-]reanonymise-anonymous]\n"
1293               << "  [-t|--[no-]discard-naming-typedefs]\n"
1294               << "  [-p|--[no-]prune-unreachable]\n"
1295               << "  [-u|--[no-]report-untyped]\n"
1296               << "  [-U|--abort-on-untyped-symbols]\n"
1297               << "  [-b|--[no-]clear-non-reachable]\n"
1298               << "  [-e|--[no-]eliminate-duplicates]\n"
1299               << "  [-c|--[no-]report-conflicts]\n"
1300               << "  [-s|--[no-]sort]\n"
1301               << "  [-d|--[no-]drop-empty]\n";
1302     return 1;
1303   };
1304   int opt_index = 1;
1305   auto get_arg = [&]() {
1306     if (opt_index < argc)
1307       return argv[opt_index++];
1308     exit(usage());
1309   };
1310   while (opt_index < argc)
1311     {
1312       const std::string arg = get_arg();
1313       if (arg == "-i" || arg == "--input")
1314         opt_input = get_arg();
1315       else if (arg == "-o" || arg == "--output")
1316         opt_output = get_arg();
1317       else if (arg == "-S" || arg == "--symbols")
1318         opt_symbols = read_symbols(get_arg());
1319       else if (arg == "-L" || arg == "--locations")
1320         {
1321           auto it = LOCATION_INFO_NAME.find(get_arg());
1322           if (it == LOCATION_INFO_NAME.end())
1323             exit(usage());
1324           opt_locations = it->second;
1325         }
1326       else if (arg == "-I" || arg == "--indentation")
1327         {
1328           std::istringstream is(get_arg());
1329           is >> std::noskipws >> opt_indentation;
1330           if (!is || !is.eof() || opt_indentation < 0)
1331             exit(usage());
1332         }
1333       else if (arg == "-a" || arg == "--all")
1334         opt_normalise_anonymous = opt_reanonymise_anonymous
1335                                 = opt_discard_naming_typedefs
1336                                 = opt_prune_unreachable
1337                                 = opt_report_untyped
1338                                 = opt_clear_non_reachable
1339                                 = opt_eliminate_duplicates
1340                                 = opt_report_conflicts
1341                                 = opt_sort
1342                                 = opt_drop_empty
1343                                 = true;
1344       else if (arg == "-n" || arg == "--normalise-anonymous")
1345         opt_normalise_anonymous = true;
1346       else if (arg == "--no-normalise-anonymous")
1347         opt_normalise_anonymous = false;
1348       else if (arg == "-r" || arg == "--reanonymise-anonymous")
1349         opt_reanonymise_anonymous = true;
1350       else if (arg == "--no-reanonymise-anonymous")
1351         opt_reanonymise_anonymous = false;
1352       else if (arg == "-t" || arg == "--discard-naming-typedefs")
1353         opt_discard_naming_typedefs = true;
1354       else if (arg == "--no-discard-naming-typedefs")
1355         opt_discard_naming_typedefs = false;
1356       else if (arg == "-p" || arg == "--prune-unreachable")
1357         opt_prune_unreachable = true;
1358       else if (arg == "--no-prune-unreachable")
1359         opt_prune_unreachable = false;
1360       else if (arg == "-u" || arg == "--report-untyped")
1361         opt_report_untyped = true;
1362       else if (arg == "--no-report-untyped")
1363         opt_report_untyped = false;
1364       else if (arg == "-U" || arg == "--abort-on-untyped-symbols")
1365         opt_abort_on_untyped = true;
1366       else if (arg == "-b" || arg == "--clear-non-reachable")
1367         opt_clear_non_reachable = true;
1368       else if (arg == "--no-clear-non-reachable")
1369         opt_clear_non_reachable = false;
1370       else if (arg == "-e" || arg == "--eliminate-duplicates")
1371         opt_eliminate_duplicates = true;
1372       else if (arg == "--no-eliminate-duplicates")
1373         opt_eliminate_duplicates = false;
1374       else if (arg == "-c" || arg == "--report-conflicts")
1375         opt_report_conflicts = true;
1376       else if (arg == "--no-report-conflicts")
1377         opt_report_conflicts = false;
1378       else if (arg == "-s" || arg == "--sort")
1379         opt_sort = true;
1380       else if (arg == "--no-sort")
1381         opt_sort = false;
1382       else if (arg == "-d" || arg == "--drop-empty")
1383         opt_drop_empty = true;
1384       else if (arg == "--no-drop-empty")
1385         opt_drop_empty = false;
1386       else
1387         exit(usage());
1388     }
1389 
1390   // Open input for reading.
1391   int in_fd = STDIN_FILENO;
1392   if (opt_input)
1393     {
1394       in_fd = open(opt_input, O_RDONLY);
1395       if (in_fd < 0)
1396         {
1397           std::cerr << "could not open '" << opt_input << "' for reading: "
1398                     << strerror(errno) << '\n';
1399           exit(1);
1400         }
1401     }
1402 
1403   // Read the XML.
1404   xmlParserCtxtPtr parser_context = xmlNewParserCtxt();
1405   xmlDocPtr document
1406       = xmlCtxtReadFd(parser_context, in_fd, nullptr, nullptr, 0);
1407   if (!document)
1408     {
1409       std::cerr << "failed to parse input as XML\n";
1410       exit(1);
1411     }
1412   xmlFreeParserCtxt(parser_context);
1413   close(in_fd);
1414 
1415   // Get the root element.
1416   xmlNodePtr root = xmlDocGetRootElement(document);
1417   if (!root)
1418     {
1419       std::cerr << "XML document has no root element\n";
1420       exit(1);
1421     }
1422 
1423   // Strip text nodes to simplify other operations.
1424   strip_text(root);
1425 
1426   // Remove unlisted symbols.
1427   if (opt_symbols)
1428     filter_symbols(opt_symbols.value(), root);
1429 
1430   // Normalise anonymous type names.
1431   // Reanonymise anonymous types.
1432   // Discard naming typedef backlinks.
1433   if (opt_normalise_anonymous || opt_reanonymise_anonymous
1434       || opt_discard_naming_typedefs)
1435     handle_anonymous_types(opt_normalise_anonymous, opt_reanonymise_anonymous,
1436                            opt_discard_naming_typedefs, root);
1437 
1438   // Prune unreachable elements and/or report untyped symbols.
1439   size_t untyped_symbols = 0;
1440   if (opt_prune_unreachable || opt_report_untyped || opt_abort_on_untyped)
1441     untyped_symbols += handle_unreachable(
1442         opt_prune_unreachable, opt_report_untyped, root);
1443   if (opt_abort_on_untyped && untyped_symbols)
1444     {
1445       std::cerr << "found " << untyped_symbols << " untyped symbols\n";
1446       exit(1);
1447     }
1448 
1449   // Limit location information.
1450   if (opt_locations > LocationInfo::COLUMN)
1451     limit_locations(opt_locations, root);
1452 
1453   // Clear unwanted non-reachable attributes.
1454   if (opt_clear_non_reachable)
1455     clear_non_reachable(root);
1456 
1457   // Eliminate complete duplicates and extra fragments of types.
1458   // Report conflicting type defintions.
1459   // Record whether there are namespace scope conflicts.
1460   size_t scope_conflicts = 0;
1461   if (opt_eliminate_duplicates || opt_report_conflicts || opt_sort)
1462     scope_conflicts += handle_duplicate_types(
1463         opt_eliminate_duplicates, opt_report_conflicts, root);
1464 
1465   // Sort namespaces, types and declarations.
1466   if (opt_sort)
1467     {
1468       if (scope_conflicts)
1469         std::cerr << "found type definition scope conflicts, skipping sort\n";
1470       else
1471         sort_namespaces_types_and_declarations(root);
1472     }
1473 
1474   // Drop empty subelements.
1475   if (opt_drop_empty)
1476     drop_empty(root);
1477 
1478   // Reformat root element for human consumption.
1479   format_xml(std::string(opt_indentation, ' '), std::string(), root);
1480 
1481   // Open output for writing.
1482   int out_fd = STDOUT_FILENO;
1483   if (opt_output)
1484     {
1485       out_fd = open(opt_output, O_CREAT | O_TRUNC | O_WRONLY,
1486                     S_IRUSR | S_IWUSR | S_IRGRP | S_IWGRP | S_IROTH | S_IWOTH);
1487       if (out_fd < 0)
1488         {
1489           std::cerr << "could not open '" << opt_output << "' for writing: "
1490                     << strerror(errno) << '\n';
1491           exit(1);
1492         }
1493     }
1494 
1495   // Write the XML.
1496   //
1497   // First to memory, as we need to do a little post-processing.
1498   xmlChar* out_data;
1499   int out_size;
1500   xmlDocDumpMemory(document, &out_data, &out_size);
1501   // Remove the XML declaration as it currently upsets abidiff.
1502   xmlChar* out_limit = out_data + out_size;
1503   while (out_data < out_limit && *out_data != '\n')
1504     ++out_data;
1505   if (out_data < out_limit)
1506     ++out_data;
1507   // Adjust quotes to match abidw.
1508   adjust_quotes(out_data, out_limit);
1509   // And now to a file.
1510   size_t count = out_limit - out_data;
1511   if (write(out_fd, out_data, count) != count)
1512     {
1513       std::cerr << "could not write output: " << strerror(errno) << '\n';
1514       exit(1);
1515     }
1516   if (close(out_fd) < 0)
1517     {
1518       std::cerr << "could not close output: " << strerror(errno) << '\n';
1519       exit(1);
1520     }
1521 
1522   // Free libxml document.
1523   xmlFreeDoc(document);
1524   return 0;
1525 }
1526