• Home
  • Line#
  • Scopes#
  • Navigate#
  • Raw
  • Download
1 //===- YAMLParser.h - Simple YAML parser ------------------------*- C++ -*-===//
2 //
3 // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
4 // See https://llvm.org/LICENSE.txt for license information.
5 // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
6 //
7 //===----------------------------------------------------------------------===//
8 //
9 //  This is a YAML 1.2 parser.
10 //
11 //  See http://www.yaml.org/spec/1.2/spec.html for the full standard.
12 //
13 //  This currently does not implement the following:
14 //    * Multi-line literal folding.
15 //    * Tag resolution.
16 //    * UTF-16.
17 //    * BOMs anywhere other than the first Unicode scalar value in the file.
18 //
19 //  The most important class here is Stream. This represents a YAML stream with
20 //  0, 1, or many documents.
21 //
22 //  SourceMgr sm;
23 //  StringRef input = getInput();
24 //  yaml::Stream stream(input, sm);
25 //
26 //  for (yaml::document_iterator di = stream.begin(), de = stream.end();
27 //       di != de; ++di) {
28 //    yaml::Node *n = di->getRoot();
29 //    if (n) {
30 //      // Do something with n...
31 //    } else
32 //      break;
33 //  }
34 //
35 //===----------------------------------------------------------------------===//
36 
37 #ifndef LLVM_SUPPORT_YAMLPARSER_H
38 #define LLVM_SUPPORT_YAMLPARSER_H
39 
40 #include "llvm/ADT/StringRef.h"
41 #include "llvm/Support/Allocator.h"
42 #include "llvm/Support/SMLoc.h"
43 #include <cassert>
44 #include <cstddef>
45 #include <iterator>
46 #include <map>
47 #include <memory>
48 #include <string>
49 #include <system_error>
50 
51 namespace llvm {
52 
53 class MemoryBufferRef;
54 class SourceMgr;
55 class raw_ostream;
56 class Twine;
57 
58 namespace yaml {
59 
60 class Document;
61 class document_iterator;
62 class Node;
63 class Scanner;
64 struct Token;
65 
66 /// Dump all the tokens in this stream to OS.
67 /// \returns true if there was an error, false otherwise.
68 bool dumpTokens(StringRef Input, raw_ostream &);
69 
70 /// Scans all tokens in input without outputting anything. This is used
71 ///        for benchmarking the tokenizer.
72 /// \returns true if there was an error, false otherwise.
73 bool scanTokens(StringRef Input);
74 
75 /// Escape \a Input for a double quoted scalar; if \p EscapePrintable
76 /// is true, all UTF8 sequences will be escaped, if \p EscapePrintable is
77 /// false, those UTF8 sequences encoding printable unicode scalars will not be
78 /// escaped, but emitted verbatim.
79 std::string escape(StringRef Input, bool EscapePrintable = true);
80 
81 /// This class represents a YAML stream potentially containing multiple
82 ///        documents.
83 class Stream {
84 public:
85   /// This keeps a reference to the string referenced by \p Input.
86   Stream(StringRef Input, SourceMgr &, bool ShowColors = true,
87          std::error_code *EC = nullptr);
88 
89   Stream(MemoryBufferRef InputBuffer, SourceMgr &, bool ShowColors = true,
90          std::error_code *EC = nullptr);
91   ~Stream();
92 
93   document_iterator begin();
94   document_iterator end();
95   void skip();
96   bool failed();
97 
validate()98   bool validate() {
99     skip();
100     return !failed();
101   }
102 
103   void printError(Node *N, const Twine &Msg);
104 
105 private:
106   friend class Document;
107 
108   std::unique_ptr<Scanner> scanner;
109   std::unique_ptr<Document> CurrentDoc;
110 };
111 
112 /// Abstract base class for all Nodes.
113 class Node {
114   virtual void anchor();
115 
116 public:
117   enum NodeKind {
118     NK_Null,
119     NK_Scalar,
120     NK_BlockScalar,
121     NK_KeyValue,
122     NK_Mapping,
123     NK_Sequence,
124     NK_Alias
125   };
126 
127   Node(unsigned int Type, std::unique_ptr<Document> &, StringRef Anchor,
128        StringRef Tag);
129 
130   // It's not safe to copy YAML nodes; the document is streamed and the position
131   // is part of the state.
132   Node(const Node &) = delete;
133   void operator=(const Node &) = delete;
134 
135   void *operator new(size_t Size, BumpPtrAllocator &Alloc,
136                      size_t Alignment = 16) noexcept {
137     return Alloc.Allocate(Size, Alignment);
138   }
139 
delete(void * Ptr,BumpPtrAllocator & Alloc,size_t Size)140   void operator delete(void *Ptr, BumpPtrAllocator &Alloc,
141                        size_t Size) noexcept {
142     Alloc.Deallocate(Ptr, Size);
143   }
144 
145   void operator delete(void *) noexcept = delete;
146 
147   /// Get the value of the anchor attached to this node. If it does not
148   ///        have one, getAnchor().size() will be 0.
getAnchor()149   StringRef getAnchor() const { return Anchor; }
150 
151   /// Get the tag as it was written in the document. This does not
152   ///   perform tag resolution.
getRawTag()153   StringRef getRawTag() const { return Tag; }
154 
155   /// Get the verbatium tag for a given Node. This performs tag resoluton
156   ///   and substitution.
157   std::string getVerbatimTag() const;
158 
getSourceRange()159   SMRange getSourceRange() const { return SourceRange; }
setSourceRange(SMRange SR)160   void setSourceRange(SMRange SR) { SourceRange = SR; }
161 
162   // These functions forward to Document and Scanner.
163   Token &peekNext();
164   Token getNext();
165   Node *parseBlockNode();
166   BumpPtrAllocator &getAllocator();
167   void setError(const Twine &Message, Token &Location) const;
168   bool failed() const;
169 
skip()170   virtual void skip() {}
171 
getType()172   unsigned int getType() const { return TypeID; }
173 
174 protected:
175   std::unique_ptr<Document> &Doc;
176   SMRange SourceRange;
177 
178   ~Node() = default;
179 
180 private:
181   unsigned int TypeID;
182   StringRef Anchor;
183   /// The tag as typed in the document.
184   StringRef Tag;
185 };
186 
187 /// A null value.
188 ///
189 /// Example:
190 ///   !!null null
191 class NullNode final : public Node {
192   void anchor() override;
193 
194 public:
NullNode(std::unique_ptr<Document> & D)195   NullNode(std::unique_ptr<Document> &D)
196       : Node(NK_Null, D, StringRef(), StringRef()) {}
197 
classof(const Node * N)198   static bool classof(const Node *N) { return N->getType() == NK_Null; }
199 };
200 
201 /// A scalar node is an opaque datum that can be presented as a
202 ///        series of zero or more Unicode scalar values.
203 ///
204 /// Example:
205 ///   Adena
206 class ScalarNode final : public Node {
207   void anchor() override;
208 
209 public:
ScalarNode(std::unique_ptr<Document> & D,StringRef Anchor,StringRef Tag,StringRef Val)210   ScalarNode(std::unique_ptr<Document> &D, StringRef Anchor, StringRef Tag,
211              StringRef Val)
212       : Node(NK_Scalar, D, Anchor, Tag), Value(Val) {
213     SMLoc Start = SMLoc::getFromPointer(Val.begin());
214     SMLoc End = SMLoc::getFromPointer(Val.end());
215     SourceRange = SMRange(Start, End);
216   }
217 
218   // Return Value without any escaping or folding or other fun YAML stuff. This
219   // is the exact bytes that are contained in the file (after conversion to
220   // utf8).
getRawValue()221   StringRef getRawValue() const { return Value; }
222 
223   /// Gets the value of this node as a StringRef.
224   ///
225   /// \param Storage is used to store the content of the returned StringRef iff
226   ///        it requires any modification from how it appeared in the source.
227   ///        This happens with escaped characters and multi-line literals.
228   StringRef getValue(SmallVectorImpl<char> &Storage) const;
229 
classof(const Node * N)230   static bool classof(const Node *N) {
231     return N->getType() == NK_Scalar;
232   }
233 
234 private:
235   StringRef Value;
236 
237   StringRef unescapeDoubleQuoted(StringRef UnquotedValue,
238                                  StringRef::size_type Start,
239                                  SmallVectorImpl<char> &Storage) const;
240 };
241 
242 /// A block scalar node is an opaque datum that can be presented as a
243 ///        series of zero or more Unicode scalar values.
244 ///
245 /// Example:
246 ///   |
247 ///     Hello
248 ///     World
249 class BlockScalarNode final : public Node {
250   void anchor() override;
251 
252 public:
BlockScalarNode(std::unique_ptr<Document> & D,StringRef Anchor,StringRef Tag,StringRef Value,StringRef RawVal)253   BlockScalarNode(std::unique_ptr<Document> &D, StringRef Anchor, StringRef Tag,
254                   StringRef Value, StringRef RawVal)
255       : Node(NK_BlockScalar, D, Anchor, Tag), Value(Value) {
256     SMLoc Start = SMLoc::getFromPointer(RawVal.begin());
257     SMLoc End = SMLoc::getFromPointer(RawVal.end());
258     SourceRange = SMRange(Start, End);
259   }
260 
261   /// Gets the value of this node as a StringRef.
getValue()262   StringRef getValue() const { return Value; }
263 
classof(const Node * N)264   static bool classof(const Node *N) {
265     return N->getType() == NK_BlockScalar;
266   }
267 
268 private:
269   StringRef Value;
270 };
271 
272 /// A key and value pair. While not technically a Node under the YAML
273 ///        representation graph, it is easier to treat them this way.
274 ///
275 /// TODO: Consider making this not a child of Node.
276 ///
277 /// Example:
278 ///   Section: .text
279 class KeyValueNode final : public Node {
280   void anchor() override;
281 
282 public:
KeyValueNode(std::unique_ptr<Document> & D)283   KeyValueNode(std::unique_ptr<Document> &D)
284       : Node(NK_KeyValue, D, StringRef(), StringRef()) {}
285 
286   /// Parse and return the key.
287   ///
288   /// This may be called multiple times.
289   ///
290   /// \returns The key, or nullptr if failed() == true.
291   Node *getKey();
292 
293   /// Parse and return the value.
294   ///
295   /// This may be called multiple times.
296   ///
297   /// \returns The value, or nullptr if failed() == true.
298   Node *getValue();
299 
skip()300   void skip() override {
301     if (Node *Key = getKey()) {
302       Key->skip();
303       if (Node *Val = getValue())
304         Val->skip();
305     }
306   }
307 
classof(const Node * N)308   static bool classof(const Node *N) {
309     return N->getType() == NK_KeyValue;
310   }
311 
312 private:
313   Node *Key = nullptr;
314   Node *Value = nullptr;
315 };
316 
317 /// This is an iterator abstraction over YAML collections shared by both
318 ///        sequences and maps.
319 ///
320 /// BaseT must have a ValueT* member named CurrentEntry and a member function
321 /// increment() which must set CurrentEntry to 0 to create an end iterator.
322 template <class BaseT, class ValueT>
323 class basic_collection_iterator {
324 public:
325   using iterator_category = std::input_iterator_tag;
326   using value_type = ValueT;
327   using difference_type = std::ptrdiff_t;
328   using pointer = ValueT*;
329   using reference = ValueT&;
330 
331   basic_collection_iterator() = default;
basic_collection_iterator(BaseT * B)332   basic_collection_iterator(BaseT *B) : Base(B) {}
333 
334   ValueT *operator->() const {
335     assert(Base && Base->CurrentEntry && "Attempted to access end iterator!");
336     return Base->CurrentEntry;
337   }
338 
339   ValueT &operator*() const {
340     assert(Base && Base->CurrentEntry &&
341            "Attempted to dereference end iterator!");
342     return *Base->CurrentEntry;
343   }
344 
345   operator ValueT *() const {
346     assert(Base && Base->CurrentEntry && "Attempted to access end iterator!");
347     return Base->CurrentEntry;
348   }
349 
350   /// Note on EqualityComparable:
351   ///
352   /// The iterator is not re-entrant,
353   /// it is meant to be used for parsing YAML on-demand
354   /// Once iteration started - it can point only to one entry at a time
355   /// hence Base.CurrentEntry and Other.Base.CurrentEntry are equal
356   /// iff Base and Other.Base are equal.
357   bool operator==(const basic_collection_iterator &Other) const {
358     if (Base && (Base == Other.Base)) {
359       assert((Base->CurrentEntry == Other.Base->CurrentEntry)
360              && "Equal Bases expected to point to equal Entries");
361     }
362 
363     return Base == Other.Base;
364   }
365 
366   bool operator!=(const basic_collection_iterator &Other) const {
367     return !(Base == Other.Base);
368   }
369 
370   basic_collection_iterator &operator++() {
371     assert(Base && "Attempted to advance iterator past end!");
372     Base->increment();
373     // Create an end iterator.
374     if (!Base->CurrentEntry)
375       Base = nullptr;
376     return *this;
377   }
378 
379 private:
380   BaseT *Base = nullptr;
381 };
382 
383 // The following two templates are used for both MappingNode and Sequence Node.
384 template <class CollectionType>
begin(CollectionType & C)385 typename CollectionType::iterator begin(CollectionType &C) {
386   assert(C.IsAtBeginning && "You may only iterate over a collection once!");
387   C.IsAtBeginning = false;
388   typename CollectionType::iterator ret(&C);
389   ++ret;
390   return ret;
391 }
392 
skip(CollectionType & C)393 template <class CollectionType> void skip(CollectionType &C) {
394   // TODO: support skipping from the middle of a parsed collection ;/
395   assert((C.IsAtBeginning || C.IsAtEnd) && "Cannot skip mid parse!");
396   if (C.IsAtBeginning)
397     for (typename CollectionType::iterator i = begin(C), e = C.end(); i != e;
398          ++i)
399       i->skip();
400 }
401 
402 /// Represents a YAML map created from either a block map for a flow map.
403 ///
404 /// This parses the YAML stream as increment() is called.
405 ///
406 /// Example:
407 ///   Name: _main
408 ///   Scope: Global
409 class MappingNode final : public Node {
410   void anchor() override;
411 
412 public:
413   enum MappingType {
414     MT_Block,
415     MT_Flow,
416     MT_Inline ///< An inline mapping node is used for "[key: value]".
417   };
418 
MappingNode(std::unique_ptr<Document> & D,StringRef Anchor,StringRef Tag,MappingType MT)419   MappingNode(std::unique_ptr<Document> &D, StringRef Anchor, StringRef Tag,
420               MappingType MT)
421       : Node(NK_Mapping, D, Anchor, Tag), Type(MT) {}
422 
423   friend class basic_collection_iterator<MappingNode, KeyValueNode>;
424 
425   using iterator = basic_collection_iterator<MappingNode, KeyValueNode>;
426 
427   template <class T> friend typename T::iterator yaml::begin(T &);
428   template <class T> friend void yaml::skip(T &);
429 
begin()430   iterator begin() { return yaml::begin(*this); }
431 
end()432   iterator end() { return iterator(); }
433 
skip()434   void skip() override { yaml::skip(*this); }
435 
classof(const Node * N)436   static bool classof(const Node *N) {
437     return N->getType() == NK_Mapping;
438   }
439 
440 private:
441   MappingType Type;
442   bool IsAtBeginning = true;
443   bool IsAtEnd = false;
444   KeyValueNode *CurrentEntry = nullptr;
445 
446   void increment();
447 };
448 
449 /// Represents a YAML sequence created from either a block sequence for a
450 ///        flow sequence.
451 ///
452 /// This parses the YAML stream as increment() is called.
453 ///
454 /// Example:
455 ///   - Hello
456 ///   - World
457 class SequenceNode final : public Node {
458   void anchor() override;
459 
460 public:
461   enum SequenceType {
462     ST_Block,
463     ST_Flow,
464     // Use for:
465     //
466     // key:
467     // - val1
468     // - val2
469     //
470     // As a BlockMappingEntry and BlockEnd are not created in this case.
471     ST_Indentless
472   };
473 
SequenceNode(std::unique_ptr<Document> & D,StringRef Anchor,StringRef Tag,SequenceType ST)474   SequenceNode(std::unique_ptr<Document> &D, StringRef Anchor, StringRef Tag,
475                SequenceType ST)
476       : Node(NK_Sequence, D, Anchor, Tag), SeqType(ST) {}
477 
478   friend class basic_collection_iterator<SequenceNode, Node>;
479 
480   using iterator = basic_collection_iterator<SequenceNode, Node>;
481 
482   template <class T> friend typename T::iterator yaml::begin(T &);
483   template <class T> friend void yaml::skip(T &);
484 
485   void increment();
486 
begin()487   iterator begin() { return yaml::begin(*this); }
488 
end()489   iterator end() { return iterator(); }
490 
skip()491   void skip() override { yaml::skip(*this); }
492 
classof(const Node * N)493   static bool classof(const Node *N) {
494     return N->getType() == NK_Sequence;
495   }
496 
497 private:
498   SequenceType SeqType;
499   bool IsAtBeginning = true;
500   bool IsAtEnd = false;
501   bool WasPreviousTokenFlowEntry = true; // Start with an imaginary ','.
502   Node *CurrentEntry = nullptr;
503 };
504 
505 /// Represents an alias to a Node with an anchor.
506 ///
507 /// Example:
508 ///   *AnchorName
509 class AliasNode final : public Node {
510   void anchor() override;
511 
512 public:
AliasNode(std::unique_ptr<Document> & D,StringRef Val)513   AliasNode(std::unique_ptr<Document> &D, StringRef Val)
514       : Node(NK_Alias, D, StringRef(), StringRef()), Name(Val) {}
515 
getName()516   StringRef getName() const { return Name; }
517   Node *getTarget();
518 
classof(const Node * N)519   static bool classof(const Node *N) { return N->getType() == NK_Alias; }
520 
521 private:
522   StringRef Name;
523 };
524 
525 /// A YAML Stream is a sequence of Documents. A document contains a root
526 ///        node.
527 class Document {
528 public:
529   Document(Stream &ParentStream);
530 
531   /// Root for parsing a node. Returns a single node.
532   Node *parseBlockNode();
533 
534   /// Finish parsing the current document and return true if there are
535   ///        more. Return false otherwise.
536   bool skip();
537 
538   /// Parse and return the root level node.
getRoot()539   Node *getRoot() {
540     if (Root)
541       return Root;
542     return Root = parseBlockNode();
543   }
544 
getTagMap()545   const std::map<StringRef, StringRef> &getTagMap() const { return TagMap; }
546 
547 private:
548   friend class Node;
549   friend class document_iterator;
550 
551   /// Stream to read tokens from.
552   Stream &stream;
553 
554   /// Used to allocate nodes to. All are destroyed without calling their
555   ///        destructor when the document is destroyed.
556   BumpPtrAllocator NodeAllocator;
557 
558   /// The root node. Used to support skipping a partially parsed
559   ///        document.
560   Node *Root;
561 
562   /// Maps tag prefixes to their expansion.
563   std::map<StringRef, StringRef> TagMap;
564 
565   Token &peekNext();
566   Token getNext();
567   void setError(const Twine &Message, Token &Location) const;
568   bool failed() const;
569 
570   /// Parse %BLAH directives and return true if any were encountered.
571   bool parseDirectives();
572 
573   /// Parse %YAML
574   void parseYAMLDirective();
575 
576   /// Parse %TAG
577   void parseTAGDirective();
578 
579   /// Consume the next token and error if it is not \a TK.
580   bool expectToken(int TK);
581 };
582 
583 /// Iterator abstraction for Documents over a Stream.
584 class document_iterator {
585 public:
586   document_iterator() = default;
document_iterator(std::unique_ptr<Document> & D)587   document_iterator(std::unique_ptr<Document> &D) : Doc(&D) {}
588 
589   bool operator==(const document_iterator &Other) const {
590     if (isAtEnd() || Other.isAtEnd())
591       return isAtEnd() && Other.isAtEnd();
592 
593     return Doc == Other.Doc;
594   }
595   bool operator!=(const document_iterator &Other) const {
596     return !(*this == Other);
597   }
598 
599   document_iterator operator++() {
600     assert(Doc && "incrementing iterator past the end.");
601     if (!(*Doc)->skip()) {
602       Doc->reset(nullptr);
603     } else {
604       Stream &S = (*Doc)->stream;
605       Doc->reset(new Document(S));
606     }
607     return *this;
608   }
609 
610   Document &operator*() { return *Doc->get(); }
611 
612   std::unique_ptr<Document> &operator->() { return *Doc; }
613 
614 private:
isAtEnd()615   bool isAtEnd() const { return !Doc || !*Doc; }
616 
617   std::unique_ptr<Document> *Doc = nullptr;
618 };
619 
620 } // end namespace yaml
621 
622 } // end namespace llvm
623 
624 #endif // LLVM_SUPPORT_YAMLPARSER_H
625