1 //===- Tree.h - structure of the syntax tree ------------------*- C++ -*-=====// 2 // 3 // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. 4 // See https://llvm.org/LICENSE.txt for license information. 5 // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception 6 // 7 //===----------------------------------------------------------------------===// 8 // Defines the basic structure of the syntax tree. There are two kinds of nodes: 9 // - leaf nodes correspond to a token in the expanded token stream, 10 // - tree nodes correspond to language grammar constructs. 11 // 12 // The tree is initially built from an AST. Each node of a newly built tree 13 // covers a continous subrange of expanded tokens (i.e. tokens after 14 // preprocessing), the specific tokens coverered are stored in the leaf nodes of 15 // a tree. A post-order traversal of a tree will visit leaf nodes in an order 16 // corresponding the original order of expanded tokens. 17 // 18 // This is still work in progress and highly experimental, we leave room for 19 // ourselves to completely change the design and/or implementation. 20 //===----------------------------------------------------------------------===// 21 #ifndef LLVM_CLANG_TOOLING_SYNTAX_TREE_CASCADE_H 22 #define LLVM_CLANG_TOOLING_SYNTAX_TREE_CASCADE_H 23 24 #include "clang/Basic/LangOptions.h" 25 #include "clang/Basic/SourceLocation.h" 26 #include "clang/Basic/SourceManager.h" 27 #include "clang/Basic/TokenKinds.h" 28 #include "clang/Tooling/Syntax/Tokens.h" 29 #include "llvm/ADT/ArrayRef.h" 30 #include "llvm/ADT/DenseMap.h" 31 #include "llvm/ADT/iterator.h" 32 #include "llvm/Support/Allocator.h" 33 #include <cstdint> 34 #include <iterator> 35 36 namespace clang { 37 namespace syntax { 38 39 /// A memory arena for syntax trees. Also tracks the underlying token buffers, 40 /// source manager, etc. 41 class Arena { 42 public: 43 Arena(SourceManager &SourceMgr, const LangOptions &LangOpts, 44 const TokenBuffer &Tokens); 45 getSourceManager()46 const SourceManager &getSourceManager() const { return SourceMgr; } getLangOptions()47 const LangOptions &getLangOptions() const { return LangOpts; } 48 49 const TokenBuffer &getTokenBuffer() const; getAllocator()50 llvm::BumpPtrAllocator &getAllocator() { return Allocator; } 51 52 private: 53 /// Add \p Buffer to the underlying source manager, tokenize it and store the 54 /// resulting tokens. Used exclusively in `FactoryImpl` to materialize tokens 55 /// that were not written in user code. 56 std::pair<FileID, ArrayRef<Token>> 57 lexBuffer(std::unique_ptr<llvm::MemoryBuffer> Buffer); 58 friend class FactoryImpl; 59 60 private: 61 SourceManager &SourceMgr; 62 const LangOptions &LangOpts; 63 const TokenBuffer &Tokens; 64 /// IDs and storage for additional tokenized files. 65 llvm::DenseMap<FileID, std::vector<Token>> ExtraTokens; 66 /// Keeps all the allocated nodes and their intermediate data structures. 67 llvm::BumpPtrAllocator Allocator; 68 }; 69 70 class Tree; 71 class TreeBuilder; 72 class FactoryImpl; 73 class MutationsImpl; 74 75 enum class NodeKind : uint16_t; 76 enum class NodeRole : uint8_t; 77 78 /// A node in a syntax tree. Each node is either a Leaf (representing tokens) or 79 /// a Tree (representing language constructrs). 80 class Node { 81 protected: 82 /// Newly created nodes are detached from a tree, parent and sibling links are 83 /// set when the node is added as a child to another one. 84 Node(NodeKind Kind); 85 /// Nodes are allocated on Arenas; the destructor is never called. 86 ~Node() = default; 87 88 public: 89 /// Nodes cannot simply be copied without violating tree invariants. 90 Node(const Node &) = delete; 91 Node &operator=(const Node &) = delete; 92 /// Idiomatically, nodes are allocated on an Arena and never moved. 93 Node(Node &&) = delete; 94 Node &operator=(Node &&) = delete; 95 getKind()96 NodeKind getKind() const { return static_cast<NodeKind>(Kind); } getRole()97 NodeRole getRole() const { return static_cast<NodeRole>(Role); } 98 99 /// Whether the node is detached from a tree, i.e. does not have a parent. 100 bool isDetached() const; 101 /// Whether the node was created from the AST backed by the source code 102 /// rather than added later through mutation APIs or created with factory 103 /// functions. 104 /// When this flag is true, all subtrees are also original. 105 /// This flag is set to false on any modifications to the node or any of its 106 /// subtrees, even if this simply involves swapping existing subtrees. isOriginal()107 bool isOriginal() const { return Original; } 108 /// If this function return false, the tree cannot be modified because there 109 /// is no reasonable way to produce the corresponding textual replacements. 110 /// This can happen when the node crosses macro expansion boundaries. 111 /// 112 /// Note that even if the node is not modifiable, its child nodes can be 113 /// modifiable. canModify()114 bool canModify() const { return CanModify; } 115 getParent()116 const Tree *getParent() const { return Parent; } getParent()117 Tree *getParent() { return Parent; } 118 getNextSibling()119 const Node *getNextSibling() const { return NextSibling; } getNextSibling()120 Node *getNextSibling() { return NextSibling; } getPreviousSibling()121 const Node *getPreviousSibling() const { return PreviousSibling; } getPreviousSibling()122 Node *getPreviousSibling() { return PreviousSibling; } 123 124 /// Dumps the structure of a subtree. For debugging and testing purposes. 125 std::string dump(const SourceManager &SM) const; 126 /// Dumps the tokens forming this subtree. 127 std::string dumpTokens(const SourceManager &SM) const; 128 129 /// Asserts invariants on this node of the tree and its immediate children. 130 /// Will not recurse into the subtree. No-op if NDEBUG is set. 131 void assertInvariants() const; 132 /// Runs checkInvariants on all nodes in the subtree. No-op if NDEBUG is set. 133 void assertInvariantsRecursive() const; 134 135 private: 136 // Tree is allowed to change the Parent link and Role. 137 friend class Tree; 138 // TreeBuilder is allowed to set the Original and CanModify flags. 139 friend class TreeBuilder; 140 // MutationsImpl sets roles and CanModify flag. 141 friend class MutationsImpl; 142 // FactoryImpl sets CanModify flag. 143 friend class FactoryImpl; 144 145 void setRole(NodeRole NR); 146 147 Tree *Parent; 148 Node *NextSibling; 149 Node *PreviousSibling; 150 unsigned Kind : 16; 151 unsigned Role : 8; 152 unsigned Original : 1; 153 unsigned CanModify : 1; 154 }; 155 156 /// A leaf node points to a single token inside the expanded token stream. 157 class Leaf final : public Node { 158 public: 159 Leaf(const Token *T); 160 static bool classof(const Node *N); 161 getToken()162 const Token *getToken() const { return Tok; } 163 164 private: 165 const Token *Tok; 166 }; 167 168 /// A node that has children and represents a syntactic language construct. 169 class Tree : public Node { 170 /// Iterator over children (common base for const/non-const). 171 /// Not invalidated by tree mutations (holds a stable node pointer). 172 template <typename DerivedT, typename NodeT> 173 class ChildIteratorBase 174 : public llvm::iterator_facade_base<DerivedT, std::forward_iterator_tag, 175 NodeT> { 176 protected: 177 NodeT *N = nullptr; 178 using Base = ChildIteratorBase; 179 180 public: 181 ChildIteratorBase() = default; ChildIteratorBase(NodeT * N)182 explicit ChildIteratorBase(NodeT *N) : N(N) {} 183 184 bool operator==(const DerivedT &O) const { return O.N == N; } 185 NodeT &operator*() const { return *N; } 186 DerivedT &operator++() { 187 N = N->getNextSibling(); 188 return *static_cast<DerivedT *>(this); 189 } 190 191 /// Truthy if valid (not past-the-end). 192 /// This allows: if (auto It = find_if(N.children(), ...) ) 193 explicit operator bool() const { return N != nullptr; } 194 /// The element, or nullptr if past-the-end. asPointer()195 NodeT *asPointer() const { return N; } 196 }; 197 198 public: 199 static bool classof(const Node *N); 200 getFirstChild()201 Node *getFirstChild() { return FirstChild; } getFirstChild()202 const Node *getFirstChild() const { return FirstChild; } getLastChild()203 Node *getLastChild() { return LastChild; } getLastChild()204 const Node *getLastChild() const { return LastChild; } 205 206 const Leaf *findFirstLeaf() const; findFirstLeaf()207 Leaf *findFirstLeaf() { 208 return const_cast<Leaf *>(const_cast<const Tree *>(this)->findFirstLeaf()); 209 } 210 211 const Leaf *findLastLeaf() const; findLastLeaf()212 Leaf *findLastLeaf() { 213 return const_cast<Leaf *>(const_cast<const Tree *>(this)->findLastLeaf()); 214 } 215 216 /// child_iterator is not invalidated by mutations. 217 struct ChildIterator : ChildIteratorBase<ChildIterator, Node> { 218 using Base::ChildIteratorBase; 219 }; 220 struct ConstChildIterator 221 : ChildIteratorBase<ConstChildIterator, const Node> { 222 using Base::ChildIteratorBase; 223 ConstChildIterator() = default; ConstChildIteratorConstChildIterator224 ConstChildIterator(const ChildIterator &I) : Base(I.asPointer()) {} 225 }; 226 getChildren()227 llvm::iterator_range<ChildIterator> getChildren() { 228 return {ChildIterator(getFirstChild()), ChildIterator()}; 229 } getChildren()230 llvm::iterator_range<ConstChildIterator> getChildren() const { 231 return {ConstChildIterator(getFirstChild()), ConstChildIterator()}; 232 } 233 234 /// Find the first node with a corresponding role. 235 const Node *findChild(NodeRole R) const; findChild(NodeRole R)236 Node *findChild(NodeRole R) { 237 return const_cast<Node *>(const_cast<const Tree *>(this)->findChild(R)); 238 } 239 240 protected: 241 using Node::Node; 242 243 private: 244 /// Append \p Child to the list of children and sets the parent pointer. 245 /// A very low-level operation that does not check any invariants, only used 246 /// by TreeBuilder and FactoryImpl. 247 /// EXPECTS: Role != Detached. 248 void appendChildLowLevel(Node *Child, NodeRole Role); 249 /// Similar but prepends. 250 void prependChildLowLevel(Node *Child, NodeRole Role); 251 252 /// Like the previous overloads, but does not set role for \p Child. 253 /// EXPECTS: Child->Role != Detached 254 void appendChildLowLevel(Node *Child); 255 void prependChildLowLevel(Node *Child); 256 friend class TreeBuilder; 257 friend class FactoryImpl; 258 259 /// Replace a range of children [Begin, End) with a list of 260 /// new nodes starting at \p New. 261 /// Only used by MutationsImpl to implement higher-level mutation operations. 262 /// (!) \p New can be null to model removal of the child range. 263 /// (!) \p End can be null to model one past the end. 264 /// (!) \p Begin can be null to model an append. 265 void replaceChildRangeLowLevel(Node *Begin, Node *End, Node *New); 266 friend class MutationsImpl; 267 268 Node *FirstChild = nullptr; 269 Node *LastChild = nullptr; 270 }; 271 272 // Provide missing non_const == const overload. 273 // iterator_facade_base requires == to be a member, but implicit conversions 274 // don't work on the LHS of a member operator. 275 inline bool operator==(const Tree::ConstChildIterator &A, 276 const Tree::ConstChildIterator &B) { 277 return A.operator==(B); 278 } 279 280 /// A list of Elements separated or terminated by a fixed token. 281 /// 282 /// This type models the following grammar construct: 283 /// delimited-list(element, delimiter, termination, canBeEmpty) 284 class List : public Tree { 285 public: 286 template <typename Element> struct ElementAndDelimiter { 287 Element *element; 288 Leaf *delimiter; 289 }; 290 291 enum class TerminationKind { 292 Terminated, 293 MaybeTerminated, 294 Separated, 295 }; 296 297 using Tree::Tree; 298 static bool classof(const Node *N); 299 /// Returns the elements and corresponding delimiters. Missing elements 300 /// and delimiters are represented as null pointers. 301 /// 302 /// For example, in a separated list: 303 /// "a, b, c" <=> [("a" , ","), ("b" , "," ), ("c" , null)] 304 /// "a, , c" <=> [("a" , ","), (null, "," ), ("c" , null)] 305 /// "a, b c" <=> [("a" , ","), ("b" , null), ("c" , null)] 306 /// "a, b," <=> [("a" , ","), ("b" , "," ), (null, null)] 307 /// 308 /// In a terminated or maybe-terminated list: 309 /// "a; b; c;" <=> [("a" , ";"), ("b" , ";" ), ("c" , ";" )] 310 /// "a; ; c;" <=> [("a" , ";"), (null, ";" ), ("c" , ";" )] 311 /// "a; b c;" <=> [("a" , ";"), ("b" , null), ("c" , ";" )] 312 /// "a; b; c" <=> [("a" , ";"), ("b" , ";" ), ("c" , null)] 313 std::vector<ElementAndDelimiter<Node>> getElementsAsNodesAndDelimiters(); 314 315 /// Returns the elements of the list. Missing elements are represented 316 /// as null pointers in the same way as in the return value of 317 /// `getElementsAsNodesAndDelimiters()`. 318 std::vector<Node *> getElementsAsNodes(); 319 320 // These can't be implemented with the information we have! 321 322 /// Returns the appropriate delimiter for this list. 323 /// 324 /// Useful for discovering the correct delimiter to use when adding 325 /// elements to empty or one-element lists. 326 clang::tok::TokenKind getDelimiterTokenKind() const; 327 328 TerminationKind getTerminationKind() const; 329 330 /// Whether this list can be empty in syntactically and semantically correct 331 /// code. 332 /// 333 /// This list may be empty when the source code has errors even if 334 /// canBeEmpty() returns false. 335 bool canBeEmpty() const; 336 }; 337 338 } // namespace syntax 339 } // namespace clang 340 341 #endif 342