• Home
  • Line#
  • Scopes#
  • Navigate#
  • Raw
  • Download
//===--- CommentLexer.h - Lexer for structured comments ---------*- C++ -*-===//
//
//                     The LLVM Compiler Infrastructure
//
// This file is distributed under the University of Illinois Open Source
// License. See LICENSE.TXT for details.
//
//===----------------------------------------------------------------------===//
//
//  This file defines the lexer for structured comments and the supporting
//  token class.
//
//===----------------------------------------------------------------------===//
13 
14 #ifndef LLVM_CLANG_AST_COMMENT_LEXER_H
15 #define LLVM_CLANG_AST_COMMENT_LEXER_H
16 
17 #include "clang/Basic/SourceManager.h"
18 #include "llvm/ADT/SmallString.h"
19 #include "llvm/ADT/SmallVector.h"
20 #include "llvm/ADT/StringRef.h"
21 #include "llvm/Support/Allocator.h"
22 #include "llvm/Support/raw_ostream.h"
23 
24 namespace clang {
25 namespace comments {
26 
27 class Lexer;
28 class TextTokenRetokenizer;
29 struct CommandInfo;
30 class CommandTraits;
31 
namespace tok {
/// \brief Kinds of comment tokens.
///
/// The order of the enumerators is part of the interface: values are assigned
/// implicitly starting from zero.
enum TokenKind {
  eof,
  newline,
  text,              // Carries a text value.
  unknown_command,   // Command that does not have an ID.
  backslash_command, // Command with an ID, that used backslash marker.
  at_command,        // Command with an ID, that used 'at' marker.
  verbatim_block_begin, // Carries a verbatim block ID.
  verbatim_block_line,  // Carries a text value.
  verbatim_block_end,   // Carries a verbatim block ID.
  verbatim_line_name,   // Carries a verbatim line ID.
  verbatim_line_text,   // Carries a text value.
  html_start_tag,     // <tag
  html_ident,         // attr
  html_equals,        // =
  html_quoted_string, // "blah\"blah" or 'blah\'blah'
  html_greater,       // >
  html_slash_greater, // />
  html_end_tag        // </tag
};
} // end namespace tok
54 
55 /// \brief Comment token.
56 class Token {
57   friend class Lexer;
58   friend class TextTokenRetokenizer;
59 
60   /// The location of the token.
61   SourceLocation Loc;
62 
63   /// The actual kind of the token.
64   tok::TokenKind Kind;
65 
66   /// Length of the token spelling in comment.  Can be 0 for synthenized
67   /// tokens.
68   unsigned Length;
69 
70   /// Contains text value associated with a token.
71   const char *TextPtr;
72 
73   /// Integer value associated with a token.
74   ///
75   /// If the token is a konwn command, contains command ID and TextPtr is
76   /// unused (command spelling can be found with CommandTraits).  Otherwise,
77   /// contains the length of the string that starts at TextPtr.
78   unsigned IntVal;
79 
80 public:
getLocation()81   SourceLocation getLocation() const LLVM_READONLY { return Loc; }
setLocation(SourceLocation SL)82   void setLocation(SourceLocation SL) { Loc = SL; }
83 
getEndLocation()84   SourceLocation getEndLocation() const LLVM_READONLY {
85     if (Length == 0 || Length == 1)
86       return Loc;
87     return Loc.getLocWithOffset(Length - 1);
88   }
89 
getKind()90   tok::TokenKind getKind() const LLVM_READONLY { return Kind; }
setKind(tok::TokenKind K)91   void setKind(tok::TokenKind K) { Kind = K; }
92 
is(tok::TokenKind K)93   bool is(tok::TokenKind K) const LLVM_READONLY { return Kind == K; }
isNot(tok::TokenKind K)94   bool isNot(tok::TokenKind K) const LLVM_READONLY { return Kind != K; }
95 
getLength()96   unsigned getLength() const LLVM_READONLY { return Length; }
setLength(unsigned L)97   void setLength(unsigned L) { Length = L; }
98 
getText()99   StringRef getText() const LLVM_READONLY {
100     assert(is(tok::text));
101     return StringRef(TextPtr, IntVal);
102   }
103 
setText(StringRef Text)104   void setText(StringRef Text) {
105     assert(is(tok::text));
106     TextPtr = Text.data();
107     IntVal = Text.size();
108   }
109 
getUnknownCommandName()110   StringRef getUnknownCommandName() const LLVM_READONLY {
111     assert(is(tok::unknown_command));
112     return StringRef(TextPtr, IntVal);
113   }
114 
setUnknownCommandName(StringRef Name)115   void setUnknownCommandName(StringRef Name) {
116     assert(is(tok::unknown_command));
117     TextPtr = Name.data();
118     IntVal = Name.size();
119   }
120 
getCommandID()121   unsigned getCommandID() const LLVM_READONLY {
122     assert(is(tok::backslash_command) || is(tok::at_command));
123     return IntVal;
124   }
125 
setCommandID(unsigned ID)126   void setCommandID(unsigned ID) {
127     assert(is(tok::backslash_command) || is(tok::at_command));
128     IntVal = ID;
129   }
130 
getVerbatimBlockID()131   unsigned getVerbatimBlockID() const LLVM_READONLY {
132     assert(is(tok::verbatim_block_begin) || is(tok::verbatim_block_end));
133     return IntVal;
134   }
135 
setVerbatimBlockID(unsigned ID)136   void setVerbatimBlockID(unsigned ID) {
137     assert(is(tok::verbatim_block_begin) || is(tok::verbatim_block_end));
138     IntVal = ID;
139   }
140 
getVerbatimBlockText()141   StringRef getVerbatimBlockText() const LLVM_READONLY {
142     assert(is(tok::verbatim_block_line));
143     return StringRef(TextPtr, IntVal);
144   }
145 
setVerbatimBlockText(StringRef Text)146   void setVerbatimBlockText(StringRef Text) {
147     assert(is(tok::verbatim_block_line));
148     TextPtr = Text.data();
149     IntVal = Text.size();
150   }
151 
getVerbatimLineID()152   unsigned getVerbatimLineID() const LLVM_READONLY {
153     assert(is(tok::verbatim_line_name));
154     return IntVal;
155   }
156 
setVerbatimLineID(unsigned ID)157   void setVerbatimLineID(unsigned ID) {
158     assert(is(tok::verbatim_line_name));
159     IntVal = ID;
160   }
161 
getVerbatimLineText()162   StringRef getVerbatimLineText() const LLVM_READONLY {
163     assert(is(tok::verbatim_line_text));
164     return StringRef(TextPtr, IntVal);
165   }
166 
setVerbatimLineText(StringRef Text)167   void setVerbatimLineText(StringRef Text) {
168     assert(is(tok::verbatim_line_text));
169     TextPtr = Text.data();
170     IntVal = Text.size();
171   }
172 
getHTMLTagStartName()173   StringRef getHTMLTagStartName() const LLVM_READONLY {
174     assert(is(tok::html_start_tag));
175     return StringRef(TextPtr, IntVal);
176   }
177 
setHTMLTagStartName(StringRef Name)178   void setHTMLTagStartName(StringRef Name) {
179     assert(is(tok::html_start_tag));
180     TextPtr = Name.data();
181     IntVal = Name.size();
182   }
183 
getHTMLIdent()184   StringRef getHTMLIdent() const LLVM_READONLY {
185     assert(is(tok::html_ident));
186     return StringRef(TextPtr, IntVal);
187   }
188 
setHTMLIdent(StringRef Name)189   void setHTMLIdent(StringRef Name) {
190     assert(is(tok::html_ident));
191     TextPtr = Name.data();
192     IntVal = Name.size();
193   }
194 
getHTMLQuotedString()195   StringRef getHTMLQuotedString() const LLVM_READONLY {
196     assert(is(tok::html_quoted_string));
197     return StringRef(TextPtr, IntVal);
198   }
199 
setHTMLQuotedString(StringRef Str)200   void setHTMLQuotedString(StringRef Str) {
201     assert(is(tok::html_quoted_string));
202     TextPtr = Str.data();
203     IntVal = Str.size();
204   }
205 
getHTMLTagEndName()206   StringRef getHTMLTagEndName() const LLVM_READONLY {
207     assert(is(tok::html_end_tag));
208     return StringRef(TextPtr, IntVal);
209   }
210 
setHTMLTagEndName(StringRef Name)211   void setHTMLTagEndName(StringRef Name) {
212     assert(is(tok::html_end_tag));
213     TextPtr = Name.data();
214     IntVal = Name.size();
215   }
216 
217   void dump(const Lexer &L, const SourceManager &SM) const;
218 };
219 
220 /// \brief Comment lexer.
221 class Lexer {
222 private:
223   Lexer(const Lexer &) LLVM_DELETED_FUNCTION;
224   void operator=(const Lexer &) LLVM_DELETED_FUNCTION;
225 
226   /// Allocator for strings that are semantic values of tokens and have to be
227   /// computed (for example, resolved decimal character references).
228   llvm::BumpPtrAllocator &Allocator;
229 
230   const CommandTraits &Traits;
231 
232   const char *const BufferStart;
233   const char *const BufferEnd;
234   SourceLocation FileLoc;
235 
236   const char *BufferPtr;
237 
238   /// One past end pointer for the current comment.  For BCPL comments points
239   /// to newline or BufferEnd, for C comments points to star in '*/'.
240   const char *CommentEnd;
241 
242   enum LexerCommentState {
243     LCS_BeforeComment,
244     LCS_InsideBCPLComment,
245     LCS_InsideCComment,
246     LCS_BetweenComments
247   };
248 
249   /// Low-level lexer state, track if we are inside or outside of comment.
250   LexerCommentState CommentState;
251 
252   enum LexerState {
253     /// Lexing normal comment text
254     LS_Normal,
255 
256     /// Finished lexing verbatim block beginning command, will lex first body
257     /// line.
258     LS_VerbatimBlockFirstLine,
259 
260     /// Lexing verbatim block body line-by-line, skipping line-starting
261     /// decorations.
262     LS_VerbatimBlockBody,
263 
264     /// Finished lexing verbatim line beginning command, will lex text (one
265     /// line).
266     LS_VerbatimLineText,
267 
268     /// Finished lexing \verbatim <TAG \endverbatim part, lexing tag attributes.
269     LS_HTMLStartTag,
270 
271     /// Finished lexing \verbatim </TAG \endverbatim part, lexing '>'.
272     LS_HTMLEndTag
273   };
274 
275   /// Current lexing mode.
276   LexerState State;
277 
278   /// If State is LS_VerbatimBlock, contains the name of verbatim end
279   /// command, including command marker.
280   SmallString<16> VerbatimBlockEndCommandName;
281 
282   /// Given a character reference name (e.g., "lt"), return the character that
283   /// it stands for (e.g., "<").
284   StringRef resolveHTMLNamedCharacterReference(StringRef Name) const;
285 
286   /// Given a Unicode codepoint as base-10 integer, return the character.
287   StringRef resolveHTMLDecimalCharacterReference(StringRef Name) const;
288 
289   /// Given a Unicode codepoint as base-16 integer, return the character.
290   StringRef resolveHTMLHexCharacterReference(StringRef Name) const;
291 
formTokenWithChars(Token & Result,const char * TokEnd,tok::TokenKind Kind)292   void formTokenWithChars(Token &Result, const char *TokEnd,
293                           tok::TokenKind Kind) {
294     const unsigned TokLen = TokEnd - BufferPtr;
295     Result.setLocation(getSourceLocation(BufferPtr));
296     Result.setKind(Kind);
297     Result.setLength(TokLen);
298 #ifndef NDEBUG
299     Result.TextPtr = "<UNSET>";
300     Result.IntVal = 7;
301 #endif
302     BufferPtr = TokEnd;
303   }
304 
formTextToken(Token & Result,const char * TokEnd)305   void formTextToken(Token &Result, const char *TokEnd) {
306     StringRef Text(BufferPtr, TokEnd - BufferPtr);
307     formTokenWithChars(Result, TokEnd, tok::text);
308     Result.setText(Text);
309   }
310 
getSourceLocation(const char * Loc)311   SourceLocation getSourceLocation(const char *Loc) const {
312     assert(Loc >= BufferStart && Loc <= BufferEnd &&
313            "Location out of range for this buffer!");
314 
315     const unsigned CharNo = Loc - BufferStart;
316     return FileLoc.getLocWithOffset(CharNo);
317   }
318 
319   /// Eat string matching regexp \code \s*\* \endcode.
320   void skipLineStartingDecorations();
321 
322   /// Lex stuff inside comments.  CommentEnd should be set correctly.
323   void lexCommentText(Token &T);
324 
325   void setupAndLexVerbatimBlock(Token &T,
326                                 const char *TextBegin,
327                                 char Marker, const CommandInfo *Info);
328 
329   void lexVerbatimBlockFirstLine(Token &T);
330 
331   void lexVerbatimBlockBody(Token &T);
332 
333   void setupAndLexVerbatimLine(Token &T, const char *TextBegin,
334                                const CommandInfo *Info);
335 
336   void lexVerbatimLineText(Token &T);
337 
338   void lexHTMLCharacterReference(Token &T);
339 
340   void setupAndLexHTMLStartTag(Token &T);
341 
342   void lexHTMLStartTag(Token &T);
343 
344   void setupAndLexHTMLEndTag(Token &T);
345 
346   void lexHTMLEndTag(Token &T);
347 
348 public:
349   Lexer(llvm::BumpPtrAllocator &Allocator, const CommandTraits &Traits,
350         SourceLocation FileLoc,
351         const char *BufferStart, const char *BufferEnd);
352 
353   void lex(Token &T);
354 
355   StringRef getSpelling(const Token &Tok,
356                         const SourceManager &SourceMgr,
357                         bool *Invalid = NULL) const;
358 };
359 
360 } // end namespace comments
361 } // end namespace clang
362 
363 #endif
364 
365