1 //===--- CommentLexer.h - Lexer for structured comments ---------*- C++ -*-===// 2 // 3 // The LLVM Compiler Infrastructure 4 // 5 // This file is distributed under the University of Illinois Open Source 6 // License. See LICENSE.TXT for details. 7 // 8 //===----------------------------------------------------------------------===// 9 // 10 // This file defines lexer for structured comments and supporting token class. 11 // 12 //===----------------------------------------------------------------------===// 13 14 #ifndef LLVM_CLANG_AST_COMMENT_LEXER_H 15 #define LLVM_CLANG_AST_COMMENT_LEXER_H 16 17 #include "clang/Basic/SourceManager.h" 18 #include "llvm/ADT/SmallString.h" 19 #include "llvm/ADT/SmallVector.h" 20 #include "llvm/ADT/StringRef.h" 21 #include "llvm/Support/Allocator.h" 22 #include "llvm/Support/raw_ostream.h" 23 24 namespace clang { 25 namespace comments { 26 27 class Lexer; 28 class TextTokenRetokenizer; 29 struct CommandInfo; 30 class CommandTraits; 31 32 namespace tok { 33 enum TokenKind { 34 eof, 35 newline, 36 text, 37 unknown_command, // Command that does not have an ID. 38 backslash_command, // Command with an ID, that used backslash marker. 39 at_command, // Command with an ID, that used 'at' marker. 40 verbatim_block_begin, 41 verbatim_block_line, 42 verbatim_block_end, 43 verbatim_line_name, 44 verbatim_line_text, 45 html_start_tag, // <tag 46 html_ident, // attr 47 html_equals, // = 48 html_quoted_string, // "blah\"blah" or 'blah\'blah' 49 html_greater, // > 50 html_slash_greater, // /> 51 html_end_tag // </tag 52 }; 53 } // end namespace tok 54 55 /// \brief Comment token. 56 class Token { 57 friend class Lexer; 58 friend class TextTokenRetokenizer; 59 60 /// The location of the token. 61 SourceLocation Loc; 62 63 /// The actual kind of the token. 64 tok::TokenKind Kind; 65 66 /// Length of the token spelling in comment. Can be 0 for synthenized 67 /// tokens. 68 unsigned Length; 69 70 /// Contains text value associated with a token. 71 const char *TextPtr; 72 73 /// Integer value associated with a token. 74 /// 75 /// If the token is a konwn command, contains command ID and TextPtr is 76 /// unused (command spelling can be found with CommandTraits). Otherwise, 77 /// contains the length of the string that starts at TextPtr. 78 unsigned IntVal; 79 80 public: getLocation()81 SourceLocation getLocation() const LLVM_READONLY { return Loc; } setLocation(SourceLocation SL)82 void setLocation(SourceLocation SL) { Loc = SL; } 83 getEndLocation()84 SourceLocation getEndLocation() const LLVM_READONLY { 85 if (Length == 0 || Length == 1) 86 return Loc; 87 return Loc.getLocWithOffset(Length - 1); 88 } 89 getKind()90 tok::TokenKind getKind() const LLVM_READONLY { return Kind; } setKind(tok::TokenKind K)91 void setKind(tok::TokenKind K) { Kind = K; } 92 is(tok::TokenKind K)93 bool is(tok::TokenKind K) const LLVM_READONLY { return Kind == K; } isNot(tok::TokenKind K)94 bool isNot(tok::TokenKind K) const LLVM_READONLY { return Kind != K; } 95 getLength()96 unsigned getLength() const LLVM_READONLY { return Length; } setLength(unsigned L)97 void setLength(unsigned L) { Length = L; } 98 getText()99 StringRef getText() const LLVM_READONLY { 100 assert(is(tok::text)); 101 return StringRef(TextPtr, IntVal); 102 } 103 setText(StringRef Text)104 void setText(StringRef Text) { 105 assert(is(tok::text)); 106 TextPtr = Text.data(); 107 IntVal = Text.size(); 108 } 109 getUnknownCommandName()110 StringRef getUnknownCommandName() const LLVM_READONLY { 111 assert(is(tok::unknown_command)); 112 return StringRef(TextPtr, IntVal); 113 } 114 setUnknownCommandName(StringRef Name)115 void setUnknownCommandName(StringRef Name) { 116 assert(is(tok::unknown_command)); 117 TextPtr = Name.data(); 118 IntVal = Name.size(); 119 } 120 getCommandID()121 unsigned getCommandID() const LLVM_READONLY { 122 assert(is(tok::backslash_command) || is(tok::at_command)); 123 return IntVal; 124 } 125 setCommandID(unsigned ID)126 void setCommandID(unsigned ID) { 127 assert(is(tok::backslash_command) || is(tok::at_command)); 128 IntVal = ID; 129 } 130 getVerbatimBlockID()131 unsigned getVerbatimBlockID() const LLVM_READONLY { 132 assert(is(tok::verbatim_block_begin) || is(tok::verbatim_block_end)); 133 return IntVal; 134 } 135 setVerbatimBlockID(unsigned ID)136 void setVerbatimBlockID(unsigned ID) { 137 assert(is(tok::verbatim_block_begin) || is(tok::verbatim_block_end)); 138 IntVal = ID; 139 } 140 getVerbatimBlockText()141 StringRef getVerbatimBlockText() const LLVM_READONLY { 142 assert(is(tok::verbatim_block_line)); 143 return StringRef(TextPtr, IntVal); 144 } 145 setVerbatimBlockText(StringRef Text)146 void setVerbatimBlockText(StringRef Text) { 147 assert(is(tok::verbatim_block_line)); 148 TextPtr = Text.data(); 149 IntVal = Text.size(); 150 } 151 getVerbatimLineID()152 unsigned getVerbatimLineID() const LLVM_READONLY { 153 assert(is(tok::verbatim_line_name)); 154 return IntVal; 155 } 156 setVerbatimLineID(unsigned ID)157 void setVerbatimLineID(unsigned ID) { 158 assert(is(tok::verbatim_line_name)); 159 IntVal = ID; 160 } 161 getVerbatimLineText()162 StringRef getVerbatimLineText() const LLVM_READONLY { 163 assert(is(tok::verbatim_line_text)); 164 return StringRef(TextPtr, IntVal); 165 } 166 setVerbatimLineText(StringRef Text)167 void setVerbatimLineText(StringRef Text) { 168 assert(is(tok::verbatim_line_text)); 169 TextPtr = Text.data(); 170 IntVal = Text.size(); 171 } 172 getHTMLTagStartName()173 StringRef getHTMLTagStartName() const LLVM_READONLY { 174 assert(is(tok::html_start_tag)); 175 return StringRef(TextPtr, IntVal); 176 } 177 setHTMLTagStartName(StringRef Name)178 void setHTMLTagStartName(StringRef Name) { 179 assert(is(tok::html_start_tag)); 180 TextPtr = Name.data(); 181 IntVal = Name.size(); 182 } 183 getHTMLIdent()184 StringRef getHTMLIdent() const LLVM_READONLY { 185 assert(is(tok::html_ident)); 186 return StringRef(TextPtr, IntVal); 187 } 188 setHTMLIdent(StringRef Name)189 void setHTMLIdent(StringRef Name) { 190 assert(is(tok::html_ident)); 191 TextPtr = Name.data(); 192 IntVal = Name.size(); 193 } 194 getHTMLQuotedString()195 StringRef getHTMLQuotedString() const LLVM_READONLY { 196 assert(is(tok::html_quoted_string)); 197 return StringRef(TextPtr, IntVal); 198 } 199 setHTMLQuotedString(StringRef Str)200 void setHTMLQuotedString(StringRef Str) { 201 assert(is(tok::html_quoted_string)); 202 TextPtr = Str.data(); 203 IntVal = Str.size(); 204 } 205 getHTMLTagEndName()206 StringRef getHTMLTagEndName() const LLVM_READONLY { 207 assert(is(tok::html_end_tag)); 208 return StringRef(TextPtr, IntVal); 209 } 210 setHTMLTagEndName(StringRef Name)211 void setHTMLTagEndName(StringRef Name) { 212 assert(is(tok::html_end_tag)); 213 TextPtr = Name.data(); 214 IntVal = Name.size(); 215 } 216 217 void dump(const Lexer &L, const SourceManager &SM) const; 218 }; 219 220 /// \brief Comment lexer. 221 class Lexer { 222 private: 223 Lexer(const Lexer &) LLVM_DELETED_FUNCTION; 224 void operator=(const Lexer &) LLVM_DELETED_FUNCTION; 225 226 /// Allocator for strings that are semantic values of tokens and have to be 227 /// computed (for example, resolved decimal character references). 228 llvm::BumpPtrAllocator &Allocator; 229 230 const CommandTraits &Traits; 231 232 const char *const BufferStart; 233 const char *const BufferEnd; 234 SourceLocation FileLoc; 235 236 const char *BufferPtr; 237 238 /// One past end pointer for the current comment. For BCPL comments points 239 /// to newline or BufferEnd, for C comments points to star in '*/'. 240 const char *CommentEnd; 241 242 enum LexerCommentState { 243 LCS_BeforeComment, 244 LCS_InsideBCPLComment, 245 LCS_InsideCComment, 246 LCS_BetweenComments 247 }; 248 249 /// Low-level lexer state, track if we are inside or outside of comment. 250 LexerCommentState CommentState; 251 252 enum LexerState { 253 /// Lexing normal comment text 254 LS_Normal, 255 256 /// Finished lexing verbatim block beginning command, will lex first body 257 /// line. 258 LS_VerbatimBlockFirstLine, 259 260 /// Lexing verbatim block body line-by-line, skipping line-starting 261 /// decorations. 262 LS_VerbatimBlockBody, 263 264 /// Finished lexing verbatim line beginning command, will lex text (one 265 /// line). 266 LS_VerbatimLineText, 267 268 /// Finished lexing \verbatim <TAG \endverbatim part, lexing tag attributes. 269 LS_HTMLStartTag, 270 271 /// Finished lexing \verbatim </TAG \endverbatim part, lexing '>'. 272 LS_HTMLEndTag 273 }; 274 275 /// Current lexing mode. 276 LexerState State; 277 278 /// If State is LS_VerbatimBlock, contains the name of verbatim end 279 /// command, including command marker. 280 SmallString<16> VerbatimBlockEndCommandName; 281 282 /// Given a character reference name (e.g., "lt"), return the character that 283 /// it stands for (e.g., "<"). 284 StringRef resolveHTMLNamedCharacterReference(StringRef Name) const; 285 286 /// Given a Unicode codepoint as base-10 integer, return the character. 287 StringRef resolveHTMLDecimalCharacterReference(StringRef Name) const; 288 289 /// Given a Unicode codepoint as base-16 integer, return the character. 290 StringRef resolveHTMLHexCharacterReference(StringRef Name) const; 291 formTokenWithChars(Token & Result,const char * TokEnd,tok::TokenKind Kind)292 void formTokenWithChars(Token &Result, const char *TokEnd, 293 tok::TokenKind Kind) { 294 const unsigned TokLen = TokEnd - BufferPtr; 295 Result.setLocation(getSourceLocation(BufferPtr)); 296 Result.setKind(Kind); 297 Result.setLength(TokLen); 298 #ifndef NDEBUG 299 Result.TextPtr = "<UNSET>"; 300 Result.IntVal = 7; 301 #endif 302 BufferPtr = TokEnd; 303 } 304 formTextToken(Token & Result,const char * TokEnd)305 void formTextToken(Token &Result, const char *TokEnd) { 306 StringRef Text(BufferPtr, TokEnd - BufferPtr); 307 formTokenWithChars(Result, TokEnd, tok::text); 308 Result.setText(Text); 309 } 310 getSourceLocation(const char * Loc)311 SourceLocation getSourceLocation(const char *Loc) const { 312 assert(Loc >= BufferStart && Loc <= BufferEnd && 313 "Location out of range for this buffer!"); 314 315 const unsigned CharNo = Loc - BufferStart; 316 return FileLoc.getLocWithOffset(CharNo); 317 } 318 319 /// Eat string matching regexp \code \s*\* \endcode. 320 void skipLineStartingDecorations(); 321 322 /// Lex stuff inside comments. CommentEnd should be set correctly. 323 void lexCommentText(Token &T); 324 325 void setupAndLexVerbatimBlock(Token &T, 326 const char *TextBegin, 327 char Marker, const CommandInfo *Info); 328 329 void lexVerbatimBlockFirstLine(Token &T); 330 331 void lexVerbatimBlockBody(Token &T); 332 333 void setupAndLexVerbatimLine(Token &T, const char *TextBegin, 334 const CommandInfo *Info); 335 336 void lexVerbatimLineText(Token &T); 337 338 void lexHTMLCharacterReference(Token &T); 339 340 void setupAndLexHTMLStartTag(Token &T); 341 342 void lexHTMLStartTag(Token &T); 343 344 void setupAndLexHTMLEndTag(Token &T); 345 346 void lexHTMLEndTag(Token &T); 347 348 public: 349 Lexer(llvm::BumpPtrAllocator &Allocator, const CommandTraits &Traits, 350 SourceLocation FileLoc, 351 const char *BufferStart, const char *BufferEnd); 352 353 void lex(Token &T); 354 355 StringRef getSpelling(const Token &Tok, 356 const SourceManager &SourceMgr, 357 bool *Invalid = NULL) const; 358 }; 359 360 } // end namespace comments 361 } // end namespace clang 362 363 #endif 364 365