1 //===--- CommentLexer.h - Lexer for structured comments ---------*- C++ -*-===// 2 // 3 // The LLVM Compiler Infrastructure 4 // 5 // This file is distributed under the University of Illinois Open Source 6 // License. See LICENSE.TXT for details. 7 // 8 //===----------------------------------------------------------------------===// 9 // 10 // This file defines lexer for structured comments and supporting token class. 11 // 12 //===----------------------------------------------------------------------===// 13 14 #ifndef LLVM_CLANG_AST_COMMENT_LEXER_H 15 #define LLVM_CLANG_AST_COMMENT_LEXER_H 16 17 #include "clang/Basic/SourceManager.h" 18 #include "llvm/ADT/StringRef.h" 19 #include "llvm/ADT/SmallString.h" 20 #include "llvm/ADT/SmallVector.h" 21 #include "llvm/Support/Allocator.h" 22 #include "llvm/Support/raw_ostream.h" 23 24 namespace clang { 25 namespace comments { 26 27 class Lexer; 28 class TextTokenRetokenizer; 29 struct CommandInfo; 30 class CommandTraits; 31 32 namespace tok { 33 enum TokenKind { 34 eof, 35 newline, 36 text, 37 unknown_command, 38 command, 39 verbatim_block_begin, 40 verbatim_block_line, 41 verbatim_block_end, 42 verbatim_line_name, 43 verbatim_line_text, 44 html_start_tag, // <tag 45 html_ident, // attr 46 html_equals, // = 47 html_quoted_string, // "blah\"blah" or 'blah\'blah' 48 html_greater, // > 49 html_slash_greater, // /> 50 html_end_tag // </tag 51 }; 52 } // end namespace tok 53 54 /// \brief Comment token. 55 class Token { 56 friend class Lexer; 57 friend class TextTokenRetokenizer; 58 59 /// The location of the token. 60 SourceLocation Loc; 61 62 /// The actual kind of the token. 63 tok::TokenKind Kind; 64 65 /// Length of the token spelling in comment. Can be 0 for synthenized 66 /// tokens. 67 unsigned Length; 68 69 /// Contains text value associated with a token. 70 const char *TextPtr; 71 72 /// Integer value associated with a token. 73 /// 74 /// If the token is a konwn command, contains command ID and TextPtr is 75 /// unused (command spelling can be found with CommandTraits). Otherwise, 76 /// contains the length of the string that starts at TextPtr. 77 unsigned IntVal; 78 79 public: getLocation()80 SourceLocation getLocation() const LLVM_READONLY { return Loc; } setLocation(SourceLocation SL)81 void setLocation(SourceLocation SL) { Loc = SL; } 82 getEndLocation()83 SourceLocation getEndLocation() const LLVM_READONLY { 84 if (Length == 0 || Length == 1) 85 return Loc; 86 return Loc.getLocWithOffset(Length - 1); 87 } 88 getKind()89 tok::TokenKind getKind() const LLVM_READONLY { return Kind; } setKind(tok::TokenKind K)90 void setKind(tok::TokenKind K) { Kind = K; } 91 is(tok::TokenKind K)92 bool is(tok::TokenKind K) const LLVM_READONLY { return Kind == K; } isNot(tok::TokenKind K)93 bool isNot(tok::TokenKind K) const LLVM_READONLY { return Kind != K; } 94 getLength()95 unsigned getLength() const LLVM_READONLY { return Length; } setLength(unsigned L)96 void setLength(unsigned L) { Length = L; } 97 getText()98 StringRef getText() const LLVM_READONLY { 99 assert(is(tok::text)); 100 return StringRef(TextPtr, IntVal); 101 } 102 setText(StringRef Text)103 void setText(StringRef Text) { 104 assert(is(tok::text)); 105 TextPtr = Text.data(); 106 IntVal = Text.size(); 107 } 108 getUnknownCommandName()109 StringRef getUnknownCommandName() const LLVM_READONLY { 110 assert(is(tok::unknown_command)); 111 return StringRef(TextPtr, IntVal); 112 } 113 setUnknownCommandName(StringRef Name)114 void setUnknownCommandName(StringRef Name) { 115 assert(is(tok::unknown_command)); 116 TextPtr = Name.data(); 117 IntVal = Name.size(); 118 } 119 getCommandID()120 unsigned getCommandID() const LLVM_READONLY { 121 assert(is(tok::command)); 122 return IntVal; 123 } 124 setCommandID(unsigned ID)125 void setCommandID(unsigned ID) { 126 assert(is(tok::command)); 127 IntVal = ID; 128 } 129 getVerbatimBlockID()130 unsigned getVerbatimBlockID() const LLVM_READONLY { 131 assert(is(tok::verbatim_block_begin) || is(tok::verbatim_block_end)); 132 return IntVal; 133 } 134 setVerbatimBlockID(unsigned ID)135 void setVerbatimBlockID(unsigned ID) { 136 assert(is(tok::verbatim_block_begin) || is(tok::verbatim_block_end)); 137 IntVal = ID; 138 } 139 getVerbatimBlockText()140 StringRef getVerbatimBlockText() const LLVM_READONLY { 141 assert(is(tok::verbatim_block_line)); 142 return StringRef(TextPtr, IntVal); 143 } 144 setVerbatimBlockText(StringRef Text)145 void setVerbatimBlockText(StringRef Text) { 146 assert(is(tok::verbatim_block_line)); 147 TextPtr = Text.data(); 148 IntVal = Text.size(); 149 } 150 getVerbatimLineID()151 unsigned getVerbatimLineID() const LLVM_READONLY { 152 assert(is(tok::verbatim_line_name)); 153 return IntVal; 154 } 155 setVerbatimLineID(unsigned ID)156 void setVerbatimLineID(unsigned ID) { 157 assert(is(tok::verbatim_line_name)); 158 IntVal = ID; 159 } 160 getVerbatimLineText()161 StringRef getVerbatimLineText() const LLVM_READONLY { 162 assert(is(tok::verbatim_line_text)); 163 return StringRef(TextPtr, IntVal); 164 } 165 setVerbatimLineText(StringRef Text)166 void setVerbatimLineText(StringRef Text) { 167 assert(is(tok::verbatim_line_text)); 168 TextPtr = Text.data(); 169 IntVal = Text.size(); 170 } 171 getHTMLTagStartName()172 StringRef getHTMLTagStartName() const LLVM_READONLY { 173 assert(is(tok::html_start_tag)); 174 return StringRef(TextPtr, IntVal); 175 } 176 setHTMLTagStartName(StringRef Name)177 void setHTMLTagStartName(StringRef Name) { 178 assert(is(tok::html_start_tag)); 179 TextPtr = Name.data(); 180 IntVal = Name.size(); 181 } 182 getHTMLIdent()183 StringRef getHTMLIdent() const LLVM_READONLY { 184 assert(is(tok::html_ident)); 185 return StringRef(TextPtr, IntVal); 186 } 187 setHTMLIdent(StringRef Name)188 void setHTMLIdent(StringRef Name) { 189 assert(is(tok::html_ident)); 190 TextPtr = Name.data(); 191 IntVal = Name.size(); 192 } 193 getHTMLQuotedString()194 StringRef getHTMLQuotedString() const LLVM_READONLY { 195 assert(is(tok::html_quoted_string)); 196 return StringRef(TextPtr, IntVal); 197 } 198 setHTMLQuotedString(StringRef Str)199 void setHTMLQuotedString(StringRef Str) { 200 assert(is(tok::html_quoted_string)); 201 TextPtr = Str.data(); 202 IntVal = Str.size(); 203 } 204 getHTMLTagEndName()205 StringRef getHTMLTagEndName() const LLVM_READONLY { 206 assert(is(tok::html_end_tag)); 207 return StringRef(TextPtr, IntVal); 208 } 209 setHTMLTagEndName(StringRef Name)210 void setHTMLTagEndName(StringRef Name) { 211 assert(is(tok::html_end_tag)); 212 TextPtr = Name.data(); 213 IntVal = Name.size(); 214 } 215 216 void dump(const Lexer &L, const SourceManager &SM) const; 217 }; 218 219 /// \brief Comment lexer. 220 class Lexer { 221 private: 222 Lexer(const Lexer &) LLVM_DELETED_FUNCTION; 223 void operator=(const Lexer &) LLVM_DELETED_FUNCTION; 224 225 /// Allocator for strings that are semantic values of tokens and have to be 226 /// computed (for example, resolved decimal character references). 227 llvm::BumpPtrAllocator &Allocator; 228 229 const CommandTraits &Traits; 230 231 const char *const BufferStart; 232 const char *const BufferEnd; 233 SourceLocation FileLoc; 234 235 const char *BufferPtr; 236 237 /// One past end pointer for the current comment. For BCPL comments points 238 /// to newline or BufferEnd, for C comments points to star in '*/'. 239 const char *CommentEnd; 240 241 enum LexerCommentState { 242 LCS_BeforeComment, 243 LCS_InsideBCPLComment, 244 LCS_InsideCComment, 245 LCS_BetweenComments 246 }; 247 248 /// Low-level lexer state, track if we are inside or outside of comment. 249 LexerCommentState CommentState; 250 251 enum LexerState { 252 /// Lexing normal comment text 253 LS_Normal, 254 255 /// Finished lexing verbatim block beginning command, will lex first body 256 /// line. 257 LS_VerbatimBlockFirstLine, 258 259 /// Lexing verbatim block body line-by-line, skipping line-starting 260 /// decorations. 261 LS_VerbatimBlockBody, 262 263 /// Finished lexing verbatim line beginning command, will lex text (one 264 /// line). 265 LS_VerbatimLineText, 266 267 /// Finished lexing \verbatim <TAG \endverbatim part, lexing tag attributes. 268 LS_HTMLStartTag, 269 270 /// Finished lexing \verbatim </TAG \endverbatim part, lexing '>'. 271 LS_HTMLEndTag 272 }; 273 274 /// Current lexing mode. 275 LexerState State; 276 277 /// If State is LS_VerbatimBlock, contains the name of verbatim end 278 /// command, including command marker. 279 SmallString<16> VerbatimBlockEndCommandName; 280 281 /// Given a character reference name (e.g., "lt"), return the character that 282 /// it stands for (e.g., "<"). 283 StringRef resolveHTMLNamedCharacterReference(StringRef Name) const; 284 285 /// Given a Unicode codepoint as base-10 integer, return the character. 286 StringRef resolveHTMLDecimalCharacterReference(StringRef Name) const; 287 288 /// Given a Unicode codepoint as base-16 integer, return the character. 289 StringRef resolveHTMLHexCharacterReference(StringRef Name) const; 290 formTokenWithChars(Token & Result,const char * TokEnd,tok::TokenKind Kind)291 void formTokenWithChars(Token &Result, const char *TokEnd, 292 tok::TokenKind Kind) { 293 const unsigned TokLen = TokEnd - BufferPtr; 294 Result.setLocation(getSourceLocation(BufferPtr)); 295 Result.setKind(Kind); 296 Result.setLength(TokLen); 297 #ifndef NDEBUG 298 Result.TextPtr = "<UNSET>"; 299 Result.IntVal = 7; 300 #endif 301 BufferPtr = TokEnd; 302 } 303 formTextToken(Token & Result,const char * TokEnd)304 void formTextToken(Token &Result, const char *TokEnd) { 305 StringRef Text(BufferPtr, TokEnd - BufferPtr); 306 formTokenWithChars(Result, TokEnd, tok::text); 307 Result.setText(Text); 308 } 309 getSourceLocation(const char * Loc)310 SourceLocation getSourceLocation(const char *Loc) const { 311 assert(Loc >= BufferStart && Loc <= BufferEnd && 312 "Location out of range for this buffer!"); 313 314 const unsigned CharNo = Loc - BufferStart; 315 return FileLoc.getLocWithOffset(CharNo); 316 } 317 318 /// Eat string matching regexp \code \s*\* \endcode. 319 void skipLineStartingDecorations(); 320 321 /// Lex stuff inside comments. CommentEnd should be set correctly. 322 void lexCommentText(Token &T); 323 324 void setupAndLexVerbatimBlock(Token &T, 325 const char *TextBegin, 326 char Marker, const CommandInfo *Info); 327 328 void lexVerbatimBlockFirstLine(Token &T); 329 330 void lexVerbatimBlockBody(Token &T); 331 332 void setupAndLexVerbatimLine(Token &T, const char *TextBegin, 333 const CommandInfo *Info); 334 335 void lexVerbatimLineText(Token &T); 336 337 void lexHTMLCharacterReference(Token &T); 338 339 void setupAndLexHTMLStartTag(Token &T); 340 341 void lexHTMLStartTag(Token &T); 342 343 void setupAndLexHTMLEndTag(Token &T); 344 345 void lexHTMLEndTag(Token &T); 346 347 public: 348 Lexer(llvm::BumpPtrAllocator &Allocator, const CommandTraits &Traits, 349 SourceLocation FileLoc, 350 const char *BufferStart, const char *BufferEnd); 351 352 void lex(Token &T); 353 354 StringRef getSpelling(const Token &Tok, 355 const SourceManager &SourceMgr, 356 bool *Invalid = NULL) const; 357 }; 358 359 } // end namespace comments 360 } // end namespace clang 361 362 #endif 363 364