• Home
  • Line#
  • Scopes#
  • Navigate#
  • Raw
  • Download
1 #include "clang/AST/CommentLexer.h"
2 #include "clang/AST/CommentCommandTraits.h"
3 #include "clang/Basic/ConvertUTF.h"
4 #include "llvm/ADT/StringSwitch.h"
5 #include "llvm/Support/ErrorHandling.h"
6 
7 namespace clang {
8 namespace comments {
9 
dump(const Lexer & L,const SourceManager & SM) const10 void Token::dump(const Lexer &L, const SourceManager &SM) const {
11   llvm::errs() << "comments::Token Kind=" << Kind << " ";
12   Loc.dump(SM);
13   llvm::errs() << " " << Length << " \"" << L.getSpelling(*this, SM) << "\"\n";
14 }
15 
16 namespace {
/// Returns true if \p C may appear in the name of a named character
/// reference, i.e. it is an ASCII letter.
bool isHTMLNamedCharacterReferenceCharacter(char C) {
  const bool IsLower = 'a' <= C && C <= 'z';
  const bool IsUpper = 'A' <= C && C <= 'Z';
  return IsLower || IsUpper;
}
21 
/// Returns true if \p C is an ASCII decimal digit.
bool isHTMLDecimalCharacterReferenceCharacter(char C) {
  if (C < '0')
    return false;
  return C <= '9';
}
25 
/// Returns true if \p C is an ASCII hexadecimal digit (0-9, a-f or A-F).
bool isHTMLHexCharacterReferenceCharacter(char C) {
  if (C >= '0' && C <= '9')
    return true;
  if (C >= 'a' && C <= 'f')
    return true;
  return C >= 'A' && C <= 'F';
}
31 
32 #include "clang/AST/CommentHTMLTags.inc"
33 
34 } // unnamed namespace
35 
resolveHTMLNamedCharacterReference(StringRef Name) const36 StringRef Lexer::resolveHTMLNamedCharacterReference(StringRef Name) const {
37   return llvm::StringSwitch<StringRef>(Name)
38       .Case("amp", "&")
39       .Case("lt", "<")
40       .Case("gt", ">")
41       .Case("quot", "\"")
42       .Case("apos", "\'")
43       .Default("");
44 }
45 
resolveHTMLDecimalCharacterReference(StringRef Name) const46 StringRef Lexer::resolveHTMLDecimalCharacterReference(StringRef Name) const {
47   unsigned CodePoint = 0;
48   for (unsigned i = 0, e = Name.size(); i != e; ++i) {
49     assert(isHTMLDecimalCharacterReferenceCharacter(Name[i]));
50     CodePoint *= 10;
51     CodePoint += Name[i] - '0';
52   }
53 
54   char *Resolved = Allocator.Allocate<char>(UNI_MAX_UTF8_BYTES_PER_CODE_POINT);
55   char *ResolvedPtr = Resolved;
56   if (ConvertCodePointToUTF8(CodePoint, ResolvedPtr))
57     return StringRef(Resolved, ResolvedPtr - Resolved);
58   else
59     return StringRef();
60 }
61 
resolveHTMLHexCharacterReference(StringRef Name) const62 StringRef Lexer::resolveHTMLHexCharacterReference(StringRef Name) const {
63   unsigned CodePoint = 0;
64   for (unsigned i = 0, e = Name.size(); i != e; ++i) {
65     CodePoint *= 16;
66     const char C = Name[i];
67     assert(isHTMLHexCharacterReferenceCharacter(C));
68     if (C >= '0' && C <= '9')
69       CodePoint += Name[i] - '0';
70     else if (C >= 'a' && C <= 'f')
71       CodePoint += Name[i] - 'a' + 10;
72     else
73       CodePoint += Name[i] - 'A' + 10;
74   }
75 
76   char *Resolved = Allocator.Allocate<char>(UNI_MAX_UTF8_BYTES_PER_CODE_POINT);
77   char *ResolvedPtr = Resolved;
78   if (ConvertCodePointToUTF8(CodePoint, ResolvedPtr))
79     return StringRef(Resolved, ResolvedPtr - Resolved);
80   else
81     return StringRef();
82 }
83 
skipLineStartingDecorations()84 void Lexer::skipLineStartingDecorations() {
85   // This function should be called only for C comments
86   assert(CommentState == LCS_InsideCComment);
87 
88   if (BufferPtr == CommentEnd)
89     return;
90 
91   switch (*BufferPtr) {
92   case ' ':
93   case '\t':
94   case '\f':
95   case '\v': {
96     const char *NewBufferPtr = BufferPtr;
97     NewBufferPtr++;
98     if (NewBufferPtr == CommentEnd)
99       return;
100 
101     char C = *NewBufferPtr;
102     while (C == ' ' || C == '\t' || C == '\f' || C == '\v') {
103       NewBufferPtr++;
104       if (NewBufferPtr == CommentEnd)
105         return;
106       C = *NewBufferPtr;
107     }
108     if (C == '*')
109       BufferPtr = NewBufferPtr + 1;
110     break;
111   }
112   case '*':
113     BufferPtr++;
114     break;
115   }
116 }
117 
118 namespace {
/// Returns a pointer to the first '\n' or '\r' in [BufferPtr, BufferEnd),
/// or BufferEnd if the range contains neither.
const char *findNewline(const char *BufferPtr, const char *BufferEnd) {
  const char *Cur = BufferPtr;
  while (Cur != BufferEnd) {
    if (*Cur == '\n' || *Cur == '\r')
      break;
    ++Cur;
  }
  return Cur;
}
128 
skipNewline(const char * BufferPtr,const char * BufferEnd)129 const char *skipNewline(const char *BufferPtr, const char *BufferEnd) {
130   if (BufferPtr == BufferEnd)
131     return BufferPtr;
132 
133   if (*BufferPtr == '\n')
134     BufferPtr++;
135   else {
136     assert(*BufferPtr == '\r');
137     BufferPtr++;
138     if (BufferPtr != BufferEnd && *BufferPtr == '\n')
139       BufferPtr++;
140   }
141   return BufferPtr;
142 }
143 
skipNamedCharacterReference(const char * BufferPtr,const char * BufferEnd)144 const char *skipNamedCharacterReference(const char *BufferPtr,
145                                         const char *BufferEnd) {
146   for ( ; BufferPtr != BufferEnd; ++BufferPtr) {
147     if (!isHTMLNamedCharacterReferenceCharacter(*BufferPtr))
148       return BufferPtr;
149   }
150   return BufferEnd;
151 }
152 
skipDecimalCharacterReference(const char * BufferPtr,const char * BufferEnd)153 const char *skipDecimalCharacterReference(const char *BufferPtr,
154                                           const char *BufferEnd) {
155   for ( ; BufferPtr != BufferEnd; ++BufferPtr) {
156     if (!isHTMLDecimalCharacterReferenceCharacter(*BufferPtr))
157       return BufferPtr;
158   }
159   return BufferEnd;
160 }
161 
skipHexCharacterReference(const char * BufferPtr,const char * BufferEnd)162 const char *skipHexCharacterReference(const char *BufferPtr,
163                                           const char *BufferEnd) {
164   for ( ; BufferPtr != BufferEnd; ++BufferPtr) {
165     if (!isHTMLHexCharacterReferenceCharacter(*BufferPtr))
166       return BufferPtr;
167   }
168   return BufferEnd;
169 }
170 
/// Returns true if \p C may begin an HTML identifier, i.e. it is an ASCII
/// letter.
bool isHTMLIdentifierStartingCharacter(char C) {
  if (C >= 'a' && C <= 'z')
    return true;
  return C >= 'A' && C <= 'Z';
}
175 
/// Returns true if \p C may appear inside an HTML identifier, i.e. it is an
/// ASCII letter or decimal digit.
bool isHTMLIdentifierCharacter(char C) {
  const bool IsLower = 'a' <= C && C <= 'z';
  const bool IsUpper = 'A' <= C && C <= 'Z';
  const bool IsDigit = '0' <= C && C <= '9';
  return IsLower || IsUpper || IsDigit;
}
181 
skipHTMLIdentifier(const char * BufferPtr,const char * BufferEnd)182 const char *skipHTMLIdentifier(const char *BufferPtr, const char *BufferEnd) {
183   for ( ; BufferPtr != BufferEnd; ++BufferPtr) {
184     if (!isHTMLIdentifierCharacter(*BufferPtr))
185       return BufferPtr;
186   }
187   return BufferEnd;
188 }
189 
190 /// Skip HTML string quoted in single or double quotes.  Escaping quotes inside
191 /// string allowed.
192 ///
193 /// Returns pointer to closing quote.
skipHTMLQuotedString(const char * BufferPtr,const char * BufferEnd)194 const char *skipHTMLQuotedString(const char *BufferPtr, const char *BufferEnd)
195 {
196   const char Quote = *BufferPtr;
197   assert(Quote == '\"' || Quote == '\'');
198 
199   BufferPtr++;
200   for ( ; BufferPtr != BufferEnd; ++BufferPtr) {
201     const char C = *BufferPtr;
202     if (C == Quote && BufferPtr[-1] != '\\')
203       return BufferPtr;
204   }
205   return BufferEnd;
206 }
207 
/// Returns true if \p C is a space, tab, form feed or vertical tab.
bool isHorizontalWhitespace(char C) {
  switch (C) {
  case ' ':
  case '\t':
  case '\f':
  case '\v':
    return true;
  default:
    return false;
  }
}
211 
/// Returns true if \p C is a space, tab, form feed, vertical tab, line feed
/// or carriage return.
bool isWhitespace(char C) {
  switch (C) {
  case ' ':
  case '\n':
  case '\r':
  case '\t':
  case '\f':
  case '\v':
    return true;
  default:
    return false;
  }
}
216 
skipWhitespace(const char * BufferPtr,const char * BufferEnd)217 const char *skipWhitespace(const char *BufferPtr, const char *BufferEnd) {
218   for ( ; BufferPtr != BufferEnd; ++BufferPtr) {
219     if (!isWhitespace(*BufferPtr))
220       return BufferPtr;
221   }
222   return BufferEnd;
223 }
224 
isWhitespace(const char * BufferPtr,const char * BufferEnd)225 bool isWhitespace(const char *BufferPtr, const char *BufferEnd) {
226   return skipWhitespace(BufferPtr, BufferEnd) == BufferEnd;
227 }
228 
/// Returns true if \p C may appear in a command name, i.e. it is an ASCII
/// letter or decimal digit.
bool isCommandNameCharacter(char C) {
  if (C >= 'a' && C <= 'z')
    return true;
  if (C >= 'A' && C <= 'Z')
    return true;
  return C >= '0' && C <= '9';
}
234 
skipCommandName(const char * BufferPtr,const char * BufferEnd)235 const char *skipCommandName(const char *BufferPtr, const char *BufferEnd) {
236   for ( ; BufferPtr != BufferEnd; ++BufferPtr) {
237     if (!isCommandNameCharacter(*BufferPtr))
238       return BufferPtr;
239   }
240   return BufferEnd;
241 }
242 
243 /// Return the one past end pointer for BCPL comments.
244 /// Handles newlines escaped with backslash or trigraph for backslahs.
findBCPLCommentEnd(const char * BufferPtr,const char * BufferEnd)245 const char *findBCPLCommentEnd(const char *BufferPtr, const char *BufferEnd) {
246   const char *CurPtr = BufferPtr;
247   while (CurPtr != BufferEnd) {
248     char C = *CurPtr;
249     while (C != '\n' && C != '\r') {
250       CurPtr++;
251       if (CurPtr == BufferEnd)
252         return BufferEnd;
253       C = *CurPtr;
254     }
255     // We found a newline, check if it is escaped.
256     const char *EscapePtr = CurPtr - 1;
257     while(isHorizontalWhitespace(*EscapePtr))
258       EscapePtr--;
259 
260     if (*EscapePtr == '\\' ||
261         (EscapePtr - 2 >= BufferPtr && EscapePtr[0] == '/' &&
262          EscapePtr[-1] == '?' && EscapePtr[-2] == '?')) {
263       // We found an escaped newline.
264       CurPtr = skipNewline(CurPtr, BufferEnd);
265     } else
266       return CurPtr; // Not an escaped newline.
267   }
268   return BufferEnd;
269 }
270 
271 /// Return the one past end pointer for C comments.
272 /// Very dumb, does not handle escaped newlines or trigraphs.
findCCommentEnd(const char * BufferPtr,const char * BufferEnd)273 const char *findCCommentEnd(const char *BufferPtr, const char *BufferEnd) {
274   for ( ; BufferPtr != BufferEnd; ++BufferPtr) {
275     if (*BufferPtr == '*') {
276       assert(BufferPtr + 1 != BufferEnd);
277       if (*(BufferPtr + 1) == '/')
278         return BufferPtr;
279     }
280   }
281   llvm_unreachable("buffer end hit before '*/' was seen");
282 }
283 } // unnamed namespace
284 
lexCommentText(Token & T)285 void Lexer::lexCommentText(Token &T) {
286   assert(CommentState == LCS_InsideBCPLComment ||
287          CommentState == LCS_InsideCComment);
288 
289   switch (State) {
290   case LS_Normal:
291     break;
292   case LS_VerbatimBlockFirstLine:
293     lexVerbatimBlockFirstLine(T);
294     return;
295   case LS_VerbatimBlockBody:
296     lexVerbatimBlockBody(T);
297     return;
298   case LS_VerbatimLineText:
299     lexVerbatimLineText(T);
300     return;
301   case LS_HTMLStartTag:
302     lexHTMLStartTag(T);
303     return;
304   case LS_HTMLEndTag:
305     lexHTMLEndTag(T);
306     return;
307   }
308 
309   assert(State == LS_Normal);
310 
311   const char *TokenPtr = BufferPtr;
312   assert(TokenPtr < CommentEnd);
313   while (TokenPtr != CommentEnd) {
314     switch(*TokenPtr) {
315       case '\\':
316       case '@': {
317         TokenPtr++;
318         if (TokenPtr == CommentEnd) {
319           formTextToken(T, TokenPtr);
320           return;
321         }
322         char C = *TokenPtr;
323         switch (C) {
324         default:
325           break;
326 
327         case '\\': case '@': case '&': case '$':
328         case '#':  case '<': case '>': case '%':
329         case '\"': case '.': case ':':
330           // This is one of \\ \@ \& \$ etc escape sequences.
331           TokenPtr++;
332           if (C == ':' && TokenPtr != CommentEnd && *TokenPtr == ':') {
333             // This is the \:: escape sequence.
334             TokenPtr++;
335           }
336           StringRef UnescapedText(BufferPtr + 1, TokenPtr - (BufferPtr + 1));
337           formTokenWithChars(T, TokenPtr, tok::text);
338           T.setText(UnescapedText);
339           return;
340         }
341 
342         // Don't make zero-length commands.
343         if (!isCommandNameCharacter(*TokenPtr)) {
344           formTextToken(T, TokenPtr);
345           return;
346         }
347 
348         TokenPtr = skipCommandName(TokenPtr, CommentEnd);
349         unsigned Length = TokenPtr - (BufferPtr + 1);
350 
351         // Hardcoded support for lexing LaTeX formula commands
352         // \f$ \f[ \f] \f{ \f} as a single command.
353         if (Length == 1 && TokenPtr[-1] == 'f' && TokenPtr != CommentEnd) {
354           C = *TokenPtr;
355           if (C == '$' || C == '[' || C == ']' || C == '{' || C == '}') {
356             TokenPtr++;
357             Length++;
358           }
359         }
360 
361         const StringRef CommandName(BufferPtr + 1, Length);
362 
363         const CommandInfo *Info = Traits.getCommandInfoOrNULL(CommandName);
364         if (!Info) {
365           formTokenWithChars(T, TokenPtr, tok::unknown_command);
366           T.setUnknownCommandName(CommandName);
367           return;
368         }
369         if (Info->IsVerbatimBlockCommand) {
370           setupAndLexVerbatimBlock(T, TokenPtr, *BufferPtr, Info);
371           return;
372         }
373         if (Info->IsVerbatimLineCommand) {
374           setupAndLexVerbatimLine(T, TokenPtr, Info);
375           return;
376         }
377         formTokenWithChars(T, TokenPtr, tok::command);
378         T.setCommandID(Info->getID());
379         return;
380       }
381 
382       case '&':
383         lexHTMLCharacterReference(T);
384         return;
385 
386       case '<': {
387         TokenPtr++;
388         if (TokenPtr == CommentEnd) {
389           formTextToken(T, TokenPtr);
390           return;
391         }
392         const char C = *TokenPtr;
393         if (isHTMLIdentifierStartingCharacter(C))
394           setupAndLexHTMLStartTag(T);
395         else if (C == '/')
396           setupAndLexHTMLEndTag(T);
397         else
398           formTextToken(T, TokenPtr);
399 
400         return;
401       }
402 
403       case '\n':
404       case '\r':
405         TokenPtr = skipNewline(TokenPtr, CommentEnd);
406         formTokenWithChars(T, TokenPtr, tok::newline);
407 
408         if (CommentState == LCS_InsideCComment)
409           skipLineStartingDecorations();
410         return;
411 
412       default: {
413         while (true) {
414           TokenPtr++;
415           if (TokenPtr == CommentEnd)
416             break;
417           const char C = *TokenPtr;
418           if(C == '\n' || C == '\r' ||
419              C == '\\' || C == '@' || C == '&' || C == '<')
420             break;
421         }
422         formTextToken(T, TokenPtr);
423         return;
424       }
425     }
426   }
427 }
428 
setupAndLexVerbatimBlock(Token & T,const char * TextBegin,char Marker,const CommandInfo * Info)429 void Lexer::setupAndLexVerbatimBlock(Token &T,
430                                      const char *TextBegin,
431                                      char Marker, const CommandInfo *Info) {
432   assert(Info->IsVerbatimBlockCommand);
433 
434   VerbatimBlockEndCommandName.clear();
435   VerbatimBlockEndCommandName.append(Marker == '\\' ? "\\" : "@");
436   VerbatimBlockEndCommandName.append(Info->EndCommandName);
437 
438   formTokenWithChars(T, TextBegin, tok::verbatim_block_begin);
439   T.setVerbatimBlockID(Info->getID());
440 
441   // If there is a newline following the verbatim opening command, skip the
442   // newline so that we don't create an tok::verbatim_block_line with empty
443   // text content.
444   if (BufferPtr != CommentEnd) {
445     const char C = *BufferPtr;
446     if (C == '\n' || C == '\r') {
447       BufferPtr = skipNewline(BufferPtr, CommentEnd);
448       State = LS_VerbatimBlockBody;
449       return;
450     }
451   }
452 
453   State = LS_VerbatimBlockFirstLine;
454 }
455 
lexVerbatimBlockFirstLine(Token & T)456 void Lexer::lexVerbatimBlockFirstLine(Token &T) {
457 again:
458   assert(BufferPtr < CommentEnd);
459 
460   // FIXME: It would be better to scan the text once, finding either the block
461   // end command or newline.
462   //
463   // Extract current line.
464   const char *Newline = findNewline(BufferPtr, CommentEnd);
465   StringRef Line(BufferPtr, Newline - BufferPtr);
466 
467   // Look for end command in current line.
468   size_t Pos = Line.find(VerbatimBlockEndCommandName);
469   const char *TextEnd;
470   const char *NextLine;
471   if (Pos == StringRef::npos) {
472     // Current line is completely verbatim.
473     TextEnd = Newline;
474     NextLine = skipNewline(Newline, CommentEnd);
475   } else if (Pos == 0) {
476     // Current line contains just an end command.
477     const char *End = BufferPtr + VerbatimBlockEndCommandName.size();
478     StringRef Name(BufferPtr + 1, End - (BufferPtr + 1));
479     formTokenWithChars(T, End, tok::verbatim_block_end);
480     T.setVerbatimBlockID(Traits.getCommandInfo(Name)->getID());
481     State = LS_Normal;
482     return;
483   } else {
484     // There is some text, followed by end command.  Extract text first.
485     TextEnd = BufferPtr + Pos;
486     NextLine = TextEnd;
487     // If there is only whitespace before end command, skip whitespace.
488     if (isWhitespace(BufferPtr, TextEnd)) {
489       BufferPtr = TextEnd;
490       goto again;
491     }
492   }
493 
494   StringRef Text(BufferPtr, TextEnd - BufferPtr);
495   formTokenWithChars(T, NextLine, tok::verbatim_block_line);
496   T.setVerbatimBlockText(Text);
497 
498   State = LS_VerbatimBlockBody;
499 }
500 
lexVerbatimBlockBody(Token & T)501 void Lexer::lexVerbatimBlockBody(Token &T) {
502   assert(State == LS_VerbatimBlockBody);
503 
504   if (CommentState == LCS_InsideCComment)
505     skipLineStartingDecorations();
506 
507   lexVerbatimBlockFirstLine(T);
508 }
509 
setupAndLexVerbatimLine(Token & T,const char * TextBegin,const CommandInfo * Info)510 void Lexer::setupAndLexVerbatimLine(Token &T, const char *TextBegin,
511                                     const CommandInfo *Info) {
512   assert(Info->IsVerbatimLineCommand);
513   formTokenWithChars(T, TextBegin, tok::verbatim_line_name);
514   T.setVerbatimLineID(Info->getID());
515 
516   State = LS_VerbatimLineText;
517 }
518 
lexVerbatimLineText(Token & T)519 void Lexer::lexVerbatimLineText(Token &T) {
520   assert(State == LS_VerbatimLineText);
521 
522   // Extract current line.
523   const char *Newline = findNewline(BufferPtr, CommentEnd);
524   const StringRef Text(BufferPtr, Newline - BufferPtr);
525   formTokenWithChars(T, Newline, tok::verbatim_line_text);
526   T.setVerbatimLineText(Text);
527 
528   State = LS_Normal;
529 }
530 
lexHTMLCharacterReference(Token & T)531 void Lexer::lexHTMLCharacterReference(Token &T) {
532   const char *TokenPtr = BufferPtr;
533   assert(*TokenPtr == '&');
534   TokenPtr++;
535   if (TokenPtr == CommentEnd) {
536     formTextToken(T, TokenPtr);
537     return;
538   }
539   const char *NamePtr;
540   bool isNamed = false;
541   bool isDecimal = false;
542   char C = *TokenPtr;
543   if (isHTMLNamedCharacterReferenceCharacter(C)) {
544     NamePtr = TokenPtr;
545     TokenPtr = skipNamedCharacterReference(TokenPtr, CommentEnd);
546     isNamed = true;
547   } else if (C == '#') {
548     TokenPtr++;
549     if (TokenPtr == CommentEnd) {
550       formTextToken(T, TokenPtr);
551       return;
552     }
553     C = *TokenPtr;
554     if (isHTMLDecimalCharacterReferenceCharacter(C)) {
555       NamePtr = TokenPtr;
556       TokenPtr = skipDecimalCharacterReference(TokenPtr, CommentEnd);
557       isDecimal = true;
558     } else if (C == 'x' || C == 'X') {
559       TokenPtr++;
560       NamePtr = TokenPtr;
561       TokenPtr = skipHexCharacterReference(TokenPtr, CommentEnd);
562     } else {
563       formTextToken(T, TokenPtr);
564       return;
565     }
566   } else {
567     formTextToken(T, TokenPtr);
568     return;
569   }
570   if (NamePtr == TokenPtr || TokenPtr == CommentEnd ||
571       *TokenPtr != ';') {
572     formTextToken(T, TokenPtr);
573     return;
574   }
575   StringRef Name(NamePtr, TokenPtr - NamePtr);
576   TokenPtr++; // Skip semicolon.
577   StringRef Resolved;
578   if (isNamed)
579     Resolved = resolveHTMLNamedCharacterReference(Name);
580   else if (isDecimal)
581     Resolved = resolveHTMLDecimalCharacterReference(Name);
582   else
583     Resolved = resolveHTMLHexCharacterReference(Name);
584 
585   if (Resolved.empty()) {
586     formTextToken(T, TokenPtr);
587     return;
588   }
589   formTokenWithChars(T, TokenPtr, tok::text);
590   T.setText(Resolved);
591   return;
592 }
593 
setupAndLexHTMLStartTag(Token & T)594 void Lexer::setupAndLexHTMLStartTag(Token &T) {
595   assert(BufferPtr[0] == '<' &&
596          isHTMLIdentifierStartingCharacter(BufferPtr[1]));
597   const char *TagNameEnd = skipHTMLIdentifier(BufferPtr + 2, CommentEnd);
598   StringRef Name(BufferPtr + 1, TagNameEnd - (BufferPtr + 1));
599   if (!isHTMLTagName(Name)) {
600     formTextToken(T, TagNameEnd);
601     return;
602   }
603 
604   formTokenWithChars(T, TagNameEnd, tok::html_start_tag);
605   T.setHTMLTagStartName(Name);
606 
607   BufferPtr = skipWhitespace(BufferPtr, CommentEnd);
608 
609   const char C = *BufferPtr;
610   if (BufferPtr != CommentEnd &&
611       (C == '>' || C == '/' || isHTMLIdentifierStartingCharacter(C)))
612     State = LS_HTMLStartTag;
613 }
614 
lexHTMLStartTag(Token & T)615 void Lexer::lexHTMLStartTag(Token &T) {
616   assert(State == LS_HTMLStartTag);
617 
618   const char *TokenPtr = BufferPtr;
619   char C = *TokenPtr;
620   if (isHTMLIdentifierCharacter(C)) {
621     TokenPtr = skipHTMLIdentifier(TokenPtr, CommentEnd);
622     StringRef Ident(BufferPtr, TokenPtr - BufferPtr);
623     formTokenWithChars(T, TokenPtr, tok::html_ident);
624     T.setHTMLIdent(Ident);
625   } else {
626     switch (C) {
627     case '=':
628       TokenPtr++;
629       formTokenWithChars(T, TokenPtr, tok::html_equals);
630       break;
631     case '\"':
632     case '\'': {
633       const char *OpenQuote = TokenPtr;
634       TokenPtr = skipHTMLQuotedString(TokenPtr, CommentEnd);
635       const char *ClosingQuote = TokenPtr;
636       if (TokenPtr != CommentEnd) // Skip closing quote.
637         TokenPtr++;
638       formTokenWithChars(T, TokenPtr, tok::html_quoted_string);
639       T.setHTMLQuotedString(StringRef(OpenQuote + 1,
640                                       ClosingQuote - (OpenQuote + 1)));
641       break;
642     }
643     case '>':
644       TokenPtr++;
645       formTokenWithChars(T, TokenPtr, tok::html_greater);
646       State = LS_Normal;
647       return;
648     case '/':
649       TokenPtr++;
650       if (TokenPtr != CommentEnd && *TokenPtr == '>') {
651         TokenPtr++;
652         formTokenWithChars(T, TokenPtr, tok::html_slash_greater);
653       } else
654         formTextToken(T, TokenPtr);
655 
656       State = LS_Normal;
657       return;
658     }
659   }
660 
661   // Now look ahead and return to normal state if we don't see any HTML tokens
662   // ahead.
663   BufferPtr = skipWhitespace(BufferPtr, CommentEnd);
664   if (BufferPtr == CommentEnd) {
665     State = LS_Normal;
666     return;
667   }
668 
669   C = *BufferPtr;
670   if (!isHTMLIdentifierStartingCharacter(C) &&
671       C != '=' && C != '\"' && C != '\'' && C != '>') {
672     State = LS_Normal;
673     return;
674   }
675 }
676 
setupAndLexHTMLEndTag(Token & T)677 void Lexer::setupAndLexHTMLEndTag(Token &T) {
678   assert(BufferPtr[0] == '<' && BufferPtr[1] == '/');
679 
680   const char *TagNameBegin = skipWhitespace(BufferPtr + 2, CommentEnd);
681   const char *TagNameEnd = skipHTMLIdentifier(TagNameBegin, CommentEnd);
682   StringRef Name(TagNameBegin, TagNameEnd - TagNameBegin);
683   if (!isHTMLTagName(Name)) {
684     formTextToken(T, TagNameEnd);
685     return;
686   }
687 
688   const char *End = skipWhitespace(TagNameEnd, CommentEnd);
689 
690   formTokenWithChars(T, End, tok::html_end_tag);
691   T.setHTMLTagEndName(Name);
692 
693   if (BufferPtr != CommentEnd && *BufferPtr == '>')
694     State = LS_HTMLEndTag;
695 }
696 
lexHTMLEndTag(Token & T)697 void Lexer::lexHTMLEndTag(Token &T) {
698   assert(BufferPtr != CommentEnd && *BufferPtr == '>');
699 
700   formTokenWithChars(T, BufferPtr + 1, tok::html_greater);
701   State = LS_Normal;
702 }
703 
Lexer(llvm::BumpPtrAllocator & Allocator,const CommandTraits & Traits,SourceLocation FileLoc,const char * BufferStart,const char * BufferEnd)704 Lexer::Lexer(llvm::BumpPtrAllocator &Allocator, const CommandTraits &Traits,
705              SourceLocation FileLoc,
706              const char *BufferStart, const char *BufferEnd):
707     Allocator(Allocator), Traits(Traits),
708     BufferStart(BufferStart), BufferEnd(BufferEnd),
709     FileLoc(FileLoc), BufferPtr(BufferStart),
710     CommentState(LCS_BeforeComment), State(LS_Normal) {
711 }
712 
lex(Token & T)713 void Lexer::lex(Token &T) {
714 again:
715   switch (CommentState) {
716   case LCS_BeforeComment:
717     if (BufferPtr == BufferEnd) {
718       formTokenWithChars(T, BufferPtr, tok::eof);
719       return;
720     }
721 
722     assert(*BufferPtr == '/');
723     BufferPtr++; // Skip first slash.
724     switch(*BufferPtr) {
725     case '/': { // BCPL comment.
726       BufferPtr++; // Skip second slash.
727 
728       if (BufferPtr != BufferEnd) {
729         // Skip Doxygen magic marker, if it is present.
730         // It might be missing because of a typo //< or /*<, or because we
731         // merged this non-Doxygen comment into a bunch of Doxygen comments
732         // around it: /** ... */ /* ... */ /** ... */
733         const char C = *BufferPtr;
734         if (C == '/' || C == '!')
735           BufferPtr++;
736       }
737 
738       // Skip less-than symbol that marks trailing comments.
739       // Skip it even if the comment is not a Doxygen one, because //< and /*<
740       // are frequent typos.
741       if (BufferPtr != BufferEnd && *BufferPtr == '<')
742         BufferPtr++;
743 
744       CommentState = LCS_InsideBCPLComment;
745       if (State != LS_VerbatimBlockBody && State != LS_VerbatimBlockFirstLine)
746         State = LS_Normal;
747       CommentEnd = findBCPLCommentEnd(BufferPtr, BufferEnd);
748       goto again;
749     }
750     case '*': { // C comment.
751       BufferPtr++; // Skip star.
752 
753       // Skip Doxygen magic marker.
754       const char C = *BufferPtr;
755       if ((C == '*' && *(BufferPtr + 1) != '/') || C == '!')
756         BufferPtr++;
757 
758       // Skip less-than symbol that marks trailing comments.
759       if (BufferPtr != BufferEnd && *BufferPtr == '<')
760         BufferPtr++;
761 
762       CommentState = LCS_InsideCComment;
763       State = LS_Normal;
764       CommentEnd = findCCommentEnd(BufferPtr, BufferEnd);
765       goto again;
766     }
767     default:
768       llvm_unreachable("second character of comment should be '/' or '*'");
769     }
770 
771   case LCS_BetweenComments: {
772     // Consecutive comments are extracted only if there is only whitespace
773     // between them.  So we can search for the start of the next comment.
774     const char *EndWhitespace = BufferPtr;
775     while(EndWhitespace != BufferEnd && *EndWhitespace != '/')
776       EndWhitespace++;
777 
778     // Turn any whitespace between comments (and there is only whitespace
779     // between them -- guaranteed by comment extraction) into a newline.  We
780     // have two newlines between C comments in total (first one was synthesized
781     // after a comment).
782     formTokenWithChars(T, EndWhitespace, tok::newline);
783 
784     CommentState = LCS_BeforeComment;
785     break;
786   }
787 
788   case LCS_InsideBCPLComment:
789   case LCS_InsideCComment:
790     if (BufferPtr != CommentEnd) {
791       lexCommentText(T);
792       break;
793     } else {
794       // Skip C comment closing sequence.
795       if (CommentState == LCS_InsideCComment) {
796         assert(BufferPtr[0] == '*' && BufferPtr[1] == '/');
797         BufferPtr += 2;
798         assert(BufferPtr <= BufferEnd);
799 
800         // Synthenize newline just after the C comment, regardless if there is
801         // actually a newline.
802         formTokenWithChars(T, BufferPtr, tok::newline);
803 
804         CommentState = LCS_BetweenComments;
805         break;
806       } else {
807         // Don't synthesized a newline after BCPL comment.
808         CommentState = LCS_BetweenComments;
809         goto again;
810       }
811     }
812   }
813 }
814 
getSpelling(const Token & Tok,const SourceManager & SourceMgr,bool * Invalid) const815 StringRef Lexer::getSpelling(const Token &Tok,
816                              const SourceManager &SourceMgr,
817                              bool *Invalid) const {
818   SourceLocation Loc = Tok.getLocation();
819   std::pair<FileID, unsigned> LocInfo = SourceMgr.getDecomposedLoc(Loc);
820 
821   bool InvalidTemp = false;
822   StringRef File = SourceMgr.getBufferData(LocInfo.first, &InvalidTemp);
823   if (InvalidTemp) {
824     *Invalid = true;
825     return StringRef();
826   }
827 
828   const char *Begin = File.data() + LocInfo.second;
829   return StringRef(Begin, Tok.getLength());
830 }
831 
832 } // end namespace comments
833 } // end namespace clang
834 
835