1 #include "clang/AST/CommentLexer.h"
2 #include "clang/AST/CommentCommandTraits.h"
3 #include "clang/Basic/ConvertUTF.h"
4 #include "llvm/ADT/StringSwitch.h"
5 #include "llvm/Support/ErrorHandling.h"
6
7 namespace clang {
8 namespace comments {
9
dump(const Lexer & L,const SourceManager & SM) const10 void Token::dump(const Lexer &L, const SourceManager &SM) const {
11 llvm::errs() << "comments::Token Kind=" << Kind << " ";
12 Loc.dump(SM);
13 llvm::errs() << " " << Length << " \"" << L.getSpelling(*this, SM) << "\"\n";
14 }
15
16 namespace {
/// Returns true if \p C can be part of the name of a named HTML character
/// reference, i.e. it lies in 'a'..'z' or 'A'..'Z'.
bool isHTMLNamedCharacterReferenceCharacter(char C) {
  const bool IsLower = 'a' <= C && C <= 'z';
  const bool IsUpper = 'A' <= C && C <= 'Z';
  return IsLower || IsUpper;
}
21
/// Returns true if \p C is a decimal digit, the only kind of character allowed
/// in the body of a decimal HTML character reference.
bool isHTMLDecimalCharacterReferenceCharacter(char C) {
  if (C < '0')
    return false;
  return C <= '9';
}
25
/// Returns true if \p C is a hexadecimal digit (0-9, a-f or A-F), the only
/// kind of character allowed in the body of a hex HTML character reference.
bool isHTMLHexCharacterReferenceCharacter(char C) {
  if (C >= '0' && C <= '9')
    return true;
  if (C >= 'a' && C <= 'f')
    return true;
  return C >= 'A' && C <= 'F';
}
31
32 #include "clang/AST/CommentHTMLTags.inc"
33
34 } // unnamed namespace
35
resolveHTMLNamedCharacterReference(StringRef Name) const36 StringRef Lexer::resolveHTMLNamedCharacterReference(StringRef Name) const {
37 return llvm::StringSwitch<StringRef>(Name)
38 .Case("amp", "&")
39 .Case("lt", "<")
40 .Case("gt", ">")
41 .Case("quot", "\"")
42 .Case("apos", "\'")
43 .Default("");
44 }
45
resolveHTMLDecimalCharacterReference(StringRef Name) const46 StringRef Lexer::resolveHTMLDecimalCharacterReference(StringRef Name) const {
47 unsigned CodePoint = 0;
48 for (unsigned i = 0, e = Name.size(); i != e; ++i) {
49 assert(isHTMLDecimalCharacterReferenceCharacter(Name[i]));
50 CodePoint *= 10;
51 CodePoint += Name[i] - '0';
52 }
53
54 char *Resolved = Allocator.Allocate<char>(UNI_MAX_UTF8_BYTES_PER_CODE_POINT);
55 char *ResolvedPtr = Resolved;
56 if (ConvertCodePointToUTF8(CodePoint, ResolvedPtr))
57 return StringRef(Resolved, ResolvedPtr - Resolved);
58 else
59 return StringRef();
60 }
61
resolveHTMLHexCharacterReference(StringRef Name) const62 StringRef Lexer::resolveHTMLHexCharacterReference(StringRef Name) const {
63 unsigned CodePoint = 0;
64 for (unsigned i = 0, e = Name.size(); i != e; ++i) {
65 CodePoint *= 16;
66 const char C = Name[i];
67 assert(isHTMLHexCharacterReferenceCharacter(C));
68 if (C >= '0' && C <= '9')
69 CodePoint += Name[i] - '0';
70 else if (C >= 'a' && C <= 'f')
71 CodePoint += Name[i] - 'a' + 10;
72 else
73 CodePoint += Name[i] - 'A' + 10;
74 }
75
76 char *Resolved = Allocator.Allocate<char>(UNI_MAX_UTF8_BYTES_PER_CODE_POINT);
77 char *ResolvedPtr = Resolved;
78 if (ConvertCodePointToUTF8(CodePoint, ResolvedPtr))
79 return StringRef(Resolved, ResolvedPtr - Resolved);
80 else
81 return StringRef();
82 }
83
skipLineStartingDecorations()84 void Lexer::skipLineStartingDecorations() {
85 // This function should be called only for C comments
86 assert(CommentState == LCS_InsideCComment);
87
88 if (BufferPtr == CommentEnd)
89 return;
90
91 switch (*BufferPtr) {
92 case ' ':
93 case '\t':
94 case '\f':
95 case '\v': {
96 const char *NewBufferPtr = BufferPtr;
97 NewBufferPtr++;
98 if (NewBufferPtr == CommentEnd)
99 return;
100
101 char C = *NewBufferPtr;
102 while (C == ' ' || C == '\t' || C == '\f' || C == '\v') {
103 NewBufferPtr++;
104 if (NewBufferPtr == CommentEnd)
105 return;
106 C = *NewBufferPtr;
107 }
108 if (C == '*')
109 BufferPtr = NewBufferPtr + 1;
110 break;
111 }
112 case '*':
113 BufferPtr++;
114 break;
115 }
116 }
117
118 namespace {
/// Finds the first '\n' or '\r' in [BufferPtr, BufferEnd).
///
/// \returns a pointer to that character, or BufferEnd if the range contains
/// neither.
const char *findNewline(const char *BufferPtr, const char *BufferEnd) {
  const char *Cur = BufferPtr;
  while (Cur != BufferEnd && *Cur != '\n' && *Cur != '\r')
    ++Cur;
  return Cur;
}
128
/// Skips one newline sequence ("\n", "\r" or "\r\n") at \p BufferPtr.
///
/// The caller must pass a pointer to a newline character unless the range is
/// empty, in which case \p BufferPtr is returned unchanged.
///
/// \returns a pointer just past the newline sequence.
const char *skipNewline(const char *BufferPtr, const char *BufferEnd) {
  if (BufferPtr == BufferEnd)
    return BufferPtr;

  const char First = *BufferPtr;
  ++BufferPtr;
  if (First == '\n')
    return BufferPtr;

  assert(First == '\r');
  // A carriage return may be followed by a line feed that belongs to the same
  // newline.
  if (BufferPtr != BufferEnd && *BufferPtr == '\n')
    ++BufferPtr;
  return BufferPtr;
}
143
skipNamedCharacterReference(const char * BufferPtr,const char * BufferEnd)144 const char *skipNamedCharacterReference(const char *BufferPtr,
145 const char *BufferEnd) {
146 for ( ; BufferPtr != BufferEnd; ++BufferPtr) {
147 if (!isHTMLNamedCharacterReferenceCharacter(*BufferPtr))
148 return BufferPtr;
149 }
150 return BufferEnd;
151 }
152
skipDecimalCharacterReference(const char * BufferPtr,const char * BufferEnd)153 const char *skipDecimalCharacterReference(const char *BufferPtr,
154 const char *BufferEnd) {
155 for ( ; BufferPtr != BufferEnd; ++BufferPtr) {
156 if (!isHTMLDecimalCharacterReferenceCharacter(*BufferPtr))
157 return BufferPtr;
158 }
159 return BufferEnd;
160 }
161
skipHexCharacterReference(const char * BufferPtr,const char * BufferEnd)162 const char *skipHexCharacterReference(const char *BufferPtr,
163 const char *BufferEnd) {
164 for ( ; BufferPtr != BufferEnd; ++BufferPtr) {
165 if (!isHTMLHexCharacterReferenceCharacter(*BufferPtr))
166 return BufferPtr;
167 }
168 return BufferEnd;
169 }
170
/// Returns true if \p C can begin an HTML identifier, i.e. it lies in 'a'..'z'
/// or 'A'..'Z'.
bool isHTMLIdentifierStartingCharacter(char C) {
  if ('a' <= C && C <= 'z')
    return true;
  return 'A' <= C && C <= 'Z';
}
175
/// Returns true if \p C can appear inside an HTML identifier, i.e. it lies in
/// 'a'..'z', 'A'..'Z' or '0'..'9'.
bool isHTMLIdentifierCharacter(char C) {
  const bool IsLower = 'a' <= C && C <= 'z';
  const bool IsUpper = 'A' <= C && C <= 'Z';
  const bool IsDigit = '0' <= C && C <= '9';
  return IsLower || IsUpper || IsDigit;
}
181
skipHTMLIdentifier(const char * BufferPtr,const char * BufferEnd)182 const char *skipHTMLIdentifier(const char *BufferPtr, const char *BufferEnd) {
183 for ( ; BufferPtr != BufferEnd; ++BufferPtr) {
184 if (!isHTMLIdentifierCharacter(*BufferPtr))
185 return BufferPtr;
186 }
187 return BufferEnd;
188 }
189
/// Skips an HTML string enclosed in single or double quotes.
///
/// \p BufferPtr must point at the opening quote.  A quote character that is
/// immediately preceded by a backslash does not terminate the string.
///
/// \returns a pointer to the closing quote, or BufferEnd if the string is not
/// terminated.
const char *skipHTMLQuotedString(const char *BufferPtr, const char *BufferEnd)
{
  const char Quote = *BufferPtr;
  assert(Quote == '\"' || Quote == '\'');

  const char *Cur = BufferPtr + 1;
  while (Cur != BufferEnd) {
    if (*Cur == Quote && Cur[-1] != '\\')
      return Cur;
    ++Cur;
  }
  return BufferEnd;
}
207
/// Returns true if \p C is a space, horizontal tab, form feed or vertical tab.
bool isHorizontalWhitespace(char C) {
  switch (C) {
  case ' ':
  case '\t':
  case '\f':
  case '\v':
    return true;
  default:
    return false;
  }
}
211
/// Returns true if \p C is a space, newline, carriage return, horizontal tab,
/// form feed or vertical tab.
bool isWhitespace(char C) {
  switch (C) {
  case ' ':
  case '\n':
  case '\r':
  case '\t':
  case '\f':
  case '\v':
    return true;
  default:
    return false;
  }
}
216
skipWhitespace(const char * BufferPtr,const char * BufferEnd)217 const char *skipWhitespace(const char *BufferPtr, const char *BufferEnd) {
218 for ( ; BufferPtr != BufferEnd; ++BufferPtr) {
219 if (!isWhitespace(*BufferPtr))
220 return BufferPtr;
221 }
222 return BufferEnd;
223 }
224
isWhitespace(const char * BufferPtr,const char * BufferEnd)225 bool isWhitespace(const char *BufferPtr, const char *BufferEnd) {
226 return skipWhitespace(BufferPtr, BufferEnd) == BufferEnd;
227 }
228
/// Returns true if \p C can appear in a command name, i.e. it lies in
/// 'a'..'z', 'A'..'Z' or '0'..'9'.
bool isCommandNameCharacter(char C) {
  if ('a' <= C && C <= 'z')
    return true;
  if ('A' <= C && C <= 'Z')
    return true;
  return '0' <= C && C <= '9';
}
234
skipCommandName(const char * BufferPtr,const char * BufferEnd)235 const char *skipCommandName(const char *BufferPtr, const char *BufferEnd) {
236 for ( ; BufferPtr != BufferEnd; ++BufferPtr) {
237 if (!isCommandNameCharacter(*BufferPtr))
238 return BufferPtr;
239 }
240 return BufferEnd;
241 }
242
243 /// Return the one past end pointer for BCPL comments.
244 /// Handles newlines escaped with backslash or trigraph for backslahs.
findBCPLCommentEnd(const char * BufferPtr,const char * BufferEnd)245 const char *findBCPLCommentEnd(const char *BufferPtr, const char *BufferEnd) {
246 const char *CurPtr = BufferPtr;
247 while (CurPtr != BufferEnd) {
248 char C = *CurPtr;
249 while (C != '\n' && C != '\r') {
250 CurPtr++;
251 if (CurPtr == BufferEnd)
252 return BufferEnd;
253 C = *CurPtr;
254 }
255 // We found a newline, check if it is escaped.
256 const char *EscapePtr = CurPtr - 1;
257 while(isHorizontalWhitespace(*EscapePtr))
258 EscapePtr--;
259
260 if (*EscapePtr == '\\' ||
261 (EscapePtr - 2 >= BufferPtr && EscapePtr[0] == '/' &&
262 EscapePtr[-1] == '?' && EscapePtr[-2] == '?')) {
263 // We found an escaped newline.
264 CurPtr = skipNewline(CurPtr, BufferEnd);
265 } else
266 return CurPtr; // Not an escaped newline.
267 }
268 return BufferEnd;
269 }
270
271 /// Return the one past end pointer for C comments.
272 /// Very dumb, does not handle escaped newlines or trigraphs.
findCCommentEnd(const char * BufferPtr,const char * BufferEnd)273 const char *findCCommentEnd(const char *BufferPtr, const char *BufferEnd) {
274 for ( ; BufferPtr != BufferEnd; ++BufferPtr) {
275 if (*BufferPtr == '*') {
276 assert(BufferPtr + 1 != BufferEnd);
277 if (*(BufferPtr + 1) == '/')
278 return BufferPtr;
279 }
280 }
281 llvm_unreachable("buffer end hit before '*/' was seen");
282 }
283 } // unnamed namespace
284
lexCommentText(Token & T)285 void Lexer::lexCommentText(Token &T) {
286 assert(CommentState == LCS_InsideBCPLComment ||
287 CommentState == LCS_InsideCComment);
288
289 switch (State) {
290 case LS_Normal:
291 break;
292 case LS_VerbatimBlockFirstLine:
293 lexVerbatimBlockFirstLine(T);
294 return;
295 case LS_VerbatimBlockBody:
296 lexVerbatimBlockBody(T);
297 return;
298 case LS_VerbatimLineText:
299 lexVerbatimLineText(T);
300 return;
301 case LS_HTMLStartTag:
302 lexHTMLStartTag(T);
303 return;
304 case LS_HTMLEndTag:
305 lexHTMLEndTag(T);
306 return;
307 }
308
309 assert(State == LS_Normal);
310
311 const char *TokenPtr = BufferPtr;
312 assert(TokenPtr < CommentEnd);
313 while (TokenPtr != CommentEnd) {
314 switch(*TokenPtr) {
315 case '\\':
316 case '@': {
317 TokenPtr++;
318 if (TokenPtr == CommentEnd) {
319 formTextToken(T, TokenPtr);
320 return;
321 }
322 char C = *TokenPtr;
323 switch (C) {
324 default:
325 break;
326
327 case '\\': case '@': case '&': case '$':
328 case '#': case '<': case '>': case '%':
329 case '\"': case '.': case ':':
330 // This is one of \\ \@ \& \$ etc escape sequences.
331 TokenPtr++;
332 if (C == ':' && TokenPtr != CommentEnd && *TokenPtr == ':') {
333 // This is the \:: escape sequence.
334 TokenPtr++;
335 }
336 StringRef UnescapedText(BufferPtr + 1, TokenPtr - (BufferPtr + 1));
337 formTokenWithChars(T, TokenPtr, tok::text);
338 T.setText(UnescapedText);
339 return;
340 }
341
342 // Don't make zero-length commands.
343 if (!isCommandNameCharacter(*TokenPtr)) {
344 formTextToken(T, TokenPtr);
345 return;
346 }
347
348 TokenPtr = skipCommandName(TokenPtr, CommentEnd);
349 unsigned Length = TokenPtr - (BufferPtr + 1);
350
351 // Hardcoded support for lexing LaTeX formula commands
352 // \f$ \f[ \f] \f{ \f} as a single command.
353 if (Length == 1 && TokenPtr[-1] == 'f' && TokenPtr != CommentEnd) {
354 C = *TokenPtr;
355 if (C == '$' || C == '[' || C == ']' || C == '{' || C == '}') {
356 TokenPtr++;
357 Length++;
358 }
359 }
360
361 const StringRef CommandName(BufferPtr + 1, Length);
362
363 const CommandInfo *Info = Traits.getCommandInfoOrNULL(CommandName);
364 if (!Info) {
365 formTokenWithChars(T, TokenPtr, tok::unknown_command);
366 T.setUnknownCommandName(CommandName);
367 return;
368 }
369 if (Info->IsVerbatimBlockCommand) {
370 setupAndLexVerbatimBlock(T, TokenPtr, *BufferPtr, Info);
371 return;
372 }
373 if (Info->IsVerbatimLineCommand) {
374 setupAndLexVerbatimLine(T, TokenPtr, Info);
375 return;
376 }
377 formTokenWithChars(T, TokenPtr, tok::command);
378 T.setCommandID(Info->getID());
379 return;
380 }
381
382 case '&':
383 lexHTMLCharacterReference(T);
384 return;
385
386 case '<': {
387 TokenPtr++;
388 if (TokenPtr == CommentEnd) {
389 formTextToken(T, TokenPtr);
390 return;
391 }
392 const char C = *TokenPtr;
393 if (isHTMLIdentifierStartingCharacter(C))
394 setupAndLexHTMLStartTag(T);
395 else if (C == '/')
396 setupAndLexHTMLEndTag(T);
397 else
398 formTextToken(T, TokenPtr);
399
400 return;
401 }
402
403 case '\n':
404 case '\r':
405 TokenPtr = skipNewline(TokenPtr, CommentEnd);
406 formTokenWithChars(T, TokenPtr, tok::newline);
407
408 if (CommentState == LCS_InsideCComment)
409 skipLineStartingDecorations();
410 return;
411
412 default: {
413 while (true) {
414 TokenPtr++;
415 if (TokenPtr == CommentEnd)
416 break;
417 const char C = *TokenPtr;
418 if(C == '\n' || C == '\r' ||
419 C == '\\' || C == '@' || C == '&' || C == '<')
420 break;
421 }
422 formTextToken(T, TokenPtr);
423 return;
424 }
425 }
426 }
427 }
428
setupAndLexVerbatimBlock(Token & T,const char * TextBegin,char Marker,const CommandInfo * Info)429 void Lexer::setupAndLexVerbatimBlock(Token &T,
430 const char *TextBegin,
431 char Marker, const CommandInfo *Info) {
432 assert(Info->IsVerbatimBlockCommand);
433
434 VerbatimBlockEndCommandName.clear();
435 VerbatimBlockEndCommandName.append(Marker == '\\' ? "\\" : "@");
436 VerbatimBlockEndCommandName.append(Info->EndCommandName);
437
438 formTokenWithChars(T, TextBegin, tok::verbatim_block_begin);
439 T.setVerbatimBlockID(Info->getID());
440
441 // If there is a newline following the verbatim opening command, skip the
442 // newline so that we don't create an tok::verbatim_block_line with empty
443 // text content.
444 if (BufferPtr != CommentEnd) {
445 const char C = *BufferPtr;
446 if (C == '\n' || C == '\r') {
447 BufferPtr = skipNewline(BufferPtr, CommentEnd);
448 State = LS_VerbatimBlockBody;
449 return;
450 }
451 }
452
453 State = LS_VerbatimBlockFirstLine;
454 }
455
lexVerbatimBlockFirstLine(Token & T)456 void Lexer::lexVerbatimBlockFirstLine(Token &T) {
457 again:
458 assert(BufferPtr < CommentEnd);
459
460 // FIXME: It would be better to scan the text once, finding either the block
461 // end command or newline.
462 //
463 // Extract current line.
464 const char *Newline = findNewline(BufferPtr, CommentEnd);
465 StringRef Line(BufferPtr, Newline - BufferPtr);
466
467 // Look for end command in current line.
468 size_t Pos = Line.find(VerbatimBlockEndCommandName);
469 const char *TextEnd;
470 const char *NextLine;
471 if (Pos == StringRef::npos) {
472 // Current line is completely verbatim.
473 TextEnd = Newline;
474 NextLine = skipNewline(Newline, CommentEnd);
475 } else if (Pos == 0) {
476 // Current line contains just an end command.
477 const char *End = BufferPtr + VerbatimBlockEndCommandName.size();
478 StringRef Name(BufferPtr + 1, End - (BufferPtr + 1));
479 formTokenWithChars(T, End, tok::verbatim_block_end);
480 T.setVerbatimBlockID(Traits.getCommandInfo(Name)->getID());
481 State = LS_Normal;
482 return;
483 } else {
484 // There is some text, followed by end command. Extract text first.
485 TextEnd = BufferPtr + Pos;
486 NextLine = TextEnd;
487 // If there is only whitespace before end command, skip whitespace.
488 if (isWhitespace(BufferPtr, TextEnd)) {
489 BufferPtr = TextEnd;
490 goto again;
491 }
492 }
493
494 StringRef Text(BufferPtr, TextEnd - BufferPtr);
495 formTokenWithChars(T, NextLine, tok::verbatim_block_line);
496 T.setVerbatimBlockText(Text);
497
498 State = LS_VerbatimBlockBody;
499 }
500
lexVerbatimBlockBody(Token & T)501 void Lexer::lexVerbatimBlockBody(Token &T) {
502 assert(State == LS_VerbatimBlockBody);
503
504 if (CommentState == LCS_InsideCComment)
505 skipLineStartingDecorations();
506
507 lexVerbatimBlockFirstLine(T);
508 }
509
setupAndLexVerbatimLine(Token & T,const char * TextBegin,const CommandInfo * Info)510 void Lexer::setupAndLexVerbatimLine(Token &T, const char *TextBegin,
511 const CommandInfo *Info) {
512 assert(Info->IsVerbatimLineCommand);
513 formTokenWithChars(T, TextBegin, tok::verbatim_line_name);
514 T.setVerbatimLineID(Info->getID());
515
516 State = LS_VerbatimLineText;
517 }
518
lexVerbatimLineText(Token & T)519 void Lexer::lexVerbatimLineText(Token &T) {
520 assert(State == LS_VerbatimLineText);
521
522 // Extract current line.
523 const char *Newline = findNewline(BufferPtr, CommentEnd);
524 const StringRef Text(BufferPtr, Newline - BufferPtr);
525 formTokenWithChars(T, Newline, tok::verbatim_line_text);
526 T.setVerbatimLineText(Text);
527
528 State = LS_Normal;
529 }
530
lexHTMLCharacterReference(Token & T)531 void Lexer::lexHTMLCharacterReference(Token &T) {
532 const char *TokenPtr = BufferPtr;
533 assert(*TokenPtr == '&');
534 TokenPtr++;
535 if (TokenPtr == CommentEnd) {
536 formTextToken(T, TokenPtr);
537 return;
538 }
539 const char *NamePtr;
540 bool isNamed = false;
541 bool isDecimal = false;
542 char C = *TokenPtr;
543 if (isHTMLNamedCharacterReferenceCharacter(C)) {
544 NamePtr = TokenPtr;
545 TokenPtr = skipNamedCharacterReference(TokenPtr, CommentEnd);
546 isNamed = true;
547 } else if (C == '#') {
548 TokenPtr++;
549 if (TokenPtr == CommentEnd) {
550 formTextToken(T, TokenPtr);
551 return;
552 }
553 C = *TokenPtr;
554 if (isHTMLDecimalCharacterReferenceCharacter(C)) {
555 NamePtr = TokenPtr;
556 TokenPtr = skipDecimalCharacterReference(TokenPtr, CommentEnd);
557 isDecimal = true;
558 } else if (C == 'x' || C == 'X') {
559 TokenPtr++;
560 NamePtr = TokenPtr;
561 TokenPtr = skipHexCharacterReference(TokenPtr, CommentEnd);
562 } else {
563 formTextToken(T, TokenPtr);
564 return;
565 }
566 } else {
567 formTextToken(T, TokenPtr);
568 return;
569 }
570 if (NamePtr == TokenPtr || TokenPtr == CommentEnd ||
571 *TokenPtr != ';') {
572 formTextToken(T, TokenPtr);
573 return;
574 }
575 StringRef Name(NamePtr, TokenPtr - NamePtr);
576 TokenPtr++; // Skip semicolon.
577 StringRef Resolved;
578 if (isNamed)
579 Resolved = resolveHTMLNamedCharacterReference(Name);
580 else if (isDecimal)
581 Resolved = resolveHTMLDecimalCharacterReference(Name);
582 else
583 Resolved = resolveHTMLHexCharacterReference(Name);
584
585 if (Resolved.empty()) {
586 formTextToken(T, TokenPtr);
587 return;
588 }
589 formTokenWithChars(T, TokenPtr, tok::text);
590 T.setText(Resolved);
591 return;
592 }
593
setupAndLexHTMLStartTag(Token & T)594 void Lexer::setupAndLexHTMLStartTag(Token &T) {
595 assert(BufferPtr[0] == '<' &&
596 isHTMLIdentifierStartingCharacter(BufferPtr[1]));
597 const char *TagNameEnd = skipHTMLIdentifier(BufferPtr + 2, CommentEnd);
598 StringRef Name(BufferPtr + 1, TagNameEnd - (BufferPtr + 1));
599 if (!isHTMLTagName(Name)) {
600 formTextToken(T, TagNameEnd);
601 return;
602 }
603
604 formTokenWithChars(T, TagNameEnd, tok::html_start_tag);
605 T.setHTMLTagStartName(Name);
606
607 BufferPtr = skipWhitespace(BufferPtr, CommentEnd);
608
609 const char C = *BufferPtr;
610 if (BufferPtr != CommentEnd &&
611 (C == '>' || C == '/' || isHTMLIdentifierStartingCharacter(C)))
612 State = LS_HTMLStartTag;
613 }
614
lexHTMLStartTag(Token & T)615 void Lexer::lexHTMLStartTag(Token &T) {
616 assert(State == LS_HTMLStartTag);
617
618 const char *TokenPtr = BufferPtr;
619 char C = *TokenPtr;
620 if (isHTMLIdentifierCharacter(C)) {
621 TokenPtr = skipHTMLIdentifier(TokenPtr, CommentEnd);
622 StringRef Ident(BufferPtr, TokenPtr - BufferPtr);
623 formTokenWithChars(T, TokenPtr, tok::html_ident);
624 T.setHTMLIdent(Ident);
625 } else {
626 switch (C) {
627 case '=':
628 TokenPtr++;
629 formTokenWithChars(T, TokenPtr, tok::html_equals);
630 break;
631 case '\"':
632 case '\'': {
633 const char *OpenQuote = TokenPtr;
634 TokenPtr = skipHTMLQuotedString(TokenPtr, CommentEnd);
635 const char *ClosingQuote = TokenPtr;
636 if (TokenPtr != CommentEnd) // Skip closing quote.
637 TokenPtr++;
638 formTokenWithChars(T, TokenPtr, tok::html_quoted_string);
639 T.setHTMLQuotedString(StringRef(OpenQuote + 1,
640 ClosingQuote - (OpenQuote + 1)));
641 break;
642 }
643 case '>':
644 TokenPtr++;
645 formTokenWithChars(T, TokenPtr, tok::html_greater);
646 State = LS_Normal;
647 return;
648 case '/':
649 TokenPtr++;
650 if (TokenPtr != CommentEnd && *TokenPtr == '>') {
651 TokenPtr++;
652 formTokenWithChars(T, TokenPtr, tok::html_slash_greater);
653 } else
654 formTextToken(T, TokenPtr);
655
656 State = LS_Normal;
657 return;
658 }
659 }
660
661 // Now look ahead and return to normal state if we don't see any HTML tokens
662 // ahead.
663 BufferPtr = skipWhitespace(BufferPtr, CommentEnd);
664 if (BufferPtr == CommentEnd) {
665 State = LS_Normal;
666 return;
667 }
668
669 C = *BufferPtr;
670 if (!isHTMLIdentifierStartingCharacter(C) &&
671 C != '=' && C != '\"' && C != '\'' && C != '>') {
672 State = LS_Normal;
673 return;
674 }
675 }
676
setupAndLexHTMLEndTag(Token & T)677 void Lexer::setupAndLexHTMLEndTag(Token &T) {
678 assert(BufferPtr[0] == '<' && BufferPtr[1] == '/');
679
680 const char *TagNameBegin = skipWhitespace(BufferPtr + 2, CommentEnd);
681 const char *TagNameEnd = skipHTMLIdentifier(TagNameBegin, CommentEnd);
682 StringRef Name(TagNameBegin, TagNameEnd - TagNameBegin);
683 if (!isHTMLTagName(Name)) {
684 formTextToken(T, TagNameEnd);
685 return;
686 }
687
688 const char *End = skipWhitespace(TagNameEnd, CommentEnd);
689
690 formTokenWithChars(T, End, tok::html_end_tag);
691 T.setHTMLTagEndName(Name);
692
693 if (BufferPtr != CommentEnd && *BufferPtr == '>')
694 State = LS_HTMLEndTag;
695 }
696
lexHTMLEndTag(Token & T)697 void Lexer::lexHTMLEndTag(Token &T) {
698 assert(BufferPtr != CommentEnd && *BufferPtr == '>');
699
700 formTokenWithChars(T, BufferPtr + 1, tok::html_greater);
701 State = LS_Normal;
702 }
703
Lexer(llvm::BumpPtrAllocator & Allocator,const CommandTraits & Traits,SourceLocation FileLoc,const char * BufferStart,const char * BufferEnd)704 Lexer::Lexer(llvm::BumpPtrAllocator &Allocator, const CommandTraits &Traits,
705 SourceLocation FileLoc,
706 const char *BufferStart, const char *BufferEnd):
707 Allocator(Allocator), Traits(Traits),
708 BufferStart(BufferStart), BufferEnd(BufferEnd),
709 FileLoc(FileLoc), BufferPtr(BufferStart),
710 CommentState(LCS_BeforeComment), State(LS_Normal) {
711 }
712
lex(Token & T)713 void Lexer::lex(Token &T) {
714 again:
715 switch (CommentState) {
716 case LCS_BeforeComment:
717 if (BufferPtr == BufferEnd) {
718 formTokenWithChars(T, BufferPtr, tok::eof);
719 return;
720 }
721
722 assert(*BufferPtr == '/');
723 BufferPtr++; // Skip first slash.
724 switch(*BufferPtr) {
725 case '/': { // BCPL comment.
726 BufferPtr++; // Skip second slash.
727
728 if (BufferPtr != BufferEnd) {
729 // Skip Doxygen magic marker, if it is present.
730 // It might be missing because of a typo //< or /*<, or because we
731 // merged this non-Doxygen comment into a bunch of Doxygen comments
732 // around it: /** ... */ /* ... */ /** ... */
733 const char C = *BufferPtr;
734 if (C == '/' || C == '!')
735 BufferPtr++;
736 }
737
738 // Skip less-than symbol that marks trailing comments.
739 // Skip it even if the comment is not a Doxygen one, because //< and /*<
740 // are frequent typos.
741 if (BufferPtr != BufferEnd && *BufferPtr == '<')
742 BufferPtr++;
743
744 CommentState = LCS_InsideBCPLComment;
745 if (State != LS_VerbatimBlockBody && State != LS_VerbatimBlockFirstLine)
746 State = LS_Normal;
747 CommentEnd = findBCPLCommentEnd(BufferPtr, BufferEnd);
748 goto again;
749 }
750 case '*': { // C comment.
751 BufferPtr++; // Skip star.
752
753 // Skip Doxygen magic marker.
754 const char C = *BufferPtr;
755 if ((C == '*' && *(BufferPtr + 1) != '/') || C == '!')
756 BufferPtr++;
757
758 // Skip less-than symbol that marks trailing comments.
759 if (BufferPtr != BufferEnd && *BufferPtr == '<')
760 BufferPtr++;
761
762 CommentState = LCS_InsideCComment;
763 State = LS_Normal;
764 CommentEnd = findCCommentEnd(BufferPtr, BufferEnd);
765 goto again;
766 }
767 default:
768 llvm_unreachable("second character of comment should be '/' or '*'");
769 }
770
771 case LCS_BetweenComments: {
772 // Consecutive comments are extracted only if there is only whitespace
773 // between them. So we can search for the start of the next comment.
774 const char *EndWhitespace = BufferPtr;
775 while(EndWhitespace != BufferEnd && *EndWhitespace != '/')
776 EndWhitespace++;
777
778 // Turn any whitespace between comments (and there is only whitespace
779 // between them -- guaranteed by comment extraction) into a newline. We
780 // have two newlines between C comments in total (first one was synthesized
781 // after a comment).
782 formTokenWithChars(T, EndWhitespace, tok::newline);
783
784 CommentState = LCS_BeforeComment;
785 break;
786 }
787
788 case LCS_InsideBCPLComment:
789 case LCS_InsideCComment:
790 if (BufferPtr != CommentEnd) {
791 lexCommentText(T);
792 break;
793 } else {
794 // Skip C comment closing sequence.
795 if (CommentState == LCS_InsideCComment) {
796 assert(BufferPtr[0] == '*' && BufferPtr[1] == '/');
797 BufferPtr += 2;
798 assert(BufferPtr <= BufferEnd);
799
800 // Synthenize newline just after the C comment, regardless if there is
801 // actually a newline.
802 formTokenWithChars(T, BufferPtr, tok::newline);
803
804 CommentState = LCS_BetweenComments;
805 break;
806 } else {
807 // Don't synthesized a newline after BCPL comment.
808 CommentState = LCS_BetweenComments;
809 goto again;
810 }
811 }
812 }
813 }
814
getSpelling(const Token & Tok,const SourceManager & SourceMgr,bool * Invalid) const815 StringRef Lexer::getSpelling(const Token &Tok,
816 const SourceManager &SourceMgr,
817 bool *Invalid) const {
818 SourceLocation Loc = Tok.getLocation();
819 std::pair<FileID, unsigned> LocInfo = SourceMgr.getDecomposedLoc(Loc);
820
821 bool InvalidTemp = false;
822 StringRef File = SourceMgr.getBufferData(LocInfo.first, &InvalidTemp);
823 if (InvalidTemp) {
824 *Invalid = true;
825 return StringRef();
826 }
827
828 const char *Begin = File.data() + LocInfo.second;
829 return StringRef(Begin, Tok.getLength());
830 }
831
832 } // end namespace comments
833 } // end namespace clang
834
835