1 //===- AsmLexer.cpp - Lexer for Assembly Files ----------------------------===//
2 //
3 // The LLVM Compiler Infrastructure
4 //
5 // This file is distributed under the University of Illinois Open Source
6 // License. See LICENSE.TXT for details.
7 //
8 //===----------------------------------------------------------------------===//
9 //
10 // This class implements the lexer for assembly files.
11 //
12 //===----------------------------------------------------------------------===//
13
14 #include "llvm/MC/MCParser/AsmLexer.h"
15 #include "llvm/MC/MCAsmInfo.h"
16 #include "llvm/Support/MemoryBuffer.h"
17 #include "llvm/Support/SMLoc.h"
18 #include <cctype>
19 #include <cerrno>
20 #include <cstdio>
21 #include <cstdlib>
22 using namespace llvm;
23
AsmLexer(const MCAsmInfo & _MAI)24 AsmLexer::AsmLexer(const MCAsmInfo &_MAI) : MAI(_MAI) {
25 CurPtr = nullptr;
26 isAtStartOfLine = true;
27 AllowAtInIdentifier = !StringRef(MAI.getCommentString()).startswith("@");
28 }
29
~AsmLexer()30 AsmLexer::~AsmLexer() {
31 }
32
setBuffer(StringRef Buf,const char * ptr)33 void AsmLexer::setBuffer(StringRef Buf, const char *ptr) {
34 CurBuf = Buf;
35
36 if (ptr)
37 CurPtr = ptr;
38 else
39 CurPtr = CurBuf.begin();
40
41 TokStart = nullptr;
42 }
43
44 /// ReturnError - Set the error to the specified string at the specified
45 /// location. This is defined to always return AsmToken::Error.
ReturnError(const char * Loc,const std::string & Msg)46 AsmToken AsmLexer::ReturnError(const char *Loc, const std::string &Msg) {
47 SetError(SMLoc::getFromPointer(Loc), Msg);
48
49 return AsmToken(AsmToken::Error, StringRef(Loc, 0));
50 }
51
getNextChar()52 int AsmLexer::getNextChar() {
53 char CurChar = *CurPtr++;
54 switch (CurChar) {
55 default:
56 return (unsigned char)CurChar;
57 case 0:
58 // A nul character in the stream is either the end of the current buffer or
59 // a random nul in the file. Disambiguate that here.
60 if (CurPtr - 1 != CurBuf.end())
61 return 0; // Just whitespace.
62
63 // Otherwise, return end of file.
64 --CurPtr; // Another call to lex will return EOF again.
65 return EOF;
66 }
67 }
68
69 /// LexFloatLiteral: [0-9]*[.][0-9]*([eE][+-]?[0-9]*)?
70 ///
71 /// The leading integral digit sequence and dot should have already been
72 /// consumed, some or all of the fractional digit sequence *can* have been
73 /// consumed.
LexFloatLiteral()74 AsmToken AsmLexer::LexFloatLiteral() {
75 // Skip the fractional digit sequence.
76 while (isdigit(*CurPtr))
77 ++CurPtr;
78
79 // Check for exponent; we intentionally accept a slighlty wider set of
80 // literals here and rely on the upstream client to reject invalid ones (e.g.,
81 // "1e+").
82 if (*CurPtr == 'e' || *CurPtr == 'E') {
83 ++CurPtr;
84 if (*CurPtr == '-' || *CurPtr == '+')
85 ++CurPtr;
86 while (isdigit(*CurPtr))
87 ++CurPtr;
88 }
89
90 return AsmToken(AsmToken::Real,
91 StringRef(TokStart, CurPtr - TokStart));
92 }
93
94 /// LexHexFloatLiteral matches essentially (.[0-9a-fA-F]*)?[pP][+-]?[0-9a-fA-F]+
95 /// while making sure there are enough actual digits around for the constant to
96 /// be valid.
97 ///
98 /// The leading "0x[0-9a-fA-F]*" (i.e. integer part) has already been consumed
99 /// before we get here.
LexHexFloatLiteral(bool NoIntDigits)100 AsmToken AsmLexer::LexHexFloatLiteral(bool NoIntDigits) {
101 assert((*CurPtr == 'p' || *CurPtr == 'P' || *CurPtr == '.') &&
102 "unexpected parse state in floating hex");
103 bool NoFracDigits = true;
104
105 // Skip the fractional part if there is one
106 if (*CurPtr == '.') {
107 ++CurPtr;
108
109 const char *FracStart = CurPtr;
110 while (isxdigit(*CurPtr))
111 ++CurPtr;
112
113 NoFracDigits = CurPtr == FracStart;
114 }
115
116 if (NoIntDigits && NoFracDigits)
117 return ReturnError(TokStart, "invalid hexadecimal floating-point constant: "
118 "expected at least one significand digit");
119
120 // Make sure we do have some kind of proper exponent part
121 if (*CurPtr != 'p' && *CurPtr != 'P')
122 return ReturnError(TokStart, "invalid hexadecimal floating-point constant: "
123 "expected exponent part 'p'");
124 ++CurPtr;
125
126 if (*CurPtr == '+' || *CurPtr == '-')
127 ++CurPtr;
128
129 // N.b. exponent digits are *not* hex
130 const char *ExpStart = CurPtr;
131 while (isdigit(*CurPtr))
132 ++CurPtr;
133
134 if (CurPtr == ExpStart)
135 return ReturnError(TokStart, "invalid hexadecimal floating-point constant: "
136 "expected at least one exponent digit");
137
138 return AsmToken(AsmToken::Real, StringRef(TokStart, CurPtr - TokStart));
139 }
140
/// Return true if \p c may appear after the first character of an identifier.
/// Identifiers match [a-zA-Z_.][a-zA-Z0-9_$.@?]*; '@' is accepted only when
/// \p AllowAt is set.
static bool IsIdentifierChar(char c, bool AllowAt) {
  // isalnum() is undefined for negative arguments other than EOF, so classify
  // the byte as an unsigned char.
  if (isalnum(static_cast<unsigned char>(c)))
    return true;

  switch (c) {
  case '_':
  case '$':
  case '.':
  case '?':
    return true;
  case '@':
    return AllowAt;
  default:
    return false;
  }
}
LexIdentifier()146 AsmToken AsmLexer::LexIdentifier() {
147 // Check for floating point literals.
148 if (CurPtr[-1] == '.' && isdigit(*CurPtr)) {
149 // Disambiguate a .1243foo identifier from a floating literal.
150 while (isdigit(*CurPtr))
151 ++CurPtr;
152 if (*CurPtr == 'e' || *CurPtr == 'E' ||
153 !IsIdentifierChar(*CurPtr, AllowAtInIdentifier))
154 return LexFloatLiteral();
155 }
156
157 while (IsIdentifierChar(*CurPtr, AllowAtInIdentifier))
158 ++CurPtr;
159
160 // Handle . as a special case.
161 if (CurPtr == TokStart+1 && TokStart[0] == '.')
162 return AsmToken(AsmToken::Dot, StringRef(TokStart, 1));
163
164 return AsmToken(AsmToken::Identifier, StringRef(TokStart, CurPtr - TokStart));
165 }
166
167 /// LexSlash: Slash: /
168 /// C-Style Comment: /* ... */
LexSlash()169 AsmToken AsmLexer::LexSlash() {
170 switch (*CurPtr) {
171 case '*': break; // C style comment.
172 case '/': return ++CurPtr, LexLineComment();
173 default: return AsmToken(AsmToken::Slash, StringRef(CurPtr-1, 1));
174 }
175
176 // C Style comment.
177 ++CurPtr; // skip the star.
178 while (1) {
179 int CurChar = getNextChar();
180 switch (CurChar) {
181 case EOF:
182 return ReturnError(TokStart, "unterminated comment");
183 case '*':
184 // End of the comment?
185 if (CurPtr[0] != '/') break;
186
187 ++CurPtr; // End the */.
188 return LexToken();
189 }
190 }
191 }
192
193 /// LexLineComment: Comment: #[^\n]*
194 /// : //[^\n]*
LexLineComment()195 AsmToken AsmLexer::LexLineComment() {
196 // FIXME: This is broken if we happen to a comment at the end of a file, which
197 // was .included, and which doesn't end with a newline.
198 int CurChar = getNextChar();
199 while (CurChar != '\n' && CurChar != '\r' && CurChar != EOF)
200 CurChar = getNextChar();
201
202 if (CurChar == EOF)
203 return AsmToken(AsmToken::Eof, StringRef(TokStart, 0));
204 return AsmToken(AsmToken::EndOfStatement, StringRef(TokStart, 0));
205 }
206
/// Advance \p CurPtr past an optional integer type suffix: at most one 'U'
/// followed by at most two 'L's (i.e. U, L, LL, UL or ULL).
static void SkipIgnoredIntegerSuffix(const char *&CurPtr) {
  static const char SuffixOrder[] = {'U', 'L', 'L'};
  for (const char Expected : SuffixOrder)
    if (*CurPtr == Expected)
      ++CurPtr;
}
216
/// Look ahead to the first character that is not a hex digit. If it is [hH],
/// the integer is treated as hexadecimal, possibly with leading zeroes.
///
/// \param CurPtr on entry the scan start. On exit it points at the [hH]
/// suffix for a hex literal, otherwise at the first non-decimal character
/// (the first [a-fA-F] seen, or the end of the digit run if there was none).
/// \param DefaultRadix radix to report when no [hH] suffix follows.
/// \returns 16 when a [hH] suffix was found, \p DefaultRadix otherwise.
static unsigned doLookAhead(const char *&CurPtr, unsigned DefaultRadix) {
  const char *FirstHex = nullptr;
  const char *LookAhead = CurPtr;
  for (;; ++LookAhead) {
    // isdigit()/isxdigit() are undefined for negative arguments other than
    // EOF, so classify the byte as an unsigned char.
    const unsigned char Ch = static_cast<unsigned char>(*LookAhead);
    if (isdigit(Ch))
      continue;
    if (!isxdigit(Ch))
      break;
    if (!FirstHex)
      FirstHex = LookAhead;
  }

  const bool isHex = *LookAhead == 'h' || *LookAhead == 'H';
  CurPtr = (isHex || !FirstHex) ? LookAhead : FirstHex;
  return isHex ? 16 : DefaultRadix;
}
239
intToken(StringRef Ref,APInt & Value)240 static AsmToken intToken(StringRef Ref, APInt &Value)
241 {
242 if (Value.isIntN(64))
243 return AsmToken(AsmToken::Integer, Ref, Value);
244 return AsmToken(AsmToken::BigNum, Ref, Value);
245 }
246
247 /// LexDigit: First character is [0-9].
248 /// Local Label: [0-9][:]
249 /// Forward/Backward Label: [0-9][fb]
250 /// Binary integer: 0b[01]+
251 /// Octal integer: 0[0-7]+
252 /// Hex integer: 0x[0-9a-fA-F]+ or [0x]?[0-9][0-9a-fA-F]*[hH]
253 /// Decimal integer: [1-9][0-9]*
LexDigit()254 AsmToken AsmLexer::LexDigit() {
255 // Decimal integer: [1-9][0-9]*
256 if (CurPtr[-1] != '0' || CurPtr[0] == '.') {
257 unsigned Radix = doLookAhead(CurPtr, 10);
258 bool isHex = Radix == 16;
259 // Check for floating point literals.
260 if (!isHex && (*CurPtr == '.' || *CurPtr == 'e')) {
261 ++CurPtr;
262 return LexFloatLiteral();
263 }
264
265 StringRef Result(TokStart, CurPtr - TokStart);
266
267 APInt Value(128, 0, true);
268 if (Result.getAsInteger(Radix, Value))
269 return ReturnError(TokStart, !isHex ? "invalid decimal number" :
270 "invalid hexdecimal number");
271
272 // Consume the [bB][hH].
273 if (Radix == 2 || Radix == 16)
274 ++CurPtr;
275
276 // The darwin/x86 (and x86-64) assembler accepts and ignores type
277 // suffices on integer literals.
278 SkipIgnoredIntegerSuffix(CurPtr);
279
280 return intToken(Result, Value);
281 }
282
283 if (*CurPtr == 'b') {
284 ++CurPtr;
285 // See if we actually have "0b" as part of something like "jmp 0b\n"
286 if (!isdigit(CurPtr[0])) {
287 --CurPtr;
288 StringRef Result(TokStart, CurPtr - TokStart);
289 return AsmToken(AsmToken::Integer, Result, 0);
290 }
291 const char *NumStart = CurPtr;
292 while (CurPtr[0] == '0' || CurPtr[0] == '1')
293 ++CurPtr;
294
295 // Requires at least one binary digit.
296 if (CurPtr == NumStart)
297 return ReturnError(TokStart, "invalid binary number");
298
299 StringRef Result(TokStart, CurPtr - TokStart);
300
301 APInt Value(128, 0, true);
302 if (Result.substr(2).getAsInteger(2, Value))
303 return ReturnError(TokStart, "invalid binary number");
304
305 // The darwin/x86 (and x86-64) assembler accepts and ignores ULL and LL
306 // suffixes on integer literals.
307 SkipIgnoredIntegerSuffix(CurPtr);
308
309 return intToken(Result, Value);
310 }
311
312 if (*CurPtr == 'x') {
313 ++CurPtr;
314 const char *NumStart = CurPtr;
315 while (isxdigit(CurPtr[0]))
316 ++CurPtr;
317
318 // "0x.0p0" is valid, and "0x0p0" (but not "0xp0" for example, which will be
319 // diagnosed by LexHexFloatLiteral).
320 if (CurPtr[0] == '.' || CurPtr[0] == 'p' || CurPtr[0] == 'P')
321 return LexHexFloatLiteral(NumStart == CurPtr);
322
323 // Otherwise requires at least one hex digit.
324 if (CurPtr == NumStart)
325 return ReturnError(CurPtr-2, "invalid hexadecimal number");
326
327 APInt Result(128, 0);
328 if (StringRef(TokStart, CurPtr - TokStart).getAsInteger(0, Result))
329 return ReturnError(TokStart, "invalid hexadecimal number");
330
331 // Consume the optional [hH].
332 if (*CurPtr == 'h' || *CurPtr == 'H')
333 ++CurPtr;
334
335 // The darwin/x86 (and x86-64) assembler accepts and ignores ULL and LL
336 // suffixes on integer literals.
337 SkipIgnoredIntegerSuffix(CurPtr);
338
339 return intToken(StringRef(TokStart, CurPtr - TokStart), Result);
340 }
341
342 // Either octal or hexadecimal.
343 APInt Value(128, 0, true);
344 unsigned Radix = doLookAhead(CurPtr, 8);
345 bool isHex = Radix == 16;
346 StringRef Result(TokStart, CurPtr - TokStart);
347 if (Result.getAsInteger(Radix, Value))
348 return ReturnError(TokStart, !isHex ? "invalid octal number" :
349 "invalid hexdecimal number");
350
351 // Consume the [hH].
352 if (Radix == 16)
353 ++CurPtr;
354
355 // The darwin/x86 (and x86-64) assembler accepts and ignores ULL and LL
356 // suffixes on integer literals.
357 SkipIgnoredIntegerSuffix(CurPtr);
358
359 return intToken(Result, Value);
360 }
361
362 /// LexSingleQuote: Integer: 'b'
LexSingleQuote()363 AsmToken AsmLexer::LexSingleQuote() {
364 int CurChar = getNextChar();
365
366 if (CurChar == '\\')
367 CurChar = getNextChar();
368
369 if (CurChar == EOF)
370 return ReturnError(TokStart, "unterminated single quote");
371
372 CurChar = getNextChar();
373
374 if (CurChar != '\'')
375 return ReturnError(TokStart, "single quote way too long");
376
377 // The idea here being that 'c' is basically just an integral
378 // constant.
379 StringRef Res = StringRef(TokStart,CurPtr - TokStart);
380 long long Value;
381
382 if (Res.startswith("\'\\")) {
383 char theChar = Res[2];
384 switch (theChar) {
385 default: Value = theChar; break;
386 case '\'': Value = '\''; break;
387 case 't': Value = '\t'; break;
388 case 'n': Value = '\n'; break;
389 case 'b': Value = '\b'; break;
390 }
391 } else
392 Value = TokStart[1];
393
394 return AsmToken(AsmToken::Integer, Res, Value);
395 }
396
397
398 /// LexQuote: String: "..."
LexQuote()399 AsmToken AsmLexer::LexQuote() {
400 int CurChar = getNextChar();
401 // TODO: does gas allow multiline string constants?
402 while (CurChar != '"') {
403 if (CurChar == '\\') {
404 // Allow \", etc.
405 CurChar = getNextChar();
406 }
407
408 if (CurChar == EOF)
409 return ReturnError(TokStart, "unterminated string constant");
410
411 CurChar = getNextChar();
412 }
413
414 return AsmToken(AsmToken::String, StringRef(TokStart, CurPtr - TokStart));
415 }
416
LexUntilEndOfStatement()417 StringRef AsmLexer::LexUntilEndOfStatement() {
418 TokStart = CurPtr;
419
420 while (!isAtStartOfComment(*CurPtr) && // Start of line comment.
421 !isAtStatementSeparator(CurPtr) && // End of statement marker.
422 *CurPtr != '\n' && *CurPtr != '\r' &&
423 (*CurPtr != 0 || CurPtr != CurBuf.end())) {
424 ++CurPtr;
425 }
426 return StringRef(TokStart, CurPtr-TokStart);
427 }
428
LexUntilEndOfLine()429 StringRef AsmLexer::LexUntilEndOfLine() {
430 TokStart = CurPtr;
431
432 while (*CurPtr != '\n' && *CurPtr != '\r' &&
433 (*CurPtr != 0 || CurPtr != CurBuf.end())) {
434 ++CurPtr;
435 }
436 return StringRef(TokStart, CurPtr-TokStart);
437 }
438
peekTok(bool ShouldSkipSpace)439 const AsmToken AsmLexer::peekTok(bool ShouldSkipSpace) {
440 const char *SavedTokStart = TokStart;
441 const char *SavedCurPtr = CurPtr;
442 bool SavedAtStartOfLine = isAtStartOfLine;
443 bool SavedSkipSpace = SkipSpace;
444
445 std::string SavedErr = getErr();
446 SMLoc SavedErrLoc = getErrLoc();
447
448 SkipSpace = ShouldSkipSpace;
449 AsmToken Token = LexToken();
450
451 SetError(SavedErrLoc, SavedErr);
452
453 SkipSpace = SavedSkipSpace;
454 isAtStartOfLine = SavedAtStartOfLine;
455 CurPtr = SavedCurPtr;
456 TokStart = SavedTokStart;
457
458 return Token;
459 }
460
isAtStartOfComment(char Char)461 bool AsmLexer::isAtStartOfComment(char Char) {
462 // FIXME: This won't work for multi-character comment indicators like "//".
463 return Char == *MAI.getCommentString();
464 }
465
isAtStatementSeparator(const char * Ptr)466 bool AsmLexer::isAtStatementSeparator(const char *Ptr) {
467 return strncmp(Ptr, MAI.getSeparatorString(),
468 strlen(MAI.getSeparatorString())) == 0;
469 }
470
LexToken()471 AsmToken AsmLexer::LexToken() {
472 TokStart = CurPtr;
473 // This always consumes at least one character.
474 int CurChar = getNextChar();
475
476 if (isAtStartOfComment(CurChar)) {
477 // If this comment starts with a '#', then return the Hash token and let
478 // the assembler parser see if it can be parsed as a cpp line filename
479 // comment. We do this only if we are at the start of a line.
480 if (CurChar == '#' && isAtStartOfLine)
481 return AsmToken(AsmToken::Hash, StringRef(TokStart, 1));
482 isAtStartOfLine = true;
483 return LexLineComment();
484 }
485 if (isAtStatementSeparator(TokStart)) {
486 CurPtr += strlen(MAI.getSeparatorString()) - 1;
487 return AsmToken(AsmToken::EndOfStatement,
488 StringRef(TokStart, strlen(MAI.getSeparatorString())));
489 }
490
491 // If we're missing a newline at EOF, make sure we still get an
492 // EndOfStatement token before the Eof token.
493 if (CurChar == EOF && !isAtStartOfLine) {
494 isAtStartOfLine = true;
495 return AsmToken(AsmToken::EndOfStatement, StringRef(TokStart, 1));
496 }
497
498 isAtStartOfLine = false;
499 switch (CurChar) {
500 default:
501 // Handle identifier: [a-zA-Z_.][a-zA-Z0-9_$.@]*
502 if (isalpha(CurChar) || CurChar == '_' || CurChar == '.')
503 return LexIdentifier();
504
505 // Unknown character, emit an error.
506 return ReturnError(TokStart, "invalid character in input");
507 case EOF: return AsmToken(AsmToken::Eof, StringRef(TokStart, 0));
508 case 0:
509 case ' ':
510 case '\t':
511 if (SkipSpace) {
512 // Ignore whitespace.
513 return LexToken();
514 } else {
515 int len = 1;
516 while (*CurPtr==' ' || *CurPtr=='\t') {
517 CurPtr++;
518 len++;
519 }
520 return AsmToken(AsmToken::Space, StringRef(TokStart, len));
521 }
522 case '\n': // FALL THROUGH.
523 case '\r':
524 isAtStartOfLine = true;
525 return AsmToken(AsmToken::EndOfStatement, StringRef(TokStart, 1));
526 case ':': return AsmToken(AsmToken::Colon, StringRef(TokStart, 1));
527 case '+': return AsmToken(AsmToken::Plus, StringRef(TokStart, 1));
528 case '-': return AsmToken(AsmToken::Minus, StringRef(TokStart, 1));
529 case '~': return AsmToken(AsmToken::Tilde, StringRef(TokStart, 1));
530 case '(': return AsmToken(AsmToken::LParen, StringRef(TokStart, 1));
531 case ')': return AsmToken(AsmToken::RParen, StringRef(TokStart, 1));
532 case '[': return AsmToken(AsmToken::LBrac, StringRef(TokStart, 1));
533 case ']': return AsmToken(AsmToken::RBrac, StringRef(TokStart, 1));
534 case '{': return AsmToken(AsmToken::LCurly, StringRef(TokStart, 1));
535 case '}': return AsmToken(AsmToken::RCurly, StringRef(TokStart, 1));
536 case '*': return AsmToken(AsmToken::Star, StringRef(TokStart, 1));
537 case ',': return AsmToken(AsmToken::Comma, StringRef(TokStart, 1));
538 case '$': return AsmToken(AsmToken::Dollar, StringRef(TokStart, 1));
539 case '@': return AsmToken(AsmToken::At, StringRef(TokStart, 1));
540 case '\\': return AsmToken(AsmToken::BackSlash, StringRef(TokStart, 1));
541 case '=':
542 if (*CurPtr == '=')
543 return ++CurPtr, AsmToken(AsmToken::EqualEqual, StringRef(TokStart, 2));
544 return AsmToken(AsmToken::Equal, StringRef(TokStart, 1));
545 case '|':
546 if (*CurPtr == '|')
547 return ++CurPtr, AsmToken(AsmToken::PipePipe, StringRef(TokStart, 2));
548 return AsmToken(AsmToken::Pipe, StringRef(TokStart, 1));
549 case '^': return AsmToken(AsmToken::Caret, StringRef(TokStart, 1));
550 case '&':
551 if (*CurPtr == '&')
552 return ++CurPtr, AsmToken(AsmToken::AmpAmp, StringRef(TokStart, 2));
553 return AsmToken(AsmToken::Amp, StringRef(TokStart, 1));
554 case '!':
555 if (*CurPtr == '=')
556 return ++CurPtr, AsmToken(AsmToken::ExclaimEqual, StringRef(TokStart, 2));
557 return AsmToken(AsmToken::Exclaim, StringRef(TokStart, 1));
558 case '%': return AsmToken(AsmToken::Percent, StringRef(TokStart, 1));
559 case '/': return LexSlash();
560 case '#': return AsmToken(AsmToken::Hash, StringRef(TokStart, 1));
561 case '\'': return LexSingleQuote();
562 case '"': return LexQuote();
563 case '0': case '1': case '2': case '3': case '4':
564 case '5': case '6': case '7': case '8': case '9':
565 return LexDigit();
566 case '<':
567 switch (*CurPtr) {
568 case '<': return ++CurPtr, AsmToken(AsmToken::LessLess,
569 StringRef(TokStart, 2));
570 case '=': return ++CurPtr, AsmToken(AsmToken::LessEqual,
571 StringRef(TokStart, 2));
572 case '>': return ++CurPtr, AsmToken(AsmToken::LessGreater,
573 StringRef(TokStart, 2));
574 default: return AsmToken(AsmToken::Less, StringRef(TokStart, 1));
575 }
576 case '>':
577 switch (*CurPtr) {
578 case '>': return ++CurPtr, AsmToken(AsmToken::GreaterGreater,
579 StringRef(TokStart, 2));
580 case '=': return ++CurPtr, AsmToken(AsmToken::GreaterEqual,
581 StringRef(TokStart, 2));
582 default: return AsmToken(AsmToken::Greater, StringRef(TokStart, 1));
583 }
584
585 // TODO: Quoted identifiers (objc methods etc)
586 // local labels: [0-9][:]
587 // Forward/backward labels: [0-9][fb]
588 // Integers, fp constants, character constants.
589 }
590 }
591