1 //===- TGLexer.cpp - Lexer for TableGen -----------------------------------===//
2 //
3 // The LLVM Compiler Infrastructure
4 //
5 // This file is distributed under the University of Illinois Open Source
6 // License. See LICENSE.TXT for details.
7 //
8 //===----------------------------------------------------------------------===//
9 //
10 // Implement the Lexer for TableGen.
11 //
12 //===----------------------------------------------------------------------===//
13
14 #include "TGLexer.h"
15 #include "Error.h"
16 #include "llvm/Support/SourceMgr.h"
17 #include "llvm/Support/MemoryBuffer.h"
18 #include "llvm/Config/config.h"
19 #include "llvm/ADT/StringSwitch.h"
20 #include "llvm/ADT/Twine.h"
21 #include <cctype>
22 #include <cstdio>
23 #include <cstdlib>
24 #include <cstring>
25 #include <cerrno>
26 using namespace llvm;
27
TGLexer(SourceMgr & SM)28 TGLexer::TGLexer(SourceMgr &SM) : SrcMgr(SM) {
29 CurBuffer = 0;
30 CurBuf = SrcMgr.getMemoryBuffer(CurBuffer);
31 CurPtr = CurBuf->getBufferStart();
32 TokStart = 0;
33 }
34
getLoc() const35 SMLoc TGLexer::getLoc() const {
36 return SMLoc::getFromPointer(TokStart);
37 }
38
39 /// ReturnError - Set the error to the specified string at the specified
40 /// location. This is defined to always return tgtok::Error.
ReturnError(const char * Loc,const Twine & Msg)41 tgtok::TokKind TGLexer::ReturnError(const char *Loc, const Twine &Msg) {
42 PrintError(Loc, Msg);
43 return tgtok::Error;
44 }
45
getNextChar()46 int TGLexer::getNextChar() {
47 char CurChar = *CurPtr++;
48 switch (CurChar) {
49 default:
50 return (unsigned char)CurChar;
51 case 0: {
52 // A nul character in the stream is either the end of the current buffer or
53 // a random nul in the file. Disambiguate that here.
54 if (CurPtr-1 != CurBuf->getBufferEnd())
55 return 0; // Just whitespace.
56
57 // If this is the end of an included file, pop the parent file off the
58 // include stack.
59 SMLoc ParentIncludeLoc = SrcMgr.getParentIncludeLoc(CurBuffer);
60 if (ParentIncludeLoc != SMLoc()) {
61 CurBuffer = SrcMgr.FindBufferContainingLoc(ParentIncludeLoc);
62 CurBuf = SrcMgr.getMemoryBuffer(CurBuffer);
63 CurPtr = ParentIncludeLoc.getPointer();
64 return getNextChar();
65 }
66
67 // Otherwise, return end of file.
68 --CurPtr; // Another call to lex will return EOF again.
69 return EOF;
70 }
71 case '\n':
72 case '\r':
73 // Handle the newline character by ignoring it and incrementing the line
74 // count. However, be careful about 'dos style' files with \n\r in them.
75 // Only treat a \n\r or \r\n as a single line.
76 if ((*CurPtr == '\n' || (*CurPtr == '\r')) &&
77 *CurPtr != CurChar)
78 ++CurPtr; // Eat the two char newline sequence.
79 return '\n';
80 }
81 }
82
LexToken()83 tgtok::TokKind TGLexer::LexToken() {
84 TokStart = CurPtr;
85 // This always consumes at least one character.
86 int CurChar = getNextChar();
87
88 switch (CurChar) {
89 default:
90 // Handle letters: [a-zA-Z_#]
91 if (isalpha(CurChar) || CurChar == '_' || CurChar == '#')
92 return LexIdentifier();
93
94 // Unknown character, emit an error.
95 return ReturnError(TokStart, "Unexpected character");
96 case EOF: return tgtok::Eof;
97 case ':': return tgtok::colon;
98 case ';': return tgtok::semi;
99 case '.': return tgtok::period;
100 case ',': return tgtok::comma;
101 case '<': return tgtok::less;
102 case '>': return tgtok::greater;
103 case ']': return tgtok::r_square;
104 case '{': return tgtok::l_brace;
105 case '}': return tgtok::r_brace;
106 case '(': return tgtok::l_paren;
107 case ')': return tgtok::r_paren;
108 case '=': return tgtok::equal;
109 case '?': return tgtok::question;
110
111 case 0:
112 case ' ':
113 case '\t':
114 case '\n':
115 case '\r':
116 // Ignore whitespace.
117 return LexToken();
118 case '/':
119 // If this is the start of a // comment, skip until the end of the line or
120 // the end of the buffer.
121 if (*CurPtr == '/')
122 SkipBCPLComment();
123 else if (*CurPtr == '*') {
124 if (SkipCComment())
125 return tgtok::Error;
126 } else // Otherwise, this is an error.
127 return ReturnError(TokStart, "Unexpected character");
128 return LexToken();
129 case '-': case '+':
130 case '0': case '1': case '2': case '3': case '4': case '5': case '6':
131 case '7': case '8': case '9':
132 return LexNumber();
133 case '"': return LexString();
134 case '$': return LexVarName();
135 case '[': return LexBracket();
136 case '!': return LexExclaim();
137 }
138 }
139
140 /// LexString - Lex "[^"]*"
LexString()141 tgtok::TokKind TGLexer::LexString() {
142 const char *StrStart = CurPtr;
143
144 CurStrVal = "";
145
146 while (*CurPtr != '"') {
147 // If we hit the end of the buffer, report an error.
148 if (*CurPtr == 0 && CurPtr == CurBuf->getBufferEnd())
149 return ReturnError(StrStart, "End of file in string literal");
150
151 if (*CurPtr == '\n' || *CurPtr == '\r')
152 return ReturnError(StrStart, "End of line in string literal");
153
154 if (*CurPtr != '\\') {
155 CurStrVal += *CurPtr++;
156 continue;
157 }
158
159 ++CurPtr;
160
161 switch (*CurPtr) {
162 case '\\': case '\'': case '"':
163 // These turn into their literal character.
164 CurStrVal += *CurPtr++;
165 break;
166 case 't':
167 CurStrVal += '\t';
168 ++CurPtr;
169 break;
170 case 'n':
171 CurStrVal += '\n';
172 ++CurPtr;
173 break;
174
175 case '\n':
176 case '\r':
177 return ReturnError(CurPtr, "escaped newlines not supported in tblgen");
178
179 // If we hit the end of the buffer, report an error.
180 case '\0':
181 if (CurPtr == CurBuf->getBufferEnd())
182 return ReturnError(StrStart, "End of file in string literal");
183 // FALL THROUGH
184 default:
185 return ReturnError(CurPtr, "invalid escape in string literal");
186 }
187 }
188
189 ++CurPtr;
190 return tgtok::StrVal;
191 }
192
LexVarName()193 tgtok::TokKind TGLexer::LexVarName() {
194 if (!isalpha(CurPtr[0]) && CurPtr[0] != '_')
195 return ReturnError(TokStart, "Invalid variable name");
196
197 // Otherwise, we're ok, consume the rest of the characters.
198 const char *VarNameStart = CurPtr++;
199
200 while (isalpha(*CurPtr) || isdigit(*CurPtr) || *CurPtr == '_')
201 ++CurPtr;
202
203 CurStrVal.assign(VarNameStart, CurPtr);
204 return tgtok::VarName;
205 }
206
207
LexIdentifier()208 tgtok::TokKind TGLexer::LexIdentifier() {
209 // The first letter is [a-zA-Z_#].
210 const char *IdentStart = TokStart;
211
212 // Match the rest of the identifier regex: [0-9a-zA-Z_#]*
213 while (isalpha(*CurPtr) || isdigit(*CurPtr) || *CurPtr == '_' ||
214 *CurPtr == '#')
215 ++CurPtr;
216
217
218 // Check to see if this identifier is a keyword.
219 unsigned Len = CurPtr-IdentStart;
220
221 if (Len == 3 && !memcmp(IdentStart, "int", 3)) return tgtok::Int;
222 if (Len == 3 && !memcmp(IdentStart, "bit", 3)) return tgtok::Bit;
223 if (Len == 4 && !memcmp(IdentStart, "bits", 4)) return tgtok::Bits;
224 if (Len == 6 && !memcmp(IdentStart, "string", 6)) return tgtok::String;
225 if (Len == 4 && !memcmp(IdentStart, "list", 4)) return tgtok::List;
226 if (Len == 4 && !memcmp(IdentStart, "code", 4)) return tgtok::Code;
227 if (Len == 3 && !memcmp(IdentStart, "dag", 3)) return tgtok::Dag;
228
229 if (Len == 5 && !memcmp(IdentStart, "class", 5)) return tgtok::Class;
230 if (Len == 3 && !memcmp(IdentStart, "def", 3)) return tgtok::Def;
231 if (Len == 4 && !memcmp(IdentStart, "defm", 4)) return tgtok::Defm;
232 if (Len == 10 && !memcmp(IdentStart, "multiclass", 10))
233 return tgtok::MultiClass;
234 if (Len == 5 && !memcmp(IdentStart, "field", 5)) return tgtok::Field;
235 if (Len == 3 && !memcmp(IdentStart, "let", 3)) return tgtok::Let;
236 if (Len == 2 && !memcmp(IdentStart, "in", 2)) return tgtok::In;
237
238 if (Len == 7 && !memcmp(IdentStart, "include", 7)) {
239 if (LexInclude()) return tgtok::Error;
240 return Lex();
241 }
242
243 CurStrVal.assign(IdentStart, CurPtr);
244 return tgtok::Id;
245 }
246
247 /// LexInclude - We just read the "include" token. Get the string token that
248 /// comes next and enter the include.
LexInclude()249 bool TGLexer::LexInclude() {
250 // The token after the include must be a string.
251 tgtok::TokKind Tok = LexToken();
252 if (Tok == tgtok::Error) return true;
253 if (Tok != tgtok::StrVal) {
254 PrintError(getLoc(), "Expected filename after include");
255 return true;
256 }
257
258 // Get the string.
259 std::string Filename = CurStrVal;
260 std::string IncludedFile;
261
262
263 CurBuffer = SrcMgr.AddIncludeFile(Filename, SMLoc::getFromPointer(CurPtr),
264 IncludedFile);
265 if (CurBuffer == -1) {
266 PrintError(getLoc(), "Could not find include file '" + Filename + "'");
267 return true;
268 }
269
270 Dependencies.push_back(IncludedFile);
271 // Save the line number and lex buffer of the includer.
272 CurBuf = SrcMgr.getMemoryBuffer(CurBuffer);
273 CurPtr = CurBuf->getBufferStart();
274 return false;
275 }
276
SkipBCPLComment()277 void TGLexer::SkipBCPLComment() {
278 ++CurPtr; // skip the second slash.
279 while (1) {
280 switch (*CurPtr) {
281 case '\n':
282 case '\r':
283 return; // Newline is end of comment.
284 case 0:
285 // If this is the end of the buffer, end the comment.
286 if (CurPtr == CurBuf->getBufferEnd())
287 return;
288 break;
289 }
290 // Otherwise, skip the character.
291 ++CurPtr;
292 }
293 }
294
295 /// SkipCComment - This skips C-style /**/ comments. The only difference from C
296 /// is that we allow nesting.
SkipCComment()297 bool TGLexer::SkipCComment() {
298 ++CurPtr; // skip the star.
299 unsigned CommentDepth = 1;
300
301 while (1) {
302 int CurChar = getNextChar();
303 switch (CurChar) {
304 case EOF:
305 PrintError(TokStart, "Unterminated comment!");
306 return true;
307 case '*':
308 // End of the comment?
309 if (CurPtr[0] != '/') break;
310
311 ++CurPtr; // End the */.
312 if (--CommentDepth == 0)
313 return false;
314 break;
315 case '/':
316 // Start of a nested comment?
317 if (CurPtr[0] != '*') break;
318 ++CurPtr;
319 ++CommentDepth;
320 break;
321 }
322 }
323 }
324
325 /// LexNumber - Lex:
326 /// [-+]?[0-9]+
327 /// 0x[0-9a-fA-F]+
328 /// 0b[01]+
LexNumber()329 tgtok::TokKind TGLexer::LexNumber() {
330 if (CurPtr[-1] == '0') {
331 if (CurPtr[0] == 'x') {
332 ++CurPtr;
333 const char *NumStart = CurPtr;
334 while (isxdigit(CurPtr[0]))
335 ++CurPtr;
336
337 // Requires at least one hex digit.
338 if (CurPtr == NumStart)
339 return ReturnError(TokStart, "Invalid hexadecimal number");
340
341 errno = 0;
342 CurIntVal = strtoll(NumStart, 0, 16);
343 if (errno == EINVAL)
344 return ReturnError(TokStart, "Invalid hexadecimal number");
345 if (errno == ERANGE) {
346 errno = 0;
347 CurIntVal = (int64_t)strtoull(NumStart, 0, 16);
348 if (errno == EINVAL)
349 return ReturnError(TokStart, "Invalid hexadecimal number");
350 if (errno == ERANGE)
351 return ReturnError(TokStart, "Hexadecimal number out of range");
352 }
353 return tgtok::IntVal;
354 } else if (CurPtr[0] == 'b') {
355 ++CurPtr;
356 const char *NumStart = CurPtr;
357 while (CurPtr[0] == '0' || CurPtr[0] == '1')
358 ++CurPtr;
359
360 // Requires at least one binary digit.
361 if (CurPtr == NumStart)
362 return ReturnError(CurPtr-2, "Invalid binary number");
363 CurIntVal = strtoll(NumStart, 0, 2);
364 return tgtok::IntVal;
365 }
366 }
367
368 // Check for a sign without a digit.
369 if (!isdigit(CurPtr[0])) {
370 if (CurPtr[-1] == '-')
371 return tgtok::minus;
372 else if (CurPtr[-1] == '+')
373 return tgtok::plus;
374 }
375
376 while (isdigit(CurPtr[0]))
377 ++CurPtr;
378 CurIntVal = strtoll(TokStart, 0, 10);
379 return tgtok::IntVal;
380 }
381
382 /// LexBracket - We just read '['. If this is a code block, return it,
383 /// otherwise return the bracket. Match: '[' and '[{ ( [^}]+ | }[^]] )* }]'
LexBracket()384 tgtok::TokKind TGLexer::LexBracket() {
385 if (CurPtr[0] != '{')
386 return tgtok::l_square;
387 ++CurPtr;
388 const char *CodeStart = CurPtr;
389 while (1) {
390 int Char = getNextChar();
391 if (Char == EOF) break;
392
393 if (Char != '}') continue;
394
395 Char = getNextChar();
396 if (Char == EOF) break;
397 if (Char == ']') {
398 CurStrVal.assign(CodeStart, CurPtr-2);
399 return tgtok::CodeFragment;
400 }
401 }
402
403 return ReturnError(CodeStart-2, "Unterminated Code Block");
404 }
405
406 /// LexExclaim - Lex '!' and '![a-zA-Z]+'.
LexExclaim()407 tgtok::TokKind TGLexer::LexExclaim() {
408 if (!isalpha(*CurPtr))
409 return ReturnError(CurPtr - 1, "Invalid \"!operator\"");
410
411 const char *Start = CurPtr++;
412 while (isalpha(*CurPtr))
413 ++CurPtr;
414
415 // Check to see which operator this is.
416 tgtok::TokKind Kind =
417 StringSwitch<tgtok::TokKind>(StringRef(Start, CurPtr - Start))
418 .Case("eq", tgtok::XEq)
419 .Case("if", tgtok::XIf)
420 .Case("head", tgtok::XHead)
421 .Case("tail", tgtok::XTail)
422 .Case("con", tgtok::XConcat)
423 .Case("shl", tgtok::XSHL)
424 .Case("sra", tgtok::XSRA)
425 .Case("srl", tgtok::XSRL)
426 .Case("cast", tgtok::XCast)
427 .Case("empty", tgtok::XEmpty)
428 .Case("subst", tgtok::XSubst)
429 .Case("foreach", tgtok::XForEach)
430 .Case("strconcat", tgtok::XStrConcat)
431 .Default(tgtok::Error);
432
433 return Kind != tgtok::Error ? Kind : ReturnError(Start-1, "Unknown operator");
434 }
435
436