1 //===--- LiteralSupport.cpp - Code to parse and process literals ----------===//
2 //
3 // The LLVM Compiler Infrastructure
4 //
5 // This file is distributed under the University of Illinois Open Source
6 // License. See LICENSE.TXT for details.
7 //
8 //===----------------------------------------------------------------------===//
9 //
10 // This file implements the NumericLiteralParser, CharLiteralParser, and
11 // StringLiteralParser interfaces.
12 //
13 //===----------------------------------------------------------------------===//
14
15 #include "clang/Lex/LiteralSupport.h"
16 #include "clang/Lex/Preprocessor.h"
17 #include "clang/Lex/LexDiagnostic.h"
18 #include "clang/Basic/TargetInfo.h"
19 #include "clang/Basic/ConvertUTF.h"
20 #include "llvm/ADT/StringExtras.h"
21 #include "llvm/Support/ErrorHandling.h"
22 using namespace clang;
23
/// HexDigitValue - Translate a hexadecimal digit character into its numeric
/// value (0-15).  Any character outside [0-9a-fA-F] yields -1.
static int HexDigitValue(char C) {
  int Value = -1;
  if ('0' <= C && C <= '9')
    Value = C - '0';
  else if ('a' <= C && C <= 'f')
    Value = 10 + (C - 'a');
  else if ('A' <= C && C <= 'F')
    Value = 10 + (C - 'A');
  return Value;
}
32
getCharWidth(tok::TokenKind kind,const TargetInfo & Target)33 static unsigned getCharWidth(tok::TokenKind kind, const TargetInfo &Target) {
34 switch (kind) {
35 default: llvm_unreachable("Unknown token type!");
36 case tok::char_constant:
37 case tok::string_literal:
38 case tok::utf8_string_literal:
39 return Target.getCharWidth();
40 case tok::wide_char_constant:
41 case tok::wide_string_literal:
42 return Target.getWCharWidth();
43 case tok::utf16_char_constant:
44 case tok::utf16_string_literal:
45 return Target.getChar16Width();
46 case tok::utf32_char_constant:
47 case tok::utf32_string_literal:
48 return Target.getChar32Width();
49 }
50 }
51
52 /// \brief Produce a diagnostic highlighting some portion of a literal.
53 ///
54 /// Emits the diagnostic \p DiagID, highlighting the range of characters from
55 /// \p TokRangeBegin (inclusive) to \p TokRangeEnd (exclusive), which must be
56 /// a substring of a spelling buffer for the token beginning at \p TokBegin.
Diag(DiagnosticsEngine * Diags,const LangOptions & Features,FullSourceLoc TokLoc,const char * TokBegin,const char * TokRangeBegin,const char * TokRangeEnd,unsigned DiagID)57 static DiagnosticBuilder Diag(DiagnosticsEngine *Diags,
58 const LangOptions &Features, FullSourceLoc TokLoc,
59 const char *TokBegin, const char *TokRangeBegin,
60 const char *TokRangeEnd, unsigned DiagID) {
61 SourceLocation Begin =
62 Lexer::AdvanceToTokenCharacter(TokLoc, TokRangeBegin - TokBegin,
63 TokLoc.getManager(), Features);
64 SourceLocation End =
65 Lexer::AdvanceToTokenCharacter(Begin, TokRangeEnd - TokRangeBegin,
66 TokLoc.getManager(), Features);
67 return Diags->Report(Begin, DiagID)
68 << CharSourceRange::getCharRange(Begin, End);
69 }
70
71 /// ProcessCharEscape - Parse a standard C escape sequence, which can occur in
72 /// either a character or a string literal.
ProcessCharEscape(const char * ThisTokBegin,const char * & ThisTokBuf,const char * ThisTokEnd,bool & HadError,FullSourceLoc Loc,unsigned CharWidth,DiagnosticsEngine * Diags,const LangOptions & Features)73 static unsigned ProcessCharEscape(const char *ThisTokBegin,
74 const char *&ThisTokBuf,
75 const char *ThisTokEnd, bool &HadError,
76 FullSourceLoc Loc, unsigned CharWidth,
77 DiagnosticsEngine *Diags,
78 const LangOptions &Features) {
79 const char *EscapeBegin = ThisTokBuf;
80
81 // Skip the '\' char.
82 ++ThisTokBuf;
83
84 // We know that this character can't be off the end of the buffer, because
85 // that would have been \", which would not have been the end of string.
86 unsigned ResultChar = *ThisTokBuf++;
87 switch (ResultChar) {
88 // These map to themselves.
89 case '\\': case '\'': case '"': case '?': break;
90
91 // These have fixed mappings.
92 case 'a':
93 // TODO: K&R: the meaning of '\\a' is different in traditional C
94 ResultChar = 7;
95 break;
96 case 'b':
97 ResultChar = 8;
98 break;
99 case 'e':
100 if (Diags)
101 Diag(Diags, Features, Loc, ThisTokBegin, EscapeBegin, ThisTokBuf,
102 diag::ext_nonstandard_escape) << "e";
103 ResultChar = 27;
104 break;
105 case 'E':
106 if (Diags)
107 Diag(Diags, Features, Loc, ThisTokBegin, EscapeBegin, ThisTokBuf,
108 diag::ext_nonstandard_escape) << "E";
109 ResultChar = 27;
110 break;
111 case 'f':
112 ResultChar = 12;
113 break;
114 case 'n':
115 ResultChar = 10;
116 break;
117 case 'r':
118 ResultChar = 13;
119 break;
120 case 't':
121 ResultChar = 9;
122 break;
123 case 'v':
124 ResultChar = 11;
125 break;
126 case 'x': { // Hex escape.
127 ResultChar = 0;
128 if (ThisTokBuf == ThisTokEnd || !isxdigit(*ThisTokBuf)) {
129 if (Diags)
130 Diag(Diags, Features, Loc, ThisTokBegin, EscapeBegin, ThisTokBuf,
131 diag::err_hex_escape_no_digits);
132 HadError = 1;
133 break;
134 }
135
136 // Hex escapes are a maximal series of hex digits.
137 bool Overflow = false;
138 for (; ThisTokBuf != ThisTokEnd; ++ThisTokBuf) {
139 int CharVal = HexDigitValue(ThisTokBuf[0]);
140 if (CharVal == -1) break;
141 // About to shift out a digit?
142 Overflow |= (ResultChar & 0xF0000000) ? true : false;
143 ResultChar <<= 4;
144 ResultChar |= CharVal;
145 }
146
147 // See if any bits will be truncated when evaluated as a character.
148 if (CharWidth != 32 && (ResultChar >> CharWidth) != 0) {
149 Overflow = true;
150 ResultChar &= ~0U >> (32-CharWidth);
151 }
152
153 // Check for overflow.
154 if (Overflow && Diags) // Too many digits to fit in
155 Diag(Diags, Features, Loc, ThisTokBegin, EscapeBegin, ThisTokBuf,
156 diag::warn_hex_escape_too_large);
157 break;
158 }
159 case '0': case '1': case '2': case '3':
160 case '4': case '5': case '6': case '7': {
161 // Octal escapes.
162 --ThisTokBuf;
163 ResultChar = 0;
164
165 // Octal escapes are a series of octal digits with maximum length 3.
166 // "\0123" is a two digit sequence equal to "\012" "3".
167 unsigned NumDigits = 0;
168 do {
169 ResultChar <<= 3;
170 ResultChar |= *ThisTokBuf++ - '0';
171 ++NumDigits;
172 } while (ThisTokBuf != ThisTokEnd && NumDigits < 3 &&
173 ThisTokBuf[0] >= '0' && ThisTokBuf[0] <= '7');
174
175 // Check for overflow. Reject '\777', but not L'\777'.
176 if (CharWidth != 32 && (ResultChar >> CharWidth) != 0) {
177 if (Diags)
178 Diag(Diags, Features, Loc, ThisTokBegin, EscapeBegin, ThisTokBuf,
179 diag::warn_octal_escape_too_large);
180 ResultChar &= ~0U >> (32-CharWidth);
181 }
182 break;
183 }
184
185 // Otherwise, these are not valid escapes.
186 case '(': case '{': case '[': case '%':
187 // GCC accepts these as extensions. We warn about them as such though.
188 if (Diags)
189 Diag(Diags, Features, Loc, ThisTokBegin, EscapeBegin, ThisTokBuf,
190 diag::ext_nonstandard_escape)
191 << std::string(1, ResultChar);
192 break;
193 default:
194 if (Diags == 0)
195 break;
196
197 if (isgraph(ResultChar))
198 Diag(Diags, Features, Loc, ThisTokBegin, EscapeBegin, ThisTokBuf,
199 diag::ext_unknown_escape)
200 << std::string(1, ResultChar);
201 else
202 Diag(Diags, Features, Loc, ThisTokBegin, EscapeBegin, ThisTokBuf,
203 diag::ext_unknown_escape)
204 << "x" + llvm::utohexstr(ResultChar);
205 break;
206 }
207
208 return ResultChar;
209 }
210
211 /// ProcessUCNEscape - Read the Universal Character Name, check constraints and
212 /// return the UTF32.
ProcessUCNEscape(const char * ThisTokBegin,const char * & ThisTokBuf,const char * ThisTokEnd,uint32_t & UcnVal,unsigned short & UcnLen,FullSourceLoc Loc,DiagnosticsEngine * Diags,const LangOptions & Features,bool in_char_string_literal=false)213 static bool ProcessUCNEscape(const char *ThisTokBegin, const char *&ThisTokBuf,
214 const char *ThisTokEnd,
215 uint32_t &UcnVal, unsigned short &UcnLen,
216 FullSourceLoc Loc, DiagnosticsEngine *Diags,
217 const LangOptions &Features,
218 bool in_char_string_literal = false) {
219 const char *UcnBegin = ThisTokBuf;
220
221 // Skip the '\u' char's.
222 ThisTokBuf += 2;
223
224 if (ThisTokBuf == ThisTokEnd || !isxdigit(*ThisTokBuf)) {
225 if (Diags)
226 Diag(Diags, Features, Loc, ThisTokBegin, UcnBegin, ThisTokBuf,
227 diag::err_ucn_escape_no_digits);
228 return false;
229 }
230 UcnLen = (ThisTokBuf[-1] == 'u' ? 4 : 8);
231 unsigned short UcnLenSave = UcnLen;
232 for (; ThisTokBuf != ThisTokEnd && UcnLenSave; ++ThisTokBuf, UcnLenSave--) {
233 int CharVal = HexDigitValue(ThisTokBuf[0]);
234 if (CharVal == -1) break;
235 UcnVal <<= 4;
236 UcnVal |= CharVal;
237 }
238 // If we didn't consume the proper number of digits, there is a problem.
239 if (UcnLenSave) {
240 if (Diags)
241 Diag(Diags, Features, Loc, ThisTokBegin, UcnBegin, ThisTokBuf,
242 diag::err_ucn_escape_incomplete);
243 return false;
244 }
245
246 // Check UCN constraints (C99 6.4.3p2) [C++11 lex.charset p2]
247 if ((0xD800 <= UcnVal && UcnVal <= 0xDFFF) || // surrogate codepoints
248 UcnVal > 0x10FFFF) { // maximum legal UTF32 value
249 if (Diags)
250 Diag(Diags, Features, Loc, ThisTokBegin, UcnBegin, ThisTokBuf,
251 diag::err_ucn_escape_invalid);
252 return false;
253 }
254
255 // C++11 allows UCNs that refer to control characters and basic source
256 // characters inside character and string literals
257 if (UcnVal < 0xa0 &&
258 (UcnVal != 0x24 && UcnVal != 0x40 && UcnVal != 0x60)) { // $, @, `
259 bool IsError = (!Features.CPlusPlus0x || !in_char_string_literal);
260 if (Diags) {
261 char BasicSCSChar = UcnVal;
262 if (UcnVal >= 0x20 && UcnVal < 0x7f)
263 Diag(Diags, Features, Loc, ThisTokBegin, UcnBegin, ThisTokBuf,
264 IsError ? diag::err_ucn_escape_basic_scs :
265 diag::warn_cxx98_compat_literal_ucn_escape_basic_scs)
266 << StringRef(&BasicSCSChar, 1);
267 else
268 Diag(Diags, Features, Loc, ThisTokBegin, UcnBegin, ThisTokBuf,
269 IsError ? diag::err_ucn_control_character :
270 diag::warn_cxx98_compat_literal_ucn_control_character);
271 }
272 if (IsError)
273 return false;
274 }
275
276 if (!Features.CPlusPlus && !Features.C99 && Diags)
277 Diag(Diags, Features, Loc, ThisTokBegin, UcnBegin, ThisTokBuf,
278 diag::warn_ucn_not_valid_in_c89);
279
280 return true;
281 }
282
283 /// MeasureUCNEscape - Determine the number of bytes within the resulting string
284 /// which this UCN will occupy.
MeasureUCNEscape(const char * ThisTokBegin,const char * & ThisTokBuf,const char * ThisTokEnd,unsigned CharByteWidth,const LangOptions & Features,bool & HadError)285 static int MeasureUCNEscape(const char *ThisTokBegin, const char *&ThisTokBuf,
286 const char *ThisTokEnd, unsigned CharByteWidth,
287 const LangOptions &Features, bool &HadError) {
288 // UTF-32: 4 bytes per escape.
289 if (CharByteWidth == 4)
290 return 4;
291
292 uint32_t UcnVal = 0;
293 unsigned short UcnLen = 0;
294 FullSourceLoc Loc;
295
296 if (!ProcessUCNEscape(ThisTokBegin, ThisTokBuf, ThisTokEnd, UcnVal,
297 UcnLen, Loc, 0, Features, true)) {
298 HadError = true;
299 return 0;
300 }
301
302 // UTF-16: 2 bytes for BMP, 4 bytes otherwise.
303 if (CharByteWidth == 2)
304 return UcnVal <= 0xFFFF ? 2 : 4;
305
306 // UTF-8.
307 if (UcnVal < 0x80)
308 return 1;
309 if (UcnVal < 0x800)
310 return 2;
311 if (UcnVal < 0x10000)
312 return 3;
313 return 4;
314 }
315
316 /// EncodeUCNEscape - Read the Universal Character Name, check constraints and
317 /// convert the UTF32 to UTF8 or UTF16. This is a subroutine of
318 /// StringLiteralParser. When we decide to implement UCN's for identifiers,
319 /// we will likely rework our support for UCN's.
EncodeUCNEscape(const char * ThisTokBegin,const char * & ThisTokBuf,const char * ThisTokEnd,char * & ResultBuf,bool & HadError,FullSourceLoc Loc,unsigned CharByteWidth,DiagnosticsEngine * Diags,const LangOptions & Features)320 static void EncodeUCNEscape(const char *ThisTokBegin, const char *&ThisTokBuf,
321 const char *ThisTokEnd,
322 char *&ResultBuf, bool &HadError,
323 FullSourceLoc Loc, unsigned CharByteWidth,
324 DiagnosticsEngine *Diags,
325 const LangOptions &Features) {
326 typedef uint32_t UTF32;
327 UTF32 UcnVal = 0;
328 unsigned short UcnLen = 0;
329 if (!ProcessUCNEscape(ThisTokBegin, ThisTokBuf, ThisTokEnd, UcnVal, UcnLen,
330 Loc, Diags, Features, true)) {
331 HadError = true;
332 return;
333 }
334
335 assert((CharByteWidth == 1 || CharByteWidth == 2 || CharByteWidth) &&
336 "only character widths of 1, 2, or 4 bytes supported");
337
338 (void)UcnLen;
339 assert((UcnLen== 4 || UcnLen== 8) && "only ucn length of 4 or 8 supported");
340
341 if (CharByteWidth == 4) {
342 // FIXME: Make the type of the result buffer correct instead of
343 // using reinterpret_cast.
344 UTF32 *ResultPtr = reinterpret_cast<UTF32*>(ResultBuf);
345 *ResultPtr = UcnVal;
346 ResultBuf += 4;
347 return;
348 }
349
350 if (CharByteWidth == 2) {
351 // FIXME: Make the type of the result buffer correct instead of
352 // using reinterpret_cast.
353 UTF16 *ResultPtr = reinterpret_cast<UTF16*>(ResultBuf);
354
355 if (UcnVal <= (UTF32)0xFFFF) {
356 *ResultPtr = UcnVal;
357 ResultBuf += 2;
358 return;
359 }
360
361 // Convert to UTF16.
362 UcnVal -= 0x10000;
363 *ResultPtr = 0xD800 + (UcnVal >> 10);
364 *(ResultPtr+1) = 0xDC00 + (UcnVal & 0x3FF);
365 ResultBuf += 4;
366 return;
367 }
368
369 assert(CharByteWidth == 1 && "UTF-8 encoding is only for 1 byte characters");
370
371 // Now that we've parsed/checked the UCN, we convert from UTF32->UTF8.
372 // The conversion below was inspired by:
373 // http://www.unicode.org/Public/PROGRAMS/CVTUTF/ConvertUTF.c
374 // First, we determine how many bytes the result will require.
375 typedef uint8_t UTF8;
376
377 unsigned short bytesToWrite = 0;
378 if (UcnVal < (UTF32)0x80)
379 bytesToWrite = 1;
380 else if (UcnVal < (UTF32)0x800)
381 bytesToWrite = 2;
382 else if (UcnVal < (UTF32)0x10000)
383 bytesToWrite = 3;
384 else
385 bytesToWrite = 4;
386
387 const unsigned byteMask = 0xBF;
388 const unsigned byteMark = 0x80;
389
390 // Once the bits are split out into bytes of UTF8, this is a mask OR-ed
391 // into the first byte, depending on how many bytes follow.
392 static const UTF8 firstByteMark[5] = {
393 0x00, 0x00, 0xC0, 0xE0, 0xF0
394 };
395 // Finally, we write the bytes into ResultBuf.
396 ResultBuf += bytesToWrite;
397 switch (bytesToWrite) { // note: everything falls through.
398 case 4: *--ResultBuf = (UTF8)((UcnVal | byteMark) & byteMask); UcnVal >>= 6;
399 case 3: *--ResultBuf = (UTF8)((UcnVal | byteMark) & byteMask); UcnVal >>= 6;
400 case 2: *--ResultBuf = (UTF8)((UcnVal | byteMark) & byteMask); UcnVal >>= 6;
401 case 1: *--ResultBuf = (UTF8) (UcnVal | firstByteMark[bytesToWrite]);
402 }
403 // Update the buffer.
404 ResultBuf += bytesToWrite;
405 }
406
407
408 /// integer-constant: [C99 6.4.4.1]
409 /// decimal-constant integer-suffix
410 /// octal-constant integer-suffix
411 /// hexadecimal-constant integer-suffix
412 /// user-defined-integer-literal: [C++11 lex.ext]
413 /// decimal-literal ud-suffix
414 /// octal-literal ud-suffix
415 /// hexadecimal-literal ud-suffix
416 /// decimal-constant:
417 /// nonzero-digit
418 /// decimal-constant digit
419 /// octal-constant:
420 /// 0
421 /// octal-constant octal-digit
422 /// hexadecimal-constant:
423 /// hexadecimal-prefix hexadecimal-digit
424 /// hexadecimal-constant hexadecimal-digit
425 /// hexadecimal-prefix: one of
426 /// 0x 0X
427 /// integer-suffix:
428 /// unsigned-suffix [long-suffix]
429 /// unsigned-suffix [long-long-suffix]
430 /// long-suffix [unsigned-suffix]
///         long-long-suffix [unsigned-suffix]
432 /// nonzero-digit:
433 /// 1 2 3 4 5 6 7 8 9
434 /// octal-digit:
435 /// 0 1 2 3 4 5 6 7
436 /// hexadecimal-digit:
437 /// 0 1 2 3 4 5 6 7 8 9
438 /// a b c d e f
439 /// A B C D E F
440 /// unsigned-suffix: one of
441 /// u U
442 /// long-suffix: one of
443 /// l L
444 /// long-long-suffix: one of
445 /// ll LL
446 ///
447 /// floating-constant: [C99 6.4.4.2]
448 /// TODO: add rules...
449 ///
450 NumericLiteralParser::
NumericLiteralParser(const char * begin,const char * end,SourceLocation TokLoc,Preprocessor & pp)451 NumericLiteralParser(const char *begin, const char *end,
452 SourceLocation TokLoc, Preprocessor &pp)
453 : PP(pp), ThisTokBegin(begin), ThisTokEnd(end) {
454
455 // This routine assumes that the range begin/end matches the regex for integer
456 // and FP constants (specifically, the 'pp-number' regex), and assumes that
457 // the byte at "*end" is both valid and not part of the regex. Because of
458 // this, it doesn't have to check for 'overscan' in various places.
459 assert(!isalnum(*end) && *end != '.' && *end != '_' &&
460 "Lexer didn't maximally munch?");
461
462 s = DigitsBegin = begin;
463 saw_exponent = false;
464 saw_period = false;
465 saw_ud_suffix = false;
466 isLong = false;
467 isUnsigned = false;
468 isLongLong = false;
469 isFloat = false;
470 isImaginary = false;
471 isMicrosoftInteger = false;
472 hadError = false;
473
474 if (*s == '0') { // parse radix
475 ParseNumberStartingWithZero(TokLoc);
476 if (hadError)
477 return;
478 } else { // the first digit is non-zero
479 radix = 10;
480 s = SkipDigits(s);
481 if (s == ThisTokEnd) {
482 // Done.
483 } else if (isxdigit(*s) && !(*s == 'e' || *s == 'E')) {
484 PP.Diag(PP.AdvanceToTokenCharacter(TokLoc, s-begin),
485 diag::err_invalid_decimal_digit) << StringRef(s, 1);
486 hadError = true;
487 return;
488 } else if (*s == '.') {
489 s++;
490 saw_period = true;
491 s = SkipDigits(s);
492 }
493 if ((*s == 'e' || *s == 'E')) { // exponent
494 const char *Exponent = s;
495 s++;
496 saw_exponent = true;
497 if (*s == '+' || *s == '-') s++; // sign
498 const char *first_non_digit = SkipDigits(s);
499 if (first_non_digit != s) {
500 s = first_non_digit;
501 } else {
502 PP.Diag(PP.AdvanceToTokenCharacter(TokLoc, Exponent-begin),
503 diag::err_exponent_has_no_digits);
504 hadError = true;
505 return;
506 }
507 }
508 }
509
510 SuffixBegin = s;
511
512 // Parse the suffix. At this point we can classify whether we have an FP or
513 // integer constant.
514 bool isFPConstant = isFloatingLiteral();
515
516 // Loop over all of the characters of the suffix. If we see something bad,
517 // we break out of the loop.
518 for (; s != ThisTokEnd; ++s) {
519 switch (*s) {
520 case 'f': // FP Suffix for "float"
521 case 'F':
522 if (!isFPConstant) break; // Error for integer constant.
523 if (isFloat || isLong) break; // FF, LF invalid.
524 isFloat = true;
525 continue; // Success.
526 case 'u':
527 case 'U':
528 if (isFPConstant) break; // Error for floating constant.
529 if (isUnsigned) break; // Cannot be repeated.
530 isUnsigned = true;
531 continue; // Success.
532 case 'l':
533 case 'L':
534 if (isLong || isLongLong) break; // Cannot be repeated.
535 if (isFloat) break; // LF invalid.
536
537 // Check for long long. The L's need to be adjacent and the same case.
538 if (s+1 != ThisTokEnd && s[1] == s[0]) {
539 if (isFPConstant) break; // long long invalid for floats.
540 isLongLong = true;
541 ++s; // Eat both of them.
542 } else {
543 isLong = true;
544 }
545 continue; // Success.
546 case 'i':
547 case 'I':
548 if (PP.getLangOpts().MicrosoftExt) {
549 if (isFPConstant || isLong || isLongLong) break;
550
551 // Allow i8, i16, i32, i64, and i128.
552 if (s + 1 != ThisTokEnd) {
553 switch (s[1]) {
554 case '8':
555 s += 2; // i8 suffix
556 isMicrosoftInteger = true;
557 break;
558 case '1':
559 if (s + 2 == ThisTokEnd) break;
560 if (s[2] == '6') {
561 s += 3; // i16 suffix
562 isMicrosoftInteger = true;
563 }
564 else if (s[2] == '2') {
565 if (s + 3 == ThisTokEnd) break;
566 if (s[3] == '8') {
567 s += 4; // i128 suffix
568 isMicrosoftInteger = true;
569 }
570 }
571 break;
572 case '3':
573 if (s + 2 == ThisTokEnd) break;
574 if (s[2] == '2') {
575 s += 3; // i32 suffix
576 isLong = true;
577 isMicrosoftInteger = true;
578 }
579 break;
580 case '6':
581 if (s + 2 == ThisTokEnd) break;
582 if (s[2] == '4') {
583 s += 3; // i64 suffix
584 isLongLong = true;
585 isMicrosoftInteger = true;
586 }
587 break;
588 default:
589 break;
590 }
591 break;
592 }
593 }
594 // fall through.
595 case 'j':
596 case 'J':
597 if (isImaginary) break; // Cannot be repeated.
598 PP.Diag(PP.AdvanceToTokenCharacter(TokLoc, s-begin),
599 diag::ext_imaginary_constant);
600 isImaginary = true;
601 continue; // Success.
602 }
603 // If we reached here, there was an error or a ud-suffix.
604 break;
605 }
606
607 if (s != ThisTokEnd) {
608 if (PP.getLangOpts().CPlusPlus0x && s == SuffixBegin && *s == '_') {
609 // We have a ud-suffix! By C++11 [lex.ext]p10, ud-suffixes not starting
610 // with an '_' are ill-formed.
611 saw_ud_suffix = true;
612 return;
613 }
614
615 // Report an error if there are any.
616 PP.Diag(PP.AdvanceToTokenCharacter(TokLoc, SuffixBegin-begin),
617 isFPConstant ? diag::err_invalid_suffix_float_constant :
618 diag::err_invalid_suffix_integer_constant)
619 << StringRef(SuffixBegin, ThisTokEnd-SuffixBegin);
620 hadError = true;
621 return;
622 }
623 }
624
625 /// ParseNumberStartingWithZero - This method is called when the first character
626 /// of the number is found to be a zero. This means it is either an octal
627 /// number (like '04') or a hex number ('0x123a') a binary number ('0b1010') or
628 /// a floating point number (01239.123e4). Eat the prefix, determining the
629 /// radix etc.
ParseNumberStartingWithZero(SourceLocation TokLoc)630 void NumericLiteralParser::ParseNumberStartingWithZero(SourceLocation TokLoc) {
631 assert(s[0] == '0' && "Invalid method call");
632 s++;
633
634 // Handle a hex number like 0x1234.
635 if ((*s == 'x' || *s == 'X') && (isxdigit(s[1]) || s[1] == '.')) {
636 s++;
637 radix = 16;
638 DigitsBegin = s;
639 s = SkipHexDigits(s);
640 bool noSignificand = (s == DigitsBegin);
641 if (s == ThisTokEnd) {
642 // Done.
643 } else if (*s == '.') {
644 s++;
645 saw_period = true;
646 const char *floatDigitsBegin = s;
647 s = SkipHexDigits(s);
648 noSignificand &= (floatDigitsBegin == s);
649 }
650
651 if (noSignificand) {
652 PP.Diag(PP.AdvanceToTokenCharacter(TokLoc, s-ThisTokBegin), \
653 diag::err_hexconstant_requires_digits);
654 hadError = true;
655 return;
656 }
657
658 // A binary exponent can appear with or with a '.'. If dotted, the
659 // binary exponent is required.
660 if (*s == 'p' || *s == 'P') {
661 const char *Exponent = s;
662 s++;
663 saw_exponent = true;
664 if (*s == '+' || *s == '-') s++; // sign
665 const char *first_non_digit = SkipDigits(s);
666 if (first_non_digit == s) {
667 PP.Diag(PP.AdvanceToTokenCharacter(TokLoc, Exponent-ThisTokBegin),
668 diag::err_exponent_has_no_digits);
669 hadError = true;
670 return;
671 }
672 s = first_non_digit;
673
674 if (!PP.getLangOpts().HexFloats)
675 PP.Diag(TokLoc, diag::ext_hexconstant_invalid);
676 } else if (saw_period) {
677 PP.Diag(PP.AdvanceToTokenCharacter(TokLoc, s-ThisTokBegin),
678 diag::err_hexconstant_requires_exponent);
679 hadError = true;
680 }
681 return;
682 }
683
684 // Handle simple binary numbers 0b01010
685 if (*s == 'b' || *s == 'B') {
686 // 0b101010 is a GCC extension.
687 PP.Diag(TokLoc, diag::ext_binary_literal);
688 ++s;
689 radix = 2;
690 DigitsBegin = s;
691 s = SkipBinaryDigits(s);
692 if (s == ThisTokEnd) {
693 // Done.
694 } else if (isxdigit(*s)) {
695 PP.Diag(PP.AdvanceToTokenCharacter(TokLoc, s-ThisTokBegin),
696 diag::err_invalid_binary_digit) << StringRef(s, 1);
697 hadError = true;
698 }
699 // Other suffixes will be diagnosed by the caller.
700 return;
701 }
702
703 // For now, the radix is set to 8. If we discover that we have a
704 // floating point constant, the radix will change to 10. Octal floating
705 // point constants are not permitted (only decimal and hexadecimal).
706 radix = 8;
707 DigitsBegin = s;
708 s = SkipOctalDigits(s);
709 if (s == ThisTokEnd)
710 return; // Done, simple octal number like 01234
711
712 // If we have some other non-octal digit that *is* a decimal digit, see if
713 // this is part of a floating point number like 094.123 or 09e1.
714 if (isdigit(*s)) {
715 const char *EndDecimal = SkipDigits(s);
716 if (EndDecimal[0] == '.' || EndDecimal[0] == 'e' || EndDecimal[0] == 'E') {
717 s = EndDecimal;
718 radix = 10;
719 }
720 }
721
722 // If we have a hex digit other than 'e' (which denotes a FP exponent) then
723 // the code is using an incorrect base.
724 if (isxdigit(*s) && *s != 'e' && *s != 'E') {
725 PP.Diag(PP.AdvanceToTokenCharacter(TokLoc, s-ThisTokBegin),
726 diag::err_invalid_octal_digit) << StringRef(s, 1);
727 hadError = true;
728 return;
729 }
730
731 if (*s == '.') {
732 s++;
733 radix = 10;
734 saw_period = true;
735 s = SkipDigits(s); // Skip suffix.
736 }
737 if (*s == 'e' || *s == 'E') { // exponent
738 const char *Exponent = s;
739 s++;
740 radix = 10;
741 saw_exponent = true;
742 if (*s == '+' || *s == '-') s++; // sign
743 const char *first_non_digit = SkipDigits(s);
744 if (first_non_digit != s) {
745 s = first_non_digit;
746 } else {
747 PP.Diag(PP.AdvanceToTokenCharacter(TokLoc, Exponent-ThisTokBegin),
748 diag::err_exponent_has_no_digits);
749 hadError = true;
750 return;
751 }
752 }
753 }
754
755
756 /// GetIntegerValue - Convert this numeric literal value to an APInt that
757 /// matches Val's input width. If there is an overflow, set Val to the low bits
758 /// of the result and return true. Otherwise, return false.
GetIntegerValue(llvm::APInt & Val)759 bool NumericLiteralParser::GetIntegerValue(llvm::APInt &Val) {
760 // Fast path: Compute a conservative bound on the maximum number of
761 // bits per digit in this radix. If we can't possibly overflow a
762 // uint64 based on that bound then do the simple conversion to
763 // integer. This avoids the expensive overflow checking below, and
764 // handles the common cases that matter (small decimal integers and
765 // hex/octal values which don't overflow).
766 unsigned MaxBitsPerDigit = 1;
767 while ((1U << MaxBitsPerDigit) < radix)
768 MaxBitsPerDigit += 1;
769 if ((SuffixBegin - DigitsBegin) * MaxBitsPerDigit <= 64) {
770 uint64_t N = 0;
771 for (s = DigitsBegin; s != SuffixBegin; ++s)
772 N = N*radix + HexDigitValue(*s);
773
774 // This will truncate the value to Val's input width. Simply check
775 // for overflow by comparing.
776 Val = N;
777 return Val.getZExtValue() != N;
778 }
779
780 Val = 0;
781 s = DigitsBegin;
782
783 llvm::APInt RadixVal(Val.getBitWidth(), radix);
784 llvm::APInt CharVal(Val.getBitWidth(), 0);
785 llvm::APInt OldVal = Val;
786
787 bool OverflowOccurred = false;
788 while (s < SuffixBegin) {
789 unsigned C = HexDigitValue(*s++);
790
791 // If this letter is out of bound for this radix, reject it.
792 assert(C < radix && "NumericLiteralParser ctor should have rejected this");
793
794 CharVal = C;
795
796 // Add the digit to the value in the appropriate radix. If adding in digits
797 // made the value smaller, then this overflowed.
798 OldVal = Val;
799
800 // Multiply by radix, did overflow occur on the multiply?
801 Val *= RadixVal;
802 OverflowOccurred |= Val.udiv(RadixVal) != OldVal;
803
804 // Add value, did overflow occur on the value?
805 // (a + b) ult b <=> overflow
806 Val += CharVal;
807 OverflowOccurred |= Val.ult(CharVal);
808 }
809 return OverflowOccurred;
810 }
811
812 llvm::APFloat::opStatus
GetFloatValue(llvm::APFloat & Result)813 NumericLiteralParser::GetFloatValue(llvm::APFloat &Result) {
814 using llvm::APFloat;
815
816 unsigned n = std::min(SuffixBegin - ThisTokBegin, ThisTokEnd - ThisTokBegin);
817 return Result.convertFromString(StringRef(ThisTokBegin, n),
818 APFloat::rmNearestTiesToEven);
819 }
820
821
822 /// \verbatim
823 /// user-defined-character-literal: [C++11 lex.ext]
824 /// character-literal ud-suffix
825 /// ud-suffix:
826 /// identifier
827 /// character-literal: [C++11 lex.ccon]
828 /// ' c-char-sequence '
829 /// u' c-char-sequence '
830 /// U' c-char-sequence '
831 /// L' c-char-sequence '
832 /// c-char-sequence:
833 /// c-char
834 /// c-char-sequence c-char
835 /// c-char:
836 /// any member of the source character set except the single-quote ',
837 /// backslash \, or new-line character
838 /// escape-sequence
839 /// universal-character-name
840 /// escape-sequence:
841 /// simple-escape-sequence
842 /// octal-escape-sequence
843 /// hexadecimal-escape-sequence
844 /// simple-escape-sequence:
845 /// one of \' \" \? \\ \a \b \f \n \r \t \v
846 /// octal-escape-sequence:
847 /// \ octal-digit
848 /// \ octal-digit octal-digit
849 /// \ octal-digit octal-digit octal-digit
850 /// hexadecimal-escape-sequence:
851 /// \x hexadecimal-digit
852 /// hexadecimal-escape-sequence hexadecimal-digit
853 /// universal-character-name: [C++11 lex.charset]
854 /// \u hex-quad
855 /// \U hex-quad hex-quad
856 /// hex-quad:
857 /// hex-digit hex-digit hex-digit hex-digit
858 /// \endverbatim
859 ///
CharLiteralParser(const char * begin,const char * end,SourceLocation Loc,Preprocessor & PP,tok::TokenKind kind)860 CharLiteralParser::CharLiteralParser(const char *begin, const char *end,
861 SourceLocation Loc, Preprocessor &PP,
862 tok::TokenKind kind) {
863 // At this point we know that the character matches the regex "(L|u|U)?'.*'".
864 HadError = false;
865
866 Kind = kind;
867
868 const char *TokBegin = begin;
869
870 // Skip over wide character determinant.
871 if (Kind != tok::char_constant) {
872 ++begin;
873 }
874
875 // Skip over the entry quote.
876 assert(begin[0] == '\'' && "Invalid token lexed");
877 ++begin;
878
879 // Remove an optional ud-suffix.
880 if (end[-1] != '\'') {
881 const char *UDSuffixEnd = end;
882 do {
883 --end;
884 } while (end[-1] != '\'');
885 UDSuffixBuf.assign(end, UDSuffixEnd);
886 UDSuffixOffset = end - TokBegin;
887 }
888
889 // Trim the ending quote.
890 assert(end != begin && "Invalid token lexed");
891 --end;
892
893 // FIXME: The "Value" is an uint64_t so we can handle char literals of
894 // up to 64-bits.
895 // FIXME: This extensively assumes that 'char' is 8-bits.
896 assert(PP.getTargetInfo().getCharWidth() == 8 &&
897 "Assumes char is 8 bits");
898 assert(PP.getTargetInfo().getIntWidth() <= 64 &&
899 (PP.getTargetInfo().getIntWidth() & 7) == 0 &&
900 "Assumes sizeof(int) on target is <= 64 and a multiple of char");
901 assert(PP.getTargetInfo().getWCharWidth() <= 64 &&
902 "Assumes sizeof(wchar) on target is <= 64");
903
904 SmallVector<uint32_t,4> codepoint_buffer;
905 codepoint_buffer.resize(end-begin);
906 uint32_t *buffer_begin = &codepoint_buffer.front();
907 uint32_t *buffer_end = buffer_begin + codepoint_buffer.size();
908
909 // Unicode escapes representing characters that cannot be correctly
910 // represented in a single code unit are disallowed in character literals
911 // by this implementation.
912 uint32_t largest_character_for_kind;
913 if (tok::wide_char_constant == Kind) {
914 largest_character_for_kind = 0xFFFFFFFFu >> (32-PP.getTargetInfo().getWCharWidth());
915 } else if (tok::utf16_char_constant == Kind) {
916 largest_character_for_kind = 0xFFFF;
917 } else if (tok::utf32_char_constant == Kind) {
918 largest_character_for_kind = 0x10FFFF;
919 } else {
920 largest_character_for_kind = 0x7Fu;
921 }
922
923 while (begin!=end) {
924 // Is this a span of non-escape characters?
925 if (begin[0] != '\\') {
926 char const *start = begin;
927 do {
928 ++begin;
929 } while (begin != end && *begin != '\\');
930
931 char const *tmp_in_start = start;
932 uint32_t *tmp_out_start = buffer_begin;
933 ConversionResult res =
934 ConvertUTF8toUTF32(reinterpret_cast<UTF8 const **>(&start),
935 reinterpret_cast<UTF8 const *>(begin),
936 &buffer_begin,buffer_end,strictConversion);
937 if (res!=conversionOK) {
938 // If we see bad encoding for unprefixed character literals, warn and
939 // simply copy the byte values, for compatibility with gcc and
940 // older versions of clang.
941 bool NoErrorOnBadEncoding = isAscii();
942 unsigned Msg = diag::err_bad_character_encoding;
943 if (NoErrorOnBadEncoding)
944 Msg = diag::warn_bad_character_encoding;
945 PP.Diag(Loc, Msg);
946 if (NoErrorOnBadEncoding) {
947 start = tmp_in_start;
948 buffer_begin = tmp_out_start;
949 for ( ; start != begin; ++start, ++buffer_begin)
950 *buffer_begin = static_cast<uint8_t>(*start);
951 } else {
952 HadError = true;
953 }
954 } else {
955 for (; tmp_out_start <buffer_begin; ++tmp_out_start) {
956 if (*tmp_out_start > largest_character_for_kind) {
957 HadError = true;
958 PP.Diag(Loc, diag::err_character_too_large);
959 }
960 }
961 }
962
963 continue;
964 }
    // Is this a Universal Character Name escape?
966 if (begin[1] == 'u' || begin[1] == 'U') {
967 unsigned short UcnLen = 0;
968 if (!ProcessUCNEscape(TokBegin, begin, end, *buffer_begin, UcnLen,
969 FullSourceLoc(Loc, PP.getSourceManager()),
970 &PP.getDiagnostics(), PP.getLangOpts(),
971 true))
972 {
973 HadError = true;
974 } else if (*buffer_begin > largest_character_for_kind) {
975 HadError = true;
976 PP.Diag(Loc, diag::err_character_too_large);
977 }
978
979 ++buffer_begin;
980 continue;
981 }
982 unsigned CharWidth = getCharWidth(Kind, PP.getTargetInfo());
983 uint64_t result =
984 ProcessCharEscape(TokBegin, begin, end, HadError,
985 FullSourceLoc(Loc,PP.getSourceManager()),
986 CharWidth, &PP.getDiagnostics(), PP.getLangOpts());
987 *buffer_begin++ = result;
988 }
989
990 unsigned NumCharsSoFar = buffer_begin-&codepoint_buffer.front();
991
992 if (NumCharsSoFar > 1) {
993 if (isWide())
994 PP.Diag(Loc, diag::warn_extraneous_char_constant);
995 else if (isAscii() && NumCharsSoFar == 4)
996 PP.Diag(Loc, diag::ext_four_char_character_literal);
997 else if (isAscii())
998 PP.Diag(Loc, diag::ext_multichar_character_literal);
999 else
1000 PP.Diag(Loc, diag::err_multichar_utf_character_literal);
1001 IsMultiChar = true;
1002 } else
1003 IsMultiChar = false;
1004
1005 llvm::APInt LitVal(PP.getTargetInfo().getIntWidth(), 0);
1006
1007 // Narrow character literals act as though their value is concatenated
1008 // in this implementation, but warn on overflow.
1009 bool multi_char_too_long = false;
1010 if (isAscii() && isMultiChar()) {
1011 LitVal = 0;
1012 for (size_t i=0;i<NumCharsSoFar;++i) {
1013 // check for enough leading zeros to shift into
1014 multi_char_too_long |= (LitVal.countLeadingZeros() < 8);
1015 LitVal <<= 8;
1016 LitVal = LitVal + (codepoint_buffer[i] & 0xFF);
1017 }
1018 } else if (NumCharsSoFar > 0) {
1019 // otherwise just take the last character
1020 LitVal = buffer_begin[-1];
1021 }
1022
1023 if (!HadError && multi_char_too_long) {
1024 PP.Diag(Loc,diag::warn_char_constant_too_large);
1025 }
1026
1027 // Transfer the value from APInt to uint64_t
1028 Value = LitVal.getZExtValue();
1029
  // If this is a single narrow character, sign extend it (e.g. '\xFF' is "-1")
  // if 'char' is signed for this target (C99 6.4.4.4p10).  Note that multiple
  // character constants are not sign extended in this implementation:
  // '\xFF\xFF' = 65535 and '\x0\xFF' = 255, which matches GCC.
1034 if (isAscii() && NumCharsSoFar == 1 && (Value & 128) &&
1035 PP.getLangOpts().CharIsSigned)
1036 Value = (signed char)Value;
1037 }
1038
1039 /// \verbatim
1040 /// string-literal: [C++0x lex.string]
1041 /// encoding-prefix " [s-char-sequence] "
1042 /// encoding-prefix R raw-string
1043 /// encoding-prefix:
1044 /// u8
1045 /// u
1046 /// U
1047 /// L
1048 /// s-char-sequence:
1049 /// s-char
1050 /// s-char-sequence s-char
1051 /// s-char:
1052 /// any member of the source character set except the double-quote ",
1053 /// backslash \, or new-line character
1054 /// escape-sequence
1055 /// universal-character-name
1056 /// raw-string:
1057 /// " d-char-sequence ( r-char-sequence ) d-char-sequence "
1058 /// r-char-sequence:
1059 /// r-char
1060 /// r-char-sequence r-char
1061 /// r-char:
1062 /// any member of the source character set, except a right parenthesis )
1063 /// followed by the initial d-char-sequence (which may be empty)
1064 /// followed by a double quote ".
1065 /// d-char-sequence:
1066 /// d-char
1067 /// d-char-sequence d-char
1068 /// d-char:
1069 /// any member of the basic source character set except:
1070 /// space, the left parenthesis (, the right parenthesis ),
1071 /// the backslash \, and the control characters representing horizontal
1072 /// tab, vertical tab, form feed, and newline.
1073 /// escape-sequence: [C++0x lex.ccon]
1074 /// simple-escape-sequence
1075 /// octal-escape-sequence
1076 /// hexadecimal-escape-sequence
1077 /// simple-escape-sequence:
1078 /// one of \' \" \? \\ \a \b \f \n \r \t \v
1079 /// octal-escape-sequence:
1080 /// \ octal-digit
1081 /// \ octal-digit octal-digit
1082 /// \ octal-digit octal-digit octal-digit
1083 /// hexadecimal-escape-sequence:
1084 /// \x hexadecimal-digit
1085 /// hexadecimal-escape-sequence hexadecimal-digit
1086 /// universal-character-name:
1087 /// \u hex-quad
1088 /// \U hex-quad hex-quad
1089 /// hex-quad:
1090 /// hex-digit hex-digit hex-digit hex-digit
1091 /// \endverbatim
1092 ///
1093 StringLiteralParser::
StringLiteralParser(const Token * StringToks,unsigned NumStringToks,Preprocessor & PP,bool Complain)1094 StringLiteralParser(const Token *StringToks, unsigned NumStringToks,
1095 Preprocessor &PP, bool Complain)
1096 : SM(PP.getSourceManager()), Features(PP.getLangOpts()),
1097 Target(PP.getTargetInfo()), Diags(Complain ? &PP.getDiagnostics() : 0),
1098 MaxTokenLength(0), SizeBound(0), CharByteWidth(0), Kind(tok::unknown),
1099 ResultPtr(ResultBuf.data()), hadError(false), Pascal(false) {
1100 init(StringToks, NumStringToks);
1101 }
1102
init(const Token * StringToks,unsigned NumStringToks)1103 void StringLiteralParser::init(const Token *StringToks, unsigned NumStringToks){
1104 // The literal token may have come from an invalid source location (e.g. due
1105 // to a PCH error), in which case the token length will be 0.
1106 if (NumStringToks == 0 || StringToks[0].getLength() < 2)
1107 return DiagnoseLexingError(SourceLocation());
1108
1109 // Scan all of the string portions, remember the max individual token length,
1110 // computing a bound on the concatenated string length, and see whether any
1111 // piece is a wide-string. If any of the string portions is a wide-string
1112 // literal, the result is a wide-string literal [C99 6.4.5p4].
1113 assert(NumStringToks && "expected at least one token");
1114 MaxTokenLength = StringToks[0].getLength();
1115 assert(StringToks[0].getLength() >= 2 && "literal token is invalid!");
1116 SizeBound = StringToks[0].getLength()-2; // -2 for "".
1117 Kind = StringToks[0].getKind();
1118
1119 hadError = false;
1120
1121 // Implement Translation Phase #6: concatenation of string literals
1122 /// (C99 5.1.1.2p1). The common case is only one string fragment.
1123 for (unsigned i = 1; i != NumStringToks; ++i) {
1124 if (StringToks[i].getLength() < 2)
1125 return DiagnoseLexingError(StringToks[i].getLocation());
1126
1127 // The string could be shorter than this if it needs cleaning, but this is a
1128 // reasonable bound, which is all we need.
1129 assert(StringToks[i].getLength() >= 2 && "literal token is invalid!");
1130 SizeBound += StringToks[i].getLength()-2; // -2 for "".
1131
1132 // Remember maximum string piece length.
1133 if (StringToks[i].getLength() > MaxTokenLength)
1134 MaxTokenLength = StringToks[i].getLength();
1135
1136 // Remember if we see any wide or utf-8/16/32 strings.
1137 // Also check for illegal concatenations.
1138 if (StringToks[i].isNot(Kind) && StringToks[i].isNot(tok::string_literal)) {
1139 if (isAscii()) {
1140 Kind = StringToks[i].getKind();
1141 } else {
1142 if (Diags)
1143 Diags->Report(StringToks[i].getLocation(),
1144 diag::err_unsupported_string_concat);
1145 hadError = true;
1146 }
1147 }
1148 }
1149
1150 // Include space for the null terminator.
1151 ++SizeBound;
1152
1153 // TODO: K&R warning: "traditional C rejects string constant concatenation"
1154
1155 // Get the width in bytes of char/wchar_t/char16_t/char32_t
1156 CharByteWidth = getCharWidth(Kind, Target);
1157 assert((CharByteWidth & 7) == 0 && "Assumes character size is byte multiple");
1158 CharByteWidth /= 8;
1159
1160 // The output buffer size needs to be large enough to hold wide characters.
1161 // This is a worst-case assumption which basically corresponds to L"" "long".
1162 SizeBound *= CharByteWidth;
1163
1164 // Size the temporary buffer to hold the result string data.
1165 ResultBuf.resize(SizeBound);
1166
1167 // Likewise, but for each string piece.
1168 SmallString<512> TokenBuf;
1169 TokenBuf.resize(MaxTokenLength);
1170
1171 // Loop over all the strings, getting their spelling, and expanding them to
1172 // wide strings as appropriate.
1173 ResultPtr = &ResultBuf[0]; // Next byte to fill in.
1174
1175 Pascal = false;
1176
1177 SourceLocation UDSuffixTokLoc;
1178
1179 for (unsigned i = 0, e = NumStringToks; i != e; ++i) {
1180 const char *ThisTokBuf = &TokenBuf[0];
1181 // Get the spelling of the token, which eliminates trigraphs, etc. We know
1182 // that ThisTokBuf points to a buffer that is big enough for the whole token
1183 // and 'spelled' tokens can only shrink.
1184 bool StringInvalid = false;
1185 unsigned ThisTokLen =
1186 Lexer::getSpelling(StringToks[i], ThisTokBuf, SM, Features,
1187 &StringInvalid);
1188 if (StringInvalid)
1189 return DiagnoseLexingError(StringToks[i].getLocation());
1190
1191 const char *ThisTokBegin = ThisTokBuf;
1192 const char *ThisTokEnd = ThisTokBuf+ThisTokLen;
1193
1194 // Remove an optional ud-suffix.
1195 if (ThisTokEnd[-1] != '"') {
1196 const char *UDSuffixEnd = ThisTokEnd;
1197 do {
1198 --ThisTokEnd;
1199 } while (ThisTokEnd[-1] != '"');
1200
1201 StringRef UDSuffix(ThisTokEnd, UDSuffixEnd - ThisTokEnd);
1202
1203 if (UDSuffixBuf.empty()) {
1204 UDSuffixBuf.assign(UDSuffix);
1205 UDSuffixToken = i;
1206 UDSuffixOffset = ThisTokEnd - ThisTokBuf;
1207 UDSuffixTokLoc = StringToks[i].getLocation();
1208 } else if (!UDSuffixBuf.equals(UDSuffix)) {
1209 // C++11 [lex.ext]p8: At the end of phase 6, if a string literal is the
1210 // result of a concatenation involving at least one user-defined-string-
1211 // literal, all the participating user-defined-string-literals shall
1212 // have the same ud-suffix.
1213 if (Diags) {
1214 SourceLocation TokLoc = StringToks[i].getLocation();
1215 Diags->Report(TokLoc, diag::err_string_concat_mixed_suffix)
1216 << UDSuffixBuf << UDSuffix
1217 << SourceRange(UDSuffixTokLoc, UDSuffixTokLoc)
1218 << SourceRange(TokLoc, TokLoc);
1219 }
1220 hadError = true;
1221 }
1222 }
1223
1224 // Strip the end quote.
1225 --ThisTokEnd;
1226
1227 // TODO: Input character set mapping support.
1228
1229 // Skip marker for wide or unicode strings.
1230 if (ThisTokBuf[0] == 'L' || ThisTokBuf[0] == 'u' || ThisTokBuf[0] == 'U') {
1231 ++ThisTokBuf;
1232 // Skip 8 of u8 marker for utf8 strings.
1233 if (ThisTokBuf[0] == '8')
1234 ++ThisTokBuf;
1235 }
1236
1237 // Check for raw string
1238 if (ThisTokBuf[0] == 'R') {
1239 ThisTokBuf += 2; // skip R"
1240
1241 const char *Prefix = ThisTokBuf;
1242 while (ThisTokBuf[0] != '(')
1243 ++ThisTokBuf;
1244 ++ThisTokBuf; // skip '('
1245
1246 // Remove same number of characters from the end
1247 ThisTokEnd -= ThisTokBuf - Prefix;
1248 assert(ThisTokEnd >= ThisTokBuf && "malformed raw string literal");
1249
1250 // Copy the string over
1251 if (CopyStringFragment(StringToks[i], ThisTokBegin,
1252 StringRef(ThisTokBuf, ThisTokEnd - ThisTokBuf)))
1253 hadError = true;
1254 } else {
1255 if (ThisTokBuf[0] != '"') {
1256 // The file may have come from PCH and then changed after loading the
1257 // PCH; Fail gracefully.
1258 return DiagnoseLexingError(StringToks[i].getLocation());
1259 }
1260 ++ThisTokBuf; // skip "
1261
1262 // Check if this is a pascal string
1263 if (Features.PascalStrings && ThisTokBuf + 1 != ThisTokEnd &&
1264 ThisTokBuf[0] == '\\' && ThisTokBuf[1] == 'p') {
1265
1266 // If the \p sequence is found in the first token, we have a pascal string
1267 // Otherwise, if we already have a pascal string, ignore the first \p
1268 if (i == 0) {
1269 ++ThisTokBuf;
1270 Pascal = true;
1271 } else if (Pascal)
1272 ThisTokBuf += 2;
1273 }
1274
1275 while (ThisTokBuf != ThisTokEnd) {
1276 // Is this a span of non-escape characters?
1277 if (ThisTokBuf[0] != '\\') {
1278 const char *InStart = ThisTokBuf;
1279 do {
1280 ++ThisTokBuf;
1281 } while (ThisTokBuf != ThisTokEnd && ThisTokBuf[0] != '\\');
1282
1283 // Copy the character span over.
1284 if (CopyStringFragment(StringToks[i], ThisTokBegin,
1285 StringRef(InStart, ThisTokBuf - InStart)))
1286 hadError = true;
1287 continue;
1288 }
1289 // Is this a Universal Character Name escape?
1290 if (ThisTokBuf[1] == 'u' || ThisTokBuf[1] == 'U') {
1291 EncodeUCNEscape(ThisTokBegin, ThisTokBuf, ThisTokEnd,
1292 ResultPtr, hadError,
1293 FullSourceLoc(StringToks[i].getLocation(), SM),
1294 CharByteWidth, Diags, Features);
1295 continue;
1296 }
1297 // Otherwise, this is a non-UCN escape character. Process it.
1298 unsigned ResultChar =
1299 ProcessCharEscape(ThisTokBegin, ThisTokBuf, ThisTokEnd, hadError,
1300 FullSourceLoc(StringToks[i].getLocation(), SM),
1301 CharByteWidth*8, Diags, Features);
1302
1303 if (CharByteWidth == 4) {
1304 // FIXME: Make the type of the result buffer correct instead of
1305 // using reinterpret_cast.
1306 UTF32 *ResultWidePtr = reinterpret_cast<UTF32*>(ResultPtr);
1307 *ResultWidePtr = ResultChar;
1308 ResultPtr += 4;
1309 } else if (CharByteWidth == 2) {
1310 // FIXME: Make the type of the result buffer correct instead of
1311 // using reinterpret_cast.
1312 UTF16 *ResultWidePtr = reinterpret_cast<UTF16*>(ResultPtr);
1313 *ResultWidePtr = ResultChar & 0xFFFF;
1314 ResultPtr += 2;
1315 } else {
1316 assert(CharByteWidth == 1 && "Unexpected char width");
1317 *ResultPtr++ = ResultChar & 0xFF;
1318 }
1319 }
1320 }
1321 }
1322
1323 if (Pascal) {
1324 if (CharByteWidth == 4) {
1325 // FIXME: Make the type of the result buffer correct instead of
1326 // using reinterpret_cast.
1327 UTF32 *ResultWidePtr = reinterpret_cast<UTF32*>(ResultBuf.data());
1328 ResultWidePtr[0] = GetNumStringChars() - 1;
1329 } else if (CharByteWidth == 2) {
1330 // FIXME: Make the type of the result buffer correct instead of
1331 // using reinterpret_cast.
1332 UTF16 *ResultWidePtr = reinterpret_cast<UTF16*>(ResultBuf.data());
1333 ResultWidePtr[0] = GetNumStringChars() - 1;
1334 } else {
1335 assert(CharByteWidth == 1 && "Unexpected char width");
1336 ResultBuf[0] = GetNumStringChars() - 1;
1337 }
1338
1339 // Verify that pascal strings aren't too large.
1340 if (GetStringLength() > 256) {
1341 if (Diags)
1342 Diags->Report(StringToks[0].getLocation(),
1343 diag::err_pascal_string_too_long)
1344 << SourceRange(StringToks[0].getLocation(),
1345 StringToks[NumStringToks-1].getLocation());
1346 hadError = true;
1347 return;
1348 }
1349 } else if (Diags) {
1350 // Complain if this string literal has too many characters.
1351 unsigned MaxChars = Features.CPlusPlus? 65536 : Features.C99 ? 4095 : 509;
1352
1353 if (GetNumStringChars() > MaxChars)
1354 Diags->Report(StringToks[0].getLocation(),
1355 diag::ext_string_too_long)
1356 << GetNumStringChars() << MaxChars
1357 << (Features.CPlusPlus ? 2 : Features.C99 ? 1 : 0)
1358 << SourceRange(StringToks[0].getLocation(),
1359 StringToks[NumStringToks-1].getLocation());
1360 }
1361 }
1362
1363 /// \brief This function copies from Fragment, which is a sequence of bytes
1364 /// within Tok's contents (which begin at TokBegin) into ResultPtr.
1365 /// Performs widening for multi-byte characters.
CopyStringFragment(const Token & Tok,const char * TokBegin,StringRef Fragment)1366 bool StringLiteralParser::CopyStringFragment(const Token &Tok,
1367 const char *TokBegin,
1368 StringRef Fragment) {
1369 const UTF8 *ErrorPtrTmp;
1370 if (ConvertUTF8toWide(CharByteWidth, Fragment, ResultPtr, ErrorPtrTmp))
1371 return false;
1372 const char *ErrorPtr = reinterpret_cast<const char *>(ErrorPtrTmp);
1373
1374 // If we see bad encoding for unprefixed string literals, warn and
1375 // simply copy the byte values, for compatibility with gcc and older
1376 // versions of clang.
1377 bool NoErrorOnBadEncoding = isAscii();
1378 if (NoErrorOnBadEncoding) {
1379 memcpy(ResultPtr, Fragment.data(), Fragment.size());
1380 ResultPtr += Fragment.size();
1381 }
1382 if (Diags) {
1383 Diag(Diags, Features, FullSourceLoc(Tok.getLocation(), SM), TokBegin,
1384 ErrorPtr, ErrorPtr + std::min<unsigned>(getNumBytesForUTF8(*ErrorPtr),
1385 Fragment.end() - ErrorPtr),
1386 NoErrorOnBadEncoding ? diag::warn_bad_string_encoding
1387 : diag::err_bad_string_encoding);
1388 }
1389 return !NoErrorOnBadEncoding;
1390 }
1391
DiagnoseLexingError(SourceLocation Loc)1392 void StringLiteralParser::DiagnoseLexingError(SourceLocation Loc) {
1393 hadError = true;
1394 if (Diags)
1395 Diags->Report(Loc, diag::err_lexing_string);
1396 }
1397
1398 /// getOffsetOfStringByte - This function returns the offset of the
1399 /// specified byte of the string data represented by Token. This handles
1400 /// advancing over escape sequences in the string.
getOffsetOfStringByte(const Token & Tok,unsigned ByteNo) const1401 unsigned StringLiteralParser::getOffsetOfStringByte(const Token &Tok,
1402 unsigned ByteNo) const {
1403 // Get the spelling of the token.
1404 SmallString<32> SpellingBuffer;
1405 SpellingBuffer.resize(Tok.getLength());
1406
1407 bool StringInvalid = false;
1408 const char *SpellingPtr = &SpellingBuffer[0];
1409 unsigned TokLen = Lexer::getSpelling(Tok, SpellingPtr, SM, Features,
1410 &StringInvalid);
1411 if (StringInvalid)
1412 return 0;
1413
1414 const char *SpellingStart = SpellingPtr;
1415 const char *SpellingEnd = SpellingPtr+TokLen;
1416
1417 // Handle UTF-8 strings just like narrow strings.
1418 if (SpellingPtr[0] == 'u' && SpellingPtr[1] == '8')
1419 SpellingPtr += 2;
1420
1421 assert(SpellingPtr[0] != 'L' && SpellingPtr[0] != 'u' &&
1422 SpellingPtr[0] != 'U' && "Doesn't handle wide or utf strings yet");
1423
1424 // For raw string literals, this is easy.
1425 if (SpellingPtr[0] == 'R') {
1426 assert(SpellingPtr[1] == '"' && "Should be a raw string literal!");
1427 // Skip 'R"'.
1428 SpellingPtr += 2;
1429 while (*SpellingPtr != '(') {
1430 ++SpellingPtr;
1431 assert(SpellingPtr < SpellingEnd && "Missing ( for raw string literal");
1432 }
1433 // Skip '('.
1434 ++SpellingPtr;
1435 return SpellingPtr - SpellingStart + ByteNo;
1436 }
1437
1438 // Skip over the leading quote
1439 assert(SpellingPtr[0] == '"' && "Should be a string literal!");
1440 ++SpellingPtr;
1441
1442 // Skip over bytes until we find the offset we're looking for.
1443 while (ByteNo) {
1444 assert(SpellingPtr < SpellingEnd && "Didn't find byte offset!");
1445
1446 // Step over non-escapes simply.
1447 if (*SpellingPtr != '\\') {
1448 ++SpellingPtr;
1449 --ByteNo;
1450 continue;
1451 }
1452
1453 // Otherwise, this is an escape character. Advance over it.
1454 bool HadError = false;
1455 if (SpellingPtr[1] == 'u' || SpellingPtr[1] == 'U') {
1456 const char *EscapePtr = SpellingPtr;
1457 unsigned Len = MeasureUCNEscape(SpellingStart, SpellingPtr, SpellingEnd,
1458 1, Features, HadError);
1459 if (Len > ByteNo) {
1460 // ByteNo is somewhere within the escape sequence.
1461 SpellingPtr = EscapePtr;
1462 break;
1463 }
1464 ByteNo -= Len;
1465 } else {
1466 ProcessCharEscape(SpellingStart, SpellingPtr, SpellingEnd, HadError,
1467 FullSourceLoc(Tok.getLocation(), SM),
1468 CharByteWidth*8, Diags, Features);
1469 --ByteNo;
1470 }
1471 assert(!HadError && "This method isn't valid on erroneous strings");
1472 }
1473
1474 return SpellingPtr-SpellingStart;
1475 }
1476