• Home
  • Line#
  • Scopes#
  • Navigate#
  • Raw
  • Download
1 //===--- LiteralSupport.cpp - Code to parse and process literals ----------===//
2 //
3 //                     The LLVM Compiler Infrastructure
4 //
5 // This file is distributed under the University of Illinois Open Source
6 // License. See LICENSE.TXT for details.
7 //
8 //===----------------------------------------------------------------------===//
9 //
10 // This file implements the NumericLiteralParser, CharLiteralParser, and
11 // StringLiteralParser interfaces.
12 //
13 //===----------------------------------------------------------------------===//
14 
15 #include "clang/Lex/LiteralSupport.h"
16 #include "clang/Lex/Preprocessor.h"
17 #include "clang/Lex/LexDiagnostic.h"
18 #include "clang/Basic/TargetInfo.h"
19 #include "clang/Basic/ConvertUTF.h"
20 #include "llvm/ADT/StringExtras.h"
21 #include "llvm/Support/ErrorHandling.h"
22 using namespace clang;
23 
/// HexDigitValue - Translate one hexadecimal digit character into the number
/// it denotes (0 through 15).  Both lower- and upper-case letters are
/// accepted; any other character yields -1.
static int HexDigitValue(char C) {
  switch (C) {
  case '0': case '1': case '2': case '3': case '4':
  case '5': case '6': case '7': case '8': case '9':
    return C - '0';
  case 'a': case 'b': case 'c': case 'd': case 'e': case 'f':
    return 10 + (C - 'a');
  case 'A': case 'B': case 'C': case 'D': case 'E': case 'F':
    return 10 + (C - 'A');
  default:
    // Not a hex digit at all.
    return -1;
  }
}
32 
getCharWidth(tok::TokenKind kind,const TargetInfo & Target)33 static unsigned getCharWidth(tok::TokenKind kind, const TargetInfo &Target) {
34   switch (kind) {
35   default: llvm_unreachable("Unknown token type!");
36   case tok::char_constant:
37   case tok::string_literal:
38   case tok::utf8_string_literal:
39     return Target.getCharWidth();
40   case tok::wide_char_constant:
41   case tok::wide_string_literal:
42     return Target.getWCharWidth();
43   case tok::utf16_char_constant:
44   case tok::utf16_string_literal:
45     return Target.getChar16Width();
46   case tok::utf32_char_constant:
47   case tok::utf32_string_literal:
48     return Target.getChar32Width();
49   }
50 }
51 
52 /// ProcessCharEscape - Parse a standard C escape sequence, which can occur in
53 /// either a character or a string literal.
ProcessCharEscape(const char * & ThisTokBuf,const char * ThisTokEnd,bool & HadError,FullSourceLoc Loc,unsigned CharWidth,DiagnosticsEngine * Diags)54 static unsigned ProcessCharEscape(const char *&ThisTokBuf,
55                                   const char *ThisTokEnd, bool &HadError,
56                                   FullSourceLoc Loc, unsigned CharWidth,
57                                   DiagnosticsEngine *Diags) {
58   // Skip the '\' char.
59   ++ThisTokBuf;
60 
61   // We know that this character can't be off the end of the buffer, because
62   // that would have been \", which would not have been the end of string.
63   unsigned ResultChar = *ThisTokBuf++;
64   switch (ResultChar) {
65   // These map to themselves.
66   case '\\': case '\'': case '"': case '?': break;
67 
68     // These have fixed mappings.
69   case 'a':
70     // TODO: K&R: the meaning of '\\a' is different in traditional C
71     ResultChar = 7;
72     break;
73   case 'b':
74     ResultChar = 8;
75     break;
76   case 'e':
77     if (Diags)
78       Diags->Report(Loc, diag::ext_nonstandard_escape) << "e";
79     ResultChar = 27;
80     break;
81   case 'E':
82     if (Diags)
83       Diags->Report(Loc, diag::ext_nonstandard_escape) << "E";
84     ResultChar = 27;
85     break;
86   case 'f':
87     ResultChar = 12;
88     break;
89   case 'n':
90     ResultChar = 10;
91     break;
92   case 'r':
93     ResultChar = 13;
94     break;
95   case 't':
96     ResultChar = 9;
97     break;
98   case 'v':
99     ResultChar = 11;
100     break;
101   case 'x': { // Hex escape.
102     ResultChar = 0;
103     if (ThisTokBuf == ThisTokEnd || !isxdigit(*ThisTokBuf)) {
104       if (Diags)
105         Diags->Report(Loc, diag::err_hex_escape_no_digits);
106       HadError = 1;
107       break;
108     }
109 
110     // Hex escapes are a maximal series of hex digits.
111     bool Overflow = false;
112     for (; ThisTokBuf != ThisTokEnd; ++ThisTokBuf) {
113       int CharVal = HexDigitValue(ThisTokBuf[0]);
114       if (CharVal == -1) break;
115       // About to shift out a digit?
116       Overflow |= (ResultChar & 0xF0000000) ? true : false;
117       ResultChar <<= 4;
118       ResultChar |= CharVal;
119     }
120 
121     // See if any bits will be truncated when evaluated as a character.
122     if (CharWidth != 32 && (ResultChar >> CharWidth) != 0) {
123       Overflow = true;
124       ResultChar &= ~0U >> (32-CharWidth);
125     }
126 
127     // Check for overflow.
128     if (Overflow && Diags)   // Too many digits to fit in
129       Diags->Report(Loc, diag::warn_hex_escape_too_large);
130     break;
131   }
132   case '0': case '1': case '2': case '3':
133   case '4': case '5': case '6': case '7': {
134     // Octal escapes.
135     --ThisTokBuf;
136     ResultChar = 0;
137 
138     // Octal escapes are a series of octal digits with maximum length 3.
139     // "\0123" is a two digit sequence equal to "\012" "3".
140     unsigned NumDigits = 0;
141     do {
142       ResultChar <<= 3;
143       ResultChar |= *ThisTokBuf++ - '0';
144       ++NumDigits;
145     } while (ThisTokBuf != ThisTokEnd && NumDigits < 3 &&
146              ThisTokBuf[0] >= '0' && ThisTokBuf[0] <= '7');
147 
148     // Check for overflow.  Reject '\777', but not L'\777'.
149     if (CharWidth != 32 && (ResultChar >> CharWidth) != 0) {
150       if (Diags)
151         Diags->Report(Loc, diag::warn_octal_escape_too_large);
152       ResultChar &= ~0U >> (32-CharWidth);
153     }
154     break;
155   }
156 
157     // Otherwise, these are not valid escapes.
158   case '(': case '{': case '[': case '%':
159     // GCC accepts these as extensions.  We warn about them as such though.
160     if (Diags)
161       Diags->Report(Loc, diag::ext_nonstandard_escape)
162         << std::string()+(char)ResultChar;
163     break;
164   default:
165     if (Diags == 0)
166       break;
167 
168     if (isgraph(ResultChar))
169       Diags->Report(Loc, diag::ext_unknown_escape)
170         << std::string()+(char)ResultChar;
171     else
172       Diags->Report(Loc, diag::ext_unknown_escape)
173         << "x"+llvm::utohexstr(ResultChar);
174     break;
175   }
176 
177   return ResultChar;
178 }
179 
180 /// ProcessUCNEscape - Read the Universal Character Name, check constraints and
181 /// return the UTF32.
ProcessUCNEscape(const char * ThisTokBegin,const char * & ThisTokBuf,const char * ThisTokEnd,uint32_t & UcnVal,unsigned short & UcnLen,FullSourceLoc Loc,DiagnosticsEngine * Diags,const LangOptions & Features,bool in_char_string_literal=false)182 static bool ProcessUCNEscape(const char *ThisTokBegin, const char *&ThisTokBuf,
183                              const char *ThisTokEnd,
184                              uint32_t &UcnVal, unsigned short &UcnLen,
185                              FullSourceLoc Loc, DiagnosticsEngine *Diags,
186                              const LangOptions &Features,
187                              bool in_char_string_literal = false) {
188   if (!Features.CPlusPlus && !Features.C99 && Diags)
189     Diags->Report(Loc, diag::warn_ucn_not_valid_in_c89);
190 
191   const char *UcnBegin = ThisTokBuf;
192 
193   // Skip the '\u' char's.
194   ThisTokBuf += 2;
195 
196   if (ThisTokBuf == ThisTokEnd || !isxdigit(*ThisTokBuf)) {
197     if (Diags)
198       Diags->Report(Loc, diag::err_ucn_escape_no_digits);
199     return false;
200   }
201   UcnLen = (ThisTokBuf[-1] == 'u' ? 4 : 8);
202   unsigned short UcnLenSave = UcnLen;
203   for (; ThisTokBuf != ThisTokEnd && UcnLenSave; ++ThisTokBuf, UcnLenSave--) {
204     int CharVal = HexDigitValue(ThisTokBuf[0]);
205     if (CharVal == -1) break;
206     UcnVal <<= 4;
207     UcnVal |= CharVal;
208   }
209   // If we didn't consume the proper number of digits, there is a problem.
210   if (UcnLenSave) {
211     if (Diags) {
212       SourceLocation L =
213         Lexer::AdvanceToTokenCharacter(Loc, UcnBegin - ThisTokBegin,
214                                        Loc.getManager(), Features);
215       Diags->Report(L, diag::err_ucn_escape_incomplete);
216     }
217     return false;
218   }
219 
220   // Check UCN constraints (C99 6.4.3p2) [C++11 lex.charset p2]
221   if ((0xD800 <= UcnVal && UcnVal <= 0xDFFF) || // surrogate codepoints
222       UcnVal > 0x10FFFF) {                      // maximum legal UTF32 value
223     if (Diags)
224       Diags->Report(Loc, diag::err_ucn_escape_invalid);
225     return false;
226   }
227 
228   // C++11 allows UCNs that refer to control characters and basic source
229   // characters inside character and string literals
230   if (UcnVal < 0xa0 &&
231       (UcnVal != 0x24 && UcnVal != 0x40 && UcnVal != 0x60)) {  // $, @, `
232     bool IsError = (!Features.CPlusPlus0x || !in_char_string_literal);
233     if (Diags) {
234       SourceLocation UcnBeginLoc =
235         Lexer::AdvanceToTokenCharacter(Loc, UcnBegin - ThisTokBegin,
236                                        Loc.getManager(), Features);
237       char BasicSCSChar = UcnVal;
238       if (UcnVal >= 0x20 && UcnVal < 0x7f)
239         Diags->Report(UcnBeginLoc, IsError ? diag::err_ucn_escape_basic_scs :
240                       diag::warn_cxx98_compat_literal_ucn_escape_basic_scs)
241           << StringRef(&BasicSCSChar, 1);
242       else
243         Diags->Report(UcnBeginLoc, IsError ? diag::err_ucn_control_character :
244                       diag::warn_cxx98_compat_literal_ucn_control_character);
245     }
246     if (IsError)
247       return false;
248   }
249 
250   return true;
251 }
252 
253 /// EncodeUCNEscape - Read the Universal Character Name, check constraints and
254 /// convert the UTF32 to UTF8 or UTF16. This is a subroutine of
255 /// StringLiteralParser. When we decide to implement UCN's for identifiers,
256 /// we will likely rework our support for UCN's.
EncodeUCNEscape(const char * ThisTokBegin,const char * & ThisTokBuf,const char * ThisTokEnd,char * & ResultBuf,bool & HadError,FullSourceLoc Loc,unsigned CharByteWidth,DiagnosticsEngine * Diags,const LangOptions & Features)257 static void EncodeUCNEscape(const char *ThisTokBegin, const char *&ThisTokBuf,
258                             const char *ThisTokEnd,
259                             char *&ResultBuf, bool &HadError,
260                             FullSourceLoc Loc, unsigned CharByteWidth,
261                             DiagnosticsEngine *Diags,
262                             const LangOptions &Features) {
263   typedef uint32_t UTF32;
264   UTF32 UcnVal = 0;
265   unsigned short UcnLen = 0;
266   if (!ProcessUCNEscape(ThisTokBegin, ThisTokBuf, ThisTokEnd, UcnVal, UcnLen,
267                         Loc, Diags, Features, true)) {
268     HadError = 1;
269     return;
270   }
271 
272   assert((CharByteWidth == 1 || CharByteWidth == 2 || CharByteWidth) &&
273          "only character widths of 1, 2, or 4 bytes supported");
274 
275   (void)UcnLen;
276   assert((UcnLen== 4 || UcnLen== 8) && "only ucn length of 4 or 8 supported");
277 
278   if (CharByteWidth == 4) {
279     // FIXME: Make the type of the result buffer correct instead of
280     // using reinterpret_cast.
281     UTF32 *ResultPtr = reinterpret_cast<UTF32*>(ResultBuf);
282     *ResultPtr = UcnVal;
283     ResultBuf += 4;
284     return;
285   }
286 
287   if (CharByteWidth == 2) {
288     // FIXME: Make the type of the result buffer correct instead of
289     // using reinterpret_cast.
290     UTF16 *ResultPtr = reinterpret_cast<UTF16*>(ResultBuf);
291 
292     if (UcnVal < (UTF32)0xFFFF) {
293       *ResultPtr = UcnVal;
294       ResultBuf += 2;
295       return;
296     }
297 
298     // Convert to UTF16.
299     UcnVal -= 0x10000;
300     *ResultPtr     = 0xD800 + (UcnVal >> 10);
301     *(ResultPtr+1) = 0xDC00 + (UcnVal & 0x3FF);
302     ResultBuf += 4;
303     return;
304   }
305 
306   assert(CharByteWidth == 1 && "UTF-8 encoding is only for 1 byte characters");
307 
308   // Now that we've parsed/checked the UCN, we convert from UTF32->UTF8.
309   // The conversion below was inspired by:
310   //   http://www.unicode.org/Public/PROGRAMS/CVTUTF/ConvertUTF.c
311   // First, we determine how many bytes the result will require.
312   typedef uint8_t UTF8;
313 
314   unsigned short bytesToWrite = 0;
315   if (UcnVal < (UTF32)0x80)
316     bytesToWrite = 1;
317   else if (UcnVal < (UTF32)0x800)
318     bytesToWrite = 2;
319   else if (UcnVal < (UTF32)0x10000)
320     bytesToWrite = 3;
321   else
322     bytesToWrite = 4;
323 
324   const unsigned byteMask = 0xBF;
325   const unsigned byteMark = 0x80;
326 
327   // Once the bits are split out into bytes of UTF8, this is a mask OR-ed
328   // into the first byte, depending on how many bytes follow.
329   static const UTF8 firstByteMark[5] = {
330     0x00, 0x00, 0xC0, 0xE0, 0xF0
331   };
332   // Finally, we write the bytes into ResultBuf.
333   ResultBuf += bytesToWrite;
334   switch (bytesToWrite) { // note: everything falls through.
335     case 4: *--ResultBuf = (UTF8)((UcnVal | byteMark) & byteMask); UcnVal >>= 6;
336     case 3: *--ResultBuf = (UTF8)((UcnVal | byteMark) & byteMask); UcnVal >>= 6;
337     case 2: *--ResultBuf = (UTF8)((UcnVal | byteMark) & byteMask); UcnVal >>= 6;
338     case 1: *--ResultBuf = (UTF8) (UcnVal | firstByteMark[bytesToWrite]);
339   }
340   // Update the buffer.
341   ResultBuf += bytesToWrite;
342 }
343 
344 
345 ///       integer-constant: [C99 6.4.4.1]
346 ///         decimal-constant integer-suffix
347 ///         octal-constant integer-suffix
348 ///         hexadecimal-constant integer-suffix
349 ///       user-defined-integer-literal: [C++11 lex.ext]
350 ///         decimal-literal ud-suffix
351 ///         octal-literal ud-suffix
352 ///         hexadecimal-literal ud-suffix
353 ///       decimal-constant:
354 ///         nonzero-digit
355 ///         decimal-constant digit
356 ///       octal-constant:
357 ///         0
358 ///         octal-constant octal-digit
359 ///       hexadecimal-constant:
360 ///         hexadecimal-prefix hexadecimal-digit
361 ///         hexadecimal-constant hexadecimal-digit
362 ///       hexadecimal-prefix: one of
363 ///         0x 0X
364 ///       integer-suffix:
365 ///         unsigned-suffix [long-suffix]
366 ///         unsigned-suffix [long-long-suffix]
367 ///         long-suffix [unsigned-suffix]
368 ///         long-long-suffix [unsigned-suffix]
369 ///       nonzero-digit:
370 ///         1 2 3 4 5 6 7 8 9
371 ///       octal-digit:
372 ///         0 1 2 3 4 5 6 7
373 ///       hexadecimal-digit:
374 ///         0 1 2 3 4 5 6 7 8 9
375 ///         a b c d e f
376 ///         A B C D E F
377 ///       unsigned-suffix: one of
378 ///         u U
379 ///       long-suffix: one of
380 ///         l L
381 ///       long-long-suffix: one of
382 ///         ll LL
383 ///
384 ///       floating-constant: [C99 6.4.4.2]
385 ///         TODO: add rules...
386 ///
387 NumericLiteralParser::
NumericLiteralParser(const char * begin,const char * end,SourceLocation TokLoc,Preprocessor & pp)388 NumericLiteralParser(const char *begin, const char *end,
389                      SourceLocation TokLoc, Preprocessor &pp)
390   : PP(pp), ThisTokBegin(begin), ThisTokEnd(end) {
391 
392   // This routine assumes that the range begin/end matches the regex for integer
393   // and FP constants (specifically, the 'pp-number' regex), and assumes that
394   // the byte at "*end" is both valid and not part of the regex.  Because of
395   // this, it doesn't have to check for 'overscan' in various places.
396   assert(!isalnum(*end) && *end != '.' && *end != '_' &&
397          "Lexer didn't maximally munch?");
398 
399   s = DigitsBegin = begin;
400   saw_exponent = false;
401   saw_period = false;
402   saw_ud_suffix = false;
403   isLong = false;
404   isUnsigned = false;
405   isLongLong = false;
406   isFloat = false;
407   isImaginary = false;
408   isMicrosoftInteger = false;
409   hadError = false;
410 
411   if (*s == '0') { // parse radix
412     ParseNumberStartingWithZero(TokLoc);
413     if (hadError)
414       return;
415   } else { // the first digit is non-zero
416     radix = 10;
417     s = SkipDigits(s);
418     if (s == ThisTokEnd) {
419       // Done.
420     } else if (isxdigit(*s) && !(*s == 'e' || *s == 'E')) {
421       PP.Diag(PP.AdvanceToTokenCharacter(TokLoc, s-begin),
422               diag::err_invalid_decimal_digit) << StringRef(s, 1);
423       hadError = true;
424       return;
425     } else if (*s == '.') {
426       s++;
427       saw_period = true;
428       s = SkipDigits(s);
429     }
430     if ((*s == 'e' || *s == 'E')) { // exponent
431       const char *Exponent = s;
432       s++;
433       saw_exponent = true;
434       if (*s == '+' || *s == '-')  s++; // sign
435       const char *first_non_digit = SkipDigits(s);
436       if (first_non_digit != s) {
437         s = first_non_digit;
438       } else {
439         PP.Diag(PP.AdvanceToTokenCharacter(TokLoc, Exponent-begin),
440                 diag::err_exponent_has_no_digits);
441         hadError = true;
442         return;
443       }
444     }
445   }
446 
447   SuffixBegin = s;
448 
449   // Parse the suffix.  At this point we can classify whether we have an FP or
450   // integer constant.
451   bool isFPConstant = isFloatingLiteral();
452 
453   // Loop over all of the characters of the suffix.  If we see something bad,
454   // we break out of the loop.
455   for (; s != ThisTokEnd; ++s) {
456     switch (*s) {
457     case 'f':      // FP Suffix for "float"
458     case 'F':
459       if (!isFPConstant) break;  // Error for integer constant.
460       if (isFloat || isLong) break; // FF, LF invalid.
461       isFloat = true;
462       continue;  // Success.
463     case 'u':
464     case 'U':
465       if (isFPConstant) break;  // Error for floating constant.
466       if (isUnsigned) break;    // Cannot be repeated.
467       isUnsigned = true;
468       continue;  // Success.
469     case 'l':
470     case 'L':
471       if (isLong || isLongLong) break;  // Cannot be repeated.
472       if (isFloat) break;               // LF invalid.
473 
474       // Check for long long.  The L's need to be adjacent and the same case.
475       if (s+1 != ThisTokEnd && s[1] == s[0]) {
476         if (isFPConstant) break;        // long long invalid for floats.
477         isLongLong = true;
478         ++s;  // Eat both of them.
479       } else {
480         isLong = true;
481       }
482       continue;  // Success.
483     case 'i':
484     case 'I':
485       if (PP.getLangOpts().MicrosoftExt) {
486         if (isFPConstant || isLong || isLongLong) break;
487 
488         // Allow i8, i16, i32, i64, and i128.
489         if (s + 1 != ThisTokEnd) {
490           switch (s[1]) {
491             case '8':
492               s += 2; // i8 suffix
493               isMicrosoftInteger = true;
494               break;
495             case '1':
496               if (s + 2 == ThisTokEnd) break;
497               if (s[2] == '6') {
498                 s += 3; // i16 suffix
499                 isMicrosoftInteger = true;
500               }
501               else if (s[2] == '2') {
502                 if (s + 3 == ThisTokEnd) break;
503                 if (s[3] == '8') {
504                   s += 4; // i128 suffix
505                   isMicrosoftInteger = true;
506                 }
507               }
508               break;
509             case '3':
510               if (s + 2 == ThisTokEnd) break;
511               if (s[2] == '2') {
512                 s += 3; // i32 suffix
513                 isLong = true;
514                 isMicrosoftInteger = true;
515               }
516               break;
517             case '6':
518               if (s + 2 == ThisTokEnd) break;
519               if (s[2] == '4') {
520                 s += 3; // i64 suffix
521                 isLongLong = true;
522                 isMicrosoftInteger = true;
523               }
524               break;
525             default:
526               break;
527           }
528           break;
529         }
530       }
531       // fall through.
532     case 'j':
533     case 'J':
534       if (isImaginary) break;   // Cannot be repeated.
535       PP.Diag(PP.AdvanceToTokenCharacter(TokLoc, s-begin),
536               diag::ext_imaginary_constant);
537       isImaginary = true;
538       continue;  // Success.
539     }
540     // If we reached here, there was an error or a ud-suffix.
541     break;
542   }
543 
544   if (s != ThisTokEnd) {
545     if (PP.getLangOpts().CPlusPlus0x && s == SuffixBegin && *s == '_') {
546       // We have a ud-suffix! By C++11 [lex.ext]p10, ud-suffixes not starting
547       // with an '_' are ill-formed.
548       saw_ud_suffix = true;
549       return;
550     }
551 
552     // Report an error if there are any.
553     PP.Diag(PP.AdvanceToTokenCharacter(TokLoc, SuffixBegin-begin),
554             isFPConstant ? diag::err_invalid_suffix_float_constant :
555                            diag::err_invalid_suffix_integer_constant)
556       << StringRef(SuffixBegin, ThisTokEnd-SuffixBegin);
557     hadError = true;
558     return;
559   }
560 }
561 
562 /// ParseNumberStartingWithZero - This method is called when the first character
563 /// of the number is found to be a zero.  This means it is either an octal
564 /// number (like '04') or a hex number ('0x123a') a binary number ('0b1010') or
565 /// a floating point number (01239.123e4).  Eat the prefix, determining the
566 /// radix etc.
ParseNumberStartingWithZero(SourceLocation TokLoc)567 void NumericLiteralParser::ParseNumberStartingWithZero(SourceLocation TokLoc) {
568   assert(s[0] == '0' && "Invalid method call");
569   s++;
570 
571   // Handle a hex number like 0x1234.
572   if ((*s == 'x' || *s == 'X') && (isxdigit(s[1]) || s[1] == '.')) {
573     s++;
574     radix = 16;
575     DigitsBegin = s;
576     s = SkipHexDigits(s);
577     bool noSignificand = (s == DigitsBegin);
578     if (s == ThisTokEnd) {
579       // Done.
580     } else if (*s == '.') {
581       s++;
582       saw_period = true;
583       const char *floatDigitsBegin = s;
584       s = SkipHexDigits(s);
585       noSignificand &= (floatDigitsBegin == s);
586     }
587 
588     if (noSignificand) {
589       PP.Diag(PP.AdvanceToTokenCharacter(TokLoc, s-ThisTokBegin), \
590         diag::err_hexconstant_requires_digits);
591       hadError = true;
592       return;
593     }
594 
595     // A binary exponent can appear with or with a '.'. If dotted, the
596     // binary exponent is required.
597     if (*s == 'p' || *s == 'P') {
598       const char *Exponent = s;
599       s++;
600       saw_exponent = true;
601       if (*s == '+' || *s == '-')  s++; // sign
602       const char *first_non_digit = SkipDigits(s);
603       if (first_non_digit == s) {
604         PP.Diag(PP.AdvanceToTokenCharacter(TokLoc, Exponent-ThisTokBegin),
605                 diag::err_exponent_has_no_digits);
606         hadError = true;
607         return;
608       }
609       s = first_non_digit;
610 
611       if (!PP.getLangOpts().HexFloats)
612         PP.Diag(TokLoc, diag::ext_hexconstant_invalid);
613     } else if (saw_period) {
614       PP.Diag(PP.AdvanceToTokenCharacter(TokLoc, s-ThisTokBegin),
615               diag::err_hexconstant_requires_exponent);
616       hadError = true;
617     }
618     return;
619   }
620 
621   // Handle simple binary numbers 0b01010
622   if (*s == 'b' || *s == 'B') {
623     // 0b101010 is a GCC extension.
624     PP.Diag(TokLoc, diag::ext_binary_literal);
625     ++s;
626     radix = 2;
627     DigitsBegin = s;
628     s = SkipBinaryDigits(s);
629     if (s == ThisTokEnd) {
630       // Done.
631     } else if (isxdigit(*s)) {
632       PP.Diag(PP.AdvanceToTokenCharacter(TokLoc, s-ThisTokBegin),
633               diag::err_invalid_binary_digit) << StringRef(s, 1);
634       hadError = true;
635     }
636     // Other suffixes will be diagnosed by the caller.
637     return;
638   }
639 
640   // For now, the radix is set to 8. If we discover that we have a
641   // floating point constant, the radix will change to 10. Octal floating
642   // point constants are not permitted (only decimal and hexadecimal).
643   radix = 8;
644   DigitsBegin = s;
645   s = SkipOctalDigits(s);
646   if (s == ThisTokEnd)
647     return; // Done, simple octal number like 01234
648 
649   // If we have some other non-octal digit that *is* a decimal digit, see if
650   // this is part of a floating point number like 094.123 or 09e1.
651   if (isdigit(*s)) {
652     const char *EndDecimal = SkipDigits(s);
653     if (EndDecimal[0] == '.' || EndDecimal[0] == 'e' || EndDecimal[0] == 'E') {
654       s = EndDecimal;
655       radix = 10;
656     }
657   }
658 
659   // If we have a hex digit other than 'e' (which denotes a FP exponent) then
660   // the code is using an incorrect base.
661   if (isxdigit(*s) && *s != 'e' && *s != 'E') {
662     PP.Diag(PP.AdvanceToTokenCharacter(TokLoc, s-ThisTokBegin),
663             diag::err_invalid_octal_digit) << StringRef(s, 1);
664     hadError = true;
665     return;
666   }
667 
668   if (*s == '.') {
669     s++;
670     radix = 10;
671     saw_period = true;
672     s = SkipDigits(s); // Skip suffix.
673   }
674   if (*s == 'e' || *s == 'E') { // exponent
675     const char *Exponent = s;
676     s++;
677     radix = 10;
678     saw_exponent = true;
679     if (*s == '+' || *s == '-')  s++; // sign
680     const char *first_non_digit = SkipDigits(s);
681     if (first_non_digit != s) {
682       s = first_non_digit;
683     } else {
684       PP.Diag(PP.AdvanceToTokenCharacter(TokLoc, Exponent-ThisTokBegin),
685               diag::err_exponent_has_no_digits);
686       hadError = true;
687       return;
688     }
689   }
690 }
691 
692 
693 /// GetIntegerValue - Convert this numeric literal value to an APInt that
694 /// matches Val's input width.  If there is an overflow, set Val to the low bits
695 /// of the result and return true.  Otherwise, return false.
GetIntegerValue(llvm::APInt & Val)696 bool NumericLiteralParser::GetIntegerValue(llvm::APInt &Val) {
697   // Fast path: Compute a conservative bound on the maximum number of
698   // bits per digit in this radix. If we can't possibly overflow a
699   // uint64 based on that bound then do the simple conversion to
700   // integer. This avoids the expensive overflow checking below, and
701   // handles the common cases that matter (small decimal integers and
702   // hex/octal values which don't overflow).
703   unsigned MaxBitsPerDigit = 1;
704   while ((1U << MaxBitsPerDigit) < radix)
705     MaxBitsPerDigit += 1;
706   if ((SuffixBegin - DigitsBegin) * MaxBitsPerDigit <= 64) {
707     uint64_t N = 0;
708     for (s = DigitsBegin; s != SuffixBegin; ++s)
709       N = N*radix + HexDigitValue(*s);
710 
711     // This will truncate the value to Val's input width. Simply check
712     // for overflow by comparing.
713     Val = N;
714     return Val.getZExtValue() != N;
715   }
716 
717   Val = 0;
718   s = DigitsBegin;
719 
720   llvm::APInt RadixVal(Val.getBitWidth(), radix);
721   llvm::APInt CharVal(Val.getBitWidth(), 0);
722   llvm::APInt OldVal = Val;
723 
724   bool OverflowOccurred = false;
725   while (s < SuffixBegin) {
726     unsigned C = HexDigitValue(*s++);
727 
728     // If this letter is out of bound for this radix, reject it.
729     assert(C < radix && "NumericLiteralParser ctor should have rejected this");
730 
731     CharVal = C;
732 
733     // Add the digit to the value in the appropriate radix.  If adding in digits
734     // made the value smaller, then this overflowed.
735     OldVal = Val;
736 
737     // Multiply by radix, did overflow occur on the multiply?
738     Val *= RadixVal;
739     OverflowOccurred |= Val.udiv(RadixVal) != OldVal;
740 
741     // Add value, did overflow occur on the value?
742     //   (a + b) ult b  <=> overflow
743     Val += CharVal;
744     OverflowOccurred |= Val.ult(CharVal);
745   }
746   return OverflowOccurred;
747 }
748 
749 llvm::APFloat::opStatus
GetFloatValue(llvm::APFloat & Result)750 NumericLiteralParser::GetFloatValue(llvm::APFloat &Result) {
751   using llvm::APFloat;
752 
753   unsigned n = std::min(SuffixBegin - ThisTokBegin, ThisTokEnd - ThisTokBegin);
754   return Result.convertFromString(StringRef(ThisTokBegin, n),
755                                   APFloat::rmNearestTiesToEven);
756 }
757 
758 
759 ///       user-defined-character-literal: [C++11 lex.ext]
760 ///         character-literal ud-suffix
761 ///       ud-suffix:
762 ///         identifier
763 ///       character-literal: [C++11 lex.ccon]
764 ///         ' c-char-sequence '
765 ///         u' c-char-sequence '
766 ///         U' c-char-sequence '
767 ///         L' c-char-sequence '
768 ///       c-char-sequence:
769 ///         c-char
770 ///         c-char-sequence c-char
771 ///       c-char:
772 ///         any member of the source character set except the single-quote ',
773 ///           backslash \, or new-line character
774 ///         escape-sequence
775 ///         universal-character-name
776 ///       escape-sequence:
777 ///         simple-escape-sequence
778 ///         octal-escape-sequence
779 ///         hexadecimal-escape-sequence
780 ///       simple-escape-sequence:
781 ///         one of \' \" \? \\ \a \b \f \n \r \t \v
782 ///       octal-escape-sequence:
783 ///         \ octal-digit
784 ///         \ octal-digit octal-digit
785 ///         \ octal-digit octal-digit octal-digit
786 ///       hexadecimal-escape-sequence:
787 ///         \x hexadecimal-digit
788 ///         hexadecimal-escape-sequence hexadecimal-digit
789 ///       universal-character-name: [C++11 lex.charset]
790 ///         \u hex-quad
791 ///         \U hex-quad hex-quad
792 ///       hex-quad:
793 ///         hex-digit hex-digit hex-digit hex-digit
794 ///
CharLiteralParser(const char * begin,const char * end,SourceLocation Loc,Preprocessor & PP,tok::TokenKind kind)795 CharLiteralParser::CharLiteralParser(const char *begin, const char *end,
796                                      SourceLocation Loc, Preprocessor &PP,
797                                      tok::TokenKind kind) {
798   // At this point we know that the character matches the regex "(L|u|U)?'.*'".
799   HadError = false;
800 
801   Kind = kind;
802 
803   const char *TokBegin = begin;
804 
805   // Skip over wide character determinant.
806   if (Kind != tok::char_constant) {
807     ++begin;
808   }
809 
810   // Skip over the entry quote.
811   assert(begin[0] == '\'' && "Invalid token lexed");
812   ++begin;
813 
814   // Remove an optional ud-suffix.
815   if (end[-1] != '\'') {
816     const char *UDSuffixEnd = end;
817     do {
818       --end;
819     } while (end[-1] != '\'');
820     UDSuffixBuf.assign(end, UDSuffixEnd);
821     UDSuffixOffset = end - TokBegin;
822   }
823 
824   // Trim the ending quote.
825   assert(end != begin && "Invalid token lexed");
826   --end;
827 
828   // FIXME: The "Value" is an uint64_t so we can handle char literals of
829   // up to 64-bits.
830   // FIXME: This extensively assumes that 'char' is 8-bits.
831   assert(PP.getTargetInfo().getCharWidth() == 8 &&
832          "Assumes char is 8 bits");
833   assert(PP.getTargetInfo().getIntWidth() <= 64 &&
834          (PP.getTargetInfo().getIntWidth() & 7) == 0 &&
835          "Assumes sizeof(int) on target is <= 64 and a multiple of char");
836   assert(PP.getTargetInfo().getWCharWidth() <= 64 &&
837          "Assumes sizeof(wchar) on target is <= 64");
838 
839   SmallVector<uint32_t,4> codepoint_buffer;
840   codepoint_buffer.resize(end-begin);
841   uint32_t *buffer_begin = &codepoint_buffer.front();
842   uint32_t *buffer_end = buffer_begin + codepoint_buffer.size();
843 
844   // Unicode escapes representing characters that cannot be correctly
845   // represented in a single code unit are disallowed in character literals
846   // by this implementation.
847   uint32_t largest_character_for_kind;
848   if (tok::wide_char_constant == Kind) {
849     largest_character_for_kind = 0xFFFFFFFFu >> (32-PP.getTargetInfo().getWCharWidth());
850   } else if (tok::utf16_char_constant == Kind) {
851     largest_character_for_kind = 0xFFFF;
852   } else if (tok::utf32_char_constant == Kind) {
853     largest_character_for_kind = 0x10FFFF;
854   } else {
855     largest_character_for_kind = 0x7Fu;
856   }
857 
858   while (begin!=end) {
859     // Is this a span of non-escape characters?
860     if (begin[0] != '\\') {
861       char const *start = begin;
862       do {
863         ++begin;
864       } while (begin != end && *begin != '\\');
865 
866       char const *tmp_in_start = start;
867       uint32_t *tmp_out_start = buffer_begin;
868       ConversionResult res =
869       ConvertUTF8toUTF32(reinterpret_cast<UTF8 const **>(&start),
870                          reinterpret_cast<UTF8 const *>(begin),
871                          &buffer_begin,buffer_end,strictConversion);
872       if (res!=conversionOK) {
873         // If we see bad encoding for unprefixed character literals, warn and
874         // simply copy the byte values, for compatibility with gcc and
875         // older versions of clang.
876         bool NoErrorOnBadEncoding = isAscii();
877         unsigned Msg = diag::err_bad_character_encoding;
878         if (NoErrorOnBadEncoding)
879           Msg = diag::warn_bad_character_encoding;
880         PP.Diag(Loc, Msg);
881         if (NoErrorOnBadEncoding) {
882           start = tmp_in_start;
883           buffer_begin = tmp_out_start;
884           for ( ; start != begin; ++start, ++buffer_begin)
885             *buffer_begin = static_cast<uint8_t>(*start);
886         } else {
887           HadError = true;
888         }
889       } else {
890         for (; tmp_out_start <buffer_begin; ++tmp_out_start) {
891           if (*tmp_out_start > largest_character_for_kind) {
892             HadError = true;
893             PP.Diag(Loc, diag::err_character_too_large);
894           }
895         }
896       }
897 
898       continue;
899     }
900     // Is this a Universal Character Name excape?
901     if (begin[1] == 'u' || begin[1] == 'U') {
902       unsigned short UcnLen = 0;
903       if (!ProcessUCNEscape(TokBegin, begin, end, *buffer_begin, UcnLen,
904                             FullSourceLoc(Loc, PP.getSourceManager()),
905                             &PP.getDiagnostics(), PP.getLangOpts(),
906                             true))
907       {
908         HadError = true;
909       } else if (*buffer_begin > largest_character_for_kind) {
910         HadError = true;
911         PP.Diag(Loc,diag::err_character_too_large);
912       }
913 
914       ++buffer_begin;
915       continue;
916     }
917     unsigned CharWidth = getCharWidth(Kind, PP.getTargetInfo());
918     uint64_t result =
919     ProcessCharEscape(begin, end, HadError,
920                       FullSourceLoc(Loc,PP.getSourceManager()),
921                       CharWidth, &PP.getDiagnostics());
922     *buffer_begin++ = result;
923   }
924 
925   unsigned NumCharsSoFar = buffer_begin-&codepoint_buffer.front();
926 
927   if (NumCharsSoFar > 1) {
928     if (isWide())
929       PP.Diag(Loc, diag::warn_extraneous_char_constant);
930     else if (isAscii() && NumCharsSoFar == 4)
931       PP.Diag(Loc, diag::ext_four_char_character_literal);
932     else if (isAscii())
933       PP.Diag(Loc, diag::ext_multichar_character_literal);
934     else
935       PP.Diag(Loc, diag::err_multichar_utf_character_literal);
936     IsMultiChar = true;
937   } else
938     IsMultiChar = false;
939 
940   llvm::APInt LitVal(PP.getTargetInfo().getIntWidth(), 0);
941 
942   // Narrow character literals act as though their value is concatenated
943   // in this implementation, but warn on overflow.
944   bool multi_char_too_long = false;
945   if (isAscii() && isMultiChar()) {
946     LitVal = 0;
947     for (size_t i=0;i<NumCharsSoFar;++i) {
948       // check for enough leading zeros to shift into
949       multi_char_too_long |= (LitVal.countLeadingZeros() < 8);
950       LitVal <<= 8;
951       LitVal = LitVal + (codepoint_buffer[i] & 0xFF);
952     }
953   } else if (NumCharsSoFar > 0) {
954     // otherwise just take the last character
955     LitVal = buffer_begin[-1];
956   }
957 
958   if (!HadError && multi_char_too_long) {
959     PP.Diag(Loc,diag::warn_char_constant_too_large);
960   }
961 
962   // Transfer the value from APInt to uint64_t
963   Value = LitVal.getZExtValue();
964 
965   // If this is a single narrow character, sign extend it (e.g. '\xFF' is "-1")
966   // if 'char' is signed for this target (C99 6.4.4.4p10).  Note that multiple
967   // character constants are not sign extended in the this implementation:
968   // '\xFF\xFF' = 65536 and '\x0\xFF' = 255, which matches GCC.
969   if (isAscii() && NumCharsSoFar == 1 && (Value & 128) &&
970       PP.getLangOpts().CharIsSigned)
971     Value = (signed char)Value;
972 }
973 
974 
975 ///       string-literal: [C++0x lex.string]
976 ///         encoding-prefix " [s-char-sequence] "
977 ///         encoding-prefix R raw-string
978 ///       encoding-prefix:
979 ///         u8
980 ///         u
981 ///         U
982 ///         L
983 ///       s-char-sequence:
984 ///         s-char
985 ///         s-char-sequence s-char
986 ///       s-char:
987 ///         any member of the source character set except the double-quote ",
988 ///           backslash \, or new-line character
989 ///         escape-sequence
990 ///         universal-character-name
991 ///       raw-string:
992 ///         " d-char-sequence ( r-char-sequence ) d-char-sequence "
993 ///       r-char-sequence:
994 ///         r-char
995 ///         r-char-sequence r-char
996 ///       r-char:
997 ///         any member of the source character set, except a right parenthesis )
998 ///           followed by the initial d-char-sequence (which may be empty)
999 ///           followed by a double quote ".
1000 ///       d-char-sequence:
1001 ///         d-char
1002 ///         d-char-sequence d-char
1003 ///       d-char:
1004 ///         any member of the basic source character set except:
1005 ///           space, the left parenthesis (, the right parenthesis ),
1006 ///           the backslash \, and the control characters representing horizontal
1007 ///           tab, vertical tab, form feed, and newline.
1008 ///       escape-sequence: [C++0x lex.ccon]
1009 ///         simple-escape-sequence
1010 ///         octal-escape-sequence
1011 ///         hexadecimal-escape-sequence
1012 ///       simple-escape-sequence:
1013 ///         one of \' \" \? \\ \a \b \f \n \r \t \v
1014 ///       octal-escape-sequence:
1015 ///         \ octal-digit
1016 ///         \ octal-digit octal-digit
1017 ///         \ octal-digit octal-digit octal-digit
1018 ///       hexadecimal-escape-sequence:
1019 ///         \x hexadecimal-digit
1020 ///         hexadecimal-escape-sequence hexadecimal-digit
1021 ///       universal-character-name:
1022 ///         \u hex-quad
1023 ///         \U hex-quad hex-quad
1024 ///       hex-quad:
1025 ///         hex-digit hex-digit hex-digit hex-digit
1026 ///
1027 StringLiteralParser::
StringLiteralParser(const Token * StringToks,unsigned NumStringToks,Preprocessor & PP,bool Complain)1028 StringLiteralParser(const Token *StringToks, unsigned NumStringToks,
1029                     Preprocessor &PP, bool Complain)
1030   : SM(PP.getSourceManager()), Features(PP.getLangOpts()),
1031     Target(PP.getTargetInfo()), Diags(Complain ? &PP.getDiagnostics() : 0),
1032     MaxTokenLength(0), SizeBound(0), CharByteWidth(0), Kind(tok::unknown),
1033     ResultPtr(ResultBuf.data()), hadError(false), Pascal(false) {
1034   init(StringToks, NumStringToks);
1035 }
1036 
init(const Token * StringToks,unsigned NumStringToks)1037 void StringLiteralParser::init(const Token *StringToks, unsigned NumStringToks){
1038   // The literal token may have come from an invalid source location (e.g. due
1039   // to a PCH error), in which case the token length will be 0.
1040   if (NumStringToks == 0 || StringToks[0].getLength() < 2) {
1041     hadError = true;
1042     return;
1043   }
1044 
1045   // Scan all of the string portions, remember the max individual token length,
1046   // computing a bound on the concatenated string length, and see whether any
1047   // piece is a wide-string.  If any of the string portions is a wide-string
1048   // literal, the result is a wide-string literal [C99 6.4.5p4].
1049   assert(NumStringToks && "expected at least one token");
1050   MaxTokenLength = StringToks[0].getLength();
1051   assert(StringToks[0].getLength() >= 2 && "literal token is invalid!");
1052   SizeBound = StringToks[0].getLength()-2;  // -2 for "".
1053   Kind = StringToks[0].getKind();
1054 
1055   hadError = false;
1056 
1057   // Implement Translation Phase #6: concatenation of string literals
1058   /// (C99 5.1.1.2p1).  The common case is only one string fragment.
1059   for (unsigned i = 1; i != NumStringToks; ++i) {
1060     if (StringToks[i].getLength() < 2) {
1061       hadError = true;
1062       return;
1063     }
1064 
1065     // The string could be shorter than this if it needs cleaning, but this is a
1066     // reasonable bound, which is all we need.
1067     assert(StringToks[i].getLength() >= 2 && "literal token is invalid!");
1068     SizeBound += StringToks[i].getLength()-2;  // -2 for "".
1069 
1070     // Remember maximum string piece length.
1071     if (StringToks[i].getLength() > MaxTokenLength)
1072       MaxTokenLength = StringToks[i].getLength();
1073 
1074     // Remember if we see any wide or utf-8/16/32 strings.
1075     // Also check for illegal concatenations.
1076     if (StringToks[i].isNot(Kind) && StringToks[i].isNot(tok::string_literal)) {
1077       if (isAscii()) {
1078         Kind = StringToks[i].getKind();
1079       } else {
1080         if (Diags)
1081           Diags->Report(FullSourceLoc(StringToks[i].getLocation(), SM),
1082                         diag::err_unsupported_string_concat);
1083         hadError = true;
1084       }
1085     }
1086   }
1087 
1088   // Include space for the null terminator.
1089   ++SizeBound;
1090 
1091   // TODO: K&R warning: "traditional C rejects string constant concatenation"
1092 
1093   // Get the width in bytes of char/wchar_t/char16_t/char32_t
1094   CharByteWidth = getCharWidth(Kind, Target);
1095   assert((CharByteWidth & 7) == 0 && "Assumes character size is byte multiple");
1096   CharByteWidth /= 8;
1097 
1098   // The output buffer size needs to be large enough to hold wide characters.
1099   // This is a worst-case assumption which basically corresponds to L"" "long".
1100   SizeBound *= CharByteWidth;
1101 
1102   // Size the temporary buffer to hold the result string data.
1103   ResultBuf.resize(SizeBound);
1104 
1105   // Likewise, but for each string piece.
1106   SmallString<512> TokenBuf;
1107   TokenBuf.resize(MaxTokenLength);
1108 
1109   // Loop over all the strings, getting their spelling, and expanding them to
1110   // wide strings as appropriate.
1111   ResultPtr = &ResultBuf[0];   // Next byte to fill in.
1112 
1113   Pascal = false;
1114 
1115   SourceLocation UDSuffixTokLoc;
1116 
1117   for (unsigned i = 0, e = NumStringToks; i != e; ++i) {
1118     const char *ThisTokBuf = &TokenBuf[0];
1119     // Get the spelling of the token, which eliminates trigraphs, etc.  We know
1120     // that ThisTokBuf points to a buffer that is big enough for the whole token
1121     // and 'spelled' tokens can only shrink.
1122     bool StringInvalid = false;
1123     unsigned ThisTokLen =
1124       Lexer::getSpelling(StringToks[i], ThisTokBuf, SM, Features,
1125                          &StringInvalid);
1126     if (StringInvalid) {
1127       hadError = true;
1128       continue;
1129     }
1130 
1131     const char *ThisTokBegin = ThisTokBuf;
1132     const char *ThisTokEnd = ThisTokBuf+ThisTokLen;
1133 
1134     // Remove an optional ud-suffix.
1135     if (ThisTokEnd[-1] != '"') {
1136       const char *UDSuffixEnd = ThisTokEnd;
1137       do {
1138         --ThisTokEnd;
1139       } while (ThisTokEnd[-1] != '"');
1140 
1141       StringRef UDSuffix(ThisTokEnd, UDSuffixEnd - ThisTokEnd);
1142 
1143       if (UDSuffixBuf.empty()) {
1144         UDSuffixBuf.assign(UDSuffix);
1145         UDSuffixToken = i;
1146         UDSuffixOffset = ThisTokEnd - ThisTokBuf;
1147         UDSuffixTokLoc = StringToks[i].getLocation();
1148       } else if (!UDSuffixBuf.equals(UDSuffix)) {
1149         // C++11 [lex.ext]p8: At the end of phase 6, if a string literal is the
1150         // result of a concatenation involving at least one user-defined-string-
1151         // literal, all the participating user-defined-string-literals shall
1152         // have the same ud-suffix.
1153         if (Diags) {
1154           SourceLocation TokLoc = StringToks[i].getLocation();
1155           Diags->Report(TokLoc, diag::err_string_concat_mixed_suffix)
1156             << UDSuffixBuf << UDSuffix
1157             << SourceRange(UDSuffixTokLoc, UDSuffixTokLoc)
1158             << SourceRange(TokLoc, TokLoc);
1159         }
1160         hadError = true;
1161       }
1162     }
1163 
1164     // Strip the end quote.
1165     --ThisTokEnd;
1166 
1167     // TODO: Input character set mapping support.
1168 
1169     // Skip marker for wide or unicode strings.
1170     if (ThisTokBuf[0] == 'L' || ThisTokBuf[0] == 'u' || ThisTokBuf[0] == 'U') {
1171       ++ThisTokBuf;
1172       // Skip 8 of u8 marker for utf8 strings.
1173       if (ThisTokBuf[0] == '8')
1174         ++ThisTokBuf;
1175     }
1176 
1177     // Check for raw string
1178     if (ThisTokBuf[0] == 'R') {
1179       ThisTokBuf += 2; // skip R"
1180 
1181       const char *Prefix = ThisTokBuf;
1182       while (ThisTokBuf[0] != '(')
1183         ++ThisTokBuf;
1184       ++ThisTokBuf; // skip '('
1185 
1186       // Remove same number of characters from the end
1187       ThisTokEnd -= ThisTokBuf - Prefix;
1188       assert(ThisTokEnd >= ThisTokBuf && "malformed raw string literal");
1189 
1190       // Copy the string over
1191       if (CopyStringFragment(StringRef(ThisTokBuf, ThisTokEnd - ThisTokBuf)))
1192         if (DiagnoseBadString(StringToks[i]))
1193           hadError = true;
1194     } else {
1195       assert(ThisTokBuf[0] == '"' && "Expected quote, lexer broken?");
1196       ++ThisTokBuf; // skip "
1197 
1198       // Check if this is a pascal string
1199       if (Features.PascalStrings && ThisTokBuf + 1 != ThisTokEnd &&
1200           ThisTokBuf[0] == '\\' && ThisTokBuf[1] == 'p') {
1201 
1202         // If the \p sequence is found in the first token, we have a pascal string
1203         // Otherwise, if we already have a pascal string, ignore the first \p
1204         if (i == 0) {
1205           ++ThisTokBuf;
1206           Pascal = true;
1207         } else if (Pascal)
1208           ThisTokBuf += 2;
1209       }
1210 
1211       while (ThisTokBuf != ThisTokEnd) {
1212         // Is this a span of non-escape characters?
1213         if (ThisTokBuf[0] != '\\') {
1214           const char *InStart = ThisTokBuf;
1215           do {
1216             ++ThisTokBuf;
1217           } while (ThisTokBuf != ThisTokEnd && ThisTokBuf[0] != '\\');
1218 
1219           // Copy the character span over.
1220           if (CopyStringFragment(StringRef(InStart, ThisTokBuf - InStart)))
1221             if (DiagnoseBadString(StringToks[i]))
1222               hadError = true;
1223           continue;
1224         }
1225         // Is this a Universal Character Name escape?
1226         if (ThisTokBuf[1] == 'u' || ThisTokBuf[1] == 'U') {
1227           EncodeUCNEscape(ThisTokBegin, ThisTokBuf, ThisTokEnd,
1228                           ResultPtr, hadError,
1229                           FullSourceLoc(StringToks[i].getLocation(), SM),
1230                           CharByteWidth, Diags, Features);
1231           continue;
1232         }
1233         // Otherwise, this is a non-UCN escape character.  Process it.
1234         unsigned ResultChar =
1235           ProcessCharEscape(ThisTokBuf, ThisTokEnd, hadError,
1236                             FullSourceLoc(StringToks[i].getLocation(), SM),
1237                             CharByteWidth*8, Diags);
1238 
1239         if (CharByteWidth == 4) {
1240           // FIXME: Make the type of the result buffer correct instead of
1241           // using reinterpret_cast.
1242           UTF32 *ResultWidePtr = reinterpret_cast<UTF32*>(ResultPtr);
1243           *ResultWidePtr = ResultChar;
1244           ResultPtr += 4;
1245         } else if (CharByteWidth == 2) {
1246           // FIXME: Make the type of the result buffer correct instead of
1247           // using reinterpret_cast.
1248           UTF16 *ResultWidePtr = reinterpret_cast<UTF16*>(ResultPtr);
1249           *ResultWidePtr = ResultChar & 0xFFFF;
1250           ResultPtr += 2;
1251         } else {
1252           assert(CharByteWidth == 1 && "Unexpected char width");
1253           *ResultPtr++ = ResultChar & 0xFF;
1254         }
1255       }
1256     }
1257   }
1258 
1259   if (Pascal) {
1260     if (CharByteWidth == 4) {
1261       // FIXME: Make the type of the result buffer correct instead of
1262       // using reinterpret_cast.
1263       UTF32 *ResultWidePtr = reinterpret_cast<UTF32*>(ResultBuf.data());
1264       ResultWidePtr[0] = GetNumStringChars() - 1;
1265     } else if (CharByteWidth == 2) {
1266       // FIXME: Make the type of the result buffer correct instead of
1267       // using reinterpret_cast.
1268       UTF16 *ResultWidePtr = reinterpret_cast<UTF16*>(ResultBuf.data());
1269       ResultWidePtr[0] = GetNumStringChars() - 1;
1270     } else {
1271       assert(CharByteWidth == 1 && "Unexpected char width");
1272       ResultBuf[0] = GetNumStringChars() - 1;
1273     }
1274 
1275     // Verify that pascal strings aren't too large.
1276     if (GetStringLength() > 256) {
1277       if (Diags)
1278         Diags->Report(FullSourceLoc(StringToks[0].getLocation(), SM),
1279                       diag::err_pascal_string_too_long)
1280           << SourceRange(StringToks[0].getLocation(),
1281                          StringToks[NumStringToks-1].getLocation());
1282       hadError = true;
1283       return;
1284     }
1285   } else if (Diags) {
1286     // Complain if this string literal has too many characters.
1287     unsigned MaxChars = Features.CPlusPlus? 65536 : Features.C99 ? 4095 : 509;
1288 
1289     if (GetNumStringChars() > MaxChars)
1290       Diags->Report(FullSourceLoc(StringToks[0].getLocation(), SM),
1291                     diag::ext_string_too_long)
1292         << GetNumStringChars() << MaxChars
1293         << (Features.CPlusPlus ? 2 : Features.C99 ? 1 : 0)
1294         << SourceRange(StringToks[0].getLocation(),
1295                        StringToks[NumStringToks-1].getLocation());
1296   }
1297 }
1298 
1299 
1300 /// copyStringFragment - This function copies from Start to End into ResultPtr.
1301 /// Performs widening for multi-byte characters.
CopyStringFragment(StringRef Fragment)1302 bool StringLiteralParser::CopyStringFragment(StringRef Fragment) {
1303   assert(CharByteWidth==1 || CharByteWidth==2 || CharByteWidth==4);
1304   ConversionResult result = conversionOK;
1305   // Copy the character span over.
1306   if (CharByteWidth == 1) {
1307     if (!isLegalUTF8String(reinterpret_cast<const UTF8*>(Fragment.begin()),
1308                            reinterpret_cast<const UTF8*>(Fragment.end())))
1309       result = sourceIllegal;
1310     memcpy(ResultPtr, Fragment.data(), Fragment.size());
1311     ResultPtr += Fragment.size();
1312   } else if (CharByteWidth == 2) {
1313     UTF8 const *sourceStart = (UTF8 const *)Fragment.data();
1314     // FIXME: Make the type of the result buffer correct instead of
1315     // using reinterpret_cast.
1316     UTF16 *targetStart = reinterpret_cast<UTF16*>(ResultPtr);
1317     ConversionFlags flags = strictConversion;
1318     result = ConvertUTF8toUTF16(
1319 	    &sourceStart,sourceStart + Fragment.size(),
1320         &targetStart,targetStart + 2*Fragment.size(),flags);
1321     if (result==conversionOK)
1322       ResultPtr = reinterpret_cast<char*>(targetStart);
1323   } else if (CharByteWidth == 4) {
1324     UTF8 const *sourceStart = (UTF8 const *)Fragment.data();
1325     // FIXME: Make the type of the result buffer correct instead of
1326     // using reinterpret_cast.
1327     UTF32 *targetStart = reinterpret_cast<UTF32*>(ResultPtr);
1328     ConversionFlags flags = strictConversion;
1329     result = ConvertUTF8toUTF32(
1330         &sourceStart,sourceStart + Fragment.size(),
1331         &targetStart,targetStart + 4*Fragment.size(),flags);
1332     if (result==conversionOK)
1333       ResultPtr = reinterpret_cast<char*>(targetStart);
1334   }
1335   assert((result != targetExhausted)
1336          && "ConvertUTF8toUTFXX exhausted target buffer");
1337   return result != conversionOK;
1338 }
1339 
DiagnoseBadString(const Token & Tok)1340 bool StringLiteralParser::DiagnoseBadString(const Token &Tok) {
1341   // If we see bad encoding for unprefixed string literals, warn and
1342   // simply copy the byte values, for compatibility with gcc and older
1343   // versions of clang.
1344   bool NoErrorOnBadEncoding = isAscii();
1345   unsigned Msg = NoErrorOnBadEncoding ? diag::warn_bad_string_encoding :
1346                                         diag::err_bad_string_encoding;
1347   if (Diags)
1348     Diags->Report(FullSourceLoc(Tok.getLocation(), SM), Msg);
1349   return !NoErrorOnBadEncoding;
1350 }
1351 
1352 /// getOffsetOfStringByte - This function returns the offset of the
1353 /// specified byte of the string data represented by Token.  This handles
1354 /// advancing over escape sequences in the string.
getOffsetOfStringByte(const Token & Tok,unsigned ByteNo) const1355 unsigned StringLiteralParser::getOffsetOfStringByte(const Token &Tok,
1356                                                     unsigned ByteNo) const {
1357   // Get the spelling of the token.
1358   SmallString<32> SpellingBuffer;
1359   SpellingBuffer.resize(Tok.getLength());
1360 
1361   bool StringInvalid = false;
1362   const char *SpellingPtr = &SpellingBuffer[0];
1363   unsigned TokLen = Lexer::getSpelling(Tok, SpellingPtr, SM, Features,
1364                                        &StringInvalid);
1365   if (StringInvalid)
1366     return 0;
1367 
1368   assert(SpellingPtr[0] != 'L' && SpellingPtr[0] != 'u' &&
1369          SpellingPtr[0] != 'U' && "Doesn't handle wide or utf strings yet");
1370 
1371 
1372   const char *SpellingStart = SpellingPtr;
1373   const char *SpellingEnd = SpellingPtr+TokLen;
1374 
1375   // Skip over the leading quote.
1376   assert(SpellingPtr[0] == '"' && "Should be a string literal!");
1377   ++SpellingPtr;
1378 
1379   // Skip over bytes until we find the offset we're looking for.
1380   while (ByteNo) {
1381     assert(SpellingPtr < SpellingEnd && "Didn't find byte offset!");
1382 
1383     // Step over non-escapes simply.
1384     if (*SpellingPtr != '\\') {
1385       ++SpellingPtr;
1386       --ByteNo;
1387       continue;
1388     }
1389 
1390     // Otherwise, this is an escape character.  Advance over it.
1391     bool HadError = false;
1392     ProcessCharEscape(SpellingPtr, SpellingEnd, HadError,
1393                       FullSourceLoc(Tok.getLocation(), SM),
1394                       CharByteWidth*8, Diags);
1395     assert(!HadError && "This method isn't valid on erroneous strings");
1396     --ByteNo;
1397   }
1398 
1399   return SpellingPtr-SpellingStart;
1400 }
1401