• Home
  • Line#
  • Scopes#
  • Navigate#
  • Raw
  • Download
1 //===--- LiteralSupport.cpp - Code to parse and process literals ----------===//
2 //
3 //                     The LLVM Compiler Infrastructure
4 //
5 // This file is distributed under the University of Illinois Open Source
6 // License. See LICENSE.TXT for details.
7 //
8 //===----------------------------------------------------------------------===//
9 //
10 // This file implements the NumericLiteralParser, CharLiteralParser, and
11 // StringLiteralParser interfaces.
12 //
13 //===----------------------------------------------------------------------===//
14 
15 #include "clang/Lex/LiteralSupport.h"
16 #include "clang/Lex/Preprocessor.h"
17 #include "clang/Lex/LexDiagnostic.h"
18 #include "clang/Basic/TargetInfo.h"
19 #include "clang/Basic/ConvertUTF.h"
20 #include "llvm/ADT/StringExtras.h"
21 #include "llvm/Support/ErrorHandling.h"
22 using namespace clang;
23 
/// HexDigitValue - Translate a single hexadecimal digit character into the
/// number it denotes (0-15).  Both lower- and upper-case letters are accepted.
/// Any character that is not a hexadecimal digit yields -1.
static int HexDigitValue(char C) {
  if ('0' <= C && C <= '9')
    return C - '0';
  if ('a' <= C && C <= 'f')
    return 10 + (C - 'a');
  if ('A' <= C && C <= 'F')
    return 10 + (C - 'A');
  return -1;
}
32 
getCharWidth(tok::TokenKind kind,const TargetInfo & Target)33 static unsigned getCharWidth(tok::TokenKind kind, const TargetInfo &Target) {
34   switch (kind) {
35   default: llvm_unreachable("Unknown token type!");
36   case tok::char_constant:
37   case tok::string_literal:
38   case tok::utf8_string_literal:
39     return Target.getCharWidth();
40   case tok::wide_char_constant:
41   case tok::wide_string_literal:
42     return Target.getWCharWidth();
43   case tok::utf16_char_constant:
44   case tok::utf16_string_literal:
45     return Target.getChar16Width();
46   case tok::utf32_char_constant:
47   case tok::utf32_string_literal:
48     return Target.getChar32Width();
49   }
50 }
51 
52 /// \brief Produce a diagnostic highlighting some portion of a literal.
53 ///
54 /// Emits the diagnostic \p DiagID, highlighting the range of characters from
55 /// \p TokRangeBegin (inclusive) to \p TokRangeEnd (exclusive), which must be
56 /// a substring of a spelling buffer for the token beginning at \p TokBegin.
Diag(DiagnosticsEngine * Diags,const LangOptions & Features,FullSourceLoc TokLoc,const char * TokBegin,const char * TokRangeBegin,const char * TokRangeEnd,unsigned DiagID)57 static DiagnosticBuilder Diag(DiagnosticsEngine *Diags,
58                               const LangOptions &Features, FullSourceLoc TokLoc,
59                               const char *TokBegin, const char *TokRangeBegin,
60                               const char *TokRangeEnd, unsigned DiagID) {
61   SourceLocation Begin =
62     Lexer::AdvanceToTokenCharacter(TokLoc, TokRangeBegin - TokBegin,
63                                    TokLoc.getManager(), Features);
64   SourceLocation End =
65     Lexer::AdvanceToTokenCharacter(Begin, TokRangeEnd - TokRangeBegin,
66                                    TokLoc.getManager(), Features);
67   return Diags->Report(Begin, DiagID)
68       << CharSourceRange::getCharRange(Begin, End);
69 }
70 
71 /// ProcessCharEscape - Parse a standard C escape sequence, which can occur in
72 /// either a character or a string literal.
ProcessCharEscape(const char * ThisTokBegin,const char * & ThisTokBuf,const char * ThisTokEnd,bool & HadError,FullSourceLoc Loc,unsigned CharWidth,DiagnosticsEngine * Diags,const LangOptions & Features)73 static unsigned ProcessCharEscape(const char *ThisTokBegin,
74                                   const char *&ThisTokBuf,
75                                   const char *ThisTokEnd, bool &HadError,
76                                   FullSourceLoc Loc, unsigned CharWidth,
77                                   DiagnosticsEngine *Diags,
78                                   const LangOptions &Features) {
79   const char *EscapeBegin = ThisTokBuf;
80 
81   // Skip the '\' char.
82   ++ThisTokBuf;
83 
84   // We know that this character can't be off the end of the buffer, because
85   // that would have been \", which would not have been the end of string.
86   unsigned ResultChar = *ThisTokBuf++;
87   switch (ResultChar) {
88   // These map to themselves.
89   case '\\': case '\'': case '"': case '?': break;
90 
91     // These have fixed mappings.
92   case 'a':
93     // TODO: K&R: the meaning of '\\a' is different in traditional C
94     ResultChar = 7;
95     break;
96   case 'b':
97     ResultChar = 8;
98     break;
99   case 'e':
100     if (Diags)
101       Diag(Diags, Features, Loc, ThisTokBegin, EscapeBegin, ThisTokBuf,
102            diag::ext_nonstandard_escape) << "e";
103     ResultChar = 27;
104     break;
105   case 'E':
106     if (Diags)
107       Diag(Diags, Features, Loc, ThisTokBegin, EscapeBegin, ThisTokBuf,
108            diag::ext_nonstandard_escape) << "E";
109     ResultChar = 27;
110     break;
111   case 'f':
112     ResultChar = 12;
113     break;
114   case 'n':
115     ResultChar = 10;
116     break;
117   case 'r':
118     ResultChar = 13;
119     break;
120   case 't':
121     ResultChar = 9;
122     break;
123   case 'v':
124     ResultChar = 11;
125     break;
126   case 'x': { // Hex escape.
127     ResultChar = 0;
128     if (ThisTokBuf == ThisTokEnd || !isxdigit(*ThisTokBuf)) {
129       if (Diags)
130         Diag(Diags, Features, Loc, ThisTokBegin, EscapeBegin, ThisTokBuf,
131              diag::err_hex_escape_no_digits);
132       HadError = 1;
133       break;
134     }
135 
136     // Hex escapes are a maximal series of hex digits.
137     bool Overflow = false;
138     for (; ThisTokBuf != ThisTokEnd; ++ThisTokBuf) {
139       int CharVal = HexDigitValue(ThisTokBuf[0]);
140       if (CharVal == -1) break;
141       // About to shift out a digit?
142       Overflow |= (ResultChar & 0xF0000000) ? true : false;
143       ResultChar <<= 4;
144       ResultChar |= CharVal;
145     }
146 
147     // See if any bits will be truncated when evaluated as a character.
148     if (CharWidth != 32 && (ResultChar >> CharWidth) != 0) {
149       Overflow = true;
150       ResultChar &= ~0U >> (32-CharWidth);
151     }
152 
153     // Check for overflow.
154     if (Overflow && Diags)   // Too many digits to fit in
155       Diag(Diags, Features, Loc, ThisTokBegin, EscapeBegin, ThisTokBuf,
156            diag::warn_hex_escape_too_large);
157     break;
158   }
159   case '0': case '1': case '2': case '3':
160   case '4': case '5': case '6': case '7': {
161     // Octal escapes.
162     --ThisTokBuf;
163     ResultChar = 0;
164 
165     // Octal escapes are a series of octal digits with maximum length 3.
166     // "\0123" is a two digit sequence equal to "\012" "3".
167     unsigned NumDigits = 0;
168     do {
169       ResultChar <<= 3;
170       ResultChar |= *ThisTokBuf++ - '0';
171       ++NumDigits;
172     } while (ThisTokBuf != ThisTokEnd && NumDigits < 3 &&
173              ThisTokBuf[0] >= '0' && ThisTokBuf[0] <= '7');
174 
175     // Check for overflow.  Reject '\777', but not L'\777'.
176     if (CharWidth != 32 && (ResultChar >> CharWidth) != 0) {
177       if (Diags)
178         Diag(Diags, Features, Loc, ThisTokBegin, EscapeBegin, ThisTokBuf,
179              diag::warn_octal_escape_too_large);
180       ResultChar &= ~0U >> (32-CharWidth);
181     }
182     break;
183   }
184 
185     // Otherwise, these are not valid escapes.
186   case '(': case '{': case '[': case '%':
187     // GCC accepts these as extensions.  We warn about them as such though.
188     if (Diags)
189       Diag(Diags, Features, Loc, ThisTokBegin, EscapeBegin, ThisTokBuf,
190            diag::ext_nonstandard_escape)
191         << std::string(1, ResultChar);
192     break;
193   default:
194     if (Diags == 0)
195       break;
196 
197     if (isgraph(ResultChar))
198       Diag(Diags, Features, Loc, ThisTokBegin, EscapeBegin, ThisTokBuf,
199            diag::ext_unknown_escape)
200         << std::string(1, ResultChar);
201     else
202       Diag(Diags, Features, Loc, ThisTokBegin, EscapeBegin, ThisTokBuf,
203            diag::ext_unknown_escape)
204         << "x" + llvm::utohexstr(ResultChar);
205     break;
206   }
207 
208   return ResultChar;
209 }
210 
211 /// ProcessUCNEscape - Read the Universal Character Name, check constraints and
212 /// return the UTF32.
ProcessUCNEscape(const char * ThisTokBegin,const char * & ThisTokBuf,const char * ThisTokEnd,uint32_t & UcnVal,unsigned short & UcnLen,FullSourceLoc Loc,DiagnosticsEngine * Diags,const LangOptions & Features,bool in_char_string_literal=false)213 static bool ProcessUCNEscape(const char *ThisTokBegin, const char *&ThisTokBuf,
214                              const char *ThisTokEnd,
215                              uint32_t &UcnVal, unsigned short &UcnLen,
216                              FullSourceLoc Loc, DiagnosticsEngine *Diags,
217                              const LangOptions &Features,
218                              bool in_char_string_literal = false) {
219   const char *UcnBegin = ThisTokBuf;
220 
221   // Skip the '\u' char's.
222   ThisTokBuf += 2;
223 
224   if (ThisTokBuf == ThisTokEnd || !isxdigit(*ThisTokBuf)) {
225     if (Diags)
226       Diag(Diags, Features, Loc, ThisTokBegin, UcnBegin, ThisTokBuf,
227            diag::err_ucn_escape_no_digits);
228     return false;
229   }
230   UcnLen = (ThisTokBuf[-1] == 'u' ? 4 : 8);
231   unsigned short UcnLenSave = UcnLen;
232   for (; ThisTokBuf != ThisTokEnd && UcnLenSave; ++ThisTokBuf, UcnLenSave--) {
233     int CharVal = HexDigitValue(ThisTokBuf[0]);
234     if (CharVal == -1) break;
235     UcnVal <<= 4;
236     UcnVal |= CharVal;
237   }
238   // If we didn't consume the proper number of digits, there is a problem.
239   if (UcnLenSave) {
240     if (Diags)
241       Diag(Diags, Features, Loc, ThisTokBegin, UcnBegin, ThisTokBuf,
242            diag::err_ucn_escape_incomplete);
243     return false;
244   }
245 
246   // Check UCN constraints (C99 6.4.3p2) [C++11 lex.charset p2]
247   if ((0xD800 <= UcnVal && UcnVal <= 0xDFFF) || // surrogate codepoints
248       UcnVal > 0x10FFFF) {                      // maximum legal UTF32 value
249     if (Diags)
250       Diag(Diags, Features, Loc, ThisTokBegin, UcnBegin, ThisTokBuf,
251            diag::err_ucn_escape_invalid);
252     return false;
253   }
254 
255   // C++11 allows UCNs that refer to control characters and basic source
256   // characters inside character and string literals
257   if (UcnVal < 0xa0 &&
258       (UcnVal != 0x24 && UcnVal != 0x40 && UcnVal != 0x60)) {  // $, @, `
259     bool IsError = (!Features.CPlusPlus0x || !in_char_string_literal);
260     if (Diags) {
261       char BasicSCSChar = UcnVal;
262       if (UcnVal >= 0x20 && UcnVal < 0x7f)
263         Diag(Diags, Features, Loc, ThisTokBegin, UcnBegin, ThisTokBuf,
264              IsError ? diag::err_ucn_escape_basic_scs :
265                        diag::warn_cxx98_compat_literal_ucn_escape_basic_scs)
266             << StringRef(&BasicSCSChar, 1);
267       else
268         Diag(Diags, Features, Loc, ThisTokBegin, UcnBegin, ThisTokBuf,
269              IsError ? diag::err_ucn_control_character :
270                        diag::warn_cxx98_compat_literal_ucn_control_character);
271     }
272     if (IsError)
273       return false;
274   }
275 
276   if (!Features.CPlusPlus && !Features.C99 && Diags)
277     Diag(Diags, Features, Loc, ThisTokBegin, UcnBegin, ThisTokBuf,
278          diag::warn_ucn_not_valid_in_c89);
279 
280   return true;
281 }
282 
283 /// MeasureUCNEscape - Determine the number of bytes within the resulting string
284 /// which this UCN will occupy.
MeasureUCNEscape(const char * ThisTokBegin,const char * & ThisTokBuf,const char * ThisTokEnd,unsigned CharByteWidth,const LangOptions & Features,bool & HadError)285 static int MeasureUCNEscape(const char *ThisTokBegin, const char *&ThisTokBuf,
286                             const char *ThisTokEnd, unsigned CharByteWidth,
287                             const LangOptions &Features, bool &HadError) {
288   // UTF-32: 4 bytes per escape.
289   if (CharByteWidth == 4)
290     return 4;
291 
292   uint32_t UcnVal = 0;
293   unsigned short UcnLen = 0;
294   FullSourceLoc Loc;
295 
296   if (!ProcessUCNEscape(ThisTokBegin, ThisTokBuf, ThisTokEnd, UcnVal,
297                         UcnLen, Loc, 0, Features, true)) {
298     HadError = true;
299     return 0;
300   }
301 
302   // UTF-16: 2 bytes for BMP, 4 bytes otherwise.
303   if (CharByteWidth == 2)
304     return UcnVal <= 0xFFFF ? 2 : 4;
305 
306   // UTF-8.
307   if (UcnVal < 0x80)
308     return 1;
309   if (UcnVal < 0x800)
310     return 2;
311   if (UcnVal < 0x10000)
312     return 3;
313   return 4;
314 }
315 
316 /// EncodeUCNEscape - Read the Universal Character Name, check constraints and
317 /// convert the UTF32 to UTF8 or UTF16. This is a subroutine of
318 /// StringLiteralParser. When we decide to implement UCN's for identifiers,
319 /// we will likely rework our support for UCN's.
EncodeUCNEscape(const char * ThisTokBegin,const char * & ThisTokBuf,const char * ThisTokEnd,char * & ResultBuf,bool & HadError,FullSourceLoc Loc,unsigned CharByteWidth,DiagnosticsEngine * Diags,const LangOptions & Features)320 static void EncodeUCNEscape(const char *ThisTokBegin, const char *&ThisTokBuf,
321                             const char *ThisTokEnd,
322                             char *&ResultBuf, bool &HadError,
323                             FullSourceLoc Loc, unsigned CharByteWidth,
324                             DiagnosticsEngine *Diags,
325                             const LangOptions &Features) {
326   typedef uint32_t UTF32;
327   UTF32 UcnVal = 0;
328   unsigned short UcnLen = 0;
329   if (!ProcessUCNEscape(ThisTokBegin, ThisTokBuf, ThisTokEnd, UcnVal, UcnLen,
330                         Loc, Diags, Features, true)) {
331     HadError = true;
332     return;
333   }
334 
335   assert((CharByteWidth == 1 || CharByteWidth == 2 || CharByteWidth) &&
336          "only character widths of 1, 2, or 4 bytes supported");
337 
338   (void)UcnLen;
339   assert((UcnLen== 4 || UcnLen== 8) && "only ucn length of 4 or 8 supported");
340 
341   if (CharByteWidth == 4) {
342     // FIXME: Make the type of the result buffer correct instead of
343     // using reinterpret_cast.
344     UTF32 *ResultPtr = reinterpret_cast<UTF32*>(ResultBuf);
345     *ResultPtr = UcnVal;
346     ResultBuf += 4;
347     return;
348   }
349 
350   if (CharByteWidth == 2) {
351     // FIXME: Make the type of the result buffer correct instead of
352     // using reinterpret_cast.
353     UTF16 *ResultPtr = reinterpret_cast<UTF16*>(ResultBuf);
354 
355     if (UcnVal <= (UTF32)0xFFFF) {
356       *ResultPtr = UcnVal;
357       ResultBuf += 2;
358       return;
359     }
360 
361     // Convert to UTF16.
362     UcnVal -= 0x10000;
363     *ResultPtr     = 0xD800 + (UcnVal >> 10);
364     *(ResultPtr+1) = 0xDC00 + (UcnVal & 0x3FF);
365     ResultBuf += 4;
366     return;
367   }
368 
369   assert(CharByteWidth == 1 && "UTF-8 encoding is only for 1 byte characters");
370 
371   // Now that we've parsed/checked the UCN, we convert from UTF32->UTF8.
372   // The conversion below was inspired by:
373   //   http://www.unicode.org/Public/PROGRAMS/CVTUTF/ConvertUTF.c
374   // First, we determine how many bytes the result will require.
375   typedef uint8_t UTF8;
376 
377   unsigned short bytesToWrite = 0;
378   if (UcnVal < (UTF32)0x80)
379     bytesToWrite = 1;
380   else if (UcnVal < (UTF32)0x800)
381     bytesToWrite = 2;
382   else if (UcnVal < (UTF32)0x10000)
383     bytesToWrite = 3;
384   else
385     bytesToWrite = 4;
386 
387   const unsigned byteMask = 0xBF;
388   const unsigned byteMark = 0x80;
389 
390   // Once the bits are split out into bytes of UTF8, this is a mask OR-ed
391   // into the first byte, depending on how many bytes follow.
392   static const UTF8 firstByteMark[5] = {
393     0x00, 0x00, 0xC0, 0xE0, 0xF0
394   };
395   // Finally, we write the bytes into ResultBuf.
396   ResultBuf += bytesToWrite;
397   switch (bytesToWrite) { // note: everything falls through.
398     case 4: *--ResultBuf = (UTF8)((UcnVal | byteMark) & byteMask); UcnVal >>= 6;
399     case 3: *--ResultBuf = (UTF8)((UcnVal | byteMark) & byteMask); UcnVal >>= 6;
400     case 2: *--ResultBuf = (UTF8)((UcnVal | byteMark) & byteMask); UcnVal >>= 6;
401     case 1: *--ResultBuf = (UTF8) (UcnVal | firstByteMark[bytesToWrite]);
402   }
403   // Update the buffer.
404   ResultBuf += bytesToWrite;
405 }
406 
407 
408 ///       integer-constant: [C99 6.4.4.1]
409 ///         decimal-constant integer-suffix
410 ///         octal-constant integer-suffix
411 ///         hexadecimal-constant integer-suffix
412 ///       user-defined-integer-literal: [C++11 lex.ext]
413 ///         decimal-literal ud-suffix
414 ///         octal-literal ud-suffix
415 ///         hexadecimal-literal ud-suffix
416 ///       decimal-constant:
417 ///         nonzero-digit
418 ///         decimal-constant digit
419 ///       octal-constant:
420 ///         0
421 ///         octal-constant octal-digit
422 ///       hexadecimal-constant:
423 ///         hexadecimal-prefix hexadecimal-digit
424 ///         hexadecimal-constant hexadecimal-digit
425 ///       hexadecimal-prefix: one of
426 ///         0x 0X
427 ///       integer-suffix:
428 ///         unsigned-suffix [long-suffix]
429 ///         unsigned-suffix [long-long-suffix]
430 ///         long-suffix [unsigned-suffix]
431 ///         long-long-suffix [unsigned-suffix]
432 ///       nonzero-digit:
433 ///         1 2 3 4 5 6 7 8 9
434 ///       octal-digit:
435 ///         0 1 2 3 4 5 6 7
436 ///       hexadecimal-digit:
437 ///         0 1 2 3 4 5 6 7 8 9
438 ///         a b c d e f
439 ///         A B C D E F
440 ///       unsigned-suffix: one of
441 ///         u U
442 ///       long-suffix: one of
443 ///         l L
444 ///       long-long-suffix: one of
445 ///         ll LL
446 ///
447 ///       floating-constant: [C99 6.4.4.2]
448 ///         TODO: add rules...
449 ///
/// Construct a parser over the spelling [begin, end) of a numeric literal
/// token and classify it.  The constructor records the radix, where the digits
/// and the suffix start, whether a period or exponent was seen, and which
/// suffix flags apply.  When the literal is malformed a diagnostic is emitted
/// through \p pp at an offset from \p TokLoc and hadError is set.
450 NumericLiteralParser::
NumericLiteralParser(const char * begin,const char * end,SourceLocation TokLoc,Preprocessor & pp)451 NumericLiteralParser(const char *begin, const char *end,
452                      SourceLocation TokLoc, Preprocessor &pp)
453   : PP(pp), ThisTokBegin(begin), ThisTokEnd(end) {
454 
455   // This routine assumes that the range begin/end matches the regex for integer
456   // and FP constants (specifically, the 'pp-number' regex), and assumes that
457   // the byte at "*end" is both valid and not part of the regex.  Because of
458   // this, it doesn't have to check for 'overscan' in various places.
459   assert(!isalnum(*end) && *end != '.' && *end != '_' &&
460          "Lexer didn't maximally munch?");
461 
  // Start with the cursor at the first character and every flag cleared.
462   s = DigitsBegin = begin;
463   saw_exponent = false;
464   saw_period = false;
465   saw_ud_suffix = false;
466   isLong = false;
467   isUnsigned = false;
468   isLongLong = false;
469   isFloat = false;
470   isImaginary = false;
471   isMicrosoftInteger = false;
472   hadError = false;
473 
474   if (*s == '0') { // parse radix
475     ParseNumberStartingWithZero(TokLoc);
476     if (hadError)
477       return;
478   } else { // the first digit is non-zero
479     radix = 10;
480     s = SkipDigits(s);
481     if (s == ThisTokEnd) {
482       // Done.
    // NOTE(review): isxdigit is handed a plain char here; confirm the bytes
    // of a pp-number are always ASCII at this point.
483     } else if (isxdigit(*s) && !(*s == 'e' || *s == 'E')) {
484       PP.Diag(PP.AdvanceToTokenCharacter(TokLoc, s-begin),
485               diag::err_invalid_decimal_digit) << StringRef(s, 1);
486       hadError = true;
487       return;
488     } else if (*s == '.') {
489       s++;
490       saw_period = true;
491       s = SkipDigits(s);
492     }
    // This may read *s when s == ThisTokEnd; that relies on the assumption
    // stated at the top of the constructor that the byte at *end is valid.
493     if ((*s == 'e' || *s == 'E')) { // exponent
494       const char *Exponent = s;
495       s++;
496       saw_exponent = true;
497       if (*s == '+' || *s == '-')  s++; // sign
498       const char *first_non_digit = SkipDigits(s);
499       if (first_non_digit != s) {
500         s = first_non_digit;
501       } else {
502         PP.Diag(PP.AdvanceToTokenCharacter(TokLoc, Exponent-begin),
503                 diag::err_exponent_has_no_digits);
504         hadError = true;
505         return;
506       }
507     }
508   }
509 
510   SuffixBegin = s;
511 
512   // Parse the suffix.  At this point we can classify whether we have an FP or
513   // integer constant.
514   bool isFPConstant = isFloatingLiteral();
515 
516   // Loop over all of the characters of the suffix.  If we see something bad,
517   // we break out of the loop.
  // Inside the switch, 'continue' accepts the current suffix character and
  // moves on; 'break' leaves the switch and reaches the loop-ending break.
518   for (; s != ThisTokEnd; ++s) {
519     switch (*s) {
520     case 'f':      // FP Suffix for "float"
521     case 'F':
522       if (!isFPConstant) break;  // Error for integer constant.
523       if (isFloat || isLong) break; // FF, LF invalid.
524       isFloat = true;
525       continue;  // Success.
526     case 'u':
527     case 'U':
528       if (isFPConstant) break;  // Error for floating constant.
529       if (isUnsigned) break;    // Cannot be repeated.
530       isUnsigned = true;
531       continue;  // Success.
532     case 'l':
533     case 'L':
534       if (isLong || isLongLong) break;  // Cannot be repeated.
535       if (isFloat) break;               // LF invalid.
536 
537       // Check for long long.  The L's need to be adjacent and the same case.
538       if (s+1 != ThisTokEnd && s[1] == s[0]) {
539         if (isFPConstant) break;        // long long invalid for floats.
540         isLongLong = true;
541         ++s;  // Eat both of them.
542       } else {
543         isLong = true;
544       }
545       continue;  // Success.
546     case 'i':
547     case 'I':
548       if (PP.getLangOpts().MicrosoftExt) {
549         if (isFPConstant || isLong || isLongLong) break;
550 
551         // Allow i8, i16, i32, i64, and i128.
        // A recognized iN suffix advances s past the whole suffix.  Whether
        // or not one matched, the break after the inner switch leaves the
        // outer switch and then the loop, so any characters still left are
        // handled by the 's != ThisTokEnd' check after the loop.
552         if (s + 1 != ThisTokEnd) {
553           switch (s[1]) {
554             case '8':
555               s += 2; // i8 suffix
556               isMicrosoftInteger = true;
557               break;
558             case '1':
559               if (s + 2 == ThisTokEnd) break;
560               if (s[2] == '6') {
561                 s += 3; // i16 suffix
562                 isMicrosoftInteger = true;
563               }
564               else if (s[2] == '2') {
565                 if (s + 3 == ThisTokEnd) break;
566                 if (s[3] == '8') {
567                   s += 4; // i128 suffix
568                   isMicrosoftInteger = true;
569                 }
570               }
571               break;
572             case '3':
573               if (s + 2 == ThisTokEnd) break;
574               if (s[2] == '2') {
575                 s += 3; // i32 suffix
576                 isLong = true;
577                 isMicrosoftInteger = true;
578               }
579               break;
580             case '6':
581               if (s + 2 == ThisTokEnd) break;
582               if (s[2] == '4') {
583                 s += 3; // i64 suffix
584                 isLongLong = true;
585                 isMicrosoftInteger = true;
586               }
587               break;
588             default:
589               break;
590           }
591           break;
592         }
593       }
594       // fall through.
      // Reached for 'i'/'I' when Microsoft extensions are off, or when the
      // 'i' is the last character of the token.
595     case 'j':
596     case 'J':
597       if (isImaginary) break;   // Cannot be repeated.
598       PP.Diag(PP.AdvanceToTokenCharacter(TokLoc, s-begin),
599               diag::ext_imaginary_constant);
600       isImaginary = true;
601       continue;  // Success.
602     }
603     // If we reached here, there was an error or a ud-suffix.
604     break;
605   }
606 
  // Anything left unconsumed is either a user-defined suffix or an error.
607   if (s != ThisTokEnd) {
608     if (PP.getLangOpts().CPlusPlus0x && s == SuffixBegin && *s == '_') {
609       // We have a ud-suffix! By C++11 [lex.ext]p10, ud-suffixes not starting
610       // with an '_' are ill-formed.
611       saw_ud_suffix = true;
612       return;
613     }
614 
615     // Report an error if there are any.
616     PP.Diag(PP.AdvanceToTokenCharacter(TokLoc, SuffixBegin-begin),
617             isFPConstant ? diag::err_invalid_suffix_float_constant :
618                            diag::err_invalid_suffix_integer_constant)
619       << StringRef(SuffixBegin, ThisTokEnd-SuffixBegin);
620     hadError = true;
621     return;
622   }
623 }
624 
625 /// ParseNumberStartingWithZero - This method is called when the first character
626 /// of the number is found to be a zero.  This means it is either an octal
627 /// number (like '04'), a hex number ('0x123a'), a binary number ('0b1010'), or
628 /// a floating point number (01239.123e4).  Eat the prefix, determining the
629 /// radix etc.
///
/// On return s points at the first character after the digits (and after any
/// period or exponent), radix and DigitsBegin are set, and hadError is set if
/// a diagnostic was emitted as an error.  \p TokLoc is the token's location
/// and is only used to position diagnostics.
ParseNumberStartingWithZero(SourceLocation TokLoc)630 void NumericLiteralParser::ParseNumberStartingWithZero(SourceLocation TokLoc) {
631   assert(s[0] == '0' && "Invalid method call");
632   s++;
633 
634   // Handle a hex number like 0x1234.
  // NOTE(review): s[1] may be the byte just past the token and is handed to
  // isxdigit as a plain char; confirm it cannot be a non-ASCII byte.
635   if ((*s == 'x' || *s == 'X') && (isxdigit(s[1]) || s[1] == '.')) {
636     s++;
637     radix = 16;
638     DigitsBegin = s;
639     s = SkipHexDigits(s);
    // True while no hex digit has been seen on either side of the '.'.
640     bool noSignificand = (s == DigitsBegin);
641     if (s == ThisTokEnd) {
642       // Done.
643     } else if (*s == '.') {
644       s++;
645       saw_period = true;
646       const char *floatDigitsBegin = s;
647       s = SkipHexDigits(s);
648       noSignificand &= (floatDigitsBegin == s);
649     }
650 
651     if (noSignificand) {
652       PP.Diag(PP.AdvanceToTokenCharacter(TokLoc, s-ThisTokBegin), \
653         diag::err_hexconstant_requires_digits);
654       hadError = true;
655       return;
656     }
657 
658     // A binary exponent can appear with or without a '.'. If dotted, the
659     // binary exponent is required.
660     if (*s == 'p' || *s == 'P') {
661       const char *Exponent = s;
662       s++;
663       saw_exponent = true;
664       if (*s == '+' || *s == '-')  s++; // sign
665       const char *first_non_digit = SkipDigits(s);
666       if (first_non_digit == s) {
667         PP.Diag(PP.AdvanceToTokenCharacter(TokLoc, Exponent-ThisTokBegin),
668                 diag::err_exponent_has_no_digits);
669         hadError = true;
670         return;
671       }
672       s = first_non_digit;
673 
      // This diagnostic does not set hadError.
674       if (!PP.getLangOpts().HexFloats)
675         PP.Diag(TokLoc, diag::ext_hexconstant_invalid);
676     } else if (saw_period) {
677       PP.Diag(PP.AdvanceToTokenCharacter(TokLoc, s-ThisTokBegin),
678               diag::err_hexconstant_requires_exponent);
679       hadError = true;
680     }
681     return;
682   }
683 
684   // Handle simple binary numbers 0b01010
685   if (*s == 'b' || *s == 'B') {
686     // 0b101010 is a GCC extension.
687     PP.Diag(TokLoc, diag::ext_binary_literal);
688     ++s;
689     radix = 2;
690     DigitsBegin = s;
691     s = SkipBinaryDigits(s);
692     if (s == ThisTokEnd) {
693       // Done.
694     } else if (isxdigit(*s)) {
695       PP.Diag(PP.AdvanceToTokenCharacter(TokLoc, s-ThisTokBegin),
696               diag::err_invalid_binary_digit) << StringRef(s, 1);
697       hadError = true;
698     }
699     // Other suffixes will be diagnosed by the caller.
700     return;
701   }
702 
703   // For now, the radix is set to 8. If we discover that we have a
704   // floating point constant, the radix will change to 10. Octal floating
705   // point constants are not permitted (only decimal and hexadecimal).
706   radix = 8;
707   DigitsBegin = s;
708   s = SkipOctalDigits(s);
709   if (s == ThisTokEnd)
710     return; // Done, simple octal number like 01234
711 
712   // If we have some other non-octal digit that *is* a decimal digit, see if
713   // this is part of a floating point number like 094.123 or 09e1.
714   if (isdigit(*s)) {
715     const char *EndDecimal = SkipDigits(s);
    // Only commit to the decimal reading when a '.' or exponent follows;
    // otherwise s stays on the offending digit for the check below.
716     if (EndDecimal[0] == '.' || EndDecimal[0] == 'e' || EndDecimal[0] == 'E') {
717       s = EndDecimal;
718       radix = 10;
719     }
720   }
721 
722   // If we have a hex digit other than 'e' (which denotes a FP exponent) then
723   // the code is using an incorrect base.
724   if (isxdigit(*s) && *s != 'e' && *s != 'E') {
725     PP.Diag(PP.AdvanceToTokenCharacter(TokLoc, s-ThisTokBegin),
726             diag::err_invalid_octal_digit) << StringRef(s, 1);
727     hadError = true;
728     return;
729   }
730 
731   if (*s == '.') {
732     s++;
733     radix = 10;
734     saw_period = true;
735     s = SkipDigits(s); // Skip the digits that follow the '.'.
736   }
737   if (*s == 'e' || *s == 'E') { // exponent
738     const char *Exponent = s;
739     s++;
740     radix = 10;
741     saw_exponent = true;
742     if (*s == '+' || *s == '-')  s++; // sign
743     const char *first_non_digit = SkipDigits(s);
744     if (first_non_digit != s) {
745       s = first_non_digit;
746     } else {
747       PP.Diag(PP.AdvanceToTokenCharacter(TokLoc, Exponent-ThisTokBegin),
748               diag::err_exponent_has_no_digits);
749       hadError = true;
750       return;
751     }
752   }
753 }
754 
755 
756 /// GetIntegerValue - Convert this numeric literal value to an APInt that
757 /// matches Val's input width.  If there is an overflow, set Val to the low bits
758 /// of the result and return true.  Otherwise, return false.
GetIntegerValue(llvm::APInt & Val)759 bool NumericLiteralParser::GetIntegerValue(llvm::APInt &Val) {
760   // Fast path: Compute a conservative bound on the maximum number of
761   // bits per digit in this radix. If we can't possibly overflow a
762   // uint64 based on that bound then do the simple conversion to
763   // integer. This avoids the expensive overflow checking below, and
764   // handles the common cases that matter (small decimal integers and
765   // hex/octal values which don't overflow).
766   unsigned MaxBitsPerDigit = 1;
767   while ((1U << MaxBitsPerDigit) < radix)
768     MaxBitsPerDigit += 1;
769   if ((SuffixBegin - DigitsBegin) * MaxBitsPerDigit <= 64) {
770     uint64_t N = 0;
771     for (s = DigitsBegin; s != SuffixBegin; ++s)
772       N = N*radix + HexDigitValue(*s);
773 
774     // This will truncate the value to Val's input width. Simply check
775     // for overflow by comparing.
776     Val = N;
777     return Val.getZExtValue() != N;
778   }
779 
780   Val = 0;
781   s = DigitsBegin;
782 
783   llvm::APInt RadixVal(Val.getBitWidth(), radix);
784   llvm::APInt CharVal(Val.getBitWidth(), 0);
785   llvm::APInt OldVal = Val;
786 
787   bool OverflowOccurred = false;
788   while (s < SuffixBegin) {
789     unsigned C = HexDigitValue(*s++);
790 
791     // If this letter is out of bound for this radix, reject it.
792     assert(C < radix && "NumericLiteralParser ctor should have rejected this");
793 
794     CharVal = C;
795 
796     // Add the digit to the value in the appropriate radix.  If adding in digits
797     // made the value smaller, then this overflowed.
798     OldVal = Val;
799 
800     // Multiply by radix, did overflow occur on the multiply?
801     Val *= RadixVal;
802     OverflowOccurred |= Val.udiv(RadixVal) != OldVal;
803 
804     // Add value, did overflow occur on the value?
805     //   (a + b) ult b  <=> overflow
806     Val += CharVal;
807     OverflowOccurred |= Val.ult(CharVal);
808   }
809   return OverflowOccurred;
810 }
811 
812 llvm::APFloat::opStatus
GetFloatValue(llvm::APFloat & Result)813 NumericLiteralParser::GetFloatValue(llvm::APFloat &Result) {
814   using llvm::APFloat;
815 
816   unsigned n = std::min(SuffixBegin - ThisTokBegin, ThisTokEnd - ThisTokBegin);
817   return Result.convertFromString(StringRef(ThisTokBegin, n),
818                                   APFloat::rmNearestTiesToEven);
819 }
820 
821 
822 /// \verbatim
823 ///       user-defined-character-literal: [C++11 lex.ext]
824 ///         character-literal ud-suffix
825 ///       ud-suffix:
826 ///         identifier
827 ///       character-literal: [C++11 lex.ccon]
828 ///         ' c-char-sequence '
829 ///         u' c-char-sequence '
830 ///         U' c-char-sequence '
831 ///         L' c-char-sequence '
832 ///       c-char-sequence:
833 ///         c-char
834 ///         c-char-sequence c-char
835 ///       c-char:
836 ///         any member of the source character set except the single-quote ',
837 ///           backslash \, or new-line character
838 ///         escape-sequence
839 ///         universal-character-name
840 ///       escape-sequence:
841 ///         simple-escape-sequence
842 ///         octal-escape-sequence
843 ///         hexadecimal-escape-sequence
844 ///       simple-escape-sequence:
845 ///         one of \' \" \? \\ \a \b \f \n \r \t \v
846 ///       octal-escape-sequence:
847 ///         \ octal-digit
848 ///         \ octal-digit octal-digit
849 ///         \ octal-digit octal-digit octal-digit
850 ///       hexadecimal-escape-sequence:
851 ///         \x hexadecimal-digit
852 ///         hexadecimal-escape-sequence hexadecimal-digit
853 ///       universal-character-name: [C++11 lex.charset]
854 ///         \u hex-quad
855 ///         \U hex-quad hex-quad
856 ///       hex-quad:
857 ///         hex-digit hex-digit hex-digit hex-digit
858 /// \endverbatim
859 ///
CharLiteralParser(const char * begin,const char * end,SourceLocation Loc,Preprocessor & PP,tok::TokenKind kind)860 CharLiteralParser::CharLiteralParser(const char *begin, const char *end,
861                                      SourceLocation Loc, Preprocessor &PP,
862                                      tok::TokenKind kind) {
863   // At this point we know that the character matches the regex "(L|u|U)?'.*'".
864   HadError = false;
865 
866   Kind = kind;
867 
868   const char *TokBegin = begin;
869 
870   // Skip over wide character determinant.
871   if (Kind != tok::char_constant) {
872     ++begin;
873   }
874 
875   // Skip over the entry quote.
876   assert(begin[0] == '\'' && "Invalid token lexed");
877   ++begin;
878 
879   // Remove an optional ud-suffix.
880   if (end[-1] != '\'') {
881     const char *UDSuffixEnd = end;
882     do {
883       --end;
884     } while (end[-1] != '\'');
885     UDSuffixBuf.assign(end, UDSuffixEnd);
886     UDSuffixOffset = end - TokBegin;
887   }
888 
889   // Trim the ending quote.
890   assert(end != begin && "Invalid token lexed");
891   --end;
892 
893   // FIXME: The "Value" is an uint64_t so we can handle char literals of
894   // up to 64-bits.
895   // FIXME: This extensively assumes that 'char' is 8-bits.
896   assert(PP.getTargetInfo().getCharWidth() == 8 &&
897          "Assumes char is 8 bits");
898   assert(PP.getTargetInfo().getIntWidth() <= 64 &&
899          (PP.getTargetInfo().getIntWidth() & 7) == 0 &&
900          "Assumes sizeof(int) on target is <= 64 and a multiple of char");
901   assert(PP.getTargetInfo().getWCharWidth() <= 64 &&
902          "Assumes sizeof(wchar) on target is <= 64");
903 
904   SmallVector<uint32_t,4> codepoint_buffer;
905   codepoint_buffer.resize(end-begin);
906   uint32_t *buffer_begin = &codepoint_buffer.front();
907   uint32_t *buffer_end = buffer_begin + codepoint_buffer.size();
908 
909   // Unicode escapes representing characters that cannot be correctly
910   // represented in a single code unit are disallowed in character literals
911   // by this implementation.
912   uint32_t largest_character_for_kind;
913   if (tok::wide_char_constant == Kind) {
914     largest_character_for_kind = 0xFFFFFFFFu >> (32-PP.getTargetInfo().getWCharWidth());
915   } else if (tok::utf16_char_constant == Kind) {
916     largest_character_for_kind = 0xFFFF;
917   } else if (tok::utf32_char_constant == Kind) {
918     largest_character_for_kind = 0x10FFFF;
919   } else {
920     largest_character_for_kind = 0x7Fu;
921   }
922 
923   while (begin!=end) {
924     // Is this a span of non-escape characters?
925     if (begin[0] != '\\') {
926       char const *start = begin;
927       do {
928         ++begin;
929       } while (begin != end && *begin != '\\');
930 
931       char const *tmp_in_start = start;
932       uint32_t *tmp_out_start = buffer_begin;
933       ConversionResult res =
934       ConvertUTF8toUTF32(reinterpret_cast<UTF8 const **>(&start),
935                          reinterpret_cast<UTF8 const *>(begin),
936                          &buffer_begin,buffer_end,strictConversion);
937       if (res!=conversionOK) {
938         // If we see bad encoding for unprefixed character literals, warn and
939         // simply copy the byte values, for compatibility with gcc and
940         // older versions of clang.
941         bool NoErrorOnBadEncoding = isAscii();
942         unsigned Msg = diag::err_bad_character_encoding;
943         if (NoErrorOnBadEncoding)
944           Msg = diag::warn_bad_character_encoding;
945         PP.Diag(Loc, Msg);
946         if (NoErrorOnBadEncoding) {
947           start = tmp_in_start;
948           buffer_begin = tmp_out_start;
949           for ( ; start != begin; ++start, ++buffer_begin)
950             *buffer_begin = static_cast<uint8_t>(*start);
951         } else {
952           HadError = true;
953         }
954       } else {
955         for (; tmp_out_start <buffer_begin; ++tmp_out_start) {
956           if (*tmp_out_start > largest_character_for_kind) {
957             HadError = true;
958             PP.Diag(Loc, diag::err_character_too_large);
959           }
960         }
961       }
962 
963       continue;
964     }
965     // Is this a Universal Character Name excape?
966     if (begin[1] == 'u' || begin[1] == 'U') {
967       unsigned short UcnLen = 0;
968       if (!ProcessUCNEscape(TokBegin, begin, end, *buffer_begin, UcnLen,
969                             FullSourceLoc(Loc, PP.getSourceManager()),
970                             &PP.getDiagnostics(), PP.getLangOpts(),
971                             true))
972       {
973         HadError = true;
974       } else if (*buffer_begin > largest_character_for_kind) {
975         HadError = true;
976         PP.Diag(Loc, diag::err_character_too_large);
977       }
978 
979       ++buffer_begin;
980       continue;
981     }
982     unsigned CharWidth = getCharWidth(Kind, PP.getTargetInfo());
983     uint64_t result =
984       ProcessCharEscape(TokBegin, begin, end, HadError,
985                         FullSourceLoc(Loc,PP.getSourceManager()),
986                         CharWidth, &PP.getDiagnostics(), PP.getLangOpts());
987     *buffer_begin++ = result;
988   }
989 
990   unsigned NumCharsSoFar = buffer_begin-&codepoint_buffer.front();
991 
992   if (NumCharsSoFar > 1) {
993     if (isWide())
994       PP.Diag(Loc, diag::warn_extraneous_char_constant);
995     else if (isAscii() && NumCharsSoFar == 4)
996       PP.Diag(Loc, diag::ext_four_char_character_literal);
997     else if (isAscii())
998       PP.Diag(Loc, diag::ext_multichar_character_literal);
999     else
1000       PP.Diag(Loc, diag::err_multichar_utf_character_literal);
1001     IsMultiChar = true;
1002   } else
1003     IsMultiChar = false;
1004 
1005   llvm::APInt LitVal(PP.getTargetInfo().getIntWidth(), 0);
1006 
1007   // Narrow character literals act as though their value is concatenated
1008   // in this implementation, but warn on overflow.
1009   bool multi_char_too_long = false;
1010   if (isAscii() && isMultiChar()) {
1011     LitVal = 0;
1012     for (size_t i=0;i<NumCharsSoFar;++i) {
1013       // check for enough leading zeros to shift into
1014       multi_char_too_long |= (LitVal.countLeadingZeros() < 8);
1015       LitVal <<= 8;
1016       LitVal = LitVal + (codepoint_buffer[i] & 0xFF);
1017     }
1018   } else if (NumCharsSoFar > 0) {
1019     // otherwise just take the last character
1020     LitVal = buffer_begin[-1];
1021   }
1022 
1023   if (!HadError && multi_char_too_long) {
1024     PP.Diag(Loc,diag::warn_char_constant_too_large);
1025   }
1026 
1027   // Transfer the value from APInt to uint64_t
1028   Value = LitVal.getZExtValue();
1029 
1030   // If this is a single narrow character, sign extend it (e.g. '\xFF' is "-1")
1031   // if 'char' is signed for this target (C99 6.4.4.4p10).  Note that multiple
1032   // character constants are not sign extended in the this implementation:
1033   // '\xFF\xFF' = 65536 and '\x0\xFF' = 255, which matches GCC.
1034   if (isAscii() && NumCharsSoFar == 1 && (Value & 128) &&
1035       PP.getLangOpts().CharIsSigned)
1036     Value = (signed char)Value;
1037 }
1038 
1039 /// \verbatim
1040 ///       string-literal: [C++0x lex.string]
1041 ///         encoding-prefix " [s-char-sequence] "
1042 ///         encoding-prefix R raw-string
1043 ///       encoding-prefix:
1044 ///         u8
1045 ///         u
1046 ///         U
1047 ///         L
1048 ///       s-char-sequence:
1049 ///         s-char
1050 ///         s-char-sequence s-char
1051 ///       s-char:
1052 ///         any member of the source character set except the double-quote ",
1053 ///           backslash \, or new-line character
1054 ///         escape-sequence
1055 ///         universal-character-name
1056 ///       raw-string:
1057 ///         " d-char-sequence ( r-char-sequence ) d-char-sequence "
1058 ///       r-char-sequence:
1059 ///         r-char
1060 ///         r-char-sequence r-char
1061 ///       r-char:
1062 ///         any member of the source character set, except a right parenthesis )
1063 ///           followed by the initial d-char-sequence (which may be empty)
1064 ///           followed by a double quote ".
1065 ///       d-char-sequence:
1066 ///         d-char
1067 ///         d-char-sequence d-char
1068 ///       d-char:
1069 ///         any member of the basic source character set except:
1070 ///           space, the left parenthesis (, the right parenthesis ),
1071 ///           the backslash \, and the control characters representing horizontal
1072 ///           tab, vertical tab, form feed, and newline.
1073 ///       escape-sequence: [C++0x lex.ccon]
1074 ///         simple-escape-sequence
1075 ///         octal-escape-sequence
1076 ///         hexadecimal-escape-sequence
1077 ///       simple-escape-sequence:
1078 ///         one of \' \" \? \\ \a \b \f \n \r \t \v
1079 ///       octal-escape-sequence:
1080 ///         \ octal-digit
1081 ///         \ octal-digit octal-digit
1082 ///         \ octal-digit octal-digit octal-digit
1083 ///       hexadecimal-escape-sequence:
1084 ///         \x hexadecimal-digit
1085 ///         hexadecimal-escape-sequence hexadecimal-digit
1086 ///       universal-character-name:
1087 ///         \u hex-quad
1088 ///         \U hex-quad hex-quad
1089 ///       hex-quad:
1090 ///         hex-digit hex-digit hex-digit hex-digit
1091 /// \endverbatim
1092 ///
1093 StringLiteralParser::
StringLiteralParser(const Token * StringToks,unsigned NumStringToks,Preprocessor & PP,bool Complain)1094 StringLiteralParser(const Token *StringToks, unsigned NumStringToks,
1095                     Preprocessor &PP, bool Complain)
1096   : SM(PP.getSourceManager()), Features(PP.getLangOpts()),
1097     Target(PP.getTargetInfo()), Diags(Complain ? &PP.getDiagnostics() : 0),
1098     MaxTokenLength(0), SizeBound(0), CharByteWidth(0), Kind(tok::unknown),
1099     ResultPtr(ResultBuf.data()), hadError(false), Pascal(false) {
1100   init(StringToks, NumStringToks);
1101 }
1102 
init(const Token * StringToks,unsigned NumStringToks)1103 void StringLiteralParser::init(const Token *StringToks, unsigned NumStringToks){
1104   // The literal token may have come from an invalid source location (e.g. due
1105   // to a PCH error), in which case the token length will be 0.
1106   if (NumStringToks == 0 || StringToks[0].getLength() < 2)
1107     return DiagnoseLexingError(SourceLocation());
1108 
1109   // Scan all of the string portions, remember the max individual token length,
1110   // computing a bound on the concatenated string length, and see whether any
1111   // piece is a wide-string.  If any of the string portions is a wide-string
1112   // literal, the result is a wide-string literal [C99 6.4.5p4].
1113   assert(NumStringToks && "expected at least one token");
1114   MaxTokenLength = StringToks[0].getLength();
1115   assert(StringToks[0].getLength() >= 2 && "literal token is invalid!");
1116   SizeBound = StringToks[0].getLength()-2;  // -2 for "".
1117   Kind = StringToks[0].getKind();
1118 
1119   hadError = false;
1120 
1121   // Implement Translation Phase #6: concatenation of string literals
1122   /// (C99 5.1.1.2p1).  The common case is only one string fragment.
1123   for (unsigned i = 1; i != NumStringToks; ++i) {
1124     if (StringToks[i].getLength() < 2)
1125       return DiagnoseLexingError(StringToks[i].getLocation());
1126 
1127     // The string could be shorter than this if it needs cleaning, but this is a
1128     // reasonable bound, which is all we need.
1129     assert(StringToks[i].getLength() >= 2 && "literal token is invalid!");
1130     SizeBound += StringToks[i].getLength()-2;  // -2 for "".
1131 
1132     // Remember maximum string piece length.
1133     if (StringToks[i].getLength() > MaxTokenLength)
1134       MaxTokenLength = StringToks[i].getLength();
1135 
1136     // Remember if we see any wide or utf-8/16/32 strings.
1137     // Also check for illegal concatenations.
1138     if (StringToks[i].isNot(Kind) && StringToks[i].isNot(tok::string_literal)) {
1139       if (isAscii()) {
1140         Kind = StringToks[i].getKind();
1141       } else {
1142         if (Diags)
1143           Diags->Report(StringToks[i].getLocation(),
1144                         diag::err_unsupported_string_concat);
1145         hadError = true;
1146       }
1147     }
1148   }
1149 
1150   // Include space for the null terminator.
1151   ++SizeBound;
1152 
1153   // TODO: K&R warning: "traditional C rejects string constant concatenation"
1154 
1155   // Get the width in bytes of char/wchar_t/char16_t/char32_t
1156   CharByteWidth = getCharWidth(Kind, Target);
1157   assert((CharByteWidth & 7) == 0 && "Assumes character size is byte multiple");
1158   CharByteWidth /= 8;
1159 
1160   // The output buffer size needs to be large enough to hold wide characters.
1161   // This is a worst-case assumption which basically corresponds to L"" "long".
1162   SizeBound *= CharByteWidth;
1163 
1164   // Size the temporary buffer to hold the result string data.
1165   ResultBuf.resize(SizeBound);
1166 
1167   // Likewise, but for each string piece.
1168   SmallString<512> TokenBuf;
1169   TokenBuf.resize(MaxTokenLength);
1170 
1171   // Loop over all the strings, getting their spelling, and expanding them to
1172   // wide strings as appropriate.
1173   ResultPtr = &ResultBuf[0];   // Next byte to fill in.
1174 
1175   Pascal = false;
1176 
1177   SourceLocation UDSuffixTokLoc;
1178 
1179   for (unsigned i = 0, e = NumStringToks; i != e; ++i) {
1180     const char *ThisTokBuf = &TokenBuf[0];
1181     // Get the spelling of the token, which eliminates trigraphs, etc.  We know
1182     // that ThisTokBuf points to a buffer that is big enough for the whole token
1183     // and 'spelled' tokens can only shrink.
1184     bool StringInvalid = false;
1185     unsigned ThisTokLen =
1186       Lexer::getSpelling(StringToks[i], ThisTokBuf, SM, Features,
1187                          &StringInvalid);
1188     if (StringInvalid)
1189       return DiagnoseLexingError(StringToks[i].getLocation());
1190 
1191     const char *ThisTokBegin = ThisTokBuf;
1192     const char *ThisTokEnd = ThisTokBuf+ThisTokLen;
1193 
1194     // Remove an optional ud-suffix.
1195     if (ThisTokEnd[-1] != '"') {
1196       const char *UDSuffixEnd = ThisTokEnd;
1197       do {
1198         --ThisTokEnd;
1199       } while (ThisTokEnd[-1] != '"');
1200 
1201       StringRef UDSuffix(ThisTokEnd, UDSuffixEnd - ThisTokEnd);
1202 
1203       if (UDSuffixBuf.empty()) {
1204         UDSuffixBuf.assign(UDSuffix);
1205         UDSuffixToken = i;
1206         UDSuffixOffset = ThisTokEnd - ThisTokBuf;
1207         UDSuffixTokLoc = StringToks[i].getLocation();
1208       } else if (!UDSuffixBuf.equals(UDSuffix)) {
1209         // C++11 [lex.ext]p8: At the end of phase 6, if a string literal is the
1210         // result of a concatenation involving at least one user-defined-string-
1211         // literal, all the participating user-defined-string-literals shall
1212         // have the same ud-suffix.
1213         if (Diags) {
1214           SourceLocation TokLoc = StringToks[i].getLocation();
1215           Diags->Report(TokLoc, diag::err_string_concat_mixed_suffix)
1216             << UDSuffixBuf << UDSuffix
1217             << SourceRange(UDSuffixTokLoc, UDSuffixTokLoc)
1218             << SourceRange(TokLoc, TokLoc);
1219         }
1220         hadError = true;
1221       }
1222     }
1223 
1224     // Strip the end quote.
1225     --ThisTokEnd;
1226 
1227     // TODO: Input character set mapping support.
1228 
1229     // Skip marker for wide or unicode strings.
1230     if (ThisTokBuf[0] == 'L' || ThisTokBuf[0] == 'u' || ThisTokBuf[0] == 'U') {
1231       ++ThisTokBuf;
1232       // Skip 8 of u8 marker for utf8 strings.
1233       if (ThisTokBuf[0] == '8')
1234         ++ThisTokBuf;
1235     }
1236 
1237     // Check for raw string
1238     if (ThisTokBuf[0] == 'R') {
1239       ThisTokBuf += 2; // skip R"
1240 
1241       const char *Prefix = ThisTokBuf;
1242       while (ThisTokBuf[0] != '(')
1243         ++ThisTokBuf;
1244       ++ThisTokBuf; // skip '('
1245 
1246       // Remove same number of characters from the end
1247       ThisTokEnd -= ThisTokBuf - Prefix;
1248       assert(ThisTokEnd >= ThisTokBuf && "malformed raw string literal");
1249 
1250       // Copy the string over
1251       if (CopyStringFragment(StringToks[i], ThisTokBegin,
1252                              StringRef(ThisTokBuf, ThisTokEnd - ThisTokBuf)))
1253         hadError = true;
1254     } else {
1255       if (ThisTokBuf[0] != '"') {
1256         // The file may have come from PCH and then changed after loading the
1257         // PCH; Fail gracefully.
1258         return DiagnoseLexingError(StringToks[i].getLocation());
1259       }
1260       ++ThisTokBuf; // skip "
1261 
1262       // Check if this is a pascal string
1263       if (Features.PascalStrings && ThisTokBuf + 1 != ThisTokEnd &&
1264           ThisTokBuf[0] == '\\' && ThisTokBuf[1] == 'p') {
1265 
1266         // If the \p sequence is found in the first token, we have a pascal string
1267         // Otherwise, if we already have a pascal string, ignore the first \p
1268         if (i == 0) {
1269           ++ThisTokBuf;
1270           Pascal = true;
1271         } else if (Pascal)
1272           ThisTokBuf += 2;
1273       }
1274 
1275       while (ThisTokBuf != ThisTokEnd) {
1276         // Is this a span of non-escape characters?
1277         if (ThisTokBuf[0] != '\\') {
1278           const char *InStart = ThisTokBuf;
1279           do {
1280             ++ThisTokBuf;
1281           } while (ThisTokBuf != ThisTokEnd && ThisTokBuf[0] != '\\');
1282 
1283           // Copy the character span over.
1284           if (CopyStringFragment(StringToks[i], ThisTokBegin,
1285                                  StringRef(InStart, ThisTokBuf - InStart)))
1286             hadError = true;
1287           continue;
1288         }
1289         // Is this a Universal Character Name escape?
1290         if (ThisTokBuf[1] == 'u' || ThisTokBuf[1] == 'U') {
1291           EncodeUCNEscape(ThisTokBegin, ThisTokBuf, ThisTokEnd,
1292                           ResultPtr, hadError,
1293                           FullSourceLoc(StringToks[i].getLocation(), SM),
1294                           CharByteWidth, Diags, Features);
1295           continue;
1296         }
1297         // Otherwise, this is a non-UCN escape character.  Process it.
1298         unsigned ResultChar =
1299           ProcessCharEscape(ThisTokBegin, ThisTokBuf, ThisTokEnd, hadError,
1300                             FullSourceLoc(StringToks[i].getLocation(), SM),
1301                             CharByteWidth*8, Diags, Features);
1302 
1303         if (CharByteWidth == 4) {
1304           // FIXME: Make the type of the result buffer correct instead of
1305           // using reinterpret_cast.
1306           UTF32 *ResultWidePtr = reinterpret_cast<UTF32*>(ResultPtr);
1307           *ResultWidePtr = ResultChar;
1308           ResultPtr += 4;
1309         } else if (CharByteWidth == 2) {
1310           // FIXME: Make the type of the result buffer correct instead of
1311           // using reinterpret_cast.
1312           UTF16 *ResultWidePtr = reinterpret_cast<UTF16*>(ResultPtr);
1313           *ResultWidePtr = ResultChar & 0xFFFF;
1314           ResultPtr += 2;
1315         } else {
1316           assert(CharByteWidth == 1 && "Unexpected char width");
1317           *ResultPtr++ = ResultChar & 0xFF;
1318         }
1319       }
1320     }
1321   }
1322 
1323   if (Pascal) {
1324     if (CharByteWidth == 4) {
1325       // FIXME: Make the type of the result buffer correct instead of
1326       // using reinterpret_cast.
1327       UTF32 *ResultWidePtr = reinterpret_cast<UTF32*>(ResultBuf.data());
1328       ResultWidePtr[0] = GetNumStringChars() - 1;
1329     } else if (CharByteWidth == 2) {
1330       // FIXME: Make the type of the result buffer correct instead of
1331       // using reinterpret_cast.
1332       UTF16 *ResultWidePtr = reinterpret_cast<UTF16*>(ResultBuf.data());
1333       ResultWidePtr[0] = GetNumStringChars() - 1;
1334     } else {
1335       assert(CharByteWidth == 1 && "Unexpected char width");
1336       ResultBuf[0] = GetNumStringChars() - 1;
1337     }
1338 
1339     // Verify that pascal strings aren't too large.
1340     if (GetStringLength() > 256) {
1341       if (Diags)
1342         Diags->Report(StringToks[0].getLocation(),
1343                       diag::err_pascal_string_too_long)
1344           << SourceRange(StringToks[0].getLocation(),
1345                          StringToks[NumStringToks-1].getLocation());
1346       hadError = true;
1347       return;
1348     }
1349   } else if (Diags) {
1350     // Complain if this string literal has too many characters.
1351     unsigned MaxChars = Features.CPlusPlus? 65536 : Features.C99 ? 4095 : 509;
1352 
1353     if (GetNumStringChars() > MaxChars)
1354       Diags->Report(StringToks[0].getLocation(),
1355                     diag::ext_string_too_long)
1356         << GetNumStringChars() << MaxChars
1357         << (Features.CPlusPlus ? 2 : Features.C99 ? 1 : 0)
1358         << SourceRange(StringToks[0].getLocation(),
1359                        StringToks[NumStringToks-1].getLocation());
1360   }
1361 }
1362 
1363 /// \brief This function copies from Fragment, which is a sequence of bytes
1364 /// within Tok's contents (which begin at TokBegin) into ResultPtr.
1365 /// Performs widening for multi-byte characters.
CopyStringFragment(const Token & Tok,const char * TokBegin,StringRef Fragment)1366 bool StringLiteralParser::CopyStringFragment(const Token &Tok,
1367                                              const char *TokBegin,
1368                                              StringRef Fragment) {
1369   const UTF8 *ErrorPtrTmp;
1370   if (ConvertUTF8toWide(CharByteWidth, Fragment, ResultPtr, ErrorPtrTmp))
1371     return false;
1372   const char *ErrorPtr = reinterpret_cast<const char *>(ErrorPtrTmp);
1373 
1374   // If we see bad encoding for unprefixed string literals, warn and
1375   // simply copy the byte values, for compatibility with gcc and older
1376   // versions of clang.
1377   bool NoErrorOnBadEncoding = isAscii();
1378   if (NoErrorOnBadEncoding) {
1379     memcpy(ResultPtr, Fragment.data(), Fragment.size());
1380     ResultPtr += Fragment.size();
1381   }
1382   if (Diags) {
1383     Diag(Diags, Features, FullSourceLoc(Tok.getLocation(), SM), TokBegin,
1384          ErrorPtr, ErrorPtr + std::min<unsigned>(getNumBytesForUTF8(*ErrorPtr),
1385                                                  Fragment.end() - ErrorPtr),
1386          NoErrorOnBadEncoding ? diag::warn_bad_string_encoding
1387                               : diag::err_bad_string_encoding);
1388   }
1389   return !NoErrorOnBadEncoding;
1390 }
1391 
DiagnoseLexingError(SourceLocation Loc)1392 void StringLiteralParser::DiagnoseLexingError(SourceLocation Loc) {
1393   hadError = true;
1394   if (Diags)
1395     Diags->Report(Loc, diag::err_lexing_string);
1396 }
1397 
1398 /// getOffsetOfStringByte - This function returns the offset of the
1399 /// specified byte of the string data represented by Token.  This handles
1400 /// advancing over escape sequences in the string.
getOffsetOfStringByte(const Token & Tok,unsigned ByteNo) const1401 unsigned StringLiteralParser::getOffsetOfStringByte(const Token &Tok,
1402                                                     unsigned ByteNo) const {
1403   // Get the spelling of the token.
1404   SmallString<32> SpellingBuffer;
1405   SpellingBuffer.resize(Tok.getLength());
1406 
1407   bool StringInvalid = false;
1408   const char *SpellingPtr = &SpellingBuffer[0];
1409   unsigned TokLen = Lexer::getSpelling(Tok, SpellingPtr, SM, Features,
1410                                        &StringInvalid);
1411   if (StringInvalid)
1412     return 0;
1413 
1414   const char *SpellingStart = SpellingPtr;
1415   const char *SpellingEnd = SpellingPtr+TokLen;
1416 
1417   // Handle UTF-8 strings just like narrow strings.
1418   if (SpellingPtr[0] == 'u' && SpellingPtr[1] == '8')
1419     SpellingPtr += 2;
1420 
1421   assert(SpellingPtr[0] != 'L' && SpellingPtr[0] != 'u' &&
1422          SpellingPtr[0] != 'U' && "Doesn't handle wide or utf strings yet");
1423 
1424   // For raw string literals, this is easy.
1425   if (SpellingPtr[0] == 'R') {
1426     assert(SpellingPtr[1] == '"' && "Should be a raw string literal!");
1427     // Skip 'R"'.
1428     SpellingPtr += 2;
1429     while (*SpellingPtr != '(') {
1430       ++SpellingPtr;
1431       assert(SpellingPtr < SpellingEnd && "Missing ( for raw string literal");
1432     }
1433     // Skip '('.
1434     ++SpellingPtr;
1435     return SpellingPtr - SpellingStart + ByteNo;
1436   }
1437 
1438   // Skip over the leading quote
1439   assert(SpellingPtr[0] == '"' && "Should be a string literal!");
1440   ++SpellingPtr;
1441 
1442   // Skip over bytes until we find the offset we're looking for.
1443   while (ByteNo) {
1444     assert(SpellingPtr < SpellingEnd && "Didn't find byte offset!");
1445 
1446     // Step over non-escapes simply.
1447     if (*SpellingPtr != '\\') {
1448       ++SpellingPtr;
1449       --ByteNo;
1450       continue;
1451     }
1452 
1453     // Otherwise, this is an escape character.  Advance over it.
1454     bool HadError = false;
1455     if (SpellingPtr[1] == 'u' || SpellingPtr[1] == 'U') {
1456       const char *EscapePtr = SpellingPtr;
1457       unsigned Len = MeasureUCNEscape(SpellingStart, SpellingPtr, SpellingEnd,
1458                                       1, Features, HadError);
1459       if (Len > ByteNo) {
1460         // ByteNo is somewhere within the escape sequence.
1461         SpellingPtr = EscapePtr;
1462         break;
1463       }
1464       ByteNo -= Len;
1465     } else {
1466       ProcessCharEscape(SpellingStart, SpellingPtr, SpellingEnd, HadError,
1467                         FullSourceLoc(Tok.getLocation(), SM),
1468                         CharByteWidth*8, Diags, Features);
1469       --ByteNo;
1470     }
1471     assert(!HadError && "This method isn't valid on erroneous strings");
1472   }
1473 
1474   return SpellingPtr-SpellingStart;
1475 }
1476