• Home
  • Line#
  • Scopes#
  • Navigate#
  • Raw
  • Download
/*
 * Copyright 2016 Google Inc. All Rights Reserved.
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */
16 
17 package com.google.turbine.parse;
18 
19 import com.google.errorprone.annotations.CheckReturnValue;
20 import com.google.turbine.diag.SourceFile;
21 import com.google.turbine.diag.TurbineError;
22 import com.google.turbine.diag.TurbineError.ErrorKind;
23 
24 /** Preprocesses Unicode escape characters in Java source code, as described in JLS §3.3. */
25 public class UnicodeEscapePreprocessor {
26 
27   public static final char ASCII_SUB = 0x1A;
28 
29   private final SourceFile source;
30   private final String input;
31 
32   private int idx = 0;
33   private int ch;
34   private boolean evenLeadingSlashes = true;
35 
UnicodeEscapePreprocessor(SourceFile source)36   public UnicodeEscapePreprocessor(SourceFile source) {
37     this.source = source;
38     this.input = source.source();
39   }
40 
41   /** Returns the current position in the input. */
position()42   public int position() {
43     return idx - 1;
44   }
45 
46   /** Returns true if all input has been read. */
done()47   public boolean done() {
48     return idx >= input.length();
49   }
50 
51   /** Returns the next unescaped Unicode input character. */
next()52   public int next() {
53     eat();
54     if (ch == '\\' && evenLeadingSlashes) {
55       unicodeEscape();
56     } else {
57       evenLeadingSlashes = true;
58     }
59     return ch;
60   }
61 
62   /** Returns a substring of the raw (escaped) input. */
readString(int from, int to)63   public String readString(int from, int to) {
64     return input.substring(from, to);
65   }
66 
67   /** Consumes a Unicode escape. */
unicodeEscape()68   private void unicodeEscape() {
69     eat();
70     if (ch != 'u') {
71       idx--;
72       ch = '\\';
73       evenLeadingSlashes = false;
74       return;
75     }
76     do {
77       eat();
78     } while (ch == 'u');
79     char acc = (char) ((hexDigit(ch) & 0xff) << 12);
80     eat();
81     acc |= (char) ((hexDigit(ch) & 0xff) << 8);
82     eat();
83     acc |= (char) ((hexDigit(ch) & 0xff) << 4);
84     eat();
85     acc |= (char) (hexDigit(ch) & 0xff);
86     ch = acc;
87     evenLeadingSlashes = ch != '\\';
88   }
89 
90   /** Consumes a hex digit. */
hexDigit(int d)91   private int hexDigit(int d) {
92     switch (d) {
93       case '0':
94       case '1':
95       case '2':
96       case '3':
97       case '4':
98       case '5':
99       case '6':
100       case '7':
101       case '8':
102       case '9':
103         return (d - '0');
104       case 'A':
105       case 'B':
106       case 'C':
107       case 'D':
108       case 'E':
109       case 'F':
110         return ((d - 'A') + 10);
111       case 'a':
112       case 'b':
113       case 'c':
114       case 'd':
115       case 'e':
116       case 'f':
117         return ((d - 'a') + 10);
118       case ASCII_SUB:
119         throw error(ErrorKind.UNEXPECTED_EOF);
120       default:
121         throw error(ErrorKind.INVALID_UNICODE);
122     }
123   }
124 
125   /**
126    * Consumes a raw input character.
127    *
128    * <p>Once the input is exhausted, {@code ch} will always be ASCII SUB (\u001a). JLS §3.5 requires
129    * ASCII SUB to be ignored if it is the last character in the escaped input stream, and assuming
130    * it terminates the input avoids some bounds checks in the lexer.
131    */
eat()132   private void eat() {
133     char hi = done() ? ASCII_SUB : input.charAt(idx);
134     idx++;
135     if (!Character.isHighSurrogate(hi)) {
136       ch = hi;
137       return;
138     }
139     if (done()) {
140       throw error(ErrorKind.UNPAIRED_SURROGATE, (int) hi);
141     }
142     char lo = input.charAt(idx++);
143     if (!Character.isLowSurrogate(lo)) {
144       throw error(ErrorKind.UNPAIRED_SURROGATE, (int) hi);
145     }
146     ch = Character.toCodePoint(hi, lo);
147   }
148 
source()149   public SourceFile source() {
150     return source;
151   }
152 
153   @CheckReturnValue
error(ErrorKind kind, Object... args)154   private TurbineError error(ErrorKind kind, Object... args) {
155     throw TurbineError.format(
156         source(), Math.min(position(), source().source().length() - 1), kind, args);
157   }
158 }
159