• Home
  • Line#
  • Scopes#
  • Navigate#
  • Raw
  • Download
/*
 * Copyright 2016 Google Inc. All Rights Reserved.
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */
16 
17 package com.google.turbine.parse;
18 
19 import com.google.turbine.diag.SourceFile;
20 
21 /** Preprocesses Unicode escape characters in Java source code, as described in JLS §3.3. */
22 public class UnicodeEscapePreprocessor {
23 
24   public static final char ASCII_SUB = 0x1A;
25 
26   private final SourceFile source;
27   private final String input;
28 
29   private int idx = 0;
30   private char ch;
31   private boolean evenLeadingSlashes = true;
32 
UnicodeEscapePreprocessor(SourceFile source)33   public UnicodeEscapePreprocessor(SourceFile source) {
34     this.source = source;
35     this.input = source.source();
36   }
37 
38   /** Returns the current position in the input. */
position()39   public int position() {
40     return idx - 1;
41   }
42 
43   /** Returns true if all input has been read. */
done()44   public boolean done() {
45     return idx >= input.length();
46   }
47 
48   /** Returns the next unescaped Unicode input character. */
next()49   public char next() {
50     eat();
51     if (ch == '\\' && evenLeadingSlashes) {
52       unicodeEscape();
53     } else {
54       evenLeadingSlashes = true;
55     }
56     return ch;
57   }
58 
59   /** Returns a substring of the raw (escaped) input. */
readString(int from, int to)60   public String readString(int from, int to) {
61     return input.substring(from, to);
62   }
63 
64   /** Consumes a Unicode escape. */
unicodeEscape()65   private void unicodeEscape() {
66     eat();
67     if (ch != 'u') {
68       idx--;
69       ch = '\\';
70       evenLeadingSlashes = false;
71       return;
72     }
73     do {
74       eat();
75     } while (ch == 'u');
76     char acc = (char) ((hexDigit(ch) & 0xff) << 12);
77     eat();
78     acc |= (char) ((hexDigit(ch) & 0xff) << 8);
79     eat();
80     acc |= (char) ((hexDigit(ch) & 0xff) << 4);
81     eat();
82     acc |= (char) (hexDigit(ch) & 0xff);
83     ch = acc;
84     evenLeadingSlashes = ch != '\\';
85   }
86 
87   /** Consumes a hex digit. */
hexDigit(char d)88   private static int hexDigit(char d) {
89     switch (d) {
90       case '0':
91       case '1':
92       case '2':
93       case '3':
94       case '4':
95       case '5':
96       case '6':
97       case '7':
98       case '8':
99       case '9':
100         return (d - '0');
101       case 'A':
102       case 'B':
103       case 'C':
104       case 'D':
105       case 'E':
106       case 'F':
107         return ((d - 'A') + 10);
108       case 'a':
109       case 'b':
110       case 'c':
111       case 'd':
112       case 'e':
113       case 'f':
114         return ((d - 'a') + 10);
115       case ASCII_SUB:
116         throw new AssertionError("unexpected end of input");
117       default:
118         throw new AssertionError(String.format("unexpected hex digit: 0x%x", (int) d));
119     }
120   }
121 
122   /**
123    * Consumes a raw input character.
124    *
125    * <p>Once the input is exhausted, {@code ch} will always be ASCII SUB (\u001a). JLS §3.5 requires
126    * ASCII SUB to be ignored if it is the last character in the escaped input stream, and assuming
127    * it terminates the input avoids some bounds checks in the lexer.
128    */
eat()129   private void eat() {
130     ch = done() ? ASCII_SUB : input.charAt(idx);
131     idx++;
132   }
133 
source()134   public SourceFile source() {
135     return source;
136   }
137 }
138