1 /* 2 * Copyright 2016 Google Inc. All Rights Reserved. 3 * 4 * Licensed under the Apache License, Version 2.0 (the "License"); 5 * you may not use this file except in compliance with the License. 6 * You may obtain a copy of the License at 7 * 8 * http://www.apache.org/licenses/LICENSE-2.0 9 * 10 * Unless required by applicable law or agreed to in writing, software 11 * distributed under the License is distributed on an "AS IS" BASIS, 12 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 * See the License for the specific language governing permissions and 14 * limitations under the License. 15 */ 16 17 package com.google.turbine.parse; 18 19 import com.google.errorprone.annotations.CheckReturnValue; 20 import com.google.turbine.diag.SourceFile; 21 import com.google.turbine.diag.TurbineError; 22 import com.google.turbine.diag.TurbineError.ErrorKind; 23 24 /** Preprocesses Unicode escape characters in Java source code, as described in JLS §3.3. */ 25 public class UnicodeEscapePreprocessor { 26 27 public static final char ASCII_SUB = 0x1A; 28 29 private final SourceFile source; 30 private final String input; 31 32 private int idx = 0; 33 private int ch; 34 private boolean evenLeadingSlashes = true; 35 UnicodeEscapePreprocessor(SourceFile source)36 public UnicodeEscapePreprocessor(SourceFile source) { 37 this.source = source; 38 this.input = source.source(); 39 } 40 41 /** Returns the current position in the input. */ position()42 public int position() { 43 return idx - 1; 44 } 45 46 /** Returns true if all input has been read. */ done()47 public boolean done() { 48 return idx >= input.length(); 49 } 50 51 /** Returns the next unescaped Unicode input character. */ next()52 public int next() { 53 eat(); 54 if (ch == '\\' && evenLeadingSlashes) { 55 unicodeEscape(); 56 } else { 57 evenLeadingSlashes = true; 58 } 59 return ch; 60 } 61 62 /** Returns a substring of the raw (escaped) input. */ readString(int from, int to)63 public String readString(int from, int to) { 64 return input.substring(from, to); 65 } 66 67 /** Consumes a Unicode escape. */ unicodeEscape()68 private void unicodeEscape() { 69 eat(); 70 if (ch != 'u') { 71 idx--; 72 ch = '\\'; 73 evenLeadingSlashes = false; 74 return; 75 } 76 do { 77 eat(); 78 } while (ch == 'u'); 79 char acc = (char) ((hexDigit(ch) & 0xff) << 12); 80 eat(); 81 acc |= (char) ((hexDigit(ch) & 0xff) << 8); 82 eat(); 83 acc |= (char) ((hexDigit(ch) & 0xff) << 4); 84 eat(); 85 acc |= (char) (hexDigit(ch) & 0xff); 86 ch = acc; 87 evenLeadingSlashes = ch != '\\'; 88 } 89 90 /** Consumes a hex digit. */ hexDigit(int d)91 private int hexDigit(int d) { 92 switch (d) { 93 case '0': 94 case '1': 95 case '2': 96 case '3': 97 case '4': 98 case '5': 99 case '6': 100 case '7': 101 case '8': 102 case '9': 103 return (d - '0'); 104 case 'A': 105 case 'B': 106 case 'C': 107 case 'D': 108 case 'E': 109 case 'F': 110 return ((d - 'A') + 10); 111 case 'a': 112 case 'b': 113 case 'c': 114 case 'd': 115 case 'e': 116 case 'f': 117 return ((d - 'a') + 10); 118 case ASCII_SUB: 119 throw error(ErrorKind.UNEXPECTED_EOF); 120 default: 121 throw error(ErrorKind.INVALID_UNICODE); 122 } 123 } 124 125 /** 126 * Consumes a raw input character. 127 * 128 * <p>Once the input is exhausted, {@code ch} will always be ASCII SUB (\u001a). JLS §3.5 requires 129 * ASCII SUB to be ignored if it is the last character in the escaped input stream, and assuming 130 * it terminates the input avoids some bounds checks in the lexer. 131 */ eat()132 private void eat() { 133 char hi = done() ? ASCII_SUB : input.charAt(idx); 134 idx++; 135 if (!Character.isHighSurrogate(hi)) { 136 ch = hi; 137 return; 138 } 139 if (done()) { 140 throw error(ErrorKind.UNPAIRED_SURROGATE, (int) hi); 141 } 142 char lo = input.charAt(idx++); 143 if (!Character.isLowSurrogate(lo)) { 144 throw error(ErrorKind.UNPAIRED_SURROGATE, (int) hi); 145 } 146 ch = Character.toCodePoint(hi, lo); 147 } 148 source()149 public SourceFile source() { 150 return source; 151 } 152 153 @CheckReturnValue error(ErrorKind kind, Object... args)154 private TurbineError error(ErrorKind kind, Object... args) { 155 throw TurbineError.format( 156 source(), Math.min(position(), source().source().length() - 1), kind, args); 157 } 158 } 159