// © 2016 and later: Unicode, Inc. and others.
// License & terms of use: http://www.unicode.org/copyright.html
/*
 *******************************************************************************
 * Copyright (C) 2009-2015, Google, International Business Machines Corporation
 * and others. All Rights Reserved.
 *******************************************************************************
 */
package com.ibm.icu.impl;

import java.io.BufferedReader;
import java.io.FileInputStream;
import java.io.IOException;
import java.io.InputStream;
import java.io.InputStreamReader;
import java.io.UnsupportedEncodingException;
import java.text.ParsePosition;
import java.util.Arrays;
import java.util.Comparator;
import java.util.LinkedHashSet;
import java.util.List;
import java.util.Map;
import java.util.Map.Entry;
import java.util.Set;
import java.util.TreeMap;
import java.util.regex.Pattern;

import com.ibm.icu.text.StringTransform;
import com.ibm.icu.text.SymbolTable;
import com.ibm.icu.text.UnicodeSet;
import com.ibm.icu.util.Freezable;

/**
 * Contains utilities to supplement the JDK Regex, since it doesn't handle
 * Unicode well.
 *
 * <p>TODO: Move to com.ibm.icu.dev.somewhere.
 * 2015-sep-03: This is used there, and also in CLDR and in UnicodeTools.
 *
 * @author markdavis
 */
public class UnicodeRegex implements Cloneable, Freezable<UnicodeRegex>, StringTransform {
    /** Matches the 8-digit supplementary escape (backslash, 'U', "00", six hex digits) in a UnicodeSet pattern. */
    private static final Pattern SUPP_ESCAPE = Pattern.compile("\\\\U00([0-9a-fA-F]{6})");

    /** Shared instance with default settings, used by the static convenience methods. */
    private static final UnicodeRegex STANDARD = new UnicodeRegex();

    /**
     * Orders keys by decreasing length of their string form, breaking ties by
     * natural string order. Used as the key order of the BNF variable map.
     */
    private static final Comparator<Object> LONGEST_FIRST = new Comparator<Object>() {
        @Override
        public int compare(Object obj0, Object obj1) {
            String arg0 = obj0.toString();
            String arg1 = obj1.toString();
            int byLength = Integer.compare(arg1.length(), arg0.length());
            return byLength != 0 ? byLength : arg0.compareTo(arg1);
        }
    };

    // Note: we don't currently have any state, but intend to in the future,
    // particularly for the regex style supported.

    private SymbolTable symbolTable;
    private String bnfCommentString = "#";
    private String bnfVariableInfix = "=";
    private String bnfLineSeparator = "\n";

    /**
     * Get the symbol table for internal processing
     * @internal
     */
    public SymbolTable getSymbolTable() {
        return symbolTable;
    }

    /**
     * Set the symbol table for internal processing
     *
     * @param symbolTable the symbol table handed to UnicodeSet when parsing sets; may be null
     * @return this object, to allow chaining
     * @internal
     */
    public UnicodeRegex setSymbolTable(SymbolTable symbolTable) {
        this.symbolTable = symbolTable;
        return this;
    }

    /**
     * Adds full Unicode property support, with the latest version of Unicode,
     * to Java Regex, bringing it up to Level 1 (see
     * http://www.unicode.org/reports/tr18/). It does this by preprocessing the
     * regex pattern string and interpreting the character classes (\p{...},
     * \P{...}, [...]) according to their syntax and meaning in UnicodeSet. With
     * this utility, Java regex expressions can be updated to work with the
     * latest version of Unicode, and with all Unicode properties. Note that the
     * UnicodeSet syntax has not yet, however, been updated to be completely
     * consistent with Java regex, so be careful of the differences.
     * <p>Not thread-safe; create a separate copy for different threads.
     * <p>In the future, we may extend this to support other regex packages.
     *
     * @param regex A modified Java regex pattern, as in the input to
     *            Pattern.compile(), except that all "character classes" are
     *            processed as if they were UnicodeSet patterns. Example:
     *            "abc[:bc=N:]". See UnicodeSet for the differences in syntax.
     * @return A processed Java regex pattern, suitable for input to
     *         Pattern.compile().
     * @throws IllegalArgumentException if a character class cannot be processed as a UnicodeSet
     */
    @Override
    public String transform(String regex) {
        StringBuilder result = new StringBuilder();
        UnicodeSet temp = new UnicodeSet();
        ParsePosition pos = new ParsePosition(0);
        // Scanner state:
        //   0 = ordinary text
        //   1 = just after a backslash
        //   2 = inside a \Q... quoted run
        //   3 = inside a \Q... quoted run, just after a backslash
        int state = 0;

        // We add each character unmodified to the output, unless we have a
        // UnicodeSet. Note that we don't worry about supplementary characters,
        // since none of the syntax uses them.

        for (int i = 0; i < regex.length(); ++i) {
            // look for UnicodeSets, allowing for quoting with \ and \Q
            char ch = regex.charAt(i);
            switch (state) {
            case 0: // we only care about \, and '['.
                if (ch == '\\') {
                    if (UnicodeSet.resemblesPattern(regex, i)) {
                        // should only happen with \p
                        i = processSet(regex, i, result, temp, pos);
                        continue; // the set was already appended; skip the append below
                    }
                    state = 1;
                } else if (ch == '[') {
                    // if we have what looks like a UnicodeSet
                    if (UnicodeSet.resemblesPattern(regex, i)) {
                        i = processSet(regex, i, result, temp, pos);
                        continue; // the set was already appended; skip the append below
                    }
                }
                break;

            case 1: // we are after a \
                if (ch == 'Q') {
                    state = 2;
                } else {
                    state = 0;
                }
                break;

            case 2: // we are in a \Q...
                if (ch == '\\') {
                    state = 3;
                }
                break;

            case 3: // we are in a \Q...\
                if (ch == 'E') {
                    state = 0;
                } else if (ch != '\\') {
                    state = 2;
                }
                // another backslash keeps us in state 3
                break;
            }
            result.append(ch);
        }
        return result.toString();
    }

    /**
     * Convenience static function, using standard parameters.
     * @param regex as in transform()
     * @return processed regex pattern, as in transform()
     */
    public static String fix(String regex) {
        return STANDARD.transform(regex);
    }

    /**
     * Compile a regex string, after processing by fix(...).
     *
     * @param regex Raw regex pattern, as in fix(...).
     * @return Pattern
     */
    public static Pattern compile(String regex) {
        return Pattern.compile(STANDARD.transform(regex));
    }

    /**
     * Compile a regex string, after processing by fix(...).
     *
     * @param regex Raw regex pattern, as in fix(...).
     * @param options Match flags, passed unchanged to Pattern.compile(String, int).
     * @return Pattern
     */
    public static Pattern compile(String regex, int options) {
        return Pattern.compile(STANDARD.transform(regex), options);
    }

    /**
     * Compile a composed string from a set of BNF lines; see the List version for more information.
     * The input is split into lines on CR LF, CR, or LF.
     *
     * @param bnfLines Series of BNF lines.
     * @return The composed string for the single root variable.
     */
    public String compileBnf(String bnfLines) {
        return compileBnf(Arrays.asList(bnfLines.split("\\r\\n?|\\n")));
    }

    /**
     * Compile a composed string from a set of BNF lines, such as for composing a regex
     * expression. The lines can be in any order, but there must not be any
     * cycles. The result can be used as input for fix().
     * <p>
     * Example:
     * <pre>
     * uri = (?: (scheme) \\:)? (host) (?: \\? (query))? (?: \\u0023 (fragment))?;
     * scheme = reserved+;
     * host = // reserved+;
     * query = [\\=reserved]+;
     * fragment = reserved+;
     * reserved = [[:ascii:][:alphabetic:]];
     * </pre>
     * <p>
     * Caveats: at this point the parsing is simple; for example, # cannot be
     * quoted (use \\u0023); the comment string can be set to null to disable
     * comment stripping. The equality sign and a few others can be reset with
     * setBnfX().
     *
     * @param lines Series of lines that represent a BNF expression. The lines contain
     *            a series of statements that of the form x=y;. A statement can take
     *            multiple lines, but there can't be multiple statements on a line.
     *            A hash quotes to the end of the line.
     * @return The fully substituted definition of the single root variable, i.e. the
     *         one variable that was never substituted into another definition.
     * @throws IllegalArgumentException if the lines are malformed, or if the number of
     *             variables never used in another definition is not exactly one
     */
    public String compileBnf(List<String> lines) {
        Map<String, String> variables = getVariables(lines);
        Set<String> unused = new LinkedHashSet<>(variables.keySet());
        // brute force replacement; do twice to allow for different order
        // later on can optimize
        for (int i = 0; i < 2; ++i) {
            for (Entry<String, String> entry : variables.entrySet()) {
                String variable = entry.getKey();
                String definition = entry.getValue();

                for (Entry<String, String> entry2 : variables.entrySet()) {
                    String variable2 = entry2.getKey();
                    String definition2 = entry2.getValue();
                    if (variable.equals(variable2)) {
                        continue;
                    }
                    String altered2 = definition2.replace(variable, definition);
                    if (!altered2.equals(definition2)) {
                        unused.remove(variable);
                        // Overwrites the value of an existing key only, so the
                        // map is not structurally modified during iteration.
                        variables.put(variable2, altered2);
                    }
                }
            }
        }
        if (unused.size() != 1) {
            throw new IllegalArgumentException("Not a single root: " + unused);
        }
        return variables.get(unused.iterator().next());
    }

    /**
     * @return the string that starts a comment in BNF lines ("#" by default);
     *         null means comments are not stripped
     */
    public String getBnfCommentString() {
        return bnfCommentString;
    }

    /**
     * @param bnfCommentString the string that starts a comment in BNF lines; everything
     *            from its first occurrence to the end of the line is dropped. Use null to
     *            disable comment stripping.
     */
    public void setBnfCommentString(String bnfCommentString) {
        this.bnfCommentString = bnfCommentString;
    }

    /**
     * @return the string separating a variable name from its definition in BNF lines
     *         ("=" by default)
     */
    public String getBnfVariableInfix() {
        return bnfVariableInfix;
    }

    /**
     * @param bnfVariableInfix the string separating a variable name from its definition
     *            in BNF lines
     */
    public void setBnfVariableInfix(String bnfVariableInfix) {
        this.bnfVariableInfix = bnfVariableInfix;
    }

    /**
     * @return the string inserted between the lines of a multi-line BNF definition
     *         ("\n" by default)
     */
    public String getBnfLineSeparator() {
        return bnfLineSeparator;
    }

    /**
     * @param bnfLineSeparator the string inserted between the lines of a multi-line
     *            BNF definition
     */
    public void setBnfLineSeparator(String bnfLineSeparator) {
        this.bnfLineSeparator = bnfLineSeparator;
    }

    /**
     * Utility for loading lines from a file.
     * @param result The result of the appended lines.
     * @param file The file to have an input stream.
     * @param encoding if null, then UTF-8
     * @return filled list
     * @throws IOException If there were problems opening the file for input stream.
     */
    public static List<String> appendLines(List<String> result, String file, String encoding) throws IOException {
        try (InputStream is = new FileInputStream(file)) {
            return appendLines(result, is, encoding);
        }
    }

    /**
     * Utility for loading lines from an input stream. The stream is not closed
     * by this method.
     * @param result The result of the appended lines.
     * @param inputStream The input stream.
     * @param encoding if null, then UTF-8
     * @return filled list
     * @throws UnsupportedEncodingException If the named encoding is not supported.
     * @throws IOException If there were problems reading from the input stream.
     */
    public static List<String> appendLines(List<String> result, InputStream inputStream, String encoding)
            throws UnsupportedEncodingException, IOException {
        BufferedReader in = new BufferedReader(
                new InputStreamReader(inputStream, encoding == null ? "UTF-8" : encoding));
        for (String line = in.readLine(); line != null; line = in.readLine()) {
            result.add(line);
        }
        return result;
    }

    /* (non-Javadoc)
     * @see com.ibm.icu.util.Freezable#cloneAsThawed()
     */
    @Override
    public UnicodeRegex cloneAsThawed() {
        try {
            return (UnicodeRegex) clone();
        } catch (CloneNotSupportedException e) {
            // should never happen: this class implements Cloneable
            throw new IllegalArgumentException(e);
        }
    }

    /* (non-Javadoc)
     * @see com.ibm.icu.util.Freezable#freeze()
     */
    @Override
    public UnicodeRegex freeze() {
        // no action needed now.
        return this;
    }

    /* (non-Javadoc)
     * @see com.ibm.icu.util.Freezable#isFrozen()
     */
    @Override
    public boolean isFrozen() {
        // at this point, always true
        return true;
    }

    // ===== PRIVATES =====

    /**
     * Parses the UnicodeSet starting at index {@code i} of {@code regex}, and appends
     * its pattern form to {@code result}.
     *
     * @param regex the full input pattern
     * @param i index at which the set starts
     * @param result output buffer that receives the rewritten set
     * @param temp scratch set; cleared and overwritten on every call
     * @param pos scratch parse position; overwritten on every call
     * @return the index of the last character consumed, so that the caller's
     *         loop increment lands on the first character after the set
     * @throws IllegalArgumentException wrapping any failure while processing the set
     */
    private int processSet(String regex, int i, StringBuilder result, UnicodeSet temp, ParsePosition pos) {
        try {
            pos.setIndex(i);
            UnicodeSet x = temp.clear().applyPattern(regex, pos, symbolTable, 0);
            x.complement().complement(); // hack to fix toPattern
            String pattern = x.toPattern(false);
            // Escaping of supplementary code points differs between ICU UnicodeSet and Java regex.
            if (pattern.contains("\\U")) {
                pattern = SUPP_ESCAPE.matcher(pattern).replaceAll("\\\\x{$1}");
            }
            result.append(pattern);
            return pos.getIndex() - 1; // allow for the loop increment
        } catch (Exception e) {
            throw new IllegalArgumentException("Error in " + regex, e);
        }
    }

    /**
     * Parses BNF lines of the form {@code name <infix> definition ;} into a map from
     * variable name to definition. A statement may continue over several lines;
     * continuation lines are joined with the BNF line separator.
     *
     * <p>The returned map iterates longest variable name first, which is the order
     * in which compileBnf() substitutes variables (presumably so that a name is
     * replaced before any shorter name contained in it).
     *
     * @param lines the BNF lines
     * @return map from variable name to its (unsubstituted) definition
     * @throws IllegalArgumentException on a missing infix, a missing terminating ';',
     *             or a duplicate variable
     */
    private Map<String, String> getVariables(List<String> lines) {
        Map<String, String> variables = new TreeMap<>(LONGEST_FIRST);
        String variable = null; // non-null while a statement is still open
        StringBuilder definition = new StringBuilder();
        int count = 0; // 1-based line number, for error messages
        for (String line : lines) {
            ++count;
            // remove initial bom, comments
            if (line.length() == 0) {
                continue;
            }
            if (line.charAt(0) == '\uFEFF') {
                line = line.substring(1);
            }

            if (bnfCommentString != null) {
                int hashPos = line.indexOf(bnfCommentString);
                if (hashPos >= 0) {
                    line = line.substring(0, hashPos);
                }
            }
            String trimline = line.trim();
            if (trimline.length() == 0) {
                continue;
            }

            String linePart = line;
            boolean terminated = trimline.endsWith(";");
            if (terminated) {
                // drop the terminating ';' (and any whitespace after it)
                linePart = linePart.substring(0, linePart.lastIndexOf(';'));
            }
            int equalsPos = linePart.indexOf(bnfVariableInfix);
            if (equalsPos >= 0) {
                if (variable != null) {
                    throw new IllegalArgumentException("Missing ';' before " + count + ") " + line);
                }
                variable = linePart.substring(0, equalsPos).trim();
                if (variables.containsKey(variable)) {
                    throw new IllegalArgumentException("Duplicate variable definition in " + line);
                }
                // Skip the whole infix, which may be longer than one character.
                definition.append(linePart.substring(equalsPos + bnfVariableInfix.length()).trim());
            } else { // no equals, so this is a continuation line
                if (variable == null) {
                    throw new IllegalArgumentException("Missing '=' at " + count + ") " + line);
                }
                definition.append(bnfLineSeparator).append(linePart);
            }
            // the statement is complete once a line ends with a ;
            if (terminated) {
                variables.put(variable, definition.toString());
                variable = null; // signal we have no variable
                definition.setLength(0);
            }
        }
        if (variable != null) {
            throw new IllegalArgumentException("Missing ';' at end");
        }
        return variables;
    }
}