1 // © 2016 and later: Unicode, Inc. and others. 2 // License & terms of use: http://www.unicode.org/copyright.html#License 3 /* 4 ******************************************************************************* 5 * Copyright (C) 2009-2015, Google, International Business Machines Corporation 6 * and others. All Rights Reserved. 7 ******************************************************************************* 8 */ 9 package com.ibm.icu.impl; 10 11 import java.io.BufferedReader; 12 import java.io.FileInputStream; 13 import java.io.IOException; 14 import java.io.InputStream; 15 import java.io.InputStreamReader; 16 import java.io.UnsupportedEncodingException; 17 import java.text.ParsePosition; 18 import java.util.Arrays; 19 import java.util.Comparator; 20 import java.util.LinkedHashSet; 21 import java.util.List; 22 import java.util.Map; 23 import java.util.Map.Entry; 24 import java.util.Set; 25 import java.util.TreeMap; 26 import java.util.regex.Pattern; 27 28 import com.ibm.icu.text.StringTransform; 29 import com.ibm.icu.text.SymbolTable; 30 import com.ibm.icu.text.UnicodeSet; 31 import com.ibm.icu.util.Freezable; 32 33 /** 34 * Contains utilities to supplement the JDK Regex, since it doesn't handle 35 * Unicode well. 36 * 37 * <p>TODO: Move to com.ibm.icu.dev.somewhere. 38 * 2015-sep-03: This is used there, and also in CLDR and in UnicodeTools. 39 * 40 * @author markdavis 41 */ 42 public class UnicodeRegex implements Cloneable, Freezable<UnicodeRegex>, StringTransform { 43 // Note: we don't currently have any state, but intend to in the future, 44 // particularly for the regex style supported. 45 46 private SymbolTable symbolTable; 47 48 /** 49 * Set the symbol table for internal processing 50 * @internal 51 */ getSymbolTable()52 public SymbolTable getSymbolTable() { 53 return symbolTable; 54 } 55 56 /** 57 * Get the symbol table for internal processing 58 * @internal 59 */ setSymbolTable(SymbolTable symbolTable)60 public UnicodeRegex setSymbolTable(SymbolTable symbolTable) { 61 this.symbolTable = symbolTable; 62 return this; 63 } 64 65 /** 66 * Adds full Unicode property support, with the latest version of Unicode, 67 * to Java Regex, bringing it up to Level 1 (see 68 * http://www.unicode.org/reports/tr18/). It does this by preprocessing the 69 * regex pattern string and interpreting the character classes (\p{...}, 70 * \P{...}, [...]) according to their syntax and meaning in UnicodeSet. With 71 * this utility, Java regex expressions can be updated to work with the 72 * latest version of Unicode, and with all Unicode properties. Note that the 73 * UnicodeSet syntax has not yet, however, been updated to be completely 74 * consistent with Java regex, so be careful of the differences. 75 * <p>Not thread-safe; create a separate copy for different threads. 76 * <p>In the future, we may extend this to support other regex packages. 77 * 78 * @regex A modified Java regex pattern, as in the input to 79 * Pattern.compile(), except that all "character classes" are 80 * processed as if they were UnicodeSet patterns. Example: 81 * "abc[:bc=N:]. See UnicodeSet for the differences in syntax. 82 * @return A processed Java regex pattern, suitable for input to 83 * Pattern.compile(). 84 */ 85 @Override transform(String regex)86 public String transform(String regex) { 87 StringBuilder result = new StringBuilder(); 88 UnicodeSet temp = new UnicodeSet(); 89 ParsePosition pos = new ParsePosition(0); 90 int state = 0; // 1 = after \ 91 92 // We add each character unmodified to the output, unless we have a 93 // UnicodeSet. Note that we don't worry about supplementary characters, 94 // since none of the syntax uses them. 95 96 for (int i = 0; i < regex.length(); ++i) { 97 // look for UnicodeSets, allowing for quoting with \ and \Q 98 char ch = regex.charAt(i); 99 switch (state) { 100 case 0: // we only care about \, and '['. 101 if (ch == '\\') { 102 if (UnicodeSet.resemblesPattern(regex, i)) { 103 // should only happen with \p 104 i = processSet(regex, i, result, temp, pos); 105 continue; 106 } 107 state = 1; 108 } else if (ch == '[') { 109 // if we have what looks like a UnicodeSet 110 if (UnicodeSet.resemblesPattern(regex, i)) { 111 i = processSet(regex, i, result, temp, pos); 112 continue; 113 } 114 } 115 break; 116 117 case 1: // we are after a \ 118 if (ch == 'Q') { 119 state = 1; 120 } else { 121 state = 0; 122 } 123 break; 124 125 case 2: // we are in a \Q... 126 if (ch == '\\') { 127 state = 3; 128 } 129 break; 130 131 case 3: // we are in at \Q...\ 132 if (ch == 'E') { 133 state = 0; 134 } 135 state = 2; 136 break; 137 } 138 result.append(ch); 139 } 140 return result.toString(); 141 } 142 143 /** 144 * Convenience static function, using standard parameters. 145 * @param regex as in process() 146 * @return processed regex pattern, as in process() 147 */ 148 public static String fix(String regex) { 149 return STANDARD.transform(regex); 150 } 151 152 /** 153 * Compile a regex string, after processing by fix(...). 154 * 155 * @param regex Raw regex pattern, as in fix(...). 156 * @return Pattern 157 */ 158 public static Pattern compile(String regex) { 159 return Pattern.compile(STANDARD.transform(regex)); 160 } 161 162 /** 163 * Compile a regex string, after processing by fix(...). 164 * 165 * @param regex Raw regex pattern, as in fix(...). 166 * @return Pattern 167 */ 168 public static Pattern compile(String regex, int options) { 169 return Pattern.compile(STANDARD.transform(regex), options); 170 } 171 172 /** 173 * Compile a composed string from a set of BNF lines; see the List version for more information. 174 * 175 * @param bnfLines Series of BNF lines. 176 * @return Pattern 177 */ 178 public String compileBnf(String bnfLines) { 179 return compileBnf(Arrays.asList(bnfLines.split("\\r\\n?|\\n"))); 180 } 181 182 /** 183 * Compile a composed string from a set of BNF lines, such as for composing a regex 184 * expression. The lines can be in any order, but there must not be any 185 * cycles. The result can be used as input for fix(). 186 * <p> 187 * Example: 188 * <pre> 189 * uri = (?: (scheme) \\:)? (host) (?: \\? (query))? (?: \\u0023 (fragment))?; 190 * scheme = reserved+; 191 * host = // reserved+; 192 * query = [\\=reserved]+; 193 * fragment = reserved+; 194 * reserved = [[:ascii:][:alphabetic:]]; 195 * </pre> 196 * <p> 197 * Caveats: at this point the parsing is simple; for example, # cannot be 198 * quoted (use \\u0023); you can set it to null to disable. 199 * The equality sign and a few others can be reset with 200 * setBnfX(). 201 * 202 * @param lines Series of lines that represent a BNF expression. The lines contain 203 * a series of statements that of the form x=y;. A statement can take 204 * multiple lines, but there can't be multiple statements on a line. 205 * A hash quotes to the end of the line. 206 * @return Pattern 207 */ 208 public String compileBnf(List<String> lines) { 209 Map<String, String> variables = getVariables(lines); 210 Set<String> unused = new LinkedHashSet<String>(variables.keySet()); 211 // brute force replacement; do twice to allow for different order 212 // later on can optimize 213 for (int i = 0; i < 2; ++i) { 214 for (Entry<String, String> entry : variables.entrySet()) { 215 String variable = entry.getKey(), 216 definition = entry.getValue(); 217 218 for (Entry<String, String> entry2 : variables.entrySet()) { 219 String variable2 = entry2.getKey(), 220 definition2 = entry2.getValue(); 221 if (variable.equals(variable2)) { 222 continue; 223 } 224 String altered2 = definition2.replace(variable, definition); 225 if (!altered2.equals(definition2)) { 226 unused.remove(variable); 227 variables.put(variable2, altered2); 228 // if (log != null) { 229 // try { 230 // log.append(variable2 + "=" + altered2 + ";"); 231 // } catch (IOException e) { 232 // throw (IllegalArgumentException) new IllegalArgumentException().initCause(e); 233 // } 234 // } 235 } 236 } 237 } 238 } 239 if (unused.size() != 1) { 240 throw new IllegalArgumentException("Not a single root: " + unused); 241 } 242 return variables.get(unused.iterator().next()); 243 } 244 245 public String getBnfCommentString() { 246 return bnfCommentString; 247 } 248 249 public void setBnfCommentString(String bnfCommentString) { 250 this.bnfCommentString = bnfCommentString; 251 } 252 253 public String getBnfVariableInfix() { 254 return bnfVariableInfix; 255 } 256 257 public void setBnfVariableInfix(String bnfVariableInfix) { 258 this.bnfVariableInfix = bnfVariableInfix; 259 } 260 261 public String getBnfLineSeparator() { 262 return bnfLineSeparator; 263 } 264 265 public void setBnfLineSeparator(String bnfLineSeparator) { 266 this.bnfLineSeparator = bnfLineSeparator; 267 } 268 269 /** 270 * Utility for loading lines from a file. 271 * @param result The result of the appended lines. 272 * @param file The file to have an input stream. 273 * @param encoding if null, then UTF-8 274 * @return filled list 275 * @throws IOException If there were problems opening the file for input stream. 276 */ 277 public static List<String> appendLines(List<String> result, String file, String encoding) throws IOException { 278 InputStream is = new FileInputStream(file); 279 try { 280 return appendLines(result, is, encoding); 281 } finally { 282 is.close(); 283 } 284 } 285 286 /** 287 * Utility for loading lines from a UTF8 file. 288 * @param result The result of the appended lines. 289 * @param inputStream The input stream. 290 * @param encoding if null, then UTF-8 291 * @return filled list 292 * @throws IOException If there were problems opening the input stream for reading. 293 */ 294 public static List<String> appendLines(List<String> result, InputStream inputStream, String encoding) 295 throws UnsupportedEncodingException, IOException { 296 BufferedReader in = new BufferedReader(new InputStreamReader(inputStream, encoding == null ? "UTF-8" : encoding)); 297 while (true) { 298 String line = in.readLine(); 299 if (line == null) break; 300 result.add(line); 301 } 302 return result; 303 } 304 305 306 307 /* (non-Javadoc) 308 * @see com.ibm.icu.util.Freezable#cloneAsThawed() 309 */ 310 @Override 311 public UnicodeRegex cloneAsThawed() { 312 // TODO Auto-generated method stub 313 try { 314 return (UnicodeRegex)clone(); 315 } catch (CloneNotSupportedException e) { 316 throw new IllegalArgumentException(); // should never happen 317 } 318 } 319 320 /* (non-Javadoc) 321 * @see com.ibm.icu.util.Freezable#freeze() 322 */ 323 @Override 324 public UnicodeRegex freeze() { 325 // no action needed now. 326 return this; 327 } 328 329 /* (non-Javadoc) 330 * @see com.ibm.icu.util.Freezable#isFrozen() 331 */ 332 @Override 333 public boolean isFrozen() { 334 // at this point, always true 335 return true; 336 } 337 338 // ===== PRIVATES ===== 339 340 private int processSet(String regex, int i, StringBuilder result, UnicodeSet temp, ParsePosition pos) { 341 try { 342 pos.setIndex(i); 343 UnicodeSet x = temp.clear().applyPattern(regex, pos, symbolTable, 0); 344 x.complement().complement(); // hack to fix toPattern 345 result.append(x.toPattern(false)); 346 i = pos.getIndex() - 1; // allow for the loop increment 347 return i; 348 } catch (Exception e) { 349 throw (IllegalArgumentException) new IllegalArgumentException("Error in " + regex).initCause(e); 350 } 351 } 352 353 private static final UnicodeRegex STANDARD = new UnicodeRegex(); 354 private String bnfCommentString = "#"; 355 private String bnfVariableInfix = "="; 356 private String bnfLineSeparator = "\n"; 357 // private Appendable log = null; 358 359 private Comparator<Object> LongestFirst = new Comparator<Object>() { 360 @Override 361 public int compare(Object obj0, Object obj1) { 362 String arg0 = obj0.toString(); 363 String arg1 = obj1.toString(); 364 int len0 = arg0.length(); 365 int len1 = arg1.length(); 366 if (len0 != len1) return len1 - len0; 367 return arg0.compareTo(arg1); 368 } 369 }; 370 371 private Map<String, String> getVariables(List<String> lines) { 372 Map<String, String> variables = new TreeMap<String, String>(LongestFirst); 373 String variable = null; 374 StringBuffer definition = new StringBuffer(); 375 int count = 0; 376 for (String line : lines) { 377 ++count; 378 // remove initial bom, comments 379 if (line.length() == 0) continue; 380 if (line.charAt(0) == '\uFEFF') line = line.substring(1); 381 382 if (bnfCommentString != null) { 383 int hashPos = line.indexOf(bnfCommentString); 384 if (hashPos >= 0) line = line.substring(0, hashPos); 385 } 386 String trimline = line.trim(); 387 if (trimline.length() == 0) continue; 388 389 // String[] lineParts = line.split(";"); 390 String linePart = line; // lineParts[i]; // .trim().replace("\\s+", " "); 391 if (linePart.trim().length() == 0) continue; 392 boolean terminated = trimline.endsWith(";"); 393 if (terminated) { 394 linePart = linePart.substring(0,linePart.lastIndexOf(';')); 395 } 396 int equalsPos = linePart.indexOf(bnfVariableInfix); 397 if (equalsPos >= 0) { 398 if (variable != null) { 399 throw new IllegalArgumentException("Missing ';' before " + count + ") " + line); 400 } 401 variable = linePart.substring(0,equalsPos).trim(); 402 if (variables.containsKey(variable)) { 403 throw new IllegalArgumentException("Duplicate variable definition in " + line); 404 } 405 definition.append(linePart.substring(equalsPos+1).trim()); 406 } else { // no equals, so 407 if (variable == null) { 408 throw new IllegalArgumentException("Missing '=' at " + count + ") " + line); 409 } 410 definition.append(bnfLineSeparator).append(linePart); 411 } 412 // we are terminated if i is not at the end, or the line ends with a ; 413 if (terminated) { 414 variables.put(variable, definition.toString()); 415 variable = null; // signal we have no variable 416 definition.setLength(0); 417 } 418 } 419 if (variable != null) { 420 throw new IllegalArgumentException("Missing ';' at end"); 421 } 422 return variables; 423 } 424 } 425