• Home
  • Line#
  • Scopes#
  • Navigate#
  • Raw
  • Download
1 // © 2016 and later: Unicode, Inc. and others.
2 // License & terms of use: http://www.unicode.org/copyright.html
3 /*
4  *******************************************************************************
5  * Copyright (C) 2009-2015, Google, International Business Machines Corporation
6  * and others. All Rights Reserved.
7  *******************************************************************************
8  */
9 package com.ibm.icu.impl;
10 
11 import java.io.BufferedReader;
12 import java.io.FileInputStream;
13 import java.io.IOException;
14 import java.io.InputStream;
15 import java.io.InputStreamReader;
16 import java.io.UnsupportedEncodingException;
17 import java.text.ParsePosition;
18 import java.util.Arrays;
19 import java.util.Comparator;
20 import java.util.LinkedHashSet;
21 import java.util.List;
22 import java.util.Map;
23 import java.util.Map.Entry;
24 import java.util.Set;
25 import java.util.TreeMap;
26 import java.util.regex.Pattern;
27 
28 import com.ibm.icu.text.StringTransform;
29 import com.ibm.icu.text.SymbolTable;
30 import com.ibm.icu.text.UnicodeSet;
31 import com.ibm.icu.util.Freezable;
32 
33 /**
34  * Contains utilities to supplement the JDK Regex, since it doesn't handle
35  * Unicode well.
36  *
37  * <p>TODO: Move to com.ibm.icu.dev.somewhere.
38  * 2015-sep-03: This is used there, and also in CLDR and in UnicodeTools.
39  *
40  * @author markdavis
41  */
42 public class UnicodeRegex implements Cloneable, Freezable<UnicodeRegex>, StringTransform {
    /**
     * Matches an escape of the form backslash, 'U', "00", then six hex digits,
     * capturing the six hex digits. processSet() rewrites each match as \x{hhhhhh}.
     */
    private static final Pattern SUPP_ESCAPE = Pattern.compile("\\\\U00([0-9a-fA-F]{6})");

    // Note: the intent is to add more state in the future,
    // particularly for the regex style supported.

    /**
     * Symbol table handed to UnicodeSet.applyPattern() when a set is parsed.
     * Has no initializer, so it is null until setSymbolTable() is called.
     */
    private SymbolTable symbolTable;
49 
    /**
     * Get the symbol table for internal processing
     * @return the symbol table currently stored on this object; null if none was set
     * @internal
     */
    public SymbolTable getSymbolTable() {
        return symbolTable;
    }
57 
    /**
     * Set the symbol table for internal processing
     * @param symbolTable the symbol table to store; it is passed to
     *        UnicodeSet.applyPattern() when sets are parsed
     * @return this object, so that calls can be chained
     * @internal
     */
    public UnicodeRegex setSymbolTable(SymbolTable symbolTable) {
        this.symbolTable = symbolTable;
        return this;
    }
66 
67     /**
68      * Adds full Unicode property support, with the latest version of Unicode,
69      * to Java Regex, bringing it up to Level 1 (see
70      * http://www.unicode.org/reports/tr18/). It does this by preprocessing the
71      * regex pattern string and interpreting the character classes (\p{...},
72      * \P{...}, [...]) according to their syntax and meaning in UnicodeSet. With
73      * this utility, Java regex expressions can be updated to work with the
74      * latest version of Unicode, and with all Unicode properties. Note that the
75      * UnicodeSet syntax has not yet, however, been updated to be completely
76      * consistent with Java regex, so be careful of the differences.
77      * <p>Not thread-safe; create a separate copy for different threads.
78      * <p>In the future, we may extend this to support other regex packages.
79      *
80      * @param regex A modified Java regex pattern, as in the input to
81      *        Pattern.compile(), except that all "character classes" are
82      *        processed as if they were UnicodeSet patterns. Example:
83      *        "abc[:bc=N:]. See UnicodeSet for the differences in syntax.
84      * @return A processed Java regex pattern, suitable for input to
85      *         Pattern.compile().
86      */
87     @Override
transform(String regex)88     public String transform(String regex) {
89         StringBuilder result = new StringBuilder();
90         UnicodeSet temp = new UnicodeSet();
91         ParsePosition pos = new ParsePosition(0);
92         int state = 0; // 1 = after \
93 
94         // We add each character unmodified to the output, unless we have a
95         // UnicodeSet. Note that we don't worry about supplementary characters,
96         // since none of the syntax uses them.
97 
98         for (int i = 0; i < regex.length(); ++i) {
99             // look for UnicodeSets, allowing for quoting with \ and \Q
100             char ch = regex.charAt(i);
101             switch (state) {
102             case 0: // we only care about \, and '['.
103                 if (ch == '\\') {
104                     if (UnicodeSet.resemblesPattern(regex, i)) {
105                         // should only happen with \p
106                         i = processSet(regex, i, result, temp, pos);
107                         continue;
108                     }
109                     state = 1;
110                 } else if (ch == '[') {
111                     // if we have what looks like a UnicodeSet
112                     if (UnicodeSet.resemblesPattern(regex, i)) {
113                         i = processSet(regex, i, result, temp, pos);
114                         continue;
115                     }
116                 }
117                 break;
118 
119             case 1: // we are after a \
120                 if (ch == 'Q') {
121                     state = 2;
122                 } else {
123                     state = 0;
124                 }
125                 break;
126 
127             case 2: // we are in a \Q...
128                 if (ch == '\\') {
129                     state = 3;
130                 }
131                 break;
132 
133             case 3: // we are in a \Q...\
134                 if (ch == 'E') {
135                     state = 0;
136                 } else if (ch != '\\') {
137                     state = 2;
138                 }
139                 break;
140             }
141             result.append(ch);
142         }
143         return result.toString();
144     }
145 
    /**
     * Convenience static function that runs transform() on the shared
     * STANDARD instance.
     * @param regex as in transform()
     * @return processed regex pattern, as in transform()
     */
    public static String fix(String regex) {
        return STANDARD.transform(regex);
    }
154 
155     /**
156      * Compile a regex string, after processing by fix(...).
157      *
158      * @param regex Raw regex pattern, as in fix(...).
159      * @return Pattern
160      */
161     public static Pattern compile(String regex) {
162         return Pattern.compile(STANDARD.transform(regex));
163     }
164 
165     /**
166      * Compile a regex string, after processing by fix(...).
167      *
168      * @param regex Raw regex pattern, as in fix(...).
169      * @return Pattern
170      */
171     public static Pattern compile(String regex, int options) {
172         return Pattern.compile(STANDARD.transform(regex), options);
173     }
174 
175     /**
176      * Compile a composed string from a set of BNF lines; see the List version for more information.
177      *
178      * @param bnfLines Series of BNF lines.
179      * @return Pattern
180      */
181     public String compileBnf(String bnfLines) {
182         return compileBnf(Arrays.asList(bnfLines.split("\\r\\n?|\\n")));
183     }
184 
185     /**
186      * Compile a composed string from a set of BNF lines, such as for composing a regex
187      * expression. The lines can be in any order, but there must not be any
188      * cycles. The result can be used as input for fix().
189      * <p>
190      * Example:
191      * <pre>
192      * uri = (?: (scheme) \\:)? (host) (?: \\? (query))? (?: \\u0023 (fragment))?;
193      * scheme = reserved+;
194      * host = // reserved+;
195      * query = [\\=reserved]+;
196      * fragment = reserved+;
197      * reserved = [[:ascii:][:alphabetic:]];
198      * </pre>
199      * <p>
200      * Caveats: at this point the parsing is simple; for example, # cannot be
201      * quoted (use \\u0023); you can set it to null to disable.
202      * The equality sign and a few others can be reset with
203      * setBnfX().
204      *
205      * @param lines Series of lines that represent a BNF expression. The lines contain
206      *          a series of statements that of the form x=y;. A statement can take
207      *          multiple lines, but there can't be multiple statements on a line.
208      *          A hash quotes to the end of the line.
209      * @return Pattern
210      */
211     public String compileBnf(List<String> lines) {
212         Map<String, String> variables = getVariables(lines);
213         Set<String> unused = new LinkedHashSet<>(variables.keySet());
214         // brute force replacement; do twice to allow for different order
215         // later on can optimize
216         for (int i = 0; i < 2; ++i) {
217             for (Entry<String, String> entry : variables.entrySet()) {
218                 String variable   = entry.getKey(),
219                        definition = entry.getValue();
220 
221                 for (Entry<String, String> entry2 : variables.entrySet()) {
222                     String variable2 = entry2.getKey(),
223                            definition2 = entry2.getValue();
224                     if (variable.equals(variable2)) {
225                         continue;
226                     }
227                     String altered2 = definition2.replace(variable, definition);
228                     if (!altered2.equals(definition2)) {
229                         unused.remove(variable);
230                         variables.put(variable2, altered2);
231 //                        if (log != null) {
232 //                            try {
233 //                                log.append(variable2 + "=" + altered2 + ";");
234 //                            } catch (IOException e) {
235 //                                throw (IllegalArgumentException) new IllegalArgumentException().initCause(e);
236 //                            }
237 //                        }
238                     }
239                 }
240             }
241         }
242         if (unused.size() != 1) {
243             throw new IllegalArgumentException("Not a single root: " + unused);
244         }
245         return variables.get(unused.iterator().next());
246     }
247 
    /**
     * @return the string that starts a comment in BNF lines; everything from its
     *         first occurrence to the end of the line is dropped when parsing.
     *         Initially "#"; null means comments are not stripped.
     */
    public String getBnfCommentString() {
        return bnfCommentString;
    }

    /**
     * @param bnfCommentString the string that starts a comment in BNF lines,
     *        or null to disable comment stripping
     */
    public void setBnfCommentString(String bnfCommentString) {
        this.bnfCommentString = bnfCommentString;
    }

    /**
     * @return the string that separates a variable name from its definition
     *         in a BNF statement. Initially "=".
     */
    public String getBnfVariableInfix() {
        return bnfVariableInfix;
    }

    /**
     * @param bnfVariableInfix the string that separates a variable name from
     *        its definition in a BNF statement
     */
    public void setBnfVariableInfix(String bnfVariableInfix) {
        this.bnfVariableInfix = bnfVariableInfix;
    }

    /**
     * @return the string inserted between the lines of a definition that
     *         spans several input lines. Initially "\n".
     */
    public String getBnfLineSeparator() {
        return bnfLineSeparator;
    }

    /**
     * @param bnfLineSeparator the string to insert between the lines of a
     *        definition that spans several input lines
     */
    public void setBnfLineSeparator(String bnfLineSeparator) {
        this.bnfLineSeparator = bnfLineSeparator;
    }
271 
272     /**
273      * Utility for loading lines from a file.
274      * @param result The result of the appended lines.
275      * @param file The file to have an input stream.
276      * @param encoding if null, then UTF-8
277      * @return filled list
278      * @throws IOException If there were problems opening the file for input stream.
279      */
280     public static List<String> appendLines(List<String> result, String file, String encoding) throws IOException {
281         InputStream is = new FileInputStream(file);
282         try {
283             return appendLines(result, is, encoding);
284         } finally {
285             is.close();
286         }
287     }
288 
289     /**
290      * Utility for loading lines from a UTF8 file.
291      * @param result The result of the appended lines.
292      * @param inputStream The input stream.
293      * @param encoding if null, then UTF-8
294      * @return filled list
295      * @throws IOException  If there were problems opening the input stream for reading.
296      */
297     public static List<String> appendLines(List<String> result, InputStream inputStream, String encoding)
298             throws UnsupportedEncodingException, IOException {
299         BufferedReader in = new BufferedReader(new InputStreamReader(inputStream, encoding == null ? "UTF-8" : encoding));
300         while (true) {
301             String line = in.readLine();
302             if (line == null) break;
303             result.add(line);
304         }
305         return result;
306     }
307 
308 
309 
310     /* (non-Javadoc)
311      * @see com.ibm.icu.util.Freezable#cloneAsThawed()
312      */
313     @Override
314     public UnicodeRegex cloneAsThawed() {
315         // TODO Auto-generated method stub
316         try {
317             return (UnicodeRegex)clone();
318         } catch (CloneNotSupportedException e) {
319             throw new IllegalArgumentException(); // should never happen
320         }
321     }
322 
    /* (non-Javadoc)
     * @see com.ibm.icu.util.Freezable#freeze()
     *
     * Currently changes nothing on this object and simply returns it.
     */
    @Override
    public UnicodeRegex freeze() {
        // no action needed now.
        return this;
    }
331 
    /* (non-Javadoc)
     * @see com.ibm.icu.util.Freezable#isFrozen()
     *
     * Always reports true, whether or not freeze() has been called.
     * NOTE(review): the setters in this class still modify the object
     * regardless of this value -- confirm that this is intended.
     */
    @Override
    public boolean isFrozen() {
        // at this point, always true
        return true;
    }
340 
341     // ===== PRIVATES =====
342 
343     private int processSet(String regex, int i, StringBuilder result, UnicodeSet temp, ParsePosition pos) {
344         try {
345             pos.setIndex(i);
346             UnicodeSet x = temp.clear().applyPattern(regex, pos, symbolTable, 0);
347             x.complement().complement(); // hack to fix toPattern
348             String pattern = x.toPattern(false);
349             // Escaping of supplementary code points differs between ICU UnicodeSet and Java regex.
350             if (pattern.contains("\\U")) {
351                 pattern = SUPP_ESCAPE.matcher(pattern).replaceAll("\\\\x{$1}");
352             }
353             result.append(pattern);
354             i = pos.getIndex() - 1; // allow for the loop increment
355             return i;
356         } catch (Exception e) {
357             throw (IllegalArgumentException) new IllegalArgumentException("Error in " + regex).initCause(e);
358         }
359     }
360 
    /** Shared instance with default settings, used by the static fix() and compile() methods. */
    private static final UnicodeRegex STANDARD = new UnicodeRegex();
    /** Start of a comment in BNF lines; may be null, in which case comments are not stripped. */
    private String bnfCommentString = "#";
    /** Separator between a variable name and its definition in a BNF statement. */
    private String bnfVariableInfix = "=";
    /** Inserted between the lines of a BNF definition that spans several input lines. */
    private String bnfLineSeparator = "\n";
//    private Appendable log = null;
366 
367     private Comparator<Object> LongestFirst = new Comparator<Object>() {
368         @Override
369         public int compare(Object obj0, Object obj1) {
370             String arg0 = obj0.toString();
371             String arg1 = obj1.toString();
372             int len0 = arg0.length();
373             int len1 = arg1.length();
374             if (len0 != len1) return len1 - len0;
375             return arg0.compareTo(arg1);
376         }
377     };
378 
379     private Map<String, String> getVariables(List<String> lines) {
380         Map<String, String> variables = new TreeMap<>(LongestFirst);
381         String variable = null;
382         StringBuffer definition = new StringBuffer();
383         int count = 0;
384         for (String line : lines) {
385             ++count;
386             // remove initial bom, comments
387             if (line.length() == 0) continue;
388             if (line.charAt(0) == '\uFEFF') line = line.substring(1);
389 
390             if (bnfCommentString != null) {
391                 int hashPos = line.indexOf(bnfCommentString);
392                 if (hashPos >= 0) line = line.substring(0, hashPos);
393             }
394             String trimline = line.trim();
395             if (trimline.length() == 0) continue;
396 
397             // String[] lineParts = line.split(";");
398             String linePart = line; // lineParts[i]; // .trim().replace("\\s+", " ");
399             if (linePart.trim().length() == 0) continue;
400             boolean terminated = trimline.endsWith(";");
401             if (terminated) {
402                 linePart = linePart.substring(0,linePart.lastIndexOf(';'));
403             }
404             int equalsPos = linePart.indexOf(bnfVariableInfix);
405             if (equalsPos >= 0) {
406                 if (variable != null) {
407                     throw new IllegalArgumentException("Missing ';' before " + count + ") " + line);
408                 }
409                 variable = linePart.substring(0,equalsPos).trim();
410                 if (variables.containsKey(variable)) {
411                     throw new IllegalArgumentException("Duplicate variable definition in " + line);
412                 }
413                 definition.append(linePart.substring(equalsPos+1).trim());
414             } else { // no equals, so
415                 if (variable == null) {
416                     throw new IllegalArgumentException("Missing '=' at " + count + ") " + line);
417                 }
418                 definition.append(bnfLineSeparator).append(linePart);
419             }
420             // we are terminated if i is not at the end, or the line ends with a ;
421             if (terminated) {
422                 variables.put(variable, definition.toString());
423                 variable = null; // signal we have no variable
424                 definition.setLength(0);
425             }
426         }
427         if (variable != null) {
428             throw new IllegalArgumentException("Missing ';' at end");
429         }
430         return variables;
431     }
432 }
433