• Home
  • Line#
  • Scopes#
  • Navigate#
  • Raw
  • Download
1 // © 2016 and later: Unicode, Inc. and others.
2 // License & terms of use: http://www.unicode.org/copyright.html#License
3 /*
4  *******************************************************************************
5  * Copyright (C) 2009-2015, Google, International Business Machines Corporation
6  * and others. All Rights Reserved.
7  *******************************************************************************
8  */
9 package com.ibm.icu.impl;
10 
11 import java.io.BufferedReader;
12 import java.io.FileInputStream;
13 import java.io.IOException;
14 import java.io.InputStream;
15 import java.io.InputStreamReader;
16 import java.io.UnsupportedEncodingException;
17 import java.text.ParsePosition;
18 import java.util.Arrays;
19 import java.util.Comparator;
20 import java.util.LinkedHashSet;
21 import java.util.List;
22 import java.util.Map;
23 import java.util.Map.Entry;
24 import java.util.Set;
25 import java.util.TreeMap;
26 import java.util.regex.Pattern;
27 
28 import com.ibm.icu.text.StringTransform;
29 import com.ibm.icu.text.SymbolTable;
30 import com.ibm.icu.text.UnicodeSet;
31 import com.ibm.icu.util.Freezable;
32 
33 /**
34  * Contains utilities to supplement the JDK Regex, since it doesn't handle
35  * Unicode well.
36  *
37  * <p>TODO: Move to com.ibm.icu.dev.somewhere.
38  * 2015-sep-03: This is used there, and also in CLDR and in UnicodeTools.
39  *
40  * @author markdavis
41  */
42 public class UnicodeRegex implements Cloneable, Freezable<UnicodeRegex>, StringTransform {
43     // Note: we don't currently have any state, but intend to in the future,
44     // particularly for the regex style supported.
45 
46     private SymbolTable symbolTable;
47 
48     /**
49      * Set the symbol table for internal processing
50      * @internal
51      */
getSymbolTable()52     public SymbolTable getSymbolTable() {
53         return symbolTable;
54     }
55 
56     /**
57      * Get the symbol table for internal processing
58      * @internal
59      */
setSymbolTable(SymbolTable symbolTable)60     public UnicodeRegex setSymbolTable(SymbolTable symbolTable) {
61         this.symbolTable = symbolTable;
62         return this;
63     }
64 
65     /**
66      * Adds full Unicode property support, with the latest version of Unicode,
67      * to Java Regex, bringing it up to Level 1 (see
68      * http://www.unicode.org/reports/tr18/). It does this by preprocessing the
69      * regex pattern string and interpreting the character classes (\p{...},
70      * \P{...}, [...]) according to their syntax and meaning in UnicodeSet. With
71      * this utility, Java regex expressions can be updated to work with the
72      * latest version of Unicode, and with all Unicode properties. Note that the
73      * UnicodeSet syntax has not yet, however, been updated to be completely
74      * consistent with Java regex, so be careful of the differences.
75      * <p>Not thread-safe; create a separate copy for different threads.
76      * <p>In the future, we may extend this to support other regex packages.
77      *
78      * @regex A modified Java regex pattern, as in the input to
79      *        Pattern.compile(), except that all "character classes" are
80      *        processed as if they were UnicodeSet patterns. Example:
81      *        "abc[:bc=N:]. See UnicodeSet for the differences in syntax.
82      * @return A processed Java regex pattern, suitable for input to
83      *         Pattern.compile().
84      */
85     @Override
transform(String regex)86     public String transform(String regex) {
87         StringBuilder result = new StringBuilder();
88         UnicodeSet temp = new UnicodeSet();
89         ParsePosition pos = new ParsePosition(0);
90         int state = 0; // 1 = after \
91 
92         // We add each character unmodified to the output, unless we have a
93         // UnicodeSet. Note that we don't worry about supplementary characters,
94         // since none of the syntax uses them.
95 
96         for (int i = 0; i < regex.length(); ++i) {
97             // look for UnicodeSets, allowing for quoting with \ and \Q
98             char ch = regex.charAt(i);
99             switch (state) {
100             case 0: // we only care about \, and '['.
101                 if (ch == '\\') {
102                     if (UnicodeSet.resemblesPattern(regex, i)) {
103                         // should only happen with \p
104                         i = processSet(regex, i, result, temp, pos);
105                         continue;
106                     }
107                     state = 1;
108                 } else if (ch == '[') {
109                     // if we have what looks like a UnicodeSet
110                     if (UnicodeSet.resemblesPattern(regex, i)) {
111                         i = processSet(regex, i, result, temp, pos);
112                         continue;
113                     }
114                 }
115                 break;
116 
117             case 1: // we are after a \
118                 if (ch == 'Q') {
119                     state = 1;
120                 } else {
121                     state = 0;
122                 }
123                 break;
124 
125             case 2: // we are in a \Q...
126                 if (ch == '\\') {
127                     state = 3;
128                 }
129                 break;
130 
131             case 3: // we are in at \Q...\
132                 if (ch == 'E') {
133                     state = 0;
134                 }
135                 state = 2;
136                 break;
137             }
138             result.append(ch);
139         }
140         return result.toString();
141     }
142 
143     /**
144      * Convenience static function, using standard parameters.
145      * @param regex as in process()
146      * @return processed regex pattern, as in process()
147      */
148     public static String fix(String regex) {
149         return STANDARD.transform(regex);
150     }
151 
152     /**
153      * Compile a regex string, after processing by fix(...).
154      *
155      * @param regex Raw regex pattern, as in fix(...).
156      * @return Pattern
157      */
158     public static Pattern compile(String regex) {
159         return Pattern.compile(STANDARD.transform(regex));
160     }
161 
162     /**
163      * Compile a regex string, after processing by fix(...).
164      *
165      * @param regex Raw regex pattern, as in fix(...).
166      * @return Pattern
167      */
168     public static Pattern compile(String regex, int options) {
169         return Pattern.compile(STANDARD.transform(regex), options);
170     }
171 
172     /**
173      * Compile a composed string from a set of BNF lines; see the List version for more information.
174      *
175      * @param bnfLines Series of BNF lines.
176      * @return Pattern
177      */
178     public String compileBnf(String bnfLines) {
179         return compileBnf(Arrays.asList(bnfLines.split("\\r\\n?|\\n")));
180     }
181 
182     /**
183      * Compile a composed string from a set of BNF lines, such as for composing a regex
184      * expression. The lines can be in any order, but there must not be any
185      * cycles. The result can be used as input for fix().
186      * <p>
187      * Example:
188      * <pre>
189      * uri = (?: (scheme) \\:)? (host) (?: \\? (query))? (?: \\u0023 (fragment))?;
190      * scheme = reserved+;
191      * host = // reserved+;
192      * query = [\\=reserved]+;
193      * fragment = reserved+;
194      * reserved = [[:ascii:][:alphabetic:]];
195      * </pre>
196      * <p>
197      * Caveats: at this point the parsing is simple; for example, # cannot be
198      * quoted (use \\u0023); you can set it to null to disable.
199      * The equality sign and a few others can be reset with
200      * setBnfX().
201      *
202      * @param lines Series of lines that represent a BNF expression. The lines contain
203      *          a series of statements that of the form x=y;. A statement can take
204      *          multiple lines, but there can't be multiple statements on a line.
205      *          A hash quotes to the end of the line.
206      * @return Pattern
207      */
208     public String compileBnf(List<String> lines) {
209         Map<String, String> variables = getVariables(lines);
210         Set<String> unused = new LinkedHashSet<String>(variables.keySet());
211         // brute force replacement; do twice to allow for different order
212         // later on can optimize
213         for (int i = 0; i < 2; ++i) {
214             for (Entry<String, String> entry : variables.entrySet()) {
215                 String variable   = entry.getKey(),
216                        definition = entry.getValue();
217 
218                 for (Entry<String, String> entry2 : variables.entrySet()) {
219                     String variable2 = entry2.getKey(),
220                            definition2 = entry2.getValue();
221                     if (variable.equals(variable2)) {
222                         continue;
223                     }
224                     String altered2 = definition2.replace(variable, definition);
225                     if (!altered2.equals(definition2)) {
226                         unused.remove(variable);
227                         variables.put(variable2, altered2);
228 //                        if (log != null) {
229 //                            try {
230 //                                log.append(variable2 + "=" + altered2 + ";");
231 //                            } catch (IOException e) {
232 //                                throw (IllegalArgumentException) new IllegalArgumentException().initCause(e);
233 //                            }
234 //                        }
235                     }
236                 }
237             }
238         }
239         if (unused.size() != 1) {
240             throw new IllegalArgumentException("Not a single root: " + unused);
241         }
242         return variables.get(unused.iterator().next());
243     }
244 
245     public String getBnfCommentString() {
246         return bnfCommentString;
247     }
248 
249     public void setBnfCommentString(String bnfCommentString) {
250         this.bnfCommentString = bnfCommentString;
251     }
252 
253     public String getBnfVariableInfix() {
254         return bnfVariableInfix;
255     }
256 
257     public void setBnfVariableInfix(String bnfVariableInfix) {
258         this.bnfVariableInfix = bnfVariableInfix;
259     }
260 
261     public String getBnfLineSeparator() {
262         return bnfLineSeparator;
263     }
264 
265     public void setBnfLineSeparator(String bnfLineSeparator) {
266         this.bnfLineSeparator = bnfLineSeparator;
267     }
268 
269     /**
270      * Utility for loading lines from a file.
271      * @param result The result of the appended lines.
272      * @param file The file to have an input stream.
273      * @param encoding if null, then UTF-8
274      * @return filled list
275      * @throws IOException If there were problems opening the file for input stream.
276      */
277     public static List<String> appendLines(List<String> result, String file, String encoding) throws IOException {
278         InputStream is = new FileInputStream(file);
279         try {
280             return appendLines(result, is, encoding);
281         } finally {
282             is.close();
283         }
284     }
285 
286     /**
287      * Utility for loading lines from a UTF8 file.
288      * @param result The result of the appended lines.
289      * @param inputStream The input stream.
290      * @param encoding if null, then UTF-8
291      * @return filled list
292      * @throws IOException  If there were problems opening the input stream for reading.
293      */
294     public static List<String> appendLines(List<String> result, InputStream inputStream, String encoding)
295             throws UnsupportedEncodingException, IOException {
296         BufferedReader in = new BufferedReader(new InputStreamReader(inputStream, encoding == null ? "UTF-8" : encoding));
297         while (true) {
298             String line = in.readLine();
299             if (line == null) break;
300             result.add(line);
301         }
302         return result;
303     }
304 
305 
306 
307     /* (non-Javadoc)
308      * @see com.ibm.icu.util.Freezable#cloneAsThawed()
309      */
310     @Override
311     public UnicodeRegex cloneAsThawed() {
312         // TODO Auto-generated method stub
313         try {
314             return (UnicodeRegex)clone();
315         } catch (CloneNotSupportedException e) {
316             throw new IllegalArgumentException(); // should never happen
317         }
318     }
319 
320     /* (non-Javadoc)
321      * @see com.ibm.icu.util.Freezable#freeze()
322      */
323     @Override
324     public UnicodeRegex freeze() {
325         // no action needed now.
326         return this;
327     }
328 
329     /* (non-Javadoc)
330      * @see com.ibm.icu.util.Freezable#isFrozen()
331      */
332     @Override
333     public boolean isFrozen() {
334         // at this point, always true
335         return true;
336     }
337 
338     // ===== PRIVATES =====
339 
340     private int processSet(String regex, int i, StringBuilder result, UnicodeSet temp, ParsePosition pos) {
341         try {
342             pos.setIndex(i);
343             UnicodeSet x = temp.clear().applyPattern(regex, pos, symbolTable, 0);
344             x.complement().complement(); // hack to fix toPattern
345             result.append(x.toPattern(false));
346             i = pos.getIndex() - 1; // allow for the loop increment
347             return i;
348         } catch (Exception e) {
349             throw (IllegalArgumentException) new IllegalArgumentException("Error in " + regex).initCause(e);
350         }
351     }
352 
353     private static final UnicodeRegex STANDARD = new UnicodeRegex();
354     private String bnfCommentString = "#";
355     private String bnfVariableInfix = "=";
356     private String bnfLineSeparator = "\n";
357 //    private Appendable log = null;
358 
359     private Comparator<Object> LongestFirst = new Comparator<Object>() {
360         @Override
361         public int compare(Object obj0, Object obj1) {
362             String arg0 = obj0.toString();
363             String arg1 = obj1.toString();
364             int len0 = arg0.length();
365             int len1 = arg1.length();
366             if (len0 != len1) return len1 - len0;
367             return arg0.compareTo(arg1);
368         }
369     };
370 
371     private Map<String, String> getVariables(List<String> lines) {
372         Map<String, String> variables = new TreeMap<String, String>(LongestFirst);
373         String variable = null;
374         StringBuffer definition = new StringBuffer();
375         int count = 0;
376         for (String line : lines) {
377             ++count;
378             // remove initial bom, comments
379             if (line.length() == 0) continue;
380             if (line.charAt(0) == '\uFEFF') line = line.substring(1);
381 
382             if (bnfCommentString != null) {
383                 int hashPos = line.indexOf(bnfCommentString);
384                 if (hashPos >= 0) line = line.substring(0, hashPos);
385             }
386             String trimline = line.trim();
387             if (trimline.length() == 0) continue;
388 
389             // String[] lineParts = line.split(";");
390             String linePart = line; // lineParts[i]; // .trim().replace("\\s+", " ");
391             if (linePart.trim().length() == 0) continue;
392             boolean terminated = trimline.endsWith(";");
393             if (terminated) {
394                 linePart = linePart.substring(0,linePart.lastIndexOf(';'));
395             }
396             int equalsPos = linePart.indexOf(bnfVariableInfix);
397             if (equalsPos >= 0) {
398                 if (variable != null) {
399                     throw new IllegalArgumentException("Missing ';' before " + count + ") " + line);
400                 }
401                 variable = linePart.substring(0,equalsPos).trim();
402                 if (variables.containsKey(variable)) {
403                     throw new IllegalArgumentException("Duplicate variable definition in " + line);
404                 }
405                 definition.append(linePart.substring(equalsPos+1).trim());
406             } else { // no equals, so
407                 if (variable == null) {
408                     throw new IllegalArgumentException("Missing '=' at " + count + ") " + line);
409                 }
410                 definition.append(bnfLineSeparator).append(linePart);
411             }
412             // we are terminated if i is not at the end, or the line ends with a ;
413             if (terminated) {
414                 variables.put(variable, definition.toString());
415                 variable = null; // signal we have no variable
416                 definition.setLength(0);
417             }
418         }
419         if (variable != null) {
420             throw new IllegalArgumentException("Missing ';' at end");
421         }
422         return variables;
423     }
424 }
425