• Home
  • Line#
  • Scopes#
  • Navigate#
  • Raw
  • Download
1 package org.unicode.cldr.tool;
2 
3 import java.util.Collection;
4 import java.util.Comparator;
5 import java.util.Map.Entry;
6 import java.util.Set;
7 import java.util.TreeSet;
8 import java.util.regex.Matcher;
9 import java.util.regex.Pattern;
10 import java.util.regex.PatternSyntaxException;
11 
12 import com.google.common.base.Joiner;
13 import com.google.common.collect.Multimap;
14 import com.google.common.collect.TreeMultimap;
15 import com.ibm.icu.lang.UCharacter;
16 import com.ibm.icu.text.UnicodeSet;
17 import com.ibm.icu.util.Output;
18 
19 /**
20  * Class to minimize certain types of regex. It does not work with open-ended quantifiers like * or +.
21  * The regex that is produced requires no backup, so should be faster as well as more compact.
22  * (But less readable!)
23  * @author markdavis
24  */
25 public class MinimizeRegex {
26     /**
27      * Sort  strings length-first, because (ab|abc) in regex world can stop at the first match, eg "ab".
28      */
29     private static final Comparator<String> LENGTH_FIRST_COMPARE = Comparator.comparingInt(String::length)
30     .reversed()
31     .thenComparing(Comparator.naturalOrder());
32 
main(String[] args)33     public static void main(String[] args) {
34         String defaultArg = "zxx|zu|zh|zgh|yue|yo|yi|yav|xog|xh|wo|wae|vun|vo|vi|vai|uz|ur|und|uk|ug|tzm|twq|tt|tr|to|tk|ti|th|tg|teo|te|ta|sw|sv|su|st|sr|sq|so|sn|smn|sm|sl|sk|si|shi|sg|ses|seh|se|sd|sbp|saq|sah|sa|rwk|rw|ru|rof|ro|rn|rm|qu|pt|ps|prg|pl|pa|os|or|om|nyn|ny|nus|no|nnh|nn|nmg|nl|ne|nds|nd|nb|naq|mzn|my|mul|mua|mt|ms|mr|mn|ml|mk|mi|mgo|mgh|mg|mfe|mer|mas|lv|luy|luo|lu|lt|lrc|lo|ln|lkt|lg|lb|lag|la|ky|kw|ku|ksh|ksf|ksb|ks|kok|ko|kn|km|kln|kl|kkj|kk|ki|khq|kea|kde|kam|kab|ka|jv|jmc|jgo|ja|it|is|ii|ig|id|ia|hy|hu|ht|hsb|hr|hmn|hi|he|haw|ha|gv|guz|gu|gsw|gl|gd|ga|fy|fur|fr|fo|fil|fi|ff|fa|ewo|eu|et|es|eo|en|el|ee|ebu|dz|dyo|dua|dsb|dje|de|dav|da|cy|cu|cs|co|ckb|chr|cgg|ceb|ce|ccp|ca|bs|brx|br|bo|bn|bm|bg|bez|bem|be|bas|az|ast|asa|as|ar|am|ak|agq|af";
35         //defaultArg = "aa|ace|ad[ay]|ain|al[et]|anp?|arp|ast|av|awa|ay|ma[dgik]|mdf|men|mh|mi[cn]|mni|mos|mu[ls]|mwl|myv";
36         String regexString = args.length < 1 ? defaultArg : args[0];
37         UnicodeSet set = new UnicodeSet(args.length < 2 ? "[:ascii:]" : args[1]);
38 
39         System.out.println(defaultArg + "\n");
40         Output<Set<String>> flattenedOut = new Output<>();
41         String recompressed = compressWith(regexString, set, flattenedOut);
42         System.out.println(Joiner.on("|").join(flattenedOut.value) + "\n");
43         System.out.println(recompressed + "\n");
44     }
45 
46     public static String compressWith(String regexString, UnicodeSet set) {
47         return compressWith(regexString, set, null);
48     }
49 
50     public static String simplePattern(Collection<String> strings) {
51         TreeSet<String> temp = new TreeSet<>(LENGTH_FIRST_COMPARE);
52         temp.addAll(strings);
53         return Joiner.on("|").join(temp);
54     }
55 
56     public static String compressWith(String regexString, UnicodeSet set, Output<Set<String>> flattenedOut) {
57         Set<String> flattened = flatten(Pattern.compile(regexString), "", set);
58         String regexString2 = Joiner.on("|").join(flattened);
59         Set<String> flattened2 = flatten(Pattern.compile(regexString2), "", set);
60         if (!flattened2.equals(flattened)) {
61             throw new IllegalArgumentException("Failed to compress: " + regexString + " using " + set + ", got " + regexString2);
62         }
63 
64         if (flattenedOut != null) {
65             flattenedOut.value = flattened;
66         }
67         return compressWith(flattened, set);
68     }
69 
70     /**
71      * Does not work with sets of strings containing regex syntax.
72      * @param flattened
73      * @param set
74      * @return
75      */
76     public static String compressWith(Set<String> flattened, UnicodeSet set) {
77         String recompressed = compress(flattened, new Output<Boolean>());
78         Set<String> flattened2;
79         try {
80             flattened2 = flatten(Pattern.compile(recompressed), "", set);
81         } catch (PatternSyntaxException e) {
82             int loc = e.getIndex();
83             if (loc >= 0) {
84                 recompressed = recompressed.substring(0,loc) + "$$$$$" + recompressed.substring(loc);
85             }
86             throw new IllegalArgumentException("Failed to parse: " + recompressed, e);
87         }
88         if (!flattened2.equals(flattened)) {
89             throw new IllegalArgumentException("Failed to compress:\n" + flattened + "\n≠ " + flattened2);
90         }
91         return recompressed;
92     }
93 
94     private static String compress(Set<String> flattened, Output<Boolean> isSingle) {
95         // make a map from first code points to remainder
96         Multimap<Integer, String> firstToRemainder = TreeMultimap.create();
97         UnicodeSet results = new UnicodeSet();
98         boolean hasEmpty = false;
99         for (String s : flattened) {
100             if (s.isEmpty()) {
101                 hasEmpty = true;
102                 continue;
103             }
104             int first = s.codePointAt(0);
105             firstToRemainder.put(first, s.substring(UCharacter.charCount(first)));
106         }
107         StringBuilder buf = new StringBuilder();
108         for (Entry<Integer, Collection<String>> entry : firstToRemainder.asMap().entrySet()) {
109             Set<String> items = (Set<String>) entry.getValue();
110             buf.setLength(0);
111             buf.appendCodePoint(entry.getKey());
112             if (items.size() == 1) {
113                 buf.append(items.iterator().next());
114             } else {
115                 String sub = compress(items, isSingle);
116                 if (isSingle.value) {
117                     buf.append(sub);
118                 } else {
119                     buf.append('(').append(sub).append(')');
120                 }
121             }
122             results.add(buf.toString());
123         }
124         Set<String> strings = new TreeSet<>(results.strings());
125         results.removeAll(strings);
126         switch(results.size()) {
127         case 0:
128             break;
129         case 1:
130             strings.add(results.iterator().next());
131             break;
132         default:
133             strings.add(results.toPattern(false));
134             break;
135         }
136         switch (strings.size()) {
137         case 0: throw new IllegalArgumentException();
138         case 1:
139             isSingle.value = true;
140             return strings.iterator().next() + (hasEmpty ? "?" : "");
141         default:
142             String result = Joiner.on("|").join(strings);
143             if (hasEmpty) {
144                 isSingle.value = true;
145                 return '(' + result + ")?";
146             }
147             isSingle.value = false;
148             return result;
149         }
150     }
151 
152     public static TreeSet<String> flatten(Pattern pattern, String prefix, UnicodeSet set) {
153         return flatten(pattern.matcher(""), prefix, set, new TreeSet<>(LENGTH_FIRST_COMPARE));
154     }
155 
156     private static TreeSet<String> flatten(Matcher matcher, String prefix, UnicodeSet set, TreeSet<String> results) {
157         for (String s : set) {
158             String trial = prefix + s;
159             matcher.reset(trial);
160             boolean matches = matcher.matches();
161             if (matches) {
162                 results.add(trial);
163             }
164             if (matcher.hitEnd()) {
165                 flatten(matcher, trial, set, results);
166             }
167         }
168         return results;
169     }
170 }
171