• Home
  • Line#
  • Scopes#
  • Navigate#
  • Raw
  • Download
1 package org.unicode.cldr.tool;
2 
3 import com.google.common.base.Joiner;
4 import com.google.common.collect.Multimap;
5 import com.google.common.collect.TreeMultimap;
6 import com.ibm.icu.lang.UCharacter;
7 import com.ibm.icu.text.UnicodeSet;
8 import com.ibm.icu.util.Output;
9 import java.util.Collection;
10 import java.util.Comparator;
11 import java.util.Map.Entry;
12 import java.util.Set;
13 import java.util.TreeSet;
14 import java.util.regex.Matcher;
15 import java.util.regex.Pattern;
16 import java.util.regex.PatternSyntaxException;
17 
18 /**
19  * Class to minimize certain types of regex. It does not work with open-ended quantifiers like * or
20  * +. The regex that is produced requires no backup, so should be faster as well as more compact.
21  * (But less readable!)
22  *
23  * @author markdavis
24  */
25 public class MinimizeRegex {
26     /**
27      * Sort strings length-first, because (ab|abc) in regex world can stop at the first match, eg
28      * "ab".
29      */
30     private static final Comparator<String> LENGTH_FIRST_COMPARE =
31             Comparator.comparingInt(String::length)
32                     .reversed()
33                     .thenComparing(Comparator.naturalOrder());
34 
main(String[] args)35     public static void main(String[] args) {
36         String defaultArg =
37                 "zxx|zu|zh|zgh|yue|yo|yi|yav|xog|xh|wo|wae|vun|vo|vi|vai|uz|ur|und|uk|ug|tzm|twq|tt|tr|to|tk|ti|th|tg|teo|te|ta|sw|sv|su|st|sr|sq|so|sn|smn|sm|sl|sk|si|shi|sg|ses|seh|se|sd|sbp|saq|sah|sa|rwk|rw|ru|rof|ro|rn|rm|qu|pt|ps|prg|pl|pa|os|or|om|nyn|ny|nus|no|nnh|nn|nmg|nl|ne|nds|nd|nb|naq|mzn|my|mul|mua|mt|ms|mr|mn|ml|mk|mi|mgo|mgh|mg|mfe|mer|mas|lv|luy|luo|lu|lt|lrc|lo|ln|lkt|lg|lb|lag|la|ky|kw|ku|ksh|ksf|ksb|ks|kok|ko|kn|km|kln|kl|kkj|kk|ki|khq|kea|kde|kam|kab|ka|jv|jmc|jgo|ja|it|is|ii|ig|id|ia|hy|hu|ht|hsb|hr|hmn|hi|he|haw|ha|gv|guz|gu|gsw|gl|gd|ga|fy|fur|fr|fo|fil|fi|ff|fa|ewo|eu|et|es|eo|en|el|ee|ebu|dz|dyo|dua|dsb|dje|de|dav|da|cy|cu|cs|co|ckb|chr|cgg|ceb|ce|ccp|ca|bs|brx|br|bo|bn|bm|bg|bez|bem|be|bas|az|ast|asa|as|ar|am|ak|agq|af";
38         // defaultArg =
39         // "aa|ace|ad[ay]|ain|al[et]|anp?|arp|ast|av|awa|ay|ma[dgik]|mdf|men|mh|mi[cn]|mni|mos|mu[ls]|mwl|myv";
40         String regexString = args.length < 1 ? defaultArg : args[0];
41         UnicodeSet set = new UnicodeSet(args.length < 2 ? "[:ascii:]" : args[1]);
42 
43         System.out.println(defaultArg + "\n");
44         Output<Set<String>> flattenedOut = new Output<>();
45         String recompressed = compressWith(regexString, set, flattenedOut);
46         System.out.println(Joiner.on("|").join(flattenedOut.value) + "\n");
47         System.out.println(recompressed + "\n");
48     }
49 
50     public static String compressWith(String regexString, UnicodeSet set) {
51         return compressWith(regexString, set, null);
52     }
53 
54     public static String simplePattern(Collection<String> strings) {
55         TreeSet<String> temp = new TreeSet<>(LENGTH_FIRST_COMPARE);
56         temp.addAll(strings);
57         return Joiner.on("|").join(temp);
58     }
59 
60     public static String compressWith(
61             String regexString, UnicodeSet set, Output<Set<String>> flattenedOut) {
62         Set<String> flattened = flatten(Pattern.compile(regexString), "", set);
63         String regexString2 = Joiner.on("|").join(flattened);
64         Set<String> flattened2 = flatten(Pattern.compile(regexString2), "", set);
65         if (!flattened2.equals(flattened)) {
66             throw new IllegalArgumentException(
67                     "Failed to compress: "
68                             + regexString
69                             + " using "
70                             + set
71                             + ", got "
72                             + regexString2);
73         }
74 
75         if (flattenedOut != null) {
76             flattenedOut.value = flattened;
77         }
78         return compressWith(flattened, set);
79     }
80 
81     /**
82      * Does not work with sets of strings containing regex syntax.
83      *
84      * @param flattened
85      * @param set
86      * @return
87      */
88     public static String compressWith(Set<String> flattened, UnicodeSet set) {
89         String recompressed = compress(flattened, new Output<Boolean>());
90         Set<String> flattened2;
91         try {
92             flattened2 = flatten(Pattern.compile(recompressed), "", set);
93         } catch (PatternSyntaxException e) {
94             int loc = e.getIndex();
95             if (loc >= 0) {
96                 recompressed =
97                         recompressed.substring(0, loc) + "$$$$$" + recompressed.substring(loc);
98             }
99             throw new IllegalArgumentException("Failed to parse: " + recompressed, e);
100         }
101         if (!flattened2.equals(flattened)) {
102             throw new IllegalArgumentException(
103                     "Failed to compress:\n" + flattened + "\n≠ " + flattened2);
104         }
105         return recompressed;
106     }
107 
108     private static String compress(Set<String> flattened, Output<Boolean> isSingle) {
109         // make a map from first code points to remainder
110         Multimap<Integer, String> firstToRemainder = TreeMultimap.create();
111         UnicodeSet results = new UnicodeSet();
112         boolean hasEmpty = false;
113         for (String s : flattened) {
114             if (s.isEmpty()) {
115                 hasEmpty = true;
116                 continue;
117             }
118             int first = s.codePointAt(0);
119             firstToRemainder.put(first, s.substring(UCharacter.charCount(first)));
120         }
121         StringBuilder buf = new StringBuilder();
122         for (Entry<Integer, Collection<String>> entry : firstToRemainder.asMap().entrySet()) {
123             Set<String> items = (Set<String>) entry.getValue();
124             buf.setLength(0);
125             buf.appendCodePoint(entry.getKey());
126             if (items.size() == 1) {
127                 buf.append(items.iterator().next());
128             } else {
129                 String sub = compress(items, isSingle);
130                 if (isSingle.value) {
131                     buf.append(sub);
132                 } else {
133                     buf.append('(').append(sub).append(')');
134                 }
135             }
136             results.add(buf.toString());
137         }
138         Set<String> strings = new TreeSet<>(results.strings());
139         results.removeAll(strings);
140         switch (results.size()) {
141             case 0:
142                 break;
143             case 1:
144                 strings.add(results.iterator().next());
145                 break;
146             default:
147                 strings.add(results.toPattern(false));
148                 break;
149         }
150         switch (strings.size()) {
151             case 0:
152                 throw new IllegalArgumentException();
153             case 1:
154                 isSingle.value = true;
155                 return strings.iterator().next() + (hasEmpty ? "?" : "");
156             default:
157                 String result = Joiner.on("|").join(strings);
158                 if (hasEmpty) {
159                     isSingle.value = true;
160                     return '(' + result + ")?";
161                 }
162                 isSingle.value = false;
163                 return result;
164         }
165     }
166 
167     public static TreeSet<String> flatten(Pattern pattern, String prefix, UnicodeSet set) {
168         return flatten(pattern.matcher(""), prefix, set, new TreeSet<>(LENGTH_FIRST_COMPARE));
169     }
170 
171     private static TreeSet<String> flatten(
172             Matcher matcher, String prefix, UnicodeSet set, TreeSet<String> results) {
173         for (String s : set) {
174             String trial = prefix + s;
175             matcher.reset(trial);
176             boolean matches = matcher.matches();
177             if (matches) {
178                 results.add(trial);
179             }
180             if (matcher.hitEnd()) {
181                 flatten(matcher, trial, set, results);
182             }
183         }
184         return results;
185     }
186 }
187