1 package org.unicode.cldr.tool; 2 3 import com.google.common.base.Joiner; 4 import com.google.common.collect.Multimap; 5 import com.google.common.collect.TreeMultimap; 6 import com.ibm.icu.lang.UCharacter; 7 import com.ibm.icu.text.UnicodeSet; 8 import com.ibm.icu.util.Output; 9 import java.util.Collection; 10 import java.util.Comparator; 11 import java.util.Map.Entry; 12 import java.util.Set; 13 import java.util.TreeSet; 14 import java.util.regex.Matcher; 15 import java.util.regex.Pattern; 16 import java.util.regex.PatternSyntaxException; 17 18 /** 19 * Class to minimize certain types of regex. It does not work with open-ended quantifiers like * or 20 * +. The regex that is produced requires no backup, so should be faster as well as more compact. 21 * (But less readable!) 22 * 23 * @author markdavis 24 */ 25 public class MinimizeRegex { 26 /** 27 * Sort strings length-first, because (ab|abc) in regex world can stop at the first match, eg 28 * "ab". 29 */ 30 private static final Comparator<String> LENGTH_FIRST_COMPARE = 31 Comparator.comparingInt(String::length) 32 .reversed() 33 .thenComparing(Comparator.naturalOrder()); 34 main(String[] args)35 public static void main(String[] args) { 36 String defaultArg = 37 "zxx|zu|zh|zgh|yue|yo|yi|yav|xog|xh|wo|wae|vun|vo|vi|vai|uz|ur|und|uk|ug|tzm|twq|tt|tr|to|tk|ti|th|tg|teo|te|ta|sw|sv|su|st|sr|sq|so|sn|smn|sm|sl|sk|si|shi|sg|ses|seh|se|sd|sbp|saq|sah|sa|rwk|rw|ru|rof|ro|rn|rm|qu|pt|ps|prg|pl|pa|os|or|om|nyn|ny|nus|no|nnh|nn|nmg|nl|ne|nds|nd|nb|naq|mzn|my|mul|mua|mt|ms|mr|mn|ml|mk|mi|mgo|mgh|mg|mfe|mer|mas|lv|luy|luo|lu|lt|lrc|lo|ln|lkt|lg|lb|lag|la|ky|kw|ku|ksh|ksf|ksb|ks|kok|ko|kn|km|kln|kl|kkj|kk|ki|khq|kea|kde|kam|kab|ka|jv|jmc|jgo|ja|it|is|ii|ig|id|ia|hy|hu|ht|hsb|hr|hmn|hi|he|haw|ha|gv|guz|gu|gsw|gl|gd|ga|fy|fur|fr|fo|fil|fi|ff|fa|ewo|eu|et|es|eo|en|el|ee|ebu|dz|dyo|dua|dsb|dje|de|dav|da|cy|cu|cs|co|ckb|chr|cgg|ceb|ce|ccp|ca|bs|brx|br|bo|bn|bm|bg|bez|bem|be|bas|az|ast|asa|as|ar|am|ak|agq|af"; 38 // defaultArg = 39 // "aa|ace|ad[ay]|ain|al[et]|anp?|arp|ast|av|awa|ay|ma[dgik]|mdf|men|mh|mi[cn]|mni|mos|mu[ls]|mwl|myv"; 40 String regexString = args.length < 1 ? defaultArg : args[0]; 41 UnicodeSet set = new UnicodeSet(args.length < 2 ? "[:ascii:]" : args[1]); 42 43 System.out.println(defaultArg + "\n"); 44 Output<Set<String>> flattenedOut = new Output<>(); 45 String recompressed = compressWith(regexString, set, flattenedOut); 46 System.out.println(Joiner.on("|").join(flattenedOut.value) + "\n"); 47 System.out.println(recompressed + "\n"); 48 } 49 50 public static String compressWith(String regexString, UnicodeSet set) { 51 return compressWith(regexString, set, null); 52 } 53 54 public static String simplePattern(Collection<String> strings) { 55 TreeSet<String> temp = new TreeSet<>(LENGTH_FIRST_COMPARE); 56 temp.addAll(strings); 57 return Joiner.on("|").join(temp); 58 } 59 60 public static String compressWith( 61 String regexString, UnicodeSet set, Output<Set<String>> flattenedOut) { 62 Set<String> flattened = flatten(Pattern.compile(regexString), "", set); 63 String regexString2 = Joiner.on("|").join(flattened); 64 Set<String> flattened2 = flatten(Pattern.compile(regexString2), "", set); 65 if (!flattened2.equals(flattened)) { 66 throw new IllegalArgumentException( 67 "Failed to compress: " 68 + regexString 69 + " using " 70 + set 71 + ", got " 72 + regexString2); 73 } 74 75 if (flattenedOut != null) { 76 flattenedOut.value = flattened; 77 } 78 return compressWith(flattened, set); 79 } 80 81 /** 82 * Does not work with sets of strings containing regex syntax. 83 * 84 * @param flattened 85 * @param set 86 * @return 87 */ 88 public static String compressWith(Set<String> flattened, UnicodeSet set) { 89 String recompressed = compress(flattened, new Output<Boolean>()); 90 Set<String> flattened2; 91 try { 92 flattened2 = flatten(Pattern.compile(recompressed), "", set); 93 } catch (PatternSyntaxException e) { 94 int loc = e.getIndex(); 95 if (loc >= 0) { 96 recompressed = 97 recompressed.substring(0, loc) + "$$$$$" + recompressed.substring(loc); 98 } 99 throw new IllegalArgumentException("Failed to parse: " + recompressed, e); 100 } 101 if (!flattened2.equals(flattened)) { 102 throw new IllegalArgumentException( 103 "Failed to compress:\n" + flattened + "\n≠ " + flattened2); 104 } 105 return recompressed; 106 } 107 108 private static String compress(Set<String> flattened, Output<Boolean> isSingle) { 109 // make a map from first code points to remainder 110 Multimap<Integer, String> firstToRemainder = TreeMultimap.create(); 111 UnicodeSet results = new UnicodeSet(); 112 boolean hasEmpty = false; 113 for (String s : flattened) { 114 if (s.isEmpty()) { 115 hasEmpty = true; 116 continue; 117 } 118 int first = s.codePointAt(0); 119 firstToRemainder.put(first, s.substring(UCharacter.charCount(first))); 120 } 121 StringBuilder buf = new StringBuilder(); 122 for (Entry<Integer, Collection<String>> entry : firstToRemainder.asMap().entrySet()) { 123 Set<String> items = (Set<String>) entry.getValue(); 124 buf.setLength(0); 125 buf.appendCodePoint(entry.getKey()); 126 if (items.size() == 1) { 127 buf.append(items.iterator().next()); 128 } else { 129 String sub = compress(items, isSingle); 130 if (isSingle.value) { 131 buf.append(sub); 132 } else { 133 buf.append('(').append(sub).append(')'); 134 } 135 } 136 results.add(buf.toString()); 137 } 138 Set<String> strings = new TreeSet<>(results.strings()); 139 results.removeAll(strings); 140 switch (results.size()) { 141 case 0: 142 break; 143 case 1: 144 strings.add(results.iterator().next()); 145 break; 146 default: 147 strings.add(results.toPattern(false)); 148 break; 149 } 150 switch (strings.size()) { 151 case 0: 152 throw new IllegalArgumentException(); 153 case 1: 154 isSingle.value = true; 155 return strings.iterator().next() + (hasEmpty ? "?" : ""); 156 default: 157 String result = Joiner.on("|").join(strings); 158 if (hasEmpty) { 159 isSingle.value = true; 160 return '(' + result + ")?"; 161 } 162 isSingle.value = false; 163 return result; 164 } 165 } 166 167 public static TreeSet<String> flatten(Pattern pattern, String prefix, UnicodeSet set) { 168 return flatten(pattern.matcher(""), prefix, set, new TreeSet<>(LENGTH_FIRST_COMPARE)); 169 } 170 171 private static TreeSet<String> flatten( 172 Matcher matcher, String prefix, UnicodeSet set, TreeSet<String> results) { 173 for (String s : set) { 174 String trial = prefix + s; 175 matcher.reset(trial); 176 boolean matches = matcher.matches(); 177 if (matches) { 178 results.add(trial); 179 } 180 if (matcher.hitEnd()) { 181 flatten(matcher, trial, set, results); 182 } 183 } 184 return results; 185 } 186 } 187