1 package org.unicode.cldr.tool; 2 3 import java.util.Collection; 4 import java.util.Comparator; 5 import java.util.Map.Entry; 6 import java.util.Set; 7 import java.util.TreeSet; 8 import java.util.regex.Matcher; 9 import java.util.regex.Pattern; 10 import java.util.regex.PatternSyntaxException; 11 12 import com.google.common.base.Joiner; 13 import com.google.common.collect.Multimap; 14 import com.google.common.collect.TreeMultimap; 15 import com.ibm.icu.lang.UCharacter; 16 import com.ibm.icu.text.UnicodeSet; 17 import com.ibm.icu.util.Output; 18 19 /** 20 * Class to minimize certain types of regex. It does not work with open-ended quantifiers like * or +. 21 * The regex that is produced requires no backup, so should be faster as well as more compact. 22 * (But less readable!) 23 * @author markdavis 24 */ 25 public class MinimizeRegex { 26 /** 27 * Sort strings length-first, because (ab|abc) in regex world can stop at the first match, eg "ab". 28 */ 29 private static final Comparator<String> LENGTH_FIRST_COMPARE = Comparator.comparingInt(String::length) 30 .reversed() 31 .thenComparing(Comparator.naturalOrder()); 32 main(String[] args)33 public static void main(String[] args) { 34 String defaultArg = "zxx|zu|zh|zgh|yue|yo|yi|yav|xog|xh|wo|wae|vun|vo|vi|vai|uz|ur|und|uk|ug|tzm|twq|tt|tr|to|tk|ti|th|tg|teo|te|ta|sw|sv|su|st|sr|sq|so|sn|smn|sm|sl|sk|si|shi|sg|ses|seh|se|sd|sbp|saq|sah|sa|rwk|rw|ru|rof|ro|rn|rm|qu|pt|ps|prg|pl|pa|os|or|om|nyn|ny|nus|no|nnh|nn|nmg|nl|ne|nds|nd|nb|naq|mzn|my|mul|mua|mt|ms|mr|mn|ml|mk|mi|mgo|mgh|mg|mfe|mer|mas|lv|luy|luo|lu|lt|lrc|lo|ln|lkt|lg|lb|lag|la|ky|kw|ku|ksh|ksf|ksb|ks|kok|ko|kn|km|kln|kl|kkj|kk|ki|khq|kea|kde|kam|kab|ka|jv|jmc|jgo|ja|it|is|ii|ig|id|ia|hy|hu|ht|hsb|hr|hmn|hi|he|haw|ha|gv|guz|gu|gsw|gl|gd|ga|fy|fur|fr|fo|fil|fi|ff|fa|ewo|eu|et|es|eo|en|el|ee|ebu|dz|dyo|dua|dsb|dje|de|dav|da|cy|cu|cs|co|ckb|chr|cgg|ceb|ce|ccp|ca|bs|brx|br|bo|bn|bm|bg|bez|bem|be|bas|az|ast|asa|as|ar|am|ak|agq|af"; 35 //defaultArg = "aa|ace|ad[ay]|ain|al[et]|anp?|arp|ast|av|awa|ay|ma[dgik]|mdf|men|mh|mi[cn]|mni|mos|mu[ls]|mwl|myv"; 36 String regexString = args.length < 1 ? defaultArg : args[0]; 37 UnicodeSet set = new UnicodeSet(args.length < 2 ? "[:ascii:]" : args[1]); 38 39 System.out.println(defaultArg + "\n"); 40 Output<Set<String>> flattenedOut = new Output<>(); 41 String recompressed = compressWith(regexString, set, flattenedOut); 42 System.out.println(Joiner.on("|").join(flattenedOut.value) + "\n"); 43 System.out.println(recompressed + "\n"); 44 } 45 46 public static String compressWith(String regexString, UnicodeSet set) { 47 return compressWith(regexString, set, null); 48 } 49 50 public static String simplePattern(Collection<String> strings) { 51 TreeSet<String> temp = new TreeSet<>(LENGTH_FIRST_COMPARE); 52 temp.addAll(strings); 53 return Joiner.on("|").join(temp); 54 } 55 56 public static String compressWith(String regexString, UnicodeSet set, Output<Set<String>> flattenedOut) { 57 Set<String> flattened = flatten(Pattern.compile(regexString), "", set); 58 String regexString2 = Joiner.on("|").join(flattened); 59 Set<String> flattened2 = flatten(Pattern.compile(regexString2), "", set); 60 if (!flattened2.equals(flattened)) { 61 throw new IllegalArgumentException("Failed to compress: " + regexString + " using " + set + ", got " + regexString2); 62 } 63 64 if (flattenedOut != null) { 65 flattenedOut.value = flattened; 66 } 67 return compressWith(flattened, set); 68 } 69 70 /** 71 * Does not work with sets of strings containing regex syntax. 72 * @param flattened 73 * @param set 74 * @return 75 */ 76 public static String compressWith(Set<String> flattened, UnicodeSet set) { 77 String recompressed = compress(flattened, new Output<Boolean>()); 78 Set<String> flattened2; 79 try { 80 flattened2 = flatten(Pattern.compile(recompressed), "", set); 81 } catch (PatternSyntaxException e) { 82 int loc = e.getIndex(); 83 if (loc >= 0) { 84 recompressed = recompressed.substring(0,loc) + "$$$$$" + recompressed.substring(loc); 85 } 86 throw new IllegalArgumentException("Failed to parse: " + recompressed, e); 87 } 88 if (!flattened2.equals(flattened)) { 89 throw new IllegalArgumentException("Failed to compress:\n" + flattened + "\n≠ " + flattened2); 90 } 91 return recompressed; 92 } 93 94 private static String compress(Set<String> flattened, Output<Boolean> isSingle) { 95 // make a map from first code points to remainder 96 Multimap<Integer, String> firstToRemainder = TreeMultimap.create(); 97 UnicodeSet results = new UnicodeSet(); 98 boolean hasEmpty = false; 99 for (String s : flattened) { 100 if (s.isEmpty()) { 101 hasEmpty = true; 102 continue; 103 } 104 int first = s.codePointAt(0); 105 firstToRemainder.put(first, s.substring(UCharacter.charCount(first))); 106 } 107 StringBuilder buf = new StringBuilder(); 108 for (Entry<Integer, Collection<String>> entry : firstToRemainder.asMap().entrySet()) { 109 Set<String> items = (Set<String>) entry.getValue(); 110 buf.setLength(0); 111 buf.appendCodePoint(entry.getKey()); 112 if (items.size() == 1) { 113 buf.append(items.iterator().next()); 114 } else { 115 String sub = compress(items, isSingle); 116 if (isSingle.value) { 117 buf.append(sub); 118 } else { 119 buf.append('(').append(sub).append(')'); 120 } 121 } 122 results.add(buf.toString()); 123 } 124 Set<String> strings = new TreeSet<>(results.strings()); 125 results.removeAll(strings); 126 switch(results.size()) { 127 case 0: 128 break; 129 case 1: 130 strings.add(results.iterator().next()); 131 break; 132 default: 133 strings.add(results.toPattern(false)); 134 break; 135 } 136 switch (strings.size()) { 137 case 0: throw new IllegalArgumentException(); 138 case 1: 139 isSingle.value = true; 140 return strings.iterator().next() + (hasEmpty ? "?" : ""); 141 default: 142 String result = Joiner.on("|").join(strings); 143 if (hasEmpty) { 144 isSingle.value = true; 145 return '(' + result + ")?"; 146 } 147 isSingle.value = false; 148 return result; 149 } 150 } 151 152 public static TreeSet<String> flatten(Pattern pattern, String prefix, UnicodeSet set) { 153 return flatten(pattern.matcher(""), prefix, set, new TreeSet<>(LENGTH_FIRST_COMPARE)); 154 } 155 156 private static TreeSet<String> flatten(Matcher matcher, String prefix, UnicodeSet set, TreeSet<String> results) { 157 for (String s : set) { 158 String trial = prefix + s; 159 matcher.reset(trial); 160 boolean matches = matcher.matches(); 161 if (matches) { 162 results.add(trial); 163 } 164 if (matcher.hitEnd()) { 165 flatten(matcher, trial, set, results); 166 } 167 } 168 return results; 169 } 170 } 171