1 package org.unicode.cldr.draft; 2 3 import com.ibm.icu.impl.Utility; 4 import com.ibm.icu.text.Normalizer; 5 import com.ibm.icu.text.StringTransform; 6 import com.ibm.icu.text.Transliterator; 7 import com.ibm.icu.text.UnicodeSet; 8 import java.util.ArrayList; 9 import java.util.List; 10 import java.util.regex.Matcher; 11 import java.util.regex.Pattern; 12 import org.unicode.cldr.draft.PatternFixer.Target; 13 14 public class RegexTransformBuilder { 15 static final boolean DEBUG = false; 16 private static final boolean SKIP_BAD = true; 17 18 // initially just very rough rule parser, for proof-of-concept createFromRules(String string)19 public static StringTransform createFromRules(String string) { 20 List<StringTransform> compound = new ArrayList<>(); 21 22 List<Rule> rules = new ArrayList<>(); 23 String[] ruleSet = string.split(";"); 24 Matcher m = RULE_PATTERN.matcher(""); 25 List<String> results = new ArrayList<>(); 26 Matcher variable = VARIABLE.matcher(""); 27 UnicodeSet filter = null; 28 29 if (DEBUG) System.out.println(); 30 31 for (String ruleString : ruleSet) { 32 ruleString = ruleString.trim(); 33 if (DEBUG) System.out.print(ruleString + "\t=>\t"); 34 35 if (ruleString.startsWith("::")) { 36 if (rules.size() != 0) { 37 compound.add(new RegexTransform(rules)); 38 rules.clear(); 39 } 40 final String body = ruleString.substring(2).trim(); 41 if (body.equalsIgnoreCase("NULL")) { 42 // nothing 43 if (DEBUG) System.out.println(); 44 } else if (UnicodeSet.resemblesPattern(body, 0)) { 45 filter = new UnicodeSet(body); 46 if (DEBUG) System.out.println(":: " + filter + " ;"); 47 } else { 48 // if we didn't find a filter, it is a Transliterator 49 final Transliterator translit = Transliterator.getInstance(body.trim()); 50 compound.add(translit); 51 if (DEBUG) System.out.println(":: " + translit + " ;"); 52 } 53 continue; 54 } 55 if (!m.reset(ruleString).matches()) { 56 if (SKIP_BAD) { 57 System.out.println("BAD RULE"); 58 continue; 59 } else { 60 throw new IllegalArgumentException( 61 "Bad rule: {" + Utility.escape(ruleString) + "} ;"); 62 } 63 } 64 65 String pre = m.group(1); 66 if (pre == null) { 67 pre = ""; 68 } else { 69 pre = fix(pre); 70 } 71 72 String main = fix(m.group(2)); 73 if (m.group(3) != null) { 74 main += "(?=" + fix(m.group(3)) + ")"; 75 } 76 77 results.clear(); 78 String result = m.group(4).trim(); 79 variable.reset(result); 80 int last = 0; 81 while (true) { 82 if (!variable.find()) { 83 results.add(result.substring(last)); 84 break; 85 } else { 86 results.add(result.substring(last, variable.start())); 87 results.add(variable.group()); 88 last = variable.end(); 89 } 90 } 91 try { 92 Rule rule = new Rule(pre, main, results); 93 if (DEBUG) System.out.println(rule); 94 rules.add(rule); 95 } catch (Exception e) { 96 System.out.println("BAD:\t" + e.getMessage()); 97 } 98 } 99 100 // add any trailing rules 101 if (rules.size() != 0) { 102 compound.add(new RegexTransform(rules)); 103 rules.clear(); 104 } 105 106 // generate final result 107 StringTransform result = 108 compound.size() == 1 ? compound.get(0) : new CompoundTransform(compound); 109 if (filter != null) { 110 return new UnicodeSetFilteredTransform(filter, result); 111 } 112 return result; 113 } 114 fix(String pattern)115 private static String fix(String pattern) { 116 pattern = pattern.trim(); 117 // TODO fix pattern to not have anything but NFD in patterns 118 PATTERN_FIXER.fix(pattern); 119 pattern = Normalizer.decompose(pattern, false); 120 // pre = pre.replace("[:", "\\p{"); 121 // pre = pre.replace(":]", "}"); 122 return pattern; 123 } 124 125 private static final PatternFixer PATTERN_FIXER = new PatternFixer(Target.JAVA); 126 127 static Pattern RULE_PATTERN = 128 Pattern.compile( 129 "(?:([^{}>]*) \\{)?" + "([^}<>]*)" + "(?:\\} ([^<>]*))?" + "<?> (.*)", 130 Pattern.COMMENTS); 131 static Pattern VARIABLE = Pattern.compile("\\$[0-9]", Pattern.COMMENTS); 132 } 133