1 package org.unicode.cldr.draft; 2 3 import java.util.ArrayList; 4 import java.util.List; 5 import java.util.regex.Matcher; 6 import java.util.regex.Pattern; 7 8 import org.unicode.cldr.draft.PatternFixer.Target; 9 10 import com.ibm.icu.impl.Utility; 11 import com.ibm.icu.text.Normalizer; 12 import com.ibm.icu.text.StringTransform; 13 import com.ibm.icu.text.Transliterator; 14 import com.ibm.icu.text.UnicodeSet; 15 16 public class RegexTransformBuilder { 17 static final boolean DEBUG = false; 18 private static final boolean SKIP_BAD = true; 19 20 // initially just very rough rule parser, for proof-of-concept createFromRules(String string)21 public static StringTransform createFromRules(String string) { 22 List<StringTransform> compound = new ArrayList<>(); 23 24 List<Rule> rules = new ArrayList<>(); 25 String[] ruleSet = string.split(";"); 26 Matcher m = RULE_PATTERN.matcher(""); 27 List<String> results = new ArrayList<>(); 28 Matcher variable = VARIABLE.matcher(""); 29 UnicodeSet filter = null; 30 31 if (DEBUG) System.out.println(); 32 33 for (String ruleString : ruleSet) { 34 ruleString = ruleString.trim(); 35 if (DEBUG) System.out.print(ruleString + "\t=>\t"); 36 37 if (ruleString.startsWith("::")) { 38 if (rules.size() != 0) { 39 compound.add(new RegexTransform(rules)); 40 rules.clear(); 41 } 42 final String body = ruleString.substring(2).trim(); 43 if (body.equalsIgnoreCase("NULL")) { 44 // nothing 45 if (DEBUG) System.out.println(); 46 } else if (UnicodeSet.resemblesPattern(body, 0)) { 47 filter = new UnicodeSet(body); 48 if (DEBUG) System.out.println(":: " + filter + " ;"); 49 } else { 50 // if we didn't find a filter, it is a Transliterator 51 final Transliterator translit = Transliterator.getInstance(body.trim()); 52 compound.add(translit); 53 if (DEBUG) System.out.println(":: " + translit + " ;"); 54 } 55 continue; 56 } 57 if (!m.reset(ruleString).matches()) { 58 if (SKIP_BAD) { 59 System.out.println("BAD RULE"); 60 continue; 61 } else { 62 throw new IllegalArgumentException("Bad rule: {" + Utility.escape(ruleString) + "} ;"); 63 } 64 } 65 66 String pre = m.group(1); 67 if (pre == null) { 68 pre = ""; 69 } else { 70 pre = fix(pre); 71 } 72 73 String main = fix(m.group(2)); 74 if (m.group(3) != null) { 75 main += "(?=" + fix(m.group(3)) + ")"; 76 } 77 78 results.clear(); 79 String result = m.group(4).trim(); 80 variable.reset(result); 81 int last = 0; 82 while (true) { 83 if (!variable.find()) { 84 results.add(result.substring(last)); 85 break; 86 } else { 87 results.add(result.substring(last, variable.start())); 88 results.add(variable.group()); 89 last = variable.end(); 90 } 91 } 92 try { 93 Rule rule = new Rule(pre, main, results); 94 if (DEBUG) System.out.println(rule); 95 rules.add(rule); 96 } catch (Exception e) { 97 System.out.println("BAD:\t" + e.getMessage()); 98 } 99 } 100 101 // add any trailing rules 102 if (rules.size() != 0) { 103 compound.add(new RegexTransform(rules)); 104 rules.clear(); 105 } 106 107 // generate final result 108 StringTransform result = compound.size() == 1 ? compound.get(0) : new CompoundTransform(compound); 109 if (filter != null) { 110 return new UnicodeSetFilteredTransform(filter, result); 111 } 112 return result; 113 } 114 fix(String pattern)115 private static String fix(String pattern) { 116 pattern = pattern.trim(); 117 // TODO fix pattern to not have anything but NFD in patterns 118 PATTERN_FIXER.fix(pattern); 119 pattern = Normalizer.decompose(pattern, false); 120 // pre = pre.replace("[:", "\\p{"); 121 // pre = pre.replace(":]", "}"); 122 return pattern; 123 } 124 125 private static final PatternFixer PATTERN_FIXER = new PatternFixer(Target.JAVA); 126 127 static Pattern RULE_PATTERN = Pattern.compile( 128 "(?:([^{}>]*) \\{)?" + 129 "([^}<>]*)" + 130 "(?:\\} ([^<>]*))?" + 131 "<?> (.*)", 132 Pattern.COMMENTS); 133 static Pattern VARIABLE = Pattern.compile( 134 "\\$[0-9]", 135 Pattern.COMMENTS); 136 } 137