• Home
  • Line#
  • Scopes#
  • Navigate#
  • Raw
  • Download
1 package org.unicode.cldr.draft;
2 
3 import java.util.ArrayList;
4 import java.util.List;
5 import java.util.regex.Matcher;
6 import java.util.regex.Pattern;
7 
8 import org.unicode.cldr.draft.PatternFixer.Target;
9 
10 import com.ibm.icu.impl.Utility;
11 import com.ibm.icu.text.Normalizer;
12 import com.ibm.icu.text.StringTransform;
13 import com.ibm.icu.text.Transliterator;
14 import com.ibm.icu.text.UnicodeSet;
15 
16 public class RegexTransformBuilder {
17     static final boolean DEBUG = false;
18     private static final boolean SKIP_BAD = true;
19 
20     // initially just very rough rule parser, for proof-of-concept
createFromRules(String string)21     public static StringTransform createFromRules(String string) {
22         List<StringTransform> compound = new ArrayList<>();
23 
24         List<Rule> rules = new ArrayList<>();
25         String[] ruleSet = string.split(";");
26         Matcher m = RULE_PATTERN.matcher("");
27         List<String> results = new ArrayList<>();
28         Matcher variable = VARIABLE.matcher("");
29         UnicodeSet filter = null;
30 
31         if (DEBUG) System.out.println();
32 
33         for (String ruleString : ruleSet) {
34             ruleString = ruleString.trim();
35             if (DEBUG) System.out.print(ruleString + "\t=>\t");
36 
37             if (ruleString.startsWith("::")) {
38                 if (rules.size() != 0) {
39                     compound.add(new RegexTransform(rules));
40                     rules.clear();
41                 }
42                 final String body = ruleString.substring(2).trim();
43                 if (body.equalsIgnoreCase("NULL")) {
44                     // nothing
45                     if (DEBUG) System.out.println();
46                 } else if (UnicodeSet.resemblesPattern(body, 0)) {
47                     filter = new UnicodeSet(body);
48                     if (DEBUG) System.out.println(":: " + filter + " ;");
49                 } else {
50                     // if we didn't find a filter, it is a Transliterator
51                     final Transliterator translit = Transliterator.getInstance(body.trim());
52                     compound.add(translit);
53                     if (DEBUG) System.out.println(":: " + translit + " ;");
54                 }
55                 continue;
56             }
57             if (!m.reset(ruleString).matches()) {
58                 if (SKIP_BAD) {
59                     System.out.println("BAD RULE");
60                     continue;
61                 } else {
62                     throw new IllegalArgumentException("Bad rule: {" + Utility.escape(ruleString) + "} ;");
63                 }
64             }
65 
66             String pre = m.group(1);
67             if (pre == null) {
68                 pre = "";
69             } else {
70                 pre = fix(pre);
71             }
72 
73             String main = fix(m.group(2));
74             if (m.group(3) != null) {
75                 main += "(?=" + fix(m.group(3)) + ")";
76             }
77 
78             results.clear();
79             String result = m.group(4).trim();
80             variable.reset(result);
81             int last = 0;
82             while (true) {
83                 if (!variable.find()) {
84                     results.add(result.substring(last));
85                     break;
86                 } else {
87                     results.add(result.substring(last, variable.start()));
88                     results.add(variable.group());
89                     last = variable.end();
90                 }
91             }
92             try {
93                 Rule rule = new Rule(pre, main, results);
94                 if (DEBUG) System.out.println(rule);
95                 rules.add(rule);
96             } catch (Exception e) {
97                 System.out.println("BAD:\t" + e.getMessage());
98             }
99         }
100 
101         // add any trailing rules
102         if (rules.size() != 0) {
103             compound.add(new RegexTransform(rules));
104             rules.clear();
105         }
106 
107         // generate final result
108         StringTransform result = compound.size() == 1 ? compound.get(0) : new CompoundTransform(compound);
109         if (filter != null) {
110             return new UnicodeSetFilteredTransform(filter, result);
111         }
112         return result;
113     }
114 
fix(String pattern)115     private static String fix(String pattern) {
116         pattern = pattern.trim();
117         // TODO fix pattern to not have anything but NFD in patterns
118         PATTERN_FIXER.fix(pattern);
119         pattern = Normalizer.decompose(pattern, false);
120         // pre = pre.replace("[:", "\\p{");
121         // pre = pre.replace(":]", "}");
122         return pattern;
123     }
124 
125     private static final PatternFixer PATTERN_FIXER = new PatternFixer(Target.JAVA);
126 
127     static Pattern RULE_PATTERN = Pattern.compile(
128         "(?:([^{}>]*) \\{)?" +
129             "([^}<>]*)" +
130             "(?:\\} ([^<>]*))?" +
131             "<?> (.*)",
132         Pattern.COMMENTS);
133     static Pattern VARIABLE = Pattern.compile(
134         "\\$[0-9]",
135         Pattern.COMMENTS);
136 }
137