• Home
  • Line#
  • Scopes#
  • Navigate#
  • Raw
  • Download
1 package org.unicode.cldr.draft;
2 
3 import com.ibm.icu.impl.Utility;
4 import com.ibm.icu.text.Normalizer;
5 import com.ibm.icu.text.StringTransform;
6 import com.ibm.icu.text.Transliterator;
7 import com.ibm.icu.text.UnicodeSet;
8 import java.util.ArrayList;
9 import java.util.List;
10 import java.util.regex.Matcher;
11 import java.util.regex.Pattern;
12 import org.unicode.cldr.draft.PatternFixer.Target;
13 
14 public class RegexTransformBuilder {
15     static final boolean DEBUG = false;
16     private static final boolean SKIP_BAD = true;
17 
18     // initially just very rough rule parser, for proof-of-concept
createFromRules(String string)19     public static StringTransform createFromRules(String string) {
20         List<StringTransform> compound = new ArrayList<>();
21 
22         List<Rule> rules = new ArrayList<>();
23         String[] ruleSet = string.split(";");
24         Matcher m = RULE_PATTERN.matcher("");
25         List<String> results = new ArrayList<>();
26         Matcher variable = VARIABLE.matcher("");
27         UnicodeSet filter = null;
28 
29         if (DEBUG) System.out.println();
30 
31         for (String ruleString : ruleSet) {
32             ruleString = ruleString.trim();
33             if (DEBUG) System.out.print(ruleString + "\t=>\t");
34 
35             if (ruleString.startsWith("::")) {
36                 if (rules.size() != 0) {
37                     compound.add(new RegexTransform(rules));
38                     rules.clear();
39                 }
40                 final String body = ruleString.substring(2).trim();
41                 if (body.equalsIgnoreCase("NULL")) {
42                     // nothing
43                     if (DEBUG) System.out.println();
44                 } else if (UnicodeSet.resemblesPattern(body, 0)) {
45                     filter = new UnicodeSet(body);
46                     if (DEBUG) System.out.println(":: " + filter + " ;");
47                 } else {
48                     // if we didn't find a filter, it is a Transliterator
49                     final Transliterator translit = Transliterator.getInstance(body.trim());
50                     compound.add(translit);
51                     if (DEBUG) System.out.println(":: " + translit + " ;");
52                 }
53                 continue;
54             }
55             if (!m.reset(ruleString).matches()) {
56                 if (SKIP_BAD) {
57                     System.out.println("BAD RULE");
58                     continue;
59                 } else {
60                     throw new IllegalArgumentException(
61                             "Bad rule: {" + Utility.escape(ruleString) + "} ;");
62                 }
63             }
64 
65             String pre = m.group(1);
66             if (pre == null) {
67                 pre = "";
68             } else {
69                 pre = fix(pre);
70             }
71 
72             String main = fix(m.group(2));
73             if (m.group(3) != null) {
74                 main += "(?=" + fix(m.group(3)) + ")";
75             }
76 
77             results.clear();
78             String result = m.group(4).trim();
79             variable.reset(result);
80             int last = 0;
81             while (true) {
82                 if (!variable.find()) {
83                     results.add(result.substring(last));
84                     break;
85                 } else {
86                     results.add(result.substring(last, variable.start()));
87                     results.add(variable.group());
88                     last = variable.end();
89                 }
90             }
91             try {
92                 Rule rule = new Rule(pre, main, results);
93                 if (DEBUG) System.out.println(rule);
94                 rules.add(rule);
95             } catch (Exception e) {
96                 System.out.println("BAD:\t" + e.getMessage());
97             }
98         }
99 
100         // add any trailing rules
101         if (rules.size() != 0) {
102             compound.add(new RegexTransform(rules));
103             rules.clear();
104         }
105 
106         // generate final result
107         StringTransform result =
108                 compound.size() == 1 ? compound.get(0) : new CompoundTransform(compound);
109         if (filter != null) {
110             return new UnicodeSetFilteredTransform(filter, result);
111         }
112         return result;
113     }
114 
fix(String pattern)115     private static String fix(String pattern) {
116         pattern = pattern.trim();
117         // TODO fix pattern to not have anything but NFD in patterns
118         PATTERN_FIXER.fix(pattern);
119         pattern = Normalizer.decompose(pattern, false);
120         // pre = pre.replace("[:", "\\p{");
121         // pre = pre.replace(":]", "}");
122         return pattern;
123     }
124 
125     private static final PatternFixer PATTERN_FIXER = new PatternFixer(Target.JAVA);
126 
127     static Pattern RULE_PATTERN =
128             Pattern.compile(
129                     "(?:([^{}>]*) \\{)?" + "([^}<>]*)" + "(?:\\} ([^<>]*))?" + "<?> (.*)",
130                     Pattern.COMMENTS);
131     static Pattern VARIABLE = Pattern.compile("\\$[0-9]", Pattern.COMMENTS);
132 }
133