• Home
  • Line#
  • Scopes#
  • Navigate#
  • Raw
  • Download
1 package org.unicode.cldr.tool;
2 
3 import java.io.IOException;
4 import java.io.PrintWriter;
5 import java.io.StringWriter;
6 import java.util.Set;
7 import java.util.TreeMap;
8 import java.util.TreeSet;
9 
10 import org.unicode.cldr.draft.FileUtilities;
11 import org.unicode.cldr.icu.LDMLConstants;
12 import org.unicode.cldr.tool.Option.Options;
13 import org.unicode.cldr.util.CLDRConfig;
14 import org.unicode.cldr.util.CLDRFile;
15 import org.unicode.cldr.util.CLDRPaths;
16 import org.unicode.cldr.util.CLDRTool;
17 import org.unicode.cldr.util.SimpleXMLSource;
18 import org.unicode.cldr.util.XPathParts;
19 import org.unicode.cldr.util.XPathParts.Comments.CommentType;
20 
21 import com.ibm.icu.impl.Relation;
22 import com.ibm.icu.lang.UCharacter;
23 import com.ibm.icu.text.Normalizer2;
24 import com.ibm.icu.text.Transliterator;
25 import com.ibm.icu.text.UnicodeSet;
26 import com.ibm.icu.util.VersionInfo;
27 
28 /**
29  * This tool is manually run to generate *part* of the ar.xml
30  * collation tailorings: the mappings from presentation forms to
31  * identical and tertiary equivalents of the normal forms.
32  *
33  * To generate ar.xml, it is used with default options.
34  *
35  * By Steven R. Loomis (srl) thx Markus Scherer
36  *
37  */
38 @CLDRTool(alias = "generatedecompcollrules",
39     description = "based on decomposition, generate identical/tertiary collation rules. Used to generate collation/ar.xml.",
40     hidden = "Run manually to generate collation/ar.xml - not general purpose.")
41 public class GenerateDecompCollationRules {
42 
43     private static final char SINGLEQUOTE = '\'';
44 
45     private final static UnicodeSet isWord = new UnicodeSet("[\\uFDF0-\\uFDFF]");
46 
47     private final static String RESET = "\u200E&";
48     private final static String IDENTICAL = "\u200E=";
49     private final static String TERTIARY = "\u200E<<<";
50     private final static String COMMENT = "# ";
51     private final static String NL = "\n";
52 
53     private static final Options myOptions = new Options(GenerateDecompCollationRules.class);
54 
55     enum MyOptions {
56         unicodeset(".*", "[[:dt=init:][:dt=med:][:dt=fin:][:dt=iso:]]", "UnicodeSet of input chars"), verbose(null, null, "verbose debugging messages");
57 
58         // boilerplate
59         final Option option;
60 
MyOptions(String argumentPattern, String defaultArgument, String helpText)61         MyOptions(String argumentPattern, String defaultArgument, String helpText) {
62             option = myOptions.add(this, argumentPattern, defaultArgument, helpText);
63         }
64     }
65 
66     final static Transliterator hex = Transliterator.getInstance("any-hex");
67     final static Transliterator hexForComment = Transliterator.getInstance("[^ ] any-hex");
68     final static Transliterator name = Transliterator.getInstance("any-name");
69     final static Transliterator escapeRules = Transliterator.getInstance("nfc;[[:Mn:]] any-hex");
70 
main(String[] args)71     public static void main(String[] args) throws IOException {
72         myOptions.parse(MyOptions.verbose, args, true);
73         final boolean verbose = myOptions.get(MyOptions.verbose).doesOccur();
74         final CLDRConfig cldrConfig = CLDRConfig.getInstance();
75         final Normalizer2 nfkd = Normalizer2.getNFKDInstance();
76         final Normalizer2 nfc = Normalizer2.getNFCInstance();
77 
78         if (false) {
79             final String astr = "\uFE70";
80             final String astr_nfkd = nfkd.normalize(astr);
81             final String astr_nfkd_nfc = nfc.normalize(astr_nfkd);
82             System.out.println("'" + astr + "'=" + hex.transform(astr) + ", NFKD: '" + astr_nfkd + "'=" + hex.transform(astr_nfkd));
83             System.out.println(" NFC: '" + astr_nfkd_nfc + "'=" + hex.transform(astr_nfkd_nfc));
84             System.out.println(" escapeRules(astr): '" + escapeRules.transform(astr));
85             System.out.println(" escapeRules(astr_nfkd): '" + escapeRules.transform(astr_nfkd));
86         }
87 
88         UnicodeSet uSet;
89         Option uSetOption = myOptions.get(MyOptions.unicodeset);
90         final String uSetRules = uSetOption.doesOccur() ? uSetOption.getValue() : uSetOption.getDefaultArgument();
91         System.out.println("UnicodeSet rules: " + uSetRules);
92         try {
93             uSet = new UnicodeSet(uSetRules);
94         } catch (Throwable t) {
95             t.printStackTrace();
96             System.err.println("Failed to construct UnicodeSet from \"" + uSetRules + "\" - see http://unicode.org/cldr/utility/list-unicodeset.jsp");
97             return;
98         }
99         System.out.println("UnicodeSet size: " + uSet.size());
100 
101         final Relation<String, String> reg2pres = new Relation(new TreeMap<String, Set<String>>(), TreeSet.class);
102 
103         for (final String presForm : uSet) {
104             final String regForm = nfkd.normalize(presForm).trim();
105             if (verbose) System.out.println("# >" + presForm + "< = " + hex.transliterate(presForm) + "... ->" +
106                 regForm + "=" + hex.transliterate(regForm));
107             if (regForm.length() > 31 || presForm.length() > 31) {
108                 System.out.println("!! Skipping, TOO LONG: " + presForm + " -> " + regForm);
109             } else {
110                 reg2pres.put(regForm, presForm);
111             }
112         }
113         System.out.println("Relation size: " + reg2pres.size());
114 
115         StringBuilder rules = new StringBuilder();
116 
117         rules.append(COMMENT)
118             .append("Generated by " + GenerateDecompCollationRules.class.getSimpleName() + NL +
119                 COMMENT + "ICU v" + VersionInfo.ICU_VERSION + ", Unicode v" +
120                 UCharacter.getUnicodeVersion() + NL +
121                 COMMENT + "from rules " + uSetRules + NL + COMMENT + NL);
122 
123         for (final String regForm : reg2pres.keySet()) {
124             final Set<String> presForms = reg2pres.get(regForm);
125 
126             final String relation = (presForms.size() == 1) &&
127                 isWord.containsAll(presForms.iterator().next()) ? TERTIARY : // only pres form is a word.
128                     IDENTICAL; // all other cases.
129 
130             // COMMENT
131             rules.append(COMMENT)
132                 .append(RESET)
133                 .append(hexForComment.transliterate(regForm));
134 
135             for (final String presForm : presForms) {
136                 rules.append(relation)
137                     .append(hexForComment.transliterate(presForm));
138             }
139             rules.append(NL);
140 
141             // ACTUAL RULE
142             rules.append(RESET)
143                 .append(toRule(regForm));
144 
145             for (final String presForm : presForms) {
146                 rules.append(relation)
147                     .append(toRule(presForm));
148             }
149             rules.append(NL);
150         }
151 
152         if (verbose) {
153             System.out.println(rules);
154         }
155 
156         // now, generate the output file
157         XPathParts xpp = new XPathParts(null, null)
158             .addElements(LDMLConstants.LDML,
159                 LDMLConstants.COLLATIONS,
160                 LDMLConstants.COLLATION,
161                 "cr");
162         // The following crashes. Bug #XXXX
163         //xpp.setAttribute(-1, LDMLConstants.COLLATION, LDMLConstants.STANDARD);
164         SimpleXMLSource xmlSource = new SimpleXMLSource("ar");
165         CLDRFile newFile = new CLDRFile(xmlSource);
166         newFile.add(xpp.toString(), "xyzzy");
167         newFile.addComment(xpp.toString(), "Generated by " + GenerateDecompCollationRules.class.getSimpleName() + " " + new java.util.Date() + "\n" +
168             "from rules " + uSetRules + "\n", CommentType.PREBLOCK);
169         final String filename = newFile.getLocaleID() + ".xml";
170         StringWriter sw = new StringWriter();
171         newFile.write(new PrintWriter(sw));
172         sw.close();
173         try (PrintWriter w = FileUtilities.openUTF8Writer(CLDRPaths.GEN_DIRECTORY, filename)) {
174             w.print(sw.toString().replace("xyzzy",
175                 "<![CDATA[\n" +
176                     rules.toString().replaceAll("\\\\u0020", "\\\\\\\\u0020") +
177                     "\n" + "]]>"));
178             //newFile.write(w);
179             System.out.println("Wrote to " + CLDRPaths.GEN_DIRECTORY + "/" + filename);
180         }
181 
182     }
183 
184     /**
185      * convert a rule to the right form for escaping.
186      * @param rule
187      * @return
188      */
toRule(String rule)189     private static String toRule(String rule) {
190         final String asHex = escapeRules.transform(rule);
191         // quote any strings with spaces
192         if (asHex.contains(" ")) {
193             final StringBuilder sb = new StringBuilder(rule.length());
194             sb.append(SINGLEQUOTE)
195                 .append(asHex)
196                 .append(SINGLEQUOTE);
197             return sb.toString();
198         } else {
199             return asHex;
200         }
201     }
202 }
203