1 package org.unicode.cldr.tool; 2 3 import java.io.IOException; 4 import java.io.PrintWriter; 5 import java.io.StringWriter; 6 import java.util.Set; 7 import java.util.TreeMap; 8 import java.util.TreeSet; 9 10 import org.unicode.cldr.draft.FileUtilities; 11 import org.unicode.cldr.icu.LDMLConstants; 12 import org.unicode.cldr.tool.Option.Options; 13 import org.unicode.cldr.util.CLDRConfig; 14 import org.unicode.cldr.util.CLDRFile; 15 import org.unicode.cldr.util.CLDRPaths; 16 import org.unicode.cldr.util.CLDRTool; 17 import org.unicode.cldr.util.SimpleXMLSource; 18 import org.unicode.cldr.util.XPathParts; 19 import org.unicode.cldr.util.XPathParts.Comments.CommentType; 20 21 import com.ibm.icu.impl.Relation; 22 import com.ibm.icu.lang.UCharacter; 23 import com.ibm.icu.text.Normalizer2; 24 import com.ibm.icu.text.Transliterator; 25 import com.ibm.icu.text.UnicodeSet; 26 import com.ibm.icu.util.VersionInfo; 27 28 /** 29 * This tool is manually run to generate *part* of the ar.xml 30 * collation tailorings: the mappings from presentation forms to 31 * identical and tertiary equivalents of the normal forms. 32 * 33 * To generate ar.xml, it is used with default options. 34 * 35 * By Steven R. Loomis (srl) thx Markus Scherer 36 * 37 */ 38 @CLDRTool(alias = "generatedecompcollrules", 39 description = "based on decomposition, generate identical/tertiary collation rules. Used to generate collation/ar.xml.", 40 hidden = "Run manually to generate collation/ar.xml - not general purpose.") 41 public class GenerateDecompCollationRules { 42 43 private static final char SINGLEQUOTE = '\''; 44 45 private final static UnicodeSet isWord = new UnicodeSet("[\\uFDF0-\\uFDFF]"); 46 47 private final static String RESET = "\u200E&"; 48 private final static String IDENTICAL = "\u200E="; 49 private final static String TERTIARY = "\u200E<<<"; 50 private final static String COMMENT = "# "; 51 private final static String NL = "\n"; 52 53 private static final Options myOptions = new Options(GenerateDecompCollationRules.class); 54 55 enum MyOptions { 56 unicodeset(".*", "[[:dt=init:][:dt=med:][:dt=fin:][:dt=iso:]]", "UnicodeSet of input chars"), verbose(null, null, "verbose debugging messages"); 57 58 // boilerplate 59 final Option option; 60 MyOptions(String argumentPattern, String defaultArgument, String helpText)61 MyOptions(String argumentPattern, String defaultArgument, String helpText) { 62 option = myOptions.add(this, argumentPattern, defaultArgument, helpText); 63 } 64 } 65 66 final static Transliterator hex = Transliterator.getInstance("any-hex"); 67 final static Transliterator hexForComment = Transliterator.getInstance("[^ ] any-hex"); 68 final static Transliterator name = Transliterator.getInstance("any-name"); 69 final static Transliterator escapeRules = Transliterator.getInstance("nfc;[[:Mn:]] any-hex"); 70 main(String[] args)71 public static void main(String[] args) throws IOException { 72 myOptions.parse(MyOptions.verbose, args, true); 73 final boolean verbose = myOptions.get(MyOptions.verbose).doesOccur(); 74 final CLDRConfig cldrConfig = CLDRConfig.getInstance(); 75 final Normalizer2 nfkd = Normalizer2.getNFKDInstance(); 76 final Normalizer2 nfc = Normalizer2.getNFCInstance(); 77 78 if (false) { 79 final String astr = "\uFE70"; 80 final String astr_nfkd = nfkd.normalize(astr); 81 final String astr_nfkd_nfc = nfc.normalize(astr_nfkd); 82 System.out.println("'" + astr + "'=" + hex.transform(astr) + ", NFKD: '" + astr_nfkd + "'=" + hex.transform(astr_nfkd)); 83 System.out.println(" NFC: '" + astr_nfkd_nfc + "'=" + hex.transform(astr_nfkd_nfc)); 84 System.out.println(" escapeRules(astr): '" + escapeRules.transform(astr)); 85 System.out.println(" escapeRules(astr_nfkd): '" + escapeRules.transform(astr_nfkd)); 86 } 87 88 UnicodeSet uSet; 89 Option uSetOption = myOptions.get(MyOptions.unicodeset); 90 final String uSetRules = uSetOption.doesOccur() ? uSetOption.getValue() : uSetOption.getDefaultArgument(); 91 System.out.println("UnicodeSet rules: " + uSetRules); 92 try { 93 uSet = new UnicodeSet(uSetRules); 94 } catch (Throwable t) { 95 t.printStackTrace(); 96 System.err.println("Failed to construct UnicodeSet from \"" + uSetRules + "\" - see http://unicode.org/cldr/utility/list-unicodeset.jsp"); 97 return; 98 } 99 System.out.println("UnicodeSet size: " + uSet.size()); 100 101 final Relation<String, String> reg2pres = new Relation(new TreeMap<String, Set<String>>(), TreeSet.class); 102 103 for (final String presForm : uSet) { 104 final String regForm = nfkd.normalize(presForm).trim(); 105 if (verbose) System.out.println("# >" + presForm + "< = " + hex.transliterate(presForm) + "... ->" + 106 regForm + "=" + hex.transliterate(regForm)); 107 if (regForm.length() > 31 || presForm.length() > 31) { 108 System.out.println("!! Skipping, TOO LONG: " + presForm + " -> " + regForm); 109 } else { 110 reg2pres.put(regForm, presForm); 111 } 112 } 113 System.out.println("Relation size: " + reg2pres.size()); 114 115 StringBuilder rules = new StringBuilder(); 116 117 rules.append(COMMENT) 118 .append("Generated by " + GenerateDecompCollationRules.class.getSimpleName() + NL + 119 COMMENT + "ICU v" + VersionInfo.ICU_VERSION + ", Unicode v" + 120 UCharacter.getUnicodeVersion() + NL + 121 COMMENT + "from rules " + uSetRules + NL + COMMENT + NL); 122 123 for (final String regForm : reg2pres.keySet()) { 124 final Set<String> presForms = reg2pres.get(regForm); 125 126 final String relation = (presForms.size() == 1) && 127 isWord.containsAll(presForms.iterator().next()) ? TERTIARY : // only pres form is a word. 128 IDENTICAL; // all other cases. 129 130 // COMMENT 131 rules.append(COMMENT) 132 .append(RESET) 133 .append(hexForComment.transliterate(regForm)); 134 135 for (final String presForm : presForms) { 136 rules.append(relation) 137 .append(hexForComment.transliterate(presForm)); 138 } 139 rules.append(NL); 140 141 // ACTUAL RULE 142 rules.append(RESET) 143 .append(toRule(regForm)); 144 145 for (final String presForm : presForms) { 146 rules.append(relation) 147 .append(toRule(presForm)); 148 } 149 rules.append(NL); 150 } 151 152 if (verbose) { 153 System.out.println(rules); 154 } 155 156 // now, generate the output file 157 XPathParts xpp = new XPathParts() 158 .addElements(LDMLConstants.LDML, 159 LDMLConstants.COLLATIONS, 160 LDMLConstants.COLLATION, 161 "cr"); 162 // The following crashes. Bug #XXXX 163 //xpp.setAttribute(-1, LDMLConstants.COLLATION, LDMLConstants.STANDARD); 164 SimpleXMLSource xmlSource = new SimpleXMLSource("ar"); 165 CLDRFile newFile = new CLDRFile(xmlSource); 166 newFile.add(xpp.toString(), "xyzzy"); 167 newFile.addComment(xpp.toString(), "Generated by " + GenerateDecompCollationRules.class.getSimpleName() + " " + new java.util.Date() + "\n" + 168 "from rules " + uSetRules + "\n", CommentType.PREBLOCK); 169 final String filename = newFile.getLocaleID() + ".xml"; 170 StringWriter sw = new StringWriter(); 171 newFile.write(new PrintWriter(sw)); 172 sw.close(); 173 try (PrintWriter w = FileUtilities.openUTF8Writer(CLDRPaths.GEN_DIRECTORY, filename)) { 174 w.print(sw.toString().replace("xyzzy", 175 "<![CDATA[\n" + 176 rules.toString().replaceAll("\\\\u0020", "\\\\\\\\u0020") + 177 "\n" + "]]>")); 178 //newFile.write(w); 179 System.out.println("Wrote to " + CLDRPaths.GEN_DIRECTORY + "/" + filename); 180 } 181 182 } 183 184 /** 185 * convert a rule to the right form for escaping. 186 * @param rule 187 * @return 188 */ toRule(String rule)189 private static String toRule(String rule) { 190 final String asHex = escapeRules.transform(rule); 191 // quote any strings with spaces 192 if (asHex.contains(" ")) { 193 final StringBuilder sb = new StringBuilder(rule.length()); 194 sb.append(SINGLEQUOTE) 195 .append(asHex) 196 .append(SINGLEQUOTE); 197 return sb.toString(); 198 } else { 199 return asHex; 200 } 201 } 202 } 203