1 package org.unicode.cldr.tool; 2 3 import java.io.File; 4 import java.io.PrintWriter; 5 import java.util.Map; 6 import java.util.concurrent.ConcurrentHashMap; 7 8 import org.unicode.cldr.draft.FileUtilities; 9 import org.unicode.cldr.util.CLDRFile; 10 import org.unicode.cldr.util.CLDRPaths; 11 import org.unicode.cldr.util.CLDRTransforms; 12 import org.unicode.cldr.util.CLDRTransforms.ParsedTransformID; 13 import org.unicode.cldr.util.CldrUtility; 14 import org.unicode.cldr.util.DtdType; 15 import org.unicode.cldr.util.Factory; 16 import org.unicode.cldr.util.LocaleIDParser; 17 import org.unicode.cldr.util.SimpleFactory.NoSourceDirectoryException; 18 import org.unicode.cldr.util.SimpleXMLSource; 19 import org.unicode.cldr.util.XMLSource; 20 21 import com.ibm.icu.text.Normalizer; 22 import com.ibm.icu.text.Transliterator; 23 import com.ibm.icu.text.UnicodeSet; 24 import com.ibm.icu.util.ICUUncheckedIOException; 25 26 /** 27 * Transforms the contents of a CLDRFile. 28 * 29 * @author jchye 30 */ 31 public class CLDRFileTransformer { 32 /** 33 * Contains all supported locale-to-locale conversions along with information 34 * needed to convert each locale. Each enum value is named after the locale that results 35 * from the conversion. 36 */ 37 enum PolicyIfExisting { 38 RETAIN, DISCARD, MINIMIZE 39 } 40 41 public enum LocaleTransform { 42 sr_Latn("sr", "Serbian-Latin-BGN.xml", Transliterator.FORWARD, "[:script=Cyrl:]", PolicyIfExisting.DISCARD), // 43 sr_Latn_BA("sr_Cyrl_BA", "Serbian-Latin-BGN.xml", Transliterator.FORWARD, "[:script=Cyrl:]", PolicyIfExisting.DISCARD), // 44 sr_Latn_ME("sr_Cyrl_ME", "Serbian-Latin-BGN.xml", Transliterator.FORWARD, "[:script=Cyrl:]", PolicyIfExisting.DISCARD), // 45 sr_Latn_XK("sr_Cyrl_XK", "Serbian-Latin-BGN.xml", Transliterator.FORWARD, "[:script=Cyrl:]", PolicyIfExisting.DISCARD), // 46 ha_NE("ha", "ha-ha_NE.xml", Transliterator.FORWARD, "[y Y ƴ Ƴ ʼ]", PolicyIfExisting.DISCARD), // 47 yo_BJ("yo", "yo-yo_BJ.xml", Transliterator.FORWARD, "[ẹ ọ ṣ Ẹ Ọ Ṣ]", PolicyIfExisting.DISCARD), // 48 de_CH("de", "[ß] Casefold", Transliterator.FORWARD, "[ß]", PolicyIfExisting.MINIMIZE), // 49 yue_Hans("yue", "Simplified-Traditional.xml", Transliterator.REVERSE, "[:script=Hant:]", PolicyIfExisting.RETAIN), // 50 // en_NZ("en_AU", "null", Transliterator.FORWARD, "[]", PolicyIfExisting.DISCARD), 51 // Needs work to fix currency symbols, handle Maori. See http://unicode.org/cldr/trac/ticket/9516#comment:6 52 ; 53 54 private final String inputLocale; 55 private final String transformFilename; 56 private final int direction; 57 private final UnicodeSet inputChars; 58 private final PolicyIfExisting policy; 59 60 /** 61 * @deprecated Use {@link #LocaleTransform(String,String,int,String,PolicyIfExisting)} instead 62 */ LocaleTransform(String inputLocale, String transformFilename, int direction, String inputCharPattern)63 private LocaleTransform(String inputLocale, String transformFilename, int direction, String inputCharPattern) { 64 this(inputLocale, transformFilename, direction, inputCharPattern, PolicyIfExisting.DISCARD); 65 } 66 LocaleTransform(String inputLocale, String transformFilename, int direction, String inputCharPattern, PolicyIfExisting policy)67 private LocaleTransform(String inputLocale, String transformFilename, int direction, String inputCharPattern, PolicyIfExisting policy) { 68 this.inputLocale = inputLocale; 69 this.transformFilename = transformFilename; 70 this.direction = direction; 71 this.inputChars = new UnicodeSet(inputCharPattern); 72 this.policy = policy; 73 } 74 75 /** 76 * @return the locale that used for conversion 77 */ getInputLocale()78 public String getInputLocale() { 79 return inputLocale; 80 } 81 82 /** 83 * @return the locale that used for conversion 84 */ getOutputLocale()85 public String getOutputLocale() { 86 return this.toString(); 87 } 88 89 /** 90 * @return the filename of the transform used to make the conversion 91 */ getTransformFilename()92 public String getTransformFilename() { 93 return transformFilename; 94 } 95 96 /** 97 * @return the direction of the transformation 98 */ getDirection()99 public int getDirection() { 100 return direction; 101 } 102 103 /** 104 * @return the set of characters in the input locale that should have been removed after 105 * transformation, used for internal debugging 106 */ getInputChars()107 private UnicodeSet getInputChars() { 108 return inputChars; 109 } 110 } 111 112 private UnicodeSet unconverted = new UnicodeSet(); 113 private Factory factory; 114 /* 115 * The transliterators map exists, and is static, to avoid wasting a lot of time creating 116 * a new Transliterator more often than necessary. (An alternative to "static" here might be to 117 * create only one CLDRFileTransformer, maybe as a member of ExampleGenerator.) 118 * Use ConcurrentHashMap rather than HashMap to avoid concurrency problems. 119 * Reference: https://unicode.org/cldr/trac/ticket/11657 120 */ 121 private static Map<LocaleTransform, Transliterator> transliterators = new ConcurrentHashMap<LocaleTransform, Transliterator>(); 122 private String transformDir; 123 124 /** 125 * @param factory 126 * the factory to get locale data from 127 * @param transformDir 128 * the directory containing the transform files 129 */ CLDRFileTransformer(Factory factory, String transformDir)130 public CLDRFileTransformer(Factory factory, String transformDir) { 131 this.factory = factory; 132 this.transformDir = transformDir; 133 } 134 loadTransliterator(LocaleTransform localeTransform)135 public Transliterator loadTransliterator(LocaleTransform localeTransform) { 136 if (transliterators.containsKey(localeTransform)) { 137 return transliterators.get(localeTransform); 138 } 139 Transliterator transliterator; 140 if (localeTransform.getTransformFilename().contains(".xml")) { 141 ParsedTransformID directionInfo = new ParsedTransformID(); 142 String ruleString = CLDRTransforms.getIcuRulesFromXmlFile(transformDir, localeTransform.getTransformFilename(), directionInfo); 143 transliterator = Transliterator.createFromRules(directionInfo.getId(), 144 ruleString, localeTransform.getDirection()); 145 transliterators.put(localeTransform, transliterator); 146 } else { 147 transliterator = Transliterator.getInstance(localeTransform.getTransformFilename()); 148 } 149 return transliterator; 150 } 151 152 /** 153 * NOTE: This method does not currently handle nested transliterators. 154 * 155 * @param input 156 * @return null if the input file was missing, or if there is no new output file. 157 */ transform(LocaleTransform localeTransform)158 public CLDRFile transform(LocaleTransform localeTransform) { 159 Transliterator transliterator = loadTransliterator(localeTransform); 160 CLDRFile input; 161 try { 162 input = factory.make(localeTransform.getInputLocale(), false); 163 } catch (ICUUncheckedIOException e1) { 164 return null; // input file is missing (or otherwise unavailable) 165 } 166 boolean hadOutput = true; 167 CLDRFile output; 168 try { 169 output = factory.make(localeTransform.getOutputLocale(), false); 170 } catch (NoSourceDirectoryException e) { 171 // if we can't open the file, then just make a new one. 172 XMLSource dataSource = new SimpleXMLSource(localeTransform.getOutputLocale()); 173 output = new CLDRFile(dataSource); 174 hadOutput = false; 175 } 176 String outputParentString = LocaleIDParser.getParent(localeTransform.getOutputLocale()); 177 CLDRFile outputParent = factory.make(outputParentString, true); 178 179 outputParent = factory.make(localeTransform.getInputLocale(), false); 180 XMLSource outputSource = new SimpleXMLSource(localeTransform.toString()); 181 for (String xpath : input) { 182 String value = input.getStringValue(xpath); 183 if (CldrUtility.INHERITANCE_MARKER.equals(value)) { 184 value = null; 185 } 186 if (value == null) { 187 continue; 188 } 189 String fullPath = input.getFullXPath(xpath); 190 String oldValue = output.getStringValue(xpath); 191 String parentValue = outputParent.getStringValue(xpath); 192 value = transformValue(transliterator, localeTransform, xpath, value, oldValue, parentValue); 193 if (value != null && !CldrUtility.INHERITANCE_MARKER.equals(value)) { 194 outputSource.putValueAtPath(fullPath, value); 195 } 196 } 197 if (!outputSource.iterator().hasNext()) { // empty new output 198 if (!hadOutput) { 199 return null; // don't add file if nothing to add 200 } 201 } 202 return new CLDRFile(outputSource); 203 } 204 205 /** 206 * Transforms a CLDRFile value into another form. 207 * @param parentValue 208 */ transformValue(Transliterator transliterator, LocaleTransform localeTransform, String path, String value, String oldValue, String parentValue)209 private String transformValue(Transliterator transliterator, LocaleTransform localeTransform, String path, String value, 210 String oldValue, String parentValue) { 211 212 // allows us to change only new values 213 switch (localeTransform.policy) { 214 case RETAIN: 215 case MINIMIZE: 216 if (oldValue != null) { 217 return oldValue; 218 } 219 break; 220 default: 221 } 222 223 UnicodeSet chars = localeTransform.getInputChars(); 224 String transliterated; 225 226 // TODO: Don't transform dates/patterns. 227 // For now, don't try to transliterate the exemplar characters - use the ones from the original locale. 228 // In the future, we can probably control this better with a config file - similar to CLDRModify's config file. 229 if (path.contains("exemplarCharacters")) { 230 if (oldValue != null) { 231 transliterated = oldValue; 232 } else { 233 transliterated = value; 234 } 235 } else { 236 transliterated = transliterator.transliterate(value); 237 transliterated = Normalizer.compose(transliterated, false); 238 } 239 if (localeTransform.policy == PolicyIfExisting.MINIMIZE) { 240 if (transliterated.equals(value)) { 241 return null; 242 } 243 } 244 245 if (chars.containsSome(transliterated)) { 246 unconverted.addAll(new UnicodeSet().addAll(chars).retainAll(transliterated)); 247 } 248 return transliterated; 249 } 250 main(String[] args)251 public static void main(String[] args) throws Exception { 252 for (String dir : DtdType.ldml.directories) { 253 if (dir.equals("casing") // skip, field contents are keywords, not localizable content 254 || dir.equals("collation") // skip, field contents are complex, and can't be simply remapped 255 || dir.equals("annotationsDerived") // skip, derived later 256 ) { 257 continue; 258 } 259 System.out.println("\nDirectory: " + dir); 260 Factory factory = Factory.make(CLDRPaths.COMMON_DIRECTORY + dir + "/", ".*"); 261 CLDRFileTransformer transformer = new CLDRFileTransformer(factory, CLDRPaths.COMMON_DIRECTORY + "transforms" + File.separator); 262 for (LocaleTransform localeTransform : LocaleTransform.values()) { 263 CLDRFile output = transformer.transform(localeTransform); 264 if (output == null) { 265 System.out.println("SKIPPING missing file: " + dir + "/" + localeTransform.inputLocale + ".xml"); 266 continue; 267 } 268 String outputDir = CLDRPaths.GEN_DIRECTORY + "common/" + dir + File.separator; 269 String outputFile = output.getLocaleID() + ".xml"; 270 PrintWriter out = FileUtilities.openUTF8Writer(outputDir, outputFile); 271 System.out.println("Generating locale file: " + outputDir + outputFile); 272 if (!transformer.unconverted.isEmpty()) { 273 System.out.println("Untransformed characters: " + transformer.unconverted); 274 transformer.unconverted.clear(); 275 } 276 output.write(out); 277 out.close(); 278 } 279 } 280 } 281 } 282