package org.unicode.cldr.tool;

import java.io.File;
import java.io.PrintWriter;
import java.util.Map;
import java.util.concurrent.ConcurrentHashMap;

import org.unicode.cldr.draft.FileUtilities;
import org.unicode.cldr.test.DisplayAndInputProcessor;
import org.unicode.cldr.util.CLDRFile;
import org.unicode.cldr.util.CLDRPaths;
import org.unicode.cldr.util.CLDRTransforms;
import org.unicode.cldr.util.CLDRTransforms.ParsedTransformID;
import org.unicode.cldr.util.CldrUtility;
import org.unicode.cldr.util.DtdType;
import org.unicode.cldr.util.Factory;
import org.unicode.cldr.util.LocaleIDParser;
import org.unicode.cldr.util.SimpleFactory.NoSourceDirectoryException;
import org.unicode.cldr.util.SimpleXMLSource;
import org.unicode.cldr.util.XMLSource;

import com.ibm.icu.text.Normalizer;
import com.ibm.icu.text.Transliterator;
import com.ibm.icu.text.UnicodeSet;
import com.ibm.icu.util.ICUUncheckedIOException;

/**
 * Transforms the contents of a CLDRFile.
 *
 * @author jchye
 */
public class CLDRFileTransformer {
    /**
     * What to do with a value when the output locale already has content for the same path.
     */
    public enum PolicyIfExisting {
        RETAIN, // Do not transliterate if existing output has locale content
        DISCARD, // Replace existing output locale content
        MINIMIZE // RETAIN, plus drop values if translit is a no-op.
    }

    /**
     * Contains all supported locale-to-locale conversions along with information
     * needed to convert each locale. Each enum value is named after the locale that results
     * from the conversion.
     */
    public enum LocaleTransform {
        sr_Latn("sr", "Serbian-Latin-BGN.xml", Transliterator.FORWARD, "[:script=Cyrl:]", PolicyIfExisting.DISCARD), //
        sr_Latn_BA("sr_Cyrl_BA", "Serbian-Latin-BGN.xml", Transliterator.FORWARD, "[:script=Cyrl:]", PolicyIfExisting.DISCARD), //
        sr_Latn_ME("sr_Cyrl_ME", "Serbian-Latin-BGN.xml", Transliterator.FORWARD, "[:script=Cyrl:]", PolicyIfExisting.DISCARD), //
        sr_Latn_XK("sr_Cyrl_XK", "Serbian-Latin-BGN.xml", Transliterator.FORWARD, "[:script=Cyrl:]", PolicyIfExisting.DISCARD), //
        ha_NE("ha", "ha-ha_NE.xml", Transliterator.FORWARD, "[y Y ƴ Ƴ ʼ]", PolicyIfExisting.DISCARD), //
        yo_BJ("yo", "yo-yo_BJ.xml", Transliterator.FORWARD, "[ẹ ọ ṣ Ẹ Ọ Ṣ]", PolicyIfExisting.DISCARD), //
        de_CH("de", "[ß] Casefold", Transliterator.FORWARD, "[ß]", PolicyIfExisting.MINIMIZE), //
        yue_Hans("yue", "Simplified-Traditional.xml", Transliterator.REVERSE, "[:script=Hant:]", PolicyIfExisting.RETAIN), //
        // en_NZ("en_AU", "null", Transliterator.FORWARD, "[]", PolicyIfExisting.DISCARD),
        // Needs work to fix currency symbols, handle Māori. See http://unicode.org/cldr/trac/ticket/9516#comment:6
        ;

        private final String inputLocale;
        private final String transformFilename;
        private final int direction;
        private final UnicodeSet inputChars;
        private final PolicyIfExisting policy;

        /**
         * Creates a transform that uses {@link PolicyIfExisting#DISCARD}.
         *
         * @deprecated Use {@link #LocaleTransform(String,String,int,String,PolicyIfExisting)} instead
         */
        @Deprecated
        private LocaleTransform(String inputLocale, String transformFilename, int direction, String inputCharPattern) {
            this(inputLocale, transformFilename, direction, inputCharPattern, PolicyIfExisting.DISCARD);
        }

        /**
         * @param inputLocale
         *            the locale whose data is converted
         * @param transformFilename
         *            either the name of a transform XML file, or a transliterator ID
         *            (see {@link CLDRFileTransformer#loadTransliterator(LocaleTransform)})
         * @param direction
         *            {@link Transliterator#FORWARD} or {@link Transliterator#REVERSE}
         * @param inputCharPattern
         *            UnicodeSet pattern of the characters that should no longer be present after conversion
         * @param policy
         *            how to treat values that already exist in the output locale
         */
        private LocaleTransform(String inputLocale, String transformFilename, int direction, String inputCharPattern, PolicyIfExisting policy) {
            this.inputLocale = inputLocale;
            this.transformFilename = transformFilename;
            this.direction = direction;
            this.inputChars = new UnicodeSet(inputCharPattern);
            this.policy = policy;
        }

        /**
         * @return the policy for existing content
         */
        public PolicyIfExisting getPolicyIfExisting() {
            return policy;
        }

        /**
         * @return the locale that is used as the source of the conversion
         */
        public String getInputLocale() {
            return inputLocale;
        }

        /**
         * @return the locale that results from the conversion (the name of this enum value)
         */
        public String getOutputLocale() {
            return this.toString();
        }

        /**
         * @return the filename of the transform used to make the conversion
         */
        public String getTransformFilename() {
            return transformFilename;
        }

        /**
         * @return the direction of the transformation
         */
        public int getDirection() {
            return direction;
        }

        /**
         * @return the set of characters in the input locale that should have been removed after
         *         transformation, used for internal debugging
         */
        private UnicodeSet getInputChars() {
            return inputChars;
        }
    }

    /** Characters from a transform's input set that were still present after transliteration. */
    private final UnicodeSet unconverted = new UnicodeSet();
    private final Factory factory;
    /*
     * The transliterators map exists, and is static, to avoid wasting a lot of time creating
     * a new Transliterator more often than necessary. (An alternative to "static" here might be to
     * create only one CLDRFileTransformer, maybe as a member of ExampleGenerator.)
     * Use ConcurrentHashMap rather than HashMap to avoid concurrency problems.
     * Reference: https://unicode.org/cldr/trac/ticket/11657
     */
    private static final Map<LocaleTransform, Transliterator> transliterators = new ConcurrentHashMap<>();
    private final String transformDir;

    /**
     * @param factory
     *            the factory to get locale data from
     * @param transformDir
     *            the directory containing the transform files
     */
    public CLDRFileTransformer(Factory factory, String transformDir) {
        this.factory = factory;
        this.transformDir = transformDir;
    }

    /**
     * Returns the transliterator for the given transform, creating and caching it on first use.
     * If the transform's filename contains ".xml", the rules are read from that file in the
     * transform directory; otherwise the name is treated as a transliterator ID.
     *
     * @param localeTransform
     *            the transform to get a transliterator for
     * @return the (possibly cached) transliterator
     */
    public Transliterator loadTransliterator(LocaleTransform localeTransform) {
        // ConcurrentHashMap never stores null, so a single lookup replaces containsKey + get.
        Transliterator cached = transliterators.get(localeTransform);
        if (cached != null) {
            return cached;
        }
        final String transformName = localeTransform.getTransformFilename();
        Transliterator transliterator;
        if (transformName.contains(".xml")) {
            ParsedTransformID directionInfo = new ParsedTransformID();
            String ruleString = CLDRTransforms.getIcuRulesFromXmlFile(transformDir, transformName, directionInfo);
            transliterator = Transliterator.createFromRules(directionInfo.getId(),
                ruleString, localeTransform.getDirection());
        } else {
            // Honor the configured direction for ID-based transforms too. The one-argument
            // getInstance is the FORWARD form, so existing FORWARD entries behave as before.
            transliterator = Transliterator.getInstance(transformName, localeTransform.getDirection());
        }
        transliterators.put(localeTransform, transliterator);
        return transliterator;
    }

    /**
     * Builds a new CLDRFile for the transform's output locale by transliterating the values
     * of the transform's input locale.
     *
     * NOTE: This method does not currently handle nested transliterators.
     *
     * @param localeTransform
     *            the conversion to apply
     * @return null if the input file was missing, or if there is no new output file.
     */
    public CLDRFile transform(LocaleTransform localeTransform) {
        Transliterator transliterator = loadTransliterator(localeTransform);
        final String inputLocale = localeTransform.getInputLocale();
        final String outputLocale = localeTransform.getOutputLocale();

        CLDRFile input;
        try {
            input = factory.make(inputLocale, false);
        } catch (ICUUncheckedIOException e1) {
            return null; // input file is missing (or otherwise unavailable)
        }

        boolean hadOutput = true;
        CLDRFile output;
        try {
            output = factory.make(outputLocale, false);
        } catch (NoSourceDirectoryException e) {
            // if we can't open the file, then just make a new one.
            XMLSource dataSource = new SimpleXMLSource(outputLocale);
            output = new CLDRFile(dataSource);
            hadOutput = false;
        }

        // NOTE(review): the file made for the output locale's parent is discarded immediately,
        // because outputParent is reassigned on the next statement, and the parentValue derived
        // from it is not used by transformValue. Both calls are kept so that the exceptions they
        // can raise are unchanged; confirm which file was intended before simplifying.
        String outputParentString = LocaleIDParser.getParent(outputLocale);
        CLDRFile outputParent = factory.make(outputParentString, true);
        outputParent = factory.make(inputLocale, false);

        XMLSource outputSource = new SimpleXMLSource(outputLocale);
        DisplayAndInputProcessor daip = new DisplayAndInputProcessor(output, true);
        for (String xpath : input) {
            String value = input.getStringValue(xpath);
            if (isInheritanceMarkerFromElsewhere(input, xpath, value, inputLocale)) {
                continue;
            }
            if (value == null) {
                continue;
            }
            String fullPath = input.getFullXPath(xpath);
            String oldValue = output.getStringValue(xpath);
            String parentValue = outputParent.getStringValue(xpath);
            value = transformValue(transliterator, localeTransform, xpath, value, oldValue, parentValue);
            if (value == null) {
                continue;
            }
            // check again: the transformed value (e.g. a retained old value) may itself be a marker
            if (isInheritanceMarkerFromElsewhere(input, xpath, value, inputLocale)) {
                continue;
            }
            value = daip.processInput(xpath, value, null);
            outputSource.putValueAtPath(fullPath, value);
        }

        if (!outputSource.iterator().hasNext() && !hadOutput) {
            return null; // empty new output and no existing file: don't add file if nothing to add
        }
        return new CLDRFile(outputSource);
    }

    /**
     * Tells whether a value is an inheritance marker that is not actually present in the input
     * locale's own file. Such markers are skipped; markers found in the input locale itself are kept.
     *
     * @return true if {@code value} is the inheritance marker and the source locale reported for
     *         {@code xpath} is not {@code inputLocale}
     */
    private static boolean isInheritanceMarkerFromElsewhere(CLDRFile input, String xpath, String value, String inputLocale) {
        if (!CldrUtility.INHERITANCE_MARKER.equals(value)) {
            return false;
        }
        final String foundIn = input.getSourceLocaleID(xpath, null);
        // Compare from the known non-null side so a missing source locale cannot throw.
        return !inputLocale.equals(foundIn);
    }

    /**
     * Transforms a CLDRFile value into another form.
     *
     * @param transliterator
     *            the transliterator to apply
     * @param localeTransform
     *            the conversion being applied; supplies the policy and the input character set
     * @param path
     *            the path of the value
     * @param value
     *            the value from the input locale
     * @param oldValue
     *            the value currently in the output locale, or null if there is none
     * @param parentValue
     *            currently unused
     * @return the value to store in the output; for RETAIN and MINIMIZE this is {@code oldValue}
     *         when it is non-null; null for MINIMIZE when transliteration leaves the value unchanged
     */
    private String transformValue(Transliterator transliterator, LocaleTransform localeTransform, String path, String value,
        String oldValue, String parentValue) {

        final PolicyIfExisting policy = localeTransform.getPolicyIfExisting();

        // allows us to change only new values
        switch (policy) {
        case RETAIN:
        case MINIMIZE:
            if (oldValue != null) {
                return oldValue;
            }
            break;
        default:
        }

        UnicodeSet chars = localeTransform.getInputChars();
        String transliterated;

        // TODO: Don't transform dates/patterns.
        // For now, don't try to transliterate the exemplar characters - use the ones from the original locale.
        // In the future, we can probably control this better with a config file - similar to CLDRModify's config file.
        if (path.contains("exemplarCharacters")) {
            transliterated = (oldValue != null) ? oldValue : value;
        } else {
            transliterated = transliterator.transliterate(value);
            transliterated = Normalizer.compose(transliterated, false);
        }
        if (policy == PolicyIfExisting.MINIMIZE && transliterated.equals(value)) {
            return null;
        }

        // Record any input-set characters that survived, so main can report them.
        if (chars.containsSome(transliterated)) {
            unconverted.addAll(new UnicodeSet().addAll(chars).retainAll(transliterated));
        }
        return transliterated;
    }

    /**
     * Applies every {@link LocaleTransform} to each ldml directory (except the skipped ones) and
     * writes the generated locale files under the generation directory.
     */
    public static void main(String[] args) throws Exception {
        for (String dir : DtdType.ldml.directories) {
            if (dir.equals("casing") // skip, field contents are keywords, not localizable content
                || dir.equals("collation") // skip, field contents are complex, and can't be simply remapped
                || dir.equals("annotationsDerived") // skip, derived later
            ) {
                continue;
            }
            System.out.println("\nDirectory: " + dir);
            final String sourceDirectory = CLDRPaths.COMMON_DIRECTORY + dir + "/";
            Factory factory = Factory.make(sourceDirectory, ".*");

            CLDRFileTransformer transformer = new CLDRFileTransformer(factory, CLDRPaths.COMMON_DIRECTORY + "transforms" + File.separator);
            for (LocaleTransform localeTransform : LocaleTransform.values()) {
                CLDRFile output = transformer.transform(localeTransform);
                if (output == null) {
                    System.out.println("SKIPPING missing file: " + dir + "/" + localeTransform.getInputLocale() + ".xml");
                    continue;
                }
                String outputDir = CLDRPaths.GEN_DIRECTORY + "common/" + dir + File.separator;
                String outputFile = output.getLocaleID() + ".xml";

                try (PrintWriter out = FileUtilities.openUTF8Writer(outputDir, outputFile)) {
                    System.out.println("Generating locale file: " + outputDir + outputFile);
                    if (!transformer.unconverted.isEmpty()) {
                        System.out.println("Untransformed characters: " + transformer.unconverted);
                        transformer.unconverted.clear();
                    }
                    output.write(out);
                }
            }
        }
    }
}