1 package org.unicode.cldr.tool; 2 3 import java.io.File; 4 import java.io.PrintWriter; 5 import java.util.Map; 6 import java.util.concurrent.ConcurrentHashMap; 7 8 import org.unicode.cldr.draft.FileUtilities; 9 import org.unicode.cldr.test.DisplayAndInputProcessor; 10 import org.unicode.cldr.util.CLDRFile; 11 import org.unicode.cldr.util.CLDRPaths; 12 import org.unicode.cldr.util.CLDRTransforms; 13 import org.unicode.cldr.util.CLDRTransforms.ParsedTransformID; 14 import org.unicode.cldr.util.CldrUtility; 15 import org.unicode.cldr.util.DtdType; 16 import org.unicode.cldr.util.Factory; 17 import org.unicode.cldr.util.LocaleIDParser; 18 import org.unicode.cldr.util.SimpleFactory.NoSourceDirectoryException; 19 import org.unicode.cldr.util.SimpleXMLSource; 20 import org.unicode.cldr.util.XMLSource; 21 22 import com.ibm.icu.text.Normalizer; 23 import com.ibm.icu.text.Transliterator; 24 import com.ibm.icu.text.UnicodeSet; 25 import com.ibm.icu.util.ICUUncheckedIOException; 26 27 /** 28 * Transforms the contents of a CLDRFile. 29 * 30 * @author jchye 31 */ 32 public class CLDRFileTransformer { 33 public enum PolicyIfExisting { 34 RETAIN, // Do not transliterate if existing output has locale content 35 DISCARD, // Replace existing output locale content 36 MINIMIZE // RETAIN, plus drop values if translit is a no-op. 37 } 38 39 /** 40 * Contains all supported locale-to-locale conversions along with information 41 * needed to convert each locale. Each enum value is named after the locale that results 42 * from the conversion. 43 */ 44 public enum LocaleTransform { 45 sr_Latn("sr", "Serbian-Latin-BGN.xml", Transliterator.FORWARD, "[:script=Cyrl:]", PolicyIfExisting.DISCARD), // 46 sr_Latn_BA("sr_Cyrl_BA", "Serbian-Latin-BGN.xml", Transliterator.FORWARD, "[:script=Cyrl:]", PolicyIfExisting.DISCARD), // 47 sr_Latn_ME("sr_Cyrl_ME", "Serbian-Latin-BGN.xml", Transliterator.FORWARD, "[:script=Cyrl:]", PolicyIfExisting.DISCARD), // 48 sr_Latn_XK("sr_Cyrl_XK", "Serbian-Latin-BGN.xml", Transliterator.FORWARD, "[:script=Cyrl:]", PolicyIfExisting.DISCARD), // 49 ha_NE("ha", "ha-ha_NE.xml", Transliterator.FORWARD, "[y Y ƴ Ƴ ʼ]", PolicyIfExisting.DISCARD), // 50 yo_BJ("yo", "yo-yo_BJ.xml", Transliterator.FORWARD, "[ẹ ọ ṣ Ẹ Ọ Ṣ]", PolicyIfExisting.DISCARD), // 51 de_CH("de", "[ß] Casefold", Transliterator.FORWARD, "[ß]", PolicyIfExisting.MINIMIZE), // 52 yue_Hans("yue", "Simplified-Traditional.xml", Transliterator.REVERSE, "[:script=Hant:]", PolicyIfExisting.RETAIN), // 53 // en_NZ("en_AU", "null", Transliterator.FORWARD, "[]", PolicyIfExisting.DISCARD), 54 // Needs work to fix currency symbols, handle Māori. See http://unicode.org/cldr/trac/ticket/9516#comment:6 55 ; 56 57 private final String inputLocale; 58 private final String transformFilename; 59 private final int direction; 60 private final UnicodeSet inputChars; 61 private final PolicyIfExisting policy; 62 63 /** 64 * @deprecated Use {@link #LocaleTransform(String,String,int,String,PolicyIfExisting)} instead 65 */ 66 @Deprecated LocaleTransform(String inputLocale, String transformFilename, int direction, String inputCharPattern)67 private LocaleTransform(String inputLocale, String transformFilename, int direction, String inputCharPattern) { 68 this(inputLocale, transformFilename, direction, inputCharPattern, PolicyIfExisting.DISCARD); 69 } 70 LocaleTransform(String inputLocale, String transformFilename, int direction, String inputCharPattern, PolicyIfExisting policy)71 private LocaleTransform(String inputLocale, String transformFilename, int direction, String inputCharPattern, PolicyIfExisting policy) { 72 this.inputLocale = inputLocale; 73 this.transformFilename = transformFilename; 74 this.direction = direction; 75 this.inputChars = new UnicodeSet(inputCharPattern); 76 this.policy = policy; 77 } 78 79 /** 80 * @return the policy for existing content 81 */ getPolicyIfExisting()82 public PolicyIfExisting getPolicyIfExisting() { 83 return policy; 84 } 85 86 /** 87 * @return the locale that used for conversion 88 */ getInputLocale()89 public String getInputLocale() { 90 return inputLocale; 91 } 92 93 /** 94 * @return the locale that used for conversion 95 */ getOutputLocale()96 public String getOutputLocale() { 97 return this.toString(); 98 } 99 100 /** 101 * @return the filename of the transform used to make the conversion 102 */ getTransformFilename()103 public String getTransformFilename() { 104 return transformFilename; 105 } 106 107 /** 108 * @return the direction of the transformation 109 */ getDirection()110 public int getDirection() { 111 return direction; 112 } 113 114 /** 115 * @return the set of characters in the input locale that should have been removed after 116 * transformation, used for internal debugging 117 */ getInputChars()118 private UnicodeSet getInputChars() { 119 return inputChars; 120 } 121 } 122 123 private UnicodeSet unconverted = new UnicodeSet(); 124 private Factory factory; 125 /* 126 * The transliterators map exists, and is static, to avoid wasting a lot of time creating 127 * a new Transliterator more often than necessary. (An alternative to "static" here might be to 128 * create only one CLDRFileTransformer, maybe as a member of ExampleGenerator.) 129 * Use ConcurrentHashMap rather than HashMap to avoid concurrency problems. 130 * Reference: https://unicode.org/cldr/trac/ticket/11657 131 */ 132 private static Map<LocaleTransform, Transliterator> transliterators = new ConcurrentHashMap<>(); 133 private String transformDir; 134 135 /** 136 * @param factory 137 * the factory to get locale data from 138 * @param transformDir 139 * the directory containing the transform files 140 */ CLDRFileTransformer(Factory factory, String transformDir)141 public CLDRFileTransformer(Factory factory, String transformDir) { 142 this.factory = factory; 143 this.transformDir = transformDir; 144 } 145 loadTransliterator(LocaleTransform localeTransform)146 public Transliterator loadTransliterator(LocaleTransform localeTransform) { 147 if (transliterators.containsKey(localeTransform)) { 148 return transliterators.get(localeTransform); 149 } 150 Transliterator transliterator; 151 if (localeTransform.getTransformFilename().contains(".xml")) { 152 ParsedTransformID directionInfo = new ParsedTransformID(); 153 String ruleString = CLDRTransforms.getIcuRulesFromXmlFile(transformDir, localeTransform.getTransformFilename(), directionInfo); 154 transliterator = Transliterator.createFromRules(directionInfo.getId(), 155 ruleString, localeTransform.getDirection()); 156 } else { 157 transliterator = Transliterator.getInstance(localeTransform.getTransformFilename()); 158 } 159 transliterators.put(localeTransform, transliterator); 160 return transliterator; 161 } 162 163 /** 164 * NOTE: This method does not currently handle nested transliterators. 165 * 166 * @param input 167 * @return null if the input file was missing, or if there is no new output file. 168 */ transform(LocaleTransform localeTransform)169 public CLDRFile transform(LocaleTransform localeTransform) { 170 Transliterator transliterator = loadTransliterator(localeTransform); 171 CLDRFile input; 172 try { 173 input = factory.make(localeTransform.getInputLocale(), false); 174 } catch (ICUUncheckedIOException e1) { 175 return null; // input file is missing (or otherwise unavailable) 176 } 177 boolean hadOutput = true; 178 CLDRFile output; 179 try { 180 output = factory.make(localeTransform.getOutputLocale(), false); 181 } catch (NoSourceDirectoryException e) { 182 // if we can't open the file, then just make a new one. 183 XMLSource dataSource = new SimpleXMLSource(localeTransform.getOutputLocale()); 184 output = new CLDRFile(dataSource); 185 hadOutput = false; 186 } 187 String outputParentString = LocaleIDParser.getParent(localeTransform.getOutputLocale()); 188 CLDRFile outputParent = factory.make(outputParentString, true); 189 190 outputParent = factory.make(localeTransform.getInputLocale(), false); 191 XMLSource outputSource = new SimpleXMLSource(localeTransform.toString()); 192 DisplayAndInputProcessor daip = new DisplayAndInputProcessor(output, true); 193 for (String xpath : input) { 194 String value = input.getStringValue(xpath); 195 if (CldrUtility.INHERITANCE_MARKER.equals(value)) { 196 value = null; 197 } 198 if (value == null) { 199 continue; 200 } 201 String fullPath = input.getFullXPath(xpath); 202 String oldValue = output.getStringValue(xpath); 203 String parentValue = outputParent.getStringValue(xpath); 204 value = transformValue(transliterator, localeTransform, xpath, value, oldValue, parentValue); 205 if (value != null && !CldrUtility.INHERITANCE_MARKER.equals(value)) { 206 value = daip.processInput(xpath, value, null); 207 outputSource.putValueAtPath(fullPath, value); 208 } 209 } 210 if (!outputSource.iterator().hasNext()) { // empty new output 211 if (!hadOutput) { 212 return null; // don't add file if nothing to add 213 } 214 } 215 return new CLDRFile(outputSource); 216 } 217 218 /** 219 * Transforms a CLDRFile value into another form. 220 * @param parentValue 221 */ transformValue(Transliterator transliterator, LocaleTransform localeTransform, String path, String value, String oldValue, String parentValue)222 private String transformValue(Transliterator transliterator, LocaleTransform localeTransform, String path, String value, 223 String oldValue, String parentValue) { 224 225 // allows us to change only new values 226 switch (localeTransform.policy) { 227 case RETAIN: 228 case MINIMIZE: 229 if (oldValue != null) { 230 return oldValue; 231 } 232 break; 233 default: 234 } 235 236 UnicodeSet chars = localeTransform.getInputChars(); 237 String transliterated; 238 239 // TODO: Don't transform dates/patterns. 240 // For now, don't try to transliterate the exemplar characters - use the ones from the original locale. 241 // In the future, we can probably control this better with a config file - similar to CLDRModify's config file. 242 if (path.contains("exemplarCharacters")) { 243 if (oldValue != null) { 244 transliterated = oldValue; 245 } else { 246 transliterated = value; 247 } 248 } else { 249 transliterated = transliterator.transliterate(value); 250 transliterated = Normalizer.compose(transliterated, false); 251 } 252 if (localeTransform.policy == PolicyIfExisting.MINIMIZE) { 253 if (transliterated.equals(value)) { 254 return null; 255 } 256 } 257 258 if (chars.containsSome(transliterated)) { 259 unconverted.addAll(new UnicodeSet().addAll(chars).retainAll(transliterated)); 260 } 261 return transliterated; 262 } 263 main(String[] args)264 public static void main(String[] args) throws Exception { 265 for (String dir : DtdType.ldml.directories) { 266 if (dir.equals("casing") // skip, field contents are keywords, not localizable content 267 || dir.equals("collation") // skip, field contents are complex, and can't be simply remapped 268 || dir.equals("annotationsDerived") // skip, derived later 269 ) { 270 continue; 271 } 272 System.out.println("\nDirectory: " + dir); 273 final String sourceDirectory = CLDRPaths.COMMON_DIRECTORY + dir + "/"; 274 Factory factory = Factory.make(sourceDirectory, ".*"); 275 276 CLDRFileTransformer transformer = new CLDRFileTransformer(factory, CLDRPaths.COMMON_DIRECTORY + "transforms" + File.separator); 277 for (LocaleTransform localeTransform : LocaleTransform.values()) { 278 CLDRFile output = transformer.transform(localeTransform); 279 if (output == null) { 280 System.out.println("SKIPPING missing file: " + dir + "/" + localeTransform.inputLocale + ".xml"); 281 continue; 282 } 283 String outputDir = CLDRPaths.GEN_DIRECTORY + "common/" + dir + File.separator; 284 String outputFile = output.getLocaleID() + ".xml"; 285 286 try (PrintWriter out = FileUtilities.openUTF8Writer(outputDir, outputFile)) { 287 System.out.println("Generating locale file: " + outputDir + outputFile); 288 if (!transformer.unconverted.isEmpty()) { 289 System.out.println("Untransformed characters: " + transformer.unconverted); 290 transformer.unconverted.clear(); 291 } 292 output.write(out); 293 } 294 } 295 } 296 } 297 } 298