package org.unicode.cldr.tool;

import java.io.File;
import java.io.PrintWriter;
import java.util.HashMap;
import java.util.Map;

import org.unicode.cldr.draft.FileUtilities;
import org.unicode.cldr.util.CLDRFile;
import org.unicode.cldr.util.CLDRPaths;
import org.unicode.cldr.util.CLDRTransforms;
import org.unicode.cldr.util.CLDRTransforms.ParsedTransformID;
import org.unicode.cldr.util.DtdType;
import org.unicode.cldr.util.Factory;
import org.unicode.cldr.util.LocaleIDParser;
import org.unicode.cldr.util.SimpleFactory.NoSourceDirectoryException;
import org.unicode.cldr.util.SimpleXMLSource;
import org.unicode.cldr.util.XMLSource;

import com.ibm.icu.text.Normalizer;
import com.ibm.icu.text.Transliterator;
import com.ibm.icu.text.UnicodeSet;
import com.ibm.icu.util.ICUUncheckedIOException;

/**
 * Transforms the contents of a CLDRFile.
 *
 * @author jchye
 */
public class CLDRFileTransformer {
    /**
     * What to do with a path that already has a value in the existing output locale file.
     * <ul>
     * <li>{@code RETAIN}: keep the existing output value; only paths without one are transformed.</li>
     * <li>{@code DISCARD}: ignore the existing output value and always transform the input value
     * (exemplar characters excepted, see {@code transformValue}).</li>
     * <li>{@code MINIMIZE}: keep the existing output value; otherwise emit the transformed value only
     * when it differs from the input value.</li>
     * </ul>
     */
    enum PolicyIfExisting {
        RETAIN, DISCARD, MINIMIZE
    }

    /**
     * Contains all supported locale-to-locale conversions along with information
     * needed to convert each locale. Each enum value is named after the locale that results
     * from the conversion.
     */
    public enum LocaleTransform {
        sr_Latn("sr", "Serbian-Latin-BGN.xml", Transliterator.FORWARD, "[:script=Cyrl:]", PolicyIfExisting.DISCARD),
        sr_Latn_BA("sr_Cyrl_BA", "Serbian-Latin-BGN.xml", Transliterator.FORWARD, "[:script=Cyrl:]", PolicyIfExisting.DISCARD),
        sr_Latn_ME("sr_Cyrl_ME", "Serbian-Latin-BGN.xml", Transliterator.FORWARD, "[:script=Cyrl:]", PolicyIfExisting.DISCARD),
        sr_Latn_XK("sr_Cyrl_XK", "Serbian-Latin-BGN.xml", Transliterator.FORWARD, "[:script=Cyrl:]", PolicyIfExisting.DISCARD),
        ha_NE("ha", "ha-ha_NE.xml", Transliterator.FORWARD, "[y Y ƴ Ƴ ʼ]", PolicyIfExisting.DISCARD),
        yo_BJ("yo", "yo-yo_BJ.xml", Transliterator.FORWARD, "[ẹ ọ ṣ Ẹ Ọ Ṣ]", PolicyIfExisting.DISCARD),
        de_CH("de", "[ß] Casefold", Transliterator.FORWARD, "[ß]", PolicyIfExisting.MINIMIZE),
        yue_Hans("yue", "Simplified-Traditional.xml", Transliterator.REVERSE, "[:script=Hant:]", PolicyIfExisting.RETAIN),
        // en_NZ("en_AU", "null", Transliterator.FORWARD, "[]", PolicyIfExisting.DISCARD),
        // Needs work to fix currency symbols, handle Maori. See http://unicode.org/cldr/trac/ticket/9516#comment:6
        ;

        private final String inputLocale;
        private final String transformFilename;
        private final int direction;
        private final UnicodeSet inputChars;
        private final PolicyIfExisting policy;

        /**
         * Creates a transform that uses {@link PolicyIfExisting#DISCARD}.
         *
         * @deprecated Use {@link #LocaleTransform(String,String,int,String,PolicyIfExisting)} instead
         */
        @Deprecated
        private LocaleTransform(String inputLocale, String transformFilename, int direction, String inputCharPattern) {
            this(inputLocale, transformFilename, direction, inputCharPattern, PolicyIfExisting.DISCARD);
        }

        /**
         * @param inputLocale
         *            the locale whose data is converted
         * @param transformFilename
         *            either the name of an XML transform file (recognized by containing ".xml") or a
         *            transliterator ID
         * @param direction
         *            {@link Transliterator#FORWARD} or {@link Transliterator#REVERSE}
         * @param inputCharPattern
         *            UnicodeSet pattern of characters that should no longer be present after conversion
         * @param policy
         *            how to treat values already present in the output locale
         */
        private LocaleTransform(String inputLocale, String transformFilename, int direction, String inputCharPattern,
            PolicyIfExisting policy) {
            this.inputLocale = inputLocale;
            this.transformFilename = transformFilename;
            this.direction = direction;
            this.inputChars = new UnicodeSet(inputCharPattern);
            this.policy = policy;
        }

        /**
         * @return the locale that used for conversion
         */
        public String getInputLocale() {
            return inputLocale;
        }

        /**
         * @return the locale that results from the conversion (the name of this enum value)
         */
        public String getOutputLocale() {
            return this.toString();
        }

        /**
         * @return the filename of the transform used to make the conversion
         */
        public String getTransformFilename() {
            return transformFilename;
        }

        /**
         * @return the direction of the transformation
         */
        public int getDirection() {
            return direction;
        }

        /**
         * @return the set of characters in the input locale that should have been removed after
         *         transformation, used for internal debugging
         */
        private UnicodeSet getInputChars() {
            return inputChars;
        }
    }

    /** Characters from a transform's input set that were still present in transformed values. */
    private final UnicodeSet unconverted = new UnicodeSet();
    private final Factory factory;
    /** Cache of transliterators already built by {@link #loadTransliterator(LocaleTransform)}. */
    private final Map<LocaleTransform, Transliterator> transliterators = new HashMap<LocaleTransform, Transliterator>();
    private final String transformDir;

    /**
     * @param factory
     *            the factory to get locale data from
     * @param transformDir
     *            the directory containing the transform files
     */
    public CLDRFileTransformer(Factory factory, String transformDir) {
        this.factory = factory;
        this.transformDir = transformDir;
    }

    /**
     * Returns the transliterator for the given conversion, building it on first use and caching it
     * for later calls.
     *
     * @param localeTransform
     *            the conversion to load a transliterator for
     * @return the transliterator, built from the XML rules file in the transform directory when the
     *         transform name contains ".xml", otherwise looked up by ID
     */
    public Transliterator loadTransliterator(LocaleTransform localeTransform) {
        Transliterator cached = transliterators.get(localeTransform);
        if (cached != null) {
            return cached;
        }
        String transformName = localeTransform.getTransformFilename();
        Transliterator transliterator;
        if (transformName.contains(".xml")) {
            ParsedTransformID directionInfo = new ParsedTransformID();
            String ruleString = CLDRTransforms.getIcuRulesFromXmlFile(transformDir, transformName, directionInfo);
            transliterator = Transliterator.createFromRules(directionInfo.getId(),
                ruleString, localeTransform.getDirection());
        } else {
            // Not a rules file: treat the name as a transliterator ID and honor the configured direction.
            transliterator = Transliterator.getInstance(transformName, localeTransform.getDirection());
        }
        // Cache both kinds so repeated transforms do not rebuild the transliterator.
        transliterators.put(localeTransform, transliterator);
        return transliterator;
    }

    /**
     * Builds the output locale's file by transforming every path of the input locale's file.
     * NOTE: This method does not currently handle nested transliterators.
     *
     * @param localeTransform
     *            the conversion to perform
     * @return null if the input file was missing, or if there is no new output file.
     */
    public CLDRFile transform(LocaleTransform localeTransform) {
        Transliterator transliterator = loadTransliterator(localeTransform);
        CLDRFile input;
        try {
            input = factory.make(localeTransform.getInputLocale(), false);
        } catch (ICUUncheckedIOException e1) {
            return null; // input file is missing (or otherwise unavailable)
        }
        boolean hadOutput = true;
        CLDRFile output;
        try {
            output = factory.make(localeTransform.getOutputLocale(), false);
        } catch (NoSourceDirectoryException e) {
            // if we can't open the file, then just make a new one.
            XMLSource dataSource = new SimpleXMLSource(localeTransform.getOutputLocale());
            output = new CLDRFile(dataSource);
            hadOutput = false;
        }

        // NOTE(review): the resolved parent of the output locale is loaded and then immediately replaced
        // by the unresolved input locale file, so "parentValue" below is really the input locale's value.
        // Both calls are kept so loading behavior (including any exception) is unchanged; confirm which
        // file was intended.
        String outputParentString = LocaleIDParser.getParent(localeTransform.getOutputLocale());
        CLDRFile outputParent = factory.make(outputParentString, true);
        outputParent = factory.make(localeTransform.getInputLocale(), false);

        XMLSource outputSource = new SimpleXMLSource(localeTransform.toString());
        for (String xpath : input) {
            String fullPath = input.getFullXPath(xpath);
            String value = input.getStringValue(xpath);
            String oldValue = output.getStringValue(xpath);
            String parentValue = outputParent.getStringValue(xpath);
            value = transformValue(transliterator, localeTransform, xpath, value, oldValue, parentValue);
            if (value != null) {
                outputSource.putValueAtPath(fullPath, value);
            }
        }
        // Nothing was produced and there was no existing output file: don't add an empty file.
        if (!hadOutput && !outputSource.iterator().hasNext()) {
            return null;
        }
        return new CLDRFile(outputSource);
    }

    /**
     * Transforms a CLDRFile value into another form.
     *
     * @param transliterator
     *            the transliterator to apply
     * @param localeTransform
     *            the conversion being performed; supplies the policy and the input character set
     * @param path
     *            the path of the value
     * @param value
     *            the value from the input locale
     * @param oldValue
     *            the value currently in the output locale, or null if there is none
     * @param parentValue
     *            currently unused
     * @return the value to store in the output, or null if nothing should be stored for this path
     */
    private String transformValue(Transliterator transliterator, LocaleTransform localeTransform, String path, String value,
        String oldValue, String parentValue) {

        PolicyIfExisting policy = localeTransform.policy;

        // allows us to change only new values
        boolean keepsExisting = policy == PolicyIfExisting.RETAIN || policy == PolicyIfExisting.MINIMIZE;
        if (keepsExisting && oldValue != null) {
            return oldValue;
        }

        String transliterated;

        // TODO: Don't transform dates/patterns.
        // For now, don't try to transliterate the exemplar characters - use the ones from the original locale.
        // In the future, we can probably control this better with a config file - similar to CLDRModify's config file.
        if (path.contains("exemplarCharacters")) {
            transliterated = oldValue != null ? oldValue : value;
        } else {
            transliterated = transliterator.transliterate(value);
            transliterated = Normalizer.compose(transliterated, false);
        }

        // Under MINIMIZE, a value the transform left unchanged is not written out.
        if (policy == PolicyIfExisting.MINIMIZE && transliterated.equals(value)) {
            return null;
        }

        // Record any characters that the transform was expected to remove but that are still present.
        UnicodeSet chars = localeTransform.getInputChars();
        if (chars.containsSome(transliterated)) {
            unconverted.addAll(new UnicodeSet().addAll(chars).retainAll(transliterated));
        }
        return transliterated;
    }

    /**
     * Runs every {@link LocaleTransform} over each LDML directory (except the skipped ones) and writes
     * the generated locale files under the generation directory.
     */
    public static void main(String[] args) throws Exception {
        for (String dir : DtdType.ldml.directories) {
            if (dir.equals("casing") // skip, field contents are keywords, not localizable content
                || dir.equals("collation") // skip, field contents are complex, and can't be simply remapped
                || dir.equals("annotationsDerived") // skip, derived later
            ) {
                continue;
            }
            System.out.println("\nDirectory: " + dir);
            Factory factory = Factory.make(CLDRPaths.COMMON_DIRECTORY + dir + "/", ".*");
            CLDRFileTransformer transformer = new CLDRFileTransformer(factory,
                CLDRPaths.COMMON_DIRECTORY + "transforms" + File.separator);
            for (LocaleTransform localeTransform : LocaleTransform.values()) {
                CLDRFile output = transformer.transform(localeTransform);
                if (output == null) {
                    System.out.println("SKIPPING missing file: " + dir + "/" + localeTransform.inputLocale + ".xml");
                    continue;
                }
                String outputDir = CLDRPaths.GEN_DIRECTORY + "common/" + dir + File.separator;
                String outputFile = output.getLocaleID() + ".xml";
                PrintWriter out = FileUtilities.openUTF8Writer(outputDir, outputFile);
                try {
                    System.out.println("Generating locale file: " + outputDir + outputFile);
                    if (!transformer.unconverted.isEmpty()) {
                        System.out.println("Untransformed characters: " + transformer.unconverted);
                        transformer.unconverted.clear();
                    }
                    output.write(out);
                } finally {
                    // Close the writer even if writing fails so the file handle is not leaked.
                    out.close();
                }
            }
        }
    }
}