• Home
  • Line#
  • Scopes#
  • Navigate#
  • Raw
  • Download
1 package org.unicode.cldr.tool;
2 
3 import java.io.File;
4 import java.io.PrintWriter;
5 import java.util.Map;
6 import java.util.concurrent.ConcurrentHashMap;
7 
8 import org.unicode.cldr.draft.FileUtilities;
9 import org.unicode.cldr.util.CLDRFile;
10 import org.unicode.cldr.util.CLDRPaths;
11 import org.unicode.cldr.util.CLDRTransforms;
12 import org.unicode.cldr.util.CLDRTransforms.ParsedTransformID;
13 import org.unicode.cldr.util.CldrUtility;
14 import org.unicode.cldr.util.DtdType;
15 import org.unicode.cldr.util.Factory;
16 import org.unicode.cldr.util.LocaleIDParser;
17 import org.unicode.cldr.util.SimpleFactory.NoSourceDirectoryException;
18 import org.unicode.cldr.util.SimpleXMLSource;
19 import org.unicode.cldr.util.XMLSource;
20 
21 import com.ibm.icu.text.Normalizer;
22 import com.ibm.icu.text.Transliterator;
23 import com.ibm.icu.text.UnicodeSet;
24 import com.ibm.icu.util.ICUUncheckedIOException;
25 
26 /**
27  * Transforms the contents of a CLDRFile.
28  *
29  * @author jchye
30  */
31 public class CLDRFileTransformer {
32     /**
33      * Contains all supported locale-to-locale conversions along with information
34      * needed to convert each locale. Each enum value is named after the locale that results
35      * from the conversion.
36      */
37     enum PolicyIfExisting {
38         RETAIN, DISCARD, MINIMIZE
39     }
40 
41     public enum LocaleTransform {
42         sr_Latn("sr", "Serbian-Latin-BGN.xml", Transliterator.FORWARD, "[:script=Cyrl:]", PolicyIfExisting.DISCARD), //
43         sr_Latn_BA("sr_Cyrl_BA", "Serbian-Latin-BGN.xml", Transliterator.FORWARD, "[:script=Cyrl:]", PolicyIfExisting.DISCARD), //
44         sr_Latn_ME("sr_Cyrl_ME", "Serbian-Latin-BGN.xml", Transliterator.FORWARD, "[:script=Cyrl:]", PolicyIfExisting.DISCARD), //
45         sr_Latn_XK("sr_Cyrl_XK", "Serbian-Latin-BGN.xml", Transliterator.FORWARD, "[:script=Cyrl:]", PolicyIfExisting.DISCARD), //
46         ha_NE("ha", "ha-ha_NE.xml", Transliterator.FORWARD, "[y Y ƴ Ƴ ʼ]", PolicyIfExisting.DISCARD), //
47         yo_BJ("yo", "yo-yo_BJ.xml", Transliterator.FORWARD, "[ẹ ọ ṣ Ẹ Ọ Ṣ]", PolicyIfExisting.DISCARD), //
48         de_CH("de", "[ß] Casefold", Transliterator.FORWARD, "[ß]", PolicyIfExisting.MINIMIZE), //
49         yue_Hans("yue", "Simplified-Traditional.xml", Transliterator.REVERSE, "[:script=Hant:]", PolicyIfExisting.RETAIN), //
50         // en_NZ("en_AU", "null", Transliterator.FORWARD, "[]", PolicyIfExisting.DISCARD),
51         // Needs work to fix currency symbols, handle Maori. See http://unicode.org/cldr/trac/ticket/9516#comment:6
52         ;
53 
54         private final String inputLocale;
55         private final String transformFilename;
56         private final int direction;
57         private final UnicodeSet inputChars;
58         private final PolicyIfExisting policy;
59 
60         /**
61          * @deprecated Use {@link #LocaleTransform(String,String,int,String,PolicyIfExisting)} instead
62          */
LocaleTransform(String inputLocale, String transformFilename, int direction, String inputCharPattern)63         private LocaleTransform(String inputLocale, String transformFilename, int direction, String inputCharPattern) {
64             this(inputLocale, transformFilename, direction, inputCharPattern, PolicyIfExisting.DISCARD);
65         }
66 
LocaleTransform(String inputLocale, String transformFilename, int direction, String inputCharPattern, PolicyIfExisting policy)67         private LocaleTransform(String inputLocale, String transformFilename, int direction, String inputCharPattern, PolicyIfExisting policy) {
68             this.inputLocale = inputLocale;
69             this.transformFilename = transformFilename;
70             this.direction = direction;
71             this.inputChars = new UnicodeSet(inputCharPattern);
72             this.policy = policy;
73         }
74 
75         /**
76          * @return the locale that used for conversion
77          */
getInputLocale()78         public String getInputLocale() {
79             return inputLocale;
80         }
81 
82         /**
83          * @return the locale that used for conversion
84          */
getOutputLocale()85         public String getOutputLocale() {
86             return this.toString();
87         }
88 
89         /**
90          * @return the filename of the transform used to make the conversion
91          */
getTransformFilename()92         public String getTransformFilename() {
93             return transformFilename;
94         }
95 
96         /**
97          * @return the direction of the transformation
98          */
getDirection()99         public int getDirection() {
100             return direction;
101         }
102 
103         /**
104          * @return the set of characters in the input locale that should have been removed after
105          *         transformation, used for internal debugging
106          */
getInputChars()107         private UnicodeSet getInputChars() {
108             return inputChars;
109         }
110     }
111 
112     private UnicodeSet unconverted = new UnicodeSet();
113     private Factory factory;
114     /*
115      * The transliterators map exists, and is static, to avoid wasting a lot of time creating
116      * a new Transliterator more often than necessary. (An alternative to "static" here might be to
117      * create only one CLDRFileTransformer, maybe as a member of ExampleGenerator.)
118      * Use ConcurrentHashMap rather than HashMap to avoid concurrency problems.
119      * Reference: https://unicode.org/cldr/trac/ticket/11657
120      */
121     private static Map<LocaleTransform, Transliterator> transliterators = new ConcurrentHashMap<LocaleTransform, Transliterator>();
122     private String transformDir;
123 
124     /**
125      * @param factory
126      *            the factory to get locale data from
127      * @param transformDir
128      *            the directory containing the transform files
129      */
CLDRFileTransformer(Factory factory, String transformDir)130     public CLDRFileTransformer(Factory factory, String transformDir) {
131         this.factory = factory;
132         this.transformDir = transformDir;
133     }
134 
loadTransliterator(LocaleTransform localeTransform)135     public Transliterator loadTransliterator(LocaleTransform localeTransform) {
136         if (transliterators.containsKey(localeTransform)) {
137             return transliterators.get(localeTransform);
138         }
139         Transliterator transliterator;
140         if (localeTransform.getTransformFilename().contains(".xml")) {
141             ParsedTransformID directionInfo = new ParsedTransformID();
142             String ruleString = CLDRTransforms.getIcuRulesFromXmlFile(transformDir, localeTransform.getTransformFilename(), directionInfo);
143             transliterator = Transliterator.createFromRules(directionInfo.getId(),
144                 ruleString, localeTransform.getDirection());
145             transliterators.put(localeTransform, transliterator);
146         } else {
147             transliterator = Transliterator.getInstance(localeTransform.getTransformFilename());
148         }
149         return transliterator;
150     }
151 
152     /**
153      * NOTE: This method does not currently handle nested transliterators.
154      *
155      * @param input
156      * @return null if the input file was missing, or if there is no new output file.
157      */
transform(LocaleTransform localeTransform)158     public CLDRFile transform(LocaleTransform localeTransform) {
159         Transliterator transliterator = loadTransliterator(localeTransform);
160         CLDRFile input;
161         try {
162             input = factory.make(localeTransform.getInputLocale(), false);
163         } catch (ICUUncheckedIOException e1) {
164             return null; // input file is missing (or otherwise unavailable)
165         }
166         boolean hadOutput = true;
167         CLDRFile output;
168         try {
169             output = factory.make(localeTransform.getOutputLocale(), false);
170         } catch (NoSourceDirectoryException e) {
171             // if we can't open the file, then just make a new one.
172             XMLSource dataSource = new SimpleXMLSource(localeTransform.getOutputLocale());
173             output = new CLDRFile(dataSource);
174             hadOutput = false;
175         }
176         String outputParentString = LocaleIDParser.getParent(localeTransform.getOutputLocale());
177         CLDRFile outputParent = factory.make(outputParentString, true);
178 
179         outputParent = factory.make(localeTransform.getInputLocale(), false);
180         XMLSource outputSource = new SimpleXMLSource(localeTransform.toString());
181         for (String xpath : input) {
182             String value = input.getStringValue(xpath);
183             if (CldrUtility.INHERITANCE_MARKER.equals(value)) {
184                 value = null;
185             }
186             if (value == null) {
187                 continue;
188             }
189             String fullPath = input.getFullXPath(xpath);
190             String oldValue = output.getStringValue(xpath);
191             String parentValue = outputParent.getStringValue(xpath);
192             value = transformValue(transliterator, localeTransform, xpath, value, oldValue, parentValue);
193             if (value != null && !CldrUtility.INHERITANCE_MARKER.equals(value)) {
194                 outputSource.putValueAtPath(fullPath, value);
195             }
196         }
197         if (!outputSource.iterator().hasNext()) { // empty new output
198             if (!hadOutput) {
199                 return null; // don't add file if nothing to add
200             }
201         }
202         return new CLDRFile(outputSource);
203     }
204 
205     /**
206      * Transforms a CLDRFile value into another form.
207      * @param parentValue
208      */
transformValue(Transliterator transliterator, LocaleTransform localeTransform, String path, String value, String oldValue, String parentValue)209     private String transformValue(Transliterator transliterator, LocaleTransform localeTransform, String path, String value,
210         String oldValue, String parentValue) {
211 
212         // allows us to change only new values
213         switch (localeTransform.policy) {
214         case RETAIN:
215         case MINIMIZE:
216             if (oldValue != null) {
217                 return oldValue;
218             }
219             break;
220         default:
221         }
222 
223         UnicodeSet chars = localeTransform.getInputChars();
224         String transliterated;
225 
226         // TODO: Don't transform dates/patterns.
227         // For now, don't try to transliterate the exemplar characters - use the ones from the original locale.
228         // In the future, we can probably control this better with a config file - similar to CLDRModify's config file.
229         if (path.contains("exemplarCharacters")) {
230             if (oldValue != null) {
231                 transliterated = oldValue;
232             } else {
233                 transliterated = value;
234             }
235         } else {
236             transliterated = transliterator.transliterate(value);
237             transliterated = Normalizer.compose(transliterated, false);
238         }
239         if (localeTransform.policy == PolicyIfExisting.MINIMIZE) {
240             if (transliterated.equals(value)) {
241                 return null;
242             }
243         }
244 
245         if (chars.containsSome(transliterated)) {
246             unconverted.addAll(new UnicodeSet().addAll(chars).retainAll(transliterated));
247         }
248         return transliterated;
249     }
250 
main(String[] args)251     public static void main(String[] args) throws Exception {
252         for (String dir : DtdType.ldml.directories) {
253             if (dir.equals("casing") // skip, field contents are keywords, not localizable content
254                 || dir.equals("collation") // skip, field contents are complex, and can't be simply remapped
255                 || dir.equals("annotationsDerived") // skip, derived later
256             ) {
257                 continue;
258             }
259             System.out.println("\nDirectory: " + dir);
260             Factory factory = Factory.make(CLDRPaths.COMMON_DIRECTORY + dir + "/", ".*");
261             CLDRFileTransformer transformer = new CLDRFileTransformer(factory, CLDRPaths.COMMON_DIRECTORY + "transforms" + File.separator);
262             for (LocaleTransform localeTransform : LocaleTransform.values()) {
263                 CLDRFile output = transformer.transform(localeTransform);
264                 if (output == null) {
265                     System.out.println("SKIPPING missing file: " + dir + "/" + localeTransform.inputLocale + ".xml");
266                     continue;
267                 }
268                 String outputDir = CLDRPaths.GEN_DIRECTORY + "common/" + dir + File.separator;
269                 String outputFile = output.getLocaleID() + ".xml";
270                 PrintWriter out = FileUtilities.openUTF8Writer(outputDir, outputFile);
271                 System.out.println("Generating locale file: " + outputDir + outputFile);
272                 if (!transformer.unconverted.isEmpty()) {
273                     System.out.println("Untransformed characters: " + transformer.unconverted);
274                     transformer.unconverted.clear();
275                 }
276                 output.write(out);
277                 out.close();
278             }
279         }
280     }
281 }
282