• Home
  • Line#
  • Scopes#
  • Navigate#
  • Raw
  • Download
1 package org.unicode.cldr.tool;
2 
3 import java.io.File;
4 import java.io.PrintWriter;
5 import java.util.Map;
6 import java.util.concurrent.ConcurrentHashMap;
7 
8 import org.unicode.cldr.draft.FileUtilities;
9 import org.unicode.cldr.test.DisplayAndInputProcessor;
10 import org.unicode.cldr.util.CLDRFile;
11 import org.unicode.cldr.util.CLDRPaths;
12 import org.unicode.cldr.util.CLDRTransforms;
13 import org.unicode.cldr.util.CLDRTransforms.ParsedTransformID;
14 import org.unicode.cldr.util.CldrUtility;
15 import org.unicode.cldr.util.DtdType;
16 import org.unicode.cldr.util.Factory;
17 import org.unicode.cldr.util.LocaleIDParser;
18 import org.unicode.cldr.util.SimpleFactory.NoSourceDirectoryException;
19 import org.unicode.cldr.util.SimpleXMLSource;
20 import org.unicode.cldr.util.XMLSource;
21 
22 import com.ibm.icu.text.Normalizer;
23 import com.ibm.icu.text.Transliterator;
24 import com.ibm.icu.text.UnicodeSet;
25 import com.ibm.icu.util.ICUUncheckedIOException;
26 
27 /**
28  * Transforms the contents of a CLDRFile.
29  *
30  * @author jchye
31  */
32 public class CLDRFileTransformer {
33     public enum PolicyIfExisting {
34         RETAIN,  // Do not transliterate if existing output has locale content
35         DISCARD, // Replace existing output locale content
36         MINIMIZE // RETAIN, plus drop values if translit is a no-op.
37     }
38 
39     /**
40      * Contains all supported locale-to-locale conversions along with information
41      * needed to convert each locale. Each enum value is named after the locale that results
42      * from the conversion.
43      */
44     public enum LocaleTransform {
45         sr_Latn("sr", "Serbian-Latin-BGN.xml", Transliterator.FORWARD, "[:script=Cyrl:]", PolicyIfExisting.DISCARD), //
46         sr_Latn_BA("sr_Cyrl_BA", "Serbian-Latin-BGN.xml", Transliterator.FORWARD, "[:script=Cyrl:]", PolicyIfExisting.DISCARD), //
47         sr_Latn_ME("sr_Cyrl_ME", "Serbian-Latin-BGN.xml", Transliterator.FORWARD, "[:script=Cyrl:]", PolicyIfExisting.DISCARD), //
48         sr_Latn_XK("sr_Cyrl_XK", "Serbian-Latin-BGN.xml", Transliterator.FORWARD, "[:script=Cyrl:]", PolicyIfExisting.DISCARD), //
49         ha_NE("ha", "ha-ha_NE.xml", Transliterator.FORWARD, "[y Y ƴ Ƴ ʼ]", PolicyIfExisting.DISCARD), //
50         yo_BJ("yo", "yo-yo_BJ.xml", Transliterator.FORWARD, "[ẹ ọ ṣ Ẹ Ọ Ṣ]", PolicyIfExisting.DISCARD), //
51         de_CH("de", "[ß] Casefold", Transliterator.FORWARD, "[ß]", PolicyIfExisting.MINIMIZE), //
52         yue_Hans("yue", "Simplified-Traditional.xml", Transliterator.REVERSE, "[:script=Hant:]", PolicyIfExisting.RETAIN), //
53         // en_NZ("en_AU", "null", Transliterator.FORWARD, "[]", PolicyIfExisting.DISCARD),
54         // Needs work to fix currency symbols, handle Māori. See http://unicode.org/cldr/trac/ticket/9516#comment:6
55         ;
56 
57         private final String inputLocale;
58         private final String transformFilename;
59         private final int direction;
60         private final UnicodeSet inputChars;
61         private final PolicyIfExisting policy;
62 
63         /**
64          * @deprecated Use {@link #LocaleTransform(String,String,int,String,PolicyIfExisting)} instead
65          */
66         @Deprecated
LocaleTransform(String inputLocale, String transformFilename, int direction, String inputCharPattern)67         private LocaleTransform(String inputLocale, String transformFilename, int direction, String inputCharPattern) {
68             this(inputLocale, transformFilename, direction, inputCharPattern, PolicyIfExisting.DISCARD);
69         }
70 
LocaleTransform(String inputLocale, String transformFilename, int direction, String inputCharPattern, PolicyIfExisting policy)71         private LocaleTransform(String inputLocale, String transformFilename, int direction, String inputCharPattern, PolicyIfExisting policy) {
72             this.inputLocale = inputLocale;
73             this.transformFilename = transformFilename;
74             this.direction = direction;
75             this.inputChars = new UnicodeSet(inputCharPattern);
76             this.policy = policy;
77         }
78 
79         /**
80          * @return the policy for existing content
81          */
getPolicyIfExisting()82         public PolicyIfExisting getPolicyIfExisting() {
83             return policy;
84         }
85 
86         /**
87          * @return the locale that used for conversion
88          */
getInputLocale()89         public String getInputLocale() {
90             return inputLocale;
91         }
92 
93         /**
94          * @return the locale that used for conversion
95          */
getOutputLocale()96         public String getOutputLocale() {
97             return this.toString();
98         }
99 
100         /**
101          * @return the filename of the transform used to make the conversion
102          */
getTransformFilename()103         public String getTransformFilename() {
104             return transformFilename;
105         }
106 
107         /**
108          * @return the direction of the transformation
109          */
getDirection()110         public int getDirection() {
111             return direction;
112         }
113 
114         /**
115          * @return the set of characters in the input locale that should have been removed after
116          *         transformation, used for internal debugging
117          */
getInputChars()118         private UnicodeSet getInputChars() {
119             return inputChars;
120         }
121     }
122 
123     private UnicodeSet unconverted = new UnicodeSet();
124     private Factory factory;
125     /*
126      * The transliterators map exists, and is static, to avoid wasting a lot of time creating
127      * a new Transliterator more often than necessary. (An alternative to "static" here might be to
128      * create only one CLDRFileTransformer, maybe as a member of ExampleGenerator.)
129      * Use ConcurrentHashMap rather than HashMap to avoid concurrency problems.
130      * Reference: https://unicode.org/cldr/trac/ticket/11657
131      */
132     private static Map<LocaleTransform, Transliterator> transliterators = new ConcurrentHashMap<>();
133     private String transformDir;
134 
135     /**
136      * @param factory
137      *            the factory to get locale data from
138      * @param transformDir
139      *            the directory containing the transform files
140      */
CLDRFileTransformer(Factory factory, String transformDir)141     public CLDRFileTransformer(Factory factory, String transformDir) {
142         this.factory = factory;
143         this.transformDir = transformDir;
144     }
145 
loadTransliterator(LocaleTransform localeTransform)146     public Transliterator loadTransliterator(LocaleTransform localeTransform) {
147         if (transliterators.containsKey(localeTransform)) {
148             return transliterators.get(localeTransform);
149         }
150         Transliterator transliterator;
151         if (localeTransform.getTransformFilename().contains(".xml")) {
152             ParsedTransformID directionInfo = new ParsedTransformID();
153             String ruleString = CLDRTransforms.getIcuRulesFromXmlFile(transformDir, localeTransform.getTransformFilename(), directionInfo);
154             transliterator = Transliterator.createFromRules(directionInfo.getId(),
155                 ruleString, localeTransform.getDirection());
156         } else {
157             transliterator = Transliterator.getInstance(localeTransform.getTransformFilename());
158         }
159         transliterators.put(localeTransform, transliterator);
160         return transliterator;
161     }
162 
163     /**
164      * NOTE: This method does not currently handle nested transliterators.
165      *
166      * @param input
167      * @return null if the input file was missing, or if there is no new output file.
168      */
transform(LocaleTransform localeTransform)169     public CLDRFile transform(LocaleTransform localeTransform) {
170         Transliterator transliterator = loadTransliterator(localeTransform);
171         CLDRFile input;
172         final String inputLocale = localeTransform.getInputLocale();
173         try {
174             input = factory.make(inputLocale, false);
175         } catch (ICUUncheckedIOException e1) {
176             return null; // input file is missing (or otherwise unavailable)
177         }
178         boolean hadOutput = true;
179         CLDRFile output;
180         try {
181             output = factory.make(localeTransform.getOutputLocale(), false);
182         } catch (NoSourceDirectoryException e) {
183             // if we can't open the file, then just make a new one.
184             XMLSource dataSource = new SimpleXMLSource(localeTransform.getOutputLocale());
185             output = new CLDRFile(dataSource);
186             hadOutput = false;
187         }
188         String outputParentString = LocaleIDParser.getParent(localeTransform.getOutputLocale());
189         CLDRFile outputParent = factory.make(outputParentString, true);
190 
191         outputParent = factory.make(inputLocale, false);
192         XMLSource outputSource = new SimpleXMLSource(localeTransform.toString());
193         DisplayAndInputProcessor daip = new DisplayAndInputProcessor(output, true);
194         for (String xpath : input) {
195             String value = input.getStringValue(xpath);
196             if (CldrUtility.INHERITANCE_MARKER.equals(value)) {
197                 final String foundIn = input.getSourceLocaleID(xpath, null);
198                 // Include these only when they are actually present in this file
199                 if (!foundIn.equals(inputLocale)) {
200                     // inheritance marker came from somewhere else, ignore it
201                     continue;
202                 }
203             }
204             if (value == null) {
205                 continue;
206             }
207             String fullPath = input.getFullXPath(xpath);
208             String oldValue = output.getStringValue(xpath);
209             String parentValue = outputParent.getStringValue(xpath);
210             value = transformValue(transliterator, localeTransform, xpath, value, oldValue, parentValue);
211             if (value != null) {
212                 // check again
213                 if (CldrUtility.INHERITANCE_MARKER.equals(value)) {
214                     final String foundIn = input.getSourceLocaleID(xpath, null);
215                     // Include these only when they are actually present in this file
216                     if (!foundIn.equals(inputLocale)) {
217                         // inheritance marker came from somewhere else, ignore it
218                         continue;
219                     }
220                 }
221                 value = daip.processInput(xpath, value, null);
222                 outputSource.putValueAtPath(fullPath, value);
223             }
224         }
225         if (!outputSource.iterator().hasNext()) { // empty new output
226             if (!hadOutput) {
227                 return null; // don't add file if nothing to add
228             }
229         }
230         return new CLDRFile(outputSource);
231     }
232 
233     /**
234      * Transforms a CLDRFile value into another form.
235      * @param parentValue
236      */
transformValue(Transliterator transliterator, LocaleTransform localeTransform, String path, String value, String oldValue, String parentValue)237     private String transformValue(Transliterator transliterator, LocaleTransform localeTransform, String path, String value,
238         String oldValue, String parentValue) {
239 
240         // allows us to change only new values
241         switch (localeTransform.policy) {
242         case RETAIN:
243         case MINIMIZE:
244             if (oldValue != null) {
245                 return oldValue;
246             }
247             break;
248         default:
249         }
250 
251         UnicodeSet chars = localeTransform.getInputChars();
252         String transliterated;
253 
254         // TODO: Don't transform dates/patterns.
255         // For now, don't try to transliterate the exemplar characters - use the ones from the original locale.
256         // In the future, we can probably control this better with a config file - similar to CLDRModify's config file.
257         if (path.contains("exemplarCharacters")) {
258             if (oldValue != null) {
259                 transliterated = oldValue;
260             } else {
261                 transliterated = value;
262             }
263         } else {
264             transliterated = transliterator.transliterate(value);
265             transliterated = Normalizer.compose(transliterated, false);
266         }
267         if (localeTransform.policy == PolicyIfExisting.MINIMIZE) {
268             if (transliterated.equals(value)) {
269                 return null;
270             }
271         }
272 
273         if (chars.containsSome(transliterated)) {
274             unconverted.addAll(new UnicodeSet().addAll(chars).retainAll(transliterated));
275         }
276         return transliterated;
277     }
278 
main(String[] args)279     public static void main(String[] args) throws Exception {
280         for (String dir : DtdType.ldml.directories) {
281             if (dir.equals("casing") // skip, field contents are keywords, not localizable content
282                 || dir.equals("collation") // skip, field contents are complex, and can't be simply remapped
283                 || dir.equals("annotationsDerived") // skip, derived later
284                 ) {
285                 continue;
286             }
287             System.out.println("\nDirectory: " + dir);
288             final String sourceDirectory = CLDRPaths.COMMON_DIRECTORY + dir + "/";
289             Factory factory = Factory.make(sourceDirectory, ".*");
290 
291             CLDRFileTransformer transformer = new CLDRFileTransformer(factory, CLDRPaths.COMMON_DIRECTORY + "transforms" + File.separator);
292             for (LocaleTransform localeTransform : LocaleTransform.values()) {
293                 CLDRFile output = transformer.transform(localeTransform);
294                 if (output == null) {
295                     System.out.println("SKIPPING missing file: " + dir + "/" + localeTransform.inputLocale + ".xml");
296                     continue;
297                 }
298                 String outputDir = CLDRPaths.GEN_DIRECTORY + "common/" + dir + File.separator;
299                 String outputFile = output.getLocaleID() + ".xml";
300 
301                 try (PrintWriter out = FileUtilities.openUTF8Writer(outputDir, outputFile)) {
302                     System.out.println("Generating locale file: " + outputDir + outputFile);
303                     if (!transformer.unconverted.isEmpty()) {
304                         System.out.println("Untransformed characters: " + transformer.unconverted);
305                         transformer.unconverted.clear();
306                     }
307                     output.write(out);
308                 }
309             }
310         }
311     }
312 }
313