• Home
  • Line#
  • Scopes#
  • Navigate#
  • Raw
  • Download
1 package org.unicode.cldr.tool;
2 
3 import java.io.File;
4 import java.io.PrintWriter;
5 import java.util.Map;
6 import java.util.concurrent.ConcurrentHashMap;
7 
8 import org.unicode.cldr.draft.FileUtilities;
9 import org.unicode.cldr.test.DisplayAndInputProcessor;
10 import org.unicode.cldr.util.CLDRFile;
11 import org.unicode.cldr.util.CLDRPaths;
12 import org.unicode.cldr.util.CLDRTransforms;
13 import org.unicode.cldr.util.CLDRTransforms.ParsedTransformID;
14 import org.unicode.cldr.util.CldrUtility;
15 import org.unicode.cldr.util.DtdType;
16 import org.unicode.cldr.util.Factory;
17 import org.unicode.cldr.util.LocaleIDParser;
18 import org.unicode.cldr.util.SimpleFactory.NoSourceDirectoryException;
19 import org.unicode.cldr.util.SimpleXMLSource;
20 import org.unicode.cldr.util.XMLSource;
21 
22 import com.ibm.icu.text.Normalizer;
23 import com.ibm.icu.text.Transliterator;
24 import com.ibm.icu.text.UnicodeSet;
25 import com.ibm.icu.util.ICUUncheckedIOException;
26 
27 /**
28  * Transforms the contents of a CLDRFile.
29  *
30  * @author jchye
31  */
32 public class CLDRFileTransformer {
33     public enum PolicyIfExisting {
34         RETAIN,  // Do not transliterate if existing output has locale content
35         DISCARD, // Replace existing output locale content
36         MINIMIZE // RETAIN, plus drop values if translit is a no-op.
37     }
38 
39     /**
40      * Contains all supported locale-to-locale conversions along with information
41      * needed to convert each locale. Each enum value is named after the locale that results
42      * from the conversion.
43      */
44     public enum LocaleTransform {
45         sr_Latn("sr", "Serbian-Latin-BGN.xml", Transliterator.FORWARD, "[:script=Cyrl:]", PolicyIfExisting.DISCARD), //
46         sr_Latn_BA("sr_Cyrl_BA", "Serbian-Latin-BGN.xml", Transliterator.FORWARD, "[:script=Cyrl:]", PolicyIfExisting.DISCARD), //
47         sr_Latn_ME("sr_Cyrl_ME", "Serbian-Latin-BGN.xml", Transliterator.FORWARD, "[:script=Cyrl:]", PolicyIfExisting.DISCARD), //
48         sr_Latn_XK("sr_Cyrl_XK", "Serbian-Latin-BGN.xml", Transliterator.FORWARD, "[:script=Cyrl:]", PolicyIfExisting.DISCARD), //
49         ha_NE("ha", "ha-ha_NE.xml", Transliterator.FORWARD, "[y Y ƴ Ƴ ʼ]", PolicyIfExisting.DISCARD), //
50         yo_BJ("yo", "yo-yo_BJ.xml", Transliterator.FORWARD, "[ẹ ọ ṣ Ẹ Ọ Ṣ]", PolicyIfExisting.DISCARD), //
51         de_CH("de", "[ß] Casefold", Transliterator.FORWARD, "[ß]", PolicyIfExisting.MINIMIZE), //
52         yue_Hans("yue", "Simplified-Traditional.xml", Transliterator.REVERSE, "[:script=Hant:]", PolicyIfExisting.RETAIN), //
53         // en_NZ("en_AU", "null", Transliterator.FORWARD, "[]", PolicyIfExisting.DISCARD),
54         // Needs work to fix currency symbols, handle Māori. See http://unicode.org/cldr/trac/ticket/9516#comment:6
55         ;
56 
57         private final String inputLocale;
58         private final String transformFilename;
59         private final int direction;
60         private final UnicodeSet inputChars;
61         private final PolicyIfExisting policy;
62 
63         /**
64          * @deprecated Use {@link #LocaleTransform(String,String,int,String,PolicyIfExisting)} instead
65          */
66         @Deprecated
LocaleTransform(String inputLocale, String transformFilename, int direction, String inputCharPattern)67         private LocaleTransform(String inputLocale, String transformFilename, int direction, String inputCharPattern) {
68             this(inputLocale, transformFilename, direction, inputCharPattern, PolicyIfExisting.DISCARD);
69         }
70 
LocaleTransform(String inputLocale, String transformFilename, int direction, String inputCharPattern, PolicyIfExisting policy)71         private LocaleTransform(String inputLocale, String transformFilename, int direction, String inputCharPattern, PolicyIfExisting policy) {
72             this.inputLocale = inputLocale;
73             this.transformFilename = transformFilename;
74             this.direction = direction;
75             this.inputChars = new UnicodeSet(inputCharPattern);
76             this.policy = policy;
77         }
78 
79         /**
80          * @return the policy for existing content
81          */
getPolicyIfExisting()82         public PolicyIfExisting getPolicyIfExisting() {
83             return policy;
84         }
85 
86         /**
87          * @return the locale that used for conversion
88          */
getInputLocale()89         public String getInputLocale() {
90             return inputLocale;
91         }
92 
93         /**
94          * @return the locale that used for conversion
95          */
getOutputLocale()96         public String getOutputLocale() {
97             return this.toString();
98         }
99 
100         /**
101          * @return the filename of the transform used to make the conversion
102          */
getTransformFilename()103         public String getTransformFilename() {
104             return transformFilename;
105         }
106 
107         /**
108          * @return the direction of the transformation
109          */
getDirection()110         public int getDirection() {
111             return direction;
112         }
113 
114         /**
115          * @return the set of characters in the input locale that should have been removed after
116          *         transformation, used for internal debugging
117          */
getInputChars()118         private UnicodeSet getInputChars() {
119             return inputChars;
120         }
121     }
122 
123     private UnicodeSet unconverted = new UnicodeSet();
124     private Factory factory;
125     /*
126      * The transliterators map exists, and is static, to avoid wasting a lot of time creating
127      * a new Transliterator more often than necessary. (An alternative to "static" here might be to
128      * create only one CLDRFileTransformer, maybe as a member of ExampleGenerator.)
129      * Use ConcurrentHashMap rather than HashMap to avoid concurrency problems.
130      * Reference: https://unicode.org/cldr/trac/ticket/11657
131      */
132     private static Map<LocaleTransform, Transliterator> transliterators = new ConcurrentHashMap<>();
133     private String transformDir;
134 
135     /**
136      * @param factory
137      *            the factory to get locale data from
138      * @param transformDir
139      *            the directory containing the transform files
140      */
CLDRFileTransformer(Factory factory, String transformDir)141     public CLDRFileTransformer(Factory factory, String transformDir) {
142         this.factory = factory;
143         this.transformDir = transformDir;
144     }
145 
loadTransliterator(LocaleTransform localeTransform)146     public Transliterator loadTransliterator(LocaleTransform localeTransform) {
147         if (transliterators.containsKey(localeTransform)) {
148             return transliterators.get(localeTransform);
149         }
150         Transliterator transliterator;
151         if (localeTransform.getTransformFilename().contains(".xml")) {
152             ParsedTransformID directionInfo = new ParsedTransformID();
153             String ruleString = CLDRTransforms.getIcuRulesFromXmlFile(transformDir, localeTransform.getTransformFilename(), directionInfo);
154             transliterator = Transliterator.createFromRules(directionInfo.getId(),
155                 ruleString, localeTransform.getDirection());
156         } else {
157             transliterator = Transliterator.getInstance(localeTransform.getTransformFilename());
158         }
159         transliterators.put(localeTransform, transliterator);
160         return transliterator;
161     }
162 
163     /**
164      * NOTE: This method does not currently handle nested transliterators.
165      *
166      * @param input
167      * @return null if the input file was missing, or if there is no new output file.
168      */
transform(LocaleTransform localeTransform)169     public CLDRFile transform(LocaleTransform localeTransform) {
170         Transliterator transliterator = loadTransliterator(localeTransform);
171         CLDRFile input;
172         try {
173             input = factory.make(localeTransform.getInputLocale(), false);
174         } catch (ICUUncheckedIOException e1) {
175             return null; // input file is missing (or otherwise unavailable)
176         }
177         boolean hadOutput = true;
178         CLDRFile output;
179         try {
180             output = factory.make(localeTransform.getOutputLocale(), false);
181         } catch (NoSourceDirectoryException e) {
182             // if we can't open the file, then just make a new one.
183             XMLSource dataSource = new SimpleXMLSource(localeTransform.getOutputLocale());
184             output = new CLDRFile(dataSource);
185             hadOutput = false;
186         }
187         String outputParentString = LocaleIDParser.getParent(localeTransform.getOutputLocale());
188         CLDRFile outputParent = factory.make(outputParentString, true);
189 
190         outputParent = factory.make(localeTransform.getInputLocale(), false);
191         XMLSource outputSource = new SimpleXMLSource(localeTransform.toString());
192         DisplayAndInputProcessor daip = new DisplayAndInputProcessor(output, true);
193         for (String xpath : input) {
194             String value = input.getStringValue(xpath);
195             if (CldrUtility.INHERITANCE_MARKER.equals(value)) {
196                 value = null;
197             }
198             if (value == null) {
199                 continue;
200             }
201             String fullPath = input.getFullXPath(xpath);
202             String oldValue = output.getStringValue(xpath);
203             String parentValue = outputParent.getStringValue(xpath);
204             value = transformValue(transliterator, localeTransform, xpath, value, oldValue, parentValue);
205             if (value != null && !CldrUtility.INHERITANCE_MARKER.equals(value)) {
206                 value = daip.processInput(xpath, value, null);
207                 outputSource.putValueAtPath(fullPath, value);
208             }
209         }
210         if (!outputSource.iterator().hasNext()) { // empty new output
211             if (!hadOutput) {
212                 return null; // don't add file if nothing to add
213             }
214         }
215         return new CLDRFile(outputSource);
216     }
217 
218     /**
219      * Transforms a CLDRFile value into another form.
220      * @param parentValue
221      */
transformValue(Transliterator transliterator, LocaleTransform localeTransform, String path, String value, String oldValue, String parentValue)222     private String transformValue(Transliterator transliterator, LocaleTransform localeTransform, String path, String value,
223         String oldValue, String parentValue) {
224 
225         // allows us to change only new values
226         switch (localeTransform.policy) {
227         case RETAIN:
228         case MINIMIZE:
229             if (oldValue != null) {
230                 return oldValue;
231             }
232             break;
233         default:
234         }
235 
236         UnicodeSet chars = localeTransform.getInputChars();
237         String transliterated;
238 
239         // TODO: Don't transform dates/patterns.
240         // For now, don't try to transliterate the exemplar characters - use the ones from the original locale.
241         // In the future, we can probably control this better with a config file - similar to CLDRModify's config file.
242         if (path.contains("exemplarCharacters")) {
243             if (oldValue != null) {
244                 transliterated = oldValue;
245             } else {
246                 transliterated = value;
247             }
248         } else {
249             transliterated = transliterator.transliterate(value);
250             transliterated = Normalizer.compose(transliterated, false);
251         }
252         if (localeTransform.policy == PolicyIfExisting.MINIMIZE) {
253             if (transliterated.equals(value)) {
254                 return null;
255             }
256         }
257 
258         if (chars.containsSome(transliterated)) {
259             unconverted.addAll(new UnicodeSet().addAll(chars).retainAll(transliterated));
260         }
261         return transliterated;
262     }
263 
main(String[] args)264     public static void main(String[] args) throws Exception {
265         for (String dir : DtdType.ldml.directories) {
266             if (dir.equals("casing") // skip, field contents are keywords, not localizable content
267                 || dir.equals("collation") // skip, field contents are complex, and can't be simply remapped
268                 || dir.equals("annotationsDerived") // skip, derived later
269                 ) {
270                 continue;
271             }
272             System.out.println("\nDirectory: " + dir);
273             final String sourceDirectory = CLDRPaths.COMMON_DIRECTORY + dir + "/";
274             Factory factory = Factory.make(sourceDirectory, ".*");
275 
276             CLDRFileTransformer transformer = new CLDRFileTransformer(factory, CLDRPaths.COMMON_DIRECTORY + "transforms" + File.separator);
277             for (LocaleTransform localeTransform : LocaleTransform.values()) {
278                 CLDRFile output = transformer.transform(localeTransform);
279                 if (output == null) {
280                     System.out.println("SKIPPING missing file: " + dir + "/" + localeTransform.inputLocale + ".xml");
281                     continue;
282                 }
283                 String outputDir = CLDRPaths.GEN_DIRECTORY + "common/" + dir + File.separator;
284                 String outputFile = output.getLocaleID() + ".xml";
285 
286                 try (PrintWriter out = FileUtilities.openUTF8Writer(outputDir, outputFile)) {
287                     System.out.println("Generating locale file: " + outputDir + outputFile);
288                     if (!transformer.unconverted.isEmpty()) {
289                         System.out.println("Untransformed characters: " + transformer.unconverted);
290                         transformer.unconverted.clear();
291                     }
292                     output.write(out);
293                 }
294             }
295         }
296     }
297 }
298