• Home
  • Line#
  • Scopes#
  • Navigate#
  • Raw
  • Download
1 package org.unicode.cldr.tool;
2 
3 import com.google.common.base.Joiner;
4 import com.google.common.base.Objects;
5 import com.ibm.icu.impl.Relation;
6 import com.ibm.icu.impl.Row;
7 import com.ibm.icu.impl.Row.R3;
8 import com.ibm.icu.lang.CharSequences;
9 import com.ibm.icu.util.ICUException;
10 import com.ibm.icu.util.VersionInfo;
11 import java.io.DataOutputStream;
12 import java.io.File;
13 import java.io.FileOutputStream;
14 import java.io.IOException;
15 import java.io.PrintWriter;
16 import java.util.ArrayList;
17 import java.util.Collections;
18 import java.util.HashMap;
19 import java.util.HashSet;
20 import java.util.List;
21 import java.util.Map;
22 import java.util.Map.Entry;
23 import java.util.Set;
24 import java.util.TreeMap;
25 import java.util.TreeSet;
26 import java.util.regex.Matcher;
27 import java.util.regex.Pattern;
28 import org.unicode.cldr.draft.FileUtilities;
29 import org.unicode.cldr.test.DisplayAndInputProcessor;
30 import org.unicode.cldr.test.OutdatedPaths;
31 import org.unicode.cldr.tool.Option.Options;
32 import org.unicode.cldr.util.CLDRFile;
33 import org.unicode.cldr.util.CLDRPaths;
34 import org.unicode.cldr.util.CldrUtility;
35 import org.unicode.cldr.util.Factory;
36 import org.unicode.cldr.util.LanguageTagParser;
37 import org.unicode.cldr.util.Pair;
38 import org.unicode.cldr.util.PathUtilities;
39 import org.unicode.cldr.util.PatternCache;
40 import org.unicode.cldr.util.SimpleFactory;
41 import org.unicode.cldr.util.StringId;
42 
43 public class GenerateBirth {
44     private static boolean DEBUG = false;
45 
46     static CldrVersion[] VERSIONS;
47 
48     static Factory[] factories;
49 
50     static final Options myOptions =
51             new Options()
52                     .add(
53                             "target",
54                             ".*",
55                             CLDRPaths.BIRTH_DATA_DIR,
56                             "The target directory for building the text files that show the results.")
57                     .add(
58                             "log",
59                             ".*",
60                             CLDRPaths.STAGING_DIRECTORY
61                                     + "births/"
62                                     + CldrVersion.baseline.getVersionInfo().getVersionString(2, 4),
63                             "The target directory for building the text files that show the results.")
64                     .add(
65                             "file",
66                             ".*",
67                             ".*",
68                             "Filter the information based on file name, using a regex argument. The '.xml' is removed from the file before filtering")
69                     .add("previous", "Stop after writing the English previous data.")
70                     .add(
71                             "oldest",
72                             "\\d+(\\.\\d+)?",
73                             "38.0",
74                             "Oldest version to go back to, eg 36.1")
75                     .add("debug", "Debug");
76 
main(String[] args)77     public static void main(String[] args) throws IOException {
78         System.out.println("Run TestOutdatedPaths.java -v to see a listing of changes.");
79         myOptions.parse(args, true);
80         DEBUG = myOptions.get("debug").doesOccur();
81 
82         try {
83             CldrVersion.checkVersions(); // verify versions up to date
84         } catch (Exception e) {
85             throw new ICUException(
86                     "This tool can only be run if the archive of released versions matching CldrVersion is available.",
87                     e);
88         }
89 
90         // generate the list for as far as we want to go back
91 
92         VersionInfo oldest = VersionInfo.getInstance(myOptions.get("oldest").getValue());
93         List<CldrVersion> versions = new ArrayList<>();
94         boolean foundStart = false;
95         for (CldrVersion version : CldrVersion.CLDR_VERSIONS_DESCENDING) {
96             versions.add(version);
97             if (version.getVersionInfo() == oldest) {
98                 foundStart = true;
99                 break;
100             }
101         }
102         if (!foundStart) {
103             throw new IllegalArgumentException(
104                     "The last version is "
105                             + myOptions.get("oldest").getValue()
106                             + "; it must be in: "
107                             + Joiner.on(", ").join(CldrVersion.CLDR_VERSIONS_DESCENDING));
108         }
109         VERSIONS = versions.toArray(new CldrVersion[versions.size()]);
110 
111         // set up the CLDR Factories for each version
112         factories = new Factory[VERSIONS.length]; // hack for now; should change to list
113 
114         String filePattern = myOptions.get("file").getValue();
115 
116         ArrayList<Factory> list = new ArrayList<>();
117         for (CldrVersion version : VERSIONS) {
118             if (version == CldrVersion.unknown) {
119                 continue;
120             }
121             List<File> paths = version.getPathsForFactory();
122 
123             System.out.println(version + ", " + paths);
124             Factory aFactory =
125                     SimpleFactory.make(paths.toArray(new File[paths.size()]), filePattern);
126             list.add(aFactory);
127         }
128         list.toArray(factories);
129 
130         final String dataDirectory = myOptions.get("target").getValue();
131         File dataDir = new File(dataDirectory);
132         if (!dataDir.isDirectory()) {
133             throw new IllegalArgumentException("-t value is not directory: " + dataDir);
134         }
135 
136         // load and process English
137 
138         String logDirectory = myOptions.get("log").getValue();
139 
140         System.out.println("en\tBegin");
141         Births english = new Births("en");
142         english.writeBirth(logDirectory, "en", null);
143         String englishDataFile = dataDirectory + "/" + OutdatedPaths.OUTDATED_ENGLISH_DATA;
144         english.writeBirthValues(englishDataFile);
145 
146         Map<Long, Pair<CldrVersion, String>> pathToPrevious = new HashMap<>();
147 
148         // Verify that the write of English worked
149 
150         OutdatedPaths.readBirthValues(dataDirectory, null, pathToPrevious);
151         for (Entry<String, R3<CldrVersion, String, String>> entry :
152                 english.pathToBirthCurrentPrevious.entrySet()) {
153             String path = entry.getKey();
154             String previous = entry.getValue().get2();
155             CldrVersion birth = entry.getValue().get0();
156             if (previous == null) {
157                 previous = OutdatedPaths.NO_VALUE;
158             }
159             long id = StringId.getId(path);
160             Pair<CldrVersion, String> readValue = pathToPrevious.get(id);
161             CldrVersion birthRead = readValue == null ? null : readValue.getFirst();
162             String previousRead = readValue == null ? null : readValue.getSecond();
163             if (!Objects.equal(previous, previousRead) || !Objects.equal(birth, birthRead)) {
164                 throw new IllegalArgumentException(
165                         "path: "
166                                 + path
167                                 + "\tprevious: "
168                                 + previous
169                                 + "\tread: "
170                                 + readValue
171                                 + "\tbirth: "
172                                 + birth
173                                 + "\tread: "
174                                 + birthRead);
175             }
176         }
177 
178         // Set up the binary data files for all others
179 
180         File file = new File(dataDirectory + "/" + OutdatedPaths.OUTDATED_DATA);
181         final String outputDataFile = PathUtilities.getNormalizedPathString(file);
182         TreeMap<String, Set<String>> localeToNewer = new TreeMap<>();
183 
184         System.out.println("Writing data: " + outputDataFile);
185         try (DataOutputStream dataOut = new DataOutputStream(new FileOutputStream(file))) {
186             dataOut.writeUTF(OutdatedPaths.FORMAT_KEY);
187 
188             // Load and process all the locales
189 
190             LanguageTagParser ltp = new LanguageTagParser();
191             for (String fileName : factories[0].getAvailable()) {
192                 if (fileName.equals("en")) {
193                     continue;
194                 }
195                 if (!ltp.set(fileName).getRegion().isEmpty()) {
196                     continue; // skip region locales
197                 }
198                 // TODO skip default content
199                 System.out.println();
200                 System.out.println(fileName + "\t" + "Begin");
201                 Births other = new Births(fileName);
202                 Set<String> newer = other.writeBirth(logDirectory, fileName, english);
203 
204                 dataOut.writeUTF(fileName);
205                 dataOut.writeInt(newer.size());
206                 for (String item : newer) {
207                     long id = StringId.getId(item);
208                     dataOut.writeLong(id);
209                     if (DEBUG) {
210                         System.out.println(id + "\t" + item);
211                     }
212                 }
213                 localeToNewer.put(fileName, newer);
214             }
215             dataOut.writeUTF("$END$");
216         }
217 
218         // Doublecheck the data
219 
220         OutdatedPaths outdatedPaths = new OutdatedPaths(dataDirectory);
221         Set<String> needPrevious = new TreeSet<>();
222         int errorCount = 0;
223         for (Entry<String, Set<String>> localeAndNewer : localeToNewer.entrySet()) {
224             String locale = localeAndNewer.getKey();
225             System.out.println("Checking " + locale);
226             Set<String> newer = localeAndNewer.getValue();
227             if (newer.size() != outdatedPaths.countOutdated(locale)) {
228                 throw new IllegalArgumentException("broken: " + locale);
229             }
230             for (String xpath : newer) {
231                 boolean isOutdated = outdatedPaths.isRawOutdated(locale, xpath);
232                 if (!isOutdated) {
233                     System.out.println(
234                             "Error, broken locale: "
235                                     + locale
236                                     + "\t"
237                                     + StringId.getId(xpath)
238                                     + "\t"
239                                     + xpath);
240                     ++errorCount;
241                 }
242                 if (outdatedPaths.isSkipped(xpath)) {
243                     continue;
244                 }
245                 String previous = outdatedPaths.getPreviousEnglish(xpath);
246                 if (previous.isEmpty() != english.emptyPrevious.contains(xpath)) {
247                     System.out.println(
248                             "previous.isEmpty() != original "
249                                     + locale
250                                     + "\t"
251                                     + StringId.getId(xpath)
252                                     + "\t"
253                                     + xpath);
254                     needPrevious.add(xpath);
255                     ++errorCount;
256                 }
257             }
258         }
259         // give a reminder since the above will be lost
260         System.out.println("Wrote: " + englishDataFile);
261         if (errorCount != 0) {
262             throw new IllegalArgumentException(
263                     "Done, but " + errorCount + " errors writing to " + outputDataFile);
264         } else {
265             System.out.println("Done, no errors writing to: " + outputDataFile);
266         }
267         System.out.println("Please commit the above two files and start a PR.");
268     }
269 
270     static class Births {
271         private static final boolean USE_RESOLVED = false;
272         final Relation<CldrVersion, String> birthToPaths;
273         final Map<String, Row.R3<CldrVersion, String, String>> pathToBirthCurrentPrevious;
274         final String locale;
275         static final Pattern TYPE = PatternCache.get("\\[@type=\"([^\"]*)\"");
276         final Matcher typeMatcher = TYPE.matcher("");
277         Set<String> emptyPrevious = new HashSet<>();
278 
Births(String file)279         Births(String file) {
280             locale = file;
281 
282             CLDRFile[] files = new CLDRFile[factories.length];
283             DisplayAndInputProcessor[] processors = new DisplayAndInputProcessor[factories.length];
284 
285             for (int i = 0; i < factories.length; ++i) {
286                 final CldrVersion ver = CldrVersion.CLDR_VERSIONS_DESCENDING.get(i);
287                 try {
288                     files[i] = factories[i].make(file, USE_RESOLVED);
289                     processors[i] = new DisplayAndInputProcessor(files[i], false);
290                 } catch (SimpleFactory.NoSourceDirectoryException nsd) {
291                     // stop when we fail to find a dir
292                     System.out.println(
293                             String.format("%s\tEnd of source directories at v%s", file, ver));
294                     break;
295                 } catch (Throwable t) {
296                     throw new RuntimeException(
297                             "Exception while processing " + file + " v" + ver, t);
298                 }
299             }
300             System.out.println(String.format("%s\tDone", file));
301             birthToPaths = Relation.of(new TreeMap<CldrVersion, Set<String>>(), TreeSet.class);
302             pathToBirthCurrentPrevious = new HashMap<>();
303             for (String xpath : files[0]) {
304                 xpath = xpath.intern();
305                 if (xpath.contains("[@type=\"ar\"]")) {
306                     int debug = 0;
307                 }
308                 String base = getProcessedStringValue(0, xpath, files, processors);
309 
310                 String previousValue = null;
311                 int i;
312                 CLDRFile lastFile = files[0];
313                 for (i = 1; i < files.length && files[i] != null; ++i) {
314                     String previous = getProcessedStringValue(i, xpath, files, processors);
315                     if (previous == null) {
316                         previous = OutdatedPaths.NO_VALUE; // fixNullPrevious(xpath);
317                     }
318                     if (!CharSequences.equals(base, previous)) {
319                         if (previous != null) {
320                             previousValue = previous;
321                         }
322                         break;
323                     }
324                     lastFile = files[i];
325                 }
326                 CldrVersion version = CldrVersion.from(lastFile.getDtdVersionInfo());
327                 birthToPaths.put(version, xpath);
328                 pathToBirthCurrentPrevious.put(xpath, Row.of(version, base, previousValue));
329             }
330         }
331 
getProcessedStringValue( int fileNumber, String xpath, CLDRFile[] files, DisplayAndInputProcessor[] processors)332         public String getProcessedStringValue(
333                 int fileNumber,
334                 String xpath,
335                 CLDRFile[] files,
336                 DisplayAndInputProcessor[] processors) {
337             String base = files[fileNumber].getStringValue(xpath);
338             if (base != null) {
339                 base = processors[fileNumber].processInput(xpath, base, null);
340             }
341             return base;
342         }
343 
fixNullPrevious(String xpath)344         private String fixNullPrevious(String xpath) {
345             if (typeMatcher.reset(xpath).find()) {
346                 String type = typeMatcher.group(1);
347                 if (xpath.contains("metazone")) {
348                     return type.replace("_", " ");
349                 } else if (xpath.contains("zone")) {
350                     String[] splits = type.split("/");
351                     return splits[splits.length - 1].replace("_", " ");
352                 }
353                 return type;
354             }
355             return null;
356         }
357 
writeBirthValues(String file)358         public void writeBirthValues(String file) throws IOException {
359             try (DataOutputStream dataOut = new DataOutputStream(new FileOutputStream(file))) {
360                 dataOut.writeUTF(OutdatedPaths.FORMAT_KEY);
361                 System.out.println("Writing data: " + PathUtilities.getNormalizedPathString(file));
362                 dataOut.writeInt(pathToBirthCurrentPrevious.size());
363 
364                 // Load and process all the locales
365 
366                 // TreeMap<String, Set<String>> localeToNewer = new TreeMap<String, Set<String>>();
367                 for (Entry<String, R3<CldrVersion, String, String>> entry :
368                         pathToBirthCurrentPrevious.entrySet()) {
369                     String path = entry.getKey();
370                     R3<CldrVersion, String, String> birthCurrentPrevious = entry.getValue();
371                     CldrVersion birth = birthCurrentPrevious.get0();
372                     String current = birthCurrentPrevious.get1();
373                     String previous = birthCurrentPrevious.get2();
374                     long id = StringId.getId(path);
375                     dataOut.writeLong(id);
376                     final String previousString =
377                             previous == null ? OutdatedPaths.NO_VALUE : previous;
378                     dataOut.writeUTF(previousString);
379                     if (previous == null) {
380                         emptyPrevious.add(path);
381                     }
382                     dataOut.writeUTF(birth.toString());
383                     if (true) {
384                         System.out.println(
385                                 id + "\t" + birth + "\t«" + current + "⇐" + previous + "»");
386                     }
387                 }
388                 dataOut.writeUTF("$END$");
389                 emptyPrevious = Collections.unmodifiableSet(emptyPrevious);
390             }
391         }
392 
writeBirth(PrintWriter out, Births onlyNewer)393         Set<String> writeBirth(PrintWriter out, Births onlyNewer) {
394 
395             out.println("Loc\tVersion\tValue\tPrevValue\tEVersion\tEValue\tEPrevValue\tPath");
396 
397             Set<String> newer = new HashSet<>();
398             HashMap<Long, String> sanityCheck = new HashMap<>();
399             CldrVersion onlyNewerVersion = null;
400             String otherValue = "n/a";
401             String olderOtherValue = "n/a";
402             for (Entry<CldrVersion, Set<String>> entry2 : birthToPaths.keyValuesSet()) {
403                 CldrVersion version = entry2.getKey();
404                 for (String xpath : entry2.getValue()) {
405                     long id = StringId.getId(xpath);
406                     String old = sanityCheck.get(id);
407                     if (old != null) {
408                         throw new IllegalArgumentException(
409                                 "Path Collision " + xpath + ", old:" + old + ", id: " + id);
410                     } else {
411                         sanityCheck.put(id, xpath);
412                     }
413                     R3<CldrVersion, String, String> info = pathToBirthCurrentPrevious.get(xpath);
414                     if (onlyNewer != null) {
415 
416                         R3<CldrVersion, String, String> otherInfo =
417                                 onlyNewer.pathToBirthCurrentPrevious.get(xpath);
418                         if (otherInfo == null) {
419                             continue;
420                         }
421                         // skip if not older than "comparison version"
422                         onlyNewerVersion = otherInfo.get0();
423                         if (!version.isOlderThan(onlyNewerVersion)) {
424                             continue;
425                         }
426                         otherValue = fixNull(otherInfo.get1());
427                         olderOtherValue = fixNull(otherInfo.get2());
428                         newer.add(xpath);
429                     }
430                     String value = fixNull(info.get1());
431                     String olderValue = fixNull(info.get2());
432 
433                     out.println(
434                             locale
435                                     + "\t"
436                                     + version
437                                     + "\t"
438                                     + value
439                                     + "\t"
440                                     + olderValue
441                                     + "\t"
442                                     + CldrUtility.ifNull(onlyNewerVersion, "n/a")
443                                     + "\t"
444                                     + otherValue
445                                     + "\t"
446                                     + olderOtherValue
447                                     + "\t"
448                                     + xpath);
449                 }
450             }
451             return newer;
452         }
453 
fixNull(String value)454         private String fixNull(String value) {
455             if (value == null) {
456                 value = OutdatedPaths.NO_VALUE;
457             }
458             return value;
459         }
460 
writeBirth(String directory, String filename, Births onlyNewer)461         Set<String> writeBirth(String directory, String filename, Births onlyNewer)
462                 throws IOException {
463             try (PrintWriter out = FileUtilities.openUTF8Writer(directory, filename + ".txt")) {
464                 Set<String> newer = writeBirth(out, onlyNewer);
465                 return newer;
466             }
467         }
468     }
469 }
470