package org.unicode.cldr.tool;

import com.google.common.base.Joiner;
import com.google.common.base.Objects;
import com.ibm.icu.impl.Relation;
import com.ibm.icu.impl.Row;
import com.ibm.icu.impl.Row.R3;
import com.ibm.icu.lang.CharSequences;
import com.ibm.icu.util.ICUException;
import com.ibm.icu.util.VersionInfo;
import java.io.DataOutputStream;
import java.io.File;
import java.io.FileOutputStream;
import java.io.IOException;
import java.io.PrintWriter;
import java.util.ArrayList;
import java.util.Collections;
import java.util.HashMap;
import java.util.HashSet;
import java.util.List;
import java.util.Map;
import java.util.Map.Entry;
import java.util.Set;
import java.util.TreeMap;
import java.util.TreeSet;
import java.util.regex.Matcher;
import java.util.regex.Pattern;
import org.unicode.cldr.draft.FileUtilities;
import org.unicode.cldr.test.DisplayAndInputProcessor;
import org.unicode.cldr.test.OutdatedPaths;
import org.unicode.cldr.tool.Option.Options;
import org.unicode.cldr.util.CLDRFile;
import org.unicode.cldr.util.CLDRPaths;
import org.unicode.cldr.util.CldrUtility;
import org.unicode.cldr.util.Factory;
import org.unicode.cldr.util.LanguageTagParser;
import org.unicode.cldr.util.Pair;
import org.unicode.cldr.util.PathUtilities;
import org.unicode.cldr.util.PatternCache;
import org.unicode.cldr.util.SimpleFactory;
import org.unicode.cldr.util.StringId;

/**
 * Command-line tool that determines, for every path of every (non-region) locale, the oldest CLDR
 * version in which the path already had its current value (its "birth"), and writes:
 *
 * <ul>
 *   <li>a per-locale tab-separated log file into the "log" directory,
 *   <li>a binary file of English birth/previous values ({@link
 *       OutdatedPaths#OUTDATED_ENGLISH_DATA}), and
 *   <li>a binary file listing, per locale, the paths whose birth is older than the English birth
 *       ({@link OutdatedPaths#OUTDATED_DATA}).
 * </ul>
 *
 * After writing, the files are read back through {@link OutdatedPaths} and cross-checked.
 */
public class GenerateBirth {
    private static boolean DEBUG = false;

    /** Versions to examine, newest first, down to and including the "oldest" option. */
    static CldrVersion[] VERSIONS;

    /** One factory per examined version, in the same (descending) order; contains no nulls. */
    static Factory[] factories;

    static final Options myOptions =
            new Options()
                    .add(
                            "target",
                            ".*",
                            CLDRPaths.BIRTH_DATA_DIR,
                            "The target directory for building the text files that show the results.")
                    // NOTE(review): this help text is identical to the one for "target";
                    // it looks like a copy-paste and probably should describe the log directory.
                    .add(
                            "log",
                            ".*",
                            CLDRPaths.STAGING_DIRECTORY
                                    + "births/"
                                    + CldrVersion.baseline.getVersionInfo().getVersionString(2, 4),
                            "The target directory for building the text files that show the results.")
                    .add(
                            "file",
                            ".*",
                            ".*",
                            "Filter the information based on file name, using a regex argument. The '.xml' is removed from the file before filtering")
                    .add("previous", "Stop after writing the English previous data.")
                    .add(
                            "oldest",
                            "\\d+(\\.\\d+)?",
                            "38.0",
                            "Oldest version to go back to, eg 36.1")
                    .add("debug", "Debug");

    /**
     * Runs the tool: builds the per-version factories, computes and writes the English birth
     * data, then computes and writes the outdated-path data for all other locales, and finally
     * re-reads the written files to double-check them.
     *
     * @param args command-line arguments, parsed by {@link #myOptions}
     * @throws IOException if one of the output files cannot be written
     * @throws ICUException if the archive of released versions does not match {@link CldrVersion}
     * @throws IllegalArgumentException if the "oldest" version is not a known version, the target
     *     is not a directory, or the verification of the written data fails
     */
    public static void main(String[] args) throws IOException {
        System.out.println("Run TestOutdatedPaths.java -v to see a listing of changes.");
        myOptions.parse(args, true);
        DEBUG = myOptions.get("debug").doesOccur();

        try {
            CldrVersion.checkVersions(); // verify versions up to date
        } catch (Exception e) {
            throw new ICUException(
                    "This tool can only be run if the archive of released versions matching CldrVersion is available.",
                    e);
        }

        // generate the list for as far as we want to go back

        VersionInfo oldest = VersionInfo.getInstance(myOptions.get("oldest").getValue());
        List<CldrVersion> versions = new ArrayList<>();
        boolean foundStart = false;
        for (CldrVersion version : CldrVersion.CLDR_VERSIONS_DESCENDING) {
            versions.add(version);
            // Value comparison; do not rely on VersionInfo instances being shared.
            if (oldest.equals(version.getVersionInfo())) {
                foundStart = true;
                break;
            }
        }
        if (!foundStart) {
            throw new IllegalArgumentException(
                    "The last version is "
                            + myOptions.get("oldest").getValue()
                            + "; it must be in: "
                            + Joiner.on(", ").join(CldrVersion.CLDR_VERSIONS_DESCENDING));
        }
        VERSIONS = versions.toArray(new CldrVersion[versions.size()]);

        // set up the CLDR Factories for each version

        String filePattern = myOptions.get("file").getValue();

        List<Factory> list = new ArrayList<>();
        for (CldrVersion version : VERSIONS) {
            if (version == CldrVersion.unknown) {
                continue;
            }
            List<File> paths = version.getPathsForFactory();

            System.out.println(version + ", " + paths);
            Factory aFactory =
                    SimpleFactory.make(paths.toArray(new File[paths.size()]), filePattern);
            list.add(aFactory);
        }
        // Size the array from the list: a version may have been skipped above, and an array
        // pre-sized to VERSIONS.length would then keep trailing null entries that the Births
        // constructor would dereference.
        factories = list.toArray(new Factory[0]);

        final String dataDirectory = myOptions.get("target").getValue();
        File dataDir = new File(dataDirectory);
        if (!dataDir.isDirectory()) {
            throw new IllegalArgumentException("-t value is not directory: " + dataDir);
        }

        // load and process English

        String logDirectory = myOptions.get("log").getValue();

        System.out.println("en\tBegin");
        Births english = new Births("en");
        english.writeBirth(logDirectory, "en", null);
        String englishDataFile = dataDirectory + "/" + OutdatedPaths.OUTDATED_ENGLISH_DATA;
        english.writeBirthValues(englishDataFile);

        Map<Long, Pair<CldrVersion, String>> pathToPrevious = new HashMap<>();

        // Verify that the write of English worked

        OutdatedPaths.readBirthValues(dataDirectory, null, pathToPrevious);
        for (Entry<String, R3<CldrVersion, String, String>> entry :
                english.pathToBirthCurrentPrevious.entrySet()) {
            String path = entry.getKey();
            String previous = entry.getValue().get2();
            CldrVersion birth = entry.getValue().get0();
            if (previous == null) {
                // writeBirthValues stores NO_VALUE for a missing previous value
                previous = OutdatedPaths.NO_VALUE;
            }
            long id = StringId.getId(path);
            Pair<CldrVersion, String> readValue = pathToPrevious.get(id);
            CldrVersion birthRead = readValue == null ? null : readValue.getFirst();
            String previousRead = readValue == null ? null : readValue.getSecond();
            if (!Objects.equal(previous, previousRead) || !Objects.equal(birth, birthRead)) {
                throw new IllegalArgumentException(
                        "path: "
                                + path
                                + "\tprevious: "
                                + previous
                                + "\tread: "
                                + readValue
                                + "\tbirth: "
                                + birth
                                + "\tread: "
                                + birthRead);
            }
        }

        // Set up the binary data files for all others

        File file = new File(dataDirectory + "/" + OutdatedPaths.OUTDATED_DATA);
        final String outputDataFile = PathUtilities.getNormalizedPathString(file);
        TreeMap<String, Set<String>> localeToNewer = new TreeMap<>();

        System.out.println("Writing data: " + outputDataFile);
        try (DataOutputStream dataOut = new DataOutputStream(new FileOutputStream(file))) {
            dataOut.writeUTF(OutdatedPaths.FORMAT_KEY);

            // Load and process all the locales

            LanguageTagParser ltp = new LanguageTagParser();
            for (String fileName : factories[0].getAvailable()) {
                if (fileName.equals("en")) {
                    continue;
                }
                if (!ltp.set(fileName).getRegion().isEmpty()) {
                    continue; // skip region locales
                }
                // TODO skip default content
                System.out.println();
                System.out.println(fileName + "\t" + "Begin");
                Births other = new Births(fileName);
                Set<String> newer = other.writeBirth(logDirectory, fileName, english);

                // Record layout: locale name, count, then one path id per outdated path
                dataOut.writeUTF(fileName);
                dataOut.writeInt(newer.size());
                for (String item : newer) {
                    long id = StringId.getId(item);
                    dataOut.writeLong(id);
                    if (DEBUG) {
                        System.out.println(id + "\t" + item);
                    }
                }
                localeToNewer.put(fileName, newer);
            }
            dataOut.writeUTF("$END$");
        }

        // Doublecheck the data

        OutdatedPaths outdatedPaths = new OutdatedPaths(dataDirectory);
        Set<String> needPrevious = new TreeSet<>();
        int errorCount = 0;
        for (Entry<String, Set<String>> localeAndNewer : localeToNewer.entrySet()) {
            String locale = localeAndNewer.getKey();
            System.out.println("Checking " + locale);
            Set<String> newer = localeAndNewer.getValue();
            if (newer.size() != outdatedPaths.countOutdated(locale)) {
                throw new IllegalArgumentException("broken: " + locale);
            }
            for (String xpath : newer) {
                boolean isOutdated = outdatedPaths.isRawOutdated(locale, xpath);
                if (!isOutdated) {
                    System.out.println(
                            "Error, broken locale: "
                                    + locale
                                    + "\t"
                                    + StringId.getId(xpath)
                                    + "\t"
                                    + xpath);
                    ++errorCount;
                }
                if (outdatedPaths.isSkipped(xpath)) {
                    continue;
                }
                String previous = outdatedPaths.getPreviousEnglish(xpath);
                if (previous.isEmpty() != english.emptyPrevious.contains(xpath)) {
                    System.out.println(
                            "previous.isEmpty() != original "
                                    + locale
                                    + "\t"
                                    + StringId.getId(xpath)
                                    + "\t"
                                    + xpath);
                    needPrevious.add(xpath);
                    ++errorCount;
                }
            }
        }
        // give a reminder since the above will be lost
        System.out.println("Wrote: " + englishDataFile);
        if (errorCount != 0) {
            throw new IllegalArgumentException(
                    "Done, but " + errorCount + " errors writing to " + outputDataFile);
        } else {
            System.out.println("Done, no errors writing to: " + outputDataFile);
        }
        System.out.println("Please commit the above two files and start a PR.");
    }

    /**
     * Birth information for a single locale: for every path in the newest version of the locale,
     * the oldest examined version that still has the same (processed) value, the current value,
     * and the first differing older value (if any).
     */
    static class Births {
        private static final boolean USE_RESOLVED = false;

        /** Birth version to the set of paths born in that version. */
        final Relation<CldrVersion, String> birthToPaths;

        /** Path to (birth version, current value, previous value or null). */
        final Map<String, Row.R3<CldrVersion, String, String>> pathToBirthCurrentPrevious;

        final String locale;
        static final Pattern TYPE = PatternCache.get("\\[@type=\"([^\"]*)\"");
        final Matcher typeMatcher = TYPE.matcher("");

        /**
         * Paths written by {@link #writeBirthValues(String)} with no previous value. Becomes
         * unmodifiable once {@link #writeBirthValues(String)} has completed.
         */
        Set<String> emptyPrevious = new HashSet<>();

        /**
         * Loads the locale from each factory in {@link GenerateBirth#factories} (newest first) and
         * computes the birth data for every path of the newest file.
         *
         * @param file the locale ID to load
         * @throws RuntimeException if loading a version fails for a reason other than a missing
         *     source directory
         */
        Births(String file) {
            locale = file;

            CLDRFile[] files = new CLDRFile[factories.length];
            DisplayAndInputProcessor[] processors = new DisplayAndInputProcessor[factories.length];

            for (int i = 0; i < factories.length; ++i) {
                // used only for the log/exception messages below
                final CldrVersion ver = CldrVersion.CLDR_VERSIONS_DESCENDING.get(i);
                try {
                    files[i] = factories[i].make(file, USE_RESOLVED);
                    processors[i] = new DisplayAndInputProcessor(files[i], false);
                } catch (SimpleFactory.NoSourceDirectoryException nsd) {
                    // stop when we fail to find a dir
                    System.out.println(
                            String.format("%s\tEnd of source directories at v%s", file, ver));
                    break;
                } catch (Throwable t) {
                    throw new RuntimeException(
                            "Exception while processing " + file + " v" + ver, t);
                }
            }
            System.out.println(String.format("%s\tDone", file));
            birthToPaths = Relation.of(new TreeMap<CldrVersion, Set<String>>(), TreeSet.class);
            pathToBirthCurrentPrevious = new HashMap<>();
            for (String xpath : files[0]) {
                xpath = xpath.intern();
                String base = getProcessedStringValue(0, xpath, files, processors);

                // Walk back through older versions until the value differs from the current
                // one; lastFile ends up as the oldest loaded file that still has the same value.
                String previousValue = null;
                CLDRFile lastFile = files[0];
                for (int i = 1; i < files.length && files[i] != null; ++i) {
                    String previous = getProcessedStringValue(i, xpath, files, processors);
                    if (previous == null) {
                        previous = OutdatedPaths.NO_VALUE; // fixNullPrevious(xpath);
                    }
                    if (!CharSequences.equals(base, previous)) {
                        previousValue = previous;
                        break;
                    }
                    lastFile = files[i];
                }
                CldrVersion version = CldrVersion.from(lastFile.getDtdVersionInfo());
                birthToPaths.put(version, xpath);
                pathToBirthCurrentPrevious.put(xpath, Row.of(version, base, previousValue));
            }
        }

        /**
         * Gets the string value of a path from one of the loaded files, run through that file's
         * {@link DisplayAndInputProcessor}.
         *
         * @param fileNumber index into {@code files} and {@code processors}
         * @param xpath the path to look up
         * @param files the loaded files, one per version
         * @param processors the input processors, parallel to {@code files}
         * @return the processed value, or null if the file has no value for the path
         */
        public String getProcessedStringValue(
                int fileNumber,
                String xpath,
                CLDRFile[] files,
                DisplayAndInputProcessor[] processors) {
            String base = files[fileNumber].getStringValue(xpath);
            if (base != null) {
                base = processors[fileNumber].processInput(xpath, base, null);
            }
            return base;
        }

        /**
         * Derives a fallback value from the first {@code type} attribute of the path. Not called
         * in this file at present; it is referenced only by a comment in the constructor.
         *
         * @param xpath the path to inspect
         * @return the type value (with underscores replaced by spaces for metazone paths, or the
         *     last slash-separated segment with underscores replaced by spaces for other zone
         *     paths), or null if the path has no type attribute
         */
        private String fixNullPrevious(String xpath) {
            if (typeMatcher.reset(xpath).find()) {
                String type = typeMatcher.group(1);
                if (xpath.contains("metazone")) {
                    return type.replace("_", " ");
                } else if (xpath.contains("zone")) {
                    String[] splits = type.split("/");
                    return splits[splits.length - 1].replace("_", " ");
                }
                return type;
            }
            return null;
        }

        /**
         * Writes the birth data of this locale in binary form: the format key, the number of
         * paths, then for each path its id, previous value ({@link OutdatedPaths#NO_VALUE} if
         * none) and birth version, followed by an end marker. Also records the paths without a
         * previous value in {@link #emptyPrevious} and then makes that set unmodifiable.
         *
         * @param file the name of the file to write
         * @throws IOException if the file cannot be written
         */
        public void writeBirthValues(String file) throws IOException {
            try (DataOutputStream dataOut = new DataOutputStream(new FileOutputStream(file))) {
                dataOut.writeUTF(OutdatedPaths.FORMAT_KEY);
                System.out.println("Writing data: " + PathUtilities.getNormalizedPathString(file));
                dataOut.writeInt(pathToBirthCurrentPrevious.size());

                for (Entry<String, R3<CldrVersion, String, String>> entry :
                        pathToBirthCurrentPrevious.entrySet()) {
                    String path = entry.getKey();
                    R3<CldrVersion, String, String> birthCurrentPrevious = entry.getValue();
                    CldrVersion birth = birthCurrentPrevious.get0();
                    String current = birthCurrentPrevious.get1();
                    String previous = birthCurrentPrevious.get2();
                    long id = StringId.getId(path);
                    dataOut.writeLong(id);
                    final String previousString =
                            previous == null ? OutdatedPaths.NO_VALUE : previous;
                    dataOut.writeUTF(previousString);
                    if (previous == null) {
                        emptyPrevious.add(path);
                    }
                    dataOut.writeUTF(birth.toString());
                    System.out.println(
                            id + "\t" + birth + "\t«" + current + "⇐" + previous + "»");
                }
                dataOut.writeUTF("$END$");
                emptyPrevious = Collections.unmodifiableSet(emptyPrevious);
            }
        }

        /**
         * Writes one tab-separated line per path to {@code out}, grouped by birth version.
         *
         * <p>If {@code onlyNewer} is non-null, only paths that also exist in {@code onlyNewer}
         * and whose birth version here is older than their birth version in {@code onlyNewer}
         * are written, and those paths are collected in the result.
         *
         * @param out the writer receiving the listing
         * @param onlyNewer the births to compare against (English, in this tool), or null to list
         *     every path
         * @return the paths selected by the comparison; empty if {@code onlyNewer} is null
         * @throws IllegalArgumentException if two paths have the same {@link StringId}
         */
        Set<String> writeBirth(PrintWriter out, Births onlyNewer) {

            out.println("Loc\tVersion\tValue\tPrevValue\tEVersion\tEValue\tEPrevValue\tPath");

            Set<String> newer = new HashSet<>();
            HashMap<Long, String> sanityCheck = new HashMap<>();
            CldrVersion onlyNewerVersion = null;
            String otherValue = "n/a";
            String olderOtherValue = "n/a";
            for (Entry<CldrVersion, Set<String>> entry2 : birthToPaths.keyValuesSet()) {
                CldrVersion version = entry2.getKey();
                for (String xpath : entry2.getValue()) {
                    // The binary files identify paths by id, so ids must be unique.
                    long id = StringId.getId(xpath);
                    String old = sanityCheck.get(id);
                    if (old != null) {
                        throw new IllegalArgumentException(
                                "Path Collision " + xpath + ", old:" + old + ", id: " + id);
                    } else {
                        sanityCheck.put(id, xpath);
                    }
                    R3<CldrVersion, String, String> info = pathToBirthCurrentPrevious.get(xpath);
                    if (onlyNewer != null) {

                        R3<CldrVersion, String, String> otherInfo =
                                onlyNewer.pathToBirthCurrentPrevious.get(xpath);
                        if (otherInfo == null) {
                            continue;
                        }
                        // skip if not older than "comparison version"
                        onlyNewerVersion = otherInfo.get0();
                        if (!version.isOlderThan(onlyNewerVersion)) {
                            continue;
                        }
                        otherValue = fixNull(otherInfo.get1());
                        olderOtherValue = fixNull(otherInfo.get2());
                        newer.add(xpath);
                    }
                    String value = fixNull(info.get1());
                    String olderValue = fixNull(info.get2());

                    out.println(
                            locale
                                    + "\t"
                                    + version
                                    + "\t"
                                    + value
                                    + "\t"
                                    + olderValue
                                    + "\t"
                                    + CldrUtility.ifNull(onlyNewerVersion, "n/a")
                                    + "\t"
                                    + otherValue
                                    + "\t"
                                    + olderOtherValue
                                    + "\t"
                                    + xpath);
                }
            }
            return newer;
        }

        /**
         * Replaces a null value by {@link OutdatedPaths#NO_VALUE}.
         *
         * @param value the value, possibly null
         * @return {@code value}, or {@link OutdatedPaths#NO_VALUE} if it is null
         */
        private String fixNull(String value) {
            if (value == null) {
                value = OutdatedPaths.NO_VALUE;
            }
            return value;
        }

        /**
         * Writes the listing produced by {@link #writeBirth(PrintWriter, Births)} to the UTF-8
         * file {@code filename + ".txt"} in {@code directory}.
         *
         * @param directory the directory to write into
         * @param filename the file name without the {@code .txt} extension
         * @param onlyNewer the births to compare against, or null to list every path
         * @return the result of {@link #writeBirth(PrintWriter, Births)}
         * @throws IOException if the file cannot be opened or written
         */
        Set<String> writeBirth(String directory, String filename, Births onlyNewer)
                throws IOException {
            try (PrintWriter out = FileUtilities.openUTF8Writer(directory, filename + ".txt")) {
                return writeBirth(out, onlyNewer);
            }
        }
    }
}