1 package org.unicode.cldr.tool; 2 3 import com.google.common.base.Joiner; 4 import com.google.common.base.Objects; 5 import com.ibm.icu.impl.Relation; 6 import com.ibm.icu.impl.Row; 7 import com.ibm.icu.impl.Row.R3; 8 import com.ibm.icu.lang.CharSequences; 9 import com.ibm.icu.util.ICUException; 10 import com.ibm.icu.util.VersionInfo; 11 import java.io.DataOutputStream; 12 import java.io.File; 13 import java.io.FileOutputStream; 14 import java.io.IOException; 15 import java.io.PrintWriter; 16 import java.util.ArrayList; 17 import java.util.Collections; 18 import java.util.HashMap; 19 import java.util.HashSet; 20 import java.util.List; 21 import java.util.Map; 22 import java.util.Map.Entry; 23 import java.util.Set; 24 import java.util.TreeMap; 25 import java.util.TreeSet; 26 import java.util.regex.Matcher; 27 import java.util.regex.Pattern; 28 import org.unicode.cldr.draft.FileUtilities; 29 import org.unicode.cldr.test.DisplayAndInputProcessor; 30 import org.unicode.cldr.test.OutdatedPaths; 31 import org.unicode.cldr.tool.Option.Options; 32 import org.unicode.cldr.util.CLDRFile; 33 import org.unicode.cldr.util.CLDRPaths; 34 import org.unicode.cldr.util.CldrUtility; 35 import org.unicode.cldr.util.Factory; 36 import org.unicode.cldr.util.LanguageTagParser; 37 import org.unicode.cldr.util.Pair; 38 import org.unicode.cldr.util.PathUtilities; 39 import org.unicode.cldr.util.PatternCache; 40 import org.unicode.cldr.util.SimpleFactory; 41 import org.unicode.cldr.util.StringId; 42 43 public class GenerateBirth { 44 private static boolean DEBUG = false; 45 46 static CldrVersion[] VERSIONS; 47 48 static Factory[] factories; 49 50 static final Options myOptions = 51 new Options() 52 .add( 53 "target", 54 ".*", 55 CLDRPaths.BIRTH_DATA_DIR, 56 "The target directory for building the text files that show the results.") 57 .add( 58 "log", 59 ".*", 60 CLDRPaths.STAGING_DIRECTORY 61 + "births/" 62 + CldrVersion.baseline.getVersionInfo().getVersionString(2, 4), 63 "The target directory for building the text files that show the results.") 64 .add( 65 "file", 66 ".*", 67 ".*", 68 "Filter the information based on file name, using a regex argument. The '.xml' is removed from the file before filtering") 69 .add("previous", "Stop after writing the English previous data.") 70 .add( 71 "oldest", 72 "\\d+(\\.\\d+)?", 73 "38.0", 74 "Oldest version to go back to, eg 36.1") 75 .add("debug", "Debug"); 76 main(String[] args)77 public static void main(String[] args) throws IOException { 78 System.out.println("Run TestOutdatedPaths.java -v to see a listing of changes."); 79 myOptions.parse(args, true); 80 DEBUG = myOptions.get("debug").doesOccur(); 81 82 try { 83 CldrVersion.checkVersions(); // verify versions up to date 84 } catch (Exception e) { 85 throw new ICUException( 86 "This tool can only be run if the archive of released versions matching CldrVersion is available.", 87 e); 88 } 89 90 // generate the list for as far as we want to go back 91 92 VersionInfo oldest = VersionInfo.getInstance(myOptions.get("oldest").getValue()); 93 List<CldrVersion> versions = new ArrayList<>(); 94 boolean foundStart = false; 95 for (CldrVersion version : CldrVersion.CLDR_VERSIONS_DESCENDING) { 96 versions.add(version); 97 if (version.getVersionInfo() == oldest) { 98 foundStart = true; 99 break; 100 } 101 } 102 if (!foundStart) { 103 throw new IllegalArgumentException( 104 "The last version is " 105 + myOptions.get("oldest").getValue() 106 + "; it must be in: " 107 + Joiner.on(", ").join(CldrVersion.CLDR_VERSIONS_DESCENDING)); 108 } 109 VERSIONS = versions.toArray(new CldrVersion[versions.size()]); 110 111 // set up the CLDR Factories for each version 112 factories = new Factory[VERSIONS.length]; // hack for now; should change to list 113 114 String filePattern = myOptions.get("file").getValue(); 115 116 ArrayList<Factory> list = new ArrayList<>(); 117 for (CldrVersion version : VERSIONS) { 118 if (version == CldrVersion.unknown) { 119 continue; 120 } 121 List<File> paths = version.getPathsForFactory(); 122 123 System.out.println(version + ", " + paths); 124 Factory aFactory = 125 SimpleFactory.make(paths.toArray(new File[paths.size()]), filePattern); 126 list.add(aFactory); 127 } 128 list.toArray(factories); 129 130 final String dataDirectory = myOptions.get("target").getValue(); 131 File dataDir = new File(dataDirectory); 132 if (!dataDir.isDirectory()) { 133 throw new IllegalArgumentException("-t value is not directory: " + dataDir); 134 } 135 136 // load and process English 137 138 String logDirectory = myOptions.get("log").getValue(); 139 140 System.out.println("en"); 141 Births english = new Births("en"); 142 english.writeBirth(logDirectory, "en", null); 143 english.writeBirthValues(dataDirectory + "/" + OutdatedPaths.OUTDATED_ENGLISH_DATA); 144 145 Map<Long, Pair<CldrVersion, String>> pathToPrevious = new HashMap<>(); 146 147 // Verify that the write of English worked 148 149 OutdatedPaths.readBirthValues(dataDirectory, null, pathToPrevious); 150 for (Entry<String, R3<CldrVersion, String, String>> entry : 151 english.pathToBirthCurrentPrevious.entrySet()) { 152 String path = entry.getKey(); 153 String previous = entry.getValue().get2(); 154 CldrVersion birth = entry.getValue().get0(); 155 if (previous == null) { 156 previous = OutdatedPaths.NO_VALUE; 157 } 158 long id = StringId.getId(path); 159 Pair<CldrVersion, String> readValue = pathToPrevious.get(id); 160 CldrVersion birthRead = readValue == null ? null : readValue.getFirst(); 161 String previousRead = readValue == null ? null : readValue.getSecond(); 162 if (!Objects.equal(previous, previousRead) || !Objects.equal(birth, birthRead)) { 163 throw new IllegalArgumentException( 164 "path: " 165 + path 166 + "\tprevious: " 167 + previous 168 + "\tread: " 169 + readValue 170 + "\tbirth: " 171 + birth 172 + "\tread: " 173 + birthRead); 174 } 175 } 176 177 // Set up the binary data files for all others 178 179 File file = new File(dataDirectory + "/" + OutdatedPaths.OUTDATED_DATA); 180 final String outputDataFile = PathUtilities.getNormalizedPathString(file); 181 TreeMap<String, Set<String>> localeToNewer = new TreeMap<>(); 182 183 System.out.println("Writing data: " + outputDataFile); 184 try (DataOutputStream dataOut = new DataOutputStream(new FileOutputStream(file))) { 185 dataOut.writeUTF(OutdatedPaths.FORMAT_KEY); 186 187 // Load and process all the locales 188 189 LanguageTagParser ltp = new LanguageTagParser(); 190 for (String fileName : factories[0].getAvailable()) { 191 if (fileName.equals("en")) { 192 continue; 193 } 194 if (!ltp.set(fileName).getRegion().isEmpty()) { 195 continue; // skip region locales 196 } 197 // TODO skip default content locales 198 System.out.println(fileName); 199 Births other = new Births(fileName); 200 Set<String> newer = other.writeBirth(logDirectory, fileName, english); 201 202 dataOut.writeUTF(fileName); 203 dataOut.writeInt(newer.size()); 204 for (String item : newer) { 205 long id = StringId.getId(item); 206 dataOut.writeLong(id); 207 if (DEBUG) { 208 System.out.println(id + "\t" + item); 209 } 210 } 211 localeToNewer.put(fileName, newer); 212 } 213 dataOut.writeUTF("$END$"); 214 } 215 216 // Doublecheck the data 217 218 OutdatedPaths outdatedPaths = new OutdatedPaths(dataDirectory); 219 Set<String> needPrevious = new TreeSet<>(); 220 int errorCount = 0; 221 for (Entry<String, Set<String>> localeAndNewer : localeToNewer.entrySet()) { 222 String locale = localeAndNewer.getKey(); 223 System.out.println("Checking " + locale); 224 Set<String> newer = localeAndNewer.getValue(); 225 if (newer.size() != outdatedPaths.countOutdated(locale)) { 226 throw new IllegalArgumentException("broken: " + locale); 227 } 228 for (String xpath : newer) { 229 boolean isOutdated = outdatedPaths.isRawOutdated(locale, xpath); 230 if (!isOutdated) { 231 System.out.println( 232 "Error, broken locale: " 233 + locale 234 + "\t" 235 + StringId.getId(xpath) 236 + "\t" 237 + xpath); 238 ++errorCount; 239 } 240 if (outdatedPaths.isSkipped(xpath)) { 241 continue; 242 } 243 String previous = outdatedPaths.getPreviousEnglish(xpath); 244 if (previous.isEmpty() != english.emptyPrevious.contains(xpath)) { 245 System.out.println( 246 "previous.isEmpty() != original " 247 + locale 248 + "\t" 249 + StringId.getId(xpath) 250 + "\t" 251 + xpath); 252 needPrevious.add(xpath); 253 ++errorCount; 254 } 255 } 256 } 257 if (errorCount != 0) { 258 throw new IllegalArgumentException("Done, but " + errorCount + " errors"); 259 } else { 260 System.out.println("Done, no errors"); 261 } 262 } 263 264 static class Births { 265 private static final boolean USE_RESOLVED = false; 266 final Relation<CldrVersion, String> birthToPaths; 267 final Map<String, Row.R3<CldrVersion, String, String>> pathToBirthCurrentPrevious; 268 final String locale; 269 static final Pattern TYPE = PatternCache.get("\\[@type=\"([^\"]*)\""); 270 final Matcher typeMatcher = TYPE.matcher(""); 271 Set<String> emptyPrevious = new HashSet<>(); 272 Births(String file)273 Births(String file) { 274 locale = file; 275 276 CLDRFile[] files = new CLDRFile[factories.length]; 277 DisplayAndInputProcessor[] processors = new DisplayAndInputProcessor[factories.length]; 278 279 for (int i = 0; i < factories.length; ++i) { 280 try { 281 files[i] = factories[i].make(file, USE_RESOLVED); 282 processors[i] = new DisplayAndInputProcessor(files[i], false); 283 } catch (Exception e) { 284 // stop when we fail to find 285 System.out.println( 286 "Stopped at " 287 + file 288 + ", " 289 + CldrVersion.CLDR_VERSIONS_DESCENDING.get(i)); 290 // e.printStackTrace(); 291 break; 292 } 293 } 294 birthToPaths = Relation.of(new TreeMap<CldrVersion, Set<String>>(), TreeSet.class); 295 pathToBirthCurrentPrevious = new HashMap<>(); 296 for (String xpath : files[0]) { 297 xpath = xpath.intern(); 298 if (xpath.contains("[@type=\"ar\"]")) { 299 int debug = 0; 300 } 301 String base = getProcessedStringValue(0, xpath, files, processors); 302 303 String previousValue = null; 304 int i; 305 CLDRFile lastFile = files[0]; 306 for (i = 1; i < files.length && files[i] != null; ++i) { 307 String previous = getProcessedStringValue(i, xpath, files, processors); 308 if (previous == null) { 309 previous = OutdatedPaths.NO_VALUE; // fixNullPrevious(xpath); 310 } 311 if (!CharSequences.equals(base, previous)) { 312 if (previous != null) { 313 previousValue = previous; 314 } 315 break; 316 } 317 lastFile = files[i]; 318 } 319 CldrVersion version = CldrVersion.from(lastFile.getDtdVersionInfo()); 320 birthToPaths.put(version, xpath); 321 pathToBirthCurrentPrevious.put(xpath, Row.of(version, base, previousValue)); 322 } 323 } 324 getProcessedStringValue( int fileNumber, String xpath, CLDRFile[] files, DisplayAndInputProcessor[] processors)325 public String getProcessedStringValue( 326 int fileNumber, 327 String xpath, 328 CLDRFile[] files, 329 DisplayAndInputProcessor[] processors) { 330 String base = files[fileNumber].getStringValue(xpath); 331 if (base != null) { 332 base = processors[fileNumber].processInput(xpath, base, null); 333 } 334 return base; 335 } 336 fixNullPrevious(String xpath)337 private String fixNullPrevious(String xpath) { 338 if (typeMatcher.reset(xpath).find()) { 339 String type = typeMatcher.group(1); 340 if (xpath.contains("metazone")) { 341 return type.replace("_", " "); 342 } else if (xpath.contains("zone")) { 343 String[] splits = type.split("/"); 344 return splits[splits.length - 1].replace("_", " "); 345 } 346 return type; 347 } 348 return null; 349 } 350 writeBirthValues(String file)351 public void writeBirthValues(String file) throws IOException { 352 try (DataOutputStream dataOut = new DataOutputStream(new FileOutputStream(file))) { 353 dataOut.writeUTF(OutdatedPaths.FORMAT_KEY); 354 System.out.println("Writing data: " + PathUtilities.getNormalizedPathString(file)); 355 dataOut.writeInt(pathToBirthCurrentPrevious.size()); 356 357 // Load and process all the locales 358 359 // TreeMap<String, Set<String>> localeToNewer = new TreeMap<String, Set<String>>(); 360 for (Entry<String, R3<CldrVersion, String, String>> entry : 361 pathToBirthCurrentPrevious.entrySet()) { 362 String path = entry.getKey(); 363 R3<CldrVersion, String, String> birthCurrentPrevious = entry.getValue(); 364 CldrVersion birth = birthCurrentPrevious.get0(); 365 String current = birthCurrentPrevious.get1(); 366 String previous = birthCurrentPrevious.get2(); 367 long id = StringId.getId(path); 368 dataOut.writeLong(id); 369 final String previousString = 370 previous == null ? OutdatedPaths.NO_VALUE : previous; 371 dataOut.writeUTF(previousString); 372 if (previous == null) { 373 emptyPrevious.add(path); 374 } 375 dataOut.writeUTF(birth.toString()); 376 if (true) { 377 System.out.println( 378 id + "\t" + birth + "\t«" + current + "⇐" + previous + "»"); 379 } 380 } 381 dataOut.writeUTF("$END$"); 382 emptyPrevious = Collections.unmodifiableSet(emptyPrevious); 383 } 384 } 385 writeBirth(PrintWriter out, Births onlyNewer)386 Set<String> writeBirth(PrintWriter out, Births onlyNewer) { 387 388 out.println("Loc\tVersion\tValue\tPrevValue\tEVersion\tEValue\tEPrevValue\tPath"); 389 390 Set<String> newer = new HashSet<>(); 391 HashMap<Long, String> sanityCheck = new HashMap<>(); 392 CldrVersion onlyNewerVersion = null; 393 String otherValue = "n/a"; 394 String olderOtherValue = "n/a"; 395 for (Entry<CldrVersion, Set<String>> entry2 : birthToPaths.keyValuesSet()) { 396 CldrVersion version = entry2.getKey(); 397 for (String xpath : entry2.getValue()) { 398 long id = StringId.getId(xpath); 399 String old = sanityCheck.get(id); 400 if (old != null) { 401 throw new IllegalArgumentException( 402 "Path Collision " + xpath + ", old:" + old + ", id: " + id); 403 } else { 404 sanityCheck.put(id, xpath); 405 } 406 R3<CldrVersion, String, String> info = pathToBirthCurrentPrevious.get(xpath); 407 if (onlyNewer != null) { 408 409 R3<CldrVersion, String, String> otherInfo = 410 onlyNewer.pathToBirthCurrentPrevious.get(xpath); 411 if (otherInfo == null) { 412 continue; 413 } 414 // skip if not older than "comparison version" 415 onlyNewerVersion = otherInfo.get0(); 416 if (!version.isOlderThan(onlyNewerVersion)) { 417 continue; 418 } 419 otherValue = fixNull(otherInfo.get1()); 420 olderOtherValue = fixNull(otherInfo.get2()); 421 newer.add(xpath); 422 } 423 String value = fixNull(info.get1()); 424 String olderValue = fixNull(info.get2()); 425 426 out.println( 427 locale 428 + "\t" 429 + version 430 + "\t" 431 + value 432 + "\t" 433 + olderValue 434 + "\t" 435 + CldrUtility.ifNull(onlyNewerVersion, "n/a") 436 + "\t" 437 + otherValue 438 + "\t" 439 + olderOtherValue 440 + "\t" 441 + xpath); 442 } 443 } 444 return newer; 445 } 446 fixNull(String value)447 private String fixNull(String value) { 448 if (value == null) { 449 value = OutdatedPaths.NO_VALUE; 450 } 451 return value; 452 } 453 writeBirth(String directory, String filename, Births onlyNewer)454 Set<String> writeBirth(String directory, String filename, Births onlyNewer) 455 throws IOException { 456 try (PrintWriter out = FileUtilities.openUTF8Writer(directory, filename + ".txt")) { 457 Set<String> newer = writeBirth(out, onlyNewer); 458 return newer; 459 } 460 } 461 } 462 } 463