package org.unicode.cldr.tool;

import java.io.DataOutputStream;
import java.io.File;
import java.io.FileOutputStream;
import java.io.IOException;
import java.io.PrintWriter;
import java.util.ArrayList;
import java.util.Collections;
import java.util.HashMap;
import java.util.HashSet;
import java.util.List;
import java.util.Map;
import java.util.Map.Entry;
import java.util.Set;
import java.util.TreeMap;
import java.util.TreeSet;
import java.util.regex.Matcher;
import java.util.regex.Pattern;

import org.unicode.cldr.draft.FileUtilities;
import org.unicode.cldr.test.DisplayAndInputProcessor;
import org.unicode.cldr.test.OutdatedPaths;
import org.unicode.cldr.tool.Option.Options;
import org.unicode.cldr.util.CLDRFile;
import org.unicode.cldr.util.CLDRPaths;
import org.unicode.cldr.util.CldrUtility;
import org.unicode.cldr.util.Factory;
import org.unicode.cldr.util.LanguageTagParser;
import org.unicode.cldr.util.Pair;
import org.unicode.cldr.util.PathUtilities;
import org.unicode.cldr.util.PatternCache;
import org.unicode.cldr.util.SimpleFactory;
import org.unicode.cldr.util.StringId;

import com.google.common.base.Joiner;
import com.google.common.base.Objects;
import com.ibm.icu.impl.Relation;
import com.ibm.icu.impl.Row;
import com.ibm.icu.impl.Row.R3;
import com.ibm.icu.lang.CharSequences;
import com.ibm.icu.util.ICUException;
import com.ibm.icu.util.VersionInfo;

/**
 * Command-line tool that compares the value of every path of a locale across a range of
 * CLDR versions and records, per path, the version of the last file (walking from index 0
 * onwards) whose value still equals the index-0 value (its "birth"), together with the
 * first differing value found.
 *
 * <p>The tool writes:
 * <ul>
 * <li>one tab-separated {@code <locale>.txt} log file per processed locale into the "log"
 * directory;</li>
 * <li>a binary file of English birth versions and previous values
 * ({@link OutdatedPaths#OUTDATED_ENGLISH_DATA}) into the "target" directory;</li>
 * <li>a binary file listing, per non-English locale, the ids of paths whose birth is older
 * than the English birth of the same path ({@link OutdatedPaths#OUTDATED_DATA}).</li>
 * </ul>
 * Both binary files are read back through {@link OutdatedPaths} and verified.
 */
public class GenerateBirth {
    private static boolean DEBUG = false;

    /** Versions to examine, in the iteration order of {@code CldrVersion.CLDR_VERSIONS_DESCENDING}. */
    static CldrVersion[] VERSIONS;

    /** One factory per examined version; may end with null slots if a version was skipped. */
    static Factory[] factories;

    // NOTE(review): the "previous" option is declared here but is not consulted anywhere in this file.
    final static Options myOptions = new Options()
        .add("target", ".*", CLDRPaths.BIRTH_DATA_DIR,
            "The target directory for building the text files that show the results.")
        .add("log", ".*", CLDRPaths.STAGING_DIRECTORY + "births/" + CldrVersion.baseline.getVersionInfo().getVersionString(2, 4),
            "The target directory for building the text files that show the results.")
        .add(
            "file",
            ".*",
            ".*",
            "Filter the information based on file name, using a regex argument. The '.xml' is removed from the file before filtering")
        .add("previous", "Stop after writing the English previous data.")
        .add("oldest",
            "\\d+(\\.\\d+)?",
            "36.0",
            "Oldest version to go back to, eg 36.1")
        .add("debug", "Debug");

    /**
     * Entry point: builds the per-version factories, processes English, then every other
     * non-region locale, writes the binary data files and verifies them by reading them back.
     *
     * @param args command-line arguments, parsed with {@link #myOptions}
     * @throws IOException if a log or data file cannot be written
     * @throws ICUException if {@code CldrVersion.checkVersions()} fails
     * @throws IllegalArgumentException if the "oldest" version is not a known version, the
     *             target is not a directory, or the written data fails verification
     */
    public static void main(String[] args) throws IOException {
        System.out.println("Run TestOutdatedPaths.java -v to see a listing of changes.");
        myOptions.parse(args, true);
        DEBUG = myOptions.get("debug").doesOccur();

        try {
            CldrVersion.checkVersions(); // verify versions up to date
        } catch (Exception e) {
            throw new ICUException("This tool can only be run if the archive of released versions matching CldrVersion is available.", e);
        }

        // generate the list for as far as we want to go back

        VersionInfo oldest = VersionInfo.getInstance(myOptions.get("oldest").getValue());
        List<CldrVersion> versions = new ArrayList<>();
        boolean foundStart = false;
        for (CldrVersion version : CldrVersion.CLDR_VERSIONS_DESCENDING) {
            versions.add(version);
            // Compare by value: identity comparison would only work if both sides happened
            // to be the very same VersionInfo instance.
            if (oldest.equals(version.getVersionInfo())) {
                foundStart = true;
                break;
            }
        }
        if (!foundStart) {
            throw new IllegalArgumentException("The last version is " + myOptions.get("oldest").getValue() + "; it must be in: " + Joiner.on(", ").join(CldrVersion.CLDR_VERSIONS_DESCENDING));
        }
        VERSIONS = versions.toArray(new CldrVersion[versions.size()]);

        // set up the CLDR Factories for each version
        factories = new Factory[VERSIONS.length]; // hack for now; should change to list

        String filePattern = myOptions.get("file").getValue();

        ArrayList<Factory> list = new ArrayList<>();
        for (CldrVersion version : VERSIONS) {
            if (version == CldrVersion.unknown) {
                continue;
            }
            List<File> paths = version.getPathsForFactory();

            System.out.println(version + ", " + paths);
            Factory aFactory = SimpleFactory.make(paths.toArray(new File[paths.size()]), filePattern);
            list.add(aFactory);
        }
        list.toArray(factories);

        final String dataDirectory = myOptions.get("target").getValue();
        File dataDir = new File(dataDirectory);
        if (!dataDir.isDirectory()) {
            throw new IllegalArgumentException("-t value is not directory: " + dataDir);
        }

        // load and process English

        String logDirectory = myOptions.get("log").getValue();

        System.out.println("en");
        Births english = new Births("en");
        english.writeBirth(logDirectory, "en", null);
        english.writeBirthValues(dataDirectory + "/" + OutdatedPaths.OUTDATED_ENGLISH_DATA);

        Map<Long, Pair<CldrVersion, String>> pathToPrevious = new HashMap<>();

        // Verify that the write of English worked

        OutdatedPaths.readBirthValues(dataDirectory, null, pathToPrevious);
        for (Entry<String, R3<CldrVersion, String, String>> entry : english.pathToBirthCurrentPrevious.entrySet()) {
            String path = entry.getKey();
            String previous = entry.getValue().get2();
            CldrVersion birth = entry.getValue().get0();
            if (previous == null) {
                previous = OutdatedPaths.NO_VALUE;
            }
            long id = StringId.getId(path);
            Pair<CldrVersion, String> readValue = pathToPrevious.get(id);
            CldrVersion birthRead = readValue == null ? null : readValue.getFirst();
            String previousRead = readValue == null ? null : readValue.getSecond();
            if (!Objects.equal(previous, previousRead) || !Objects.equal(birth, birthRead)) {
                throw new IllegalArgumentException("path: " + path
                    + "\tprevious: " + previous + "\tread: " + readValue
                    + "\tbirth: " + birth + "\tread: " + birthRead);
            }
        }

        // Set up the binary data files for all others

        File file = new File(dataDirectory + "/" + OutdatedPaths.OUTDATED_DATA);
        final String outputDataFile = PathUtilities.getNormalizedPathString(file);
        TreeMap<String, Set<String>> localeToNewer = new TreeMap<>();

        System.out.println("Writing data: " + outputDataFile);
        try (DataOutputStream dataOut = new DataOutputStream(new FileOutputStream(file))) {
            dataOut.writeUTF(OutdatedPaths.FORMAT_KEY);

            // Load and process all the locales

            LanguageTagParser ltp = new LanguageTagParser();
            for (String fileName : factories[0].getAvailable()) {
                if (fileName.equals("en")) {
                    continue;
                }
                if (!ltp.set(fileName).getRegion().isEmpty()) {
                    continue; // skip region locales
                }
                // TODO skip default content locales
                System.out.println(fileName);
                Births other = new Births(fileName);
                Set<String> newer = other.writeBirth(logDirectory, fileName, english);

                // Record layout per locale: name, count, then one 64-bit path id per entry.
                dataOut.writeUTF(fileName);
                dataOut.writeInt(newer.size());
                for (String item : newer) {
                    long id = StringId.getId(item);
                    dataOut.writeLong(id);
                    if (DEBUG) {
                        System.out.println(id + "\t" + item);
                    }
                }
                localeToNewer.put(fileName, newer);
            }
            dataOut.writeUTF("$END$");
        }

        // Doublecheck the data

        OutdatedPaths outdatedPaths = new OutdatedPaths(dataDirectory);
        int errorCount = 0;
        for (Entry<String, Set<String>> localeAndNewer : localeToNewer.entrySet()) {
            String locale = localeAndNewer.getKey();
            System.out.println("Checking " + locale);
            Set<String> newer = localeAndNewer.getValue();
            if (newer.size() != outdatedPaths.countOutdated(locale)) {
                throw new IllegalArgumentException("broken: " + locale);
            }
            for (String xpath : newer) {
                boolean isOutdated = outdatedPaths.isRawOutdated(locale, xpath);
                if (!isOutdated) {
                    System.out.println("Error, broken locale: " + locale + "\t" + StringId.getId(xpath) + "\t" + xpath);
                    ++errorCount;
                }
                if (outdatedPaths.isSkipped(xpath)) {
                    continue;
                }
                String previous = outdatedPaths.getPreviousEnglish(xpath);
                if (previous.isEmpty() != english.emptyPrevious.contains(xpath)) {
                    System.out.println("previous.isEmpty() != original " + locale + "\t" + StringId.getId(xpath) + "\t"
                        + xpath);
                    ++errorCount;
                }
            }
        }
        if (errorCount != 0) {
            throw new IllegalArgumentException("Done, but " + errorCount + " errors");
        } else {
            System.out.println("Done, no errors");
        }
    }

    /**
     * Birth information for a single locale: for each path in the locale's index-0 file,
     * the version of the last consecutive file whose processed value equals the index-0
     * value, the index-0 value itself, and the first differing value (if any).
     */
    static class Births {
        private static final boolean USE_RESOLVED = false;
        /** Birth version mapped to the paths that have that birth. */
        final Relation<CldrVersion, String> birthToPaths;
        /** Path mapped to (birth version, current value, previous value or null). */
        final Map<String, Row.R3<CldrVersion, String, String>> pathToBirthCurrentPrevious;
        final String locale;
        static final Pattern TYPE = PatternCache.get("\\[@type=\"([^\"]*)\"");
        final Matcher typeMatcher = TYPE.matcher("");
        /** Paths written by {@link #writeBirthValues(String)} whose previous value was null. */
        Set<String> emptyPrevious = new HashSet<>();

        /**
         * Loads the locale from each entry of {@code factories}, stopping at the first one
         * that fails, and computes the birth information for every path of the index-0 file.
         *
         * @param file locale id to load
         * @throws IllegalArgumentException if the locale cannot be loaded from the index-0 factory
         */
        Births(String file) {
            locale = file;

            CLDRFile[] files = new CLDRFile[factories.length];
            DisplayAndInputProcessor[] processors = new DisplayAndInputProcessor[factories.length];

            Exception loadFailure = null;
            for (int i = 0; i < factories.length; ++i) {
                try {
                    files[i] = factories[i].make(file, USE_RESOLVED);
                    processors[i] = new DisplayAndInputProcessor(files[i], false);
                } catch (Exception e) {
                    // stop when we fail to find
                    System.out.println("Stopped at " + file + ", " + CldrVersion.CLDR_VERSIONS_DESCENDING.get(i));
                    //e.printStackTrace();
                    // Keep the slot consistent: a file without a processor must not be used below.
                    files[i] = null;
                    loadFailure = e;
                    break;
                }
            }
            if (files.length == 0 || files[0] == null) {
                // Without the index-0 file there is nothing to iterate; fail with the real cause
                // instead of a bare NullPointerException further down.
                throw new IllegalArgumentException("Cannot load " + file + " from the first factory", loadFailure);
            }
            birthToPaths = Relation.of(new TreeMap<CldrVersion, Set<String>>(), TreeSet.class);
            pathToBirthCurrentPrevious = new HashMap<>();
            for (String xpath : files[0]) {
                xpath = xpath.intern();
                String base = getProcessedStringValue(0, xpath, files, processors);

                String previousValue = null;
                CLDRFile lastFile = files[0];
                // Walk the remaining loaded files until the value differs from the index-0 value.
                for (int i = 1; i < files.length && files[i] != null; ++i) {
                    String previous = getProcessedStringValue(i, xpath, files, processors);
                    if (previous == null) {
                        previous = OutdatedPaths.NO_VALUE; // fixNullPrevious(xpath);
                    }
                    if (!CharSequences.equals(base, previous)) {
                        previousValue = previous;
                        break;
                    }
                    lastFile = files[i];
                }
                CldrVersion version = CldrVersion.from(lastFile.getDtdVersionInfo());
                birthToPaths.put(version, xpath);
                pathToBirthCurrentPrevious.put(xpath, Row.of(version, base, previousValue));
            }
        }

        /**
         * Returns the string value of {@code xpath} in {@code files[fileNumber]}, passed through
         * the matching processor's {@code processInput}, or null if the file has no value.
         *
         * @param fileNumber index into {@code files} and {@code processors}
         * @param xpath path to look up
         * @param files loaded files, indexed like {@code factories}
         * @param processors input processors, parallel to {@code files}
         * @return the processed value, or null if the path has no string value in that file
         */
        public String getProcessedStringValue(int fileNumber, String xpath, CLDRFile[] files, DisplayAndInputProcessor[] processors) {
            String base = files[fileNumber].getStringValue(xpath);
            if (base != null) {
                base = processors[fileNumber].processInput(xpath, base, null);
            }
            return base;
        }

        /**
         * Derives a fallback value from the first {@code [@type="..."]} attribute of the path:
         * underscores become spaces, and for non-metazone "zone" paths only the segment after
         * the last '/' is kept. Currently not called (see the commented-out use in the constructor).
         *
         * @param xpath path to inspect
         * @return the derived value, or null if the path has no type attribute
         */
        private String fixNullPrevious(String xpath) {
            if (typeMatcher.reset(xpath).find()) {
                String type = typeMatcher.group(1);
                if (xpath.contains("metazone")) {
                    return type.replace("_", " ");
                } else if (xpath.contains("zone")) {
                    String[] splits = type.split("/");
                    return splits[splits.length - 1].replace("_", " ");
                }
                return type;
            }
            return null;
        }

        /**
         * Writes the binary birth data: the format key, the number of paths, then for each path
         * its 64-bit id, previous value ({@link OutdatedPaths#NO_VALUE} when null) and birth
         * version string, followed by the {@code $END$} marker. Paths with a null previous value
         * are collected in {@link #emptyPrevious}, which is made unmodifiable afterwards.
         *
         * @param file path of the file to write
         * @throws IOException if the file cannot be written
         */
        public void writeBirthValues(String file) throws IOException {
            try (DataOutputStream dataOut = new DataOutputStream(new FileOutputStream(file))) {
                dataOut.writeUTF(OutdatedPaths.FORMAT_KEY);
                System.out.println("Writing data: " + PathUtilities.getNormalizedPathString(file));
                dataOut.writeInt(pathToBirthCurrentPrevious.size());

                // Load and process all the locales

                for (Entry<String, R3<CldrVersion, String, String>> entry : pathToBirthCurrentPrevious.entrySet()) {
                    String path = entry.getKey();
                    R3<CldrVersion, String, String> birthCurrentPrevious = entry.getValue();
                    CldrVersion birth = birthCurrentPrevious.get0();
                    String current = birthCurrentPrevious.get1();
                    String previous = birthCurrentPrevious.get2();
                    long id = StringId.getId(path);
                    dataOut.writeLong(id);
                    final String previousString = previous == null ? OutdatedPaths.NO_VALUE : previous;
                    dataOut.writeUTF(previousString);
                    if (previous == null) {
                        emptyPrevious.add(path);
                    }
                    dataOut.writeUTF(birth.toString());
                    System.out.println(id + "\t" + birth + "\t«" + current + "⇐" + previous + "»");
                }
                dataOut.writeUTF("$END$");
                emptyPrevious = Collections.unmodifiableSet(emptyPrevious);
            }
        }

        /**
         * Writes one tab-separated line per path to {@code out}, after a header line.
         *
         * <p>When {@code onlyNewer} is non-null, only paths that also exist in {@code onlyNewer}
         * and whose birth here is older than their birth there are written; those paths are
         * also collected in the returned set.
         *
         * @param out destination for the listing
         * @param onlyNewer comparison births (English, in this tool), or null to list every path
         * @return the paths whose birth is older than in {@code onlyNewer}; empty if it is null
         * @throws IllegalArgumentException if two paths map to the same {@link StringId} id
         */
        Set<String> writeBirth(PrintWriter out, Births onlyNewer) {

            out.println("Loc\tVersion\tValue\tPrevValue\tEVersion\tEValue\tEPrevValue\tPath");

            Set<String> newer = new HashSet<>();
            HashMap<Long, String> sanityCheck = new HashMap<>();
            CldrVersion onlyNewerVersion = null;
            String otherValue = "n/a";
            String olderOtherValue = "n/a";
            for (Entry<CldrVersion, Set<String>> entry2 : birthToPaths.keyValuesSet()) {
                CldrVersion version = entry2.getKey();
                for (String xpath : entry2.getValue()) {
                    long id = StringId.getId(xpath);
                    String old = sanityCheck.get(id);
                    if (old != null) {
                        throw new IllegalArgumentException("Path Collision " + xpath + ", old:" + old + ", id: " + id);
                    } else {
                        sanityCheck.put(id, xpath);
                    }
                    R3<CldrVersion, String, String> info = pathToBirthCurrentPrevious.get(xpath);
                    if (onlyNewer != null) {

                        R3<CldrVersion, String, String> otherInfo = onlyNewer.pathToBirthCurrentPrevious.get(xpath);
                        if (otherInfo == null) {
                            continue;
                        }
                        // skip if not older than "comparison version"
                        onlyNewerVersion = otherInfo.get0();
                        if (!version.isOlderThan(onlyNewerVersion)) {
                            continue;
                        }
                        otherValue = fixNull(otherInfo.get1());
                        olderOtherValue = fixNull(otherInfo.get2());
                        newer.add(xpath);
                    }
                    String value = fixNull(info.get1());
                    String olderValue = fixNull(info.get2());

                    out.println(locale
                        + "\t" + version
                        + "\t" + value
                        + "\t" + olderValue
                        + "\t" + CldrUtility.ifNull(onlyNewerVersion, "n/a")
                        + "\t" + otherValue
                        + "\t" + olderOtherValue
                        + "\t" + xpath);

                }
            }
            return newer;
        }

        /** Replaces null with {@link OutdatedPaths#NO_VALUE}; other values are returned unchanged. */
        private String fixNull(String value) {
            if (value == null) {
                value = OutdatedPaths.NO_VALUE;
            }
            return value;
        }

        /**
         * Writes the listing produced by {@link #writeBirth(PrintWriter, Births)} to
         * {@code <directory>/<filename>.txt}.
         *
         * @param directory directory of the log file
         * @param filename base name of the log file, without the {@code .txt} suffix
         * @param onlyNewer comparison births, or null to list every path
         * @return the set returned by {@link #writeBirth(PrintWriter, Births)}
         * @throws IOException if the log file cannot be opened
         */
        Set<String> writeBirth(String directory, String filename, Births onlyNewer) throws IOException {
            try (PrintWriter out = FileUtilities.openUTF8Writer(directory, filename + ".txt")) {
                return writeBirth(out, onlyNewer);
            }
        }
    }
}