1 package org.unicode.cldr.tool; 2 3 import java.io.BufferedReader; 4 import java.io.File; 5 import java.io.IOException; 6 import java.io.PrintWriter; 7 import java.util.ArrayList; 8 import java.util.Arrays; 9 import java.util.Collection; 10 import java.util.Collections; 11 import java.util.HashSet; 12 import java.util.LinkedHashSet; 13 import java.util.List; 14 import java.util.Map; 15 import java.util.Map.Entry; 16 import java.util.Set; 17 import java.util.TreeMap; 18 import java.util.TreeSet; 19 import java.util.regex.Matcher; 20 import java.util.regex.Pattern; 21 22 import org.unicode.cldr.draft.FileUtilities; 23 import org.unicode.cldr.tool.Option.Options; 24 import org.unicode.cldr.util.Builder; 25 import org.unicode.cldr.util.CLDRConfig; 26 import org.unicode.cldr.util.CLDRFile; 27 import org.unicode.cldr.util.CLDRPaths; 28 import org.unicode.cldr.util.CldrUtility; 29 import org.unicode.cldr.util.Counter; 30 import org.unicode.cldr.util.DtdData; 31 import org.unicode.cldr.util.DtdData.Attribute; 32 import org.unicode.cldr.util.DtdData.Element; 33 import org.unicode.cldr.util.DtdType; 34 import org.unicode.cldr.util.PathStarrer; 35 import org.unicode.cldr.util.PatternCache; 36 import org.unicode.cldr.util.RegexUtilities; 37 import org.unicode.cldr.util.SupplementalDataInfo; 38 import org.unicode.cldr.util.XMLFileReader; 39 import org.unicode.cldr.util.XMLFileReader.SimpleHandler; 40 import org.unicode.cldr.util.XPathParts; 41 import org.xml.sax.ErrorHandler; 42 import org.xml.sax.SAXException; 43 import org.xml.sax.SAXParseException; 44 45 import com.google.common.base.Splitter; 46 import com.ibm.icu.dev.util.CollectionUtilities; 47 import com.ibm.icu.impl.Relation; 48 import com.ibm.icu.impl.Row; 49 import com.ibm.icu.impl.Row.R2; 50 import com.ibm.icu.impl.Row.R4; 51 import com.ibm.icu.util.VersionInfo; 52 53 public class GenerateItemCounts { 54 private static final SupplementalDataInfo SUPPLEMENTAL_DATA_INFO = CLDRConfig.getInstance().getSupplementalDataInfo(); 55 private static final boolean SKIP_ORDERING = true; 56 private static final String OUT_DIRECTORY = CLDRPaths.GEN_DIRECTORY + "/itemcount/"; // CldrUtility.MAIN_DIRECTORY; 57 private Map<String, List<StackTraceElement>> cantRead = new TreeMap<String, List<StackTraceElement>>(); 58 static { 59 System.err.println("Probably obsolete tool"); 60 } 61 private static String[] DIRECTORIES = { 62 // MUST be oldest first! 63 // "cldr-archive/cldr-21.0", 64 // "cldr-24.0", 65 "cldr-27.0", 66 "trunk" 67 }; 68 69 private static String TRUNK_VERSION = "26.0"; 70 71 static boolean doChanges = true; 72 static Relation<String, String> path2value = Relation.of(new TreeMap<String, Set<String>>(), TreeSet.class); 73 static final AttributeTypes ATTRIBUTE_TYPES = new AttributeTypes(); 74 75 final static Options myOptions = new Options(); 76 77 enum MyOptions { 78 summary(null, null, "if present, summarizes data already collected. Run once with, once without."), directory(".*", ".*", 79 "if summary, creates filtered version (eg -d main): does a find in the name, which is of the form dir/file"), verbose(null, null, 80 "verbose debugging messages"), rawfilter(".*", ".*", "filter the raw files (non-summary, mostly for debugging)"),; 81 // boilerplate 82 final Option option; 83 MyOptions(String argumentPattern, String defaultArgument, String helpText)84 MyOptions(String argumentPattern, String defaultArgument, String helpText) { 85 option = myOptions.add(this, argumentPattern, defaultArgument, helpText); 86 } 87 } 88 89 static Matcher DIR_FILE_MATCHER; 90 static Matcher RAW_FILE_MATCHER; 91 static boolean VERBOSE; 92 main(String[] args)93 public static void main(String[] args) throws IOException { 94 myOptions.parse(MyOptions.directory, args, true); 95 96 DIR_FILE_MATCHER = PatternCache.get(MyOptions.directory.option.getValue()).matcher(""); 97 RAW_FILE_MATCHER = PatternCache.get(MyOptions.rawfilter.option.getValue()).matcher(""); 98 VERBOSE = MyOptions.verbose.option.doesOccur(); 99 100 if (MyOptions.summary.option.doesOccur()) { 101 doSummary(); 102 System.out.println("DONE"); 103 return; 104 // } else if (arg.equals("changes")) { 105 // doChanges = true; 106 } else { 107 } 108 // Pattern dirPattern = dirPattern = PatternCache.get(arg); 109 GenerateItemCounts main = new GenerateItemCounts(); 110 try { 111 Relation<String, String> oldPath2value = null; 112 for (String dir : DIRECTORIES) { 113 // if (dirPattern != null && !dirPattern.matcher(dir).find()) continue; 114 final String pathname = dir.equals("trunk") ? CLDRPaths.BASE_DIRECTORY 115 : CLDRPaths.ARCHIVE_DIRECTORY + "/" + dir; 116 boolean isFinal = dir == DIRECTORIES[DIRECTORIES.length - 1]; 117 118 String fulldir = new File(pathname).getCanonicalPath(); 119 String prefix = (MyOptions.rawfilter.option.doesOccur() ? "filtered_" : ""); 120 String fileKey = dir.replace("/", "_"); 121 try ( 122 PrintWriter summary = FileUtilities.openUTF8Writer(OUT_DIRECTORY, prefix + fileKey + "_count.txt"); 123 PrintWriter changes = FileUtilities.openUTF8Writer(OUT_DIRECTORY, prefix + fileKey + "_changes.txt"); 124 PrintWriter changesNew = FileUtilities.openUTF8Writer(OUT_DIRECTORY, prefix + fileKey + "_news.txt"); 125 PrintWriter changesDeletes = FileUtilities.openUTF8Writer(OUT_DIRECTORY, prefix + fileKey + "_deletes.txt"); 126 PrintWriter changesSummary = FileUtilities.openUTF8Writer(OUT_DIRECTORY, prefix + fileKey + "_changes_summary.txt");) { 127 main.summarizeCoverage(summary, fulldir, isFinal); 128 if (doChanges) { 129 if (oldPath2value != null) { 130 compare(summary, changes, changesNew, changesDeletes, changesSummary, oldPath2value, path2value); 131 checkBadAttributes(path2value, prefix + fileKey + "_dtd_check.txt"); 132 } 133 oldPath2value = path2value; 134 path2value = Relation.of(new TreeMap<String, Set<String>>(), TreeSet.class); 135 } 136 } 137 } 138 ATTRIBUTE_TYPES.showStarred(); 139 } finally { 140 if (main.cantRead.size() != 0) { 141 System.out.println("Couldn't read:\t"); 142 for (String file : main.cantRead.keySet()) { 143 System.out.println(file + "\t" + main.cantRead.get(file)); 144 } 145 } 146 System.out.println("DONE"); 147 } 148 } 149 150 static final Set<String> SKIP_ATTRIBUTES = new HashSet<>(Arrays.asList("draft", "references", "validSubLocales")); 151 152 static final Relation<String, DtdType> ELEMENTS_OCCURRING = Relation.of(new TreeMap(), TreeSet.class); 153 static final Relation<String, DtdType> ELEMENTS_POSSIBLE = Relation.of(new TreeMap(), TreeSet.class); 154 static final Relation<String, Row.R2<DtdType, String>> ATTRIBUTES_OCCURRING = Relation.of(new TreeMap(), TreeSet.class); 155 static final Relation<String, Row.R2<DtdType, String>> ATTRIBUTES_POSSIBLE = Relation.of(new TreeMap(), TreeSet.class); 156 checkBadAttributes(Relation<String, String> path2value2, String outputFile)157 private static void checkBadAttributes(Relation<String, String> path2value2, String outputFile) 158 throws IOException { 159 // an attribute is misplaced if it is not distinguishing, but is on a non-final node. 160 161 Set<String> errors = new LinkedHashSet<>(); 162 163 SupplementalDataInfo supp = SUPPLEMENTAL_DATA_INFO; 164 for (DtdType dtdType : DtdType.values()) { 165 if (dtdType == DtdType.ldmlICU) { 166 continue; 167 } 168 DtdData data = DtdData.getInstance(dtdType); 169 for (Element element : data.getElements()) { 170 String elementName = element.name; 171 ELEMENTS_POSSIBLE.put(elementName, dtdType); 172 final Set<Element> children = element.getChildren().keySet(); 173 174 boolean skipFinal = children.isEmpty() 175 || children.size() == 1 176 && children.iterator().next().name.equals("special"); 177 178 for (Entry<Attribute, Integer> attributeInt : element.getAttributes().entrySet()) { 179 Attribute attribute = attributeInt.getKey(); 180 String attributeName = attribute.name; 181 if (attribute.defaultValue != null) { 182 errors.add("Warning, default value «" + attribute.defaultValue 183 + "» for: " + dtdType + "\t" + elementName + "\t" + attributeName); 184 } 185 final R2<DtdType, String> attributeRow = Row.of(dtdType, elementName); 186 ATTRIBUTES_POSSIBLE.put(attributeName, attributeRow); 187 if (skipFinal || SKIP_ATTRIBUTES.contains(attributeName)) { // don't worry about non-final, references, draft, standard 188 continue; 189 } 190 if (supp.isDeprecated(dtdType, elementName, attributeName, null)) { 191 continue; 192 } 193 if (!CLDRFile.isDistinguishing(dtdType, elementName, attributeName)) { 194 String doesOccur = ""; 195 final Set<R2<DtdType, String>> attributeRows = ATTRIBUTES_OCCURRING.get(attributeName); 196 if (attributeRows == null || !attributeRows.contains(attributeRow)) { 197 doesOccur = "\tNEVER"; 198 } 199 errors.add("Warning, !disting, !leaf: " + dtdType + "\t" + elementName + "\t" + attributeName + "\t" + children + doesOccur); 200 } 201 } 202 } 203 } 204 try ( 205 PrintWriter out = FileUtilities.openUTF8Writer(OUT_DIRECTORY, outputFile)) { 206 out.println("\nElements\tDeprecated\tOccurring\tPossible in DTD, but never occurs"); 207 208 for (Entry<String, Set<DtdType>> x : ELEMENTS_POSSIBLE.keyValuesSet()) { 209 final String element = x.getKey(); 210 if (element.equals("#PCDATA") || element.equals("ANY") || element.equals("generation")) { 211 continue; 212 } 213 final Set<DtdType> possible = x.getValue(); 214 Set<DtdType> deprecated = new TreeSet(); 215 for (DtdType dtdType : possible) { 216 if (SUPPLEMENTAL_DATA_INFO.isDeprecated(dtdType, element, "*", "*")) { 217 deprecated.add(dtdType); 218 } 219 } 220 Set<DtdType> notDeprecated = new TreeSet(possible); 221 notDeprecated.removeAll(deprecated); 222 223 Set<DtdType> occurs = CldrUtility.ifNull(ELEMENTS_OCCURRING.get(element), Collections.EMPTY_SET); 224 Set<DtdType> noOccur = new TreeSet(possible); 225 noOccur.removeAll(occurs); 226 227 if (!Collections.disjoint(deprecated, occurs)) { // deprecated must not occur 228 final Set<DtdType> intersection = CldrUtility.intersect(deprecated, occurs); 229 errors.add("Error: element «" + element 230 + "» is deprecated in " + (deprecated.equals(possible) ? "EVERYWHERE" : intersection) + 231 " but occurs in live data: " + intersection); 232 } 233 if (!Collections.disjoint(notDeprecated, noOccur)) { // if !deprecated & !occur, warning 234 errors.add("Warning: element «" + element 235 + "» doesn't occur in and is not deprecated in " + CldrUtility.intersect(notDeprecated, noOccur)); 236 } 237 238 out.println(element 239 + "\t" + deprecated 240 + "\t" + occurs 241 + "\t" + noOccur); 242 } 243 244 out.println("\nAttributes\tDeprecated\tOccurring\tPossible in DTD, but never occurs"); 245 246 for (Entry<String, Set<R2<DtdType, String>>> x : ATTRIBUTES_POSSIBLE.keyValuesSet()) { 247 final String attribute = x.getKey(); 248 if (attribute.equals("alt") || attribute.equals("draft") || attribute.equals("references")) { 249 continue; 250 } 251 final Set<R2<DtdType, String>> possible = x.getValue(); 252 Set<R2<DtdType, String>> deprecated = new TreeSet(); 253 for (R2<DtdType, String> s : possible) { 254 final DtdType dtdType = s.get0(); 255 final String element = s.get1(); 256 if (SUPPLEMENTAL_DATA_INFO.isDeprecated(dtdType, element, attribute, "*")) { 257 deprecated.add(s); 258 } 259 } 260 Set<R2<DtdType, String>> notDeprecated = new TreeSet(possible); 261 notDeprecated.removeAll(deprecated); 262 263 Set<R2<DtdType, String>> occurs = CldrUtility.ifNull(ATTRIBUTES_OCCURRING.get(attribute), Collections.EMPTY_SET); 264 Set<R2<DtdType, String>> noOccur = new TreeSet(possible); 265 noOccur.removeAll(occurs); 266 267 if (!Collections.disjoint(deprecated, occurs)) { // deprecated must not occur 268 final Set<R2<DtdType, String>> intersection = CldrUtility.intersect(deprecated, occurs); 269 errors.add("Error: attribute «" + attribute 270 + "» is deprecated in " + (deprecated.equals(possible) ? "EVERYWHERE" : intersection) + 271 " but occurs in live data: " + intersection); 272 } 273 if (!Collections.disjoint(notDeprecated, noOccur)) { // if !deprecated & !occur, warning 274 errors.add("Warning: attribute «" + attribute 275 + "» doesn't occur in and is not deprecated in " + CldrUtility.intersect(notDeprecated, noOccur)); 276 } 277 out.println(attribute 278 + "\t" + deprecated 279 + "\t" + occurs 280 + "\t" + noOccur); 281 } 282 out.println("\nERRORS/WARNINGS"); 283 out.println(CollectionUtilities.join(errors, "\n")); 284 } 285 } 286 287 static class AttributeTypes { 288 Relation<String, String> elementPathToAttributes = Relation.of(new TreeMap<String, Set<String>>(), 289 TreeSet.class); 290 final PathStarrer PATH_STARRER = new PathStarrer().setSubstitutionPattern("*"); 291 final Set<String> STARRED_PATHS = new TreeSet<String>(); 292 XPathParts parts = new XPathParts(); 293 StringBuilder elementPath = new StringBuilder(); 294 add(String path)295 public void add(String path) { 296 parts.set(path); 297 elementPath.setLength(0); 298 //DtdType type = CLDRFile.DtdType.valueOf(parts.getElement(0)); 299 for (int i = 0; i < parts.size(); ++i) { 300 String element = parts.getElement(i); 301 elementPath.append('/').append(element); 302 elementPathToAttributes.putAll(elementPath.toString().intern(), parts.getAttributeKeys(i)); 303 } 304 } 305 showStarred()306 public void showStarred() throws IOException { 307 PrintWriter starred = FileUtilities.openUTF8Writer(OUT_DIRECTORY, "starred" + ".txt"); 308 309 for (Entry<String, Set<String>> entry : elementPathToAttributes.keyValuesSet()) { 310 Set<String> attributes = entry.getValue(); 311 if (attributes.size() == 0) { 312 continue; 313 } 314 String path = entry.getKey(); 315 String[] elements = path.split("/"); 316 DtdType type = DtdType.valueOf(elements[1]); 317 String finalElement = elements[elements.length - 1]; 318 starred.print(path); 319 for (String attribute : attributes) { 320 if (CLDRFile.isDistinguishing(type, finalElement, attribute)) { 321 starred.print("[@" + attribute + "='disting.']"); 322 } else { 323 starred.print("[@" + attribute + "='DATA']"); 324 } 325 } 326 starred.println(); 327 } 328 starred.close(); 329 } 330 } 331 332 static Pattern prefix = PatternCache.get("([^/]+/[^/]+)(.*)"); 333 334 static class Delta { 335 Counter<String> newCount = new Counter<String>(); 336 Counter<String> deletedCount = new Counter<String>(); 337 Counter<String> changedCount = new Counter<String>(); 338 Counter<String> unchangedCount = new Counter<String>(); 339 print(PrintWriter changesSummary, Set<String> prefixes)340 void print(PrintWriter changesSummary, Set<String> prefixes) { 341 changesSummary.println("Total" 342 + "\t" + unchangedCount.getTotal() 343 + "\t" + deletedCount.getTotal() 344 + "\t" + changedCount.getTotal() 345 + "\t" + newCount.getTotal()); 346 changesSummary.println("Directory\tSame\tRemoved\tChanged\tAdded"); 347 for (String prefix : prefixes) { 348 changesSummary.println(prefix 349 + "\t" + unchangedCount.get(prefix) 350 + "\t" + deletedCount.get(prefix) 351 + "\t" + changedCount.get(prefix) 352 + "\t" + newCount.get(prefix)); 353 } 354 } 355 } 356 compare(PrintWriter summary, PrintWriter changes, PrintWriter changesNew, PrintWriter changesDeletes, PrintWriter changesSummary, Relation<String, String> oldPath2value, Relation<String, String> path2value2)357 private static void compare(PrintWriter summary, PrintWriter changes, PrintWriter changesNew, 358 PrintWriter changesDeletes, PrintWriter changesSummary, Relation<String, String> oldPath2value, 359 Relation<String, String> path2value2) { 360 Set<String> union = Builder.with(new TreeSet<String>()).addAll(oldPath2value.keySet()) 361 .addAll(path2value2.keySet()).get(); 362 long total = 0; 363 Matcher prefixMatcher = prefix.matcher(""); 364 Delta charCount = new Delta(); 365 Delta itemCount = new Delta(); 366 Counter<String> newLength = new Counter<String>(); 367 Counter<String> deletedLength = new Counter<String>(); 368 Counter<String> changedLength = new Counter<String>(); 369 Counter<String> unchangedLength = new Counter<String>(); 370 Set<String> prefixes = new TreeSet(); 371 for (String path : union) { 372 if (!prefixMatcher.reset(path).find()) { 373 throw new IllegalArgumentException(); 374 } 375 String prefix = prefixMatcher.group(1); 376 prefixes.add(prefix); 377 String localPath = prefixMatcher.group(2); 378 Set<String> set1 = oldPath2value.getAll(path); 379 Set<String> set2 = path2value2.getAll(path); 380 if (set2 != null) { 381 total += set2.size(); 382 } 383 if (set1 == null) { 384 changesNew.println(prefix + "\t" + "\t" + set2 + "\t" + localPath); 385 itemCount.newCount.add(prefix, set2.size()); 386 charCount.newCount.add(prefix, totalLength(set2)); 387 } else if (set2 == null) { 388 changesDeletes.println(prefix + "\t" + set1 + "\t\t" + localPath); 389 itemCount.deletedCount.add(prefix, -set1.size()); 390 charCount.deletedCount.add(prefix, -totalLength(set1)); 391 } else if (!set1.equals(set2)) { 392 TreeSet<String> set1minus2 = Builder.with(new TreeSet<String>()).addAll(set1).removeAll(set2).get(); 393 TreeSet<String> set2minus1 = Builder.with(new TreeSet<String>()).addAll(set2).removeAll(set1).get(); 394 TreeSet<String> set2and1 = Builder.with(new TreeSet<String>()).addAll(set2).retainAll(set1).get(); 395 itemCount.changedCount.add(prefix, (set2minus1.size() + set1minus2.size() + 1) / 2); 396 itemCount.unchangedCount.add(prefix, set2and1.size()); 397 charCount.changedCount.add(prefix, (totalLength(set2minus1) + totalLength(set1minus2) + 1) / 2); 398 charCount.unchangedCount.add(prefix, totalLength(set2and1)); 399 changes.println(prefix + "\t" + set1minus2 400 + "\t" 401 + set2minus1 402 + "\t" + localPath); 403 } else { 404 itemCount.unchangedCount.add(prefix, set2.size()); 405 charCount.unchangedCount.add(prefix, totalLength(set2)); 406 } 407 } 408 itemCount.print(changesSummary, prefixes); 409 changesSummary.println(); 410 charCount.print(changesSummary, prefixes); 411 // union = Builder.with(new TreeSet<String>()) 412 // .addAll(newCount.keySet()) 413 // .addAll(deletedCount.keySet()) 414 // .addAll(changedCount.keySet()) 415 // .addAll(unchangedCount.keySet()) 416 // .get(); 417 summary.println("#Total:\t" + total); 418 } 419 totalLength(Set<String> set2)420 private static long totalLength(Set<String> set2) { 421 int result = 0; 422 for (String s : set2) { 423 result += s.length(); 424 } 425 return result; 426 } 427 428 final static Pattern LOCALE_PATTERN = PatternCache.get( 429 "([a-z]{2,3})(?:[_-]([A-Z][a-z]{3}))?(?:[_-]([a-zA-Z0-9]{2,3}))?([_-][a-zA-Z0-9]{1,8})*"); 430 doSummary()431 public static void doSummary() throws IOException { 432 Map<String, R4<Counter<String>, Counter<String>, Counter<String>, Counter<String>>> key_release_count = new TreeMap<String, R4<Counter<String>, Counter<String>, Counter<String>, Counter<String>>>(); 433 Matcher countryLocale = LOCALE_PATTERN.matcher(""); 434 List<String> releases = new ArrayList<String>(); 435 Pattern releaseNumber = PatternCache.get("count_(?:.*-(\\d+(\\.\\d+)*)|trunk)\\.txt"); 436 // int releaseCount = 1; 437 Relation<String, String> release_keys = Relation.of(new TreeMap<String, Set<String>>(), TreeSet.class); 438 Relation<String, String> localesToPaths = Relation.of(new TreeMap<String, Set<String>>(), TreeSet.class); 439 Set<String> writtenLanguages = new TreeSet<String>(); 440 Set<String> countries = new TreeSet<String>(); 441 442 File[] listFiles = new File(OUT_DIRECTORY).listFiles(); 443 // find the most recent version 444 VersionInfo mostRecentVersion = VersionInfo.getInstance(0); 445 for (File subdir : listFiles) { 446 final String name = subdir.getName(); 447 final Matcher releaseMatcher = releaseNumber.matcher(name); 448 if (!releaseMatcher.matches()) { 449 if (name.startsWith("count_")) { 450 throw new IllegalArgumentException("Bad match " + RegexUtilities.showMismatch(releaseMatcher, name)); 451 } 452 continue; 453 } 454 String releaseNum = releaseMatcher.group(1); // "1." + releaseCount++; 455 if (releaseNum == null) { 456 releaseNum = TRUNK_VERSION; 457 } 458 VersionInfo vi = VersionInfo.getInstance(releaseNum); 459 if (vi.compareTo(mostRecentVersion) > 0) { 460 mostRecentVersion = vi; 461 } 462 } 463 464 for (File subdir : listFiles) { 465 final String name = subdir.getName(); 466 final Matcher releaseMatcher = releaseNumber.matcher(name); 467 if (!releaseMatcher.matches()) { 468 if (name.startsWith("count_")) { 469 throw new IllegalArgumentException("Bad match " + RegexUtilities.showMismatch(releaseMatcher, name)); 470 } 471 continue; 472 } 473 String releaseNum = releaseMatcher.group(1); // "1." + releaseCount++; 474 if (releaseNum == null) { 475 releaseNum = TRUNK_VERSION; 476 } 477 VersionInfo vi = VersionInfo.getInstance(releaseNum); 478 boolean captureData = vi.equals(mostRecentVersion); 479 releases.add(releaseNum); 480 BufferedReader in = FileUtilities.openUTF8Reader("", subdir.getCanonicalPath()); 481 while (true) { 482 String line = in.readLine(); 483 if (line == null) break; 484 line = line.trim(); 485 if (line.startsWith("#")) { 486 continue; 487 } 488 // common/main New: [Yellowknife] /gl//ldml/dates/timeZoneNames/zone[@type="America/Yellowknife"]/exemplarCity 489 490 String[] parts = line.split("\t"); 491 try { 492 String file = parts[0]; 493 if (file.startsWith("seed/") || !DIR_FILE_MATCHER.reset(file).find()) { 494 if (VERBOSE) { 495 System.out.println("Skipping: " + RegexUtilities.showMismatch(DIR_FILE_MATCHER, file)); 496 } 497 continue; 498 } else if (VERBOSE) { 499 System.out.println("Including: " + file); 500 } 501 502 long valueCount = Long.parseLong(parts[1]); 503 long valueLen = Long.parseLong(parts[2]); 504 long attrCount = Long.parseLong(parts[3]); 505 long attrLen = Long.parseLong(parts[4]); 506 int lastSlash = file.lastIndexOf("/"); 507 String key2 = file; 508 String path = file.substring(0, lastSlash); 509 String key = file.substring(lastSlash + 1); 510 if (countryLocale.reset(key).matches()) { 511 String lang = countryLocale.group(1); 512 String script = countryLocale.group(2); 513 String country = countryLocale.group(3); 514 String writtenLang = lang + (script == null ? "" : "_" + script); 515 String locale = writtenLang + (country == null ? "" : "_" + country); 516 if (captureData) { 517 localesToPaths.put(locale, path); 518 writtenLanguages.add(writtenLang); 519 if (country != null) { 520 countries.add(country); 521 } 522 } 523 // System.out.println(key + " => " + newKey); 524 //key = writtenLang + "—" + ULocale.getDisplayName(writtenLang, "en"); 525 } 526 if (valueCount + attrCount == 0) continue; 527 release_keys.put(releaseNum, key2); 528 R4<Counter<String>, Counter<String>, Counter<String>, Counter<String>> release_count = key_release_count 529 .get(key2); 530 if (release_count == null) { 531 release_count = Row.of(new Counter<String>(), new Counter<String>(), new Counter<String>(), 532 new Counter<String>()); 533 key_release_count.put(key2, release_count); 534 } 535 release_count.get0().add(releaseNum, valueCount); 536 release_count.get1().add(releaseNum, valueLen); 537 release_count.get2().add(releaseNum, attrCount); 538 release_count.get3().add(releaseNum, attrLen); 539 } catch (Exception e) { 540 throw new IllegalArgumentException(line, e); 541 } 542 } 543 in.close(); 544 } 545 PrintWriter summary = FileUtilities.openUTF8Writer(OUT_DIRECTORY, (MyOptions.directory.option.doesOccur() ? "filtered-" : "") + "summary" + 546 ".txt"); 547 for (String file : releases) { 548 summary.print("\t" + file + "\tlen"); 549 } 550 summary.println(); 551 for (String key : key_release_count.keySet()) { 552 summary.print(key); 553 R4<Counter<String>, Counter<String>, Counter<String>, Counter<String>> release_count = key_release_count 554 .get(key); 555 for (String release2 : releases) { 556 long count = release_count.get0().get(release2) + release_count.get2().get(release2); 557 long len = release_count.get1().get(release2) + release_count.get3().get(release2); 558 summary.print("\t" + count + "\t" + len); 559 } 560 summary.println(); 561 } 562 for (String release : release_keys.keySet()) { 563 System.out.println("Release:\t" + release + "\t" + release_keys.getAll(release).size()); 564 } 565 summary.close(); 566 PrintWriter summary2 = FileUtilities.openUTF8Writer(OUT_DIRECTORY, (MyOptions.directory.option.doesOccur() ? "filtered-" : "") + "locales" + 567 ".txt"); 568 summary2.println("#Languages (inc. script):\t" + writtenLanguages.size()); 569 summary2.println("#Countries:\t" + countries.size()); 570 summary2.println("#Locales:\t" + localesToPaths.size()); 571 for (Entry<String, Set<String>> entry : localesToPaths.keyValuesSet()) { 572 summary2.println(entry.getKey() + "\t" + CollectionUtilities.join(entry.getValue(), "\t")); 573 } 574 summary2.close(); 575 } 576 577 static final Set<String> ATTRIBUTES_TO_SKIP = Builder.with(new HashSet<String>()) 578 .addAll("version", "references", "standard", "draft").freeze(); 579 static final Pattern skipPath = PatternCache.get("" + 580 "\\[\\@alt=\"[^\"]*proposed" + 581 "|^//" + 582 "(ldml(\\[[^/]*)?/identity" + 583 "|(ldmlBCP47|supplementalData|keyboard)(\\[[^/]*)?/(generation|version)" + 584 ")"); 585 capture(DtdType type2, XPathParts parts)586 static void capture(DtdType type2, XPathParts parts) { 587 for (int i = 0; i < parts.size(); ++i) { 588 String element = parts.getElement(i); 589 ELEMENTS_OCCURRING.put(element, type2); 590 for (String attribute : parts.getAttributes(i).keySet()) { 591 ATTRIBUTES_OCCURRING.put(attribute, Row.of(type2, element)); 592 } 593 } 594 } 595 596 static class MyHandler extends SimpleHandler { 597 XPathParts parts = new XPathParts(); 598 long valueCount; 599 long valueLen; 600 long attributeCount; 601 long attributeLen; 602 Matcher skipPathMatcher = skipPath.matcher(""); 603 Splitter lines = Splitter.onPattern("\n+").omitEmptyStrings().trimResults(); 604 String prefix; 605 int orderedCount; 606 DtdType type; 607 private final boolean isFinal; 608 MyHandler(String prefix, boolean isFinal)609 MyHandler(String prefix, boolean isFinal) { 610 this.prefix = prefix; 611 this.isFinal = isFinal; 612 } 613 614 @Override handlePathValue(String path, String value)615 public void handlePathValue(String path, String value) { 616 if (type == null) { 617 parts.set(path); 618 type = DtdType.valueOf(parts.getElement(0)); 619 } 620 621 ATTRIBUTE_TYPES.add(path); 622 623 if (skipPathMatcher.reset(path).find()) { 624 return; 625 } 626 String pathKey = null; 627 if (doChanges) { 628 // if (path.contains("/collations")) { 629 // System.out.println("whoops"); 630 // } 631 pathKey = fixKeyPath(path); 632 } 633 int len = value.length(); 634 value = value.trim(); 635 if (value.isEmpty() && len > 0) { 636 value = " "; 637 } 638 if (value.length() != 0) { 639 List<String> valueLines = lines.splitToList(value); 640 if (valueLines.size() == 1) { 641 valueCount++; 642 valueLen += value.length(); 643 if (doChanges) { 644 path2value.put(pathKey, value); 645 } 646 } else { 647 int count = 0; 648 for (String v : valueLines) { 649 valueCount++; 650 valueLen += v.length(); 651 if (doChanges) { 652 path2value.put(pathKey + "/_q" + count++, v); 653 } 654 } 655 } 656 } 657 parts.set(path); 658 if (isFinal) { 659 capture(type, parts); 660 } 661 if (path.contains("[@")) { 662 int i = parts.size() - 1; // only look at last item 663 Collection<String> attributes = parts.getAttributeKeys(i); 664 if (attributes.size() != 0) { 665 String element = parts.getElement(i); 666 for (String attribute : attributes) { 667 if (ATTRIBUTES_TO_SKIP.contains(attribute) 668 || CLDRFile.isDistinguishing(type, element, attribute)) { 669 continue; 670 } 671 String valuePart = parts.getAttributeValue(i, attribute); 672 // String[] valueParts = attrValue.split("\\s"); 673 // for (String valuePart : valueParts) { 674 attributeCount++; 675 attributeLen += valuePart.length(); 676 if (doChanges) { 677 path2value.put(pathKey + "/_" + attribute, valuePart); 678 // } 679 } 680 } 681 } 682 } 683 } 684 fixKeyPath(String path)685 private String fixKeyPath(String path) { 686 parts.set(path); 687 for (int i = 0; i < parts.size(); ++i) { 688 String element = parts.getElement(i); 689 if (!SKIP_ORDERING) { 690 if (CLDRFile.isOrdered(element, type)) { 691 parts.addAttribute("_q", String.valueOf(orderedCount++)); 692 } 693 } 694 } 695 return prefix + CLDRFile.getDistinguishingXPath(parts.toString(), null, false); 696 } 697 } 698 check(String systemID, String name, boolean isFinal)699 private MyHandler check(String systemID, String name, boolean isFinal) { 700 MyHandler myHandler = new MyHandler(name, isFinal); 701 try { 702 XMLFileReader reader = new XMLFileReader().setHandler(myHandler); 703 reader.read(systemID, XMLFileReader.CONTENT_HANDLER, true); 704 } catch (Exception e) { 705 cantRead.put(name, Arrays.asList(e.getStackTrace())); 706 } 707 return myHandler; 708 709 // try { 710 // FileInputStream fis = new FileInputStream(systemID); 711 // XMLFileReader xmlReader = XMLFileReader.createXMLReader(true); 712 // xmlReader.setErrorHandler(new MyErrorHandler()); 713 // MyHandler myHandler = new MyHandler(); 714 // smlReader 715 // xmlReader.setHandler(myHandler); 716 // InputSource is = new InputSource(fis); 717 // is.setSystemId(systemID.toString()); 718 // xmlReader.parse(is); 719 // fis.close(); 720 // return myHandler; 721 // } catch (SAXParseException e) { 722 // System.out.println("\t" + "Can't read " + systemID); 723 // System.out.println("\t" + e.getClass() + "\t" + e.getMessage()); 724 // } catch (SAXException e) { 725 // System.out.println("\t" + "Can't read " + systemID); 726 // System.out.println("\t" + e.getClass() + "\t" + e.getMessage()); 727 // } catch (IOException e) { 728 // System.out.println("\t" + "Can't read " + systemID); 729 // System.out.println("\t" + e.getClass() + "\t" + e.getMessage()); 730 // } 731 } 732 733 static class MyErrorHandler implements ErrorHandler { error(SAXParseException exception)734 public void error(SAXParseException exception) throws SAXException { 735 System.out.println("\nerror: " + XMLFileReader.showSAX(exception)); 736 throw exception; 737 } 738 fatalError(SAXParseException exception)739 public void fatalError(SAXParseException exception) throws SAXException { 740 System.out.println("\nfatalError: " + XMLFileReader.showSAX(exception)); 741 throw exception; 742 } 743 warning(SAXParseException exception)744 public void warning(SAXParseException exception) throws SAXException { 745 System.out.println("\nwarning: " + XMLFileReader.showSAX(exception)); 746 throw exception; 747 } 748 } 749 summarizeCoverage(PrintWriter summary, String commonDir, boolean isFinal)750 private void summarizeCoverage(PrintWriter summary, String commonDir, boolean isFinal) { 751 System.out.println(commonDir); 752 summary.println("#name" + "\t" + "value-count" + "\t" + "value-len" + "\t" + "attr-count" + "\t" + "attr-len"); 753 File commonDirectory = new File(commonDir); 754 if (!commonDirectory.exists()) { 755 System.out.println("Doesn't exist:\t" + commonDirectory); 756 } 757 summarizeFiles(summary, commonDirectory, isFinal, 1); 758 } 759 760 static final Set<String> SKIP_DIRS = new HashSet<>(Arrays.asList("specs", "tools", "seed", "exemplars")); 761 summarizeFiles(PrintWriter summary, File directory, boolean isFinal, int level)762 public void summarizeFiles(PrintWriter summary, File directory, boolean isFinal, int level) { 763 System.out.println("\t\t\t\t\t\t\t".substring(0, level) + directory); 764 int count = 0; 765 for (File file : directory.listFiles()) { 766 String filename = file.getName(); 767 if (filename.startsWith(".")) { 768 // do nothing 769 } else if (file.isDirectory()) { 770 if (!SKIP_DIRS.contains(filename)) { 771 summarizeFiles(summary, file, isFinal, level + 1); 772 } 773 } else if (!filename.startsWith("#") && filename.endsWith(".xml")) { 774 String name = new File(directory.getParent()).getName() + "/" + directory.getName() + "/" 775 + file.getName(); 776 name = name.substring(0, name.length() - 4); // strip .xml 777 if (!RAW_FILE_MATCHER.reset(name).find()) { 778 continue; 779 } 780 if (VERBOSE) { 781 System.out.println(name); 782 } else { 783 System.out.print("."); 784 if (++count > 100) { 785 count = 0; 786 System.out.println(); 787 } 788 System.out.flush(); 789 } 790 MyHandler handler = check(file.toString(), name, isFinal); 791 summary.println(name + "\t" + handler.valueCount + "\t" + handler.valueLen + "\t" 792 + handler.attributeCount + "\t" + handler.attributeLen); 793 } 794 } 795 System.out.println(); 796 } 797 } 798