1 package org.unicode.cldr.tool; 2 3 import java.io.File; 4 import java.io.FileReader; 5 import java.io.IOException; 6 import java.io.OutputStreamWriter; 7 import java.io.Reader; 8 import java.io.Writer; 9 import java.util.ArrayList; 10 import java.util.Arrays; 11 import java.util.Collections; 12 import java.util.EnumSet; 13 import java.util.HashMap; 14 import java.util.HashSet; 15 import java.util.Iterator; 16 import java.util.LinkedHashMap; 17 import java.util.LinkedHashSet; 18 import java.util.List; 19 import java.util.Locale; 20 import java.util.Map; 21 import java.util.Map.Entry; 22 import java.util.Set; 23 import java.util.Stack; 24 import java.util.TreeMap; 25 import java.util.TreeSet; 26 import java.util.regex.Matcher; 27 import java.util.regex.Pattern; 28 29 import org.unicode.cldr.tool.Option.Options; 30 import org.unicode.cldr.util.CLDRPaths; 31 import org.unicode.cldr.util.CLDRTool; 32 import org.unicode.cldr.util.ChainedMap; 33 import org.unicode.cldr.util.ChainedMap.M4; 34 import org.unicode.cldr.util.CldrUtility; 35 import org.unicode.cldr.util.Counter; 36 import org.unicode.cldr.util.DtdData; 37 import org.unicode.cldr.util.DtdData.Attribute; 38 import org.unicode.cldr.util.DtdData.Element; 39 import org.unicode.cldr.util.DtdType; 40 import org.unicode.cldr.util.Pair; 41 import org.unicode.cldr.util.PathUtilities; 42 import org.unicode.cldr.util.PatternCache; 43 import org.unicode.cldr.util.RegexUtilities; 44 import org.unicode.cldr.util.SimpleHtmlParser; 45 import org.unicode.cldr.util.SimpleHtmlParser.Type; 46 import org.unicode.cldr.util.TransliteratorUtilities; 47 48 import com.google.common.base.Joiner; 49 import com.google.common.collect.ImmutableSet; 50 import com.ibm.icu.impl.Relation; 51 import com.ibm.icu.impl.Row.R4; 52 import com.ibm.icu.text.BreakIterator; 53 import com.ibm.icu.util.Output; 54 import com.ibm.icu.util.ULocale; 55 56 @CLDRTool(alias = "checkhtmlfiles", description = "Look for errors in CLDR documentation tools", hidden = "Used for CLDR process") 57 public class CheckHtmlFiles { 58 59 static final Set<String> NOPOP = new HashSet<>(Arrays.asList("br", "img", "link", "meta", "!doctype", "hr", "col", "input")); 60 61 static final EnumSet<Type> SUPPRESS = EnumSet.of( 62 Type.ELEMENT, Type.ELEMENT_START, Type.ELEMENT_END, Type.ELEMENT_POP, 63 Type.ATTRIBUTE, Type.ATTRIBUTE_CONTENT); 64 65 final static Options myOptions = new Options(); 66 final static Writer LOG = new OutputStreamWriter(System.out); 67 static Pattern WELLFORMED_HEADER = PatternCache.get("\\s*(\\d+(\\.\\d+)*\\s*).*"); 68 static Pattern SUPPRESS_SECTION_NUMBER = PatternCache.get( 69 "(Annex [A-Z]: .*)" + 70 "|(Appendix [A-Z].*)" + 71 "|(.*Migrati(on|ng).*)" + 72 "|Step \\d+.*" + 73 "|Example \\d+.*" + 74 "|D\\d+\\.\\s.*" + 75 "|References" + 76 "|Acknowledge?ments" + 77 "|Rights to .*Images" + 78 "|Modifications" + 79 "|(Revision \\d+\\.?)"); 80 static Pattern SUPPRESS_REVISION = PatternCache.get("Revision \\d+\\.?"); 81 static Pattern SPACES = PatternCache.get("\\s+"); 82 83 enum MyOptions { 84 // old(".*", Settings.OTHER_WORKSPACE_DIRECTORY + "cldr-archive/cldr-22.1/specs/ldml/tr35\\.html", "source data (regex)"), 85 target(".*", CLDRPaths.BASE_DIRECTORY + "specs" + File.separator + "ldml" + File.separator + 86 "tr35(-.*)?\\.html", "target data (regex); ucd for Unicode docs; " 87 + "for others use the format -t ${workspace_loc}/unicode-draft/reports/tr51/tr51.html"), verbose(".*", "none", "verbose debugging messages"), 88 // contents(".*", CLDRPaths.BASE_DIRECTORY + "specs/ldml/tr35(-.*)?\\.html", "generate contents"), 89 // /cldr-archive 90 ; 91 92 // boilerplate 93 final Option option; 94 MyOptions(String argumentPattern, String defaultArgument, String helpText)95 MyOptions(String argumentPattern, String defaultArgument, String helpText) { 96 option = myOptions.add(this, argumentPattern, defaultArgument, helpText); 97 } 98 } 99 100 enum Verbosity { 101 none, element, all; of(String input)102 static Verbosity of(String input) { 103 return input == null ? Verbosity.none : Verbosity.valueOf(input.toLowerCase(Locale.ROOT)); 104 } 105 } 106 107 static Verbosity verbose; 108 static boolean doContents; 109 static boolean isLdml; 110 main(String[] args)111 public static void main(String[] args) throws IOException { 112 System.out.println("First do a replace of <a\\s+name=\"([^\"]*)\"\\s*> by <a name=\"$1\" href=\"#$1\">"); 113 System.out.println("Then check for all links with no anchors: <a([^>]*)></a>"); 114 System.out.println("Then check for all links that don't start with name or href <a (?!href|name)"); 115 116 myOptions.parse(MyOptions.target, args, true); 117 verbose = Verbosity.of(MyOptions.verbose.option.getValue()); 118 119 String targetString = MyOptions.target.option.getValue(); 120 if (targetString.contains("ldml")) { 121 isLdml = true; 122 } 123 if (targetString.equalsIgnoreCase("ucd")) { 124 targetString = CLDRPaths.BASE_DIRECTORY + "../unicode-draft/reports/tr(\\d+)/tr(\\d+).html"; 125 } else if (targetString.equalsIgnoreCase("security")) { 126 targetString = CLDRPaths.BASE_DIRECTORY + "../unicode-draft/reports/tr(3[69])/tr(3[69]).html"; 127 } 128 Data target = new Data().getSentences(targetString); 129 if (target.count == 0) { 130 throw new IllegalArgumentException("No files matched with " + targetString); 131 } 132 133 if (isLdml) { 134 checkForDtd(target); 135 } 136 137 System.out.println("*TOTAL COUNTS* files:" + target.count + ", fatal errors:" + target.totalFatalCount + ", nonfatal errors:" 138 + target.totalErrorCount); 139 if (target.totalFatalCount > 0 || target.totalErrorCount > 0) { 140 System.exit(1); // give an error status 141 } 142 143 System.exit(0); 144 145 // Data source = new Data().getSentences(MyOptions.old.option.getValue()); 146 // String file = MyOptions.target.option.getValue(); 147 // 148 // Data target = new Data().getSentences(file); 149 // 150 // int missingCount = 0, extraCount = 0; 151 // int line = 0; 152 // for (String sentence : source) { 153 // ++line; 154 // long sourceCount = source.getCount(sentence); 155 // long targetCount = target.getCount(sentence); 156 // if (targetCount == 0) { 157 // System.out.println(line + "\tMISSING:\t" + sourceCount + "≠" + targetCount + "\t" + sentence); 158 // ++missingCount; 159 // } 160 // } 161 // line = 0; 162 // for (String sentence : target) { 163 // ++line; 164 // long sourceCount = source.getCount(sentence); 165 // long targetCount = target.getCount(sentence); 166 // if (sourceCount == 0) { 167 // System.out.println(line + "\tEXTRA:\t" + targetCount + "≠" + sourceCount + "\t" + sentence); 168 // ++extraCount; 169 // } 170 // } 171 // System.out.println("Missing:\t" + missingCount); 172 // System.out.println("Extra:\t" + extraCount); 173 } 174 175 private static final Set<String> SKIP_ATTR = ImmutableSet.of("draft", "alt", "references", "cldrVersion", "unicodeVersion"); 176 checkForDtd(Data target)177 private static void checkForDtd(Data target) { 178 M4<String, String, DtdType, Boolean> typeToElements = ChainedMap.of(new TreeMap(), new TreeMap(), new TreeMap(), Boolean.class); 179 for (DtdType type : DtdType.values()) { 180 if (type == DtdType.ldmlICU) continue; 181 DtdData dtdData = DtdData.getInstance(type); 182 Set<Element> elements = dtdData.getElements(); 183 for (Element element : elements) { 184 if (element.isDeprecated() 185 || element.equals(dtdData.PCDATA) 186 || element.equals(dtdData.ANY)) continue; 187 typeToElements.put(element.name, element.toDtdString(), type, Boolean.TRUE); 188 } 189 Set<Attribute> attributes = dtdData.getAttributes(); 190 for (Attribute attribute : attributes) { 191 if (attribute.isDeprecated()) continue; 192 if (SKIP_ATTR.contains(attribute.name)) { 193 continue; 194 } 195 typeToElements.put(attribute.element.name, attribute.appendDtdString(new StringBuilder()).toString(), type, Boolean.TRUE); 196 } 197 } 198 final Map<String, String> skeletonToInFile = new HashMap<>(); 199 Relation<String, String> extra = new Relation(new TreeMap(), TreeSet.class); 200 for (R4<String, String, String, Boolean> elementItem : target.dtdItems.rows()) { 201 String file = elementItem.get0(); 202 String element = elementItem.get1(); 203 String item = elementItem.get2(); 204 extra.put(element, item); 205 skeletonToInFile.put(item.replace(" ", ""), item); 206 } 207 ChainedMap.M4<String, String, DtdType, Comparison> status = ChainedMap.of(new TreeMap(), new TreeMap(), new TreeMap(), Comparison.class); 208 for (R4<String, String, DtdType, Boolean> entry : typeToElements.rows()) { 209 final String element = entry.get0(); 210 final String key = entry.get1(); 211 final DtdType dtdType = entry.get2(); 212 String spaceless = key.replace(" ", ""); 213 String realKey = skeletonToInFile.get(spaceless); 214 if (realKey == null) { 215 status.put(element, key, dtdType, Comparison.missing); 216 } else { 217 boolean found = extra.remove(element, realKey); 218 if (!found) { 219 status.put(element, key, dtdType, Comparison.no_rem); 220 } 221 } 222 } 223 for (Entry<String, String> extraItem : extra.entrySet()) { 224 status.put(extraItem.getKey(), extraItem.getValue(), DtdType.ldmlICU, Comparison.extra); 225 } 226 TreeSet<String> reverse = new TreeSet<>(Collections.reverseOrder()); 227 for (Entry<String, Map<String, Map<DtdType, Comparison>>> entry1 : status) { 228 String element = entry1.getKey(); 229 reverse.clear(); 230 final Map<String, Map<DtdType, Comparison>> itemToDtdTypeToComparison = entry1.getValue(); 231 reverse.addAll(itemToDtdTypeToComparison.keySet()); 232 for (String item : reverse) { 233 Map<DtdType, Comparison> typeToComparison = itemToDtdTypeToComparison.get(item); 234 for (Entry<DtdType, Comparison> entry2 : typeToComparison.entrySet()) { 235 System.out.println(element 236 + "\t" + entry2.getValue() 237 + "\t" + CldrUtility.ifSame(entry2.getKey(), DtdType.ldmlICU, "") 238 + "\t" + item); 239 } 240 } 241 } 242 } 243 244 enum Comparison { 245 missing, extra, no_rem 246 } 247 248 static Pattern WHITESPACE = PatternCache.get("[\\s]+"); 249 static Pattern BADSECTION = PatternCache.get("^\\s*(\\d+\\s*)?Section\\s*\\d+\\s*[-:]\\s*"); 250 251 static final Set<String> FORCEBREAK = new HashSet<>(Arrays.asList( 252 "table", "div", "blockquote", 253 "p", "br", "td", "th", "h1", "h2", "h3", "h4", "h5", "li")); 254 255 // enum ContentsElements {h1, h2, h3, h4, h5, caption} 256 257 static final Set<String> DO_CONTENTS = new HashSet<>(Arrays.asList( 258 "h1", "h2", "h3", "h4", "h5", "caption")); 259 260 static class Levels implements Comparable<Levels> { 261 final int[] levels = new int[10]; 262 final int h2_start; 263 Levels(int h2_start)264 public Levels(int h2_start) { 265 levels[0] = h2_start; // special adjustment of starting header level 266 this.h2_start = h2_start; 267 } 268 Levels()269 public Levels() { 270 this(0); 271 } 272 273 /** 274 * h2 = level 0, h3 is level 1, etc. 275 * @param level 276 * @return 277 */ next(int level, Output<Boolean> missingLevel)278 Levels next(int level, Output<Boolean> missingLevel) { 279 level -= 2; // h2 = level 0 280 missingLevel.value = false; 281 if (levels[0] < h2_start) { 282 missingLevel.value = true; 283 } 284 for (int i = 1; i < level; ++i) { 285 if (levels[i] == 0) { 286 missingLevel.value = true; 287 } 288 } 289 levels[level]++; 290 for (int i = level + 1; i < levels.length; ++i) { 291 levels[i] = 0; 292 } 293 return this; 294 } 295 getDepth()296 public int getDepth() { 297 for (int i = 0;; ++i) { 298 int level = levels[i]; 299 if (level == 0) { 300 return i - 1; 301 } 302 } 303 } 304 305 @Override toString()306 public String toString() { 307 StringBuilder b = new StringBuilder(); 308 for (int i = 0;; ++i) { 309 int level = levels[i]; 310 if (level == 0) { 311 return b.toString(); 312 } 313 if (b.length() != 0) { 314 b.append('.'); 315 } 316 b.append(level); 317 } 318 } 319 parse(String group)320 public static Levels parse(String group) { 321 Levels result = new Levels(); 322 int currentLevel = 0; 323 for (int i = 0; i < group.length(); ++i) { 324 char ch = group.charAt(i); 325 if (ch == '.') { 326 currentLevel++; 327 } else { 328 ch -= '0'; 329 if (ch > '9') { 330 break; 331 } 332 result.levels[currentLevel] = result.levels[currentLevel] * 10 + ch; 333 } 334 } 335 return result; 336 } 337 338 @Override compareTo(Levels other)339 public int compareTo(Levels other) { 340 for (int i = 0; i < levels.length; ++i) { 341 if (levels[i] != other.levels[i]) { 342 return levels[i] < other.levels[i] ? -1 : 1; 343 } 344 } 345 return 0; 346 } 347 set(Levels other)348 public void set(Levels other) { 349 for (int i = 0; i < levels.length; ++i) { 350 levels[i] = other.levels[i]; 351 } 352 } 353 } 354 355 static class HeadingInfo { 356 private Levels levels = new Levels(); 357 private String text = ""; 358 private Set<String> ids = new LinkedHashSet<>(); 359 private boolean suppressSection; 360 private boolean isHeader; 361 362 // temporary 363 private int level; 364 setLevel(String headingLabel, HeadingInfo lastHeading)365 public void setLevel(String headingLabel, HeadingInfo lastHeading) { 366 isHeader = !headingLabel.equals("caption"); 367 level = isHeader ? headingLabel.charAt(1) - '0' : lastHeading.level; 368 } 369 370 @Override toString()371 public String toString() { 372 // <h3><a name="Identity_Elements" href="#Identity_Elements">5.3 Identity Elements</a></h3> 373 String id = ids.isEmpty() ? "NOID" : ids.iterator().next(); 374 String result = "<" + getLabel() 375 + "<a name=\"" + id + "\" href=\"#" + id + "\">" 376 + (!isHeader ? "" : suppressSection ? "" : levels + " ") 377 + TransliteratorUtilities.toHTML.transform(text) 378 + "</a>"; 379 if (ids.size() > 1) { 380 boolean first = true; 381 for (String id2 : ids) { 382 if (first) { 383 first = false; 384 } else { 385 result += "<a name=\"" + id2 + "\"></a>"; 386 } 387 } 388 } 389 return result + "</" + getLabel(); 390 } 391 getLabel()392 public String getLabel() { 393 return isHeader ? "h" + level + ">" : "caption>"; 394 } 395 toHeader()396 public String toHeader() { 397 String id = ids.iterator().next(); 398 return ("<li>" 399 + (!isHeader ? (text.contains("Table") || text.contains("Figure") ? "" : "Table: ") : suppressSection ? "" : levels + " ") 400 + "<a href=\"#" + id + "\">" 401 + TransliteratorUtilities.toHTML.transform(text) 402 + "</a>"); 403 } 404 addText(String toAppend)405 public void addText(String toAppend) { 406 String temp = TransliteratorUtilities.fromHTML.transform(toAppend); 407 if (text.isEmpty()) { 408 if (temp.startsWith(" ")) { 409 text = temp.substring(1); 410 } else { 411 text = temp; 412 } 413 } else { 414 text += temp; 415 } 416 text = SPACES.matcher(text).replaceAll(" "); // clean up all spaces; make more efficient later 417 // used to trim, but we need to retain space between elements. So only trim the start, and later, the end 418 } 419 isContents()420 public boolean isContents() { 421 return text.toString().startsWith("Contents"); 422 } 423 addId(String id)424 void addId(String id) { 425 this.ids.add(id); 426 } 427 setLevels(int line, Levels levels, Set<String> errors)428 public void setLevels(int line, Levels levels, Set<String> errors) { 429 this.levels.set(levels); 430 String error = ""; 431 if (badSectionMatcher.reset(text).find()) { 432 text = text.substring(badSectionMatcher.end()); 433 error += "Extra 'Section...' at start; "; 434 } 435 if (isHeader) { 436 if (!headerMatcher.reset(text).matches()) { 437 if (!SUPPRESS_SECTION_NUMBER.matcher(text).matches()) { 438 error += "Missing section numbers; "; 439 } 440 } else { 441 text = text.substring(headerMatcher.end(1)); 442 if (text.startsWith(".")) { 443 text = text.substring(1).trim(); 444 error += "Extra . at start; "; 445 } 446 Levels parsedLevels = Levels.parse(headerMatcher.group(1)); 447 if (levels.compareTo(parsedLevels) != 0) { 448 error += "Section numbers mismatch, was " + parsedLevels + "; "; 449 } 450 } 451 } 452 if (ids.isEmpty()) { 453 addId(text.toString().trim().replaceAll("[^A-Za-z0-9]+", "_")); 454 error += "Missing double link"; 455 } 456 if (!error.isEmpty()) { 457 errors.add(this + "\t<!-- " + line + ": " + error + " -->"); 458 } 459 suppressSection = SUPPRESS_SECTION_NUMBER.matcher(text).matches(); 460 } 461 addIds(Counter<String> idCounter)462 public void addIds(Counter<String> idCounter) { 463 for (String id : ids) { 464 idCounter.add(id, 1); 465 } 466 } 467 fixText()468 public HeadingInfo fixText() { 469 if (text.endsWith(" ")) { 470 text = text.substring(0, text.length() - 1); 471 } 472 return this; 473 } 474 } 475 476 static Matcher headerMatcher = WELLFORMED_HEADER.matcher(""); 477 static Matcher badSectionMatcher = BADSECTION.matcher(""); 478 479 static class HeadingInfoList { 480 private static final long serialVersionUID = -6722150173224993960L; 481 Levels lastBuildLevel; 482 private Set<String> errors = new LinkedHashSet<>(); 483 Output<Boolean> missingLevel = new Output<>(false); 484 private String fileName; 485 ArrayList<HeadingInfo> list = new ArrayList<>(); 486 HeadingInfoList(String fileName, int h2_START)487 public HeadingInfoList(String fileName, int h2_START) { 488 this.fileName = fileName; 489 lastBuildLevel = new Levels(h2_START); 490 } 491 add(int line, HeadingInfo h)492 public boolean add(int line, HeadingInfo h) { 493 h.fixText(); 494 if (SUPPRESS_REVISION.matcher(h.text).matches()) { 495 return false; 496 } 497 if (h.isHeader) { 498 h.setLevels(line, lastBuildLevel.next(h.level, missingLevel), errors); 499 } else { 500 h.setLevels(line, lastBuildLevel, errors); 501 } 502 if (missingLevel.value) { 503 errors.add("FATAL: Missing Level in: " + h); 504 } 505 return list.add(h); 506 } 507 508 static final String PAD = "\t"; 509 listContents()510 public void listContents() { 511 512 System.out.print("\n\t\t<!-- START Generated TOC: CheckHtmlFiles -->"); 513 Counter<String> idCounter = new Counter<>(); 514 515 int lastLevel = new Levels().getDepth(); 516 String pad = PAD; 517 int ulCount = 0; 518 int liCount = 0; 519 for (HeadingInfo h : list) { 520 h.addIds(idCounter); 521 final int depth = h.levels.getDepth() + (h.isHeader ? 0 : 1); 522 int levelDiff = depth - lastLevel; 523 lastLevel = depth; 524 if (levelDiff > 0) { 525 System.out.println(); 526 for (int i = 0; i < levelDiff; ++i) { 527 pad += PAD; 528 System.out.println(pad + "<ul class=\"toc\">"); 529 ++ulCount; 530 } 531 pad += PAD; 532 } else if (levelDiff < 0) { 533 System.out.println("</li>"); 534 --liCount; 535 for (int i = 0; i > levelDiff; --i) { 536 pad = pad.substring(PAD.length()); 537 System.out.println(pad + "</ul>"); 538 --ulCount; 539 pad = pad.substring(PAD.length()); 540 System.out.println(pad + "</li>"); 541 --liCount; 542 } 543 } else { 544 System.out.println("</li>"); 545 --liCount; 546 } 547 548 System.out.print(pad + h.toHeader()); 549 ++liCount; 550 551 // <li>1.1 <a href="#Conformance">Conformance</a></li> 552 553 // <ul class="toc"> 554 // <li>1 <a href="#Introduction">Introduction</a> 555 // <ul class="toc"> 556 // <li>1.1 <a href="#Conformance">Conformance</a> 557 // </li> 558 // ... 559 // </ul> 560 // </li> 561 } 562 563 // finish up and make sure we are balances 564 565 int levelDiff = -lastLevel; 566 System.out.println("</li>"); 567 --liCount; 568 for (int i = 0; i > levelDiff; --i) { 569 pad = pad.substring(PAD.length()); 570 System.out.println(pad + "</ul>"); 571 --ulCount; 572 pad = pad.substring(PAD.length()); 573 System.out.println(pad + "</li>"); 574 --liCount; 575 } 576 pad = pad.substring(PAD.length()); 577 System.out.println(pad + "</ul>"); 578 System.out.println(pad + "<!-- END Generated TOC: CheckHtmlFiles -->"); 579 --ulCount; 580 if (liCount != 0 || ulCount != 0) { 581 throw new IllegalArgumentException("Mismatched counts in generated contents, li:" + liCount + ", ul:" + ulCount); 582 } 583 for (String id : idCounter) { 584 long count = idCounter.get(id); 585 if (count != 1) { 586 errors.add("FATAL: Non-Unique ID: " + id); 587 } 588 } 589 } 590 591 /** 592 * Prints out errs 593 * @return fatal err count 594 */ showErrors()595 public int showErrors() { 596 int fatalCount = 0; 597 if (!errors.isEmpty()) { 598 System.out.println("\n*ERRORS*\n"); 599 for (String error : errors) { 600 if (error.startsWith("FATAL:")) { 601 System.out.println(fileName + "\t" + error); 602 fatalCount++; 603 } 604 } 605 if (fatalCount == 0) { 606 for (String error : errors) { 607 System.out.println(fileName + "\t" + error); 608 } 609 } 610 } 611 if (this.list.size() == 0) { 612 System.out.println("No header items (eg <h2>) captured."); 613 fatalCount = 1; 614 } 615 return fatalCount; 616 } 617 618 /** 619 * @return total number of errors 620 */ totalErrorCount()621 public int totalErrorCount() { 622 return errors.size(); 623 } 624 } 625 626 static class ElementLine { 627 final String element; 628 final int line; 629 ElementLine(String element, int line)630 public ElementLine(String element, int line) { 631 super(); 632 this.element = element; 633 this.line = line; 634 } 635 636 @Override toString()637 public String toString() { 638 return element + '[' + line + ']'; 639 } 640 } 641 642 static class Data implements Iterable<String> { 643 private static final Pattern ELEMENT_ATTLIST = Pattern.compile("<!(ELEMENT|ATTLIST)\\s+(\\S+)[^>]*>"); 644 List<String> sentences = new ArrayList<>(); 645 M4<String, String, String, Boolean> dtdItems = ChainedMap.of( 646 new LinkedHashMap<String, Object>(), 647 new TreeMap<String, Object>(), 648 new TreeMap<String, Object>(), Boolean.class); 649 Counter<String> hashedSentences = new Counter<>(); 650 int count = 0; 651 int totalErrorCount = 0; 652 int totalFatalCount = 0; 653 getSentences(String fileRegex)654 public Data getSentences(String fileRegex) throws IOException { 655 String base; 656 String regex; 657 try { 658 int firstParen = fileRegex.indexOf('('); 659 if (firstParen < 0) { 660 firstParen = fileRegex.length(); 661 } 662 int lastSlash = fileRegex.lastIndexOf(File.separatorChar, firstParen); 663 base = fileRegex.substring(0, lastSlash); 664 regex = fileRegex.substring(lastSlash + 1); 665 } catch (Exception e) { 666 throw new IllegalArgumentException("Target file must be in special format. " + 667 "Up to the first path part /.../ containing a paragraph is constant, and the rest is a regex."); 668 } 669 670 //File sourceFile = new File(fileRegex); 671 File sourceDirectory = new File(base); 672 if (!sourceDirectory.exists()) { 673 throw new IllegalArgumentException("Can't find " + sourceDirectory); 674 } 675 String canonicalBase = PathUtilities.getNormalizedPathString(sourceDirectory); 676 String FileRegex = canonicalBase + File.separator + regex; 677 FileRegex = FileRegex.replace("\\", "\\\\"); 678 FileRegex = FileRegex.replace("\\\\.", "\\."); 679 Matcher m = PatternCache.get(FileRegex).matcher(""); 680 System.out.println("Matcher: " + m); 681 682 return getSentences(sourceDirectory, m); 683 } 684 getSentences(File sourceDirectory, Matcher m)685 public Data getSentences(File sourceDirectory, Matcher m) throws IOException { 686 //System.out.println("Processing:\t" + sourceDirectory); 687 for (File file : sourceDirectory.listFiles()) { 688 if (file.isDirectory()) { 689 getSentences(file, m); 690 continue; 691 } 692 String fileString = file.getCanonicalFile().toString(); 693 File fileCanonical = new File(fileString); 694 if (!m.reset(fileString).matches()) { 695 if (verbose == Verbosity.all) { 696 System.out.println("Skipping: " + RegexUtilities.showMismatch(m, fileString) 697 + "\t" + sourceDirectory); 698 } 699 continue; 700 } 701 702 System.out.println("\nProcessing:\t" + sourceDirectory + File.separator + fileString); 703 704 int H2_START = fileString.contains("tr18") ? -1 : 0; 705 try (Reader in = new FileReader(fileCanonical)) { 706 parseFile(fileCanonical, H2_START, in); 707 } 708 } 709 return this; 710 } 711 712 SimpleHtmlParser parser = new SimpleHtmlParser(); 713 parseFile(File fileCanonical, int H2_START, Reader in)714 public void parseFile(File fileCanonical, int H2_START, Reader in) throws IOException { 715 Matcher wsMatcher = WHITESPACE.matcher(""); 716 ++count; 717 // SimpleHtmlParser parser = new SimpleHtmlParser().setReader(in); 718 parser.setReader(in); 719 StringBuilder buffer = new StringBuilder(); 720 StringBuilder content = new StringBuilder(); 721 HeadingInfo heading = new HeadingInfo(); 722 final String fileName = fileCanonical.getName(); 723 HeadingInfoList headingInfoList = new HeadingInfoList(fileName, H2_START); 724 Stack<ElementLine> elementStack = new Stack<>(); 725 Stack<Pair<String, String>> attributeStack = new Stack<>(); 726 String contentString; 727 boolean inHeading = false; 728 boolean inPop = false; 729 boolean inAnchor = false; 730 boolean haveContents = false; 731 HeadingInfo lastHeading = null; 732 // for detecting missing captions 733 boolean pushedTable = false; 734 boolean checkCaption = false; 735 List<Integer> captionWarnings = new ArrayList<>(); 736 737 main: while (true) { 738 int lineCount = parser.getLineCount(); 739 Type x = parser.next(content); 740 if (verbose == Verbosity.all && !SUPPRESS.contains(x)) { 741 LOG.write(parser.getLineCount() + "\t" + x + ":\t«" + content + "»"); 742 //SimpleHtmlParser.writeResult(x, content, LOG); 743 LOG.write("\n"); 744 LOG.flush(); 745 } 746 switch (x) { 747 case QUOTE: 748 contentString = content.toString().toLowerCase(Locale.ENGLISH).trim(); 749 if (contentString.equalsIgnoreCase("nocaption")) { 750 pushedTable = false; 751 } 752 break; 753 case ATTRIBUTE: 754 contentString = content.toString().toLowerCase(Locale.ENGLISH); 755 if (inHeading && (contentString.equals("name") || contentString.equals("id"))) { 756 inAnchor = true; 757 } else { 758 inAnchor = false; 759 } 760 attributeStack.add(new Pair<String, String>(contentString, null)); 761 break; 762 case ATTRIBUTE_CONTENT: 763 contentString = content.toString().toLowerCase(Locale.ENGLISH); 764 if (inAnchor) { 765 heading.addId(content.toString()); 766 } 767 Pair<String, String> lastAttribute = attributeStack.peek(); 768 if (lastAttribute.getSecond() != null) { 769 System.out.println(lineCount + "\tDouble Attribute: " + contentString + ", peek=" + lastAttribute); 770 } else { 771 lastAttribute.setSecond(contentString); 772 } 773 break; 774 case ELEMENT: 775 contentString = content.toString().toLowerCase(Locale.ENGLISH); 776 if (inPop) { 777 ElementLine peek; 778 while (true) { 779 peek = elementStack.peek(); 780 if (!NOPOP.contains(peek.element)) { 781 break; 782 } 783 elementStack.pop(); 784 } 785 if (!peek.element.equals(contentString)) { 786 System.out.println(lineCount 787 + "\tCouldn't pop: " + contentString 788 + ", " + showElementStack(elementStack)); 789 } else { 790 elementStack.pop(); 791 } 792 } else { 793 // check that the first element following a table is a caption 794 if (pushedTable && !"caption".equals(contentString)) { 795 captionWarnings.add(lineCount); 796 } 797 elementStack.push(new ElementLine(contentString, lineCount)); 798 pushedTable = checkCaption && "table".equals(contentString); 799 if (!checkCaption && "h3".equals(contentString)) { // h3 around Summary in standard format 800 checkCaption = true; 801 } 802 } 803 if (verbose != Verbosity.none) { 804 LOG.write(parser.getLineCount() + "\telem:\t" + showElementStack(elementStack) + "\n"); 805 LOG.flush(); 806 } 807 if (FORCEBREAK.contains(contentString)) { 808 buffer.append("\n"); 809 } 810 if (DO_CONTENTS.contains(contentString)) { 811 if (inPop) { 812 if (inHeading) { 813 inHeading = false; 814 if (heading.isContents()) { 815 haveContents = true; 816 } else if (haveContents) { 817 headingInfoList.add(parser.getLineCount(), heading); 818 lastHeading = heading; 819 } 820 heading = new HeadingInfo(); 821 } 822 } else { 823 heading.setLevel(contentString, lastHeading); 824 inHeading = true; 825 } 826 } 827 break; 828 case ELEMENT_START: 829 inPop = false; 830 break; 831 case ELEMENT_END: 832 if (verbose == Verbosity.all && !attributeStack.isEmpty()) { 833 LOG.write(parser.getLineCount() + "\tattr:\t" + showAttributeStack(attributeStack) + System.lineSeparator()); 834 LOG.flush(); 835 } 836 attributeStack.clear(); 837 inPop = false; 838 break; 839 case ELEMENT_POP: 840 inPop = true; 841 break; 842 case ELEMENT_CONTENT: 843 contentString = wsMatcher.reset(content).replaceAll(" ").replace(" ", " "); 844 buffer.append(contentString.indexOf('&') >= 0 845 ? TransliteratorUtilities.fromHTML.transform(contentString) 846 : contentString); 847 if (inHeading) { 848 heading.addText(contentString); 849 } 850 break; 851 case DONE: 852 break main; 853 default: 854 break; // skip everything else. 855 } 856 } 857 858 // get DTD elements 859 Matcher m = ELEMENT_ATTLIST.matcher(buffer); 860 while (m.find()) { 861 dtdItems.put(fileName, m.group(2), m.group(), true); 862 //System.out.println(fileName + "\t" + m.group()); 863 } 864 BreakIterator sentenceBreak = BreakIterator.getSentenceInstance(ULocale.ENGLISH); 865 String bufferString = normalizeWhitespace(buffer); 866 sentenceBreak.setText(bufferString); 867 int last = 0; 868 while (true) { 869 int pos = sentenceBreak.next(); 870 if (pos == BreakIterator.DONE) { 871 break; 872 } 873 String sentence = bufferString.substring(last, pos).trim(); 874 last = pos; 875 if (sentence.isEmpty()) { 876 continue; 877 } 878 hashedSentences.add(sentence, 1); 879 sentences.add(sentence); 880 } 881 if (!captionWarnings.isEmpty()) { 882 System.out.println("WARNING: Missing <caption> on the following lines: " 883 + "\n " + Joiner.on(", ").join(captionWarnings) 884 + "\n\tTo fix, add <caption> after the <table>, such as:" 885 + "\n\t\t<table>" 886 + "\n\t\t\t<caption>Private Use Codes in CLDR</a></caption>" 887 + "\n\tOften the sentence just before the <table> can be made into the caption." 888 + "\n\tThe next time you run this program, you’ll be prompted with double-links." 889 + "\n\tIf it really shouldn't have a caption, add <!-- nocaption --> after the <table> instead."); 890 } 891 int fatalCount = headingInfoList.showErrors(); 892 totalFatalCount += fatalCount; 893 totalErrorCount += headingInfoList.totalErrorCount(); 894 if (fatalCount == 0) { 895 headingInfoList.listContents(); 896 } else { 897 System.out.println("\nFix fatal errors in " + fileCanonical + " before contents can be generated"); 898 } 899 } 900 showAttributeStack(Stack<Pair<String, String>> attributeStack)901 private String showAttributeStack(Stack<Pair<String, String>> attributeStack) { 902 StringBuilder result = new StringBuilder(); 903 for (Pair<String, String> s : attributeStack) { 904 result.append("[@"); 905 result.append(s.getFirst()); 906 final String second = s.getSecond(); 907 if (second != null) { 908 result.append("='"); 909 result.append(second); 910 result.append("'"); 911 } 912 result.append("]"); 913 } 914 return result.toString(); 915 } 916 showElementStack(Stack<ElementLine> elementStack)917 private String showElementStack(Stack<ElementLine> elementStack) { 918 StringBuilder result = new StringBuilder(); 919 for (ElementLine s : elementStack) { 920 result.append('/').append(s); 921 } 922 return result.toString(); 923 } 924 925 /** 926 * Return string after collapsing multiple whitespace containing '\\n' to '\\n', 927 * and otherwise 'space'. 928 * @param input 929 * @return 930 */ normalizeWhitespace(CharSequence input)931 private String normalizeWhitespace(CharSequence input) { 932 Matcher m = WHITESPACE.matcher(input); 933 StringBuilder buffer = new StringBuilder(); 934 int last = 0; 935 while (m.find()) { 936 int start = m.start(); 937 buffer.append(input.subSequence(last, start)); 938 last = m.end(); 939 String whiteString = m.group(); 940 if (whiteString.indexOf('\n') >= 0) { 941 buffer.append('\n'); 942 } else { 943 buffer.append(' '); 944 } 945 } 946 buffer.append(input.subSequence(last, input.length())); 947 return buffer.toString().trim(); 948 } 949 getCount(String sentence)950 public long getCount(String sentence) { 951 return hashedSentences.getCount(sentence); 952 } 953 954 @Override iterator()955 public Iterator<String> iterator() { 956 return sentences.iterator(); 957 } 958 } 959 } 960