• Home
  • Line#
  • Scopes#
  • Navigate#
  • Raw
  • Download
1 package org.unicode.cldr.tool;
2 
3 import java.io.File;
4 import java.io.FileReader;
5 import java.io.IOException;
6 import java.io.OutputStreamWriter;
7 import java.io.Reader;
8 import java.io.Writer;
9 import java.util.ArrayList;
10 import java.util.Arrays;
11 import java.util.Collections;
12 import java.util.EnumSet;
13 import java.util.HashMap;
14 import java.util.HashSet;
15 import java.util.Iterator;
16 import java.util.LinkedHashMap;
17 import java.util.LinkedHashSet;
18 import java.util.List;
19 import java.util.Locale;
20 import java.util.Map;
21 import java.util.Map.Entry;
22 import java.util.Set;
23 import java.util.Stack;
24 import java.util.TreeMap;
25 import java.util.TreeSet;
26 import java.util.regex.Matcher;
27 import java.util.regex.Pattern;
28 
29 import org.unicode.cldr.tool.Option.Options;
30 import org.unicode.cldr.util.CLDRPaths;
31 import org.unicode.cldr.util.CLDRTool;
32 import org.unicode.cldr.util.ChainedMap;
33 import org.unicode.cldr.util.ChainedMap.M4;
34 import org.unicode.cldr.util.CldrUtility;
35 import org.unicode.cldr.util.Counter;
36 import org.unicode.cldr.util.DtdData;
37 import org.unicode.cldr.util.DtdData.Attribute;
38 import org.unicode.cldr.util.DtdData.Element;
39 import org.unicode.cldr.util.DtdType;
40 import org.unicode.cldr.util.Pair;
41 import org.unicode.cldr.util.PathUtilities;
42 import org.unicode.cldr.util.PatternCache;
43 import org.unicode.cldr.util.RegexUtilities;
44 import org.unicode.cldr.util.SimpleHtmlParser;
45 import org.unicode.cldr.util.SimpleHtmlParser.Type;
46 import org.unicode.cldr.util.TransliteratorUtilities;
47 
48 import com.google.common.base.Joiner;
49 import com.google.common.collect.ImmutableSet;
50 import com.ibm.icu.impl.Relation;
51 import com.ibm.icu.impl.Row.R4;
52 import com.ibm.icu.text.BreakIterator;
53 import com.ibm.icu.util.Output;
54 import com.ibm.icu.util.ULocale;
55 
56 @CLDRTool(alias = "checkhtmlfiles", description = "Look for errors in CLDR documentation tools", hidden = "Used for CLDR process")
57 public class CheckHtmlFiles {
58 
59     static final Set<String> NOPOP = new HashSet<>(Arrays.asList("br", "img", "link", "meta", "!doctype", "hr", "col", "input"));
60 
61     static final EnumSet<Type> SUPPRESS = EnumSet.of(
62         Type.ELEMENT, Type.ELEMENT_START, Type.ELEMENT_END, Type.ELEMENT_POP,
63         Type.ATTRIBUTE, Type.ATTRIBUTE_CONTENT);
64 
65     final static Options myOptions = new Options();
66     final static Writer LOG = new OutputStreamWriter(System.out);
67     static Pattern WELLFORMED_HEADER = PatternCache.get("\\s*(\\d+(\\.\\d+)*\\s*).*");
68     static Pattern SUPPRESS_SECTION_NUMBER = PatternCache.get(
69         "(Annex [A-Z]: .*)" +
70             "|(Appendix [A-Z].*)" +
71             "|(.*Migrati(on|ng).*)" +
72             "|Step \\d+.*" +
73             "|Example \\d+.*" +
74             "|D\\d+\\.\\s.*" +
75             "|References" +
76             "|Acknowledge?ments" +
77             "|Rights to .*Images" +
78             "|Modifications" +
79             "|(Revision \\d+\\.?)");
80     static Pattern SUPPRESS_REVISION = PatternCache.get("Revision \\d+\\.?");
81     static Pattern SPACES = PatternCache.get("\\s+");
82 
83     enum MyOptions {
84 //        old(".*", Settings.OTHER_WORKSPACE_DIRECTORY + "cldr-archive/cldr-22.1/specs/ldml/tr35\\.html", "source data (regex)"),
85         target(".*", CLDRPaths.BASE_DIRECTORY + "specs" + File.separator + "ldml" + File.separator +
86             "tr35(-.*)?\\.html", "target data (regex); ucd for Unicode docs; "
87                 + "for others use the format -t ${workspace_loc}/unicode-draft/reports/tr51/tr51.html"), verbose(".*", "none", "verbose debugging messages"),
88 //        contents(".*", CLDRPaths.BASE_DIRECTORY + "specs/ldml/tr35(-.*)?\\.html", "generate contents"),
89         // /cldr-archive
90         ;
91 
92         // boilerplate
93         final Option option;
94 
MyOptions(String argumentPattern, String defaultArgument, String helpText)95         MyOptions(String argumentPattern, String defaultArgument, String helpText) {
96             option = myOptions.add(this, argumentPattern, defaultArgument, helpText);
97         }
98     }
99 
/** How much diagnostic output the tool produces. */
enum Verbosity {
    none,
    element,
    all;

    /**
     * Converts an option value into a verbosity level, ignoring case.
     *
     * @param input the option text, or {@code null}
     * @return {@code none} for {@code null}, otherwise the constant with the lower-cased name
     * @throws IllegalArgumentException if the lower-cased text names no constant
     */
    static Verbosity of(String input) {
        if (input == null) {
            return none;
        }
        final String normalized = input.toLowerCase(Locale.ROOT);
        return valueOf(normalized);
    }
}
106 
107     static Verbosity verbose;
108     static boolean doContents;
109     static boolean isLdml;
110 
main(String[] args)111     public static void main(String[] args) throws IOException {
112         System.out.println("First do a replace of <a\\s+name=\"([^\"]*)\"\\s*> by <a name=\"$1\" href=\"#$1\">");
113         System.out.println("Then check for all links with no anchors: <a([^>]*)></a>");
114         System.out.println("Then check for all links that don't start with name or href <a (?!href|name)");
115 
116         myOptions.parse(MyOptions.target, args, true);
117         verbose = Verbosity.of(MyOptions.verbose.option.getValue());
118 
119         String targetString = MyOptions.target.option.getValue();
120         if (targetString.contains("ldml")) {
121             isLdml = true;
122         }
123         if (targetString.equalsIgnoreCase("ucd")) {
124             targetString = CLDRPaths.BASE_DIRECTORY + "../unicode-draft/reports/tr(\\d+)/tr(\\d+).html";
125         } else if (targetString.equalsIgnoreCase("security")) {
126             targetString = CLDRPaths.BASE_DIRECTORY + "../unicode-draft/reports/tr(3[69])/tr(3[69]).html";
127         }
128         Data target = new Data().getSentences(targetString);
129         if (target.count == 0) {
130             throw new IllegalArgumentException("No files matched with " + targetString);
131         }
132 
133         if (isLdml) {
134             checkForDtd(target);
135         }
136 
137         System.out.println("*TOTAL COUNTS*  files:" + target.count + ", fatal errors:" + target.totalFatalCount + ", nonfatal errors:"
138             + target.totalErrorCount);
139         if (target.totalFatalCount > 0 || target.totalErrorCount > 0) {
140             System.exit(1); // give an error status
141         }
142 
143         System.exit(0);
144 
145 //        Data source = new Data().getSentences(MyOptions.old.option.getValue());
146 //        String file = MyOptions.target.option.getValue();
147 //
148 //        Data target = new Data().getSentences(file);
149 //
150 //        int missingCount = 0, extraCount = 0;
151 //        int line = 0;
152 //        for (String sentence : source) {
153 //            ++line;
154 //            long sourceCount = source.getCount(sentence);
155 //            long targetCount = target.getCount(sentence);
156 //            if (targetCount == 0) {
157 //                System.out.println(line + "\tMISSING:\t" + sourceCount + "≠" + targetCount + "\t" + sentence);
158 //                ++missingCount;
159 //            }
160 //        }
161 //        line = 0;
162 //        for (String sentence : target) {
163 //            ++line;
164 //            long sourceCount = source.getCount(sentence);
165 //            long targetCount = target.getCount(sentence);
166 //            if (sourceCount == 0) {
167 //                System.out.println(line + "\tEXTRA:\t" + targetCount + "≠" + sourceCount + "\t" + sentence);
168 //                ++extraCount;
169 //            }
170 //        }
171 //        System.out.println("Missing:\t" + missingCount);
172 //        System.out.println("Extra:\t" + extraCount);
173     }
174 
175     private static final Set<String> SKIP_ATTR = ImmutableSet.of("draft", "alt", "references", "cldrVersion", "unicodeVersion");
176 
checkForDtd(Data target)177     private static void checkForDtd(Data target) {
178         M4<String, String, DtdType, Boolean> typeToElements = ChainedMap.of(new TreeMap(), new TreeMap(), new TreeMap(), Boolean.class);
179         for (DtdType type : DtdType.values()) {
180             if (type == DtdType.ldmlICU) continue;
181             DtdData dtdData = DtdData.getInstance(type);
182             Set<Element> elements = dtdData.getElements();
183             for (Element element : elements) {
184                 if (element.isDeprecated()
185                     || element.equals(dtdData.PCDATA)
186                     || element.equals(dtdData.ANY)) continue;
187                 typeToElements.put(element.name, element.toDtdString(), type, Boolean.TRUE);
188             }
189             Set<Attribute> attributes = dtdData.getAttributes();
190             for (Attribute attribute : attributes) {
191                 if (attribute.isDeprecated()) continue;
192                 if (SKIP_ATTR.contains(attribute.name)) {
193                     continue;
194                 }
195                 typeToElements.put(attribute.element.name, attribute.appendDtdString(new StringBuilder()).toString(), type, Boolean.TRUE);
196             }
197         }
198         final Map<String, String> skeletonToInFile = new HashMap<>();
199         Relation<String, String> extra = new Relation(new TreeMap(), TreeSet.class);
200         for (R4<String, String, String, Boolean> elementItem : target.dtdItems.rows()) {
201             String file = elementItem.get0();
202             String element = elementItem.get1();
203             String item = elementItem.get2();
204             extra.put(element, item);
205             skeletonToInFile.put(item.replace(" ", ""), item);
206         }
207         ChainedMap.M4<String, String, DtdType, Comparison> status = ChainedMap.of(new TreeMap(), new TreeMap(), new TreeMap(), Comparison.class);
208         for (R4<String, String, DtdType, Boolean> entry : typeToElements.rows()) {
209             final String element = entry.get0();
210             final String key = entry.get1();
211             final DtdType dtdType = entry.get2();
212             String spaceless = key.replace(" ", "");
213             String realKey = skeletonToInFile.get(spaceless);
214             if (realKey == null) {
215                 status.put(element, key, dtdType, Comparison.missing);
216             } else {
217                 boolean found = extra.remove(element, realKey);
218                 if (!found) {
219                     status.put(element, key, dtdType, Comparison.no_rem);
220                 }
221             }
222         }
223         for (Entry<String, String> extraItem : extra.entrySet()) {
224             status.put(extraItem.getKey(), extraItem.getValue(), DtdType.ldmlICU, Comparison.extra);
225         }
226         TreeSet<String> reverse = new TreeSet<>(Collections.reverseOrder());
227         for (Entry<String, Map<String, Map<DtdType, Comparison>>> entry1 : status) {
228             String element = entry1.getKey();
229             reverse.clear();
230             final Map<String, Map<DtdType, Comparison>> itemToDtdTypeToComparison = entry1.getValue();
231             reverse.addAll(itemToDtdTypeToComparison.keySet());
232             for (String item : reverse) {
233                 Map<DtdType, Comparison> typeToComparison = itemToDtdTypeToComparison.get(item);
234                 for (Entry<DtdType, Comparison> entry2 : typeToComparison.entrySet()) {
235                     System.out.println(element
236                         + "\t" + entry2.getValue()
237                         + "\t" + CldrUtility.ifSame(entry2.getKey(), DtdType.ldmlICU, "")
238                         + "\t" + item);
239                 }
240             }
241         }
242     }
243 
/** Outcome of comparing one DTD declaration with the items found in the checked files. */
enum Comparison {
    /** Declared in a DTD but not found in the files. */
    missing,
    /** Found in the files but matching no DTD declaration. */
    extra,
    /** Matched by its space-stripped form, but could not be removed from the file's items for that element. */
    no_rem
}
247 
248     static Pattern WHITESPACE = PatternCache.get("[\\s]+");
249     static Pattern BADSECTION = PatternCache.get("^\\s*(\\d+\\s*)?Section\\s*\\d+\\s*[-:]\\s*");
250 
251     static final Set<String> FORCEBREAK = new HashSet<>(Arrays.asList(
252         "table", "div", "blockquote",
253         "p", "br", "td", "th", "h1", "h2", "h3", "h4", "h5", "li"));
254 
255 //    enum ContentsElements {h1, h2, h3, h4, h5, caption}
256 
257     static final Set<String> DO_CONTENTS = new HashSet<>(Arrays.asList(
258         "h1", "h2", "h3", "h4", "h5", "caption"));
259 
260     static class Levels implements Comparable<Levels> {
261         final int[] levels = new int[10];
262         final int h2_start;
263 
Levels(int h2_start)264         public Levels(int h2_start) {
265             levels[0] = h2_start; // special adjustment of starting header level
266             this.h2_start = h2_start;
267         }
268 
Levels()269         public Levels() {
270             this(0);
271         }
272 
273         /**
274          * h2 = level 0, h3 is level 1, etc.
275          * @param level
276          * @return
277          */
next(int level, Output<Boolean> missingLevel)278         Levels next(int level, Output<Boolean> missingLevel) {
279             level -= 2; // h2 = level 0
280             missingLevel.value = false;
281             if (levels[0] < h2_start) {
282                 missingLevel.value = true;
283             }
284             for (int i = 1; i < level; ++i) {
285                 if (levels[i] == 0) {
286                     missingLevel.value = true;
287                 }
288             }
289             levels[level]++;
290             for (int i = level + 1; i < levels.length; ++i) {
291                 levels[i] = 0;
292             }
293             return this;
294         }
295 
getDepth()296         public int getDepth() {
297             for (int i = 0;; ++i) {
298                 int level = levels[i];
299                 if (level == 0) {
300                     return i - 1;
301                 }
302             }
303         }
304 
305         @Override
toString()306         public String toString() {
307             StringBuilder b = new StringBuilder();
308             for (int i = 0;; ++i) {
309                 int level = levels[i];
310                 if (level == 0) {
311                     return b.toString();
312                 }
313                 if (b.length() != 0) {
314                     b.append('.');
315                 }
316                 b.append(level);
317             }
318         }
319 
parse(String group)320         public static Levels parse(String group) {
321             Levels result = new Levels();
322             int currentLevel = 0;
323             for (int i = 0; i < group.length(); ++i) {
324                 char ch = group.charAt(i);
325                 if (ch == '.') {
326                     currentLevel++;
327                 } else {
328                     ch -= '0';
329                     if (ch > '9') {
330                         break;
331                     }
332                     result.levels[currentLevel] = result.levels[currentLevel] * 10 + ch;
333                 }
334             }
335             return result;
336         }
337 
338         @Override
compareTo(Levels other)339         public int compareTo(Levels other) {
340             for (int i = 0; i < levels.length; ++i) {
341                 if (levels[i] != other.levels[i]) {
342                     return levels[i] < other.levels[i] ? -1 : 1;
343                 }
344             }
345             return 0;
346         }
347 
set(Levels other)348         public void set(Levels other) {
349             for (int i = 0; i < levels.length; ++i) {
350                 levels[i] = other.levels[i];
351             }
352         }
353     }
354 
355     static class HeadingInfo {
356         private Levels levels = new Levels();
357         private String text = "";
358         private Set<String> ids = new LinkedHashSet<>();
359         private boolean suppressSection;
360         private boolean isHeader;
361 
362         // temporary
363         private int level;
364 
setLevel(String headingLabel, HeadingInfo lastHeading)365         public void setLevel(String headingLabel, HeadingInfo lastHeading) {
366             isHeader = !headingLabel.equals("caption");
367             level = isHeader ? headingLabel.charAt(1) - '0' : lastHeading.level;
368         }
369 
370         @Override
toString()371         public String toString() {
372             //   <h3><a name="Identity_Elements" href="#Identity_Elements">5.3 Identity Elements</a></h3>
373             String id = ids.isEmpty() ? "NOID" : ids.iterator().next();
374             String result = "<" + getLabel()
375                 + "<a name=\"" + id + "\" href=\"#" + id + "\">"
376                 + (!isHeader ? "" : suppressSection ? "" : levels + " ")
377                 + TransliteratorUtilities.toHTML.transform(text)
378                 + "</a>";
379             if (ids.size() > 1) {
380                 boolean first = true;
381                 for (String id2 : ids) {
382                     if (first) {
383                         first = false;
384                     } else {
385                         result += "<a name=\"" + id2 + "\"></a>";
386                     }
387                 }
388             }
389             return result + "</" + getLabel();
390         }
391 
getLabel()392         public String getLabel() {
393             return isHeader ? "h" + level + ">" : "caption>";
394         }
395 
toHeader()396         public String toHeader() {
397             String id = ids.iterator().next();
398             return ("<li>"
399                 + (!isHeader ? (text.contains("Table") || text.contains("Figure") ? "" : "Table: ") : suppressSection ? "" : levels + " ")
400                 + "<a href=\"#" + id + "\">"
401                 + TransliteratorUtilities.toHTML.transform(text)
402                 + "</a>");
403         }
404 
addText(String toAppend)405         public void addText(String toAppend) {
406             String temp = TransliteratorUtilities.fromHTML.transform(toAppend);
407             if (text.isEmpty()) {
408                 if (temp.startsWith(" ")) {
409                     text = temp.substring(1);
410                 } else {
411                     text = temp;
412                 }
413             } else {
414                 text += temp;
415             }
416             text = SPACES.matcher(text).replaceAll(" "); // clean up all spaces; make more efficient later
417             // used to trim, but we need to retain space between elements. So only trim the start, and later, the end
418         }
419 
isContents()420         public boolean isContents() {
421             return text.toString().startsWith("Contents");
422         }
423 
addId(String id)424         void addId(String id) {
425             this.ids.add(id);
426         }
427 
setLevels(int line, Levels levels, Set<String> errors)428         public void setLevels(int line, Levels levels, Set<String> errors) {
429             this.levels.set(levels);
430             String error = "";
431             if (badSectionMatcher.reset(text).find()) {
432                 text = text.substring(badSectionMatcher.end());
433                 error += "Extra 'Section...' at start; ";
434             }
435             if (isHeader) {
436                 if (!headerMatcher.reset(text).matches()) {
437                     if (!SUPPRESS_SECTION_NUMBER.matcher(text).matches()) {
438                         error += "Missing section numbers; ";
439                     }
440                 } else {
441                     text = text.substring(headerMatcher.end(1));
442                     if (text.startsWith(".")) {
443                         text = text.substring(1).trim();
444                         error += "Extra . at start; ";
445                     }
446                     Levels parsedLevels = Levels.parse(headerMatcher.group(1));
447                     if (levels.compareTo(parsedLevels) != 0) {
448                         error += "Section numbers mismatch, was " + parsedLevels + "; ";
449                     }
450                 }
451             }
452             if (ids.isEmpty()) {
453                 addId(text.toString().trim().replaceAll("[^A-Za-z0-9]+", "_"));
454                 error += "Missing double link";
455             }
456             if (!error.isEmpty()) {
457                 errors.add(this + "\t<!-- " + line + ": " + error + " -->");
458             }
459             suppressSection = SUPPRESS_SECTION_NUMBER.matcher(text).matches();
460         }
461 
addIds(Counter<String> idCounter)462         public void addIds(Counter<String> idCounter) {
463             for (String id : ids) {
464                 idCounter.add(id, 1);
465             }
466         }
467 
fixText()468         public HeadingInfo fixText() {
469             if (text.endsWith(" ")) {
470                 text = text.substring(0, text.length() - 1);
471             }
472             return this;
473         }
474     }
475 
476     static Matcher headerMatcher = WELLFORMED_HEADER.matcher("");
477     static Matcher badSectionMatcher = BADSECTION.matcher("");
478 
479     static class HeadingInfoList {
480         private static final long serialVersionUID = -6722150173224993960L;
481         Levels lastBuildLevel;
482         private Set<String> errors = new LinkedHashSet<>();
483         Output<Boolean> missingLevel = new Output<>(false);
484         private String fileName;
485         ArrayList<HeadingInfo> list = new ArrayList<>();
486 
HeadingInfoList(String fileName, int h2_START)487         public HeadingInfoList(String fileName, int h2_START) {
488             this.fileName = fileName;
489             lastBuildLevel = new Levels(h2_START);
490         }
491 
add(int line, HeadingInfo h)492         public boolean add(int line, HeadingInfo h) {
493             h.fixText();
494             if (SUPPRESS_REVISION.matcher(h.text).matches()) {
495                 return false;
496             }
497             if (h.isHeader) {
498                 h.setLevels(line, lastBuildLevel.next(h.level, missingLevel), errors);
499             } else {
500                 h.setLevels(line, lastBuildLevel, errors);
501             }
502             if (missingLevel.value) {
503                 errors.add("FATAL: Missing Level in: " + h);
504             }
505             return list.add(h);
506         }
507 
508         static final String PAD = "\t";
509 
listContents()510         public void listContents() {
511 
512             System.out.print("\n\t\t<!-- START Generated TOC: CheckHtmlFiles -->");
513             Counter<String> idCounter = new Counter<>();
514 
515             int lastLevel = new Levels().getDepth();
516             String pad = PAD;
517             int ulCount = 0;
518             int liCount = 0;
519             for (HeadingInfo h : list) {
520                 h.addIds(idCounter);
521                 final int depth = h.levels.getDepth() + (h.isHeader ? 0 : 1);
522                 int levelDiff = depth - lastLevel;
523                 lastLevel = depth;
524                 if (levelDiff > 0) {
525                     System.out.println();
526                     for (int i = 0; i < levelDiff; ++i) {
527                         pad += PAD;
528                         System.out.println(pad + "<ul class=\"toc\">");
529                         ++ulCount;
530                     }
531                     pad += PAD;
532                 } else if (levelDiff < 0) {
533                     System.out.println("</li>");
534                     --liCount;
535                     for (int i = 0; i > levelDiff; --i) {
536                         pad = pad.substring(PAD.length());
537                         System.out.println(pad + "</ul>");
538                         --ulCount;
539                         pad = pad.substring(PAD.length());
540                         System.out.println(pad + "</li>");
541                         --liCount;
542                     }
543                 } else {
544                     System.out.println("</li>");
545                     --liCount;
546                 }
547 
548                 System.out.print(pad + h.toHeader());
549                 ++liCount;
550 
551                 //              <li>1.1 <a href="#Conformance">Conformance</a></li>
552 
553                 //                <ul class="toc">
554                 //                <li>1 <a href="#Introduction">Introduction</a>
555                 //                  <ul class="toc">
556                 //                    <li>1.1 <a href="#Conformance">Conformance</a>
557                 //                    </li>
558                 //                    ...
559                 //                  </ul>
560                 //                </li>
561             }
562 
563             // finish up and make sure we are balances
564 
565             int levelDiff = -lastLevel;
566             System.out.println("</li>");
567             --liCount;
568             for (int i = 0; i > levelDiff; --i) {
569                 pad = pad.substring(PAD.length());
570                 System.out.println(pad + "</ul>");
571                 --ulCount;
572                 pad = pad.substring(PAD.length());
573                 System.out.println(pad + "</li>");
574                 --liCount;
575             }
576             pad = pad.substring(PAD.length());
577             System.out.println(pad + "</ul>");
578             System.out.println(pad + "<!-- END Generated TOC: CheckHtmlFiles -->");
579             --ulCount;
580             if (liCount != 0 || ulCount != 0) {
581                 throw new IllegalArgumentException("Mismatched counts in generated contents, li:" + liCount + ", ul:" + ulCount);
582             }
583             for (String id : idCounter) {
584                 long count = idCounter.get(id);
585                 if (count != 1) {
586                     errors.add("FATAL: Non-Unique ID: " + id);
587                 }
588             }
589         }
590 
591         /**
592          * Prints out errs
593          * @return fatal err count
594          */
showErrors()595         public int showErrors() {
596             int fatalCount = 0;
597             if (!errors.isEmpty()) {
598                 System.out.println("\n*ERRORS*\n");
599                 for (String error : errors) {
600                     if (error.startsWith("FATAL:")) {
601                         System.out.println(fileName + "\t" + error);
602                         fatalCount++;
603                     }
604                 }
605                 if (fatalCount == 0) {
606                     for (String error : errors) {
607                         System.out.println(fileName + "\t" + error);
608                     }
609                 }
610             }
611             if (this.list.size() == 0) {
612                 System.out.println("No header items (eg <h2>) captured.");
613                 fatalCount = 1;
614             }
615             return fatalCount;
616         }
617 
618         /**
619          * @return total number of errors
620          */
totalErrorCount()621         public int totalErrorCount() {
622             return errors.size();
623         }
624     }
625 
626     static class ElementLine {
627         final String element;
628         final int line;
629 
ElementLine(String element, int line)630         public ElementLine(String element, int line) {
631             super();
632             this.element = element;
633             this.line = line;
634         }
635 
636         @Override
toString()637         public String toString() {
638             return element + '[' + line + ']';
639         }
640     }
641 
642     static class Data implements Iterable<String> {
643         private static final Pattern ELEMENT_ATTLIST = Pattern.compile("<!(ELEMENT|ATTLIST)\\s+(\\S+)[^>]*>");
644         List<String> sentences = new ArrayList<>();
645         M4<String, String, String, Boolean> dtdItems = ChainedMap.of(
646             new LinkedHashMap<String, Object>(),
647             new TreeMap<String, Object>(),
648             new TreeMap<String, Object>(), Boolean.class);
649         Counter<String> hashedSentences = new Counter<>();
650         int count = 0;
651         int totalErrorCount = 0;
652         int totalFatalCount = 0;
653 
getSentences(String fileRegex)654         public Data getSentences(String fileRegex) throws IOException {
655             String base;
656             String regex;
657             try {
658                 int firstParen = fileRegex.indexOf('(');
659                 if (firstParen < 0) {
660                     firstParen = fileRegex.length();
661                 }
662                 int lastSlash = fileRegex.lastIndexOf(File.separatorChar, firstParen);
663                 base = fileRegex.substring(0, lastSlash);
664                 regex = fileRegex.substring(lastSlash + 1);
665             } catch (Exception e) {
666                 throw new IllegalArgumentException("Target file must be in special format. " +
667                     "Up to the first path part /.../ containing a paragraph is constant, and the rest is a regex.");
668             }
669 
670             //File sourceFile = new File(fileRegex);
671             File sourceDirectory = new File(base);
672             if (!sourceDirectory.exists()) {
673                 throw new IllegalArgumentException("Can't find " + sourceDirectory);
674             }
675             String canonicalBase = PathUtilities.getNormalizedPathString(sourceDirectory);
676             String FileRegex = canonicalBase + File.separator + regex;
677             FileRegex = FileRegex.replace("\\", "\\\\");
678             FileRegex = FileRegex.replace("\\\\.", "\\.");
679             Matcher m = PatternCache.get(FileRegex).matcher("");
680             System.out.println("Matcher: " + m);
681 
682             return getSentences(sourceDirectory, m);
683         }
684 
getSentences(File sourceDirectory, Matcher m)685         public Data getSentences(File sourceDirectory, Matcher m) throws IOException {
686             //System.out.println("Processing:\t" + sourceDirectory);
687             for (File file : sourceDirectory.listFiles()) {
688                 if (file.isDirectory()) {
689                     getSentences(file, m);
690                     continue;
691                 }
692                 String fileString = file.getCanonicalFile().toString();
693                 File fileCanonical = new File(fileString);
694                 if (!m.reset(fileString).matches()) {
695                     if (verbose == Verbosity.all) {
696                         System.out.println("Skipping: " + RegexUtilities.showMismatch(m, fileString)
697                             + "\t" + sourceDirectory);
698                     }
699                     continue;
700                 }
701 
702                 System.out.println("\nProcessing:\t" + sourceDirectory + File.separator + fileString);
703 
704                 int H2_START = fileString.contains("tr18") ? -1 : 0;
705                 try (Reader in = new FileReader(fileCanonical)) {
706                     parseFile(fileCanonical, H2_START, in);
707                 }
708             }
709             return this;
710         }
711 
712         SimpleHtmlParser parser = new SimpleHtmlParser();
713 
parseFile(File fileCanonical, int H2_START, Reader in)714         public void parseFile(File fileCanonical, int H2_START, Reader in) throws IOException {
715             Matcher wsMatcher = WHITESPACE.matcher("");
716             ++count;
717             // SimpleHtmlParser parser = new SimpleHtmlParser().setReader(in);
718             parser.setReader(in);
719             StringBuilder buffer = new StringBuilder();
720             StringBuilder content = new StringBuilder();
721             HeadingInfo heading = new HeadingInfo();
722             final String fileName = fileCanonical.getName();
723             HeadingInfoList headingInfoList = new HeadingInfoList(fileName, H2_START);
724             Stack<ElementLine> elementStack = new Stack<>();
725             Stack<Pair<String, String>> attributeStack = new Stack<>();
726             String contentString;
727             boolean inHeading = false;
728             boolean inPop = false;
729             boolean inAnchor = false;
730             boolean haveContents = false;
731             HeadingInfo lastHeading = null;
732             // for detecting missing captions
733             boolean pushedTable = false;
734             boolean checkCaption = false;
735             List<Integer> captionWarnings = new ArrayList<>();
736 
737             main: while (true) {
738                 int lineCount = parser.getLineCount();
739                 Type x = parser.next(content);
740                 if (verbose == Verbosity.all && !SUPPRESS.contains(x)) {
741                     LOG.write(parser.getLineCount() + "\t" + x + ":\t«" + content + "»");
742                     //SimpleHtmlParser.writeResult(x, content, LOG);
743                     LOG.write("\n");
744                     LOG.flush();
745                 }
746                 switch (x) {
747                 case QUOTE:
748                     contentString = content.toString().toLowerCase(Locale.ENGLISH).trim();
749                     if (contentString.equalsIgnoreCase("nocaption")) {
750                         pushedTable = false;
751                     }
752                     break;
753                 case ATTRIBUTE:
754                     contentString = content.toString().toLowerCase(Locale.ENGLISH);
755                     if (inHeading && (contentString.equals("name") || contentString.equals("id"))) {
756                         inAnchor = true;
757                     } else {
758                         inAnchor = false;
759                     }
760                     attributeStack.add(new Pair<String, String>(contentString, null));
761                     break;
762                 case ATTRIBUTE_CONTENT:
763                     contentString = content.toString().toLowerCase(Locale.ENGLISH);
764                     if (inAnchor) {
765                         heading.addId(content.toString());
766                     }
767                     Pair<String, String> lastAttribute = attributeStack.peek();
768                     if (lastAttribute.getSecond() != null) {
769                         System.out.println(lineCount + "\tDouble Attribute: " + contentString + ", peek=" + lastAttribute);
770                     } else {
771                         lastAttribute.setSecond(contentString);
772                     }
773                     break;
774                 case ELEMENT:
775                     contentString = content.toString().toLowerCase(Locale.ENGLISH);
776                     if (inPop) {
777                         ElementLine peek;
778                         while (true) {
779                             peek = elementStack.peek();
780                             if (!NOPOP.contains(peek.element)) {
781                                 break;
782                             }
783                             elementStack.pop();
784                         }
785                         if (!peek.element.equals(contentString)) {
786                             System.out.println(lineCount
787                                 + "\tCouldn't pop: " + contentString
788                                 + ", " + showElementStack(elementStack));
789                         } else {
790                             elementStack.pop();
791                         }
792                     } else {
793                         // check that the first element following a table is a caption
794                         if (pushedTable && !"caption".equals(contentString)) {
795                             captionWarnings.add(lineCount);
796                         }
797                         elementStack.push(new ElementLine(contentString, lineCount));
798                         pushedTable = checkCaption && "table".equals(contentString);
799                         if (!checkCaption && "h3".equals(contentString)) { // h3 around Summary in standard format
800                             checkCaption = true;
801                         }
802                     }
803                     if (verbose != Verbosity.none) {
804                         LOG.write(parser.getLineCount() + "\telem:\t" + showElementStack(elementStack) + "\n");
805                         LOG.flush();
806                     }
807                     if (FORCEBREAK.contains(contentString)) {
808                         buffer.append("\n");
809                     }
810                     if (DO_CONTENTS.contains(contentString)) {
811                         if (inPop) {
812                             if (inHeading) {
813                                 inHeading = false;
814                                 if (heading.isContents()) {
815                                     haveContents = true;
816                                 } else if (haveContents) {
817                                     headingInfoList.add(parser.getLineCount(), heading);
818                                     lastHeading = heading;
819                                 }
820                                 heading = new HeadingInfo();
821                             }
822                         } else {
823                             heading.setLevel(contentString, lastHeading);
824                             inHeading = true;
825                         }
826                     }
827                     break;
828                 case ELEMENT_START:
829                     inPop = false;
830                     break;
831                 case ELEMENT_END:
832                     if (verbose == Verbosity.all && !attributeStack.isEmpty()) {
833                         LOG.write(parser.getLineCount() + "\tattr:\t" + showAttributeStack(attributeStack) + System.lineSeparator());
834                         LOG.flush();
835                     }
836                     attributeStack.clear();
837                     inPop = false;
838                     break;
839                 case ELEMENT_POP:
840                     inPop = true;
841                     break;
842                 case ELEMENT_CONTENT:
843                     contentString = wsMatcher.reset(content).replaceAll(" ").replace("&nbsp;", " ");
844                     buffer.append(contentString.indexOf('&') >= 0
845                         ? TransliteratorUtilities.fromHTML.transform(contentString)
846                         : contentString);
847                     if (inHeading) {
848                         heading.addText(contentString);
849                     }
850                     break;
851                 case DONE:
852                     break main;
853                 default:
854                     break; // skip everything else.
855                 }
856             }
857 
858             // get DTD elements
859             Matcher m = ELEMENT_ATTLIST.matcher(buffer);
860             while (m.find()) {
861                 dtdItems.put(fileName, m.group(2), m.group(), true);
862                 //System.out.println(fileName + "\t" + m.group());
863             }
864             BreakIterator sentenceBreak = BreakIterator.getSentenceInstance(ULocale.ENGLISH);
865             String bufferString = normalizeWhitespace(buffer);
866             sentenceBreak.setText(bufferString);
867             int last = 0;
868             while (true) {
869                 int pos = sentenceBreak.next();
870                 if (pos == BreakIterator.DONE) {
871                     break;
872                 }
873                 String sentence = bufferString.substring(last, pos).trim();
874                 last = pos;
875                 if (sentence.isEmpty()) {
876                     continue;
877                 }
878                 hashedSentences.add(sentence, 1);
879                 sentences.add(sentence);
880             }
881             if (!captionWarnings.isEmpty()) {
882                 System.out.println("WARNING: Missing <caption> on the following lines: "
883                     + "\n    " + Joiner.on(", ").join(captionWarnings)
884                     + "\n\tTo fix, add <caption> after the <table>, such as:"
885                     + "\n\t\t<table>"
886                     + "\n\t\t\t<caption>Private Use Codes in CLDR</a></caption>"
887                     + "\n\tOften the sentence just before the <table> can be made into the caption."
888                     + "\n\tThe next time you run this program, you’ll be prompted with double-links."
889                     + "\n\tIf it really shouldn't have a caption, add <!-- nocaption --> after the <table> instead.");
890             }
891             int fatalCount = headingInfoList.showErrors();
892             totalFatalCount += fatalCount;
893             totalErrorCount += headingInfoList.totalErrorCount();
894             if (fatalCount == 0) {
895                 headingInfoList.listContents();
896             } else {
897                 System.out.println("\nFix fatal errors in " + fileCanonical + " before contents can be generated");
898             }
899         }
900 
showAttributeStack(Stack<Pair<String, String>> attributeStack)901         private String showAttributeStack(Stack<Pair<String, String>> attributeStack) {
902             StringBuilder result = new StringBuilder();
903             for (Pair<String, String> s : attributeStack) {
904                 result.append("[@");
905                 result.append(s.getFirst());
906                 final String second = s.getSecond();
907                 if (second != null) {
908                     result.append("='");
909                     result.append(second);
910                     result.append("'");
911                 }
912                 result.append("]");
913             }
914             return result.toString();
915         }
916 
showElementStack(Stack<ElementLine> elementStack)917         private String showElementStack(Stack<ElementLine> elementStack) {
918             StringBuilder result = new StringBuilder();
919             for (ElementLine s : elementStack) {
920                 result.append('/').append(s);
921             }
922             return result.toString();
923         }
924 
925         /**
926          * Return string after collapsing multiple whitespace containing '\\n' to '\\n',
927          * and otherwise 'space'.
928          * @param input
929          * @return
930          */
normalizeWhitespace(CharSequence input)931         private String normalizeWhitespace(CharSequence input) {
932             Matcher m = WHITESPACE.matcher(input);
933             StringBuilder buffer = new StringBuilder();
934             int last = 0;
935             while (m.find()) {
936                 int start = m.start();
937                 buffer.append(input.subSequence(last, start));
938                 last = m.end();
939                 String whiteString = m.group();
940                 if (whiteString.indexOf('\n') >= 0) {
941                     buffer.append('\n');
942                 } else {
943                     buffer.append(' ');
944                 }
945             }
946             buffer.append(input.subSequence(last, input.length()));
947             return buffer.toString().trim();
948         }
949 
getCount(String sentence)950         public long getCount(String sentence) {
951             return hashedSentences.getCount(sentence);
952         }
953 
954         @Override
iterator()955         public Iterator<String> iterator() {
956             return sentences.iterator();
957         }
958     }
959 }
960