• Home
  • Line#
  • Scopes#
  • Navigate#
  • Raw
  • Download
1 /*
2  * Copyright 2016 Google Inc.
3  *
4  * Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except
5  * in compliance with the License. You may obtain a copy of the License at
6  *
7  *     http://www.apache.org/licenses/LICENSE-2.0
8  *
9  * Unless required by applicable law or agreed to in writing, software distributed under the License
10  * is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express
11  * or implied. See the License for the specific language governing permissions and limitations under
12  * the License.
13  */
14 
15 package com.google.googlejavaformat.java.javadoc;
16 
17 import static com.google.common.base.Preconditions.checkArgument;
18 import static com.google.common.base.Preconditions.checkNotNull;
19 import static com.google.common.base.Verify.verify;
20 import static com.google.common.collect.Iterators.peekingIterator;
21 import static com.google.googlejavaformat.java.javadoc.Token.Type.BEGIN_JAVADOC;
22 import static com.google.googlejavaformat.java.javadoc.Token.Type.BLOCKQUOTE_CLOSE_TAG;
23 import static com.google.googlejavaformat.java.javadoc.Token.Type.BLOCKQUOTE_OPEN_TAG;
24 import static com.google.googlejavaformat.java.javadoc.Token.Type.BR_TAG;
25 import static com.google.googlejavaformat.java.javadoc.Token.Type.CODE_CLOSE_TAG;
26 import static com.google.googlejavaformat.java.javadoc.Token.Type.CODE_OPEN_TAG;
27 import static com.google.googlejavaformat.java.javadoc.Token.Type.END_JAVADOC;
28 import static com.google.googlejavaformat.java.javadoc.Token.Type.FOOTER_JAVADOC_TAG_START;
29 import static com.google.googlejavaformat.java.javadoc.Token.Type.FORCED_NEWLINE;
30 import static com.google.googlejavaformat.java.javadoc.Token.Type.HEADER_CLOSE_TAG;
31 import static com.google.googlejavaformat.java.javadoc.Token.Type.HEADER_OPEN_TAG;
32 import static com.google.googlejavaformat.java.javadoc.Token.Type.HTML_COMMENT;
33 import static com.google.googlejavaformat.java.javadoc.Token.Type.LIST_CLOSE_TAG;
34 import static com.google.googlejavaformat.java.javadoc.Token.Type.LIST_ITEM_CLOSE_TAG;
35 import static com.google.googlejavaformat.java.javadoc.Token.Type.LIST_ITEM_OPEN_TAG;
36 import static com.google.googlejavaformat.java.javadoc.Token.Type.LIST_OPEN_TAG;
37 import static com.google.googlejavaformat.java.javadoc.Token.Type.LITERAL;
38 import static com.google.googlejavaformat.java.javadoc.Token.Type.MOE_BEGIN_STRIP_COMMENT;
39 import static com.google.googlejavaformat.java.javadoc.Token.Type.MOE_END_STRIP_COMMENT;
40 import static com.google.googlejavaformat.java.javadoc.Token.Type.OPTIONAL_LINE_BREAK;
41 import static com.google.googlejavaformat.java.javadoc.Token.Type.PARAGRAPH_CLOSE_TAG;
42 import static com.google.googlejavaformat.java.javadoc.Token.Type.PARAGRAPH_OPEN_TAG;
43 import static com.google.googlejavaformat.java.javadoc.Token.Type.PRE_CLOSE_TAG;
44 import static com.google.googlejavaformat.java.javadoc.Token.Type.PRE_OPEN_TAG;
45 import static com.google.googlejavaformat.java.javadoc.Token.Type.TABLE_CLOSE_TAG;
46 import static com.google.googlejavaformat.java.javadoc.Token.Type.TABLE_OPEN_TAG;
47 import static com.google.googlejavaformat.java.javadoc.Token.Type.WHITESPACE;
48 import static java.lang.String.format;
49 import static java.util.regex.Pattern.CASE_INSENSITIVE;
50 import static java.util.regex.Pattern.DOTALL;
51 import static java.util.regex.Pattern.compile;
52 
53 import com.google.common.base.CharMatcher;
54 import com.google.common.collect.ImmutableList;
55 import com.google.common.collect.PeekingIterator;
56 import com.google.googlejavaformat.java.javadoc.Token.Type;
57 import java.util.ArrayDeque;
58 import java.util.ArrayList;
59 import java.util.Deque;
60 import java.util.List;
61 import java.util.regex.Pattern;
62 
63 /** Lexer for the Javadoc formatter. */
64 final class JavadocLexer {
65   /** Takes a Javadoc comment, including ∕✱✱ and ✱∕, and returns tokens, including ∕✱✱ and ✱∕. */
lex(String input)66   static ImmutableList<Token> lex(String input) throws LexException {
67     /*
68      * TODO(cpovirk): In theory, we should interpret Unicode escapes (yet output them in their
69      * original form). This would mean mean everything from an encoded ∕✱✱ to an encoded <pre> tag,
70      * so we'll probably never bother.
71      */
72     input = stripJavadocBeginAndEnd(input);
73     input = normalizeLineEndings(input);
74     return new JavadocLexer(new CharStream(input)).generateTokens();
75   }
76 
77   /** The lexer crashes on windows line endings, so for now just normalize to `\n`. */
78   // TODO(cushon): use the platform line separator for output
normalizeLineEndings(String input)79   private static String normalizeLineEndings(String input) {
80     return NON_UNIX_LINE_ENDING.matcher(input).replaceAll("\n");
81   }
82 
83   private static final Pattern NON_UNIX_LINE_ENDING = Pattern.compile("\r\n?");
84 
stripJavadocBeginAndEnd(String input)85   private static String stripJavadocBeginAndEnd(String input) {
86     /*
87      * We do this ahead of time so that the main part of the lexer need not say things like
88      * "(?![*]/)" to avoid accidentally swallowing ✱∕ when consuming a newline.
89      */
90     checkArgument(input.startsWith("/**"), "Missing /**: %s", input);
91     checkArgument(input.endsWith("*/") && input.length() > 4, "Missing */: %s", input);
92     return input.substring("/**".length(), input.length() - "*/".length());
93   }
94 
// Character source for the comment body; token text is consumed from it piece by piece.
95   private final CharStream input;
// Nesting depth of inline tags ("{@..."); plain braces are only counted while inside one.
96   private final NestingCounter braceDepth = new NestingCounter();
// Nesting depth of <pre> tags.
97   private final NestingCounter preDepth = new NestingCounter();
// Nesting depth of <code> tags.
98   private final NestingCounter codeDepth = new NestingCounter();
// Nesting depth of <table> tags.
99   private final NestingCounter tableDepth = new NestingCounter();
// Cleared when a newline is consumed and set once a non-whitespace token follows it; footer tags
// are only recognized while this is false.
100   private boolean somethingSinceNewline;
101 
/** Creates a lexer over {@code input}, which must not be null. */
private JavadocLexer(CharStream input) {
  checkNotNull(input);
  this.input = input;
}
105 
generateTokens()106   private ImmutableList<Token> generateTokens() throws LexException {
107     ImmutableList.Builder<Token> tokens = ImmutableList.builder();
108 
109     Token token = new Token(BEGIN_JAVADOC, "/**");
110     tokens.add(token);
111 
112     while (!input.isExhausted()) {
113       token = readToken();
114       tokens.add(token);
115     }
116 
117     checkMatchingTags();
118 
119     token = new Token(END_JAVADOC, "*/");
120     tokens.add(token);
121 
122     ImmutableList<Token> result = tokens.build();
123     result = joinAdjacentLiteralsAndAdjacentWhitespace(result);
124     result = inferParagraphTags(result);
125     result = optionalizeSpacesAfterLinks(result);
126     result = deindentPreCodeBlocks(result);
127     return result;
128   }
129 
readToken()130   private Token readToken() throws LexException {
131     Type type = consumeToken();
132     String value = input.readAndResetRecorded();
133     return new Token(type, value);
134   }
135 
consumeToken()136   private Type consumeToken() throws LexException {
137     boolean preserveExistingFormatting = preserveExistingFormatting();
138 
139     if (input.tryConsumeRegex(NEWLINE_PATTERN)) {
140       somethingSinceNewline = false;
141       return preserveExistingFormatting ? FORCED_NEWLINE : WHITESPACE;
142     } else if (input.tryConsume(" ") || input.tryConsume("\t")) {
143       // TODO(cpovirk): How about weird whitespace chars? Ideally we'd distinguish breaking vs. not.
144       // Returning LITERAL here prevent us from breaking a <pre> line. For more info, see LITERAL.
145       return preserveExistingFormatting ? LITERAL : WHITESPACE;
146     }
147 
148     /*
149      * TODO(cpovirk): Maybe try to detect things like "{@code\n@GwtCompatible}" that aren't intended
150      * as tags. But in the most likely case, in which that happens inside <pre>{@code, we have no
151      * great options for fixing it.
152      * https://github.com/google/google-java-format/issues/7#issuecomment-197383926
153      */
154     if (!somethingSinceNewline && input.tryConsumeRegex(FOOTER_TAG_PATTERN)) {
155       checkMatchingTags();
156       somethingSinceNewline = true;
157       return FOOTER_JAVADOC_TAG_START;
158     }
159     somethingSinceNewline = true;
160 
161     if (input.tryConsumeRegex(INLINE_TAG_OPEN_PATTERN)) {
162       braceDepth.increment();
163       return LITERAL;
164     } else if (input.tryConsume("{")) {
165       braceDepth.incrementIfPositive();
166       return LITERAL;
167     } else if (input.tryConsume("}")) {
168       braceDepth.decrementIfPositive();
169       return LITERAL;
170     }
171 
172     // Inside an inline tag, don't do any HTML interpretation.
173     if (braceDepth.isPositive()) {
174       verify(input.tryConsumeRegex(LITERAL_PATTERN));
175       return LITERAL;
176     }
177 
178     if (input.tryConsumeRegex(PRE_OPEN_PATTERN)) {
179       preDepth.increment();
180       return preserveExistingFormatting ? LITERAL : PRE_OPEN_TAG;
181     } else if (input.tryConsumeRegex(PRE_CLOSE_PATTERN)) {
182       preDepth.decrementIfPositive();
183       return preserveExistingFormatting() ? LITERAL : PRE_CLOSE_TAG;
184     }
185 
186     if (input.tryConsumeRegex(CODE_OPEN_PATTERN)) {
187       codeDepth.increment();
188       return preserveExistingFormatting ? LITERAL : CODE_OPEN_TAG;
189     } else if (input.tryConsumeRegex(CODE_CLOSE_PATTERN)) {
190       codeDepth.decrementIfPositive();
191       return preserveExistingFormatting() ? LITERAL : CODE_CLOSE_TAG;
192     }
193 
194     if (input.tryConsumeRegex(TABLE_OPEN_PATTERN)) {
195       tableDepth.increment();
196       return preserveExistingFormatting ? LITERAL : TABLE_OPEN_TAG;
197     } else if (input.tryConsumeRegex(TABLE_CLOSE_PATTERN)) {
198       tableDepth.decrementIfPositive();
199       return preserveExistingFormatting() ? LITERAL : TABLE_CLOSE_TAG;
200     }
201 
202     if (preserveExistingFormatting) {
203       verify(input.tryConsumeRegex(LITERAL_PATTERN));
204       return LITERAL;
205     }
206 
207     if (input.tryConsumeRegex(PARAGRAPH_OPEN_PATTERN)) {
208       return PARAGRAPH_OPEN_TAG;
209     } else if (input.tryConsumeRegex(PARAGRAPH_CLOSE_PATTERN)) {
210       return PARAGRAPH_CLOSE_TAG;
211     } else if (input.tryConsumeRegex(LIST_OPEN_PATTERN)) {
212       return LIST_OPEN_TAG;
213     } else if (input.tryConsumeRegex(LIST_CLOSE_PATTERN)) {
214       return LIST_CLOSE_TAG;
215     } else if (input.tryConsumeRegex(LIST_ITEM_OPEN_PATTERN)) {
216       return LIST_ITEM_OPEN_TAG;
217     } else if (input.tryConsumeRegex(LIST_ITEM_CLOSE_PATTERN)) {
218       return LIST_ITEM_CLOSE_TAG;
219     } else if (input.tryConsumeRegex(BLOCKQUOTE_OPEN_PATTERN)) {
220       return BLOCKQUOTE_OPEN_TAG;
221     } else if (input.tryConsumeRegex(BLOCKQUOTE_CLOSE_PATTERN)) {
222       return BLOCKQUOTE_CLOSE_TAG;
223     } else if (input.tryConsumeRegex(HEADER_OPEN_PATTERN)) {
224       return HEADER_OPEN_TAG;
225     } else if (input.tryConsumeRegex(HEADER_CLOSE_PATTERN)) {
226       return HEADER_CLOSE_TAG;
227     } else if (input.tryConsumeRegex(BR_PATTERN)) {
228       return BR_TAG;
229     } else if (input.tryConsumeRegex(MOE_BEGIN_STRIP_COMMENT_PATTERN)) {
230       return MOE_BEGIN_STRIP_COMMENT;
231     } else if (input.tryConsumeRegex(MOE_END_STRIP_COMMENT_PATTERN)) {
232       return MOE_END_STRIP_COMMENT;
233     } else if (input.tryConsumeRegex(HTML_COMMENT_PATTERN)) {
234       return HTML_COMMENT;
235     } else if (input.tryConsumeRegex(LITERAL_PATTERN)) {
236       return LITERAL;
237     }
238     throw new AssertionError();
239   }
240 
preserveExistingFormatting()241   private boolean preserveExistingFormatting() {
242     return preDepth.isPositive() || tableDepth.isPositive() || codeDepth.isPositive();
243   }
244 
checkMatchingTags()245   private void checkMatchingTags() throws LexException {
246     if (braceDepth.isPositive()
247         || preDepth.isPositive()
248         || tableDepth.isPositive()
249         || codeDepth.isPositive()) {
250       throw new LexException();
251     }
252   }
253 
254   /**
255    * Join together adjacent literal tokens, and join together adjacent whitespace tokens.
256    *
257    * <p>For literal tokens, this means something like {@code ["<b>", "foo", "</b>"] =>
258    * ["<b>foo</b>"]}. See {@link #LITERAL_PATTERN} for discussion of why those tokens are separate
259    * to begin with.
260    *
261    * <p>Whitespace tokens are treated analogously. We don't really "want" to join whitespace tokens,
262    * but in the course of joining literals, we incidentally join whitespace, too. We do take
263    * advantage of the joining later on: It simplifies {@link #inferParagraphTags}.
264    *
265    * <p>Note that we do <i>not</i> merge a literal token and a whitespace token together.
266    */
joinAdjacentLiteralsAndAdjacentWhitespace(List<Token> input)267   private static ImmutableList<Token> joinAdjacentLiteralsAndAdjacentWhitespace(List<Token> input) {
268     /*
269      * Note: Our final token is always END_JAVADOC. This saves us some trouble:
270      *
271      * - Our inner while() doesn't need a hasNext() check.
272      *
273      * - We don't need to check for leftover accumulated literals after we exit the loop.
274      */
275     ImmutableList.Builder<Token> output = ImmutableList.builder();
276     StringBuilder accumulated = new StringBuilder();
277 
278     for (PeekingIterator<Token> tokens = peekingIterator(input.iterator()); tokens.hasNext(); ) {
279       if (tokens.peek().getType() == LITERAL) {
280         accumulated.append(tokens.peek().getValue());
281         tokens.next();
282         continue;
283       }
284 
285       /*
286        * IF we have accumulated some literals to join together (say, "foo<b>bar</b>"), and IF we'll
287        * next see whitespace followed by a "@" literal, we need to join that together with the
288        * previous literals. That ensures that we won't insert a line break before the "@," turning
289        * it into a tag.
290        */
291 
292       if (accumulated.length() == 0) {
293         output.add(tokens.peek());
294         tokens.next();
295         continue;
296       }
297 
298       StringBuilder seenWhitespace = new StringBuilder();
299       while (tokens.peek().getType() == WHITESPACE) {
300         seenWhitespace.append(tokens.next().getValue());
301       }
302 
303       if (tokens.peek().getType() == LITERAL && tokens.peek().getValue().startsWith("@")) {
304         // OK, we're in the case described above.
305         accumulated.append(" ");
306         accumulated.append(tokens.peek().getValue());
307         tokens.next();
308         continue;
309       }
310 
311       output.add(new Token(LITERAL, accumulated.toString()));
312       accumulated.setLength(0);
313 
314       if (seenWhitespace.length() > 0) {
315         output.add(new Token(WHITESPACE, seenWhitespace.toString()));
316       }
317 
318       // We have another token coming, possibly of type OTHER. Leave it for the next iteration.
319     }
320 
321     /*
322      * TODO(cpovirk): Another case where we could try to join tokens is if a line ends with
323      * /[^ -]-/, as in "non-\nblocking."
324      */
325     return output.build();
326   }
327 
328   /**
329    * Where the input has two consecutive line breaks between literals, insert a {@code <p>} tag
330    * between the literals.
331    *
332    * <p>This method must be called after {@link #joinAdjacentLiteralsAndAdjacentWhitespace}, as it
333    * assumes that adjacent whitespace tokens have already been joined.
334    */
inferParagraphTags(List<Token> input)335   private static ImmutableList<Token> inferParagraphTags(List<Token> input) {
336     ImmutableList.Builder<Token> output = ImmutableList.builder();
337 
338     for (PeekingIterator<Token> tokens = peekingIterator(input.iterator()); tokens.hasNext(); ) {
339       if (tokens.peek().getType() == LITERAL) {
340         output.add(tokens.next());
341 
342         if (tokens.peek().getType() == WHITESPACE
343             && hasMultipleNewlines(tokens.peek().getValue())) {
344           output.add(tokens.next());
345 
346           if (tokens.peek().getType() == LITERAL) {
347             output.add(new Token(PARAGRAPH_OPEN_TAG, "<p>"));
348           }
349         }
350       } else {
351         // TODO(cpovirk): Or just `continue` from the <p> case and move this out of the `else`?
352         output.add(tokens.next());
353       }
354     }
355 
356     return output.build();
357 
358     /*
359      * Note: We do not want to insert <p> tags inside <pre>. Fortunately, the formatter gets that
360      * right without special effort on our part. The reason: Line breaks inside a <pre> section are
361      * of type FORCED_NEWLINE rather than WHITESPACE.
362      */
363   }
364 
365   /**
366    * Replaces whitespace after a {@code href=...>} token with an "optional link break." This allows
367    * us to output either {@code <a href=foo>foo</a>} or {@code <a href=foo>\nfoo</a>}, depending on
368    * how much space we have left on the line.
369    *
370    * <p>This method must be called after {@link #joinAdjacentLiteralsAndAdjacentWhitespace}, as it
371    * assumes that adjacent whitespace tokens have already been joined.
372    */
optionalizeSpacesAfterLinks(List<Token> input)373   private static ImmutableList<Token> optionalizeSpacesAfterLinks(List<Token> input) {
374     ImmutableList.Builder<Token> output = ImmutableList.builder();
375 
376     for (PeekingIterator<Token> tokens = peekingIterator(input.iterator()); tokens.hasNext(); ) {
377       if (tokens.peek().getType() == LITERAL && tokens.peek().getValue().matches("^href=[^>]*>")) {
378         output.add(tokens.next());
379 
380         if (tokens.peek().getType() == WHITESPACE) {
381           output.add(new Token(OPTIONAL_LINE_BREAK, tokens.next().getValue()));
382         }
383       } else {
384         output.add(tokens.next());
385       }
386     }
387 
388     return output.build();
389 
390     /*
391      * Note: We do not want to insert <p> tags inside <pre>. Fortunately, the formatter gets that
392      * right without special effort on our part. The reason: Line breaks inside a <pre> section are
393      * of type FORCED_NEWLINE rather than WHITESPACE.
394      */
395   }
396 
397   /**
398    * Adjust indentation inside `<pre>{@code` blocks.
399    *
400    * <p>Also trim leading and trailing blank lines, and move the trailing `}` to its own line.
401    */
deindentPreCodeBlocks(List<Token> input)402   private static ImmutableList<Token> deindentPreCodeBlocks(List<Token> input) {
403     ImmutableList.Builder<Token> output = ImmutableList.builder();
404     for (PeekingIterator<Token> tokens = peekingIterator(input.iterator()); tokens.hasNext(); ) {
405       if (tokens.peek().getType() != PRE_OPEN_TAG) {
406         output.add(tokens.next());
407         continue;
408       }
409 
410       output.add(tokens.next());
411       List<Token> initialNewlines = new ArrayList<>();
412       while (tokens.hasNext() && tokens.peek().getType() == FORCED_NEWLINE) {
413         initialNewlines.add(tokens.next());
414       }
415       if (tokens.peek().getType() != LITERAL
416           || !tokens.peek().getValue().matches("[ \t]*[{]@code")) {
417         output.addAll(initialNewlines);
418         output.add(tokens.next());
419         continue;
420       }
421 
422       deindentPreCodeBlock(output, tokens);
423     }
424     return output.build();
425   }
426 
deindentPreCodeBlock( ImmutableList.Builder<Token> output, PeekingIterator<Token> tokens)427   private static void deindentPreCodeBlock(
428       ImmutableList.Builder<Token> output, PeekingIterator<Token> tokens) {
429     Deque<Token> saved = new ArrayDeque<>();
430     output.add(new Token(LITERAL, tokens.next().getValue().trim()));
431     while (tokens.hasNext() && tokens.peek().getType() != PRE_CLOSE_TAG) {
432       Token token = tokens.next();
433       saved.addLast(token);
434     }
435     while (!saved.isEmpty() && saved.peekFirst().getType() == FORCED_NEWLINE) {
436       saved.removeFirst();
437     }
438     while (!saved.isEmpty() && saved.peekLast().getType() == FORCED_NEWLINE) {
439       saved.removeLast();
440     }
441     if (saved.isEmpty()) {
442       return;
443     }
444 
445     // move the trailing `}` to its own line
446     Token last = saved.peekLast();
447     boolean trailingBrace = false;
448     if (last.getType() == LITERAL && last.getValue().endsWith("}")) {
449       saved.removeLast();
450       if (last.length() > 1) {
451         saved.addLast(
452             new Token(LITERAL, last.getValue().substring(0, last.getValue().length() - 1)));
453         saved.addLast(new Token(FORCED_NEWLINE, null));
454       }
455       trailingBrace = true;
456     }
457 
458     int trim = -1;
459     for (Token token : saved) {
460       if (token.getType() == LITERAL) {
461         int idx = CharMatcher.isNot(' ').indexIn(token.getValue());
462         if (idx != -1 && (trim == -1 || idx < trim)) {
463           trim = idx;
464         }
465       }
466     }
467 
468     output.add(new Token(FORCED_NEWLINE, "\n"));
469     for (Token token : saved) {
470       if (token.getType() == LITERAL) {
471         output.add(
472             new Token(
473                 LITERAL,
474                 trim > 0 && token.length() > trim
475                     ? token.getValue().substring(trim)
476                     : token.getValue()));
477       } else {
478         output.add(token);
479       }
480     }
481 
482     if (trailingBrace) {
483       output.add(new Token(LITERAL, "}"));
484     } else {
485       output.add(new Token(FORCED_NEWLINE, "\n"));
486     }
487   }
488 
489   private static final CharMatcher NEWLINE = CharMatcher.is('\n');
490 
hasMultipleNewlines(String s)491   private static boolean hasMultipleNewlines(String s) {
492     return NEWLINE.countIn(s) > 1;
493   }
494 
495   /*
496    * This also eats any trailing whitespace. We would be smart enough to ignore that, anyway --
497    * except in the case of <pre>/<table>, inside which we otherwise leave whitespace intact.
498    *
499    * We'd remove the trailing whitespace later on (in JavaCommentsHelper.rewrite), but I feel safer
500    * stripping it now: It otherwise might confuse our line-length count, which we use for wrapping.
501    */
// Besides the line break itself, this consumes the next line's indentation, an optional leading
// asterisk, and at most one blank after that asterisk.
502   private static final Pattern NEWLINE_PATTERN = compile("^[ \t]*\n[ \t]*[*]?[ \t]?");
503 
504   // We ensure elsewhere that we match this only at the beginning of a line.
505   // Only match tags that start with a lowercase letter, to avoid false matches on unescaped
506   // annotations inside code blocks.
507   // Match "@param <T>" specially in case the <T> is a <P> or other HTML tag we treat specially.
508   private static final Pattern FOOTER_TAG_PATTERN = compile("^@(param\\s+<\\w+>|[a-z]\\w*)");
// The marker strings in the next two patterns are split across two string literals; the compiled
// regexes are unaffected by the split.
509   private static final Pattern MOE_BEGIN_STRIP_COMMENT_PATTERN =
510       compile("^<!--\\s*M" + "OE:begin_intracomment_strip\\s*-->");
511   private static final Pattern MOE_END_STRIP_COMMENT_PATTERN =
512       compile("^<!--\\s*M" + "OE:end_intracomment_strip\\s*-->");
513   private static final Pattern HTML_COMMENT_PATTERN = fullCommentPattern();
// The tag patterns below are case-insensitive and anchored at the start of the input they are
// matched against; see openTagPattern and closeTagPattern.
514   private static final Pattern PRE_OPEN_PATTERN = openTagPattern("pre");
515   private static final Pattern PRE_CLOSE_PATTERN = closeTagPattern("pre");
516   private static final Pattern CODE_OPEN_PATTERN = openTagPattern("code");
517   private static final Pattern CODE_CLOSE_PATTERN = closeTagPattern("code");
518   private static final Pattern TABLE_OPEN_PATTERN = openTagPattern("table");
519   private static final Pattern TABLE_CLOSE_PATTERN = closeTagPattern("table");
520   private static final Pattern LIST_OPEN_PATTERN = openTagPattern("ul|ol|dl");
521   private static final Pattern LIST_CLOSE_PATTERN = closeTagPattern("ul|ol|dl");
522   private static final Pattern LIST_ITEM_OPEN_PATTERN = openTagPattern("li|dt|dd");
523   private static final Pattern LIST_ITEM_CLOSE_PATTERN = closeTagPattern("li|dt|dd");
524   private static final Pattern HEADER_OPEN_PATTERN = openTagPattern("h[1-6]");
525   private static final Pattern HEADER_CLOSE_PATTERN = closeTagPattern("h[1-6]");
526   private static final Pattern PARAGRAPH_OPEN_PATTERN = openTagPattern("p");
527   private static final Pattern PARAGRAPH_CLOSE_PATTERN = closeTagPattern("p");
528   private static final Pattern BLOCKQUOTE_OPEN_PATTERN = openTagPattern("blockquote");
529   private static final Pattern BLOCKQUOTE_CLOSE_PATTERN = closeTagPattern("blockquote");
// There is no close pattern for br; it is matched with the open-tag form only.
530   private static final Pattern BR_PATTERN = openTagPattern("br");
// Matches an opening brace followed by "@" and the (possibly empty) run of word characters after it.
531   private static final Pattern INLINE_TAG_OPEN_PATTERN = compile("^[{]@\\w*");
532   /*
533    * We exclude < so that we don't swallow following HTML tags. This lets us fix up "foo<p>" (~400
534    * hits in Google-internal code). We will join unnecessarily split "words" (like "foo<b>bar</b>")
535    * in a later step. There's a similar story for braces. I'm not sure I actually need to exclude @
536    * or *. TODO(cpovirk): Try removing them.
537    *
538    * Thanks to the "rejoin" step in joinAdjacentLiteralsAndAdjacentWhitespace(), we could get away
539    * with matching only one character here. That would eliminate the need for the regex entirely.
540    * That might be faster or slower than what we do now.
541    */
// The leading "." together with DOTALL means that this always consumes at least one character,
// whatever that character is.
542   private static final Pattern LITERAL_PATTERN = compile("^.[^ \t\n@<{}*]*", DOTALL);
543 
fullCommentPattern()544   private static Pattern fullCommentPattern() {
545     return compile("^<!--.*?-->", DOTALL);
546   }
547 
openTagPattern(String namePattern)548   private static Pattern openTagPattern(String namePattern) {
549     return compile(format("^<(?:%s)\\b[^>]*>", namePattern), CASE_INSENSITIVE);
550   }
551 
closeTagPattern(String namePattern)552   private static Pattern closeTagPattern(String namePattern) {
553     return compile(format("^</(?:%s)\\b[^>]*>", namePattern), CASE_INSENSITIVE);
554   }
555 
/**
 * Thrown by the lexer when an inline tag or a pre, table, or code tag is still open at a point
 * where all of them must be closed. Carries no message.
 */
556   static class LexException extends Exception {}
557 }
558