1 /* 2 * Copyright 2016 Google Inc. 3 * 4 * Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except 5 * in compliance with the License. You may obtain a copy of the License at 6 * 7 * http://www.apache.org/licenses/LICENSE-2.0 8 * 9 * Unless required by applicable law or agreed to in writing, software distributed under the License 10 * is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express 11 * or implied. See the License for the specific language governing permissions and limitations under 12 * the License. 13 */ 14 15 package com.google.googlejavaformat.java.javadoc; 16 17 import static com.google.common.base.Preconditions.checkArgument; 18 import static com.google.common.base.Preconditions.checkNotNull; 19 import static com.google.common.base.Verify.verify; 20 import static com.google.common.collect.Iterators.peekingIterator; 21 import static com.google.googlejavaformat.java.javadoc.Token.Type.BEGIN_JAVADOC; 22 import static com.google.googlejavaformat.java.javadoc.Token.Type.BLOCKQUOTE_CLOSE_TAG; 23 import static com.google.googlejavaformat.java.javadoc.Token.Type.BLOCKQUOTE_OPEN_TAG; 24 import static com.google.googlejavaformat.java.javadoc.Token.Type.BR_TAG; 25 import static com.google.googlejavaformat.java.javadoc.Token.Type.CODE_CLOSE_TAG; 26 import static com.google.googlejavaformat.java.javadoc.Token.Type.CODE_OPEN_TAG; 27 import static com.google.googlejavaformat.java.javadoc.Token.Type.END_JAVADOC; 28 import static com.google.googlejavaformat.java.javadoc.Token.Type.FOOTER_JAVADOC_TAG_START; 29 import static com.google.googlejavaformat.java.javadoc.Token.Type.FORCED_NEWLINE; 30 import static com.google.googlejavaformat.java.javadoc.Token.Type.HEADER_CLOSE_TAG; 31 import static com.google.googlejavaformat.java.javadoc.Token.Type.HEADER_OPEN_TAG; 32 import static com.google.googlejavaformat.java.javadoc.Token.Type.HTML_COMMENT; 33 import static com.google.googlejavaformat.java.javadoc.Token.Type.LIST_CLOSE_TAG; 34 import static com.google.googlejavaformat.java.javadoc.Token.Type.LIST_ITEM_CLOSE_TAG; 35 import static com.google.googlejavaformat.java.javadoc.Token.Type.LIST_ITEM_OPEN_TAG; 36 import static com.google.googlejavaformat.java.javadoc.Token.Type.LIST_OPEN_TAG; 37 import static com.google.googlejavaformat.java.javadoc.Token.Type.LITERAL; 38 import static com.google.googlejavaformat.java.javadoc.Token.Type.MOE_BEGIN_STRIP_COMMENT; 39 import static com.google.googlejavaformat.java.javadoc.Token.Type.MOE_END_STRIP_COMMENT; 40 import static com.google.googlejavaformat.java.javadoc.Token.Type.OPTIONAL_LINE_BREAK; 41 import static com.google.googlejavaformat.java.javadoc.Token.Type.PARAGRAPH_CLOSE_TAG; 42 import static com.google.googlejavaformat.java.javadoc.Token.Type.PARAGRAPH_OPEN_TAG; 43 import static com.google.googlejavaformat.java.javadoc.Token.Type.PRE_CLOSE_TAG; 44 import static com.google.googlejavaformat.java.javadoc.Token.Type.PRE_OPEN_TAG; 45 import static com.google.googlejavaformat.java.javadoc.Token.Type.TABLE_CLOSE_TAG; 46 import static com.google.googlejavaformat.java.javadoc.Token.Type.TABLE_OPEN_TAG; 47 import static com.google.googlejavaformat.java.javadoc.Token.Type.WHITESPACE; 48 import static java.lang.String.format; 49 import static java.util.regex.Pattern.CASE_INSENSITIVE; 50 import static java.util.regex.Pattern.DOTALL; 51 import static java.util.regex.Pattern.compile; 52 53 import com.google.common.base.CharMatcher; 54 import com.google.common.collect.ImmutableList; 55 import com.google.common.collect.PeekingIterator; 56 import com.google.googlejavaformat.java.javadoc.Token.Type; 57 import java.util.ArrayDeque; 58 import java.util.ArrayList; 59 import java.util.Deque; 60 import java.util.List; 61 import java.util.regex.Pattern; 62 63 /** Lexer for the Javadoc formatter. */ 64 final class JavadocLexer { 65 /** Takes a Javadoc comment, including ∕✱✱ and ✱∕, and returns tokens, including ∕✱✱ and ✱∕. */ lex(String input)66 static ImmutableList<Token> lex(String input) throws LexException { 67 /* 68 * TODO(cpovirk): In theory, we should interpret Unicode escapes (yet output them in their 69 * original form). This would mean mean everything from an encoded ∕✱✱ to an encoded <pre> tag, 70 * so we'll probably never bother. 71 */ 72 input = stripJavadocBeginAndEnd(input); 73 input = normalizeLineEndings(input); 74 return new JavadocLexer(new CharStream(input)).generateTokens(); 75 } 76 77 /** The lexer crashes on windows line endings, so for now just normalize to `\n`. */ 78 // TODO(cushon): use the platform line separator for output normalizeLineEndings(String input)79 private static String normalizeLineEndings(String input) { 80 return NON_UNIX_LINE_ENDING.matcher(input).replaceAll("\n"); 81 } 82 83 private static final Pattern NON_UNIX_LINE_ENDING = Pattern.compile("\r\n?"); 84 stripJavadocBeginAndEnd(String input)85 private static String stripJavadocBeginAndEnd(String input) { 86 /* 87 * We do this ahead of time so that the main part of the lexer need not say things like 88 * "(?![*]/)" to avoid accidentally swallowing ✱∕ when consuming a newline. 89 */ 90 checkArgument(input.startsWith("/**"), "Missing /**: %s", input); 91 checkArgument(input.endsWith("*/") && input.length() > 4, "Missing */: %s", input); 92 return input.substring("/**".length(), input.length() - "*/".length()); 93 } 94 95 private final CharStream input; 96 private final NestingCounter braceDepth = new NestingCounter(); 97 private final NestingCounter preDepth = new NestingCounter(); 98 private final NestingCounter codeDepth = new NestingCounter(); 99 private final NestingCounter tableDepth = new NestingCounter(); 100 private boolean somethingSinceNewline; 101 JavadocLexer(CharStream input)102 private JavadocLexer(CharStream input) { 103 this.input = checkNotNull(input); 104 } 105 generateTokens()106 private ImmutableList<Token> generateTokens() throws LexException { 107 ImmutableList.Builder<Token> tokens = ImmutableList.builder(); 108 109 Token token = new Token(BEGIN_JAVADOC, "/**"); 110 tokens.add(token); 111 112 while (!input.isExhausted()) { 113 token = readToken(); 114 tokens.add(token); 115 } 116 117 checkMatchingTags(); 118 119 token = new Token(END_JAVADOC, "*/"); 120 tokens.add(token); 121 122 ImmutableList<Token> result = tokens.build(); 123 result = joinAdjacentLiteralsAndAdjacentWhitespace(result); 124 result = inferParagraphTags(result); 125 result = optionalizeSpacesAfterLinks(result); 126 result = deindentPreCodeBlocks(result); 127 return result; 128 } 129 readToken()130 private Token readToken() throws LexException { 131 Type type = consumeToken(); 132 String value = input.readAndResetRecorded(); 133 return new Token(type, value); 134 } 135 consumeToken()136 private Type consumeToken() throws LexException { 137 boolean preserveExistingFormatting = preserveExistingFormatting(); 138 139 if (input.tryConsumeRegex(NEWLINE_PATTERN)) { 140 somethingSinceNewline = false; 141 return preserveExistingFormatting ? FORCED_NEWLINE : WHITESPACE; 142 } else if (input.tryConsume(" ") || input.tryConsume("\t")) { 143 // TODO(cpovirk): How about weird whitespace chars? Ideally we'd distinguish breaking vs. not. 144 // Returning LITERAL here prevent us from breaking a <pre> line. For more info, see LITERAL. 145 return preserveExistingFormatting ? LITERAL : WHITESPACE; 146 } 147 148 /* 149 * TODO(cpovirk): Maybe try to detect things like "{@code\n@GwtCompatible}" that aren't intended 150 * as tags. But in the most likely case, in which that happens inside <pre>{@code, we have no 151 * great options for fixing it. 152 * https://github.com/google/google-java-format/issues/7#issuecomment-197383926 153 */ 154 if (!somethingSinceNewline && input.tryConsumeRegex(FOOTER_TAG_PATTERN)) { 155 checkMatchingTags(); 156 somethingSinceNewline = true; 157 return FOOTER_JAVADOC_TAG_START; 158 } 159 somethingSinceNewline = true; 160 161 if (input.tryConsumeRegex(INLINE_TAG_OPEN_PATTERN)) { 162 braceDepth.increment(); 163 return LITERAL; 164 } else if (input.tryConsume("{")) { 165 braceDepth.incrementIfPositive(); 166 return LITERAL; 167 } else if (input.tryConsume("}")) { 168 braceDepth.decrementIfPositive(); 169 return LITERAL; 170 } 171 172 // Inside an inline tag, don't do any HTML interpretation. 173 if (braceDepth.isPositive()) { 174 verify(input.tryConsumeRegex(LITERAL_PATTERN)); 175 return LITERAL; 176 } 177 178 if (input.tryConsumeRegex(PRE_OPEN_PATTERN)) { 179 preDepth.increment(); 180 return preserveExistingFormatting ? LITERAL : PRE_OPEN_TAG; 181 } else if (input.tryConsumeRegex(PRE_CLOSE_PATTERN)) { 182 preDepth.decrementIfPositive(); 183 return preserveExistingFormatting() ? LITERAL : PRE_CLOSE_TAG; 184 } 185 186 if (input.tryConsumeRegex(CODE_OPEN_PATTERN)) { 187 codeDepth.increment(); 188 return preserveExistingFormatting ? LITERAL : CODE_OPEN_TAG; 189 } else if (input.tryConsumeRegex(CODE_CLOSE_PATTERN)) { 190 codeDepth.decrementIfPositive(); 191 return preserveExistingFormatting() ? LITERAL : CODE_CLOSE_TAG; 192 } 193 194 if (input.tryConsumeRegex(TABLE_OPEN_PATTERN)) { 195 tableDepth.increment(); 196 return preserveExistingFormatting ? LITERAL : TABLE_OPEN_TAG; 197 } else if (input.tryConsumeRegex(TABLE_CLOSE_PATTERN)) { 198 tableDepth.decrementIfPositive(); 199 return preserveExistingFormatting() ? LITERAL : TABLE_CLOSE_TAG; 200 } 201 202 if (preserveExistingFormatting) { 203 verify(input.tryConsumeRegex(LITERAL_PATTERN)); 204 return LITERAL; 205 } 206 207 if (input.tryConsumeRegex(PARAGRAPH_OPEN_PATTERN)) { 208 return PARAGRAPH_OPEN_TAG; 209 } else if (input.tryConsumeRegex(PARAGRAPH_CLOSE_PATTERN)) { 210 return PARAGRAPH_CLOSE_TAG; 211 } else if (input.tryConsumeRegex(LIST_OPEN_PATTERN)) { 212 return LIST_OPEN_TAG; 213 } else if (input.tryConsumeRegex(LIST_CLOSE_PATTERN)) { 214 return LIST_CLOSE_TAG; 215 } else if (input.tryConsumeRegex(LIST_ITEM_OPEN_PATTERN)) { 216 return LIST_ITEM_OPEN_TAG; 217 } else if (input.tryConsumeRegex(LIST_ITEM_CLOSE_PATTERN)) { 218 return LIST_ITEM_CLOSE_TAG; 219 } else if (input.tryConsumeRegex(BLOCKQUOTE_OPEN_PATTERN)) { 220 return BLOCKQUOTE_OPEN_TAG; 221 } else if (input.tryConsumeRegex(BLOCKQUOTE_CLOSE_PATTERN)) { 222 return BLOCKQUOTE_CLOSE_TAG; 223 } else if (input.tryConsumeRegex(HEADER_OPEN_PATTERN)) { 224 return HEADER_OPEN_TAG; 225 } else if (input.tryConsumeRegex(HEADER_CLOSE_PATTERN)) { 226 return HEADER_CLOSE_TAG; 227 } else if (input.tryConsumeRegex(BR_PATTERN)) { 228 return BR_TAG; 229 } else if (input.tryConsumeRegex(MOE_BEGIN_STRIP_COMMENT_PATTERN)) { 230 return MOE_BEGIN_STRIP_COMMENT; 231 } else if (input.tryConsumeRegex(MOE_END_STRIP_COMMENT_PATTERN)) { 232 return MOE_END_STRIP_COMMENT; 233 } else if (input.tryConsumeRegex(HTML_COMMENT_PATTERN)) { 234 return HTML_COMMENT; 235 } else if (input.tryConsumeRegex(LITERAL_PATTERN)) { 236 return LITERAL; 237 } 238 throw new AssertionError(); 239 } 240 preserveExistingFormatting()241 private boolean preserveExistingFormatting() { 242 return preDepth.isPositive() || tableDepth.isPositive() || codeDepth.isPositive(); 243 } 244 checkMatchingTags()245 private void checkMatchingTags() throws LexException { 246 if (braceDepth.isPositive() 247 || preDepth.isPositive() 248 || tableDepth.isPositive() 249 || codeDepth.isPositive()) { 250 throw new LexException(); 251 } 252 } 253 254 /** 255 * Join together adjacent literal tokens, and join together adjacent whitespace tokens. 256 * 257 * <p>For literal tokens, this means something like {@code ["<b>", "foo", "</b>"] => 258 * ["<b>foo</b>"]}. See {@link #LITERAL_PATTERN} for discussion of why those tokens are separate 259 * to begin with. 260 * 261 * <p>Whitespace tokens are treated analogously. We don't really "want" to join whitespace tokens, 262 * but in the course of joining literals, we incidentally join whitespace, too. We do take 263 * advantage of the joining later on: It simplifies {@link #inferParagraphTags}. 264 * 265 * <p>Note that we do <i>not</i> merge a literal token and a whitespace token together. 266 */ joinAdjacentLiteralsAndAdjacentWhitespace(List<Token> input)267 private static ImmutableList<Token> joinAdjacentLiteralsAndAdjacentWhitespace(List<Token> input) { 268 /* 269 * Note: Our final token is always END_JAVADOC. This saves us some trouble: 270 * 271 * - Our inner while() doesn't need a hasNext() check. 272 * 273 * - We don't need to check for leftover accumulated literals after we exit the loop. 274 */ 275 ImmutableList.Builder<Token> output = ImmutableList.builder(); 276 StringBuilder accumulated = new StringBuilder(); 277 278 for (PeekingIterator<Token> tokens = peekingIterator(input.iterator()); tokens.hasNext(); ) { 279 if (tokens.peek().getType() == LITERAL) { 280 accumulated.append(tokens.peek().getValue()); 281 tokens.next(); 282 continue; 283 } 284 285 /* 286 * IF we have accumulated some literals to join together (say, "foo<b>bar</b>"), and IF we'll 287 * next see whitespace followed by a "@" literal, we need to join that together with the 288 * previous literals. That ensures that we won't insert a line break before the "@," turning 289 * it into a tag. 290 */ 291 292 if (accumulated.length() == 0) { 293 output.add(tokens.peek()); 294 tokens.next(); 295 continue; 296 } 297 298 StringBuilder seenWhitespace = new StringBuilder(); 299 while (tokens.peek().getType() == WHITESPACE) { 300 seenWhitespace.append(tokens.next().getValue()); 301 } 302 303 if (tokens.peek().getType() == LITERAL && tokens.peek().getValue().startsWith("@")) { 304 // OK, we're in the case described above. 305 accumulated.append(" "); 306 accumulated.append(tokens.peek().getValue()); 307 tokens.next(); 308 continue; 309 } 310 311 output.add(new Token(LITERAL, accumulated.toString())); 312 accumulated.setLength(0); 313 314 if (seenWhitespace.length() > 0) { 315 output.add(new Token(WHITESPACE, seenWhitespace.toString())); 316 } 317 318 // We have another token coming, possibly of type OTHER. Leave it for the next iteration. 319 } 320 321 /* 322 * TODO(cpovirk): Another case where we could try to join tokens is if a line ends with 323 * /[^ -]-/, as in "non-\nblocking." 324 */ 325 return output.build(); 326 } 327 328 /** 329 * Where the input has two consecutive line breaks between literals, insert a {@code <p>} tag 330 * between the literals. 331 * 332 * <p>This method must be called after {@link #joinAdjacentLiteralsAndAdjacentWhitespace}, as it 333 * assumes that adjacent whitespace tokens have already been joined. 334 */ inferParagraphTags(List<Token> input)335 private static ImmutableList<Token> inferParagraphTags(List<Token> input) { 336 ImmutableList.Builder<Token> output = ImmutableList.builder(); 337 338 for (PeekingIterator<Token> tokens = peekingIterator(input.iterator()); tokens.hasNext(); ) { 339 if (tokens.peek().getType() == LITERAL) { 340 output.add(tokens.next()); 341 342 if (tokens.peek().getType() == WHITESPACE 343 && hasMultipleNewlines(tokens.peek().getValue())) { 344 output.add(tokens.next()); 345 346 if (tokens.peek().getType() == LITERAL) { 347 output.add(new Token(PARAGRAPH_OPEN_TAG, "<p>")); 348 } 349 } 350 } else { 351 // TODO(cpovirk): Or just `continue` from the <p> case and move this out of the `else`? 352 output.add(tokens.next()); 353 } 354 } 355 356 return output.build(); 357 358 /* 359 * Note: We do not want to insert <p> tags inside <pre>. Fortunately, the formatter gets that 360 * right without special effort on our part. The reason: Line breaks inside a <pre> section are 361 * of type FORCED_NEWLINE rather than WHITESPACE. 362 */ 363 } 364 365 /** 366 * Replaces whitespace after a {@code href=...>} token with an "optional link break." This allows 367 * us to output either {@code <a href=foo>foo</a>} or {@code <a href=foo>\nfoo</a>}, depending on 368 * how much space we have left on the line. 369 * 370 * <p>This method must be called after {@link #joinAdjacentLiteralsAndAdjacentWhitespace}, as it 371 * assumes that adjacent whitespace tokens have already been joined. 372 */ optionalizeSpacesAfterLinks(List<Token> input)373 private static ImmutableList<Token> optionalizeSpacesAfterLinks(List<Token> input) { 374 ImmutableList.Builder<Token> output = ImmutableList.builder(); 375 376 for (PeekingIterator<Token> tokens = peekingIterator(input.iterator()); tokens.hasNext(); ) { 377 if (tokens.peek().getType() == LITERAL && tokens.peek().getValue().matches("^href=[^>]*>")) { 378 output.add(tokens.next()); 379 380 if (tokens.peek().getType() == WHITESPACE) { 381 output.add(new Token(OPTIONAL_LINE_BREAK, tokens.next().getValue())); 382 } 383 } else { 384 output.add(tokens.next()); 385 } 386 } 387 388 return output.build(); 389 390 /* 391 * Note: We do not want to insert <p> tags inside <pre>. Fortunately, the formatter gets that 392 * right without special effort on our part. The reason: Line breaks inside a <pre> section are 393 * of type FORCED_NEWLINE rather than WHITESPACE. 394 */ 395 } 396 397 /** 398 * Adjust indentation inside `<pre>{@code` blocks. 399 * 400 * <p>Also trim leading and trailing blank lines, and move the trailing `}` to its own line. 401 */ deindentPreCodeBlocks(List<Token> input)402 private static ImmutableList<Token> deindentPreCodeBlocks(List<Token> input) { 403 ImmutableList.Builder<Token> output = ImmutableList.builder(); 404 for (PeekingIterator<Token> tokens = peekingIterator(input.iterator()); tokens.hasNext(); ) { 405 if (tokens.peek().getType() != PRE_OPEN_TAG) { 406 output.add(tokens.next()); 407 continue; 408 } 409 410 output.add(tokens.next()); 411 List<Token> initialNewlines = new ArrayList<>(); 412 while (tokens.hasNext() && tokens.peek().getType() == FORCED_NEWLINE) { 413 initialNewlines.add(tokens.next()); 414 } 415 if (tokens.peek().getType() != LITERAL 416 || !tokens.peek().getValue().matches("[ \t]*[{]@code")) { 417 output.addAll(initialNewlines); 418 output.add(tokens.next()); 419 continue; 420 } 421 422 deindentPreCodeBlock(output, tokens); 423 } 424 return output.build(); 425 } 426 deindentPreCodeBlock( ImmutableList.Builder<Token> output, PeekingIterator<Token> tokens)427 private static void deindentPreCodeBlock( 428 ImmutableList.Builder<Token> output, PeekingIterator<Token> tokens) { 429 Deque<Token> saved = new ArrayDeque<>(); 430 output.add(new Token(LITERAL, tokens.next().getValue().trim())); 431 while (tokens.hasNext() && tokens.peek().getType() != PRE_CLOSE_TAG) { 432 Token token = tokens.next(); 433 saved.addLast(token); 434 } 435 while (!saved.isEmpty() && saved.peekFirst().getType() == FORCED_NEWLINE) { 436 saved.removeFirst(); 437 } 438 while (!saved.isEmpty() && saved.peekLast().getType() == FORCED_NEWLINE) { 439 saved.removeLast(); 440 } 441 if (saved.isEmpty()) { 442 return; 443 } 444 445 // move the trailing `}` to its own line 446 Token last = saved.peekLast(); 447 boolean trailingBrace = false; 448 if (last.getType() == LITERAL && last.getValue().endsWith("}")) { 449 saved.removeLast(); 450 if (last.length() > 1) { 451 saved.addLast( 452 new Token(LITERAL, last.getValue().substring(0, last.getValue().length() - 1))); 453 saved.addLast(new Token(FORCED_NEWLINE, null)); 454 } 455 trailingBrace = true; 456 } 457 458 int trim = -1; 459 for (Token token : saved) { 460 if (token.getType() == LITERAL) { 461 int idx = CharMatcher.isNot(' ').indexIn(token.getValue()); 462 if (idx != -1 && (trim == -1 || idx < trim)) { 463 trim = idx; 464 } 465 } 466 } 467 468 output.add(new Token(FORCED_NEWLINE, "\n")); 469 for (Token token : saved) { 470 if (token.getType() == LITERAL) { 471 output.add( 472 new Token( 473 LITERAL, 474 trim > 0 && token.length() > trim 475 ? token.getValue().substring(trim) 476 : token.getValue())); 477 } else { 478 output.add(token); 479 } 480 } 481 482 if (trailingBrace) { 483 output.add(new Token(LITERAL, "}")); 484 } else { 485 output.add(new Token(FORCED_NEWLINE, "\n")); 486 } 487 } 488 489 private static final CharMatcher NEWLINE = CharMatcher.is('\n'); 490 hasMultipleNewlines(String s)491 private static boolean hasMultipleNewlines(String s) { 492 return NEWLINE.countIn(s) > 1; 493 } 494 495 /* 496 * This also eats any trailing whitespace. We would be smart enough to ignore that, anyway -- 497 * except in the case of <pre>/<table>, inside which we otherwise leave whitespace intact. 498 * 499 * We'd remove the trailing whitespace later on (in JavaCommentsHelper.rewrite), but I feel safer 500 * stripping it now: It otherwise might confuse our line-length count, which we use for wrapping. 501 */ 502 private static final Pattern NEWLINE_PATTERN = compile("^[ \t]*\n[ \t]*[*]?[ \t]?"); 503 504 // We ensure elsewhere that we match this only at the beginning of a line. 505 // Only match tags that start with a lowercase letter, to avoid false matches on unescaped 506 // annotations inside code blocks. 507 // Match "@param <T>" specially in case the <T> is a <P> or other HTML tag we treat specially. 508 private static final Pattern FOOTER_TAG_PATTERN = compile("^@(param\\s+<\\w+>|[a-z]\\w*)"); 509 private static final Pattern MOE_BEGIN_STRIP_COMMENT_PATTERN = 510 compile("^<!--\\s*M" + "OE:begin_intracomment_strip\\s*-->"); 511 private static final Pattern MOE_END_STRIP_COMMENT_PATTERN = 512 compile("^<!--\\s*M" + "OE:end_intracomment_strip\\s*-->"); 513 private static final Pattern HTML_COMMENT_PATTERN = fullCommentPattern(); 514 private static final Pattern PRE_OPEN_PATTERN = openTagPattern("pre"); 515 private static final Pattern PRE_CLOSE_PATTERN = closeTagPattern("pre"); 516 private static final Pattern CODE_OPEN_PATTERN = openTagPattern("code"); 517 private static final Pattern CODE_CLOSE_PATTERN = closeTagPattern("code"); 518 private static final Pattern TABLE_OPEN_PATTERN = openTagPattern("table"); 519 private static final Pattern TABLE_CLOSE_PATTERN = closeTagPattern("table"); 520 private static final Pattern LIST_OPEN_PATTERN = openTagPattern("ul|ol|dl"); 521 private static final Pattern LIST_CLOSE_PATTERN = closeTagPattern("ul|ol|dl"); 522 private static final Pattern LIST_ITEM_OPEN_PATTERN = openTagPattern("li|dt|dd"); 523 private static final Pattern LIST_ITEM_CLOSE_PATTERN = closeTagPattern("li|dt|dd"); 524 private static final Pattern HEADER_OPEN_PATTERN = openTagPattern("h[1-6]"); 525 private static final Pattern HEADER_CLOSE_PATTERN = closeTagPattern("h[1-6]"); 526 private static final Pattern PARAGRAPH_OPEN_PATTERN = openTagPattern("p"); 527 private static final Pattern PARAGRAPH_CLOSE_PATTERN = closeTagPattern("p"); 528 private static final Pattern BLOCKQUOTE_OPEN_PATTERN = openTagPattern("blockquote"); 529 private static final Pattern BLOCKQUOTE_CLOSE_PATTERN = closeTagPattern("blockquote"); 530 private static final Pattern BR_PATTERN = openTagPattern("br"); 531 private static final Pattern INLINE_TAG_OPEN_PATTERN = compile("^[{]@\\w*"); 532 /* 533 * We exclude < so that we don't swallow following HTML tags. This lets us fix up "foo<p>" (~400 534 * hits in Google-internal code). We will join unnecessarily split "words" (like "foo<b>bar</b>") 535 * in a later step. There's a similar story for braces. I'm not sure I actually need to exclude @ 536 * or *. TODO(cpovirk): Try removing them. 537 * 538 * Thanks to the "rejoin" step in joinAdjacentLiteralsAndAdjacentWhitespace(), we could get away 539 * with matching only one character here. That would eliminate the need for the regex entirely. 540 * That might be faster or slower than what we do now. 541 */ 542 private static final Pattern LITERAL_PATTERN = compile("^.[^ \t\n@<{}*]*", DOTALL); 543 fullCommentPattern()544 private static Pattern fullCommentPattern() { 545 return compile("^<!--.*?-->", DOTALL); 546 } 547 openTagPattern(String namePattern)548 private static Pattern openTagPattern(String namePattern) { 549 return compile(format("^<(?:%s)\\b[^>]*>", namePattern), CASE_INSENSITIVE); 550 } 551 closeTagPattern(String namePattern)552 private static Pattern closeTagPattern(String namePattern) { 553 return compile(format("^</(?:%s)\\b[^>]*>", namePattern), CASE_INSENSITIVE); 554 } 555 556 static class LexException extends Exception {} 557 } 558