1 package org.jsoup.parser; 2 3 import org.jsoup.helper.Validate; 4 import org.jsoup.internal.Normalizer; 5 import org.jsoup.nodes.Attributes; 6 import org.jsoup.nodes.Range; 7 import org.jspecify.annotations.Nullable; 8 9 import java.util.HashMap; 10 import java.util.Map; 11 12 import static org.jsoup.internal.SharedConstants.*; 13 14 15 /** 16 * Parse tokens for the Tokeniser. 17 */ 18 abstract class Token { 19 final TokenType type; // used in switches in TreeBuilder vs .getClass() 20 static final int Unset = -1; 21 private int startPos, endPos = Unset; // position in CharacterReader this token was read from 22 Token(TokenType type)23 private Token(TokenType type) { 24 this.type = type; 25 } 26 tokenType()27 String tokenType() { 28 return this.getClass().getSimpleName(); 29 } 30 31 /** 32 * Reset the data represent by this token, for reuse. Prevents the need to create transfer objects for every 33 * piece of data, which immediately get GCed. 34 */ reset()35 Token reset() { 36 startPos = Unset; 37 endPos = Unset; 38 return this; 39 } 40 startPos()41 int startPos() { 42 return startPos; 43 } 44 startPos(int pos)45 void startPos(int pos) { 46 startPos = pos; 47 } 48 endPos()49 int endPos() { 50 return endPos; 51 } 52 endPos(int pos)53 void endPos(int pos) { 54 endPos = pos; 55 } 56 reset(StringBuilder sb)57 static void reset(StringBuilder sb) { 58 if (sb != null) { 59 sb.delete(0, sb.length()); 60 } 61 } 62 63 static final class Doctype extends Token { 64 final StringBuilder name = new StringBuilder(); 65 String pubSysKey = null; 66 final StringBuilder publicIdentifier = new StringBuilder(); 67 final StringBuilder systemIdentifier = new StringBuilder(); 68 boolean forceQuirks = false; 69 Doctype()70 Doctype() { 71 super(TokenType.Doctype); 72 } 73 74 @Override reset()75 Token reset() { 76 super.reset(); 77 reset(name); 78 pubSysKey = null; 79 reset(publicIdentifier); 80 reset(systemIdentifier); 81 forceQuirks = false; 82 return this; 83 } 84 getName()85 String getName() { 86 return name.toString(); 87 } 88 getPubSysKey()89 String getPubSysKey() { 90 return pubSysKey; 91 } 92 getPublicIdentifier()93 String getPublicIdentifier() { 94 return publicIdentifier.toString(); 95 } 96 getSystemIdentifier()97 public String getSystemIdentifier() { 98 return systemIdentifier.toString(); 99 } 100 isForceQuirks()101 public boolean isForceQuirks() { 102 return forceQuirks; 103 } 104 105 @Override toString()106 public String toString() { 107 return "<!doctype " + getName() + ">"; 108 } 109 } 110 111 static abstract class Tag extends Token { 112 @Nullable protected String tagName; 113 @Nullable protected String normalName; // lc version of tag name, for case-insensitive tree build 114 boolean selfClosing = false; 115 @Nullable Attributes attributes; // start tags get attributes on construction. End tags get attributes on first new attribute (but only for parser convenience, not used). 116 117 @Nullable private String attrName; // try to get attr names and vals in one shot, vs Builder 118 private final StringBuilder attrNameSb = new StringBuilder(); 119 private boolean hasAttrName = false; 120 121 @Nullable private String attrValue; 122 private final StringBuilder attrValueSb = new StringBuilder(); 123 private boolean hasAttrValue = false; 124 private boolean hasEmptyAttrValue = false; // distinguish boolean attribute from empty string value 125 126 // attribute source range tracking 127 final TreeBuilder treeBuilder; 128 final boolean trackSource; 129 int attrNameStart, attrNameEnd, attrValStart, attrValEnd; 130 Tag(TokenType type, TreeBuilder treeBuilder)131 Tag(TokenType type, TreeBuilder treeBuilder) { 132 super(type); 133 this.treeBuilder = treeBuilder; 134 this.trackSource = treeBuilder.trackSourceRange; 135 } 136 137 @Override reset()138 Tag reset() { 139 super.reset(); 140 tagName = null; 141 normalName = null; 142 selfClosing = false; 143 attributes = null; 144 resetPendingAttr(); 145 return this; 146 } 147 resetPendingAttr()148 private void resetPendingAttr() { 149 reset(attrNameSb); 150 attrName = null; 151 hasAttrName = false; 152 153 reset(attrValueSb); 154 attrValue = null; 155 hasEmptyAttrValue = false; 156 hasAttrValue = false; 157 158 if (trackSource) 159 attrNameStart = attrNameEnd = attrValStart = attrValEnd = Unset; 160 } 161 162 /* Limits runaway crafted HTML from spewing attributes and getting a little sluggish in ensureCapacity. 163 Real-world HTML will P99 around 8 attributes, so plenty of headroom. Implemented here and not in the Attributes 164 object so that API users can add more if ever required. */ 165 private static final int MaxAttributes = 512; 166 newAttribute()167 final void newAttribute() { 168 if (attributes == null) 169 attributes = new Attributes(); 170 171 if (hasAttrName && attributes.size() < MaxAttributes) { 172 // the tokeniser has skipped whitespace control chars, but trimming could collapse to empty for other control codes, so verify here 173 String name = attrNameSb.length() > 0 ? attrNameSb.toString() : attrName; 174 name = name.trim(); 175 if (name.length() > 0) { 176 String value; 177 if (hasAttrValue) 178 value = attrValueSb.length() > 0 ? attrValueSb.toString() : attrValue; 179 else if (hasEmptyAttrValue) 180 value = ""; 181 else 182 value = null; 183 // note that we add, not put. So that the first is kept, and rest are deduped, once in a context where case sensitivity is known, and we can warn for duplicates. 184 attributes.add(name, value); 185 186 trackAttributeRange(name); 187 } 188 } 189 resetPendingAttr(); 190 } 191 trackAttributeRange(String name)192 private void trackAttributeRange(String name) { 193 if (trackSource && isStartTag()) { 194 final StartTag start = asStartTag(); 195 final CharacterReader r = start.treeBuilder.reader; 196 final boolean preserve = start.treeBuilder.settings.preserveAttributeCase(); 197 198 assert attributes != null; 199 //noinspection unchecked 200 Map<String, Range.AttributeRange> attrRanges = 201 (Map<String, Range.AttributeRange>) attributes.userData(AttrRangeKey); 202 if (attrRanges == null) { 203 attrRanges = new HashMap<>(); 204 attributes.userData(AttrRangeKey, attrRanges); 205 } 206 207 if (!preserve) name = Normalizer.lowerCase(name); 208 if (attrRanges.containsKey(name)) return; // dedupe ranges as we go; actual attributes get deduped later for error count 209 210 // if there's no value (e.g. boolean), make it an implicit range at current 211 if (!hasAttrValue) attrValStart = attrValEnd = attrNameEnd; 212 213 Range.AttributeRange range = new Range.AttributeRange( 214 new Range( 215 new Range.Position(attrNameStart, r.lineNumber(attrNameStart), r.columnNumber(attrNameStart)), 216 new Range.Position(attrNameEnd, r.lineNumber(attrNameEnd), r.columnNumber(attrNameEnd))), 217 new Range( 218 new Range.Position(attrValStart, r.lineNumber(attrValStart), r.columnNumber(attrValStart)), 219 new Range.Position(attrValEnd, r.lineNumber(attrValEnd), r.columnNumber(attrValEnd))) 220 ); 221 attrRanges.put(name, range); 222 } 223 } 224 hasAttributes()225 final boolean hasAttributes() { 226 return attributes != null; 227 } 228 229 /** Case-sensitive check */ hasAttribute(String key)230 final boolean hasAttribute(String key) { 231 return attributes != null && attributes.hasKey(key); 232 } 233 hasAttributeIgnoreCase(String key)234 final boolean hasAttributeIgnoreCase(String key) { 235 return attributes != null && attributes.hasKeyIgnoreCase(key); 236 } 237 finaliseTag()238 final void finaliseTag() { 239 // finalises for emit 240 if (hasAttrName) { 241 newAttribute(); 242 } 243 } 244 245 /** Preserves case */ name()246 final String name() { // preserves case, for input into Tag.valueOf (which may drop case) 247 Validate.isFalse(tagName == null || tagName.length() == 0); 248 return tagName; 249 } 250 251 /** Lower case */ normalName()252 final String normalName() { // lower case, used in tree building for working out where in tree it should go 253 return normalName; 254 } 255 toStringName()256 final String toStringName() { 257 return tagName != null ? tagName : "[unset]"; 258 } 259 name(String name)260 final Tag name(String name) { 261 tagName = name; 262 normalName = ParseSettings.normalName(tagName); 263 return this; 264 } 265 isSelfClosing()266 final boolean isSelfClosing() { 267 return selfClosing; 268 } 269 270 // these appenders are rarely hit in not null state-- caused by null chars. appendTagName(String append)271 final void appendTagName(String append) { 272 // might have null chars - need to replace with null replacement character 273 append = append.replace(TokeniserState.nullChar, Tokeniser.replacementChar); 274 tagName = tagName == null ? append : tagName.concat(append); 275 normalName = ParseSettings.normalName(tagName); 276 } 277 appendTagName(char append)278 final void appendTagName(char append) { 279 appendTagName(String.valueOf(append)); 280 } 281 appendAttributeName(String append, int startPos, int endPos)282 final void appendAttributeName(String append, int startPos, int endPos) { 283 // might have null chars because we eat in one pass - need to replace with null replacement character 284 append = append.replace(TokeniserState.nullChar, Tokeniser.replacementChar); 285 286 ensureAttrName(startPos, endPos); 287 if (attrNameSb.length() == 0) { 288 attrName = append; 289 } else { 290 attrNameSb.append(append); 291 } 292 } 293 appendAttributeName(char append, int startPos, int endPos)294 final void appendAttributeName(char append, int startPos, int endPos) { 295 ensureAttrName(startPos, endPos); 296 attrNameSb.append(append); 297 } 298 appendAttributeValue(String append, int startPos, int endPos)299 final void appendAttributeValue(String append, int startPos, int endPos) { 300 ensureAttrValue(startPos, endPos); 301 if (attrValueSb.length() == 0) { 302 attrValue = append; 303 } else { 304 attrValueSb.append(append); 305 } 306 } 307 appendAttributeValue(char append, int startPos, int endPos)308 final void appendAttributeValue(char append, int startPos, int endPos) { 309 ensureAttrValue(startPos, endPos); 310 attrValueSb.append(append); 311 } 312 appendAttributeValue(int[] appendCodepoints, int startPos, int endPos)313 final void appendAttributeValue(int[] appendCodepoints, int startPos, int endPos) { 314 ensureAttrValue(startPos, endPos); 315 for (int codepoint : appendCodepoints) { 316 attrValueSb.appendCodePoint(codepoint); 317 } 318 } 319 setEmptyAttributeValue()320 final void setEmptyAttributeValue() { 321 hasEmptyAttrValue = true; 322 } 323 ensureAttrName(int startPos, int endPos)324 private void ensureAttrName(int startPos, int endPos) { 325 hasAttrName = true; 326 // if on second hit, we'll need to move to the builder 327 if (attrName != null) { 328 attrNameSb.append(attrName); 329 attrName = null; 330 } 331 if (trackSource) { 332 attrNameStart = attrNameStart > Unset ? attrNameStart : startPos; // latches to first 333 attrNameEnd = endPos; 334 } 335 } 336 ensureAttrValue(int startPos, int endPos)337 private void ensureAttrValue(int startPos, int endPos) { 338 hasAttrValue = true; 339 // if on second hit, we'll need to move to the builder 340 if (attrValue != null) { 341 attrValueSb.append(attrValue); 342 attrValue = null; 343 } 344 if (trackSource) { 345 attrValStart = attrValStart > Unset ? attrValStart : startPos; // latches to first 346 attrValEnd = endPos; 347 } 348 } 349 350 @Override toString()351 abstract public String toString(); 352 } 353 354 final static class StartTag extends Tag { 355 356 // TreeBuilder is provided so if tracking, can get line / column positions for Range; and can dedupe as we go StartTag(TreeBuilder treeBuilder)357 StartTag(TreeBuilder treeBuilder) { 358 super(TokenType.StartTag, treeBuilder); 359 } 360 361 @Override reset()362 Tag reset() { 363 super.reset(); 364 attributes = null; 365 return this; 366 } 367 nameAttr(String name, Attributes attributes)368 StartTag nameAttr(String name, Attributes attributes) { 369 this.tagName = name; 370 this.attributes = attributes; 371 normalName = ParseSettings.normalName(tagName); 372 return this; 373 } 374 375 @Override toString()376 public String toString() { 377 String closer = isSelfClosing() ? "/>" : ">"; 378 if (hasAttributes() && attributes.size() > 0) 379 return "<" + toStringName() + " " + attributes.toString() + closer; 380 else 381 return "<" + toStringName() + closer; 382 } 383 } 384 385 final static class EndTag extends Tag{ EndTag(TreeBuilder treeBuilder)386 EndTag(TreeBuilder treeBuilder) { 387 super(TokenType.EndTag, treeBuilder); 388 } 389 390 @Override toString()391 public String toString() { 392 return "</" + toStringName() + ">"; 393 } 394 } 395 396 final static class Comment extends Token { 397 private final StringBuilder data = new StringBuilder(); 398 private String dataS; // try to get in one shot 399 boolean bogus = false; 400 401 @Override reset()402 Token reset() { 403 super.reset(); 404 reset(data); 405 dataS = null; 406 bogus = false; 407 return this; 408 } 409 Comment()410 Comment() { 411 super(TokenType.Comment); 412 } 413 getData()414 String getData() { 415 return dataS != null ? dataS : data.toString(); 416 } 417 append(String append)418 Comment append(String append) { 419 ensureData(); 420 if (data.length() == 0) { 421 dataS = append; 422 } else { 423 data.append(append); 424 } 425 return this; 426 } 427 append(char append)428 Comment append(char append) { 429 ensureData(); 430 data.append(append); 431 return this; 432 } 433 ensureData()434 private void ensureData() { 435 // if on second hit, we'll need to move to the builder 436 if (dataS != null) { 437 data.append(dataS); 438 dataS = null; 439 } 440 } 441 442 @Override toString()443 public String toString() { 444 return "<!--" + getData() + "-->"; 445 } 446 } 447 448 static class Character extends Token implements Cloneable { 449 private String data; 450 Character()451 Character() { 452 super(TokenType.Character); 453 } 454 455 @Override reset()456 Token reset() { 457 super.reset(); 458 data = null; 459 return this; 460 } 461 data(String data)462 Character data(String data) { 463 this.data = data; 464 return this; 465 } 466 getData()467 String getData() { 468 return data; 469 } 470 471 @Override toString()472 public String toString() { 473 return getData(); 474 } 475 clone()476 @Override protected Token.Character clone() { 477 try { 478 return (Token.Character) super.clone(); 479 } catch (CloneNotSupportedException e) { 480 throw new RuntimeException(e); 481 } 482 } 483 } 484 485 final static class CData extends Character { CData(String data)486 CData(String data) { 487 super(); 488 this.data(data); 489 } 490 491 @Override toString()492 public String toString() { 493 return "<![CDATA[" + getData() + "]]>"; 494 } 495 496 } 497 498 final static class EOF extends Token { EOF()499 EOF() { 500 super(Token.TokenType.EOF); 501 } 502 503 @Override reset()504 Token reset() { 505 super.reset(); 506 return this; 507 } 508 509 @Override toString()510 public String toString() { 511 return ""; 512 } 513 } 514 isDoctype()515 final boolean isDoctype() { 516 return type == TokenType.Doctype; 517 } 518 asDoctype()519 final Doctype asDoctype() { 520 return (Doctype) this; 521 } 522 isStartTag()523 final boolean isStartTag() { 524 return type == TokenType.StartTag; 525 } 526 asStartTag()527 final StartTag asStartTag() { 528 return (StartTag) this; 529 } 530 isEndTag()531 final boolean isEndTag() { 532 return type == TokenType.EndTag; 533 } 534 asEndTag()535 final EndTag asEndTag() { 536 return (EndTag) this; 537 } 538 isComment()539 final boolean isComment() { 540 return type == TokenType.Comment; 541 } 542 asComment()543 final Comment asComment() { 544 return (Comment) this; 545 } 546 isCharacter()547 final boolean isCharacter() { 548 return type == TokenType.Character; 549 } 550 isCData()551 final boolean isCData() { 552 return this instanceof CData; 553 } 554 asCharacter()555 final Character asCharacter() { 556 return (Character) this; 557 } 558 isEOF()559 final boolean isEOF() { 560 return type == TokenType.EOF; 561 } 562 563 public enum TokenType { 564 Doctype, 565 StartTag, 566 EndTag, 567 Comment, 568 Character, // note no CData - treated in builder as an extension of Character 569 EOF 570 } 571 } 572