• Home
  • Line#
  • Scopes#
  • Navigate#
  • Raw
  • Download
1 package org.jsoup.parser;
2 
3 import org.jsoup.helper.Validate;
4 import org.jsoup.internal.Normalizer;
5 import org.jsoup.nodes.Attributes;
6 import org.jsoup.nodes.Range;
7 import org.jspecify.annotations.Nullable;
8 
9 import java.util.HashMap;
10 import java.util.Map;
11 
12 import static org.jsoup.internal.SharedConstants.*;
13 
14 
15 /**
16  * Parse tokens for the Tokeniser.
17  */
18 abstract class Token {
19     final TokenType type; // used in switches in TreeBuilder vs .getClass()
20     static final int Unset = -1;
21     private int startPos, endPos = Unset; // position in CharacterReader this token was read from
22 
/**
 * Creates a token of the given type. The constructor is private, so it can only be invoked from within this
 * file (i.e. by the nested token classes).
 *
 * @param type the kind of token being created
 */
private Token(final TokenType type) {
    this.type = type;
}
26 
tokenType()27     String tokenType() {
28         return this.getClass().getSimpleName();
29     }
30 
31     /**
32      * Reset the data represent by this token, for reuse. Prevents the need to create transfer objects for every
33      * piece of data, which immediately get GCed.
34      */
reset()35     Token reset() {
36         startPos = Unset;
37         endPos = Unset;
38         return this;
39     }
40 
startPos()41     int startPos() {
42         return startPos;
43     }
44 
startPos(int pos)45     void startPos(int pos) {
46         startPos = pos;
47     }
48 
endPos()49     int endPos() {
50         return endPos;
51     }
52 
endPos(int pos)53     void endPos(int pos) {
54         endPos = pos;
55     }
56 
reset(StringBuilder sb)57     static void reset(StringBuilder sb) {
58         if (sb != null) {
59             sb.delete(0, sb.length());
60         }
61     }
62 
63     static final class Doctype extends Token {
64         final StringBuilder name = new StringBuilder();
65         String pubSysKey = null;
66         final StringBuilder publicIdentifier = new StringBuilder();
67         final StringBuilder systemIdentifier = new StringBuilder();
68         boolean forceQuirks = false;
69 
Doctype()70         Doctype() {
71             super(TokenType.Doctype);
72         }
73 
74         @Override
reset()75         Token reset() {
76             super.reset();
77             reset(name);
78             pubSysKey = null;
79             reset(publicIdentifier);
80             reset(systemIdentifier);
81             forceQuirks = false;
82             return this;
83         }
84 
getName()85         String getName() {
86             return name.toString();
87         }
88 
getPubSysKey()89         String getPubSysKey() {
90             return pubSysKey;
91         }
92 
getPublicIdentifier()93         String getPublicIdentifier() {
94             return publicIdentifier.toString();
95         }
96 
getSystemIdentifier()97         public String getSystemIdentifier() {
98             return systemIdentifier.toString();
99         }
100 
isForceQuirks()101         public boolean isForceQuirks() {
102             return forceQuirks;
103         }
104 
105         @Override
toString()106         public String toString() {
107             return "<!doctype " + getName() + ">";
108         }
109     }
110 
111     static abstract class Tag extends Token {
112         @Nullable protected String tagName;
113         @Nullable protected String normalName; // lc version of tag name, for case-insensitive tree build
114         boolean selfClosing = false;
115         @Nullable Attributes attributes; // start tags get attributes on construction. End tags get attributes on first new attribute (but only for parser convenience, not used).
116 
117         @Nullable private String attrName; // try to get attr names and vals in one shot, vs Builder
118         private final StringBuilder attrNameSb = new StringBuilder();
119         private boolean hasAttrName = false;
120 
121         @Nullable private String attrValue;
122         private final StringBuilder attrValueSb = new StringBuilder();
123         private boolean hasAttrValue = false;
124         private boolean hasEmptyAttrValue = false; // distinguish boolean attribute from empty string value
125 
126         // attribute source range tracking
127         final TreeBuilder treeBuilder;
128         final boolean trackSource;
129         int attrNameStart, attrNameEnd, attrValStart, attrValEnd;
130 
Tag(TokenType type, TreeBuilder treeBuilder)131         Tag(TokenType type, TreeBuilder treeBuilder) {
132             super(type);
133             this.treeBuilder = treeBuilder;
134             this.trackSource = treeBuilder.trackSourceRange;
135         }
136 
137         @Override
reset()138         Tag reset() {
139             super.reset();
140             tagName = null;
141             normalName = null;
142             selfClosing = false;
143             attributes = null;
144             resetPendingAttr();
145             return this;
146         }
147 
resetPendingAttr()148         private void resetPendingAttr() {
149             reset(attrNameSb);
150             attrName = null;
151             hasAttrName = false;
152 
153             reset(attrValueSb);
154             attrValue = null;
155             hasEmptyAttrValue = false;
156             hasAttrValue = false;
157 
158             if (trackSource)
159                 attrNameStart = attrNameEnd = attrValStart = attrValEnd = Unset;
160         }
161 
162         /* Limits runaway crafted HTML from spewing attributes and getting a little sluggish in ensureCapacity.
163         Real-world HTML will P99 around 8 attributes, so plenty of headroom. Implemented here and not in the Attributes
164         object so that API users can add more if ever required. */
165         private static final int MaxAttributes = 512;
166 
newAttribute()167         final void newAttribute() {
168             if (attributes == null)
169                 attributes = new Attributes();
170 
171             if (hasAttrName && attributes.size() < MaxAttributes) {
172                 // the tokeniser has skipped whitespace control chars, but trimming could collapse to empty for other control codes, so verify here
173                 String name = attrNameSb.length() > 0 ? attrNameSb.toString() : attrName;
174                 name = name.trim();
175                 if (name.length() > 0) {
176                     String value;
177                     if (hasAttrValue)
178                         value = attrValueSb.length() > 0 ? attrValueSb.toString() : attrValue;
179                     else if (hasEmptyAttrValue)
180                         value = "";
181                     else
182                         value = null;
183                     // note that we add, not put. So that the first is kept, and rest are deduped, once in a context where case sensitivity is known, and we can warn for duplicates.
184                     attributes.add(name, value);
185 
186                     trackAttributeRange(name);
187                 }
188             }
189             resetPendingAttr();
190         }
191 
trackAttributeRange(String name)192         private void trackAttributeRange(String name) {
193             if (trackSource && isStartTag()) {
194                 final StartTag start = asStartTag();
195                 final CharacterReader r = start.treeBuilder.reader;
196                 final boolean preserve = start.treeBuilder.settings.preserveAttributeCase();
197 
198                 assert attributes != null;
199                 //noinspection unchecked
200                 Map<String, Range.AttributeRange> attrRanges =
201                     (Map<String, Range.AttributeRange>) attributes.userData(AttrRangeKey);
202                 if (attrRanges == null) {
203                     attrRanges = new HashMap<>();
204                     attributes.userData(AttrRangeKey, attrRanges);
205                 }
206 
207                 if (!preserve) name = Normalizer.lowerCase(name);
208                 if (attrRanges.containsKey(name)) return; // dedupe ranges as we go; actual attributes get deduped later for error count
209 
210                 // if there's no value (e.g. boolean), make it an implicit range at current
211                 if (!hasAttrValue) attrValStart = attrValEnd = attrNameEnd;
212 
213                 Range.AttributeRange range = new Range.AttributeRange(
214                     new Range(
215                         new Range.Position(attrNameStart, r.lineNumber(attrNameStart), r.columnNumber(attrNameStart)),
216                         new Range.Position(attrNameEnd, r.lineNumber(attrNameEnd), r.columnNumber(attrNameEnd))),
217                     new Range(
218                         new Range.Position(attrValStart, r.lineNumber(attrValStart), r.columnNumber(attrValStart)),
219                         new Range.Position(attrValEnd, r.lineNumber(attrValEnd), r.columnNumber(attrValEnd)))
220                 );
221                 attrRanges.put(name, range);
222             }
223         }
224 
hasAttributes()225         final boolean hasAttributes() {
226             return attributes != null;
227         }
228 
229         /** Case-sensitive check */
hasAttribute(String key)230         final boolean hasAttribute(String key) {
231             return attributes != null && attributes.hasKey(key);
232         }
233 
hasAttributeIgnoreCase(String key)234         final boolean hasAttributeIgnoreCase(String key) {
235             return attributes != null && attributes.hasKeyIgnoreCase(key);
236         }
237 
finaliseTag()238         final void finaliseTag() {
239             // finalises for emit
240             if (hasAttrName) {
241                 newAttribute();
242             }
243         }
244 
245         /** Preserves case */
name()246         final String name() { // preserves case, for input into Tag.valueOf (which may drop case)
247             Validate.isFalse(tagName == null || tagName.length() == 0);
248             return tagName;
249         }
250 
251         /** Lower case */
normalName()252         final String normalName() { // lower case, used in tree building for working out where in tree it should go
253             return normalName;
254         }
255 
toStringName()256         final String toStringName() {
257             return tagName != null ? tagName : "[unset]";
258         }
259 
name(String name)260         final Tag name(String name) {
261             tagName = name;
262             normalName = ParseSettings.normalName(tagName);
263             return this;
264         }
265 
isSelfClosing()266         final boolean isSelfClosing() {
267             return selfClosing;
268         }
269 
270         // these appenders are rarely hit in not null state-- caused by null chars.
appendTagName(String append)271         final void appendTagName(String append) {
272             // might have null chars - need to replace with null replacement character
273             append = append.replace(TokeniserState.nullChar, Tokeniser.replacementChar);
274             tagName = tagName == null ? append : tagName.concat(append);
275             normalName = ParseSettings.normalName(tagName);
276         }
277 
appendTagName(char append)278         final void appendTagName(char append) {
279             appendTagName(String.valueOf(append));
280         }
281 
appendAttributeName(String append, int startPos, int endPos)282         final void appendAttributeName(String append, int startPos, int endPos) {
283             // might have null chars because we eat in one pass - need to replace with null replacement character
284             append = append.replace(TokeniserState.nullChar, Tokeniser.replacementChar);
285 
286             ensureAttrName(startPos, endPos);
287             if (attrNameSb.length() == 0) {
288                 attrName = append;
289             } else {
290                 attrNameSb.append(append);
291             }
292         }
293 
appendAttributeName(char append, int startPos, int endPos)294         final void appendAttributeName(char append, int startPos, int endPos) {
295             ensureAttrName(startPos, endPos);
296             attrNameSb.append(append);
297         }
298 
appendAttributeValue(String append, int startPos, int endPos)299         final void appendAttributeValue(String append, int startPos, int endPos) {
300             ensureAttrValue(startPos, endPos);
301             if (attrValueSb.length() == 0) {
302                 attrValue = append;
303             } else {
304                 attrValueSb.append(append);
305             }
306         }
307 
appendAttributeValue(char append, int startPos, int endPos)308         final void appendAttributeValue(char append, int startPos, int endPos) {
309             ensureAttrValue(startPos, endPos);
310             attrValueSb.append(append);
311         }
312 
appendAttributeValue(int[] appendCodepoints, int startPos, int endPos)313         final void appendAttributeValue(int[] appendCodepoints, int startPos, int endPos) {
314             ensureAttrValue(startPos, endPos);
315             for (int codepoint : appendCodepoints) {
316                 attrValueSb.appendCodePoint(codepoint);
317             }
318         }
319 
setEmptyAttributeValue()320         final void setEmptyAttributeValue() {
321             hasEmptyAttrValue = true;
322         }
323 
ensureAttrName(int startPos, int endPos)324         private void ensureAttrName(int startPos, int endPos) {
325             hasAttrName = true;
326             // if on second hit, we'll need to move to the builder
327             if (attrName != null) {
328                 attrNameSb.append(attrName);
329                 attrName = null;
330             }
331             if (trackSource) {
332                 attrNameStart = attrNameStart > Unset ? attrNameStart : startPos; // latches to first
333                 attrNameEnd = endPos;
334             }
335         }
336 
ensureAttrValue(int startPos, int endPos)337         private void ensureAttrValue(int startPos, int endPos) {
338             hasAttrValue = true;
339             // if on second hit, we'll need to move to the builder
340             if (attrValue != null) {
341                 attrValueSb.append(attrValue);
342                 attrValue = null;
343             }
344             if (trackSource) {
345                 attrValStart = attrValStart > Unset ? attrValStart : startPos; // latches to first
346                 attrValEnd = endPos;
347             }
348         }
349 
350         @Override
toString()351         abstract public String toString();
352     }
353 
354     final static class StartTag extends Tag {
355 
356         // TreeBuilder is provided so if tracking, can get line / column positions for Range; and can dedupe as we go
StartTag(TreeBuilder treeBuilder)357         StartTag(TreeBuilder treeBuilder) {
358             super(TokenType.StartTag, treeBuilder);
359         }
360 
361         @Override
reset()362         Tag reset() {
363             super.reset();
364             attributes = null;
365             return this;
366         }
367 
nameAttr(String name, Attributes attributes)368         StartTag nameAttr(String name, Attributes attributes) {
369             this.tagName = name;
370             this.attributes = attributes;
371             normalName = ParseSettings.normalName(tagName);
372             return this;
373         }
374 
375         @Override
toString()376         public String toString() {
377             String closer = isSelfClosing() ? "/>" : ">";
378             if (hasAttributes() && attributes.size() > 0)
379                 return "<" + toStringName() + " " + attributes.toString() + closer;
380             else
381                 return "<" + toStringName() + closer;
382         }
383     }
384 
385     final static class EndTag extends Tag{
EndTag(TreeBuilder treeBuilder)386         EndTag(TreeBuilder treeBuilder) {
387             super(TokenType.EndTag, treeBuilder);
388         }
389 
390         @Override
toString()391         public String toString() {
392             return "</" + toStringName() + ">";
393         }
394     }
395 
396     final static class Comment extends Token {
397         private final StringBuilder data = new StringBuilder();
398         private String dataS; // try to get in one shot
399         boolean bogus = false;
400 
401         @Override
reset()402         Token reset() {
403             super.reset();
404             reset(data);
405             dataS = null;
406             bogus = false;
407             return this;
408         }
409 
Comment()410         Comment() {
411             super(TokenType.Comment);
412         }
413 
getData()414         String getData() {
415             return dataS != null ? dataS : data.toString();
416         }
417 
append(String append)418         Comment append(String append) {
419             ensureData();
420             if (data.length() == 0) {
421                 dataS = append;
422             } else {
423                 data.append(append);
424             }
425             return this;
426         }
427 
append(char append)428         Comment append(char append) {
429             ensureData();
430             data.append(append);
431             return this;
432         }
433 
ensureData()434         private void ensureData() {
435             // if on second hit, we'll need to move to the builder
436             if (dataS != null) {
437                 data.append(dataS);
438                 dataS = null;
439             }
440         }
441 
442         @Override
toString()443         public String toString() {
444             return "<!--" + getData() + "-->";
445         }
446     }
447 
448     static class Character extends Token implements Cloneable {
449         private String data;
450 
Character()451         Character() {
452             super(TokenType.Character);
453         }
454 
455         @Override
reset()456         Token reset() {
457             super.reset();
458             data = null;
459             return this;
460         }
461 
data(String data)462         Character data(String data) {
463             this.data = data;
464             return this;
465         }
466 
getData()467         String getData() {
468             return data;
469         }
470 
471         @Override
toString()472         public String toString() {
473             return getData();
474         }
475 
clone()476         @Override protected Token.Character clone() {
477             try {
478                 return (Token.Character) super.clone();
479             } catch (CloneNotSupportedException e) {
480                 throw new RuntimeException(e);
481             }
482         }
483     }
484 
485     final static class CData extends Character {
CData(String data)486         CData(String data) {
487             super();
488             this.data(data);
489         }
490 
491         @Override
toString()492         public String toString() {
493             return "<![CDATA[" + getData() + "]]>";
494         }
495 
496     }
497 
498     final static class EOF extends Token {
EOF()499         EOF() {
500             super(Token.TokenType.EOF);
501         }
502 
503         @Override
reset()504         Token reset() {
505             super.reset();
506             return this;
507         }
508 
509         @Override
toString()510         public String toString() {
511             return "";
512         }
513     }
514 
isDoctype()515     final boolean isDoctype() {
516         return type == TokenType.Doctype;
517     }
518 
asDoctype()519     final Doctype asDoctype() {
520         return (Doctype) this;
521     }
522 
isStartTag()523     final boolean isStartTag() {
524         return type == TokenType.StartTag;
525     }
526 
asStartTag()527     final StartTag asStartTag() {
528         return (StartTag) this;
529     }
530 
isEndTag()531     final boolean isEndTag() {
532         return type == TokenType.EndTag;
533     }
534 
asEndTag()535     final EndTag asEndTag() {
536         return (EndTag) this;
537     }
538 
isComment()539     final boolean isComment() {
540         return type == TokenType.Comment;
541     }
542 
asComment()543     final Comment asComment() {
544         return (Comment) this;
545     }
546 
isCharacter()547     final boolean isCharacter() {
548         return type == TokenType.Character;
549     }
550 
isCData()551     final boolean isCData() {
552         return this instanceof CData;
553     }
554 
asCharacter()555     final Character asCharacter() {
556         return (Character) this;
557     }
558 
isEOF()559     final boolean isEOF() {
560         return type == TokenType.EOF;
561     }
562 
/** The kinds of token; each token class passes its kind to the Token constructor. */
public enum TokenType {
    /** Doctype token. */
    Doctype,
    /** Start tag token. */
    StartTag,
    /** End tag token. */
    EndTag,
    /** Comment token. */
    Comment,
    /** Character data token. Note no CData - treated in builder as an extension of Character. */
    Character,
    /** End of input token. */
    EOF
}
571 }
572