• Home
  • Line#
  • Scopes#
  • Navigate#
  • Raw
  • Download
1 package org.jsoup.parser;
2 
3 import org.jsoup.helper.Validate;
4 import org.jsoup.internal.StringUtil;
5 import org.jsoup.nodes.Entities;
6 import org.jspecify.annotations.Nullable;
7 
8 import java.util.Arrays;
9 
10 /**
11  * Readers the input stream into tokens.
12  */
13 final class Tokeniser {
14     static final char replacementChar = '\uFFFD'; // replaces null character
15     private static final char[] notCharRefCharsSorted = new char[]{'\t', '\n', '\r', '\f', ' ', '<', '&'};
16 
17     // Some illegal character escapes are parsed by browsers as windows-1252 instead. See issue #1034
18     // https://html.spec.whatwg.org/multipage/parsing.html#numeric-character-reference-end-state
19     static final int win1252ExtensionsStart = 0x80;
20     static final int[] win1252Extensions = new int[] {
21             // we could build this manually, but Windows-1252 is not a standard java charset so that could break on
22             // some platforms - this table is verified with a test
23             0x20AC, 0x0081, 0x201A, 0x0192, 0x201E, 0x2026, 0x2020, 0x2021,
24             0x02C6, 0x2030, 0x0160, 0x2039, 0x0152, 0x008D, 0x017D, 0x008F,
25             0x0090, 0x2018, 0x2019, 0x201C, 0x201D, 0x2022, 0x2013, 0x2014,
26             0x02DC, 0x2122, 0x0161, 0x203A, 0x0153, 0x009D, 0x017E, 0x0178,
27     };
28 
29     static {
30         Arrays.sort(notCharRefCharsSorted);
31     }
32 
33     private final CharacterReader reader; // html input
34     private final ParseErrorList errors; // errors found while tokenising
35 
36     private TokeniserState state = TokeniserState.Data; // current tokenisation state
37     @Nullable private Token emitPending = null; // the token we are about to emit on next read
38     private boolean isEmitPending = false;
39     @Nullable private String charsString = null; // characters pending an emit. Will fall to charsBuilder if more than one
40     private final StringBuilder charsBuilder = new StringBuilder(1024); // buffers characters to output as one token, if more than one emit per read
41     final StringBuilder dataBuffer = new StringBuilder(1024); // buffers data looking for </script>
42 
43     final Token.StartTag startPending;
44     final Token.EndTag endPending;
45     Token.Tag tagPending; // tag we are building up: start or end pending
46     final Token.Character charPending = new Token.Character();
47     final Token.Doctype doctypePending = new Token.Doctype(); // doctype building up
48     final Token.Comment commentPending = new Token.Comment(); // comment building up
49     @Nullable private String lastStartTag; // the last start tag emitted, to test appropriate end tag
50     @Nullable private String lastStartCloseSeq; // "</" + lastStartTag, so we can quickly check for that in RCData
51 
52     private static final int Unset = -1;
53     private int markupStartPos, charStartPos = Unset; // reader pos at the start of markup / characters. updated on state transition
54 
Tokeniser(TreeBuilder treeBuilder)55     Tokeniser(TreeBuilder treeBuilder) {
56         tagPending = startPending  = new Token.StartTag(treeBuilder);
57         endPending = new Token.EndTag(treeBuilder);
58         this.reader = treeBuilder.reader;
59         this.errors = treeBuilder.parser.getErrors();
60     }
61 
read()62     Token read() {
63         while (!isEmitPending) {
64             state.read(this, reader);
65         }
66 
67         // if emit is pending, a non-character token was found: return any chars in buffer, and leave token for next read:
68         final StringBuilder cb = this.charsBuilder;
69         if (cb.length() != 0) {
70             String str = cb.toString();
71             cb.delete(0, cb.length());
72             Token token = charPending.data(str);
73             charsString = null;
74             return token;
75         } else if (charsString != null) {
76             Token token = charPending.data(charsString);
77             charsString = null;
78             return token;
79         } else {
80             isEmitPending = false;
81             assert emitPending != null;
82             return emitPending;
83         }
84     }
85 
emit(Token token)86     void emit(Token token) {
87         Validate.isFalse(isEmitPending);
88 
89         emitPending = token;
90         isEmitPending = true;
91         token.startPos(markupStartPos);
92         token.endPos(reader.pos());
93         charStartPos = Unset;
94 
95         if (token.type == Token.TokenType.StartTag) {
96             Token.StartTag startTag = (Token.StartTag) token;
97             lastStartTag = startTag.tagName;
98             lastStartCloseSeq = null; // only lazy inits
99         } else if (token.type == Token.TokenType.EndTag) {
100             Token.EndTag endTag = (Token.EndTag) token;
101             if (endTag.hasAttributes())
102                 error("Attributes incorrectly present on end tag [/%s]", endTag.normalName());
103         }
104     }
105 
emit(final String str)106     void emit(final String str) {
107         // buffer strings up until last string token found, to emit only one token for a run of character refs etc.
108         // does not set isEmitPending; read checks that
109         if (charsString == null) {
110             charsString = str;
111         } else {
112             if (charsBuilder.length() == 0) { // switching to string builder as more than one emit before read
113                 charsBuilder.append(charsString);
114             }
115             charsBuilder.append(str);
116         }
117         charPending.startPos(charStartPos);
118         charPending.endPos(reader.pos());
119     }
120 
121     // variations to limit need to create temp strings
emit(final StringBuilder str)122     void emit(final StringBuilder str) {
123         if (charsString == null) {
124             charsString = str.toString();
125         } else {
126             if (charsBuilder.length() == 0) {
127                 charsBuilder.append(charsString);
128             }
129             charsBuilder.append(str);
130         }
131         charPending.startPos(charStartPos);
132         charPending.endPos(reader.pos());
133     }
134 
emit(char c)135     void emit(char c) {
136         if (charsString == null) {
137             charsString = String.valueOf(c);
138         } else {
139             if (charsBuilder.length() == 0) {
140                 charsBuilder.append(charsString);
141             }
142             charsBuilder.append(c);
143         }
144         charPending.startPos(charStartPos);
145         charPending.endPos(reader.pos());
146     }
147 
emit(char[] chars)148     void emit(char[] chars) {
149         emit(String.valueOf(chars));
150     }
151 
emit(int[] codepoints)152     void emit(int[] codepoints) {
153         emit(new String(codepoints, 0, codepoints.length));
154     }
155 
getState()156     TokeniserState getState() {
157         return state;
158     }
159 
transition(TokeniserState newState)160     void transition(TokeniserState newState) {
161         // track markup / data position on state transitions
162         switch (newState) {
163             case TagOpen:
164                 markupStartPos = reader.pos();
165                 break;
166             case Data:
167                 if (charStartPos == Unset) // don't reset when we are jumping between e.g data -> char ref -> data
168                     charStartPos = reader.pos();
169         }
170 
171         this.state = newState;
172     }
173 
advanceTransition(TokeniserState newState)174     void advanceTransition(TokeniserState newState) {
175         transition(newState);
176         reader.advance();
177     }
178 
179     final private int[] codepointHolder = new int[1]; // holder to not have to keep creating arrays
180     final private int[] multipointHolder = new int[2];
consumeCharacterReference(@ullable Character additionalAllowedCharacter, boolean inAttribute)181     @Nullable int[] consumeCharacterReference(@Nullable Character additionalAllowedCharacter, boolean inAttribute) {
182         if (reader.isEmpty())
183             return null;
184         if (additionalAllowedCharacter != null && additionalAllowedCharacter == reader.current())
185             return null;
186         if (reader.matchesAnySorted(notCharRefCharsSorted))
187             return null;
188 
189         final int[] codeRef = codepointHolder;
190         reader.mark();
191         if (reader.matchConsume("#")) { // numbered
192             boolean isHexMode = reader.matchConsumeIgnoreCase("X");
193             String numRef = isHexMode ? reader.consumeHexSequence() : reader.consumeDigitSequence();
194             if (numRef.length() == 0) { // didn't match anything
195                 characterReferenceError("numeric reference with no numerals");
196                 reader.rewindToMark();
197                 return null;
198             }
199 
200             reader.unmark();
201             if (!reader.matchConsume(";"))
202                 characterReferenceError("missing semicolon on [&#%s]", numRef); // missing semi
203             int charval = -1;
204             try {
205                 int base = isHexMode ? 16 : 10;
206                 charval = Integer.valueOf(numRef, base);
207             } catch (NumberFormatException ignored) {
208                 // skip
209             }
210             // todo: check for extra illegal unicode points as parse errors - described https://html.spec.whatwg.org/multipage/syntax.html#character-references and in Infra
211             // The numeric character reference forms described above are allowed to reference any code point excluding U+000D CR, noncharacters, and controls other than ASCII whitespace.
212             if (charval == -1 || charval > 0x10FFFF) {
213                 characterReferenceError("character [%s] outside of valid range", charval);
214                 codeRef[0] = replacementChar;
215             } else {
216                 // fix illegal unicode characters to match browser behavior
217                 if (charval >= win1252ExtensionsStart && charval < win1252ExtensionsStart + win1252Extensions.length) {
218                     characterReferenceError("character [%s] is not a valid unicode code point", charval);
219                     charval = win1252Extensions[charval - win1252ExtensionsStart];
220                 }
221 
222                 // todo: implement number replacement table
223                 // todo: check for extra illegal unicode points as parse errors
224                 codeRef[0] = charval;
225             }
226             return codeRef;
227         } else { // named
228             // get as many letters as possible, and look for matching entities.
229             String nameRef = reader.consumeLetterThenDigitSequence();
230             boolean looksLegit = reader.matches(';');
231             // found if a base named entity without a ;, or an extended entity with the ;.
232             boolean found = (Entities.isBaseNamedEntity(nameRef) || (Entities.isNamedEntity(nameRef) && looksLegit));
233 
234             if (!found) {
235                 reader.rewindToMark();
236                 if (looksLegit) // named with semicolon
237                     characterReferenceError("invalid named reference [%s]", nameRef);
238                 return null;
239             }
240             if (inAttribute && (reader.matchesLetter() || reader.matchesDigit() || reader.matchesAny('=', '-', '_'))) {
241                 // don't want that to match
242                 reader.rewindToMark();
243                 return null;
244             }
245 
246             reader.unmark();
247             if (!reader.matchConsume(";"))
248                 characterReferenceError("missing semicolon on [&%s]", nameRef); // missing semi
249             int numChars = Entities.codepointsForName(nameRef, multipointHolder);
250             if (numChars == 1) {
251                 codeRef[0] = multipointHolder[0];
252                 return codeRef;
253             } else if (numChars ==2) {
254                 return multipointHolder;
255             } else {
256                 Validate.fail("Unexpected characters returned for " + nameRef);
257                 return multipointHolder;
258             }
259         }
260     }
261 
createTagPending(boolean start)262     Token.Tag createTagPending(boolean start) {
263         tagPending = start ? startPending.reset() : endPending.reset();
264         return tagPending;
265     }
266 
emitTagPending()267     void emitTagPending() {
268         tagPending.finaliseTag();
269         emit(tagPending);
270     }
271 
createCommentPending()272     void createCommentPending() {
273         commentPending.reset();
274     }
275 
emitCommentPending()276     void emitCommentPending() {
277         emit(commentPending);
278     }
279 
createBogusCommentPending()280     void createBogusCommentPending() {
281         commentPending.reset();
282         commentPending.bogus = true;
283     }
284 
createDoctypePending()285     void createDoctypePending() {
286         doctypePending.reset();
287     }
288 
emitDoctypePending()289     void emitDoctypePending() {
290         emit(doctypePending);
291     }
292 
createTempBuffer()293     void createTempBuffer() {
294         Token.reset(dataBuffer);
295     }
296 
isAppropriateEndTagToken()297     boolean isAppropriateEndTagToken() {
298         return lastStartTag != null && tagPending.name().equalsIgnoreCase(lastStartTag);
299     }
300 
appropriateEndTagName()301     @Nullable String appropriateEndTagName() {
302         return lastStartTag; // could be null
303     }
304 
305     /** Returns the closer sequence {@code </lastStart} */
appropriateEndTagSeq()306     String appropriateEndTagSeq() {
307         if (lastStartCloseSeq == null) // reset on start tag emit
308             lastStartCloseSeq = "</" + lastStartTag;
309         return lastStartCloseSeq;
310     }
311 
error(TokeniserState state)312     void error(TokeniserState state) {
313         if (errors.canAddError())
314             errors.add(new ParseError(reader, "Unexpected character '%s' in input state [%s]", reader.current(), state));
315     }
316 
eofError(TokeniserState state)317     void eofError(TokeniserState state) {
318         if (errors.canAddError())
319             errors.add(new ParseError(reader, "Unexpectedly reached end of file (EOF) in input state [%s]", state));
320     }
321 
characterReferenceError(String message, Object... args)322     private void characterReferenceError(String message, Object... args) {
323         if (errors.canAddError())
324             errors.add(new ParseError(reader, String.format("Invalid character reference: " + message, args)));
325     }
326 
error(String errorMsg)327     void error(String errorMsg) {
328         if (errors.canAddError())
329             errors.add(new ParseError(reader, errorMsg));
330     }
331 
error(String errorMsg, Object... args)332     void error(String errorMsg, Object... args) {
333         if (errors.canAddError())
334             errors.add(new ParseError(reader, errorMsg, args));
335     }
336 
currentNodeInHtmlNS()337     static boolean currentNodeInHtmlNS() {
338         // todo: implement namespaces correctly
339         return true;
340         // Element currentNode = currentNode();
341         // return currentNode != null && currentNode.namespace().equals("HTML");
342     }
343 
344     /**
345      * Utility method to consume reader and unescape entities found within.
346      * @param inAttribute if the text to be unescaped is in an attribute
347      * @return unescaped string from reader
348      */
unescapeEntities(boolean inAttribute)349     String unescapeEntities(boolean inAttribute) {
350         StringBuilder builder = StringUtil.borrowBuilder();
351         while (!reader.isEmpty()) {
352             builder.append(reader.consumeTo('&'));
353             if (reader.matches('&')) {
354                 reader.consume();
355                 int[] c = consumeCharacterReference(null, inAttribute);
356                 if (c == null || c.length==0)
357                     builder.append('&');
358                 else {
359                     builder.appendCodePoint(c[0]);
360                     if (c.length == 2)
361                         builder.appendCodePoint(c[1]);
362                 }
363 
364             }
365         }
366         return StringUtil.releaseBuilder(builder);
367     }
368 }
369