1 package org.jsoup.parser; 2 3 import org.jsoup.helper.Validate; 4 import org.jsoup.internal.StringUtil; 5 import org.jsoup.nodes.Entities; 6 import org.jspecify.annotations.Nullable; 7 8 import java.util.Arrays; 9 10 /** 11 * Readers the input stream into tokens. 12 */ 13 final class Tokeniser { 14 static final char replacementChar = '\uFFFD'; // replaces null character 15 private static final char[] notCharRefCharsSorted = new char[]{'\t', '\n', '\r', '\f', ' ', '<', '&'}; 16 17 // Some illegal character escapes are parsed by browsers as windows-1252 instead. See issue #1034 18 // https://html.spec.whatwg.org/multipage/parsing.html#numeric-character-reference-end-state 19 static final int win1252ExtensionsStart = 0x80; 20 static final int[] win1252Extensions = new int[] { 21 // we could build this manually, but Windows-1252 is not a standard java charset so that could break on 22 // some platforms - this table is verified with a test 23 0x20AC, 0x0081, 0x201A, 0x0192, 0x201E, 0x2026, 0x2020, 0x2021, 24 0x02C6, 0x2030, 0x0160, 0x2039, 0x0152, 0x008D, 0x017D, 0x008F, 25 0x0090, 0x2018, 0x2019, 0x201C, 0x201D, 0x2022, 0x2013, 0x2014, 26 0x02DC, 0x2122, 0x0161, 0x203A, 0x0153, 0x009D, 0x017E, 0x0178, 27 }; 28 29 static { 30 Arrays.sort(notCharRefCharsSorted); 31 } 32 33 private final CharacterReader reader; // html input 34 private final ParseErrorList errors; // errors found while tokenising 35 36 private TokeniserState state = TokeniserState.Data; // current tokenisation state 37 @Nullable private Token emitPending = null; // the token we are about to emit on next read 38 private boolean isEmitPending = false; 39 @Nullable private String charsString = null; // characters pending an emit. Will fall to charsBuilder if more than one 40 private final StringBuilder charsBuilder = new StringBuilder(1024); // buffers characters to output as one token, if more than one emit per read 41 final StringBuilder dataBuffer = new StringBuilder(1024); // buffers data looking for </script> 42 43 final Token.StartTag startPending; 44 final Token.EndTag endPending; 45 Token.Tag tagPending; // tag we are building up: start or end pending 46 final Token.Character charPending = new Token.Character(); 47 final Token.Doctype doctypePending = new Token.Doctype(); // doctype building up 48 final Token.Comment commentPending = new Token.Comment(); // comment building up 49 @Nullable private String lastStartTag; // the last start tag emitted, to test appropriate end tag 50 @Nullable private String lastStartCloseSeq; // "</" + lastStartTag, so we can quickly check for that in RCData 51 52 private static final int Unset = -1; 53 private int markupStartPos, charStartPos = Unset; // reader pos at the start of markup / characters. updated on state transition 54 Tokeniser(TreeBuilder treeBuilder)55 Tokeniser(TreeBuilder treeBuilder) { 56 tagPending = startPending = new Token.StartTag(treeBuilder); 57 endPending = new Token.EndTag(treeBuilder); 58 this.reader = treeBuilder.reader; 59 this.errors = treeBuilder.parser.getErrors(); 60 } 61 read()62 Token read() { 63 while (!isEmitPending) { 64 state.read(this, reader); 65 } 66 67 // if emit is pending, a non-character token was found: return any chars in buffer, and leave token for next read: 68 final StringBuilder cb = this.charsBuilder; 69 if (cb.length() != 0) { 70 String str = cb.toString(); 71 cb.delete(0, cb.length()); 72 Token token = charPending.data(str); 73 charsString = null; 74 return token; 75 } else if (charsString != null) { 76 Token token = charPending.data(charsString); 77 charsString = null; 78 return token; 79 } else { 80 isEmitPending = false; 81 assert emitPending != null; 82 return emitPending; 83 } 84 } 85 emit(Token token)86 void emit(Token token) { 87 Validate.isFalse(isEmitPending); 88 89 emitPending = token; 90 isEmitPending = true; 91 token.startPos(markupStartPos); 92 token.endPos(reader.pos()); 93 charStartPos = Unset; 94 95 if (token.type == Token.TokenType.StartTag) { 96 Token.StartTag startTag = (Token.StartTag) token; 97 lastStartTag = startTag.tagName; 98 lastStartCloseSeq = null; // only lazy inits 99 } else if (token.type == Token.TokenType.EndTag) { 100 Token.EndTag endTag = (Token.EndTag) token; 101 if (endTag.hasAttributes()) 102 error("Attributes incorrectly present on end tag [/%s]", endTag.normalName()); 103 } 104 } 105 emit(final String str)106 void emit(final String str) { 107 // buffer strings up until last string token found, to emit only one token for a run of character refs etc. 108 // does not set isEmitPending; read checks that 109 if (charsString == null) { 110 charsString = str; 111 } else { 112 if (charsBuilder.length() == 0) { // switching to string builder as more than one emit before read 113 charsBuilder.append(charsString); 114 } 115 charsBuilder.append(str); 116 } 117 charPending.startPos(charStartPos); 118 charPending.endPos(reader.pos()); 119 } 120 121 // variations to limit need to create temp strings emit(final StringBuilder str)122 void emit(final StringBuilder str) { 123 if (charsString == null) { 124 charsString = str.toString(); 125 } else { 126 if (charsBuilder.length() == 0) { 127 charsBuilder.append(charsString); 128 } 129 charsBuilder.append(str); 130 } 131 charPending.startPos(charStartPos); 132 charPending.endPos(reader.pos()); 133 } 134 emit(char c)135 void emit(char c) { 136 if (charsString == null) { 137 charsString = String.valueOf(c); 138 } else { 139 if (charsBuilder.length() == 0) { 140 charsBuilder.append(charsString); 141 } 142 charsBuilder.append(c); 143 } 144 charPending.startPos(charStartPos); 145 charPending.endPos(reader.pos()); 146 } 147 emit(char[] chars)148 void emit(char[] chars) { 149 emit(String.valueOf(chars)); 150 } 151 emit(int[] codepoints)152 void emit(int[] codepoints) { 153 emit(new String(codepoints, 0, codepoints.length)); 154 } 155 getState()156 TokeniserState getState() { 157 return state; 158 } 159 transition(TokeniserState newState)160 void transition(TokeniserState newState) { 161 // track markup / data position on state transitions 162 switch (newState) { 163 case TagOpen: 164 markupStartPos = reader.pos(); 165 break; 166 case Data: 167 if (charStartPos == Unset) // don't reset when we are jumping between e.g data -> char ref -> data 168 charStartPos = reader.pos(); 169 } 170 171 this.state = newState; 172 } 173 advanceTransition(TokeniserState newState)174 void advanceTransition(TokeniserState newState) { 175 transition(newState); 176 reader.advance(); 177 } 178 179 final private int[] codepointHolder = new int[1]; // holder to not have to keep creating arrays 180 final private int[] multipointHolder = new int[2]; consumeCharacterReference(@ullable Character additionalAllowedCharacter, boolean inAttribute)181 @Nullable int[] consumeCharacterReference(@Nullable Character additionalAllowedCharacter, boolean inAttribute) { 182 if (reader.isEmpty()) 183 return null; 184 if (additionalAllowedCharacter != null && additionalAllowedCharacter == reader.current()) 185 return null; 186 if (reader.matchesAnySorted(notCharRefCharsSorted)) 187 return null; 188 189 final int[] codeRef = codepointHolder; 190 reader.mark(); 191 if (reader.matchConsume("#")) { // numbered 192 boolean isHexMode = reader.matchConsumeIgnoreCase("X"); 193 String numRef = isHexMode ? reader.consumeHexSequence() : reader.consumeDigitSequence(); 194 if (numRef.length() == 0) { // didn't match anything 195 characterReferenceError("numeric reference with no numerals"); 196 reader.rewindToMark(); 197 return null; 198 } 199 200 reader.unmark(); 201 if (!reader.matchConsume(";")) 202 characterReferenceError("missing semicolon on [&#%s]", numRef); // missing semi 203 int charval = -1; 204 try { 205 int base = isHexMode ? 16 : 10; 206 charval = Integer.valueOf(numRef, base); 207 } catch (NumberFormatException ignored) { 208 // skip 209 } 210 // todo: check for extra illegal unicode points as parse errors - described https://html.spec.whatwg.org/multipage/syntax.html#character-references and in Infra 211 // The numeric character reference forms described above are allowed to reference any code point excluding U+000D CR, noncharacters, and controls other than ASCII whitespace. 212 if (charval == -1 || charval > 0x10FFFF) { 213 characterReferenceError("character [%s] outside of valid range", charval); 214 codeRef[0] = replacementChar; 215 } else { 216 // fix illegal unicode characters to match browser behavior 217 if (charval >= win1252ExtensionsStart && charval < win1252ExtensionsStart + win1252Extensions.length) { 218 characterReferenceError("character [%s] is not a valid unicode code point", charval); 219 charval = win1252Extensions[charval - win1252ExtensionsStart]; 220 } 221 222 // todo: implement number replacement table 223 // todo: check for extra illegal unicode points as parse errors 224 codeRef[0] = charval; 225 } 226 return codeRef; 227 } else { // named 228 // get as many letters as possible, and look for matching entities. 229 String nameRef = reader.consumeLetterThenDigitSequence(); 230 boolean looksLegit = reader.matches(';'); 231 // found if a base named entity without a ;, or an extended entity with the ;. 232 boolean found = (Entities.isBaseNamedEntity(nameRef) || (Entities.isNamedEntity(nameRef) && looksLegit)); 233 234 if (!found) { 235 reader.rewindToMark(); 236 if (looksLegit) // named with semicolon 237 characterReferenceError("invalid named reference [%s]", nameRef); 238 return null; 239 } 240 if (inAttribute && (reader.matchesLetter() || reader.matchesDigit() || reader.matchesAny('=', '-', '_'))) { 241 // don't want that to match 242 reader.rewindToMark(); 243 return null; 244 } 245 246 reader.unmark(); 247 if (!reader.matchConsume(";")) 248 characterReferenceError("missing semicolon on [&%s]", nameRef); // missing semi 249 int numChars = Entities.codepointsForName(nameRef, multipointHolder); 250 if (numChars == 1) { 251 codeRef[0] = multipointHolder[0]; 252 return codeRef; 253 } else if (numChars ==2) { 254 return multipointHolder; 255 } else { 256 Validate.fail("Unexpected characters returned for " + nameRef); 257 return multipointHolder; 258 } 259 } 260 } 261 createTagPending(boolean start)262 Token.Tag createTagPending(boolean start) { 263 tagPending = start ? startPending.reset() : endPending.reset(); 264 return tagPending; 265 } 266 emitTagPending()267 void emitTagPending() { 268 tagPending.finaliseTag(); 269 emit(tagPending); 270 } 271 createCommentPending()272 void createCommentPending() { 273 commentPending.reset(); 274 } 275 emitCommentPending()276 void emitCommentPending() { 277 emit(commentPending); 278 } 279 createBogusCommentPending()280 void createBogusCommentPending() { 281 commentPending.reset(); 282 commentPending.bogus = true; 283 } 284 createDoctypePending()285 void createDoctypePending() { 286 doctypePending.reset(); 287 } 288 emitDoctypePending()289 void emitDoctypePending() { 290 emit(doctypePending); 291 } 292 createTempBuffer()293 void createTempBuffer() { 294 Token.reset(dataBuffer); 295 } 296 isAppropriateEndTagToken()297 boolean isAppropriateEndTagToken() { 298 return lastStartTag != null && tagPending.name().equalsIgnoreCase(lastStartTag); 299 } 300 appropriateEndTagName()301 @Nullable String appropriateEndTagName() { 302 return lastStartTag; // could be null 303 } 304 305 /** Returns the closer sequence {@code </lastStart} */ appropriateEndTagSeq()306 String appropriateEndTagSeq() { 307 if (lastStartCloseSeq == null) // reset on start tag emit 308 lastStartCloseSeq = "</" + lastStartTag; 309 return lastStartCloseSeq; 310 } 311 error(TokeniserState state)312 void error(TokeniserState state) { 313 if (errors.canAddError()) 314 errors.add(new ParseError(reader, "Unexpected character '%s' in input state [%s]", reader.current(), state)); 315 } 316 eofError(TokeniserState state)317 void eofError(TokeniserState state) { 318 if (errors.canAddError()) 319 errors.add(new ParseError(reader, "Unexpectedly reached end of file (EOF) in input state [%s]", state)); 320 } 321 characterReferenceError(String message, Object... args)322 private void characterReferenceError(String message, Object... args) { 323 if (errors.canAddError()) 324 errors.add(new ParseError(reader, String.format("Invalid character reference: " + message, args))); 325 } 326 error(String errorMsg)327 void error(String errorMsg) { 328 if (errors.canAddError()) 329 errors.add(new ParseError(reader, errorMsg)); 330 } 331 error(String errorMsg, Object... args)332 void error(String errorMsg, Object... args) { 333 if (errors.canAddError()) 334 errors.add(new ParseError(reader, errorMsg, args)); 335 } 336 currentNodeInHtmlNS()337 static boolean currentNodeInHtmlNS() { 338 // todo: implement namespaces correctly 339 return true; 340 // Element currentNode = currentNode(); 341 // return currentNode != null && currentNode.namespace().equals("HTML"); 342 } 343 344 /** 345 * Utility method to consume reader and unescape entities found within. 346 * @param inAttribute if the text to be unescaped is in an attribute 347 * @return unescaped string from reader 348 */ unescapeEntities(boolean inAttribute)349 String unescapeEntities(boolean inAttribute) { 350 StringBuilder builder = StringUtil.borrowBuilder(); 351 while (!reader.isEmpty()) { 352 builder.append(reader.consumeTo('&')); 353 if (reader.matches('&')) { 354 reader.consume(); 355 int[] c = consumeCharacterReference(null, inAttribute); 356 if (c == null || c.length==0) 357 builder.append('&'); 358 else { 359 builder.appendCodePoint(c[0]); 360 if (c.length == 2) 361 builder.appendCodePoint(c[1]); 362 } 363 364 } 365 } 366 return StringUtil.releaseBuilder(builder); 367 } 368 } 369