package org.jsoup.parser;

import org.jsoup.helper.Validate;
import org.jsoup.internal.SharedConstants;
import org.jsoup.nodes.Attributes;
import org.jsoup.nodes.Document;
import org.jsoup.nodes.Element;
import org.jsoup.nodes.Node;
import org.jsoup.nodes.Range;

import java.io.Reader;
import java.util.ArrayList;
import java.util.HashMap;
import java.util.List;
import java.util.Map;

import static org.jsoup.parser.Parser.NamespaceHtml;

/**
 Base class for tree builders. Reads tokens from a {@link Tokeniser}, hands each one to {@link #process(Token)}, and
 maintains the stack of open elements for the {@link Document} being built.
 * @author Jonathan Hedley
 */
abstract class TreeBuilder {
    protected Parser parser;
    CharacterReader reader;
    Tokeniser tokeniser;
    Document doc; // current doc we are building into
    ArrayList<Element> stack; // the stack of open elements
    String baseUri; // current base uri, for creating new elements
    Token currentToken; // currentToken is used for error and source position tracking. Null at start of fragment parse
    ParseSettings settings;
    Map<String, Tag> seenTags; // tags we've used in this parse; saves tag GC for custom tags.

    private Token.StartTag start; // start tag to process
    private final Token.EndTag end = new Token.EndTag(this);

    /**
     Gets the settings this TreeBuilder uses by default.
     @return the default parse settings
     */
    abstract ParseSettings defaultSettings();

    boolean trackSourceRange; // optionally tracks the source range of nodes and attributes

    /**
     Prepares this TreeBuilder for a parse run: validates the arguments, creates the target document, and sets up a
     fresh reader, tokeniser, element stack and tag cache.
     @param input the character source to parse; must not be null
     @param baseUri the base URI of the document; must not be null
     @param parser the parser supplying settings, namespace and tracking options; must not be null
     */
    void initialiseParse(Reader input, String baseUri, Parser parser) {
        Validate.notNullParam(input, "input");
        Validate.notNullParam(baseUri, "baseUri");
        Validate.notNull(parser);

        doc = new Document(parser.defaultNamespace(), baseUri);
        doc.parser(parser);
        this.parser = parser;
        settings = parser.settings();
        reader = new CharacterReader(input);
        trackSourceRange = parser.isTrackPosition();
        // when tracking errors or source ranges, enable newline tracking for better legibility
        reader.trackNewlines(parser.isTrackErrors() || trackSourceRange);
        tokeniser = new Tokeniser(this);
        stack = new ArrayList<>(32);
        seenTags = new HashMap<>();
        start = new Token.StartTag(this);
        currentToken = start; // init current token to the virtual start token.
        this.baseUri = baseUri;
    }

    /**
     Parses the input into a new Document. The per-parse state (reader, tokeniser, stack and tag cache) is released
     when the run ends, whether it completes normally or throws.
     @param input the character source to parse
     @param baseUri the base URI of the document
     @param parser the parser driving this parse
     @return the document that was built
     */
    Document parse(Reader input, String baseUri, Parser parser) {
        // initialise outside the try: if validation fails there is nothing to tidy up yet
        initialiseParse(input, baseUri, parser);
        try {
            runParser();
        } finally {
            // tidy up - as the Parser and Treebuilder are retained in document for settings / fragments.
            // Done in finally so a failed parse does not leave the reader open or the parse state pinned.
            reader.close();
            reader = null;
            tokeniser = null;
            stack = null;
            seenTags = null;
        }

        return doc;
    }

    /**
     Create a new copy of this TreeBuilder
     @return copy, ready for a new parse
     */
    abstract TreeBuilder newInstance();

    /**
     Parses a fragment of input within an optional context element.
     @param inputFragment the fragment to parse
     @param context the context element
     @param baseUri the base URI for the fragment
     @param parser the parser driving this parse
     @return the nodes parsed from the fragment
     */
    abstract List<Node> parseFragment(String inputFragment, Element context, String baseUri, Parser parser);

    /**
     Reads tokens from the tokeniser and processes each in turn until the EOF token has been processed, then pops
     whatever remains on the stack.
     */
    void runParser() {
        final Tokeniser tokeniser = this.tokeniser;
        final Token.TokenType eof = Token.TokenType.EOF;

        while (true) {
            Token token = tokeniser.read();
            currentToken = token;
            process(token);
            if (token.type == eof)
                break;
            token.reset(); // the EOF token is deliberately left un-reset: the loop exits before reaching here
        }

        // once we hit the end, pop remaining items off the stack
        while (!stack.isEmpty()) pop();
    }

    /**
     Processes a single token.
     @param token the token to process
     @return the result reported by the implementing TreeBuilder
     */
    abstract boolean process(Token token);

    /**
     Processes a virtual start tag with the supplied name.
     @param name the tag name
     @return the result of {@link #process(Token)} for the start tag
     */
    boolean processStartTag(String name) {
        // these are "virtual" start tags (auto-created by the treebuilder), so not tracking the start position
        final Token.StartTag start = this.start;
        if (currentToken == start) { // don't recycle an in-use token
            return process(new Token.StartTag(this).name(name));
        }
        return process(start.reset().name(name));
    }

    /**
     Processes a virtual start tag with the supplied name and attributes.
     @param name the tag name
     @param attrs the attributes for the tag
     @return the result of {@link #process(Token)} for the start tag
     */
    boolean processStartTag(String name, Attributes attrs) {
        final Token.StartTag start = this.start;
        if (currentToken == start) { // don't recycle an in-use token
            return process(new Token.StartTag(this).nameAttr(name, attrs));
        }
        start.reset();
        start.nameAttr(name, attrs);
        return process(start);
    }

    /**
     Processes a virtual end tag with the supplied name.
     @param name the tag name
     @return the result of {@link #process(Token)} for the end tag
     */
    boolean processEndTag(String name) {
        if (currentToken == end) { // don't recycle an in-use token
            return process(new Token.EndTag(this).name(name));
        }
        return process(end.reset().name(name));
    }

    /**
     Removes the last Element from the stack, hits onNodeClosed, and then returns it.
     @return the element that was removed from the stack
     */
    final Element pop() {
        int size = stack.size();
        Element removed = stack.remove(size - 1);
        onNodeClosed(removed);
        return removed;
    }

    /**
     Adds the specified Element to the end of the stack, and hits onNodeInserted.
     @param element the element to add to the stack
     */
    final void push(Element element) {
        stack.add(element);
        onNodeInserted(element);
    }

    /**
     Get the current element (last on the stack). If all items have been removed, returns the document instead
     (which might not actually be on the stack; use stack.size() == 0 to test if required).
     @return the last element on the stack, if any; or the root document
     */
    Element currentElement() {
        int size = stack.size();
        return size > 0 ? stack.get(size - 1) : doc;
    }

    /**
     Checks if the Current Element's normal name equals the supplied name, in the HTML namespace.
     @param normalName name to check
     @return true if there is a current element on the stack, and its name equals the supplied
     */
    boolean currentElementIs(String normalName) {
        // same test as the namespaced overload, pinned to the HTML namespace
        return currentElementIs(normalName, NamespaceHtml);
    }

    /**
     Checks if the Current Element's normal name equals the supplied name, in the specified namespace.
     @param normalName name to check
     @param namespace the namespace
     @return true if there is a current element on the stack, and its name equals the supplied
     */
    boolean currentElementIs(String normalName, String namespace) {
        if (stack.size() == 0)
            return false;
        Element current = currentElement();
        return current != null && current.normalName().equals(normalName)
            && current.tag().namespace().equals(namespace);
    }

    /**
     * If the parser is tracking errors, add an error at the current position.
     * @param msg error message
     */
    void error(String msg) {
        error(msg, (Object[]) null);
    }

    /**
     * If the parser is tracking errors, add an error at the current position.
     * @param msg error message template
     * @param args template arguments
     */
    void error(String msg, Object... args) {
        ParseErrorList errors = parser.getErrors();
        if (errors.canAddError())
            errors.add(new ParseError(reader, msg, args));
    }

    /**
     (An internal method, visible for Element. For HTML parse, signals that script and style text should be treated as
     Data Nodes). This base implementation always returns false.
     @param normalName the normalized tag name to check
     @return false in this implementation
     */
    boolean isContentForTagData(String normalName) {
        return false;
    }

    /**
     Gets the Tag for the supplied name and namespace, reusing the tag previously seen in this parse when its
     namespace matches.
     @param tagName the tag name; used as the cache key without normalization
     @param namespace the namespace the tag must belong to
     @param settings the parse settings passed to {@link Tag#valueOf}
     @return the cached or newly created tag
     */
    Tag tagFor(String tagName, String namespace, ParseSettings settings) {
        Tag cached = seenTags.get(tagName); // note that we don't normalize the cache key. But tag via valueOf may be normalized.
        if (cached == null || !cached.namespace().equals(namespace)) {
            // only return from cache if the namespace is the same. not running nested cache to save double hit on the common flow
            Tag tag = Tag.valueOf(tagName, namespace, settings);
            seenTags.put(tagName, tag); // replaces any entry cached under the same name for another namespace
            return tag;
        }
        return cached;
    }

    /**
     Gets the Tag for the supplied name in this TreeBuilder's default namespace.
     @param tagName the tag name
     @param settings the parse settings passed to {@link Tag#valueOf}
     @return the cached or newly created tag
     */
    Tag tagFor(String tagName, ParseSettings settings) {
        return tagFor(tagName, defaultNamespace(), settings);
    }

    /**
     Gets the default namespace for this TreeBuilder
     @return the default namespace
     */
    String defaultNamespace() {
        return NamespaceHtml;
    }

    /**
     Called by implementing TreeBuilders when a node has been inserted. This implementation includes optionally tracking
     the source range of the node.
     @param node the node that was just inserted
     */
    void onNodeInserted(Node node) {
        trackNodePosition(node, true);
    }

    /**
     Called by implementing TreeBuilders when a node is explicitly closed. This implementation includes optionally
     tracking the closing source range of the node.
     @param node the node being closed
     */
    void onNodeClosed(Node node) {
        trackNodePosition(node, false);
    }

    /**
     Records the source range of the current token on the node, if source range tracking is enabled. The range is
     stored in the node's attribute user data, under the start range key when opening and the end range key when
     closing.
     @param node the node to annotate
     @param isStart true when the node is being inserted, false when it is being closed
     */
    private void trackNodePosition(Node node, boolean isStart) {
        if (!trackSourceRange) return;

        final Token token = currentToken;
        int startPos = token.startPos();
        int endPos = token.endPos();

        // handle implicit element open / closes.
        if (node instanceof Element) {
            final Element el = (Element) node;
            if (token.isEOF()) {
                if (el.endSourceRange().isTracked())
                    return; // /body and /html are left on stack until EOF, don't reset them
                startPos = endPos = reader.pos();
            } else if (isStart) { // opening tag
                // the current token is not this element's own start tag: record a zero-width range
                if (!token.isStartTag() || !el.normalName().equals(token.asStartTag().normalName)) {
                    endPos = startPos;
                }
            } else { // closing tag
                if (!el.tag().isEmpty() && !el.tag().isSelfClosing()) {
                    // the current token is not this element's own end tag: record a zero-width range
                    if (!token.isEndTag() || !el.normalName().equals(token.asEndTag().normalName)) {
                        endPos = startPos;
                    }
                }
            }
        }

        Range.Position startPosition = new Range.Position
            (startPos, reader.lineNumber(startPos), reader.columnNumber(startPos));
        Range.Position endPosition = new Range.Position
            (endPos, reader.lineNumber(endPos), reader.columnNumber(endPos));
        Range range = new Range(startPosition, endPosition);
        node.attributes().userData(isStart ? SharedConstants.RangeKey : SharedConstants.EndRangeKey, range);
    }
}