• Home
  • Line#
  • Scopes#
  • Navigate#
  • Raw
  • Download
1 package org.jsoup.parser;
2 
3 import org.jsoup.helper.Validate;
4 import org.jsoup.internal.SharedConstants;
5 import org.jsoup.nodes.Attributes;
6 import org.jsoup.nodes.Document;
7 import org.jsoup.nodes.Element;
8 import org.jsoup.nodes.Node;
9 import org.jsoup.nodes.Range;
10 
11 import java.io.Reader;
12 import java.util.ArrayList;
13 import java.util.HashMap;
14 import java.util.List;
15 import java.util.Map;
16 
17 import static org.jsoup.parser.Parser.NamespaceHtml;
18 
19 /**
20  * @author Jonathan Hedley
21  */
22 abstract class TreeBuilder {
23     protected Parser parser;
24     CharacterReader reader;
25     Tokeniser tokeniser;
26     Document doc; // current doc we are building into
27     ArrayList<Element> stack; // the stack of open elements
28     String baseUri; // current base uri, for creating new elements
29     Token currentToken; // currentToken is used for error and source position tracking. Null at start of fragment parse
30     ParseSettings settings;
31     Map<String, Tag> seenTags; // tags we've used in this parse; saves tag GC for custom tags.
32 
33     private Token.StartTag start; // start tag to process
34     private final Token.EndTag end  = new Token.EndTag(this);
defaultSettings()35     abstract ParseSettings defaultSettings();
36 
37     boolean trackSourceRange;  // optionally tracks the source range of nodes and attributes
38 
initialiseParse(Reader input, String baseUri, Parser parser)39     void initialiseParse(Reader input, String baseUri, Parser parser) {
40         Validate.notNullParam(input, "input");
41         Validate.notNullParam(baseUri, "baseUri");
42         Validate.notNull(parser);
43 
44         doc = new Document(parser.defaultNamespace(), baseUri);
45         doc.parser(parser);
46         this.parser = parser;
47         settings = parser.settings();
48         reader = new CharacterReader(input);
49         trackSourceRange = parser.isTrackPosition();
50         reader.trackNewlines(parser.isTrackErrors() || trackSourceRange); // when tracking errors or source ranges, enable newline tracking for better legibility
51         tokeniser = new Tokeniser(this);
52         stack = new ArrayList<>(32);
53         seenTags = new HashMap<>();
54         start = new Token.StartTag(this);
55         currentToken = start; // init current token to the virtual start token.
56         this.baseUri = baseUri;
57     }
58 
parse(Reader input, String baseUri, Parser parser)59     Document parse(Reader input, String baseUri, Parser parser) {
60         initialiseParse(input, baseUri, parser);
61         runParser();
62 
63         // tidy up - as the Parser and Treebuilder are retained in document for settings / fragments
64         reader.close();
65         reader = null;
66         tokeniser = null;
67         stack = null;
68         seenTags = null;
69 
70         return doc;
71     }
72 
73     /**
74      Create a new copy of this TreeBuilder
75      @return copy, ready for a new parse
76      */
newInstance()77     abstract TreeBuilder newInstance();
78 
parseFragment(String inputFragment, Element context, String baseUri, Parser parser)79     abstract List<Node> parseFragment(String inputFragment, Element context, String baseUri, Parser parser);
80 
runParser()81     void runParser() {
82         final Tokeniser tokeniser = this.tokeniser;
83         final Token.TokenType eof = Token.TokenType.EOF;
84 
85         while (true) {
86             Token token = tokeniser.read();
87             currentToken = token;
88             process(token);
89             if (token.type == eof)
90                 break;
91             token.reset();
92         }
93 
94         // once we hit the end, pop remaining items off the stack
95         while (!stack.isEmpty()) pop();
96     }
97 
process(Token token)98     abstract boolean process(Token token);
99 
processStartTag(String name)100     boolean processStartTag(String name) {
101         // these are "virtual" start tags (auto-created by the treebuilder), so not tracking the start position
102         final Token.StartTag start = this.start;
103         if (currentToken == start) { // don't recycle an in-use token
104             return process(new Token.StartTag(this).name(name));
105         }
106         return process(start.reset().name(name));
107     }
108 
processStartTag(String name, Attributes attrs)109     boolean processStartTag(String name, Attributes attrs) {
110         final Token.StartTag start = this.start;
111         if (currentToken == start) { // don't recycle an in-use token
112             return process(new Token.StartTag(this).nameAttr(name, attrs));
113         }
114         start.reset();
115         start.nameAttr(name, attrs);
116         return process(start);
117     }
118 
processEndTag(String name)119     boolean processEndTag(String name) {
120         if (currentToken == end) { // don't recycle an in-use token
121             return process(new Token.EndTag(this).name(name));
122         }
123         return process(end.reset().name(name));
124     }
125 
126     /**
127      Removes the last Element from the stack, hits onNodeClosed, and then returns it.
128      * @return
129      */
pop()130     final Element pop() {
131         int size = stack.size();
132         Element removed = stack.remove(size - 1);
133         onNodeClosed(removed);
134         return removed;
135     }
136 
137     /**
138      Adds the specified Element to the end of the stack, and hits onNodeInserted.
139      * @param element
140      */
push(Element element)141     final void push(Element element) {
142         stack.add(element);
143         onNodeInserted(element);
144     }
145 
146     /**
147      Get the current element (last on the stack). If all items have been removed, returns the document instead
148      (which might not actually be on the stack; use stack.size() == 0 to test if required.
149      @return the last element on the stack, if any; or the root document
150      */
currentElement()151     Element currentElement() {
152         int size = stack.size();
153         return size > 0 ? stack.get(size-1) : doc;
154     }
155 
156     /**
157      Checks if the Current Element's normal name equals the supplied name, in the HTML namespace.
158      @param normalName name to check
159      @return true if there is a current element on the stack, and its name equals the supplied
160      */
currentElementIs(String normalName)161     boolean currentElementIs(String normalName) {
162         if (stack.size() == 0)
163             return false;
164         Element current = currentElement();
165         return current != null && current.normalName().equals(normalName)
166             && current.tag().namespace().equals(NamespaceHtml);
167     }
168 
169     /**
170      Checks if the Current Element's normal name equals the supplied name, in the specified namespace.
171      @param normalName name to check
172      @param namespace the namespace
173      @return true if there is a current element on the stack, and its name equals the supplied
174      */
currentElementIs(String normalName, String namespace)175     boolean currentElementIs(String normalName, String namespace) {
176         if (stack.size() == 0)
177             return false;
178         Element current = currentElement();
179         return current != null && current.normalName().equals(normalName)
180             && current.tag().namespace().equals(namespace);
181     }
182 
183     /**
184      * If the parser is tracking errors, add an error at the current position.
185      * @param msg error message
186      */
error(String msg)187     void error(String msg) {
188         error(msg, (Object[]) null);
189     }
190 
191     /**
192      * If the parser is tracking errors, add an error at the current position.
193      * @param msg error message template
194      * @param args template arguments
195      */
error(String msg, Object... args)196     void error(String msg, Object... args) {
197         ParseErrorList errors = parser.getErrors();
198         if (errors.canAddError())
199             errors.add(new ParseError(reader, msg, args));
200     }
201 
202     /**
203      (An internal method, visible for Element. For HTML parse, signals that script and style text should be treated as
204      Data Nodes).
205      */
isContentForTagData(String normalName)206     boolean isContentForTagData(String normalName) {
207         return false;
208     }
209 
tagFor(String tagName, String namespace, ParseSettings settings)210     Tag tagFor(String tagName, String namespace, ParseSettings settings) {
211         Tag cached = seenTags.get(tagName); // note that we don't normalize the cache key. But tag via valueOf may be normalized.
212         if (cached == null || !cached.namespace().equals(namespace)) {
213             // only return from cache if the namespace is the same. not running nested cache to save double hit on the common flow
214             Tag tag = Tag.valueOf(tagName, namespace, settings);
215             seenTags.put(tagName, tag);
216             return tag;
217         }
218         return cached;
219     }
220 
tagFor(String tagName, ParseSettings settings)221     Tag tagFor(String tagName, ParseSettings settings) {
222         return tagFor(tagName, defaultNamespace(), settings);
223     }
224 
225     /**
226      Gets the default namespace for this TreeBuilder
227      * @return the default namespace
228      */
defaultNamespace()229     String defaultNamespace() {
230         return NamespaceHtml;
231     }
232 
233     /**
234      Called by implementing TreeBuilders when a node has been inserted. This implementation includes optionally tracking
235      the source range of the node.  @param node the node that was just inserted
236      */
onNodeInserted(Node node)237     void onNodeInserted(Node node) {
238         trackNodePosition(node, true);
239     }
240 
241     /**
242      Called by implementing TreeBuilders when a node is explicitly closed. This implementation includes optionally
243      tracking the closing source range of the node.  @param node the node being closed
244      */
onNodeClosed(Node node)245     void onNodeClosed(Node node) {
246         trackNodePosition(node, false);
247     }
248 
trackNodePosition(Node node, boolean isStart)249     private void trackNodePosition(Node node, boolean isStart) {
250         if (!trackSourceRange) return;
251 
252         final Token token = currentToken;
253         int startPos = token.startPos();
254         int endPos = token.endPos();
255 
256         // handle implicit element open / closes.
257         if (node instanceof Element) {
258             final Element el = (Element) node;
259             if (token.isEOF()) {
260                 if (el.endSourceRange().isTracked())
261                     return; // /body and /html are left on stack until EOF, don't reset them
262                 startPos = endPos = reader.pos();
263             } else if (isStart) { // opening tag
264                 if  (!token.isStartTag() || !el.normalName().equals(token.asStartTag().normalName)) {
265                     endPos = startPos;
266                 }
267             } else { // closing tag
268                 if (!el.tag().isEmpty() && !el.tag().isSelfClosing()) {
269                     if (!token.isEndTag() || !el.normalName().equals(token.asEndTag().normalName)) {
270                         endPos = startPos;
271                     }
272                 }
273             }
274         }
275 
276         Range.Position startPosition = new Range.Position
277             (startPos, reader.lineNumber(startPos), reader.columnNumber(startPos));
278         Range.Position endPosition = new Range.Position
279             (endPos, reader.lineNumber(endPos), reader.columnNumber(endPos));
280         Range range = new Range(startPosition, endPosition);
281         node.attributes().userData(isStart ? SharedConstants.RangeKey : SharedConstants.EndRangeKey, range);
282     }
283 }
284