• Home
  • Line#
  • Scopes#
  • Navigate#
  • Raw
  • Download
1 package org.jsoup.nodes;
2 
3 import org.jsoup.Connection;
4 import org.jsoup.Jsoup;
5 import org.jsoup.helper.DataUtil;
6 import org.jsoup.helper.Validate;
7 import org.jsoup.internal.StringUtil;
8 import org.jsoup.parser.ParseSettings;
9 import org.jsoup.parser.Parser;
10 import org.jsoup.parser.Tag;
11 import org.jsoup.select.Elements;
12 import org.jsoup.select.Evaluator;
13 import org.jsoup.select.Selector;
14 import org.jspecify.annotations.Nullable;
15 
16 import java.nio.charset.Charset;
17 import java.nio.charset.CharsetEncoder;
18 import java.util.List;
19 
20 import static org.jsoup.parser.Parser.NamespaceHtml;
21 
22 /**
23  A HTML Document.
24 
25  @author Jonathan Hedley, jonathan@hedley.net */
26 public class Document extends Element {
27     private @Nullable Connection connection; // the connection this doc was fetched from, if any
28     private OutputSettings outputSettings = new OutputSettings();
29     private Parser parser; // the parser used to parse this document
30     private QuirksMode quirksMode = QuirksMode.noQuirks;
31     private final String location;
32     private boolean updateMetaCharset = false;
33 
34     /**
35      Create a new, empty Document, in the specified namespace.
36      @param namespace the namespace of this Document's root node.
37      @param baseUri base URI of document
38      @see org.jsoup.Jsoup#parse
39      @see #createShell
40      */
Document(String namespace, String baseUri)41     public Document(String namespace, String baseUri) {
42         super(Tag.valueOf("#root", namespace, ParseSettings.htmlDefault), baseUri);
43         this.location = baseUri;
44         this.parser = Parser.htmlParser(); // default, but overridable
45     }
46 
47     /**
48      Create a new, empty Document, in the HTML namespace.
49      @param baseUri base URI of document
50      @see org.jsoup.Jsoup#parse
51      @see #Document(String namespace, String baseUri)
52      */
Document(String baseUri)53     public Document(String baseUri) {
54         this(NamespaceHtml, baseUri);
55     }
56 
57     /**
58      Create a valid, empty shell of a document, suitable for adding more elements to.
59      @param baseUri baseUri of document
60      @return document with html, head, and body elements.
61      */
createShell(String baseUri)62     public static Document createShell(String baseUri) {
63         Validate.notNull(baseUri);
64 
65         Document doc = new Document(baseUri);
66         doc.parser = doc.parser();
67         Element html = doc.appendElement("html");
68         html.appendElement("head");
69         html.appendElement("body");
70 
71         return doc;
72     }
73 
74     /**
75      * Get the URL this Document was parsed from. If the starting URL is a redirect,
76      * this will return the final URL from which the document was served from.
77      * <p>Will return an empty string if the location is unknown (e.g. if parsed from a String).
78      * @return location
79      */
location()80     public String location() {
81         return location;
82     }
83 
84     /**
85      Returns the Connection (Request/Response) object that was used to fetch this document, if any; otherwise, a new
86      default Connection object. This can be used to continue a session, preserving settings and cookies, etc.
87      @return the Connection (session) associated with this Document, or an empty one otherwise.
88      @see Connection#newRequest()
89      */
connection()90     public Connection connection() {
91         if (connection == null)
92             return Jsoup.newSession();
93         else
94             return connection;
95     }
96 
97     /**
98      * Returns this Document's doctype.
99      * @return document type, or null if not set
100      */
documentType()101     public @Nullable DocumentType documentType() {
102         for (Node node : childNodes) {
103             if (node instanceof DocumentType)
104                 return (DocumentType) node;
105             else if (!(node instanceof LeafNode)) // scans forward across comments, text, processing instructions etc
106                 break;
107         }
108         return null;
109         // todo - add a set document type?
110     }
111 
112     /**
113      Find the root HTML element, or create it if it doesn't exist.
114      @return the root HTML element.
115      */
htmlEl()116     private Element htmlEl() {
117         Element el = firstElementChild();
118         while (el != null) {
119             if (el.nameIs("html"))
120                 return el;
121             el = el.nextElementSibling();
122         }
123         return appendElement("html");
124     }
125 
126     /**
127      Get this document's {@code head} element.
128      <p>
129      As a side-effect, if this Document does not already have a HTML structure, it will be created. If you do not want
130      that, use {@code #selectFirst("head")} instead.
131 
132      @return {@code head} element.
133      */
head()134     public Element head() {
135         final Element html = htmlEl();
136         Element el = html.firstElementChild();
137         while (el != null) {
138             if (el.nameIs("head"))
139                 return el;
140             el = el.nextElementSibling();
141         }
142         return html.prependElement("head");
143     }
144 
145     /**
146      Get this document's {@code <body>} or {@code <frameset>} element.
147      <p>
148      As a <b>side-effect</b>, if this Document does not already have a HTML structure, it will be created with a {@code
149     <body>} element. If you do not want that, use {@code #selectFirst("body")} instead.
150 
151      @return {@code body} element for documents with a {@code <body>}, a new {@code <body>} element if the document
152      had no contents, or the outermost {@code <frameset> element} for frameset documents.
153      */
body()154     public Element body() {
155         final Element html = htmlEl();
156         Element el = html.firstElementChild();
157         while (el != null) {
158             if (el.nameIs("body") || el.nameIs("frameset"))
159                 return el;
160             el = el.nextElementSibling();
161         }
162         return html.appendElement("body");
163     }
164 
165     /**
166      Get each of the {@code <form>} elements contained in this document.
167      @return a List of FormElement objects, which will be empty if there are none.
168      @see Elements#forms()
169      @see FormElement#elements()
170      @since 1.15.4
171      */
forms()172     public List<FormElement> forms() {
173         return select("form").forms();
174     }
175 
176     /**
177      Selects the first {@link FormElement} in this document that matches the query. If none match, throws an
178      {@link IllegalArgumentException}.
179      @param cssQuery a {@link Selector} CSS query
180      @return the first matching {@code <form>} element
181      @throws IllegalArgumentException if no match is found
182      @since 1.15.4
183      */
expectForm(String cssQuery)184     public FormElement expectForm(String cssQuery) {
185         Elements els = select(cssQuery);
186         for (Element el : els) {
187             if (el instanceof FormElement) return (FormElement) el;
188         }
189         Validate.fail("No form elements matched the query '%s' in the document.", cssQuery);
190         return null; // (not really)
191     }
192 
193     /**
194      Get the string contents of the document's {@code title} element.
195      @return Trimmed title, or empty string if none set.
196      */
title()197     public String title() {
198         // title is a preserve whitespace tag (for document output), but normalised here
199         Element titleEl = head().selectFirst(titleEval);
200         return titleEl != null ? StringUtil.normaliseWhitespace(titleEl.text()).trim() : "";
201     }
202     private static final Evaluator titleEval = new Evaluator.Tag("title");
203 
204     /**
205      Set the document's {@code title} element. Updates the existing element, or adds {@code title} to {@code head} if
206      not present
207      @param title string to set as title
208      */
title(String title)209     public void title(String title) {
210         Validate.notNull(title);
211         Element titleEl = head().selectFirst(titleEval);
212         if (titleEl == null) // add to head
213             titleEl = head().appendElement("title");
214         titleEl.text(title);
215     }
216 
217     /**
218      Create a new Element, with this document's base uri. Does not make the new element a child of this document.
219      @param tagName element tag name (e.g. {@code a})
220      @return new element
221      */
createElement(String tagName)222     public Element createElement(String tagName) {
223         return new Element(Tag.valueOf(tagName, parser.defaultNamespace(), ParseSettings.preserveCase), this.baseUri());
224     }
225 
226     @Override
outerHtml()227     public String outerHtml() {
228         return super.html(); // no outer wrapper tag
229     }
230 
231     /**
232      Set the text of the {@code body} of this document. Any existing nodes within the body will be cleared.
233      @param text unencoded text
234      @return this document
235      */
236     @Override
text(String text)237     public Element text(String text) {
238         body().text(text); // overridden to not nuke doc structure
239         return this;
240     }
241 
242     @Override
nodeName()243     public String nodeName() {
244         return "#document";
245     }
246 
247     /**
248      * Sets the charset used in this document. This method is equivalent
249      * to {@link OutputSettings#charset(java.nio.charset.Charset)
250      * OutputSettings.charset(Charset)} but in addition it updates the
251      * charset / encoding element within the document.
252      *
253      * <p>This enables
254      * {@link #updateMetaCharsetElement(boolean) meta charset update}.</p>
255      *
256      * <p>If there's no element with charset / encoding information yet it will
257      * be created. Obsolete charset / encoding definitions are removed!</p>
258      *
259      * <p><b>Elements used:</b></p>
260      *
261      * <ul>
262      * <li><b>Html:</b> <i>&lt;meta charset="CHARSET"&gt;</i></li>
263      * <li><b>Xml:</b> <i>&lt;?xml version="1.0" encoding="CHARSET"&gt;</i></li>
264      * </ul>
265      *
266      * @param charset Charset
267      *
268      * @see #updateMetaCharsetElement(boolean)
269      * @see OutputSettings#charset(java.nio.charset.Charset)
270      */
charset(Charset charset)271     public void charset(Charset charset) {
272         updateMetaCharsetElement(true);
273         outputSettings.charset(charset);
274         ensureMetaCharsetElement();
275     }
276 
277     /**
278      * Returns the charset used in this document. This method is equivalent
279      * to {@link OutputSettings#charset()}.
280      *
281      * @return Current Charset
282      *
283      * @see OutputSettings#charset()
284      */
charset()285     public Charset charset() {
286         return outputSettings.charset();
287     }
288 
289     /**
290      * Sets whether the element with charset information in this document is
291      * updated on changes through {@link #charset(java.nio.charset.Charset)
292      * Document.charset(Charset)} or not.
293      *
294      * <p>If set to <tt>false</tt> <i>(default)</i> there are no elements
295      * modified.</p>
296      *
297      * @param update If <tt>true</tt> the element updated on charset
298      * changes, <tt>false</tt> if not
299      *
300      * @see #charset(java.nio.charset.Charset)
301      */
updateMetaCharsetElement(boolean update)302     public void updateMetaCharsetElement(boolean update) {
303         this.updateMetaCharset = update;
304     }
305 
306     /**
307      * Returns whether the element with charset information in this document is
308      * updated on changes through {@link #charset(java.nio.charset.Charset)
309      * Document.charset(Charset)} or not.
310      *
311      * @return Returns <tt>true</tt> if the element is updated on charset
312      * changes, <tt>false</tt> if not
313      */
updateMetaCharsetElement()314     public boolean updateMetaCharsetElement() {
315         return updateMetaCharset;
316     }
317 
318     @Override
clone()319     public Document clone() {
320         Document clone = (Document) super.clone();
321         clone.outputSettings = this.outputSettings.clone();
322         return clone;
323     }
324 
325     @Override
shallowClone()326     public Document shallowClone() {
327         Document clone = new Document(this.tag().namespace(), baseUri());
328         if (attributes != null)
329             clone.attributes = attributes.clone();
330         clone.outputSettings = this.outputSettings.clone();
331         return clone;
332     }
333 
334     /**
335      * Ensures a meta charset (html) or xml declaration (xml) with the current
336      * encoding used. This only applies with
337      * {@link #updateMetaCharsetElement(boolean) updateMetaCharset} set to
338      * <tt>true</tt>, otherwise this method does nothing.
339      *
340      * <ul>
341      * <li>An existing element gets updated with the current charset</li>
342      * <li>If there's no element yet it will be inserted</li>
343      * <li>Obsolete elements are removed</li>
344      * </ul>
345      *
346      * <p><b>Elements used:</b></p>
347      *
348      * <ul>
349      * <li><b>Html:</b> <i>&lt;meta charset="CHARSET"&gt;</i></li>
350      * <li><b>Xml:</b> <i>&lt;?xml version="1.0" encoding="CHARSET"&gt;</i></li>
351      * </ul>
352      */
ensureMetaCharsetElement()353     private void ensureMetaCharsetElement() {
354         if (updateMetaCharset) {
355             OutputSettings.Syntax syntax = outputSettings().syntax();
356 
357             if (syntax == OutputSettings.Syntax.html) {
358                 Element metaCharset = selectFirst("meta[charset]");
359                 if (metaCharset != null) {
360                     metaCharset.attr("charset", charset().displayName());
361                 } else {
362                     head().appendElement("meta").attr("charset", charset().displayName());
363                 }
364                 select("meta[name=charset]").remove(); // Remove obsolete elements
365             } else if (syntax == OutputSettings.Syntax.xml) {
366                 Node node = ensureChildNodes().get(0);
367                 if (node instanceof XmlDeclaration) {
368                     XmlDeclaration decl = (XmlDeclaration) node;
369                     if (decl.name().equals("xml")) {
370                         decl.attr("encoding", charset().displayName());
371                         if (decl.hasAttr("version"))
372                             decl.attr("version", "1.0");
373                     } else {
374                         decl = new XmlDeclaration("xml", false);
375                         decl.attr("version", "1.0");
376                         decl.attr("encoding", charset().displayName());
377                         prependChild(decl);
378                     }
379                 } else {
380                     XmlDeclaration decl = new XmlDeclaration("xml", false);
381                     decl.attr("version", "1.0");
382                     decl.attr("encoding", charset().displayName());
383                     prependChild(decl);
384                 }
385             }
386         }
387     }
388 
389 
390     /**
391      * A Document's output settings control the form of the text() and html() methods.
392      */
393     public static class OutputSettings implements Cloneable {
394         /**
395          * The output serialization syntax.
396          */
397         public enum Syntax {html, xml}
398 
399         private Entities.EscapeMode escapeMode = Entities.EscapeMode.base;
400         private Charset charset;
401         Entities.CoreCharset coreCharset; // fast encoders for ascii and utf8
402         private final ThreadLocal<CharsetEncoder> encoderThreadLocal = new ThreadLocal<>(); // initialized by start of OuterHtmlVisitor
403 
404         private boolean prettyPrint = true;
405         private boolean outline = false;
406         private int indentAmount = 1;
407         private int maxPaddingWidth = 30;
408         private Syntax syntax = Syntax.html;
409 
OutputSettings()410         public OutputSettings() {
411             charset(DataUtil.UTF_8);
412         }
413 
414         /**
415          * Get the document's current HTML escape mode: <code>base</code>, which provides a limited set of named HTML
416          * entities and escapes other characters as numbered entities for maximum compatibility; or <code>extended</code>,
417          * which uses the complete set of HTML named entities.
418          * <p>
419          * The default escape mode is <code>base</code>.
420          * @return the document's current escape mode
421          */
escapeMode()422         public Entities.EscapeMode escapeMode() {
423             return escapeMode;
424         }
425 
426         /**
427          * Set the document's escape mode, which determines how characters are escaped when the output character set
428          * does not support a given character:- using either a named or a numbered escape.
429          * @param escapeMode the new escape mode to use
430          * @return the document's output settings, for chaining
431          */
escapeMode(Entities.EscapeMode escapeMode)432         public OutputSettings escapeMode(Entities.EscapeMode escapeMode) {
433             this.escapeMode = escapeMode;
434             return this;
435         }
436 
437         /**
438          * Get the document's current output charset, which is used to control which characters are escaped when
439          * generating HTML (via the <code>html()</code> methods), and which are kept intact.
440          * <p>
441          * Where possible (when parsing from a URL or File), the document's output charset is automatically set to the
442          * input charset. Otherwise, it defaults to UTF-8.
443          * @return the document's current charset.
444          */
charset()445         public Charset charset() {
446             return charset;
447         }
448 
449         /**
450          * Update the document's output charset.
451          * @param charset the new charset to use.
452          * @return the document's output settings, for chaining
453          */
charset(Charset charset)454         public OutputSettings charset(Charset charset) {
455             this.charset = charset;
456             coreCharset = Entities.CoreCharset.byName(charset.name());
457             return this;
458         }
459 
460         /**
461          * Update the document's output charset.
462          * @param charset the new charset (by name) to use.
463          * @return the document's output settings, for chaining
464          */
charset(String charset)465         public OutputSettings charset(String charset) {
466             charset(Charset.forName(charset));
467             return this;
468         }
469 
prepareEncoder()470         CharsetEncoder prepareEncoder() {
471             // created at start of OuterHtmlVisitor so each pass has own encoder, so OutputSettings can be shared among threads
472             CharsetEncoder encoder = charset.newEncoder();
473             encoderThreadLocal.set(encoder);
474             return encoder;
475         }
476 
encoder()477         CharsetEncoder encoder() {
478             CharsetEncoder encoder = encoderThreadLocal.get();
479             return encoder != null ? encoder : prepareEncoder();
480         }
481 
482         /**
483          * Get the document's current output syntax.
484          * @return current syntax
485          */
syntax()486         public Syntax syntax() {
487             return syntax;
488         }
489 
490         /**
491          * Set the document's output syntax. Either {@code html}, with empty tags and boolean attributes (etc), or
492          * {@code xml}, with self-closing tags.
493          * <p>When set to {@link Document.OutputSettings.Syntax#xml xml}, the {@link #escapeMode() escapeMode} is
494          * automatically set to {@link Entities.EscapeMode#xhtml}, but may be subsequently changed if desired.</p>
495          * @param syntax serialization syntax
496          * @return the document's output settings, for chaining
497          */
syntax(Syntax syntax)498         public OutputSettings syntax(Syntax syntax) {
499             this.syntax = syntax;
500             if (syntax == Syntax.xml)
501                 this.escapeMode(Entities.EscapeMode.xhtml);
502             return this;
503         }
504 
505         /**
506          * Get if pretty printing is enabled. Default is true. If disabled, the HTML output methods will not re-format
507          * the output, and the output will generally look like the input.
508          * @return if pretty printing is enabled.
509          */
prettyPrint()510         public boolean prettyPrint() {
511             return prettyPrint;
512         }
513 
514         /**
515          * Enable or disable pretty printing.
516          * @param pretty new pretty print setting
517          * @return this, for chaining
518          */
prettyPrint(boolean pretty)519         public OutputSettings prettyPrint(boolean pretty) {
520             prettyPrint = pretty;
521             return this;
522         }
523 
524         /**
525          * Get if outline mode is enabled. Default is false. If enabled, the HTML output methods will consider
526          * all tags as block.
527          * @return if outline mode is enabled.
528          */
outline()529         public boolean outline() {
530             return outline;
531         }
532 
533         /**
534          * Enable or disable HTML outline mode.
535          * @param outlineMode new outline setting
536          * @return this, for chaining
537          */
outline(boolean outlineMode)538         public OutputSettings outline(boolean outlineMode) {
539             outline = outlineMode;
540             return this;
541         }
542 
543         /**
544          * Get the current tag indent amount, used when pretty printing.
545          * @return the current indent amount
546          */
indentAmount()547         public int indentAmount() {
548             return indentAmount;
549         }
550 
551         /**
552          * Set the indent amount for pretty printing
553          * @param indentAmount number of spaces to use for indenting each level. Must be {@literal >=} 0.
554          * @return this, for chaining
555          */
indentAmount(int indentAmount)556         public OutputSettings indentAmount(int indentAmount) {
557             Validate.isTrue(indentAmount >= 0);
558             this.indentAmount = indentAmount;
559             return this;
560         }
561 
562         /**
563          * Get the current max padding amount, used when pretty printing
564          * so very deeply nested nodes don't get insane padding amounts.
565          * @return the current indent amount
566          */
maxPaddingWidth()567         public int maxPaddingWidth() {
568             return maxPaddingWidth;
569         }
570 
571         /**
572          * Set the max padding amount for pretty printing so very deeply nested nodes don't get insane padding amounts.
573          * @param maxPaddingWidth number of spaces to use for indenting each level of nested nodes. Must be {@literal >=} -1.
574          *        Default is 30 and -1 means unlimited.
575          * @return this, for chaining
576          */
maxPaddingWidth(int maxPaddingWidth)577         public OutputSettings maxPaddingWidth(int maxPaddingWidth) {
578             Validate.isTrue(maxPaddingWidth >= -1);
579             this.maxPaddingWidth = maxPaddingWidth;
580             return this;
581         }
582 
583         @Override
clone()584         public OutputSettings clone() {
585             OutputSettings clone;
586             try {
587                 clone = (OutputSettings) super.clone();
588             } catch (CloneNotSupportedException e) {
589                 throw new RuntimeException(e);
590             }
591             clone.charset(charset.name()); // new charset, coreCharset, and charset encoder
592             clone.escapeMode = Entities.EscapeMode.valueOf(escapeMode.name());
593             // indentAmount, maxPaddingWidth, and prettyPrint are primitives so object.clone() will handle
594             return clone;
595         }
596     }
597 
598     /**
599      * Get the document's current output settings.
600      * @return the document's current output settings.
601      */
outputSettings()602     public OutputSettings outputSettings() {
603         return outputSettings;
604     }
605 
606     /**
607      * Set the document's output settings.
608      * @param outputSettings new output settings.
609      * @return this document, for chaining.
610      */
outputSettings(OutputSettings outputSettings)611     public Document outputSettings(OutputSettings outputSettings) {
612         Validate.notNull(outputSettings);
613         this.outputSettings = outputSettings;
614         return this;
615     }
616 
617     public enum QuirksMode {
618         noQuirks, quirks, limitedQuirks
619     }
620 
quirksMode()621     public QuirksMode quirksMode() {
622         return quirksMode;
623     }
624 
quirksMode(QuirksMode quirksMode)625     public Document quirksMode(QuirksMode quirksMode) {
626         this.quirksMode = quirksMode;
627         return this;
628     }
629 
630     /**
631      * Get the parser that was used to parse this document.
632      * @return the parser
633      */
parser()634     public Parser parser() {
635         return parser;
636     }
637 
638     /**
639      * Set the parser used to create this document. This parser is then used when further parsing within this document
640      * is required.
641      * @param parser the configured parser to use when further parsing is required for this document.
642      * @return this document, for chaining.
643      */
parser(Parser parser)644     public Document parser(Parser parser) {
645         this.parser = parser;
646         return this;
647     }
648 
649     /**
650      Set the Connection used to fetch this document. This Connection is used as a session object when further requests are
651      made (e.g. when a form is submitted).
652 
653      @param connection to set
654      @return this document, for chaining
655      @see Connection#newRequest()
656      @since 1.14.1
657      */
connection(Connection connection)658     public Document connection(Connection connection) {
659         Validate.notNull(connection);
660         this.connection = connection;
661         return this;
662     }
663 }
664