• Home
  • Line#
  • Scopes#
  • Navigate#
  • Raw
  • Download
1 package org.jsoup.helper;
2 
3 import org.jsoup.Jsoup;
4 import org.jsoup.TextUtil;
5 import org.jsoup.integration.ParseTest;
6 import org.jsoup.nodes.Element;
7 import org.jsoup.nodes.TextNode;
8 import org.junit.jupiter.api.Test;
9 import org.w3c.dom.Document;
10 import org.w3c.dom.Node;
11 import org.w3c.dom.NodeList;
12 import org.xml.sax.InputSource;
13 
14 import javax.xml.parsers.DocumentBuilder;
15 import javax.xml.parsers.DocumentBuilderFactory;
16 import javax.xml.transform.OutputKeys;
17 import javax.xml.xpath.XPathConstants;
18 import javax.xml.xpath.XPathExpression;
19 import javax.xml.xpath.XPathExpressionException;
20 import javax.xml.xpath.XPathFactory;
21 import java.io.ByteArrayInputStream;
22 import java.io.File;
23 import java.io.IOException;
24 import java.io.StringReader;
25 import java.nio.charset.StandardCharsets;
26 import java.util.Locale;
27 import java.util.Map;
28 
29 import static org.junit.jupiter.api.Assertions.*;
30 
31 public class W3CDomTest {
32 
parseXml(String xml, boolean nameSpaceAware)33     private static Document parseXml(String xml, boolean nameSpaceAware) {
34         try {
35             DocumentBuilderFactory factory = DocumentBuilderFactory.newInstance();
36             factory.setNamespaceAware(nameSpaceAware);
37             DocumentBuilder builder = factory.newDocumentBuilder();
38             builder.setEntityResolver((publicId, systemId) -> {
39                 if (systemId.contains("about:legacy-compat")) { // <!doctype html>
40                     return new InputSource(new StringReader(""));
41                 } else {
42                     return null;
43                 }
44             });
45             Document dom = builder.parse(new ByteArrayInputStream(xml.getBytes(StandardCharsets.UTF_8)));
46             dom.normalizeDocument();
47             return dom;
48         } catch (Exception e) {
49             throw new IllegalStateException(e);
50         }
51     }
52 
53     @Test
simpleConversion()54     public void simpleConversion() {
55         String html = "<html><head><title>W3c</title></head><body><p class='one' id=12>Text</p><!-- comment --><invalid>What<script>alert('!')";
56         org.jsoup.nodes.Document doc = Jsoup.parse(html);
57 
58         W3CDom w3c = new W3CDom();
59         Document wDoc = w3c.fromJsoup(doc);
60         NodeList meta = wDoc.getElementsByTagName("META");
61         assertEquals(0, meta.getLength());
62 
63         String out = W3CDom.asString(wDoc, W3CDom.OutputXml());
64         String expected = "<?xml version=\"1.0\" encoding=\"UTF-8\"?><html xmlns=\"http://www.w3.org/1999/xhtml\"><head><title>W3c</title></head><body><p class=\"one\" id=\"12\">Text</p><!-- comment --><invalid>What<script>alert('!')</script></invalid></body></html>";
65         assertEquals(expected, TextUtil.stripNewlines(out));
66 
67         Document roundTrip = parseXml(out, true);
68         assertEquals("Text", roundTrip.getElementsByTagName("p").item(0).getTextContent());
69 
70         // check we can set properties
71         Map<String, String> properties = W3CDom.OutputXml();
72         properties.put(OutputKeys.INDENT, "yes");
73         String furtherOut = W3CDom.asString(wDoc, properties);
74         assertTrue(furtherOut.length() > out.length()); // wanted to assert formatting, but actual indentation is platform specific so breaks in CI
75         String furtherExpected =
76             "<?xml version=\"1.0\" encoding=\"UTF-8\"?><html xmlns=\"http://www.w3.org/1999/xhtml\"><head><title>W3c</title></head><body><p class=\"one\" id=\"12\">Text</p><!-- comment --><invalid>What<script>alert('!')</script></invalid></body></html>";
77         assertEquals(furtherExpected, TextUtil.stripNewlines(furtherOut)); // on windows, DOM will write newlines as \r\n
78     }
79 
80     @Test
namespacePreservation()81     public void namespacePreservation() throws IOException {
82         File in = ParseTest.getFile("/htmltests/namespaces.xhtml");
83         org.jsoup.nodes.Document jsoupDoc;
84         jsoupDoc = Jsoup.parse(in, "UTF-8");
85 
86         Document doc;
87         org.jsoup.helper.W3CDom jDom = new org.jsoup.helper.W3CDom();
88         doc = jDom.fromJsoup(jsoupDoc);
89 
90         Node htmlEl = doc.getChildNodes().item(0);
91         assertEquals("http://www.w3.org/1999/xhtml", htmlEl.getNamespaceURI());
92         assertEquals("html", htmlEl.getLocalName());
93         assertEquals("html", htmlEl.getNodeName());
94 
95         // inherits default namespace
96         Node head = htmlEl.getFirstChild().getNextSibling();
97         assertEquals("http://www.w3.org/1999/xhtml", head.getNamespaceURI());
98         assertEquals("head", head.getLocalName());
99         assertEquals("head", head.getNodeName());
100 
101         Node epubTitle = htmlEl.getChildNodes().item(3).getChildNodes().item(3);
102         assertEquals("Check", epubTitle.getTextContent());
103         assertEquals("http://www.idpf.org/2007/ops", epubTitle.getNamespaceURI());
104         assertEquals("title", epubTitle.getLocalName());
105         assertEquals("epub:title", epubTitle.getNodeName());
106 
107         Node xSection = epubTitle.getNextSibling().getNextSibling();
108         assertEquals("urn:test", xSection.getNamespaceURI());
109         assertEquals("section", xSection.getLocalName());
110         assertEquals("x:section", xSection.getNodeName());
111 
112         // https://github.com/jhy/jsoup/issues/977
113         // does not keep last set namespace
114         Node svg = xSection.getNextSibling().getNextSibling();
115         assertEquals("http://www.w3.org/2000/svg", svg.getNamespaceURI());
116         assertEquals("svg", svg.getLocalName());
117         assertEquals("svg", svg.getNodeName());
118 
119         Node path = svg.getChildNodes().item(1);
120         assertEquals("http://www.w3.org/2000/svg", path.getNamespaceURI());
121         assertEquals("path", path.getLocalName());
122         assertEquals("path", path.getNodeName());
123 
124         Node clip = path.getChildNodes().item(1);
125         assertEquals("http://example.com/clip", clip.getNamespaceURI());
126         assertEquals("clip", clip.getLocalName());
127         assertEquals("clip", clip.getNodeName());
128         assertEquals("456", clip.getTextContent());
129 
130         Node picture = svg.getNextSibling().getNextSibling();
131         assertEquals("http://www.w3.org/1999/xhtml", picture.getNamespaceURI());
132         assertEquals("picture", picture.getLocalName());
133         assertEquals("picture", picture.getNodeName());
134 
135         Node img = picture.getFirstChild();
136         assertEquals("http://www.w3.org/1999/xhtml", img.getNamespaceURI());
137         assertEquals("img", img.getLocalName());
138         assertEquals("img", img.getNodeName());
139 
140     }
141 
142     @Test
handlesInvalidAttributeNames()143     public void handlesInvalidAttributeNames() {
144         String html = "<html><head></head><body style=\"color: red\" \" name\"></body></html>";
145         org.jsoup.nodes.Document jsoupDoc;
146         jsoupDoc = Jsoup.parse(html);
147         Element body = jsoupDoc.select("body").first();
148         assertTrue(body.hasAttr("\"")); // actually an attribute with key '"'. Correct per HTML5 spec, but w3c xml dom doesn't dig it
149         assertTrue(body.hasAttr("name\""));
150 
151         Document w3Doc = W3CDom.convert(jsoupDoc);
152         String xml = W3CDom.asString(w3Doc, W3CDom.OutputXml());
153         assertEquals("<?xml version=\"1.0\" encoding=\"UTF-8\"?><html xmlns=\"http://www.w3.org/1999/xhtml\"><head/><body name=\"\" style=\"color: red\"/></html>", xml);
154     }
155 
156     @Test
htmlInputDocMaintainsHtmlAttributeNames()157     public void htmlInputDocMaintainsHtmlAttributeNames() {
158         String html = "<!DOCTYPE html><html><head></head><body><p hành=\"1\" hình=\"2\">unicode attr names</p></body></html>";
159         org.jsoup.nodes.Document jsoupDoc;
160         jsoupDoc = Jsoup.parse(html);
161 
162         Document w3Doc = W3CDom.convert(jsoupDoc);
163         String out = W3CDom.asString(w3Doc, W3CDom.OutputHtml());
164         String expected = "<!DOCTYPE html SYSTEM \"about:legacy-compat\"><html xmlns=\"http://www.w3.org/1999/xhtml\"><head><META http-equiv=\"Content-Type\" content=\"text/html; charset=UTF-8\"></head><body><p hành=\"1\" hình=\"2\">unicode attr names</p></body></html>";
165         assertEquals(expected, TextUtil.stripNewlines(out));
166     }
167 
168     @Test
xmlInputDocMaintainsHtmlAttributeNames()169     public void xmlInputDocMaintainsHtmlAttributeNames() {
170         String html = "<!DOCTYPE html><html><head></head><body><p hành=\"1\" hình=\"2\">unicode attr names coerced</p></body></html>";
171         org.jsoup.nodes.Document jsoupDoc;
172         jsoupDoc = Jsoup.parse(html);
173         jsoupDoc.outputSettings().syntax(org.jsoup.nodes.Document.OutputSettings.Syntax.xml);
174 
175         Document w3Doc = W3CDom.convert(jsoupDoc);
176         String out = W3CDom.asString(w3Doc, W3CDom.OutputHtml());
177         String expected = "<!DOCTYPE html SYSTEM \"about:legacy-compat\"><html xmlns=\"http://www.w3.org/1999/xhtml\"><head><META http-equiv=\"Content-Type\" content=\"text/html; charset=UTF-8\"></head><body><p hnh=\"2\">unicode attr names coerced</p></body></html>";
178         assertEquals(expected, TextUtil.stripNewlines(out));
179     }
180 
181     @Test
handlesInvalidTagAsText()182     public void handlesInvalidTagAsText() {
183         org.jsoup.nodes.Document jsoup = Jsoup.parse("<インセンティブで高収入!>Text <p>More</p>");
184 
185         Document w3Doc = W3CDom.convert(jsoup);
186         String xml = W3CDom.asString(w3Doc, W3CDom.OutputXml());
187         assertEquals("<?xml version=\"1.0\" encoding=\"UTF-8\"?><html xmlns=\"http://www.w3.org/1999/xhtml\"><head/><body>&lt;インセンティブで高収入!&gt;Text <p>More</p></body></html>", xml);
188     }
189 
190     @Test
treatsUndeclaredNamespaceAsLocalName()191     public void treatsUndeclaredNamespaceAsLocalName() {
192         String html = "<fb:like>One</fb:like>";
193         org.jsoup.nodes.Document doc = Jsoup.parse(html);
194 
195         Document w3Doc = new W3CDom().fromJsoup(doc);
196         Node htmlEl = w3Doc.getFirstChild();
197 
198         assertEquals("http://www.w3.org/1999/xhtml", htmlEl.getNamespaceURI());
199         assertEquals("html", htmlEl.getLocalName());
200         assertEquals("html", htmlEl.getNodeName());
201 
202         Node fb = htmlEl.getFirstChild().getNextSibling().getFirstChild();
203         assertNull(fb.getNamespaceURI());
204         assertEquals("like", fb.getLocalName());
205         assertEquals("fb:like", fb.getNodeName());
206     }
207 
208     @Test
xmlnsXpathTest()209     public void xmlnsXpathTest() throws XPathExpressionException {
210         W3CDom w3c = new W3CDom();
211         String html = "<html><body><div>hello</div></body></html>";
212         Document dom = w3c.fromJsoup(Jsoup.parse(html));
213         NodeList nodeList = xpath(dom, "//*[local-name()=\"body\"]");// namespace aware; HTML namespace is default
214         assertEquals("div", nodeList.item(0).getLocalName());
215 
216         // default output is namespace aware, so query needs to be as well
217         html = "<html xmlns='http://www.w3.org/1999/xhtml'><body id='One'><div>hello</div></body></html>";
218         dom = w3c.fromJsoup(Jsoup.parse(html));
219         nodeList = xpath(dom, "//body");
220         assertNull(nodeList); // no matches
221 
222         dom = w3c.fromJsoup(Jsoup.parse(html));
223         nodeList = xpath(dom, "//*[local-name()=\"body\"]");
224         assertNotNull(nodeList);
225         assertEquals(1, nodeList.getLength());
226         assertEquals("div", nodeList.item(0).getLocalName());
227         assertEquals("http://www.w3.org/1999/xhtml", nodeList.item(0).getNamespaceURI());
228         assertNull(nodeList.item(0).getPrefix());
229 
230         // get rid of the name space awareness
231         String xml = w3c.asString(dom);
232         dom = parseXml(xml, false);
233         Node item = (Node) xpath(dom, "//body");
234         assertEquals("body", item.getNodeName());
235         assertNull(item.getNamespaceURI());
236         assertNull(item.getPrefix());
237 
238         // put back, will get zero
239         dom = parseXml(xml, true);
240         nodeList = xpath(dom, "//body");
241         assertNull(nodeList);
242     }
243 
244     @Test
xhtmlNoNamespace()245     public void xhtmlNoNamespace() throws XPathExpressionException {
246         W3CDom w3c = new W3CDom();
247         String html = "<html><body><div>hello</div></body></html>";
248         w3c.namespaceAware(false);
249         Document dom = w3c.fromJsoup(Jsoup.parse(html));
250         NodeList nodeList = xpath(dom, "//body");// no namespace
251         assertEquals(1, nodeList.getLength());
252         assertEquals("div", nodeList.item(0).getLocalName());
253     }
254 
255     @Test
canDisableNamespaces()256     void canDisableNamespaces() throws XPathExpressionException {
257         W3CDom w3c = new W3CDom();
258         assertTrue(w3c.namespaceAware());
259 
260         w3c.namespaceAware(false);
261         assertFalse(w3c.namespaceAware());
262 
263         String html = "<html xmlns='http://www.w3.org/1999/xhtml'><body id='One'><div>hello</div></body></html>";
264         Document dom = w3c.fromJsoup(Jsoup.parse(html));
265         NodeList nodeList = xpath(dom, "//body");// no ns, so needs no prefix
266         assertEquals("div", nodeList.item(0).getLocalName());
267     }
268 
xpath(Document w3cDoc, String query)269     private NodeList xpath(Document w3cDoc, String query) throws XPathExpressionException {
270         XPathExpression xpath = XPathFactory.newInstance().newXPath().compile(query);
271         return ((NodeList) xpath.evaluate(w3cDoc, XPathConstants.NODE));
272     }
273 
274     @Test
testRoundTripDoctype()275     public void testRoundTripDoctype() {
276         // TODO - not super happy with this output - but plain DOM doesn't let it out, and don't want to rebuild the writer
277         // because we have Saxon on the test classpath, the transformer will change to that, and so case may change (e.g. Java base in META, Saxon is meta for HTML)
278         String base = "<!DOCTYPE html><p>One</p>";
279         assertEqualsIgnoreCase("<!DOCTYPE html SYSTEM \"about:legacy-compat\"><html xmlns=\"http://www.w3.org/1999/xhtml\"><head><META http-equiv=\"Content-Type\" content=\"text/html; charset=UTF-8\"></head><body><p>One</p></body></html>", output(base, true));
280         assertEqualsIgnoreCase("<?xml version=\"1.0\" encoding=\"UTF-8\"?><!DOCTYPE html SYSTEM \"about:legacy-compat\"><html xmlns=\"http://www.w3.org/1999/xhtml\"><head/><body><p>One</p></body></html>", output(base, false));
281 
282         String publicDoc = "<!DOCTYPE html PUBLIC \"-//W3C//DTD XHTML 1.0 Transitional//EN\" \"http://www.w3.org/TR/xhtml1/DTD/xhtml1-transitional.dtd\">";
283         assertEqualsIgnoreCase("<!DOCTYPE html PUBLIC \"-//W3C//DTD XHTML 1.0 Transitional//EN\" \"http://www.w3.org/TR/xhtml1/DTD/xhtml1-transitional.dtd\"><html xmlns=\"http://www.w3.org/1999/xhtml\"><head><META http-equiv=\"Content-Type\" content=\"text/html; charset=UTF-8\"></head><body></body></html>", output(publicDoc, true));
284         // different impls will have different XML formatting. OpenJDK 13 default gives this: <body /> but others have <body/>, so just check start
285         assertTrue(output(publicDoc, false).startsWith("<?xml version=\"1.0\" encoding=\"UTF-8\"?><!DOCTYPE html PUBLIC"));
286 
287         String systemDoc = "<!DOCTYPE html SYSTEM \"exampledtdfile.dtd\">";
288         assertEqualsIgnoreCase("<!DOCTYPE html SYSTEM \"exampledtdfile.dtd\"><html xmlns=\"http://www.w3.org/1999/xhtml\"><head><META http-equiv=\"Content-Type\" content=\"text/html; charset=UTF-8\"></head><body></body></html>", output(systemDoc, true));
289         assertEqualsIgnoreCase("<?xml version=\"1.0\" encoding=\"UTF-8\"?><!DOCTYPE html SYSTEM \"exampledtdfile.dtd\"><html xmlns=\"http://www.w3.org/1999/xhtml\"><head/><body/></html>", output(systemDoc, false));
290 
291         String legacyDoc = "<!DOCTYPE html SYSTEM \"about:legacy-compat\">";
292         assertEqualsIgnoreCase("<!DOCTYPE html SYSTEM \"about:legacy-compat\"><html xmlns=\"http://www.w3.org/1999/xhtml\"><head><META http-equiv=\"Content-Type\" content=\"text/html; charset=UTF-8\"></head><body></body></html>", output(legacyDoc, true));
293         assertEqualsIgnoreCase("<?xml version=\"1.0\" encoding=\"UTF-8\"?><!DOCTYPE html SYSTEM \"about:legacy-compat\"><html xmlns=\"http://www.w3.org/1999/xhtml\"><head/><body/></html>", output(legacyDoc, false));
294 
295         String noDoctype = "<p>One</p>";
296         assertEqualsIgnoreCase("<html xmlns=\"http://www.w3.org/1999/xhtml\"><head><META http-equiv=\"Content-Type\" content=\"text/html; charset=UTF-8\"></head><body><p>One</p></body></html>", output(noDoctype, true));
297         assertEqualsIgnoreCase("<?xml version=\"1.0\" encoding=\"UTF-8\"?><html xmlns=\"http://www.w3.org/1999/xhtml\"><head/><body><p>One</p></body></html>", output(noDoctype, false));
298     }
299 
output(String in, boolean modeHtml)300     private String output(String in, boolean modeHtml) {
301         org.jsoup.nodes.Document jdoc = Jsoup.parse(in);
302         Document w3c = W3CDom.convert(jdoc);
303 
304         Map<String, String> properties = modeHtml ? W3CDom.OutputHtml() : W3CDom.OutputXml();
305         return TextUtil.normalizeSpaces(W3CDom.asString(w3c, properties));
306     }
307 
assertEqualsIgnoreCase(String want, String have)308     private void assertEqualsIgnoreCase(String want, String have) {
309         assertEquals(want.toLowerCase(Locale.ROOT), have.toLowerCase(Locale.ROOT));
310     }
311 
312 
313     @Test
canOutputHtmlWithoutNamespace()314     public void canOutputHtmlWithoutNamespace() {
315         String html = "<p>One</p>";
316         org.jsoup.nodes.Document jdoc = Jsoup.parse(html);
317         W3CDom w3c = new W3CDom();
318         w3c.namespaceAware(false);
319 
320         String asHtml = W3CDom.asString(w3c.fromJsoup(jdoc), W3CDom.OutputHtml());
321         String asXtml = W3CDom.asString(w3c.fromJsoup(jdoc), W3CDom.OutputXml());
322         assertEqualsIgnoreCase(
323             "<html><head><meta http-equiv=\"content-type\" content=\"text/html; charset=utf-8\"></head><body><p>one</p></body></html>",
324             asHtml);
325         assertEqualsIgnoreCase(
326             "<?xml version=\"1.0\" encoding=\"UTF-8\"?><html><head/><body><p>One</p></body></html>",
327             asXtml);
328     }
329 
convertsElementsAndMaintainsSource()330     @Test public void convertsElementsAndMaintainsSource() {
331         org.jsoup.nodes.Document jdoc = Jsoup.parse("<body><div><p>One</div><div><p>Two");
332         W3CDom w3CDom = new W3CDom();
333         Element jDiv = jdoc.selectFirst("div");
334         assertNotNull(jDiv);
335         Document doc = w3CDom.fromJsoup(jDiv);
336         Node div = w3CDom.contextNode(doc);
337 
338         assertEquals("div", div.getLocalName());
339         assertEquals(jDiv, div.getUserData(W3CDom.SourceProperty));
340 
341         Node textNode = div.getFirstChild().getFirstChild();
342         assertEquals("One", textNode.getTextContent());
343         assertEquals(Node.TEXT_NODE, textNode.getNodeType());
344 
345         org.jsoup.nodes.TextNode jText = (TextNode) jDiv.childNode(0).childNode(0);
346         assertEquals(jText, textNode.getUserData(W3CDom.SourceProperty));
347     }
348 
canXmlParseCdataNodes()349     @Test public void canXmlParseCdataNodes() throws XPathExpressionException {
350         String html = "<p><script>1 && 2</script><style>3 && 4</style> 5 &amp;&amp; 6</p>";
351         org.jsoup.nodes.Document jdoc = Jsoup.parse(html);
352         jdoc.outputSettings().syntax(org.jsoup.nodes.Document.OutputSettings.Syntax.xml);
353         String xml = jdoc.body().html();
354         assertTrue(xml.contains("<script>//<![CDATA[\n1 && 2\n//]]></script>")); // as asserted in ElementTest
355         Document doc = parseXml(xml, false);
356         NodeList list = xpath(doc, "//script");
357         assertEquals(2, list.getLength());
358         Node scriptComment = list.item(0); // will be the cdata node
359         assertEquals("//", scriptComment.getTextContent());
360         Node script = list.item(1);
361         assertEquals("\n" +
362             "1 && 2\n" +
363             "//", script.getTextContent());
364 
365     }
366 
handlesEmptyDoctype()367     @Test public void handlesEmptyDoctype() {
368         String html = "<!doctype>Foo";
369         org.jsoup.nodes.Document jdoc = Jsoup.parse(html);
370         Document doc = (new W3CDom()).fromJsoup(jdoc);
371         assertNull(doc.getDoctype());
372         assertEquals("Foo", doc.getFirstChild().getTextContent());
373     }
374 
375 }
376