1 package org.jsoup.helper; 2 3 import org.jsoup.Jsoup; 4 import org.jsoup.TextUtil; 5 import org.jsoup.integration.ParseTest; 6 import org.jsoup.nodes.Element; 7 import org.jsoup.nodes.TextNode; 8 import org.junit.jupiter.api.Test; 9 import org.w3c.dom.Document; 10 import org.w3c.dom.Node; 11 import org.w3c.dom.NodeList; 12 import org.xml.sax.InputSource; 13 14 import javax.xml.parsers.DocumentBuilder; 15 import javax.xml.parsers.DocumentBuilderFactory; 16 import javax.xml.transform.OutputKeys; 17 import javax.xml.xpath.XPathConstants; 18 import javax.xml.xpath.XPathExpression; 19 import javax.xml.xpath.XPathExpressionException; 20 import javax.xml.xpath.XPathFactory; 21 import java.io.ByteArrayInputStream; 22 import java.io.File; 23 import java.io.IOException; 24 import java.io.StringReader; 25 import java.nio.charset.StandardCharsets; 26 import java.util.Locale; 27 import java.util.Map; 28 29 import static org.junit.jupiter.api.Assertions.*; 30 31 public class W3CDomTest { 32 parseXml(String xml, boolean nameSpaceAware)33 private static Document parseXml(String xml, boolean nameSpaceAware) { 34 try { 35 DocumentBuilderFactory factory = DocumentBuilderFactory.newInstance(); 36 factory.setNamespaceAware(nameSpaceAware); 37 DocumentBuilder builder = factory.newDocumentBuilder(); 38 builder.setEntityResolver((publicId, systemId) -> { 39 if (systemId.contains("about:legacy-compat")) { // <!doctype html> 40 return new InputSource(new StringReader("")); 41 } else { 42 return null; 43 } 44 }); 45 Document dom = builder.parse(new ByteArrayInputStream(xml.getBytes(StandardCharsets.UTF_8))); 46 dom.normalizeDocument(); 47 return dom; 48 } catch (Exception e) { 49 throw new IllegalStateException(e); 50 } 51 } 52 53 @Test simpleConversion()54 public void simpleConversion() { 55 String html = "<html><head><title>W3c</title></head><body><p class='one' id=12>Text</p><!-- comment --><invalid>What<script>alert('!')"; 56 org.jsoup.nodes.Document doc = Jsoup.parse(html); 57 58 W3CDom w3c = new W3CDom(); 59 Document wDoc = w3c.fromJsoup(doc); 60 NodeList meta = wDoc.getElementsByTagName("META"); 61 assertEquals(0, meta.getLength()); 62 63 String out = W3CDom.asString(wDoc, W3CDom.OutputXml()); 64 String expected = "<?xml version=\"1.0\" encoding=\"UTF-8\"?><html xmlns=\"http://www.w3.org/1999/xhtml\"><head><title>W3c</title></head><body><p class=\"one\" id=\"12\">Text</p><!-- comment --><invalid>What<script>alert('!')</script></invalid></body></html>"; 65 assertEquals(expected, TextUtil.stripNewlines(out)); 66 67 Document roundTrip = parseXml(out, true); 68 assertEquals("Text", roundTrip.getElementsByTagName("p").item(0).getTextContent()); 69 70 // check we can set properties 71 Map<String, String> properties = W3CDom.OutputXml(); 72 properties.put(OutputKeys.INDENT, "yes"); 73 String furtherOut = W3CDom.asString(wDoc, properties); 74 assertTrue(furtherOut.length() > out.length()); // wanted to assert formatting, but actual indentation is platform specific so breaks in CI 75 String furtherExpected = 76 "<?xml version=\"1.0\" encoding=\"UTF-8\"?><html xmlns=\"http://www.w3.org/1999/xhtml\"><head><title>W3c</title></head><body><p class=\"one\" id=\"12\">Text</p><!-- comment --><invalid>What<script>alert('!')</script></invalid></body></html>"; 77 assertEquals(furtherExpected, TextUtil.stripNewlines(furtherOut)); // on windows, DOM will write newlines as \r\n 78 } 79 80 @Test namespacePreservation()81 public void namespacePreservation() throws IOException { 82 File in = ParseTest.getFile("/htmltests/namespaces.xhtml"); 83 org.jsoup.nodes.Document jsoupDoc; 84 jsoupDoc = Jsoup.parse(in, "UTF-8"); 85 86 Document doc; 87 org.jsoup.helper.W3CDom jDom = new org.jsoup.helper.W3CDom(); 88 doc = jDom.fromJsoup(jsoupDoc); 89 90 Node htmlEl = doc.getChildNodes().item(0); 91 assertEquals("http://www.w3.org/1999/xhtml", htmlEl.getNamespaceURI()); 92 assertEquals("html", htmlEl.getLocalName()); 93 assertEquals("html", htmlEl.getNodeName()); 94 95 // inherits default namespace 96 Node head = htmlEl.getFirstChild().getNextSibling(); 97 assertEquals("http://www.w3.org/1999/xhtml", head.getNamespaceURI()); 98 assertEquals("head", head.getLocalName()); 99 assertEquals("head", head.getNodeName()); 100 101 Node epubTitle = htmlEl.getChildNodes().item(3).getChildNodes().item(3); 102 assertEquals("Check", epubTitle.getTextContent()); 103 assertEquals("http://www.idpf.org/2007/ops", epubTitle.getNamespaceURI()); 104 assertEquals("title", epubTitle.getLocalName()); 105 assertEquals("epub:title", epubTitle.getNodeName()); 106 107 Node xSection = epubTitle.getNextSibling().getNextSibling(); 108 assertEquals("urn:test", xSection.getNamespaceURI()); 109 assertEquals("section", xSection.getLocalName()); 110 assertEquals("x:section", xSection.getNodeName()); 111 112 // https://github.com/jhy/jsoup/issues/977 113 // does not keep last set namespace 114 Node svg = xSection.getNextSibling().getNextSibling(); 115 assertEquals("http://www.w3.org/2000/svg", svg.getNamespaceURI()); 116 assertEquals("svg", svg.getLocalName()); 117 assertEquals("svg", svg.getNodeName()); 118 119 Node path = svg.getChildNodes().item(1); 120 assertEquals("http://www.w3.org/2000/svg", path.getNamespaceURI()); 121 assertEquals("path", path.getLocalName()); 122 assertEquals("path", path.getNodeName()); 123 124 Node clip = path.getChildNodes().item(1); 125 assertEquals("http://example.com/clip", clip.getNamespaceURI()); 126 assertEquals("clip", clip.getLocalName()); 127 assertEquals("clip", clip.getNodeName()); 128 assertEquals("456", clip.getTextContent()); 129 130 Node picture = svg.getNextSibling().getNextSibling(); 131 assertEquals("http://www.w3.org/1999/xhtml", picture.getNamespaceURI()); 132 assertEquals("picture", picture.getLocalName()); 133 assertEquals("picture", picture.getNodeName()); 134 135 Node img = picture.getFirstChild(); 136 assertEquals("http://www.w3.org/1999/xhtml", img.getNamespaceURI()); 137 assertEquals("img", img.getLocalName()); 138 assertEquals("img", img.getNodeName()); 139 140 } 141 142 @Test handlesInvalidAttributeNames()143 public void handlesInvalidAttributeNames() { 144 String html = "<html><head></head><body style=\"color: red\" \" name\"></body></html>"; 145 org.jsoup.nodes.Document jsoupDoc; 146 jsoupDoc = Jsoup.parse(html); 147 Element body = jsoupDoc.select("body").first(); 148 assertTrue(body.hasAttr("\"")); // actually an attribute with key '"'. Correct per HTML5 spec, but w3c xml dom doesn't dig it 149 assertTrue(body.hasAttr("name\"")); 150 151 Document w3Doc = W3CDom.convert(jsoupDoc); 152 String xml = W3CDom.asString(w3Doc, W3CDom.OutputXml()); 153 assertEquals("<?xml version=\"1.0\" encoding=\"UTF-8\"?><html xmlns=\"http://www.w3.org/1999/xhtml\"><head/><body name=\"\" style=\"color: red\"/></html>", xml); 154 } 155 156 @Test htmlInputDocMaintainsHtmlAttributeNames()157 public void htmlInputDocMaintainsHtmlAttributeNames() { 158 String html = "<!DOCTYPE html><html><head></head><body><p hành=\"1\" hình=\"2\">unicode attr names</p></body></html>"; 159 org.jsoup.nodes.Document jsoupDoc; 160 jsoupDoc = Jsoup.parse(html); 161 162 Document w3Doc = W3CDom.convert(jsoupDoc); 163 String out = W3CDom.asString(w3Doc, W3CDom.OutputHtml()); 164 String expected = "<!DOCTYPE html SYSTEM \"about:legacy-compat\"><html xmlns=\"http://www.w3.org/1999/xhtml\"><head><META http-equiv=\"Content-Type\" content=\"text/html; charset=UTF-8\"></head><body><p hành=\"1\" hình=\"2\">unicode attr names</p></body></html>"; 165 assertEquals(expected, TextUtil.stripNewlines(out)); 166 } 167 168 @Test xmlInputDocMaintainsHtmlAttributeNames()169 public void xmlInputDocMaintainsHtmlAttributeNames() { 170 String html = "<!DOCTYPE html><html><head></head><body><p hành=\"1\" hình=\"2\">unicode attr names coerced</p></body></html>"; 171 org.jsoup.nodes.Document jsoupDoc; 172 jsoupDoc = Jsoup.parse(html); 173 jsoupDoc.outputSettings().syntax(org.jsoup.nodes.Document.OutputSettings.Syntax.xml); 174 175 Document w3Doc = W3CDom.convert(jsoupDoc); 176 String out = W3CDom.asString(w3Doc, W3CDom.OutputHtml()); 177 String expected = "<!DOCTYPE html SYSTEM \"about:legacy-compat\"><html xmlns=\"http://www.w3.org/1999/xhtml\"><head><META http-equiv=\"Content-Type\" content=\"text/html; charset=UTF-8\"></head><body><p hnh=\"2\">unicode attr names coerced</p></body></html>"; 178 assertEquals(expected, TextUtil.stripNewlines(out)); 179 } 180 181 @Test handlesInvalidTagAsText()182 public void handlesInvalidTagAsText() { 183 org.jsoup.nodes.Document jsoup = Jsoup.parse("<インセンティブで高収入!>Text <p>More</p>"); 184 185 Document w3Doc = W3CDom.convert(jsoup); 186 String xml = W3CDom.asString(w3Doc, W3CDom.OutputXml()); 187 assertEquals("<?xml version=\"1.0\" encoding=\"UTF-8\"?><html xmlns=\"http://www.w3.org/1999/xhtml\"><head/><body><インセンティブで高収入!>Text <p>More</p></body></html>", xml); 188 } 189 190 @Test treatsUndeclaredNamespaceAsLocalName()191 public void treatsUndeclaredNamespaceAsLocalName() { 192 String html = "<fb:like>One</fb:like>"; 193 org.jsoup.nodes.Document doc = Jsoup.parse(html); 194 195 Document w3Doc = new W3CDom().fromJsoup(doc); 196 Node htmlEl = w3Doc.getFirstChild(); 197 198 assertEquals("http://www.w3.org/1999/xhtml", htmlEl.getNamespaceURI()); 199 assertEquals("html", htmlEl.getLocalName()); 200 assertEquals("html", htmlEl.getNodeName()); 201 202 Node fb = htmlEl.getFirstChild().getNextSibling().getFirstChild(); 203 assertNull(fb.getNamespaceURI()); 204 assertEquals("like", fb.getLocalName()); 205 assertEquals("fb:like", fb.getNodeName()); 206 } 207 208 @Test xmlnsXpathTest()209 public void xmlnsXpathTest() throws XPathExpressionException { 210 W3CDom w3c = new W3CDom(); 211 String html = "<html><body><div>hello</div></body></html>"; 212 Document dom = w3c.fromJsoup(Jsoup.parse(html)); 213 NodeList nodeList = xpath(dom, "//*[local-name()=\"body\"]");// namespace aware; HTML namespace is default 214 assertEquals("div", nodeList.item(0).getLocalName()); 215 216 // default output is namespace aware, so query needs to be as well 217 html = "<html xmlns='http://www.w3.org/1999/xhtml'><body id='One'><div>hello</div></body></html>"; 218 dom = w3c.fromJsoup(Jsoup.parse(html)); 219 nodeList = xpath(dom, "//body"); 220 assertNull(nodeList); // no matches 221 222 dom = w3c.fromJsoup(Jsoup.parse(html)); 223 nodeList = xpath(dom, "//*[local-name()=\"body\"]"); 224 assertNotNull(nodeList); 225 assertEquals(1, nodeList.getLength()); 226 assertEquals("div", nodeList.item(0).getLocalName()); 227 assertEquals("http://www.w3.org/1999/xhtml", nodeList.item(0).getNamespaceURI()); 228 assertNull(nodeList.item(0).getPrefix()); 229 230 // get rid of the name space awareness 231 String xml = w3c.asString(dom); 232 dom = parseXml(xml, false); 233 Node item = (Node) xpath(dom, "//body"); 234 assertEquals("body", item.getNodeName()); 235 assertNull(item.getNamespaceURI()); 236 assertNull(item.getPrefix()); 237 238 // put back, will get zero 239 dom = parseXml(xml, true); 240 nodeList = xpath(dom, "//body"); 241 assertNull(nodeList); 242 } 243 244 @Test xhtmlNoNamespace()245 public void xhtmlNoNamespace() throws XPathExpressionException { 246 W3CDom w3c = new W3CDom(); 247 String html = "<html><body><div>hello</div></body></html>"; 248 w3c.namespaceAware(false); 249 Document dom = w3c.fromJsoup(Jsoup.parse(html)); 250 NodeList nodeList = xpath(dom, "//body");// no namespace 251 assertEquals(1, nodeList.getLength()); 252 assertEquals("div", nodeList.item(0).getLocalName()); 253 } 254 255 @Test canDisableNamespaces()256 void canDisableNamespaces() throws XPathExpressionException { 257 W3CDom w3c = new W3CDom(); 258 assertTrue(w3c.namespaceAware()); 259 260 w3c.namespaceAware(false); 261 assertFalse(w3c.namespaceAware()); 262 263 String html = "<html xmlns='http://www.w3.org/1999/xhtml'><body id='One'><div>hello</div></body></html>"; 264 Document dom = w3c.fromJsoup(Jsoup.parse(html)); 265 NodeList nodeList = xpath(dom, "//body");// no ns, so needs no prefix 266 assertEquals("div", nodeList.item(0).getLocalName()); 267 } 268 xpath(Document w3cDoc, String query)269 private NodeList xpath(Document w3cDoc, String query) throws XPathExpressionException { 270 XPathExpression xpath = XPathFactory.newInstance().newXPath().compile(query); 271 return ((NodeList) xpath.evaluate(w3cDoc, XPathConstants.NODE)); 272 } 273 274 @Test testRoundTripDoctype()275 public void testRoundTripDoctype() { 276 // TODO - not super happy with this output - but plain DOM doesn't let it out, and don't want to rebuild the writer 277 // because we have Saxon on the test classpath, the transformer will change to that, and so case may change (e.g. Java base in META, Saxon is meta for HTML) 278 String base = "<!DOCTYPE html><p>One</p>"; 279 assertEqualsIgnoreCase("<!DOCTYPE html SYSTEM \"about:legacy-compat\"><html xmlns=\"http://www.w3.org/1999/xhtml\"><head><META http-equiv=\"Content-Type\" content=\"text/html; charset=UTF-8\"></head><body><p>One</p></body></html>", output(base, true)); 280 assertEqualsIgnoreCase("<?xml version=\"1.0\" encoding=\"UTF-8\"?><!DOCTYPE html SYSTEM \"about:legacy-compat\"><html xmlns=\"http://www.w3.org/1999/xhtml\"><head/><body><p>One</p></body></html>", output(base, false)); 281 282 String publicDoc = "<!DOCTYPE html PUBLIC \"-//W3C//DTD XHTML 1.0 Transitional//EN\" \"http://www.w3.org/TR/xhtml1/DTD/xhtml1-transitional.dtd\">"; 283 assertEqualsIgnoreCase("<!DOCTYPE html PUBLIC \"-//W3C//DTD XHTML 1.0 Transitional//EN\" \"http://www.w3.org/TR/xhtml1/DTD/xhtml1-transitional.dtd\"><html xmlns=\"http://www.w3.org/1999/xhtml\"><head><META http-equiv=\"Content-Type\" content=\"text/html; charset=UTF-8\"></head><body></body></html>", output(publicDoc, true)); 284 // different impls will have different XML formatting. OpenJDK 13 default gives this: <body /> but others have <body/>, so just check start 285 assertTrue(output(publicDoc, false).startsWith("<?xml version=\"1.0\" encoding=\"UTF-8\"?><!DOCTYPE html PUBLIC")); 286 287 String systemDoc = "<!DOCTYPE html SYSTEM \"exampledtdfile.dtd\">"; 288 assertEqualsIgnoreCase("<!DOCTYPE html SYSTEM \"exampledtdfile.dtd\"><html xmlns=\"http://www.w3.org/1999/xhtml\"><head><META http-equiv=\"Content-Type\" content=\"text/html; charset=UTF-8\"></head><body></body></html>", output(systemDoc, true)); 289 assertEqualsIgnoreCase("<?xml version=\"1.0\" encoding=\"UTF-8\"?><!DOCTYPE html SYSTEM \"exampledtdfile.dtd\"><html xmlns=\"http://www.w3.org/1999/xhtml\"><head/><body/></html>", output(systemDoc, false)); 290 291 String legacyDoc = "<!DOCTYPE html SYSTEM \"about:legacy-compat\">"; 292 assertEqualsIgnoreCase("<!DOCTYPE html SYSTEM \"about:legacy-compat\"><html xmlns=\"http://www.w3.org/1999/xhtml\"><head><META http-equiv=\"Content-Type\" content=\"text/html; charset=UTF-8\"></head><body></body></html>", output(legacyDoc, true)); 293 assertEqualsIgnoreCase("<?xml version=\"1.0\" encoding=\"UTF-8\"?><!DOCTYPE html SYSTEM \"about:legacy-compat\"><html xmlns=\"http://www.w3.org/1999/xhtml\"><head/><body/></html>", output(legacyDoc, false)); 294 295 String noDoctype = "<p>One</p>"; 296 assertEqualsIgnoreCase("<html xmlns=\"http://www.w3.org/1999/xhtml\"><head><META http-equiv=\"Content-Type\" content=\"text/html; charset=UTF-8\"></head><body><p>One</p></body></html>", output(noDoctype, true)); 297 assertEqualsIgnoreCase("<?xml version=\"1.0\" encoding=\"UTF-8\"?><html xmlns=\"http://www.w3.org/1999/xhtml\"><head/><body><p>One</p></body></html>", output(noDoctype, false)); 298 } 299 output(String in, boolean modeHtml)300 private String output(String in, boolean modeHtml) { 301 org.jsoup.nodes.Document jdoc = Jsoup.parse(in); 302 Document w3c = W3CDom.convert(jdoc); 303 304 Map<String, String> properties = modeHtml ? W3CDom.OutputHtml() : W3CDom.OutputXml(); 305 return TextUtil.normalizeSpaces(W3CDom.asString(w3c, properties)); 306 } 307 assertEqualsIgnoreCase(String want, String have)308 private void assertEqualsIgnoreCase(String want, String have) { 309 assertEquals(want.toLowerCase(Locale.ROOT), have.toLowerCase(Locale.ROOT)); 310 } 311 312 313 @Test canOutputHtmlWithoutNamespace()314 public void canOutputHtmlWithoutNamespace() { 315 String html = "<p>One</p>"; 316 org.jsoup.nodes.Document jdoc = Jsoup.parse(html); 317 W3CDom w3c = new W3CDom(); 318 w3c.namespaceAware(false); 319 320 String asHtml = W3CDom.asString(w3c.fromJsoup(jdoc), W3CDom.OutputHtml()); 321 String asXtml = W3CDom.asString(w3c.fromJsoup(jdoc), W3CDom.OutputXml()); 322 assertEqualsIgnoreCase( 323 "<html><head><meta http-equiv=\"content-type\" content=\"text/html; charset=utf-8\"></head><body><p>one</p></body></html>", 324 asHtml); 325 assertEqualsIgnoreCase( 326 "<?xml version=\"1.0\" encoding=\"UTF-8\"?><html><head/><body><p>One</p></body></html>", 327 asXtml); 328 } 329 convertsElementsAndMaintainsSource()330 @Test public void convertsElementsAndMaintainsSource() { 331 org.jsoup.nodes.Document jdoc = Jsoup.parse("<body><div><p>One</div><div><p>Two"); 332 W3CDom w3CDom = new W3CDom(); 333 Element jDiv = jdoc.selectFirst("div"); 334 assertNotNull(jDiv); 335 Document doc = w3CDom.fromJsoup(jDiv); 336 Node div = w3CDom.contextNode(doc); 337 338 assertEquals("div", div.getLocalName()); 339 assertEquals(jDiv, div.getUserData(W3CDom.SourceProperty)); 340 341 Node textNode = div.getFirstChild().getFirstChild(); 342 assertEquals("One", textNode.getTextContent()); 343 assertEquals(Node.TEXT_NODE, textNode.getNodeType()); 344 345 org.jsoup.nodes.TextNode jText = (TextNode) jDiv.childNode(0).childNode(0); 346 assertEquals(jText, textNode.getUserData(W3CDom.SourceProperty)); 347 } 348 canXmlParseCdataNodes()349 @Test public void canXmlParseCdataNodes() throws XPathExpressionException { 350 String html = "<p><script>1 && 2</script><style>3 && 4</style> 5 && 6</p>"; 351 org.jsoup.nodes.Document jdoc = Jsoup.parse(html); 352 jdoc.outputSettings().syntax(org.jsoup.nodes.Document.OutputSettings.Syntax.xml); 353 String xml = jdoc.body().html(); 354 assertTrue(xml.contains("<script>//<![CDATA[\n1 && 2\n//]]></script>")); // as asserted in ElementTest 355 Document doc = parseXml(xml, false); 356 NodeList list = xpath(doc, "//script"); 357 assertEquals(2, list.getLength()); 358 Node scriptComment = list.item(0); // will be the cdata node 359 assertEquals("//", scriptComment.getTextContent()); 360 Node script = list.item(1); 361 assertEquals("\n" + 362 "1 && 2\n" + 363 "//", script.getTextContent()); 364 365 } 366 handlesEmptyDoctype()367 @Test public void handlesEmptyDoctype() { 368 String html = "<!doctype>Foo"; 369 org.jsoup.nodes.Document jdoc = Jsoup.parse(html); 370 Document doc = (new W3CDom()).fromJsoup(jdoc); 371 assertNull(doc.getDoctype()); 372 assertEquals("Foo", doc.getFirstChild().getTextContent()); 373 } 374 375 } 376