1 package org.jsoup.integration; 2 3 import org.jsoup.Jsoup; 4 import org.jsoup.helper.DataUtil; 5 import org.jsoup.nodes.Document; 6 import org.jsoup.nodes.Element; 7 import org.jsoup.parser.ParseErrorList; 8 import org.jsoup.parser.Parser; 9 import org.jsoup.select.Elements; 10 import org.junit.jupiter.api.Test; 11 12 import java.io.*; 13 import java.net.URISyntaxException; 14 import java.net.URL; 15 import java.nio.ByteBuffer; 16 import java.nio.charset.StandardCharsets; 17 import java.nio.file.Files; 18 import java.util.zip.GZIPInputStream; 19 20 import static org.junit.jupiter.api.Assertions.*; 21 22 /** 23 * Integration test: parses from real-world example HTML. 24 * 25 * @author Jonathan Hedley, jonathan@hedley.net 26 */ 27 public class ParseTest { 28 @Test testHtml5Charset()29 public void testHtml5Charset() throws IOException { 30 // test that <meta charset="gb2312"> works 31 File in = getFile("/htmltests/meta-charset-1.html"); 32 Document doc = Jsoup.parse(in, null, "http://example.com/"); //gb2312, has html5 <meta charset> 33 assertEquals("新", doc.text()); 34 assertEquals("GB2312", doc.outputSettings().charset().displayName()); 35 36 // double check, no charset, falls back to utf8 which is incorrect 37 in = getFile("/htmltests/meta-charset-2.html"); // 38 doc = Jsoup.parse(in, null, "http://example.com"); // gb2312, no charset 39 assertEquals("UTF-8", doc.outputSettings().charset().displayName()); 40 assertNotEquals("新", doc.text()); 41 42 // confirm fallback to utf8 43 in = getFile("/htmltests/meta-charset-3.html"); 44 doc = Jsoup.parse(in, null, "http://example.com/"); // utf8, no charset 45 assertEquals("UTF-8", doc.outputSettings().charset().displayName()); 46 assertEquals("新", doc.text()); 47 } 48 49 @Test testBrokenHtml5CharsetWithASingleDoubleQuote()50 public void testBrokenHtml5CharsetWithASingleDoubleQuote() throws IOException { 51 InputStream in = inputStreamFrom("<html>\n" + 52 "<head><meta charset=UTF-8\"></head>\n" + 53 "<body></body>\n" + 54 "</html>"); 55 Document doc = Jsoup.parse(in, null, "http://example.com/"); 56 assertEquals("UTF-8", doc.outputSettings().charset().displayName()); 57 } 58 59 @Test testLowercaseUtf8Charset()60 public void testLowercaseUtf8Charset() throws IOException { 61 File in = getFile("/htmltests/lowercase-charset-test.html"); 62 Document doc = Jsoup.parse(in, null); 63 64 Element form = doc.select("#form").first(); 65 assertEquals(2, form.children().size()); 66 assertEquals("UTF-8", doc.outputSettings().charset().name()); 67 } 68 69 @Test testXwiki()70 public void testXwiki() throws IOException { 71 // https://github.com/jhy/jsoup/issues/1324 72 // this tests that when in CharacterReader we hit a buffer while marked, we preserve the mark when buffered up and can rewind 73 File in = getFile("/htmltests/xwiki-1324.html.gz"); 74 Document doc = Jsoup.parse(in, null, "https://localhost/"); 75 assertEquals("XWiki Jetty HSQLDB 12.1-SNAPSHOT", doc.select("#xwikiplatformversion").text()); 76 77 // was getting busted at =userdirectory, because it hit the bufferup point but the mark was then lost. so 78 // updated to preserve the mark. 79 String wantHtml = "<a class=\"list-group-item\" data-id=\"userdirectory\" href=\"/xwiki/bin/admin/XWiki/XWikiPreferences?editor=globaladmin&section=userdirectory\" title=\"Customize the user directory live table.\">User Directory</a>"; 80 assertEquals(wantHtml, doc.select("[data-id=userdirectory]").outerHtml()); 81 } 82 83 @Test testXwikiExpanded()84 public void testXwikiExpanded() throws IOException { 85 // https://github.com/jhy/jsoup/issues/1324 86 // this tests that if there is a huge illegal character reference, we can get through a buffer and rewind, and still catch that it's an invalid refence, 87 // and the parse tree is correct. 88 File in = getFile("/htmltests/xwiki-edit.html.gz"); 89 Parser parser = Parser.htmlParser(); 90 Document doc = Jsoup.parse(new GZIPInputStream(new FileInputStream(in)), "UTF-8", "https://localhost/", parser.setTrackErrors(100)); 91 ParseErrorList errors = parser.getErrors(); 92 93 assertEquals("XWiki Jetty HSQLDB 12.1-SNAPSHOT", doc.select("#xwikiplatformversion").text()); 94 assertEquals(0, errors.size()); // not an invalid reference because did not look legit 95 96 // was getting busted at =userdirectory, because it hit the bufferup point but the mark was then lost. so 97 // updated to preserve the mark. 98 String wantHtml = "<a class=\"list-group-item\" data-id=\"userdirectory\" href=\"/xwiki/bin/admin/XWiki/XWikiPreferences?editor=globaladmin&RIGHTHERERIGHTHERERIGHTHERERIGHTHERE"; 99 assertTrue(doc.select("[data-id=userdirectory]").outerHtml().startsWith(wantHtml)); 100 } 101 testWikiExpandedFromString()102 @Test public void testWikiExpandedFromString() throws IOException { 103 File in = getFile("/htmltests/xwiki-edit.html.gz"); 104 String html = getFileAsString(in); 105 Document doc = Jsoup.parse(html); 106 assertEquals("XWiki Jetty HSQLDB 12.1-SNAPSHOT", doc.select("#xwikiplatformversion").text()); 107 String wantHtml = "<a class=\"list-group-item\" data-id=\"userdirectory\" href=\"/xwiki/bin/admin/XWiki/XWikiPreferences?editor=globaladmin&RIGHTHERERIGHTHERERIGHTHERERIGHTHERE"; 108 assertTrue(doc.select("[data-id=userdirectory]").outerHtml().startsWith(wantHtml)); 109 } 110 testWikiFromString()111 @Test public void testWikiFromString() throws IOException { 112 File in = getFile("/htmltests/xwiki-1324.html.gz"); 113 String html = getFileAsString(in); 114 Document doc = Jsoup.parse(html); 115 assertEquals("XWiki Jetty HSQLDB 12.1-SNAPSHOT", doc.select("#xwikiplatformversion").text()); 116 String wantHtml = "<a class=\"list-group-item\" data-id=\"userdirectory\" href=\"/xwiki/bin/admin/XWiki/XWikiPreferences?editor=globaladmin&section=userdirectory\" title=\"Customize the user directory live table.\">User Directory</a>"; 117 assertEquals(wantHtml, doc.select("[data-id=userdirectory]").outerHtml()); 118 } 119 testFileParseNoCharsetMethod()120 @Test public void testFileParseNoCharsetMethod() throws IOException { 121 File in = getFile("/htmltests/xwiki-1324.html.gz"); 122 Document doc = Jsoup.parse(in); 123 assertEquals("XWiki Jetty HSQLDB 12.1-SNAPSHOT", doc.select("#xwikiplatformversion").text()); 124 } 125 126 getFile(String resourceName)127 public static File getFile(String resourceName) { 128 try { 129 URL resource = ParseTest.class.getResource(resourceName); 130 return resource != null ? new File(resource.toURI()) : new File("/404"); 131 } catch (URISyntaxException e) { 132 throw new IllegalStateException(e); 133 } 134 } 135 inputStreamFrom(String s)136 public static InputStream inputStreamFrom(String s) { 137 return new ByteArrayInputStream(s.getBytes(StandardCharsets.UTF_8)); 138 } 139 getFileAsString(File file)140 public static String getFileAsString(File file) throws IOException { 141 byte[] bytes; 142 if (file.getName().endsWith(".gz")) { 143 InputStream stream = new GZIPInputStream(new FileInputStream(file)); 144 ByteBuffer byteBuffer = DataUtil.readToByteBuffer(stream, 0); 145 bytes = byteBuffer.array(); 146 } else { 147 bytes = Files.readAllBytes(file.toPath()); 148 } 149 return new String(bytes); 150 } 151 152 } 153