• Home
  • Line#
  • Scopes#
  • Navigate#
  • Raw
  • Download
1 package org.jsoup.integration;
2 
3 import org.jsoup.Jsoup;
4 import org.jsoup.helper.DataUtil;
5 import org.jsoup.nodes.Document;
6 import org.jsoup.nodes.Element;
7 import org.jsoup.parser.ParseErrorList;
8 import org.jsoup.parser.Parser;
9 import org.jsoup.select.Elements;
10 import org.junit.jupiter.api.Test;
11 
12 import java.io.*;
13 import java.net.URISyntaxException;
14 import java.net.URL;
15 import java.nio.ByteBuffer;
16 import java.nio.charset.StandardCharsets;
17 import java.nio.file.Files;
18 import java.util.zip.GZIPInputStream;
19 
20 import static org.junit.jupiter.api.Assertions.*;
21 
22 /**
23  * Integration test: parses from real-world example HTML.
24  *
25  * @author Jonathan Hedley, jonathan@hedley.net
26  */
27 public class ParseTest {
28     @Test
testHtml5Charset()29     public void testHtml5Charset() throws IOException {
30         // test that <meta charset="gb2312"> works
31         File in = getFile("/htmltests/meta-charset-1.html");
32         Document doc = Jsoup.parse(in, null, "http://example.com/"); //gb2312, has html5 <meta charset>
33         assertEquals("新", doc.text());
34         assertEquals("GB2312", doc.outputSettings().charset().displayName());
35 
36         // double check, no charset, falls back to utf8 which is incorrect
37         in = getFile("/htmltests/meta-charset-2.html"); //
38         doc = Jsoup.parse(in, null, "http://example.com"); // gb2312, no charset
39         assertEquals("UTF-8", doc.outputSettings().charset().displayName());
40         assertNotEquals("新", doc.text());
41 
42         // confirm fallback to utf8
43         in = getFile("/htmltests/meta-charset-3.html");
44         doc = Jsoup.parse(in, null, "http://example.com/"); // utf8, no charset
45         assertEquals("UTF-8", doc.outputSettings().charset().displayName());
46         assertEquals("新", doc.text());
47     }
48 
49     @Test
testBrokenHtml5CharsetWithASingleDoubleQuote()50     public void testBrokenHtml5CharsetWithASingleDoubleQuote() throws IOException {
51         InputStream in = inputStreamFrom("<html>\n" +
52                 "<head><meta charset=UTF-8\"></head>\n" +
53                 "<body></body>\n" +
54                 "</html>");
55         Document doc = Jsoup.parse(in, null, "http://example.com/");
56         assertEquals("UTF-8", doc.outputSettings().charset().displayName());
57     }
58 
59     @Test
testLowercaseUtf8Charset()60     public void testLowercaseUtf8Charset() throws IOException {
61         File in = getFile("/htmltests/lowercase-charset-test.html");
62         Document doc = Jsoup.parse(in, null);
63 
64         Element form = doc.select("#form").first();
65         assertEquals(2, form.children().size());
66         assertEquals("UTF-8", doc.outputSettings().charset().name());
67     }
68 
69     @Test
testXwiki()70     public void testXwiki() throws IOException {
71         // https://github.com/jhy/jsoup/issues/1324
72         // this tests that when in CharacterReader we hit a buffer while marked, we preserve the mark when buffered up and can rewind
73         File in = getFile("/htmltests/xwiki-1324.html.gz");
74         Document doc = Jsoup.parse(in, null, "https://localhost/");
75         assertEquals("XWiki Jetty HSQLDB 12.1-SNAPSHOT", doc.select("#xwikiplatformversion").text());
76 
77         // was getting busted at =userdirectory, because it hit the bufferup point but the mark was then lost. so
78         // updated to preserve the mark.
79         String wantHtml = "<a class=\"list-group-item\" data-id=\"userdirectory\" href=\"/xwiki/bin/admin/XWiki/XWikiPreferences?editor=globaladmin&amp;section=userdirectory\" title=\"Customize the user directory live table.\">User Directory</a>";
80         assertEquals(wantHtml, doc.select("[data-id=userdirectory]").outerHtml());
81     }
82 
83     @Test
testXwikiExpanded()84     public void testXwikiExpanded() throws IOException {
85         // https://github.com/jhy/jsoup/issues/1324
86         // this tests that if there is a huge illegal character reference, we can get through a buffer and rewind, and still catch that it's an invalid refence,
87         // and the parse tree is correct.
88         File in = getFile("/htmltests/xwiki-edit.html.gz");
89         Parser parser = Parser.htmlParser();
90         Document doc = Jsoup.parse(new GZIPInputStream(new FileInputStream(in)), "UTF-8", "https://localhost/", parser.setTrackErrors(100));
91         ParseErrorList errors = parser.getErrors();
92 
93         assertEquals("XWiki Jetty HSQLDB 12.1-SNAPSHOT", doc.select("#xwikiplatformversion").text());
94         assertEquals(0, errors.size()); // not an invalid reference because did not look legit
95 
96         // was getting busted at =userdirectory, because it hit the bufferup point but the mark was then lost. so
97         // updated to preserve the mark.
98         String wantHtml = "<a class=\"list-group-item\" data-id=\"userdirectory\" href=\"/xwiki/bin/admin/XWiki/XWikiPreferences?editor=globaladmin&amp;RIGHTHERERIGHTHERERIGHTHERERIGHTHERE";
99         assertTrue(doc.select("[data-id=userdirectory]").outerHtml().startsWith(wantHtml));
100     }
101 
testWikiExpandedFromString()102     @Test public void testWikiExpandedFromString() throws IOException {
103         File in = getFile("/htmltests/xwiki-edit.html.gz");
104         String html = getFileAsString(in);
105         Document doc = Jsoup.parse(html);
106         assertEquals("XWiki Jetty HSQLDB 12.1-SNAPSHOT", doc.select("#xwikiplatformversion").text());
107         String wantHtml = "<a class=\"list-group-item\" data-id=\"userdirectory\" href=\"/xwiki/bin/admin/XWiki/XWikiPreferences?editor=globaladmin&amp;RIGHTHERERIGHTHERERIGHTHERERIGHTHERE";
108         assertTrue(doc.select("[data-id=userdirectory]").outerHtml().startsWith(wantHtml));
109     }
110 
testWikiFromString()111     @Test public void testWikiFromString() throws IOException {
112         File in = getFile("/htmltests/xwiki-1324.html.gz");
113         String html = getFileAsString(in);
114         Document doc = Jsoup.parse(html);
115         assertEquals("XWiki Jetty HSQLDB 12.1-SNAPSHOT", doc.select("#xwikiplatformversion").text());
116         String wantHtml = "<a class=\"list-group-item\" data-id=\"userdirectory\" href=\"/xwiki/bin/admin/XWiki/XWikiPreferences?editor=globaladmin&amp;section=userdirectory\" title=\"Customize the user directory live table.\">User Directory</a>";
117         assertEquals(wantHtml, doc.select("[data-id=userdirectory]").outerHtml());
118     }
119 
testFileParseNoCharsetMethod()120     @Test public void testFileParseNoCharsetMethod() throws IOException {
121         File in = getFile("/htmltests/xwiki-1324.html.gz");
122         Document doc = Jsoup.parse(in);
123         assertEquals("XWiki Jetty HSQLDB 12.1-SNAPSHOT", doc.select("#xwikiplatformversion").text());
124     }
125 
126 
getFile(String resourceName)127     public static File getFile(String resourceName) {
128         try {
129             URL resource = ParseTest.class.getResource(resourceName);
130             return resource != null ? new File(resource.toURI()) : new File("/404");
131         } catch (URISyntaxException e) {
132             throw new IllegalStateException(e);
133         }
134     }
135 
inputStreamFrom(String s)136     public static InputStream inputStreamFrom(String s) {
137         return new ByteArrayInputStream(s.getBytes(StandardCharsets.UTF_8));
138     }
139 
getFileAsString(File file)140     public static String getFileAsString(File file) throws IOException {
141         byte[] bytes;
142         if (file.getName().endsWith(".gz")) {
143             InputStream stream = new GZIPInputStream(new FileInputStream(file));
144             ByteBuffer byteBuffer = DataUtil.readToByteBuffer(stream, 0);
145             bytes = byteBuffer.array();
146         } else {
147             bytes = Files.readAllBytes(file.toPath());
148         }
149         return new String(bytes);
150     }
151 
152 }
153