• Home
  • Line#
  • Scopes#
  • Navigate#
  • Raw
  • Download
1 package org.jsoup.helper;
2 
3 import org.jsoup.Jsoup;
4 import org.jsoup.integration.ParseTest;
5 import org.jsoup.nodes.Document;
6 import org.jsoup.parser.Parser;
7 import org.junit.jupiter.api.Test;
8 
9 import java.io.*;
10 import java.nio.ByteBuffer;
11 import java.nio.charset.Charset;
12 import java.nio.charset.StandardCharsets;
13 import java.nio.file.Files;
14 
15 import static org.jsoup.integration.ParseTest.getFile;
16 import static org.junit.jupiter.api.Assertions.*;
17 
18 public class DataUtilTest {
19     @Test
testCharset()20     public void testCharset() {
21         assertEquals("utf-8", DataUtil.getCharsetFromContentType("text/html;charset=utf-8 "));
22         assertEquals("UTF-8", DataUtil.getCharsetFromContentType("text/html; charset=UTF-8"));
23         assertEquals("ISO-8859-1", DataUtil.getCharsetFromContentType("text/html; charset=ISO-8859-1"));
24         assertNull(DataUtil.getCharsetFromContentType("text/html"));
25         assertNull(DataUtil.getCharsetFromContentType(null));
26         assertNull(DataUtil.getCharsetFromContentType("text/html;charset=Unknown"));
27     }
28 
29     @Test
testQuotedCharset()30     public void testQuotedCharset() {
31         assertEquals("utf-8", DataUtil.getCharsetFromContentType("text/html; charset=\"utf-8\""));
32         assertEquals("UTF-8", DataUtil.getCharsetFromContentType("text/html;charset=\"UTF-8\""));
33         assertEquals("ISO-8859-1", DataUtil.getCharsetFromContentType("text/html; charset=\"ISO-8859-1\""));
34         assertNull(DataUtil.getCharsetFromContentType("text/html; charset=\"Unsupported\""));
35         assertEquals("UTF-8", DataUtil.getCharsetFromContentType("text/html; charset='UTF-8'"));
36     }
37 
stream(String data)38     private InputStream stream(String data) {
39         return new ByteArrayInputStream(data.getBytes(StandardCharsets.UTF_8));
40     }
41 
stream(String data, String charset)42     private InputStream stream(String data, String charset) {
43         return new ByteArrayInputStream(data.getBytes(Charset.forName(charset)));
44     }
45 
46     @Test
discardsSpuriousByteOrderMark()47     public void discardsSpuriousByteOrderMark() throws IOException {
48         String html = "\uFEFF<html><head><title>One</title></head><body>Two</body></html>";
49         Document doc = DataUtil.parseInputStream(stream(html), "UTF-8", "http://foo.com/", Parser.htmlParser());
50         assertEquals("One", doc.head().text());
51     }
52 
53     @Test
discardsSpuriousByteOrderMarkWhenNoCharsetSet()54     public void discardsSpuriousByteOrderMarkWhenNoCharsetSet() throws IOException {
55         String html = "\uFEFF<html><head><title>One</title></head><body>Two</body></html>";
56         Document doc = DataUtil.parseInputStream(stream(html), null, "http://foo.com/", Parser.htmlParser());
57         assertEquals("One", doc.head().text());
58         assertEquals("UTF-8", doc.outputSettings().charset().displayName());
59     }
60 
61     @Test
shouldNotThrowExceptionOnEmptyCharset()62     public void shouldNotThrowExceptionOnEmptyCharset() {
63         assertNull(DataUtil.getCharsetFromContentType("text/html; charset="));
64         assertNull(DataUtil.getCharsetFromContentType("text/html; charset=;"));
65     }
66 
67     @Test
shouldSelectFirstCharsetOnWeirdMultileCharsetsInMetaTags()68     public void shouldSelectFirstCharsetOnWeirdMultileCharsetsInMetaTags() {
69         assertEquals("ISO-8859-1", DataUtil.getCharsetFromContentType("text/html; charset=ISO-8859-1, charset=1251"));
70     }
71 
72     @Test
shouldCorrectCharsetForDuplicateCharsetString()73     public void shouldCorrectCharsetForDuplicateCharsetString() {
74         assertEquals("iso-8859-1", DataUtil.getCharsetFromContentType("text/html; charset=charset=iso-8859-1"));
75     }
76 
77     @Test
shouldReturnNullForIllegalCharsetNames()78     public void shouldReturnNullForIllegalCharsetNames() {
79         assertNull(DataUtil.getCharsetFromContentType("text/html; charset=$HJKDF§$/("));
80     }
81 
82     @Test
generatesMimeBoundaries()83     public void generatesMimeBoundaries() {
84         String m1 = DataUtil.mimeBoundary();
85         String m2 = DataUtil.mimeBoundary();
86 
87         assertEquals(DataUtil.boundaryLength, m1.length());
88         assertEquals(DataUtil.boundaryLength, m2.length());
89         assertNotSame(m1, m2);
90     }
91 
92     @Test
wrongMetaCharsetFallback()93     public void wrongMetaCharsetFallback() throws IOException {
94         String html = "<html><head><meta charset=iso-8></head><body></body></html>";
95 
96         Document doc = DataUtil.parseInputStream(stream(html), null, "http://example.com", Parser.htmlParser());
97 
98         final String expected = "<html>\n" +
99                 " <head>\n" +
100                 "  <meta charset=\"iso-8\">\n" +
101                 " </head>\n" +
102                 " <body></body>\n" +
103                 "</html>";
104 
105         assertEquals(expected, doc.toString());
106     }
107 
108     @Test
secondMetaElementWithContentTypeContainsCharsetParameter()109     public void secondMetaElementWithContentTypeContainsCharsetParameter() throws Exception {
110         String html = "<html><head>" +
111                 "<meta http-equiv=\"Content-Type\" content=\"text/html\">" +
112                 "<meta http-equiv=\"Content-Type\" content=\"text/html; charset=euc-kr\">" +
113                 "</head><body>한국어</body></html>";
114 
115         Document doc = DataUtil.parseInputStream(stream(html, "euc-kr"), null, "http://example.com", Parser.htmlParser());
116 
117         assertEquals("한국어", doc.body().text());
118     }
119 
120     @Test
firstMetaElementWithCharsetShouldBeUsedForDecoding()121     public void firstMetaElementWithCharsetShouldBeUsedForDecoding() throws Exception {
122         String html = "<html><head>" +
123                 "<meta http-equiv=\"Content-Type\" content=\"text/html; charset=iso-8859-1\">" +
124                 "<meta http-equiv=\"Content-Type\" content=\"text/html; charset=koi8-u\">" +
125                 "</head><body>Übergrößenträger</body></html>";
126 
127         Document doc = DataUtil.parseInputStream(stream(html, "iso-8859-1"), null, "http://example.com", Parser.htmlParser());
128 
129         assertEquals("Übergrößenträger", doc.body().text());
130     }
131 
132     @Test
parseSequenceInputStream()133     public void parseSequenceInputStream() throws IOException {
134         // https://github.com/jhy/jsoup/pull/1671
135         File in = getFile("/htmltests/medium.html");
136         String fileContent = new String(Files.readAllBytes(in.toPath()));
137         int halfLength = fileContent.length() / 2;
138         String firstPart = fileContent.substring(0, halfLength);
139         String secondPart = fileContent.substring(halfLength);
140         SequenceInputStream sequenceStream = new SequenceInputStream(
141                 stream(firstPart),
142                 stream(secondPart)
143         );
144         Document doc = DataUtil.parseInputStream(sequenceStream, null, "", Parser.htmlParser());
145         assertEquals(fileContent, doc.outerHtml());
146     }
147 
148     @Test
supportsBOMinFiles()149     public void supportsBOMinFiles() throws IOException {
150         // test files from http://www.i18nl10n.com/korean/utftest/
151         File in = getFile("/bomtests/bom_utf16be.html");
152         Document doc = Jsoup.parse(in, null, "http://example.com");
153         assertTrue(doc.title().contains("UTF-16BE"));
154         assertTrue(doc.text().contains("가각갂갃간갅"));
155 
156         in = getFile("/bomtests/bom_utf16le.html");
157         doc = Jsoup.parse(in, null, "http://example.com");
158         assertTrue(doc.title().contains("UTF-16LE"));
159         assertTrue(doc.text().contains("가각갂갃간갅"));
160 
161         in = getFile("/bomtests/bom_utf32be.html");
162         doc = Jsoup.parse(in, null, "http://example.com");
163         assertTrue(doc.title().contains("UTF-32BE"));
164         assertTrue(doc.text().contains("가각갂갃간갅"));
165 
166         in = getFile("/bomtests/bom_utf32le.html");
167         doc = Jsoup.parse(in, null, "http://example.com");
168         assertTrue(doc.title().contains("UTF-32LE"));
169         assertTrue(doc.text().contains("가각갂갃간갅"));
170     }
171 
172     @Test
supportsUTF8BOM()173     public void supportsUTF8BOM() throws IOException {
174         File in = getFile("/bomtests/bom_utf8.html");
175         Document doc = Jsoup.parse(in, null, "http://example.com");
176         assertEquals("OK", doc.head().select("title").text());
177     }
178 
179     @Test
noExtraNULLBytes()180     public void noExtraNULLBytes() throws IOException {
181     	final byte[] b = "<html><head><meta charset=\"UTF-8\"></head><body><div><u>ü</u>ü</div></body></html>".getBytes(StandardCharsets.UTF_8);
182 
183     	Document doc = Jsoup.parse(new ByteArrayInputStream(b), null, "");
184     	assertFalse( doc.outerHtml().contains("\u0000") );
185     }
186 
187     @Test
supportsZippedUTF8BOM()188     public void supportsZippedUTF8BOM() throws IOException {
189         File in = getFile("/bomtests/bom_utf8.html.gz");
190         Document doc = Jsoup.parse(in, null, "http://example.com");
191         assertEquals("OK", doc.head().select("title").text());
192         assertEquals("There is a UTF8 BOM at the top (before the XML decl). If not read correctly, will look like a non-joining space.", doc.body().text());
193     }
194 
195     @Test
supportsXmlCharsetDeclaration()196     public void supportsXmlCharsetDeclaration() throws IOException {
197         String encoding = "iso-8859-1";
198         InputStream soup = new ByteArrayInputStream((
199                 "<?xml version=\"1.0\" encoding=\"iso-8859-1\"?>" +
200                         "<!DOCTYPE html PUBLIC \"-//W3C//DTD XHTML 1.0 Strict//EN\" \"http://www.w3.org/TR/xhtml1/DTD/xhtml1-strict.dtd\">" +
201                         "<html xmlns=\"http://www.w3.org/1999/xhtml\" lang=\"en\" xml:lang=\"en\">Hellö Wörld!</html>"
202         ).getBytes(Charset.forName(encoding)));
203 
204         Document doc = Jsoup.parse(soup, null, "");
205         assertEquals("Hellö Wörld!", doc.body().text());
206     }
207 
208 
209     @Test
lLoadsGzipFile()210     public void lLoadsGzipFile() throws IOException {
211         File in = getFile("/htmltests/gzip.html.gz");
212         Document doc = Jsoup.parse(in, null);
213         assertEquals("Gzip test", doc.title());
214         assertEquals("This is a gzipped HTML file.", doc.selectFirst("p").text());
215     }
216 
217     @Test
loadsZGzipFile()218     public void loadsZGzipFile() throws IOException {
219         // compressed on win, with z suffix
220         File in = getFile("/htmltests/gzip.html.z");
221         Document doc = Jsoup.parse(in, null);
222         assertEquals("Gzip test", doc.title());
223         assertEquals("This is a gzipped HTML file.", doc.selectFirst("p").text());
224     }
225 
226     @Test
handlesFakeGzipFile()227     public void handlesFakeGzipFile() throws IOException {
228         File in = getFile("/htmltests/fake-gzip.html.gz");
229         Document doc = Jsoup.parse(in, null);
230         assertEquals("This is not gzipped", doc.title());
231         assertEquals("And should still be readable.", doc.selectFirst("p").text());
232     }
233 
234     // an input stream to give a range of output sizes, that changes on each read
235     static class VaryingReadInputStream extends InputStream {
236         final InputStream in;
237         int stride = 0;
238 
VaryingReadInputStream(InputStream in)239         VaryingReadInputStream(InputStream in) {
240             this.in = in;
241         }
242 
read()243         public int read() throws IOException {
244             return in.read();
245         }
246 
read(byte[] b)247         public int read(byte[] b) throws IOException {
248             return in.read(b, 0, Math.min(b.length, ++stride));
249         }
250 
read(byte[] b, int off, int len)251         public int read(byte[] b, int off, int len) throws IOException {
252             return in.read(b, off, Math.min(len, ++stride));
253         }
254     }
255 
256     @Test
handlesChunkedInputStream()257     void handlesChunkedInputStream() throws IOException {
258         File inputFile = ParseTest.getFile("/htmltests/large.html");
259         String input = ParseTest.getFileAsString(inputFile);
260         VaryingReadInputStream stream = new VaryingReadInputStream(ParseTest.inputStreamFrom(input));
261 
262         Document expected = Jsoup.parse(input, "https://example.com");
263         Document doc = Jsoup.parse(stream, null, "https://example.com");
264         assertTrue(doc.hasSameValue(expected));
265     }
266 
267     @Test
handlesUnlimitedRead()268     void handlesUnlimitedRead() throws IOException {
269         File inputFile = ParseTest.getFile("/htmltests/large.html");
270         String input = ParseTest.getFileAsString(inputFile);
271         VaryingReadInputStream stream = new VaryingReadInputStream(ParseTest.inputStreamFrom(input));
272 
273         ByteBuffer byteBuffer = DataUtil.readToByteBuffer(stream, 0);
274         String read = new String(byteBuffer.array());
275 
276         assertEquals(input, read);
277     }
278 }
279