1 package org.jsoup.helper; 2 3 import org.jsoup.Jsoup; 4 import org.jsoup.integration.ParseTest; 5 import org.jsoup.nodes.Document; 6 import org.jsoup.parser.Parser; 7 import org.junit.jupiter.api.Test; 8 9 import java.io.*; 10 import java.nio.ByteBuffer; 11 import java.nio.charset.Charset; 12 import java.nio.charset.StandardCharsets; 13 import java.nio.file.Files; 14 15 import static org.jsoup.integration.ParseTest.getFile; 16 import static org.junit.jupiter.api.Assertions.*; 17 18 public class DataUtilTest { 19 @Test testCharset()20 public void testCharset() { 21 assertEquals("utf-8", DataUtil.getCharsetFromContentType("text/html;charset=utf-8 ")); 22 assertEquals("UTF-8", DataUtil.getCharsetFromContentType("text/html; charset=UTF-8")); 23 assertEquals("ISO-8859-1", DataUtil.getCharsetFromContentType("text/html; charset=ISO-8859-1")); 24 assertNull(DataUtil.getCharsetFromContentType("text/html")); 25 assertNull(DataUtil.getCharsetFromContentType(null)); 26 assertNull(DataUtil.getCharsetFromContentType("text/html;charset=Unknown")); 27 } 28 29 @Test testQuotedCharset()30 public void testQuotedCharset() { 31 assertEquals("utf-8", DataUtil.getCharsetFromContentType("text/html; charset=\"utf-8\"")); 32 assertEquals("UTF-8", DataUtil.getCharsetFromContentType("text/html;charset=\"UTF-8\"")); 33 assertEquals("ISO-8859-1", DataUtil.getCharsetFromContentType("text/html; charset=\"ISO-8859-1\"")); 34 assertNull(DataUtil.getCharsetFromContentType("text/html; charset=\"Unsupported\"")); 35 assertEquals("UTF-8", DataUtil.getCharsetFromContentType("text/html; charset='UTF-8'")); 36 } 37 stream(String data)38 private InputStream stream(String data) { 39 return new ByteArrayInputStream(data.getBytes(StandardCharsets.UTF_8)); 40 } 41 stream(String data, String charset)42 private InputStream stream(String data, String charset) { 43 return new ByteArrayInputStream(data.getBytes(Charset.forName(charset))); 44 } 45 46 @Test discardsSpuriousByteOrderMark()47 public void discardsSpuriousByteOrderMark() throws IOException { 48 String html = "\uFEFF<html><head><title>One</title></head><body>Two</body></html>"; 49 Document doc = DataUtil.parseInputStream(stream(html), "UTF-8", "http://foo.com/", Parser.htmlParser()); 50 assertEquals("One", doc.head().text()); 51 } 52 53 @Test discardsSpuriousByteOrderMarkWhenNoCharsetSet()54 public void discardsSpuriousByteOrderMarkWhenNoCharsetSet() throws IOException { 55 String html = "\uFEFF<html><head><title>One</title></head><body>Two</body></html>"; 56 Document doc = DataUtil.parseInputStream(stream(html), null, "http://foo.com/", Parser.htmlParser()); 57 assertEquals("One", doc.head().text()); 58 assertEquals("UTF-8", doc.outputSettings().charset().displayName()); 59 } 60 61 @Test shouldNotThrowExceptionOnEmptyCharset()62 public void shouldNotThrowExceptionOnEmptyCharset() { 63 assertNull(DataUtil.getCharsetFromContentType("text/html; charset=")); 64 assertNull(DataUtil.getCharsetFromContentType("text/html; charset=;")); 65 } 66 67 @Test shouldSelectFirstCharsetOnWeirdMultileCharsetsInMetaTags()68 public void shouldSelectFirstCharsetOnWeirdMultileCharsetsInMetaTags() { 69 assertEquals("ISO-8859-1", DataUtil.getCharsetFromContentType("text/html; charset=ISO-8859-1, charset=1251")); 70 } 71 72 @Test shouldCorrectCharsetForDuplicateCharsetString()73 public void shouldCorrectCharsetForDuplicateCharsetString() { 74 assertEquals("iso-8859-1", DataUtil.getCharsetFromContentType("text/html; charset=charset=iso-8859-1")); 75 } 76 77 @Test shouldReturnNullForIllegalCharsetNames()78 public void shouldReturnNullForIllegalCharsetNames() { 79 assertNull(DataUtil.getCharsetFromContentType("text/html; charset=$HJKDF§$/(")); 80 } 81 82 @Test generatesMimeBoundaries()83 public void generatesMimeBoundaries() { 84 String m1 = DataUtil.mimeBoundary(); 85 String m2 = DataUtil.mimeBoundary(); 86 87 assertEquals(DataUtil.boundaryLength, m1.length()); 88 assertEquals(DataUtil.boundaryLength, m2.length()); 89 assertNotSame(m1, m2); 90 } 91 92 @Test wrongMetaCharsetFallback()93 public void wrongMetaCharsetFallback() throws IOException { 94 String html = "<html><head><meta charset=iso-8></head><body></body></html>"; 95 96 Document doc = DataUtil.parseInputStream(stream(html), null, "http://example.com", Parser.htmlParser()); 97 98 final String expected = "<html>\n" + 99 " <head>\n" + 100 " <meta charset=\"iso-8\">\n" + 101 " </head>\n" + 102 " <body></body>\n" + 103 "</html>"; 104 105 assertEquals(expected, doc.toString()); 106 } 107 108 @Test secondMetaElementWithContentTypeContainsCharsetParameter()109 public void secondMetaElementWithContentTypeContainsCharsetParameter() throws Exception { 110 String html = "<html><head>" + 111 "<meta http-equiv=\"Content-Type\" content=\"text/html\">" + 112 "<meta http-equiv=\"Content-Type\" content=\"text/html; charset=euc-kr\">" + 113 "</head><body>한국어</body></html>"; 114 115 Document doc = DataUtil.parseInputStream(stream(html, "euc-kr"), null, "http://example.com", Parser.htmlParser()); 116 117 assertEquals("한국어", doc.body().text()); 118 } 119 120 @Test firstMetaElementWithCharsetShouldBeUsedForDecoding()121 public void firstMetaElementWithCharsetShouldBeUsedForDecoding() throws Exception { 122 String html = "<html><head>" + 123 "<meta http-equiv=\"Content-Type\" content=\"text/html; charset=iso-8859-1\">" + 124 "<meta http-equiv=\"Content-Type\" content=\"text/html; charset=koi8-u\">" + 125 "</head><body>Übergrößenträger</body></html>"; 126 127 Document doc = DataUtil.parseInputStream(stream(html, "iso-8859-1"), null, "http://example.com", Parser.htmlParser()); 128 129 assertEquals("Übergrößenträger", doc.body().text()); 130 } 131 132 @Test parseSequenceInputStream()133 public void parseSequenceInputStream() throws IOException { 134 // https://github.com/jhy/jsoup/pull/1671 135 File in = getFile("/htmltests/medium.html"); 136 String fileContent = new String(Files.readAllBytes(in.toPath())); 137 int halfLength = fileContent.length() / 2; 138 String firstPart = fileContent.substring(0, halfLength); 139 String secondPart = fileContent.substring(halfLength); 140 SequenceInputStream sequenceStream = new SequenceInputStream( 141 stream(firstPart), 142 stream(secondPart) 143 ); 144 Document doc = DataUtil.parseInputStream(sequenceStream, null, "", Parser.htmlParser()); 145 assertEquals(fileContent, doc.outerHtml()); 146 } 147 148 @Test supportsBOMinFiles()149 public void supportsBOMinFiles() throws IOException { 150 // test files from http://www.i18nl10n.com/korean/utftest/ 151 File in = getFile("/bomtests/bom_utf16be.html"); 152 Document doc = Jsoup.parse(in, null, "http://example.com"); 153 assertTrue(doc.title().contains("UTF-16BE")); 154 assertTrue(doc.text().contains("가각갂갃간갅")); 155 156 in = getFile("/bomtests/bom_utf16le.html"); 157 doc = Jsoup.parse(in, null, "http://example.com"); 158 assertTrue(doc.title().contains("UTF-16LE")); 159 assertTrue(doc.text().contains("가각갂갃간갅")); 160 161 in = getFile("/bomtests/bom_utf32be.html"); 162 doc = Jsoup.parse(in, null, "http://example.com"); 163 assertTrue(doc.title().contains("UTF-32BE")); 164 assertTrue(doc.text().contains("가각갂갃간갅")); 165 166 in = getFile("/bomtests/bom_utf32le.html"); 167 doc = Jsoup.parse(in, null, "http://example.com"); 168 assertTrue(doc.title().contains("UTF-32LE")); 169 assertTrue(doc.text().contains("가각갂갃간갅")); 170 } 171 172 @Test supportsUTF8BOM()173 public void supportsUTF8BOM() throws IOException { 174 File in = getFile("/bomtests/bom_utf8.html"); 175 Document doc = Jsoup.parse(in, null, "http://example.com"); 176 assertEquals("OK", doc.head().select("title").text()); 177 } 178 179 @Test noExtraNULLBytes()180 public void noExtraNULLBytes() throws IOException { 181 final byte[] b = "<html><head><meta charset=\"UTF-8\"></head><body><div><u>ü</u>ü</div></body></html>".getBytes(StandardCharsets.UTF_8); 182 183 Document doc = Jsoup.parse(new ByteArrayInputStream(b), null, ""); 184 assertFalse( doc.outerHtml().contains("\u0000") ); 185 } 186 187 @Test supportsZippedUTF8BOM()188 public void supportsZippedUTF8BOM() throws IOException { 189 File in = getFile("/bomtests/bom_utf8.html.gz"); 190 Document doc = Jsoup.parse(in, null, "http://example.com"); 191 assertEquals("OK", doc.head().select("title").text()); 192 assertEquals("There is a UTF8 BOM at the top (before the XML decl). If not read correctly, will look like a non-joining space.", doc.body().text()); 193 } 194 195 @Test supportsXmlCharsetDeclaration()196 public void supportsXmlCharsetDeclaration() throws IOException { 197 String encoding = "iso-8859-1"; 198 InputStream soup = new ByteArrayInputStream(( 199 "<?xml version=\"1.0\" encoding=\"iso-8859-1\"?>" + 200 "<!DOCTYPE html PUBLIC \"-//W3C//DTD XHTML 1.0 Strict//EN\" \"http://www.w3.org/TR/xhtml1/DTD/xhtml1-strict.dtd\">" + 201 "<html xmlns=\"http://www.w3.org/1999/xhtml\" lang=\"en\" xml:lang=\"en\">Hellö Wörld!</html>" 202 ).getBytes(Charset.forName(encoding))); 203 204 Document doc = Jsoup.parse(soup, null, ""); 205 assertEquals("Hellö Wörld!", doc.body().text()); 206 } 207 208 209 @Test lLoadsGzipFile()210 public void lLoadsGzipFile() throws IOException { 211 File in = getFile("/htmltests/gzip.html.gz"); 212 Document doc = Jsoup.parse(in, null); 213 assertEquals("Gzip test", doc.title()); 214 assertEquals("This is a gzipped HTML file.", doc.selectFirst("p").text()); 215 } 216 217 @Test loadsZGzipFile()218 public void loadsZGzipFile() throws IOException { 219 // compressed on win, with z suffix 220 File in = getFile("/htmltests/gzip.html.z"); 221 Document doc = Jsoup.parse(in, null); 222 assertEquals("Gzip test", doc.title()); 223 assertEquals("This is a gzipped HTML file.", doc.selectFirst("p").text()); 224 } 225 226 @Test handlesFakeGzipFile()227 public void handlesFakeGzipFile() throws IOException { 228 File in = getFile("/htmltests/fake-gzip.html.gz"); 229 Document doc = Jsoup.parse(in, null); 230 assertEquals("This is not gzipped", doc.title()); 231 assertEquals("And should still be readable.", doc.selectFirst("p").text()); 232 } 233 234 // an input stream to give a range of output sizes, that changes on each read 235 static class VaryingReadInputStream extends InputStream { 236 final InputStream in; 237 int stride = 0; 238 VaryingReadInputStream(InputStream in)239 VaryingReadInputStream(InputStream in) { 240 this.in = in; 241 } 242 read()243 public int read() throws IOException { 244 return in.read(); 245 } 246 read(byte[] b)247 public int read(byte[] b) throws IOException { 248 return in.read(b, 0, Math.min(b.length, ++stride)); 249 } 250 read(byte[] b, int off, int len)251 public int read(byte[] b, int off, int len) throws IOException { 252 return in.read(b, off, Math.min(len, ++stride)); 253 } 254 } 255 256 @Test handlesChunkedInputStream()257 void handlesChunkedInputStream() throws IOException { 258 File inputFile = ParseTest.getFile("/htmltests/large.html"); 259 String input = ParseTest.getFileAsString(inputFile); 260 VaryingReadInputStream stream = new VaryingReadInputStream(ParseTest.inputStreamFrom(input)); 261 262 Document expected = Jsoup.parse(input, "https://example.com"); 263 Document doc = Jsoup.parse(stream, null, "https://example.com"); 264 assertTrue(doc.hasSameValue(expected)); 265 } 266 267 @Test handlesUnlimitedRead()268 void handlesUnlimitedRead() throws IOException { 269 File inputFile = ParseTest.getFile("/htmltests/large.html"); 270 String input = ParseTest.getFileAsString(inputFile); 271 VaryingReadInputStream stream = new VaryingReadInputStream(ParseTest.inputStreamFrom(input)); 272 273 ByteBuffer byteBuffer = DataUtil.readToByteBuffer(stream, 0); 274 String read = new String(byteBuffer.array()); 275 276 assertEquals(input, read); 277 } 278 } 279