• Home
  • Line#
  • Scopes#
  • Navigate#
  • Raw
  • Download
1 package org.jsoup.helper;
2 
3 import org.jsoup.internal.ControllableInputStream;
4 import org.jsoup.internal.Normalizer;
5 import org.jsoup.internal.SharedConstants;
6 import org.jsoup.internal.StringUtil;
7 import org.jsoup.nodes.Comment;
8 import org.jsoup.nodes.Document;
9 import org.jsoup.nodes.Element;
10 import org.jsoup.nodes.Node;
11 import org.jsoup.nodes.XmlDeclaration;
12 import org.jsoup.parser.Parser;
13 import org.jsoup.select.Elements;
14 import org.jspecify.annotations.Nullable;
15 
16 import java.io.BufferedReader;
17 import java.io.CharArrayReader;
18 import java.io.File;
19 import java.io.FileInputStream;
20 import java.io.IOException;
21 import java.io.InputStream;
22 import java.io.InputStreamReader;
23 import java.io.OutputStream;
24 import java.io.UncheckedIOException;
25 import java.nio.Buffer;
26 import java.nio.ByteBuffer;
27 import java.nio.CharBuffer;
28 import java.nio.charset.Charset;
29 import java.nio.charset.IllegalCharsetNameException;
30 import java.util.Locale;
31 import java.util.Random;
32 import java.util.regex.Matcher;
33 import java.util.regex.Pattern;
34 import java.util.zip.GZIPInputStream;
35 
36 import static org.jsoup.internal.SharedConstants.DefaultBufferSize;
37 
38 /**
39  * Internal static utilities for handling data.
40  *
41  */
42 @SuppressWarnings("CharsetObjectCanBeUsed")
43 public final class DataUtil {
    /**
     * Case-insensitive pattern that finds {@code charset=} in a content-type value, skips an optional opening quote,
     * and captures the charset token (group 1) up to the next whitespace, comma, semicolon or quote.
     */
    private static final Pattern charsetPattern = Pattern.compile("(?i)\\bcharset=\\s*(?:[\"'])?([^\\s,;\"']*)");
    /** The UTF-8 charset instance shared by this class. */
    public static final Charset UTF_8 = Charset.forName("UTF-8"); // Don't use StandardCharsets, as those only appear in Android API 19, and we target 10.
    /** Name of the fallback charset. */
    static final String defaultCharsetName = UTF_8.name(); // used if not found in header or meta charset
    /** Size (in bytes) of the initial read used when sniffing the start of an input stream for a BOM or meta charset. */
    private static final int firstReadBufferSize = 1024 * 5;
    /** Alphabet from which {@link #mimeBoundary()} draws its random characters. */
    private static final char[] mimeBoundaryChars =
            "-_1234567890abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ".toCharArray();
    /** Number of characters in a generated mime boundary. */
    static final int boundaryLength = 32;

    /** Not instantiable: this class only exposes static utilities. */
    private DataUtil() {}
53 
54     /**
55      * Loads and parses a file to a Document, with the HtmlParser. Files that are compressed with gzip (and end in {@code .gz} or {@code .z})
56      * are supported in addition to uncompressed files.
57      *
58      * @param file file to load
59      * @param charsetName (optional) character set of input; specify {@code null} to attempt to autodetect. A BOM in
60      *     the file will always override this setting.
61      * @param baseUri base URI of document, to resolve relative links against
62      * @return Document
63      * @throws IOException on IO error
64      */
load(File file, @Nullable String charsetName, String baseUri)65     public static Document load(File file, @Nullable String charsetName, String baseUri) throws IOException {
66         return load(file, charsetName, baseUri, Parser.htmlParser());
67     }
68 
69     /**
70      * Loads and parses a file to a Document. Files that are compressed with gzip (and end in {@code .gz} or {@code .z})
71      * are supported in addition to uncompressed files.
72      *
73      * @param file file to load
74      * @param charsetName (optional) character set of input; specify {@code null} to attempt to autodetect. A BOM in
75      *     the file will always override this setting.
76      * @param baseUri base URI of document, to resolve relative links against
77      * @param parser alternate {@link Parser#xmlParser() parser} to use.
78 
79      * @return Document
80      * @throws IOException on IO error
81      * @since 1.14.2
82      */
load(File file, @Nullable String charsetName, String baseUri, Parser parser)83     public static Document load(File file, @Nullable String charsetName, String baseUri, Parser parser) throws IOException {
84         InputStream stream = new FileInputStream(file);
85         String name = Normalizer.lowerCase(file.getName());
86         if (name.endsWith(".gz") || name.endsWith(".z")) {
87             // unfortunately file input streams don't support marks (why not?), so we will close and reopen after read
88             boolean zipped;
89             try {
90                 zipped = (stream.read() == 0x1f && stream.read() == 0x8b); // gzip magic bytes
91             } finally {
92                 stream.close();
93 
94             }
95             stream = zipped ? new GZIPInputStream(new FileInputStream(file)) : new FileInputStream(file);
96         }
97         return parseInputStream(stream, charsetName, baseUri, parser);
98     }
99 
100     /**
101      * Parses a Document from an input steam.
102      * @param in input stream to parse. The stream will be closed after reading.
103      * @param charsetName character set of input (optional)
104      * @param baseUri base URI of document, to resolve relative links against
105      * @return Document
106      * @throws IOException on IO error
107      */
load(InputStream in, @Nullable String charsetName, String baseUri)108     public static Document load(InputStream in, @Nullable String charsetName, String baseUri) throws IOException {
109         return parseInputStream(in, charsetName, baseUri, Parser.htmlParser());
110     }
111 
112     /**
113      * Parses a Document from an input steam, using the provided Parser.
114      * @param in input stream to parse. The stream will be closed after reading.
115      * @param charsetName character set of input (optional)
116      * @param baseUri base URI of document, to resolve relative links against
117      * @param parser alternate {@link Parser#xmlParser() parser} to use.
118      * @return Document
119      * @throws IOException on IO error
120      */
load(InputStream in, @Nullable String charsetName, String baseUri, Parser parser)121     public static Document load(InputStream in, @Nullable String charsetName, String baseUri, Parser parser) throws IOException {
122         return parseInputStream(in, charsetName, baseUri, parser);
123     }
124 
125     /**
126      * Writes the input stream to the output stream. Doesn't close them.
127      * @param in input stream to read from
128      * @param out output stream to write to
129      * @throws IOException on IO error
130      */
crossStreams(final InputStream in, final OutputStream out)131     static void crossStreams(final InputStream in, final OutputStream out) throws IOException {
132         final byte[] buffer = new byte[DefaultBufferSize];
133         int len;
134         while ((len = in.read(buffer)) != -1) {
135             out.write(buffer, 0, len);
136         }
137     }
138 
parseInputStream(@ullable InputStream input, @Nullable String charsetName, String baseUri, Parser parser)139     static Document parseInputStream(@Nullable InputStream input, @Nullable String charsetName, String baseUri, Parser parser) throws IOException  {
140         if (input == null) // empty body
141             return new Document(baseUri);
142         input = ControllableInputStream.wrap(input, DefaultBufferSize, 0);
143 
144         @Nullable Document doc = null;
145 
146         // read the start of the stream and look for a BOM or meta charset
147         try {
148             input.mark(DefaultBufferSize);
149             ByteBuffer firstBytes = readToByteBuffer(input, firstReadBufferSize - 1); // -1 because we read one more to see if completed. First read is < buffer size, so can't be invalid.
150             boolean fullyRead = (input.read() == -1);
151             input.reset();
152 
153             // look for BOM - overrides any other header or input
154             BomCharset bomCharset = detectCharsetFromBom(firstBytes);
155             if (bomCharset != null)
156                 charsetName = bomCharset.charset;
157 
158             if (charsetName == null) { // determine from meta. safe first parse as UTF-8
159                 try {
160                     CharBuffer defaultDecoded = UTF_8.decode(firstBytes);
161                     if (defaultDecoded.hasArray())
162                         doc = parser.parseInput(new CharArrayReader(defaultDecoded.array(), defaultDecoded.arrayOffset(), defaultDecoded.limit()), baseUri);
163                     else
164                         doc = parser.parseInput(defaultDecoded.toString(), baseUri);
165                 } catch (UncheckedIOException e) {
166                     throw e.getCause();
167                 }
168 
169                 // look for <meta http-equiv="Content-Type" content="text/html;charset=gb2312"> or HTML5 <meta charset="gb2312">
170                 Elements metaElements = doc.select("meta[http-equiv=content-type], meta[charset]");
171                 String foundCharset = null; // if not found, will keep utf-8 as best attempt
172                 for (Element meta : metaElements) {
173                     if (meta.hasAttr("http-equiv"))
174                         foundCharset = getCharsetFromContentType(meta.attr("content"));
175                     if (foundCharset == null && meta.hasAttr("charset"))
176                         foundCharset = meta.attr("charset");
177                     if (foundCharset != null)
178                         break;
179                 }
180 
181                 // look for <?xml encoding='ISO-8859-1'?>
182                 if (foundCharset == null && doc.childNodeSize() > 0) {
183                     Node first = doc.childNode(0);
184                     XmlDeclaration decl = null;
185                     if (first instanceof XmlDeclaration)
186                         decl = (XmlDeclaration) first;
187                     else if (first instanceof Comment) {
188                         Comment comment = (Comment) first;
189                         if (comment.isXmlDeclaration())
190                             decl = comment.asXmlDeclaration();
191                     }
192                     if (decl != null) {
193                         if (decl.name().equalsIgnoreCase("xml"))
194                             foundCharset = decl.attr("encoding");
195                     }
196                 }
197                 foundCharset = validateCharset(foundCharset);
198                 if (foundCharset != null && !foundCharset.equalsIgnoreCase(defaultCharsetName)) { // need to re-decode. (case insensitive check here to match how validate works)
199                     foundCharset = foundCharset.trim().replaceAll("[\"']", "");
200                     charsetName = foundCharset;
201                     doc = null;
202                 } else if (!fullyRead) {
203                     doc = null;
204                 }
205             } else { // specified by content type header (or by user on file load)
206                 Validate.notEmpty(charsetName, "Must set charset arg to character set of file to parse. Set to null to attempt to detect from HTML");
207             }
208             if (doc == null) {
209                 if (charsetName == null)
210                     charsetName = defaultCharsetName;
211                 BufferedReader reader = new BufferedReader(new InputStreamReader(input, Charset.forName(charsetName)), DefaultBufferSize); // Android level does not allow us try-with-resources
212                 try {
213                     if (bomCharset != null && bomCharset.offset) { // creating the buffered reader ignores the input pos, so must skip here
214                         long skipped = reader.skip(1);
215                         Validate.isTrue(skipped == 1); // WTF if this fails.
216                     }
217                     try {
218                         doc = parser.parseInput(reader, baseUri);
219                     } catch (UncheckedIOException e) {
220                         // io exception when parsing (not seen before because reading the stream as we go)
221                         throw e.getCause();
222                     }
223                     Charset charset = charsetName.equals(defaultCharsetName) ? UTF_8 : Charset.forName(charsetName);
224                     doc.outputSettings().charset(charset);
225                     if (!charset.canEncode()) {
226                         // some charsets can read but not encode; switch to an encodable charset and update the meta el
227                         doc.charset(UTF_8);
228                     }
229                 }
230                 finally {
231                     reader.close();
232                 }
233             }
234         }
235         finally {
236             input.close();
237         }
238         return doc;
239     }
240 
241     /**
242      * Read the input stream into a byte buffer. To deal with slow input streams, you may interrupt the thread this
243      * method is executing on. The data read until being interrupted will be available.
244      * @param inStream the input stream to read from
245      * @param maxSize the maximum size in bytes to read from the stream. Set to 0 to be unlimited.
246      * @return the filled byte buffer
247      * @throws IOException if an exception occurs whilst reading from the input stream.
248      */
readToByteBuffer(InputStream inStream, int maxSize)249     public static ByteBuffer readToByteBuffer(InputStream inStream, int maxSize) throws IOException {
250         return ControllableInputStream.readToByteBuffer(inStream, maxSize);
251     }
252 
emptyByteBuffer()253     static ByteBuffer emptyByteBuffer() {
254         return ByteBuffer.allocate(0);
255     }
256 
257     /**
258      * Parse out a charset from a content type header. If the charset is not supported, returns null (so the default
259      * will kick in.)
260      * @param contentType e.g. "text/html; charset=EUC-JP"
261      * @return "EUC-JP", or null if not found. Charset is trimmed and uppercased.
262      */
getCharsetFromContentType(@ullable String contentType)263     static @Nullable String getCharsetFromContentType(@Nullable String contentType) {
264         if (contentType == null) return null;
265         Matcher m = charsetPattern.matcher(contentType);
266         if (m.find()) {
267             String charset = m.group(1).trim();
268             charset = charset.replace("charset=", "");
269             return validateCharset(charset);
270         }
271         return null;
272     }
273 
validateCharset(@ullable String cs)274     private @Nullable static String validateCharset(@Nullable String cs) {
275         if (cs == null || cs.length() == 0) return null;
276         cs = cs.trim().replaceAll("[\"']", "");
277         try {
278             if (Charset.isSupported(cs)) return cs;
279             cs = cs.toUpperCase(Locale.ENGLISH);
280             if (Charset.isSupported(cs)) return cs;
281         } catch (IllegalCharsetNameException e) {
282             // if our this charset matching fails.... we just take the default
283         }
284         return null;
285     }
286 
287     /**
288      * Creates a random string, suitable for use as a mime boundary
289      */
mimeBoundary()290     static String mimeBoundary() {
291         final StringBuilder mime = StringUtil.borrowBuilder();
292         final Random rand = new Random();
293         for (int i = 0; i < boundaryLength; i++) {
294             mime.append(mimeBoundaryChars[rand.nextInt(mimeBoundaryChars.length)]);
295         }
296         return StringUtil.releaseBuilder(mime);
297     }
298 
detectCharsetFromBom(final ByteBuffer byteData)299     private static @Nullable BomCharset detectCharsetFromBom(final ByteBuffer byteData) {
300         @SuppressWarnings("UnnecessaryLocalVariable") final Buffer buffer = byteData; // .mark and rewind used to return Buffer, now ByteBuffer, so cast for backward compat
301         buffer.mark();
302         byte[] bom = new byte[4];
303         if (byteData.remaining() >= bom.length) {
304             byteData.get(bom);
305             buffer.rewind();
306         }
307         if (bom[0] == 0x00 && bom[1] == 0x00 && bom[2] == (byte) 0xFE && bom[3] == (byte) 0xFF || // BE
308             bom[0] == (byte) 0xFF && bom[1] == (byte) 0xFE && bom[2] == 0x00 && bom[3] == 0x00) { // LE
309             return new BomCharset("UTF-32", false); // and I hope it's on your system
310         } else if (bom[0] == (byte) 0xFE && bom[1] == (byte) 0xFF || // BE
311             bom[0] == (byte) 0xFF && bom[1] == (byte) 0xFE) {
312             return new BomCharset("UTF-16", false); // in all Javas
313         } else if (bom[0] == (byte) 0xEF && bom[1] == (byte) 0xBB && bom[2] == (byte) 0xBF) {
314             return new BomCharset("UTF-8", true); // in all Javas
315             // 16 and 32 decoders consume the BOM to determine be/le; utf-8 should be consumed here
316         }
317         return null;
318     }
319 
320     private static class BomCharset {
321         private final String charset;
322         private final boolean offset;
323 
BomCharset(String charset, boolean offset)324         public BomCharset(String charset, boolean offset) {
325             this.charset = charset;
326             this.offset = offset;
327         }
328     }
329 }
330