001package org.jsoup.helper;
002
003import org.jsoup.internal.ControllableInputStream;
004import org.jsoup.internal.Normalizer;
005import org.jsoup.internal.SharedConstants;
006import org.jsoup.internal.StringUtil;
007import org.jsoup.nodes.Comment;
008import org.jsoup.nodes.Document;
009import org.jsoup.nodes.Element;
010import org.jsoup.nodes.Node;
011import org.jsoup.nodes.XmlDeclaration;
012import org.jsoup.parser.Parser;
013import org.jsoup.select.Elements;
014import org.jspecify.annotations.Nullable;
015
016import java.io.BufferedReader;
017import java.io.CharArrayReader;
018import java.io.File;
019import java.io.FileInputStream;
020import java.io.IOException;
021import java.io.InputStream;
022import java.io.InputStreamReader;
023import java.io.OutputStream;
024import java.io.UncheckedIOException;
025import java.nio.Buffer;
026import java.nio.ByteBuffer;
027import java.nio.CharBuffer;
028import java.nio.charset.Charset;
029import java.nio.charset.IllegalCharsetNameException;
030import java.util.Locale;
031import java.util.Random;
032import java.util.regex.Matcher;
033import java.util.regex.Pattern;
034import java.util.zip.GZIPInputStream;
035
036import static org.jsoup.internal.SharedConstants.DefaultBufferSize;
037
038/**
039 * Internal static utilities for handling data.
040 *
041 */
042@SuppressWarnings("CharsetObjectCanBeUsed")
043public final class DataUtil {
044    private static final Pattern charsetPattern = Pattern.compile("(?i)\\bcharset=\\s*(?:[\"'])?([^\\s,;\"']*)"); // case-insensitive; group 1 is the value following "charset=", after an optional opening quote
045    public static final Charset UTF_8 = Charset.forName("UTF-8"); // Don't use StandardCharsets, as those only appear in Android API 19, and we target 10.
046    static final String defaultCharsetName = UTF_8.name(); // used if not found in header or meta charset
047    private static final int firstReadBufferSize = 1024 * 5; // upper bound, in bytes, on the initial read that parseInputStream inspects for a BOM / declared charset
048    private static final char[] mimeBoundaryChars = // alphabet that mimeBoundary() picks characters from
049            "-_1234567890abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ".toCharArray();
050    static final int boundaryLength = 32; // number of characters in a boundary produced by mimeBoundary()
051
052    private DataUtil() {} // static utility holder: no instances
053
054    /**
055     * Loads and parses a file to a Document, with the HtmlParser. Files that are compressed with gzip (and end in {@code .gz} or {@code .z})
056     * are supported in addition to uncompressed files.
057     *
058     * @param file file to load
059     * @param charsetName (optional) character set of input; specify {@code null} to attempt to autodetect. A BOM in
060     *     the file will always override this setting.
061     * @param baseUri base URI of document, to resolve relative links against
062     * @return Document
063     * @throws IOException on IO error
064     */
065    public static Document load(File file, @Nullable String charsetName, String baseUri) throws IOException {
066        return load(file, charsetName, baseUri, Parser.htmlParser());
067    }
068
069    /**
070     * Loads and parses a file to a Document. Files that are compressed with gzip (and end in {@code .gz} or {@code .z})
071     * are supported in addition to uncompressed files.
072     *
073     * @param file file to load
074     * @param charsetName (optional) character set of input; specify {@code null} to attempt to autodetect. A BOM in
075     *     the file will always override this setting.
076     * @param baseUri base URI of document, to resolve relative links against
077     * @param parser alternate {@link Parser#xmlParser() parser} to use.
078
079     * @return Document
080     * @throws IOException on IO error
081     * @since 1.14.2
082     */
083    public static Document load(File file, @Nullable String charsetName, String baseUri, Parser parser) throws IOException {
084        InputStream stream = new FileInputStream(file);
085        String name = Normalizer.lowerCase(file.getName());
086        if (name.endsWith(".gz") || name.endsWith(".z")) {
087            // unfortunately file input streams don't support marks (why not?), so we will close and reopen after read
088            boolean zipped;
089            try {
090                zipped = (stream.read() == 0x1f && stream.read() == 0x8b); // gzip magic bytes
091            } finally {
092                stream.close();
093
094            }
095            stream = zipped ? new GZIPInputStream(new FileInputStream(file)) : new FileInputStream(file);
096        }
097        return parseInputStream(stream, charsetName, baseUri, parser);
098    }
099
100    /**
101     * Parses a Document from an input steam.
102     * @param in input stream to parse. The stream will be closed after reading.
103     * @param charsetName character set of input (optional)
104     * @param baseUri base URI of document, to resolve relative links against
105     * @return Document
106     * @throws IOException on IO error
107     */
108    public static Document load(InputStream in, @Nullable String charsetName, String baseUri) throws IOException {
109        return parseInputStream(in, charsetName, baseUri, Parser.htmlParser());
110    }
111
112    /**
113     * Parses a Document from an input steam, using the provided Parser.
114     * @param in input stream to parse. The stream will be closed after reading.
115     * @param charsetName character set of input (optional)
116     * @param baseUri base URI of document, to resolve relative links against
117     * @param parser alternate {@link Parser#xmlParser() parser} to use.
118     * @return Document
119     * @throws IOException on IO error
120     */
121    public static Document load(InputStream in, @Nullable String charsetName, String baseUri, Parser parser) throws IOException {
122        return parseInputStream(in, charsetName, baseUri, parser);
123    }
124
125    /**
126     * Writes the input stream to the output stream. Doesn't close them.
127     * @param in input stream to read from
128     * @param out output stream to write to
129     * @throws IOException on IO error
130     */
131    static void crossStreams(final InputStream in, final OutputStream out) throws IOException {
132        final byte[] buffer = new byte[DefaultBufferSize];
133        int len;
134        while ((len = in.read(buffer)) != -1) {
135            out.write(buffer, 0, len);
136        }
137    }
138
139    static Document parseInputStream(@Nullable InputStream input, @Nullable String charsetName, String baseUri, Parser parser) throws IOException  {
140        if (input == null) // empty body
141            return new Document(baseUri);
142        input = ControllableInputStream.wrap(input, DefaultBufferSize, 0);
143
144        @Nullable Document doc = null;
145
146        // read the start of the stream and look for a BOM or meta charset
147        try {
148            input.mark(DefaultBufferSize);
149            ByteBuffer firstBytes = readToByteBuffer(input, firstReadBufferSize - 1); // -1 because we read one more to see if completed. First read is < buffer size, so can't be invalid.
150            boolean fullyRead = (input.read() == -1);
151            input.reset();
152
153            // look for BOM - overrides any other header or input
154            BomCharset bomCharset = detectCharsetFromBom(firstBytes);
155            if (bomCharset != null)
156                charsetName = bomCharset.charset;
157
158            if (charsetName == null) { // determine from meta. safe first parse as UTF-8
159                try {
160                    CharBuffer defaultDecoded = UTF_8.decode(firstBytes);
161                    if (defaultDecoded.hasArray())
162                        doc = parser.parseInput(new CharArrayReader(defaultDecoded.array(), defaultDecoded.arrayOffset(), defaultDecoded.limit()), baseUri);
163                    else
164                        doc = parser.parseInput(defaultDecoded.toString(), baseUri);
165                } catch (UncheckedIOException e) {
166                    throw e.getCause();
167                }
168
169                // look for <meta http-equiv="Content-Type" content="text/html;charset=gb2312"> or HTML5 <meta charset="gb2312">
170                Elements metaElements = doc.select("meta[http-equiv=content-type], meta[charset]");
171                String foundCharset = null; // if not found, will keep utf-8 as best attempt
172                for (Element meta : metaElements) {
173                    if (meta.hasAttr("http-equiv"))
174                        foundCharset = getCharsetFromContentType(meta.attr("content"));
175                    if (foundCharset == null && meta.hasAttr("charset"))
176                        foundCharset = meta.attr("charset");
177                    if (foundCharset != null)
178                        break;
179                }
180
181                // look for <?xml encoding='ISO-8859-1'?>
182                if (foundCharset == null && doc.childNodeSize() > 0) {
183                    Node first = doc.childNode(0);
184                    XmlDeclaration decl = null;
185                    if (first instanceof XmlDeclaration)
186                        decl = (XmlDeclaration) first;
187                    else if (first instanceof Comment) {
188                        Comment comment = (Comment) first;
189                        if (comment.isXmlDeclaration())
190                            decl = comment.asXmlDeclaration();
191                    }
192                    if (decl != null) {
193                        if (decl.name().equalsIgnoreCase("xml"))
194                            foundCharset = decl.attr("encoding");
195                    }
196                }
197                foundCharset = validateCharset(foundCharset);
198                if (foundCharset != null && !foundCharset.equalsIgnoreCase(defaultCharsetName)) { // need to re-decode. (case insensitive check here to match how validate works)
199                    foundCharset = foundCharset.trim().replaceAll("[\"']", "");
200                    charsetName = foundCharset;
201                    doc = null;
202                } else if (!fullyRead) {
203                    doc = null;
204                }
205            } else { // specified by content type header (or by user on file load)
206                Validate.notEmpty(charsetName, "Must set charset arg to character set of file to parse. Set to null to attempt to detect from HTML");
207            }
208            if (doc == null) {
209                if (charsetName == null)
210                    charsetName = defaultCharsetName;
211                BufferedReader reader = new BufferedReader(new InputStreamReader(input, Charset.forName(charsetName)), DefaultBufferSize); // Android level does not allow us try-with-resources
212                try {
213                    if (bomCharset != null && bomCharset.offset) { // creating the buffered reader ignores the input pos, so must skip here
214                        long skipped = reader.skip(1);
215                        Validate.isTrue(skipped == 1); // WTF if this fails.
216                    }
217                    try {
218                        doc = parser.parseInput(reader, baseUri);
219                    } catch (UncheckedIOException e) {
220                        // io exception when parsing (not seen before because reading the stream as we go)
221                        throw e.getCause();
222                    }
223                    Charset charset = charsetName.equals(defaultCharsetName) ? UTF_8 : Charset.forName(charsetName);
224                    doc.outputSettings().charset(charset);
225                    if (!charset.canEncode()) {
226                        // some charsets can read but not encode; switch to an encodable charset and update the meta el
227                        doc.charset(UTF_8);
228                    }
229                }
230                finally {
231                    reader.close();
232                }
233            }
234        }
235        finally {
236            input.close();
237        }
238        return doc;
239    }
240
241    /**
242     * Read the input stream into a byte buffer. To deal with slow input streams, you may interrupt the thread this
243     * method is executing on. The data read until being interrupted will be available.
244     * @param inStream the input stream to read from
245     * @param maxSize the maximum size in bytes to read from the stream. Set to 0 to be unlimited.
246     * @return the filled byte buffer
247     * @throws IOException if an exception occurs whilst reading from the input stream.
248     */
249    public static ByteBuffer readToByteBuffer(InputStream inStream, int maxSize) throws IOException {
250        return ControllableInputStream.readToByteBuffer(inStream, maxSize);
251    }
252
253    static ByteBuffer emptyByteBuffer() {
254        return ByteBuffer.allocate(0);
255    }
256
257    /**
258     * Parse out a charset from a content type header. If the charset is not supported, returns null (so the default
259     * will kick in.)
260     * @param contentType e.g. "text/html; charset=EUC-JP"
261     * @return "EUC-JP", or null if not found. Charset is trimmed and uppercased.
262     */
263    static @Nullable String getCharsetFromContentType(@Nullable String contentType) {
264        if (contentType == null) return null;
265        Matcher m = charsetPattern.matcher(contentType);
266        if (m.find()) {
267            String charset = m.group(1).trim();
268            charset = charset.replace("charset=", "");
269            return validateCharset(charset);
270        }
271        return null;
272    }
273
274    private @Nullable static String validateCharset(@Nullable String cs) {
275        if (cs == null || cs.length() == 0) return null;
276        cs = cs.trim().replaceAll("[\"']", "");
277        try {
278            if (Charset.isSupported(cs)) return cs;
279            cs = cs.toUpperCase(Locale.ENGLISH);
280            if (Charset.isSupported(cs)) return cs;
281        } catch (IllegalCharsetNameException e) {
282            // if our this charset matching fails.... we just take the default
283        }
284        return null;
285    }
286
287    /**
288     * Creates a random string, suitable for use as a mime boundary
289     */
290    static String mimeBoundary() {
291        final StringBuilder mime = StringUtil.borrowBuilder();
292        final Random rand = new Random();
293        for (int i = 0; i < boundaryLength; i++) {
294            mime.append(mimeBoundaryChars[rand.nextInt(mimeBoundaryChars.length)]);
295        }
296        return StringUtil.releaseBuilder(mime);
297    }
298
299    private static @Nullable BomCharset detectCharsetFromBom(final ByteBuffer byteData) {
300        @SuppressWarnings("UnnecessaryLocalVariable") final Buffer buffer = byteData; // .mark and rewind used to return Buffer, now ByteBuffer, so cast for backward compat
301        buffer.mark();
302        byte[] bom = new byte[4];
303        if (byteData.remaining() >= bom.length) {
304            byteData.get(bom);
305            buffer.rewind();
306        }
307        if (bom[0] == 0x00 && bom[1] == 0x00 && bom[2] == (byte) 0xFE && bom[3] == (byte) 0xFF || // BE
308            bom[0] == (byte) 0xFF && bom[1] == (byte) 0xFE && bom[2] == 0x00 && bom[3] == 0x00) { // LE
309            return new BomCharset("UTF-32", false); // and I hope it's on your system
310        } else if (bom[0] == (byte) 0xFE && bom[1] == (byte) 0xFF || // BE
311            bom[0] == (byte) 0xFF && bom[1] == (byte) 0xFE) {
312            return new BomCharset("UTF-16", false); // in all Javas
313        } else if (bom[0] == (byte) 0xEF && bom[1] == (byte) 0xBB && bom[2] == (byte) 0xBF) {
314            return new BomCharset("UTF-8", true); // in all Javas
315            // 16 and 32 decoders consume the BOM to determine be/le; utf-8 should be consumed here
316        }
317        return null;
318    }
319
320    private static class BomCharset {
321        private final String charset;
322        private final boolean offset;
323
324        public BomCharset(String charset, boolean offset) {
325            this.charset = charset;
326            this.offset = offset;
327        }
328    }
329}