001package org.jsoup.helper; 002 003import org.jsoup.internal.ControllableInputStream; 004import org.jsoup.internal.Normalizer; 005import org.jsoup.internal.SharedConstants; 006import org.jsoup.internal.StringUtil; 007import org.jsoup.nodes.Comment; 008import org.jsoup.nodes.Document; 009import org.jsoup.nodes.Element; 010import org.jsoup.nodes.Node; 011import org.jsoup.nodes.XmlDeclaration; 012import org.jsoup.parser.Parser; 013import org.jsoup.select.Elements; 014import org.jspecify.annotations.Nullable; 015 016import java.io.BufferedReader; 017import java.io.CharArrayReader; 018import java.io.File; 019import java.io.FileInputStream; 020import java.io.IOException; 021import java.io.InputStream; 022import java.io.InputStreamReader; 023import java.io.OutputStream; 024import java.io.UncheckedIOException; 025import java.nio.Buffer; 026import java.nio.ByteBuffer; 027import java.nio.CharBuffer; 028import java.nio.charset.Charset; 029import java.nio.charset.IllegalCharsetNameException; 030import java.util.Locale; 031import java.util.Random; 032import java.util.regex.Matcher; 033import java.util.regex.Pattern; 034import java.util.zip.GZIPInputStream; 035 036import static org.jsoup.internal.SharedConstants.DefaultBufferSize; 037 038/** 039 * Internal static utilities for handling data. 040 * 041 */ 042@SuppressWarnings("CharsetObjectCanBeUsed") 043public final class DataUtil { 044 private static final Pattern charsetPattern = Pattern.compile("(?i)\\bcharset=\\s*(?:[\"'])?([^\\s,;\"']*)"); 045 public static final Charset UTF_8 = Charset.forName("UTF-8"); // Don't use StandardCharsets, as those only appear in Android API 19, and we target 10. 046 static final String defaultCharsetName = UTF_8.name(); // used if not found in header or meta charset 047 private static final int firstReadBufferSize = 1024 * 5; 048 private static final char[] mimeBoundaryChars = 049 "-_1234567890abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ".toCharArray(); 050 static final int boundaryLength = 32; 051 052 private DataUtil() {} 053 054 /** 055 * Loads and parses a file to a Document, with the HtmlParser. Files that are compressed with gzip (and end in {@code .gz} or {@code .z}) 056 * are supported in addition to uncompressed files. 057 * 058 * @param file file to load 059 * @param charsetName (optional) character set of input; specify {@code null} to attempt to autodetect. A BOM in 060 * the file will always override this setting. 061 * @param baseUri base URI of document, to resolve relative links against 062 * @return Document 063 * @throws IOException on IO error 064 */ 065 public static Document load(File file, @Nullable String charsetName, String baseUri) throws IOException { 066 return load(file, charsetName, baseUri, Parser.htmlParser()); 067 } 068 069 /** 070 * Loads and parses a file to a Document. Files that are compressed with gzip (and end in {@code .gz} or {@code .z}) 071 * are supported in addition to uncompressed files. 072 * 073 * @param file file to load 074 * @param charsetName (optional) character set of input; specify {@code null} to attempt to autodetect. A BOM in 075 * the file will always override this setting. 076 * @param baseUri base URI of document, to resolve relative links against 077 * @param parser alternate {@link Parser#xmlParser() parser} to use. 078 079 * @return Document 080 * @throws IOException on IO error 081 * @since 1.14.2 082 */ 083 public static Document load(File file, @Nullable String charsetName, String baseUri, Parser parser) throws IOException { 084 InputStream stream = new FileInputStream(file); 085 String name = Normalizer.lowerCase(file.getName()); 086 if (name.endsWith(".gz") || name.endsWith(".z")) { 087 // unfortunately file input streams don't support marks (why not?), so we will close and reopen after read 088 boolean zipped; 089 try { 090 zipped = (stream.read() == 0x1f && stream.read() == 0x8b); // gzip magic bytes 091 } finally { 092 stream.close(); 093 094 } 095 stream = zipped ? new GZIPInputStream(new FileInputStream(file)) : new FileInputStream(file); 096 } 097 return parseInputStream(stream, charsetName, baseUri, parser); 098 } 099 100 /** 101 * Parses a Document from an input steam. 102 * @param in input stream to parse. The stream will be closed after reading. 103 * @param charsetName character set of input (optional) 104 * @param baseUri base URI of document, to resolve relative links against 105 * @return Document 106 * @throws IOException on IO error 107 */ 108 public static Document load(InputStream in, @Nullable String charsetName, String baseUri) throws IOException { 109 return parseInputStream(in, charsetName, baseUri, Parser.htmlParser()); 110 } 111 112 /** 113 * Parses a Document from an input steam, using the provided Parser. 114 * @param in input stream to parse. The stream will be closed after reading. 115 * @param charsetName character set of input (optional) 116 * @param baseUri base URI of document, to resolve relative links against 117 * @param parser alternate {@link Parser#xmlParser() parser} to use. 118 * @return Document 119 * @throws IOException on IO error 120 */ 121 public static Document load(InputStream in, @Nullable String charsetName, String baseUri, Parser parser) throws IOException { 122 return parseInputStream(in, charsetName, baseUri, parser); 123 } 124 125 /** 126 * Writes the input stream to the output stream. Doesn't close them. 127 * @param in input stream to read from 128 * @param out output stream to write to 129 * @throws IOException on IO error 130 */ 131 static void crossStreams(final InputStream in, final OutputStream out) throws IOException { 132 final byte[] buffer = new byte[DefaultBufferSize]; 133 int len; 134 while ((len = in.read(buffer)) != -1) { 135 out.write(buffer, 0, len); 136 } 137 } 138 139 static Document parseInputStream(@Nullable InputStream input, @Nullable String charsetName, String baseUri, Parser parser) throws IOException { 140 if (input == null) // empty body 141 return new Document(baseUri); 142 input = ControllableInputStream.wrap(input, DefaultBufferSize, 0); 143 144 @Nullable Document doc = null; 145 146 // read the start of the stream and look for a BOM or meta charset 147 try { 148 input.mark(DefaultBufferSize); 149 ByteBuffer firstBytes = readToByteBuffer(input, firstReadBufferSize - 1); // -1 because we read one more to see if completed. First read is < buffer size, so can't be invalid. 150 boolean fullyRead = (input.read() == -1); 151 input.reset(); 152 153 // look for BOM - overrides any other header or input 154 BomCharset bomCharset = detectCharsetFromBom(firstBytes); 155 if (bomCharset != null) 156 charsetName = bomCharset.charset; 157 158 if (charsetName == null) { // determine from meta. safe first parse as UTF-8 159 try { 160 CharBuffer defaultDecoded = UTF_8.decode(firstBytes); 161 if (defaultDecoded.hasArray()) 162 doc = parser.parseInput(new CharArrayReader(defaultDecoded.array(), defaultDecoded.arrayOffset(), defaultDecoded.limit()), baseUri); 163 else 164 doc = parser.parseInput(defaultDecoded.toString(), baseUri); 165 } catch (UncheckedIOException e) { 166 throw e.getCause(); 167 } 168 169 // look for <meta http-equiv="Content-Type" content="text/html;charset=gb2312"> or HTML5 <meta charset="gb2312"> 170 Elements metaElements = doc.select("meta[http-equiv=content-type], meta[charset]"); 171 String foundCharset = null; // if not found, will keep utf-8 as best attempt 172 for (Element meta : metaElements) { 173 if (meta.hasAttr("http-equiv")) 174 foundCharset = getCharsetFromContentType(meta.attr("content")); 175 if (foundCharset == null && meta.hasAttr("charset")) 176 foundCharset = meta.attr("charset"); 177 if (foundCharset != null) 178 break; 179 } 180 181 // look for <?xml encoding='ISO-8859-1'?> 182 if (foundCharset == null && doc.childNodeSize() > 0) { 183 Node first = doc.childNode(0); 184 XmlDeclaration decl = null; 185 if (first instanceof XmlDeclaration) 186 decl = (XmlDeclaration) first; 187 else if (first instanceof Comment) { 188 Comment comment = (Comment) first; 189 if (comment.isXmlDeclaration()) 190 decl = comment.asXmlDeclaration(); 191 } 192 if (decl != null) { 193 if (decl.name().equalsIgnoreCase("xml")) 194 foundCharset = decl.attr("encoding"); 195 } 196 } 197 foundCharset = validateCharset(foundCharset); 198 if (foundCharset != null && !foundCharset.equalsIgnoreCase(defaultCharsetName)) { // need to re-decode. (case insensitive check here to match how validate works) 199 foundCharset = foundCharset.trim().replaceAll("[\"']", ""); 200 charsetName = foundCharset; 201 doc = null; 202 } else if (!fullyRead) { 203 doc = null; 204 } 205 } else { // specified by content type header (or by user on file load) 206 Validate.notEmpty(charsetName, "Must set charset arg to character set of file to parse. Set to null to attempt to detect from HTML"); 207 } 208 if (doc == null) { 209 if (charsetName == null) 210 charsetName = defaultCharsetName; 211 BufferedReader reader = new BufferedReader(new InputStreamReader(input, Charset.forName(charsetName)), DefaultBufferSize); // Android level does not allow us try-with-resources 212 try { 213 if (bomCharset != null && bomCharset.offset) { // creating the buffered reader ignores the input pos, so must skip here 214 long skipped = reader.skip(1); 215 Validate.isTrue(skipped == 1); // WTF if this fails. 216 } 217 try { 218 doc = parser.parseInput(reader, baseUri); 219 } catch (UncheckedIOException e) { 220 // io exception when parsing (not seen before because reading the stream as we go) 221 throw e.getCause(); 222 } 223 Charset charset = charsetName.equals(defaultCharsetName) ? UTF_8 : Charset.forName(charsetName); 224 doc.outputSettings().charset(charset); 225 if (!charset.canEncode()) { 226 // some charsets can read but not encode; switch to an encodable charset and update the meta el 227 doc.charset(UTF_8); 228 } 229 } 230 finally { 231 reader.close(); 232 } 233 } 234 } 235 finally { 236 input.close(); 237 } 238 return doc; 239 } 240 241 /** 242 * Read the input stream into a byte buffer. To deal with slow input streams, you may interrupt the thread this 243 * method is executing on. The data read until being interrupted will be available. 244 * @param inStream the input stream to read from 245 * @param maxSize the maximum size in bytes to read from the stream. Set to 0 to be unlimited. 246 * @return the filled byte buffer 247 * @throws IOException if an exception occurs whilst reading from the input stream. 248 */ 249 public static ByteBuffer readToByteBuffer(InputStream inStream, int maxSize) throws IOException { 250 return ControllableInputStream.readToByteBuffer(inStream, maxSize); 251 } 252 253 static ByteBuffer emptyByteBuffer() { 254 return ByteBuffer.allocate(0); 255 } 256 257 /** 258 * Parse out a charset from a content type header. If the charset is not supported, returns null (so the default 259 * will kick in.) 260 * @param contentType e.g. "text/html; charset=EUC-JP" 261 * @return "EUC-JP", or null if not found. Charset is trimmed and uppercased. 262 */ 263 static @Nullable String getCharsetFromContentType(@Nullable String contentType) { 264 if (contentType == null) return null; 265 Matcher m = charsetPattern.matcher(contentType); 266 if (m.find()) { 267 String charset = m.group(1).trim(); 268 charset = charset.replace("charset=", ""); 269 return validateCharset(charset); 270 } 271 return null; 272 } 273 274 private @Nullable static String validateCharset(@Nullable String cs) { 275 if (cs == null || cs.length() == 0) return null; 276 cs = cs.trim().replaceAll("[\"']", ""); 277 try { 278 if (Charset.isSupported(cs)) return cs; 279 cs = cs.toUpperCase(Locale.ENGLISH); 280 if (Charset.isSupported(cs)) return cs; 281 } catch (IllegalCharsetNameException e) { 282 // if our this charset matching fails.... we just take the default 283 } 284 return null; 285 } 286 287 /** 288 * Creates a random string, suitable for use as a mime boundary 289 */ 290 static String mimeBoundary() { 291 final StringBuilder mime = StringUtil.borrowBuilder(); 292 final Random rand = new Random(); 293 for (int i = 0; i < boundaryLength; i++) { 294 mime.append(mimeBoundaryChars[rand.nextInt(mimeBoundaryChars.length)]); 295 } 296 return StringUtil.releaseBuilder(mime); 297 } 298 299 private static @Nullable BomCharset detectCharsetFromBom(final ByteBuffer byteData) { 300 @SuppressWarnings("UnnecessaryLocalVariable") final Buffer buffer = byteData; // .mark and rewind used to return Buffer, now ByteBuffer, so cast for backward compat 301 buffer.mark(); 302 byte[] bom = new byte[4]; 303 if (byteData.remaining() >= bom.length) { 304 byteData.get(bom); 305 buffer.rewind(); 306 } 307 if (bom[0] == 0x00 && bom[1] == 0x00 && bom[2] == (byte) 0xFE && bom[3] == (byte) 0xFF || // BE 308 bom[0] == (byte) 0xFF && bom[1] == (byte) 0xFE && bom[2] == 0x00 && bom[3] == 0x00) { // LE 309 return new BomCharset("UTF-32", false); // and I hope it's on your system 310 } else if (bom[0] == (byte) 0xFE && bom[1] == (byte) 0xFF || // BE 311 bom[0] == (byte) 0xFF && bom[1] == (byte) 0xFE) { 312 return new BomCharset("UTF-16", false); // in all Javas 313 } else if (bom[0] == (byte) 0xEF && bom[1] == (byte) 0xBB && bom[2] == (byte) 0xBF) { 314 return new BomCharset("UTF-8", true); // in all Javas 315 // 16 and 32 decoders consume the BOM to determine be/le; utf-8 should be consumed here 316 } 317 return null; 318 } 319 320 private static class BomCharset { 321 private final String charset; 322 private final boolean offset; 323 324 public BomCharset(String charset, boolean offset) { 325 this.charset = charset; 326 this.offset = offset; 327 } 328 } 329}