001package org.jsoup.nodes; 002 003import org.jsoup.Connection; 004import org.jsoup.Jsoup; 005import org.jsoup.helper.DataUtil; 006import org.jsoup.helper.Validate; 007import org.jsoup.internal.StringUtil; 008import org.jsoup.parser.ParseSettings; 009import org.jsoup.parser.Parser; 010import org.jsoup.parser.Tag; 011import org.jsoup.select.Elements; 012import org.jsoup.select.Evaluator; 013import org.jsoup.select.Selector; 014import org.jspecify.annotations.Nullable; 015 016import java.nio.charset.Charset; 017import java.nio.charset.CharsetEncoder; 018import java.util.List; 019 020import static org.jsoup.parser.Parser.NamespaceHtml; 021 022/** 023 A HTML Document. 024 025 @author Jonathan Hedley, jonathan@hedley.net */ 026public class Document extends Element { 027 private @Nullable Connection connection; // the connection this doc was fetched from, if any 028 private OutputSettings outputSettings = new OutputSettings(); 029 private Parser parser; // the parser used to parse this document 030 private QuirksMode quirksMode = QuirksMode.noQuirks; 031 private final String location; 032 private boolean updateMetaCharset = false; 033 034 /** 035 Create a new, empty Document, in the specified namespace. 036 @param namespace the namespace of this Document's root node. 037 @param baseUri base URI of document 038 @see org.jsoup.Jsoup#parse 039 @see #createShell 040 */ 041 public Document(String namespace, String baseUri) { 042 super(Tag.valueOf("#root", namespace, ParseSettings.htmlDefault), baseUri); 043 this.location = baseUri; 044 this.parser = Parser.htmlParser(); // default, but overridable 045 } 046 047 /** 048 Create a new, empty Document, in the HTML namespace. 049 @param baseUri base URI of document 050 @see org.jsoup.Jsoup#parse 051 @see #Document(String namespace, String baseUri) 052 */ 053 public Document(String baseUri) { 054 this(NamespaceHtml, baseUri); 055 } 056 057 /** 058 Create a valid, empty shell of a document, suitable for adding more elements to. 059 @param baseUri baseUri of document 060 @return document with html, head, and body elements. 061 */ 062 public static Document createShell(String baseUri) { 063 Validate.notNull(baseUri); 064 065 Document doc = new Document(baseUri); 066 doc.parser = doc.parser(); 067 Element html = doc.appendElement("html"); 068 html.appendElement("head"); 069 html.appendElement("body"); 070 071 return doc; 072 } 073 074 /** 075 * Get the URL this Document was parsed from. If the starting URL is a redirect, 076 * this will return the final URL from which the document was served from. 077 * <p>Will return an empty string if the location is unknown (e.g. if parsed from a String). 078 * @return location 079 */ 080 public String location() { 081 return location; 082 } 083 084 /** 085 Returns the Connection (Request/Response) object that was used to fetch this document, if any; otherwise, a new 086 default Connection object. This can be used to continue a session, preserving settings and cookies, etc. 087 @return the Connection (session) associated with this Document, or an empty one otherwise. 088 @see Connection#newRequest() 089 */ 090 public Connection connection() { 091 if (connection == null) 092 return Jsoup.newSession(); 093 else 094 return connection; 095 } 096 097 /** 098 * Returns this Document's doctype. 099 * @return document type, or null if not set 100 */ 101 public @Nullable DocumentType documentType() { 102 for (Node node : childNodes) { 103 if (node instanceof DocumentType) 104 return (DocumentType) node; 105 else if (!(node instanceof LeafNode)) // scans forward across comments, text, processing instructions etc 106 break; 107 } 108 return null; 109 // todo - add a set document type? 110 } 111 112 /** 113 Find the root HTML element, or create it if it doesn't exist. 114 @return the root HTML element. 115 */ 116 private Element htmlEl() { 117 Element el = firstElementChild(); 118 while (el != null) { 119 if (el.nameIs("html")) 120 return el; 121 el = el.nextElementSibling(); 122 } 123 return appendElement("html"); 124 } 125 126 /** 127 Get this document's {@code head} element. 128 <p> 129 As a side-effect, if this Document does not already have a HTML structure, it will be created. If you do not want 130 that, use {@code #selectFirst("head")} instead. 131 132 @return {@code head} element. 133 */ 134 public Element head() { 135 final Element html = htmlEl(); 136 Element el = html.firstElementChild(); 137 while (el != null) { 138 if (el.nameIs("head")) 139 return el; 140 el = el.nextElementSibling(); 141 } 142 return html.prependElement("head"); 143 } 144 145 /** 146 Get this document's {@code <body>} or {@code <frameset>} element. 147 <p> 148 As a <b>side-effect</b>, if this Document does not already have a HTML structure, it will be created with a {@code 149 <body>} element. If you do not want that, use {@code #selectFirst("body")} instead. 150 151 @return {@code body} element for documents with a {@code <body>}, a new {@code <body>} element if the document 152 had no contents, or the outermost {@code <frameset> element} for frameset documents. 153 */ 154 public Element body() { 155 final Element html = htmlEl(); 156 Element el = html.firstElementChild(); 157 while (el != null) { 158 if (el.nameIs("body") || el.nameIs("frameset")) 159 return el; 160 el = el.nextElementSibling(); 161 } 162 return html.appendElement("body"); 163 } 164 165 /** 166 Get each of the {@code <form>} elements contained in this document. 167 @return a List of FormElement objects, which will be empty if there are none. 168 @see Elements#forms() 169 @see FormElement#elements() 170 @since 1.15.4 171 */ 172 public List<FormElement> forms() { 173 return select("form").forms(); 174 } 175 176 /** 177 Selects the first {@link FormElement} in this document that matches the query. If none match, throws an 178 {@link IllegalArgumentException}. 179 @param cssQuery a {@link Selector} CSS query 180 @return the first matching {@code <form>} element 181 @throws IllegalArgumentException if no match is found 182 @since 1.15.4 183 */ 184 public FormElement expectForm(String cssQuery) { 185 Elements els = select(cssQuery); 186 for (Element el : els) { 187 if (el instanceof FormElement) return (FormElement) el; 188 } 189 Validate.fail("No form elements matched the query '%s' in the document.", cssQuery); 190 return null; // (not really) 191 } 192 193 /** 194 Get the string contents of the document's {@code title} element. 195 @return Trimmed title, or empty string if none set. 196 */ 197 public String title() { 198 // title is a preserve whitespace tag (for document output), but normalised here 199 Element titleEl = head().selectFirst(titleEval); 200 return titleEl != null ? StringUtil.normaliseWhitespace(titleEl.text()).trim() : ""; 201 } 202 private static final Evaluator titleEval = new Evaluator.Tag("title"); 203 204 /** 205 Set the document's {@code title} element. Updates the existing element, or adds {@code title} to {@code head} if 206 not present 207 @param title string to set as title 208 */ 209 public void title(String title) { 210 Validate.notNull(title); 211 Element titleEl = head().selectFirst(titleEval); 212 if (titleEl == null) // add to head 213 titleEl = head().appendElement("title"); 214 titleEl.text(title); 215 } 216 217 /** 218 Create a new Element, with this document's base uri. Does not make the new element a child of this document. 219 @param tagName element tag name (e.g. {@code a}) 220 @return new element 221 */ 222 public Element createElement(String tagName) { 223 return new Element(Tag.valueOf(tagName, parser.defaultNamespace(), ParseSettings.preserveCase), this.baseUri()); 224 } 225 226 @Override 227 public String outerHtml() { 228 return super.html(); // no outer wrapper tag 229 } 230 231 /** 232 Set the text of the {@code body} of this document. Any existing nodes within the body will be cleared. 233 @param text unencoded text 234 @return this document 235 */ 236 @Override 237 public Element text(String text) { 238 body().text(text); // overridden to not nuke doc structure 239 return this; 240 } 241 242 @Override 243 public String nodeName() { 244 return "#document"; 245 } 246 247 /** 248 * Sets the charset used in this document. This method is equivalent 249 * to {@link OutputSettings#charset(java.nio.charset.Charset) 250 * OutputSettings.charset(Charset)} but in addition it updates the 251 * charset / encoding element within the document. 252 * 253 * <p>This enables 254 * {@link #updateMetaCharsetElement(boolean) meta charset update}.</p> 255 * 256 * <p>If there's no element with charset / encoding information yet it will 257 * be created. Obsolete charset / encoding definitions are removed!</p> 258 * 259 * <p><b>Elements used:</b></p> 260 * 261 * <ul> 262 * <li><b>Html:</b> <i><meta charset="CHARSET"></i></li> 263 * <li><b>Xml:</b> <i><?xml version="1.0" encoding="CHARSET"></i></li> 264 * </ul> 265 * 266 * @param charset Charset 267 * 268 * @see #updateMetaCharsetElement(boolean) 269 * @see OutputSettings#charset(java.nio.charset.Charset) 270 */ 271 public void charset(Charset charset) { 272 updateMetaCharsetElement(true); 273 outputSettings.charset(charset); 274 ensureMetaCharsetElement(); 275 } 276 277 /** 278 * Returns the charset used in this document. This method is equivalent 279 * to {@link OutputSettings#charset()}. 280 * 281 * @return Current Charset 282 * 283 * @see OutputSettings#charset() 284 */ 285 public Charset charset() { 286 return outputSettings.charset(); 287 } 288 289 /** 290 * Sets whether the element with charset information in this document is 291 * updated on changes through {@link #charset(java.nio.charset.Charset) 292 * Document.charset(Charset)} or not. 293 * 294 * <p>If set to <tt>false</tt> <i>(default)</i> there are no elements 295 * modified.</p> 296 * 297 * @param update If <tt>true</tt> the element updated on charset 298 * changes, <tt>false</tt> if not 299 * 300 * @see #charset(java.nio.charset.Charset) 301 */ 302 public void updateMetaCharsetElement(boolean update) { 303 this.updateMetaCharset = update; 304 } 305 306 /** 307 * Returns whether the element with charset information in this document is 308 * updated on changes through {@link #charset(java.nio.charset.Charset) 309 * Document.charset(Charset)} or not. 310 * 311 * @return Returns <tt>true</tt> if the element is updated on charset 312 * changes, <tt>false</tt> if not 313 */ 314 public boolean updateMetaCharsetElement() { 315 return updateMetaCharset; 316 } 317 318 @Override 319 public Document clone() { 320 Document clone = (Document) super.clone(); 321 clone.outputSettings = this.outputSettings.clone(); 322 return clone; 323 } 324 325 @Override 326 public Document shallowClone() { 327 Document clone = new Document(this.tag().namespace(), baseUri()); 328 if (attributes != null) 329 clone.attributes = attributes.clone(); 330 clone.outputSettings = this.outputSettings.clone(); 331 return clone; 332 } 333 334 /** 335 * Ensures a meta charset (html) or xml declaration (xml) with the current 336 * encoding used. This only applies with 337 * {@link #updateMetaCharsetElement(boolean) updateMetaCharset} set to 338 * <tt>true</tt>, otherwise this method does nothing. 339 * 340 * <ul> 341 * <li>An existing element gets updated with the current charset</li> 342 * <li>If there's no element yet it will be inserted</li> 343 * <li>Obsolete elements are removed</li> 344 * </ul> 345 * 346 * <p><b>Elements used:</b></p> 347 * 348 * <ul> 349 * <li><b>Html:</b> <i><meta charset="CHARSET"></i></li> 350 * <li><b>Xml:</b> <i><?xml version="1.0" encoding="CHARSET"></i></li> 351 * </ul> 352 */ 353 private void ensureMetaCharsetElement() { 354 if (updateMetaCharset) { 355 OutputSettings.Syntax syntax = outputSettings().syntax(); 356 357 if (syntax == OutputSettings.Syntax.html) { 358 Element metaCharset = selectFirst("meta[charset]"); 359 if (metaCharset != null) { 360 metaCharset.attr("charset", charset().displayName()); 361 } else { 362 head().appendElement("meta").attr("charset", charset().displayName()); 363 } 364 select("meta[name=charset]").remove(); // Remove obsolete elements 365 } else if (syntax == OutputSettings.Syntax.xml) { 366 Node node = ensureChildNodes().get(0); 367 if (node instanceof XmlDeclaration) { 368 XmlDeclaration decl = (XmlDeclaration) node; 369 if (decl.name().equals("xml")) { 370 decl.attr("encoding", charset().displayName()); 371 if (decl.hasAttr("version")) 372 decl.attr("version", "1.0"); 373 } else { 374 decl = new XmlDeclaration("xml", false); 375 decl.attr("version", "1.0"); 376 decl.attr("encoding", charset().displayName()); 377 prependChild(decl); 378 } 379 } else { 380 XmlDeclaration decl = new XmlDeclaration("xml", false); 381 decl.attr("version", "1.0"); 382 decl.attr("encoding", charset().displayName()); 383 prependChild(decl); 384 } 385 } 386 } 387 } 388 389 390 /** 391 * A Document's output settings control the form of the text() and html() methods. 392 */ 393 public static class OutputSettings implements Cloneable { 394 /** 395 * The output serialization syntax. 396 */ 397 public enum Syntax {html, xml} 398 399 private Entities.EscapeMode escapeMode = Entities.EscapeMode.base; 400 private Charset charset; 401 Entities.CoreCharset coreCharset; // fast encoders for ascii and utf8 402 private final ThreadLocal<CharsetEncoder> encoderThreadLocal = new ThreadLocal<>(); // initialized by start of OuterHtmlVisitor 403 404 private boolean prettyPrint = true; 405 private boolean outline = false; 406 private int indentAmount = 1; 407 private int maxPaddingWidth = 30; 408 private Syntax syntax = Syntax.html; 409 410 public OutputSettings() { 411 charset(DataUtil.UTF_8); 412 } 413 414 /** 415 * Get the document's current HTML escape mode: <code>base</code>, which provides a limited set of named HTML 416 * entities and escapes other characters as numbered entities for maximum compatibility; or <code>extended</code>, 417 * which uses the complete set of HTML named entities. 418 * <p> 419 * The default escape mode is <code>base</code>. 420 * @return the document's current escape mode 421 */ 422 public Entities.EscapeMode escapeMode() { 423 return escapeMode; 424 } 425 426 /** 427 * Set the document's escape mode, which determines how characters are escaped when the output character set 428 * does not support a given character:- using either a named or a numbered escape. 429 * @param escapeMode the new escape mode to use 430 * @return the document's output settings, for chaining 431 */ 432 public OutputSettings escapeMode(Entities.EscapeMode escapeMode) { 433 this.escapeMode = escapeMode; 434 return this; 435 } 436 437 /** 438 * Get the document's current output charset, which is used to control which characters are escaped when 439 * generating HTML (via the <code>html()</code> methods), and which are kept intact. 440 * <p> 441 * Where possible (when parsing from a URL or File), the document's output charset is automatically set to the 442 * input charset. Otherwise, it defaults to UTF-8. 443 * @return the document's current charset. 444 */ 445 public Charset charset() { 446 return charset; 447 } 448 449 /** 450 * Update the document's output charset. 451 * @param charset the new charset to use. 452 * @return the document's output settings, for chaining 453 */ 454 public OutputSettings charset(Charset charset) { 455 this.charset = charset; 456 coreCharset = Entities.CoreCharset.byName(charset.name()); 457 return this; 458 } 459 460 /** 461 * Update the document's output charset. 462 * @param charset the new charset (by name) to use. 463 * @return the document's output settings, for chaining 464 */ 465 public OutputSettings charset(String charset) { 466 charset(Charset.forName(charset)); 467 return this; 468 } 469 470 CharsetEncoder prepareEncoder() { 471 // created at start of OuterHtmlVisitor so each pass has own encoder, so OutputSettings can be shared among threads 472 CharsetEncoder encoder = charset.newEncoder(); 473 encoderThreadLocal.set(encoder); 474 return encoder; 475 } 476 477 CharsetEncoder encoder() { 478 CharsetEncoder encoder = encoderThreadLocal.get(); 479 return encoder != null ? encoder : prepareEncoder(); 480 } 481 482 /** 483 * Get the document's current output syntax. 484 * @return current syntax 485 */ 486 public Syntax syntax() { 487 return syntax; 488 } 489 490 /** 491 * Set the document's output syntax. Either {@code html}, with empty tags and boolean attributes (etc), or 492 * {@code xml}, with self-closing tags. 493 * <p>When set to {@link Document.OutputSettings.Syntax#xml xml}, the {@link #escapeMode() escapeMode} is 494 * automatically set to {@link Entities.EscapeMode#xhtml}, but may be subsequently changed if desired.</p> 495 * @param syntax serialization syntax 496 * @return the document's output settings, for chaining 497 */ 498 public OutputSettings syntax(Syntax syntax) { 499 this.syntax = syntax; 500 if (syntax == Syntax.xml) 501 this.escapeMode(Entities.EscapeMode.xhtml); 502 return this; 503 } 504 505 /** 506 * Get if pretty printing is enabled. Default is true. If disabled, the HTML output methods will not re-format 507 * the output, and the output will generally look like the input. 508 * @return if pretty printing is enabled. 509 */ 510 public boolean prettyPrint() { 511 return prettyPrint; 512 } 513 514 /** 515 * Enable or disable pretty printing. 516 * @param pretty new pretty print setting 517 * @return this, for chaining 518 */ 519 public OutputSettings prettyPrint(boolean pretty) { 520 prettyPrint = pretty; 521 return this; 522 } 523 524 /** 525 * Get if outline mode is enabled. Default is false. If enabled, the HTML output methods will consider 526 * all tags as block. 527 * @return if outline mode is enabled. 528 */ 529 public boolean outline() { 530 return outline; 531 } 532 533 /** 534 * Enable or disable HTML outline mode. 535 * @param outlineMode new outline setting 536 * @return this, for chaining 537 */ 538 public OutputSettings outline(boolean outlineMode) { 539 outline = outlineMode; 540 return this; 541 } 542 543 /** 544 * Get the current tag indent amount, used when pretty printing. 545 * @return the current indent amount 546 */ 547 public int indentAmount() { 548 return indentAmount; 549 } 550 551 /** 552 * Set the indent amount for pretty printing 553 * @param indentAmount number of spaces to use for indenting each level. Must be {@literal >=} 0. 554 * @return this, for chaining 555 */ 556 public OutputSettings indentAmount(int indentAmount) { 557 Validate.isTrue(indentAmount >= 0); 558 this.indentAmount = indentAmount; 559 return this; 560 } 561 562 /** 563 * Get the current max padding amount, used when pretty printing 564 * so very deeply nested nodes don't get insane padding amounts. 565 * @return the current indent amount 566 */ 567 public int maxPaddingWidth() { 568 return maxPaddingWidth; 569 } 570 571 /** 572 * Set the max padding amount for pretty printing so very deeply nested nodes don't get insane padding amounts. 573 * @param maxPaddingWidth number of spaces to use for indenting each level of nested nodes. Must be {@literal >=} -1. 574 * Default is 30 and -1 means unlimited. 575 * @return this, for chaining 576 */ 577 public OutputSettings maxPaddingWidth(int maxPaddingWidth) { 578 Validate.isTrue(maxPaddingWidth >= -1); 579 this.maxPaddingWidth = maxPaddingWidth; 580 return this; 581 } 582 583 @Override 584 public OutputSettings clone() { 585 OutputSettings clone; 586 try { 587 clone = (OutputSettings) super.clone(); 588 } catch (CloneNotSupportedException e) { 589 throw new RuntimeException(e); 590 } 591 clone.charset(charset.name()); // new charset, coreCharset, and charset encoder 592 clone.escapeMode = Entities.EscapeMode.valueOf(escapeMode.name()); 593 // indentAmount, maxPaddingWidth, and prettyPrint are primitives so object.clone() will handle 594 return clone; 595 } 596 } 597 598 /** 599 * Get the document's current output settings. 600 * @return the document's current output settings. 601 */ 602 public OutputSettings outputSettings() { 603 return outputSettings; 604 } 605 606 /** 607 * Set the document's output settings. 608 * @param outputSettings new output settings. 609 * @return this document, for chaining. 610 */ 611 public Document outputSettings(OutputSettings outputSettings) { 612 Validate.notNull(outputSettings); 613 this.outputSettings = outputSettings; 614 return this; 615 } 616 617 public enum QuirksMode { 618 noQuirks, quirks, limitedQuirks 619 } 620 621 public QuirksMode quirksMode() { 622 return quirksMode; 623 } 624 625 public Document quirksMode(QuirksMode quirksMode) { 626 this.quirksMode = quirksMode; 627 return this; 628 } 629 630 /** 631 * Get the parser that was used to parse this document. 632 * @return the parser 633 */ 634 public Parser parser() { 635 return parser; 636 } 637 638 /** 639 * Set the parser used to create this document. This parser is then used when further parsing within this document 640 * is required. 641 * @param parser the configured parser to use when further parsing is required for this document. 642 * @return this document, for chaining. 643 */ 644 public Document parser(Parser parser) { 645 this.parser = parser; 646 return this; 647 } 648 649 /** 650 Set the Connection used to fetch this document. This Connection is used as a session object when further requests are 651 made (e.g. when a form is submitted). 652 653 @param connection to set 654 @return this document, for chaining 655 @see Connection#newRequest() 656 @since 1.14.1 657 */ 658 public Document connection(Connection connection) { 659 Validate.notNull(connection); 660 this.connection = connection; 661 return this; 662 } 663}