001package org.jsoup.nodes;
002
003import org.jsoup.Connection;
004import org.jsoup.Jsoup;
005import org.jsoup.helper.DataUtil;
006import org.jsoup.helper.Validate;
007import org.jsoup.internal.StringUtil;
008import org.jsoup.parser.ParseSettings;
009import org.jsoup.parser.Parser;
010import org.jsoup.parser.Tag;
011import org.jsoup.select.Elements;
012import org.jsoup.select.Evaluator;
013import org.jsoup.select.Selector;
014import org.jspecify.annotations.Nullable;
015
016import java.nio.charset.Charset;
017import java.nio.charset.CharsetEncoder;
018import java.util.List;
019
020import static org.jsoup.parser.Parser.NamespaceHtml;
021
022/**
023 A HTML Document.
024
025 @author Jonathan Hedley, jonathan@hedley.net */
026public class Document extends Element {
027    private @Nullable Connection connection; // the connection this doc was fetched from, if any
028    private OutputSettings outputSettings = new OutputSettings();
029    private Parser parser; // the parser used to parse this document
030    private QuirksMode quirksMode = QuirksMode.noQuirks;
031    private final String location;
032    private boolean updateMetaCharset = false;
033
034    /**
035     Create a new, empty Document, in the specified namespace.
036     @param namespace the namespace of this Document's root node.
037     @param baseUri base URI of document
038     @see org.jsoup.Jsoup#parse
039     @see #createShell
040     */
041    public Document(String namespace, String baseUri) {
042        super(Tag.valueOf("#root", namespace, ParseSettings.htmlDefault), baseUri);
043        this.location = baseUri;
044        this.parser = Parser.htmlParser(); // default, but overridable
045    }
046
047    /**
048     Create a new, empty Document, in the HTML namespace.
049     @param baseUri base URI of document
050     @see org.jsoup.Jsoup#parse
051     @see #Document(String namespace, String baseUri)
052     */
053    public Document(String baseUri) {
054        this(NamespaceHtml, baseUri);
055    }
056
057    /**
058     Create a valid, empty shell of a document, suitable for adding more elements to.
059     @param baseUri baseUri of document
060     @return document with html, head, and body elements.
061     */
062    public static Document createShell(String baseUri) {
063        Validate.notNull(baseUri);
064
065        Document doc = new Document(baseUri);
066        doc.parser = doc.parser();
067        Element html = doc.appendElement("html");
068        html.appendElement("head");
069        html.appendElement("body");
070
071        return doc;
072    }
073
074    /**
075     * Get the URL this Document was parsed from. If the starting URL is a redirect,
076     * this will return the final URL from which the document was served from.
077     * <p>Will return an empty string if the location is unknown (e.g. if parsed from a String).
078     * @return location
079     */
080    public String location() {
081        return location;
082    }
083
084    /**
085     Returns the Connection (Request/Response) object that was used to fetch this document, if any; otherwise, a new
086     default Connection object. This can be used to continue a session, preserving settings and cookies, etc.
087     @return the Connection (session) associated with this Document, or an empty one otherwise.
088     @see Connection#newRequest()
089     */
090    public Connection connection() {
091        if (connection == null)
092            return Jsoup.newSession();
093        else
094            return connection;
095    }
096
097    /**
098     * Returns this Document's doctype.
099     * @return document type, or null if not set
100     */
101    public @Nullable DocumentType documentType() {
102        for (Node node : childNodes) {
103            if (node instanceof DocumentType)
104                return (DocumentType) node;
105            else if (!(node instanceof LeafNode)) // scans forward across comments, text, processing instructions etc
106                break;
107        }
108        return null;
109        // todo - add a set document type?
110    }
111
112    /**
113     Find the root HTML element, or create it if it doesn't exist.
114     @return the root HTML element.
115     */
116    private Element htmlEl() {
117        Element el = firstElementChild();
118        while (el != null) {
119            if (el.nameIs("html"))
120                return el;
121            el = el.nextElementSibling();
122        }
123        return appendElement("html");
124    }
125
126    /**
127     Get this document's {@code head} element.
128     <p>
129     As a side-effect, if this Document does not already have a HTML structure, it will be created. If you do not want
130     that, use {@code #selectFirst("head")} instead.
131
132     @return {@code head} element.
133     */
134    public Element head() {
135        final Element html = htmlEl();
136        Element el = html.firstElementChild();
137        while (el != null) {
138            if (el.nameIs("head"))
139                return el;
140            el = el.nextElementSibling();
141        }
142        return html.prependElement("head");
143    }
144
145    /**
146     Get this document's {@code <body>} or {@code <frameset>} element.
147     <p>
148     As a <b>side-effect</b>, if this Document does not already have a HTML structure, it will be created with a {@code
149    <body>} element. If you do not want that, use {@code #selectFirst("body")} instead.
150
151     @return {@code body} element for documents with a {@code <body>}, a new {@code <body>} element if the document
152     had no contents, or the outermost {@code <frameset> element} for frameset documents.
153     */
154    public Element body() {
155        final Element html = htmlEl();
156        Element el = html.firstElementChild();
157        while (el != null) {
158            if (el.nameIs("body") || el.nameIs("frameset"))
159                return el;
160            el = el.nextElementSibling();
161        }
162        return html.appendElement("body");
163    }
164
165    /**
166     Get each of the {@code <form>} elements contained in this document.
167     @return a List of FormElement objects, which will be empty if there are none.
168     @see Elements#forms()
169     @see FormElement#elements()
170     @since 1.15.4
171     */
172    public List<FormElement> forms() {
173        return select("form").forms();
174    }
175
176    /**
177     Selects the first {@link FormElement} in this document that matches the query. If none match, throws an
178     {@link IllegalArgumentException}.
179     @param cssQuery a {@link Selector} CSS query
180     @return the first matching {@code <form>} element
181     @throws IllegalArgumentException if no match is found
182     @since 1.15.4
183     */
184    public FormElement expectForm(String cssQuery) {
185        Elements els = select(cssQuery);
186        for (Element el : els) {
187            if (el instanceof FormElement) return (FormElement) el;
188        }
189        Validate.fail("No form elements matched the query '%s' in the document.", cssQuery);
190        return null; // (not really)
191    }
192
193    /**
194     Get the string contents of the document's {@code title} element.
195     @return Trimmed title, or empty string if none set.
196     */
197    public String title() {
198        // title is a preserve whitespace tag (for document output), but normalised here
199        Element titleEl = head().selectFirst(titleEval);
200        return titleEl != null ? StringUtil.normaliseWhitespace(titleEl.text()).trim() : "";
201    }
202    private static final Evaluator titleEval = new Evaluator.Tag("title");
203
204    /**
205     Set the document's {@code title} element. Updates the existing element, or adds {@code title} to {@code head} if
206     not present
207     @param title string to set as title
208     */
209    public void title(String title) {
210        Validate.notNull(title);
211        Element titleEl = head().selectFirst(titleEval);
212        if (titleEl == null) // add to head
213            titleEl = head().appendElement("title");
214        titleEl.text(title);
215    }
216
217    /**
218     Create a new Element, with this document's base uri. Does not make the new element a child of this document.
219     @param tagName element tag name (e.g. {@code a})
220     @return new element
221     */
222    public Element createElement(String tagName) {
223        return new Element(Tag.valueOf(tagName, parser.defaultNamespace(), ParseSettings.preserveCase), this.baseUri());
224    }
225
226    @Override
227    public String outerHtml() {
228        return super.html(); // no outer wrapper tag
229    }
230
231    /**
232     Set the text of the {@code body} of this document. Any existing nodes within the body will be cleared.
233     @param text unencoded text
234     @return this document
235     */
236    @Override
237    public Element text(String text) {
238        body().text(text); // overridden to not nuke doc structure
239        return this;
240    }
241
    /**
     Get the node name of this document.
     @return the constant string {@code #document}
     */
    @Override
    public String nodeName() {
        return "#document";
    }
246    
247    /**
248     * Sets the charset used in this document. This method is equivalent
249     * to {@link OutputSettings#charset(java.nio.charset.Charset)
250     * OutputSettings.charset(Charset)} but in addition it updates the
251     * charset / encoding element within the document.
252     * 
253     * <p>This enables
254     * {@link #updateMetaCharsetElement(boolean) meta charset update}.</p>
255     * 
256     * <p>If there's no element with charset / encoding information yet it will
257     * be created. Obsolete charset / encoding definitions are removed!</p>
258     * 
259     * <p><b>Elements used:</b></p>
260     * 
261     * <ul>
262     * <li><b>Html:</b> <i>&lt;meta charset="CHARSET"&gt;</i></li>
263     * <li><b>Xml:</b> <i>&lt;?xml version="1.0" encoding="CHARSET"&gt;</i></li>
264     * </ul>
265     * 
266     * @param charset Charset
267     * 
268     * @see #updateMetaCharsetElement(boolean) 
269     * @see OutputSettings#charset(java.nio.charset.Charset) 
270     */
271    public void charset(Charset charset) {
272        updateMetaCharsetElement(true);
273        outputSettings.charset(charset);
274        ensureMetaCharsetElement();
275    }
276    
277    /**
278     * Returns the charset used in this document. This method is equivalent
279     * to {@link OutputSettings#charset()}.
280     * 
281     * @return Current Charset
282     * 
283     * @see OutputSettings#charset() 
284     */
285    public Charset charset() {
286        return outputSettings.charset();
287    }
288    
289    /**
290     * Sets whether the element with charset information in this document is
291     * updated on changes through {@link #charset(java.nio.charset.Charset)
292     * Document.charset(Charset)} or not.
293     * 
294     * <p>If set to <tt>false</tt> <i>(default)</i> there are no elements
295     * modified.</p>
296     * 
297     * @param update If <tt>true</tt> the element updated on charset
298     * changes, <tt>false</tt> if not
299     * 
300     * @see #charset(java.nio.charset.Charset) 
301     */
302    public void updateMetaCharsetElement(boolean update) {
303        this.updateMetaCharset = update;
304    }
305    
306    /**
307     * Returns whether the element with charset information in this document is
308     * updated on changes through {@link #charset(java.nio.charset.Charset)
309     * Document.charset(Charset)} or not.
310     * 
311     * @return Returns <tt>true</tt> if the element is updated on charset
312     * changes, <tt>false</tt> if not
313     */
314    public boolean updateMetaCharsetElement() {
315        return updateMetaCharset;
316    }
317
318    @Override
319    public Document clone() {
320        Document clone = (Document) super.clone();
321        clone.outputSettings = this.outputSettings.clone();
322        return clone;
323    }
324
325    @Override
326    public Document shallowClone() {
327        Document clone = new Document(this.tag().namespace(), baseUri());
328        if (attributes != null)
329            clone.attributes = attributes.clone();
330        clone.outputSettings = this.outputSettings.clone();
331        return clone;
332    }
333    
334    /**
335     * Ensures a meta charset (html) or xml declaration (xml) with the current
336     * encoding used. This only applies with
337     * {@link #updateMetaCharsetElement(boolean) updateMetaCharset} set to
338     * <tt>true</tt>, otherwise this method does nothing.
339     * 
340     * <ul>
341     * <li>An existing element gets updated with the current charset</li>
342     * <li>If there's no element yet it will be inserted</li>
343     * <li>Obsolete elements are removed</li>
344     * </ul>
345     * 
346     * <p><b>Elements used:</b></p>
347     * 
348     * <ul>
349     * <li><b>Html:</b> <i>&lt;meta charset="CHARSET"&gt;</i></li>
350     * <li><b>Xml:</b> <i>&lt;?xml version="1.0" encoding="CHARSET"&gt;</i></li>
351     * </ul>
352     */
353    private void ensureMetaCharsetElement() {
354        if (updateMetaCharset) {
355            OutputSettings.Syntax syntax = outputSettings().syntax();
356
357            if (syntax == OutputSettings.Syntax.html) {
358                Element metaCharset = selectFirst("meta[charset]");
359                if (metaCharset != null) {
360                    metaCharset.attr("charset", charset().displayName());
361                } else {
362                    head().appendElement("meta").attr("charset", charset().displayName());
363                }
364                select("meta[name=charset]").remove(); // Remove obsolete elements
365            } else if (syntax == OutputSettings.Syntax.xml) {
366                Node node = ensureChildNodes().get(0);
367                if (node instanceof XmlDeclaration) {
368                    XmlDeclaration decl = (XmlDeclaration) node;
369                    if (decl.name().equals("xml")) {
370                        decl.attr("encoding", charset().displayName());
371                        if (decl.hasAttr("version"))
372                            decl.attr("version", "1.0");
373                    } else {
374                        decl = new XmlDeclaration("xml", false);
375                        decl.attr("version", "1.0");
376                        decl.attr("encoding", charset().displayName());
377                        prependChild(decl);
378                    }
379                } else {
380                    XmlDeclaration decl = new XmlDeclaration("xml", false);
381                    decl.attr("version", "1.0");
382                    decl.attr("encoding", charset().displayName());
383                    prependChild(decl);
384                }
385            }
386        }
387    }
388    
389
390    /**
391     * A Document's output settings control the form of the text() and html() methods.
392     */
393    public static class OutputSettings implements Cloneable {
394        /**
395         * The output serialization syntax.
396         */
397        public enum Syntax {html, xml}
398
399        private Entities.EscapeMode escapeMode = Entities.EscapeMode.base;
400        private Charset charset;
401        Entities.CoreCharset coreCharset; // fast encoders for ascii and utf8
402        private final ThreadLocal<CharsetEncoder> encoderThreadLocal = new ThreadLocal<>(); // initialized by start of OuterHtmlVisitor
403
404        private boolean prettyPrint = true;
405        private boolean outline = false;
406        private int indentAmount = 1;
407        private int maxPaddingWidth = 30;
408        private Syntax syntax = Syntax.html;
409
410        public OutputSettings() {
411            charset(DataUtil.UTF_8);
412        }
413        
414        /**
415         * Get the document's current HTML escape mode: <code>base</code>, which provides a limited set of named HTML
416         * entities and escapes other characters as numbered entities for maximum compatibility; or <code>extended</code>,
417         * which uses the complete set of HTML named entities.
418         * <p>
419         * The default escape mode is <code>base</code>.
420         * @return the document's current escape mode
421         */
422        public Entities.EscapeMode escapeMode() {
423            return escapeMode;
424        }
425
426        /**
427         * Set the document's escape mode, which determines how characters are escaped when the output character set
428         * does not support a given character:- using either a named or a numbered escape.
429         * @param escapeMode the new escape mode to use
430         * @return the document's output settings, for chaining
431         */
432        public OutputSettings escapeMode(Entities.EscapeMode escapeMode) {
433            this.escapeMode = escapeMode;
434            return this;
435        }
436
437        /**
438         * Get the document's current output charset, which is used to control which characters are escaped when
439         * generating HTML (via the <code>html()</code> methods), and which are kept intact.
440         * <p>
441         * Where possible (when parsing from a URL or File), the document's output charset is automatically set to the
442         * input charset. Otherwise, it defaults to UTF-8.
443         * @return the document's current charset.
444         */
445        public Charset charset() {
446            return charset;
447        }
448
449        /**
450         * Update the document's output charset.
451         * @param charset the new charset to use.
452         * @return the document's output settings, for chaining
453         */
454        public OutputSettings charset(Charset charset) {
455            this.charset = charset;
456            coreCharset = Entities.CoreCharset.byName(charset.name());
457            return this;
458        }
459
460        /**
461         * Update the document's output charset.
462         * @param charset the new charset (by name) to use.
463         * @return the document's output settings, for chaining
464         */
465        public OutputSettings charset(String charset) {
466            charset(Charset.forName(charset));
467            return this;
468        }
469
470        CharsetEncoder prepareEncoder() {
471            // created at start of OuterHtmlVisitor so each pass has own encoder, so OutputSettings can be shared among threads
472            CharsetEncoder encoder = charset.newEncoder();
473            encoderThreadLocal.set(encoder);
474            return encoder;
475        }
476
477        CharsetEncoder encoder() {
478            CharsetEncoder encoder = encoderThreadLocal.get();
479            return encoder != null ? encoder : prepareEncoder();
480        }
481
482        /**
483         * Get the document's current output syntax.
484         * @return current syntax
485         */
486        public Syntax syntax() {
487            return syntax;
488        }
489
490        /**
491         * Set the document's output syntax. Either {@code html}, with empty tags and boolean attributes (etc), or
492         * {@code xml}, with self-closing tags.
493         * <p>When set to {@link Document.OutputSettings.Syntax#xml xml}, the {@link #escapeMode() escapeMode} is
494         * automatically set to {@link Entities.EscapeMode#xhtml}, but may be subsequently changed if desired.</p>
495         * @param syntax serialization syntax
496         * @return the document's output settings, for chaining
497         */
498        public OutputSettings syntax(Syntax syntax) {
499            this.syntax = syntax;
500            if (syntax == Syntax.xml)
501                this.escapeMode(Entities.EscapeMode.xhtml);
502            return this;
503        }
504
505        /**
506         * Get if pretty printing is enabled. Default is true. If disabled, the HTML output methods will not re-format
507         * the output, and the output will generally look like the input.
508         * @return if pretty printing is enabled.
509         */
510        public boolean prettyPrint() {
511            return prettyPrint;
512        }
513
514        /**
515         * Enable or disable pretty printing.
516         * @param pretty new pretty print setting
517         * @return this, for chaining
518         */
519        public OutputSettings prettyPrint(boolean pretty) {
520            prettyPrint = pretty;
521            return this;
522        }
523        
524        /**
525         * Get if outline mode is enabled. Default is false. If enabled, the HTML output methods will consider
526         * all tags as block.
527         * @return if outline mode is enabled.
528         */
529        public boolean outline() {
530            return outline;
531        }
532        
533        /**
534         * Enable or disable HTML outline mode.
535         * @param outlineMode new outline setting
536         * @return this, for chaining
537         */
538        public OutputSettings outline(boolean outlineMode) {
539            outline = outlineMode;
540            return this;
541        }
542
543        /**
544         * Get the current tag indent amount, used when pretty printing.
545         * @return the current indent amount
546         */
547        public int indentAmount() {
548            return indentAmount;
549        }
550
551        /**
552         * Set the indent amount for pretty printing
553         * @param indentAmount number of spaces to use for indenting each level. Must be {@literal >=} 0.
554         * @return this, for chaining
555         */
556        public OutputSettings indentAmount(int indentAmount) {
557            Validate.isTrue(indentAmount >= 0);
558            this.indentAmount = indentAmount;
559            return this;
560        }
561
562        /**
563         * Get the current max padding amount, used when pretty printing
564         * so very deeply nested nodes don't get insane padding amounts.
565         * @return the current indent amount
566         */
567        public int maxPaddingWidth() {
568            return maxPaddingWidth;
569        }
570
571        /**
572         * Set the max padding amount for pretty printing so very deeply nested nodes don't get insane padding amounts.
573         * @param maxPaddingWidth number of spaces to use for indenting each level of nested nodes. Must be {@literal >=} -1.
574         *        Default is 30 and -1 means unlimited.
575         * @return this, for chaining
576         */
577        public OutputSettings maxPaddingWidth(int maxPaddingWidth) {
578            Validate.isTrue(maxPaddingWidth >= -1);
579            this.maxPaddingWidth = maxPaddingWidth;
580            return this;
581        }
582
583        @Override
584        public OutputSettings clone() {
585            OutputSettings clone;
586            try {
587                clone = (OutputSettings) super.clone();
588            } catch (CloneNotSupportedException e) {
589                throw new RuntimeException(e);
590            }
591            clone.charset(charset.name()); // new charset, coreCharset, and charset encoder
592            clone.escapeMode = Entities.EscapeMode.valueOf(escapeMode.name());
593            // indentAmount, maxPaddingWidth, and prettyPrint are primitives so object.clone() will handle
594            return clone;
595        }
596    }
597
598    /**
599     * Get the document's current output settings.
600     * @return the document's current output settings.
601     */
602    public OutputSettings outputSettings() {
603        return outputSettings;
604    }
605
606    /**
607     * Set the document's output settings.
608     * @param outputSettings new output settings.
609     * @return this document, for chaining.
610     */
611    public Document outputSettings(OutputSettings outputSettings) {
612        Validate.notNull(outputSettings);
613        this.outputSettings = outputSettings;
614        return this;
615    }
616
    /**
     * The quirks mode of a document. A new Document defaults to {@code noQuirks}.
     */
    public enum QuirksMode {
        noQuirks, quirks, limitedQuirks
    }
620
621    public QuirksMode quirksMode() {
622        return quirksMode;
623    }
624
625    public Document quirksMode(QuirksMode quirksMode) {
626        this.quirksMode = quirksMode;
627        return this;
628    }
629
630    /**
631     * Get the parser that was used to parse this document.
632     * @return the parser
633     */
634    public Parser parser() {
635        return parser;
636    }
637
638    /**
639     * Set the parser used to create this document. This parser is then used when further parsing within this document
640     * is required.
641     * @param parser the configured parser to use when further parsing is required for this document.
642     * @return this document, for chaining.
643     */
644    public Document parser(Parser parser) {
645        this.parser = parser;
646        return this;
647    }
648
649    /**
650     Set the Connection used to fetch this document. This Connection is used as a session object when further requests are
651     made (e.g. when a form is submitted).
652
653     @param connection to set
654     @return this document, for chaining
655     @see Connection#newRequest()
656     @since 1.14.1
657     */
658    public Document connection(Connection connection) {
659        Validate.notNull(connection);
660        this.connection = connection;
661        return this;
662    }
663}