001package org.jsoup.parser;
002
003import org.jsoup.nodes.Document;
004import org.jsoup.nodes.Element;
005import org.jsoup.nodes.Node;
006
007import java.io.Reader;
008import java.io.StringReader;
009import java.util.List;
010
011/**
012 Parses HTML or XML into a {@link org.jsoup.nodes.Document}. Generally, it is simpler to use one of the parse methods in
013 {@link org.jsoup.Jsoup}.
014 <p>Note that a Parser instance object is not threadsafe. To reuse a Parser configuration in a multi-threaded
015 environment, use {@link #newInstance()} to make copies. */
016public class Parser {
017    public static final String NamespaceHtml = "http://www.w3.org/1999/xhtml";
018    public static final String NamespaceXml = "http://www.w3.org/XML/1998/namespace";
019    public static final String NamespaceMathml = "http://www.w3.org/1998/Math/MathML";
020    public static final String NamespaceSvg = "http://www.w3.org/2000/svg";
021
022    private TreeBuilder treeBuilder;
023    private ParseErrorList errors;
024    private ParseSettings settings;
025    private boolean trackPosition = false;
026
027    /**
028     * Create a new Parser, using the specified TreeBuilder
029     * @param treeBuilder TreeBuilder to use to parse input into Documents.
030     */
031    public Parser(TreeBuilder treeBuilder) {
032        this.treeBuilder = treeBuilder;
033        settings = treeBuilder.defaultSettings();
034        errors = ParseErrorList.noTracking();
035    }
036
037    /**
038     Creates a new Parser as a deep copy of this; including initializing a new TreeBuilder. Allows independent (multi-threaded) use.
039     @return a copied parser
040     */
041    public Parser newInstance() {
042        return new Parser(this);
043    }
044
045    private Parser(Parser copy) {
046        treeBuilder = copy.treeBuilder.newInstance(); // because extended
047        errors = new ParseErrorList(copy.errors); // only copies size, not contents
048        settings = new ParseSettings(copy.settings);
049        trackPosition = copy.trackPosition;
050    }
051    
052    public Document parseInput(String html, String baseUri) {
053        return treeBuilder.parse(new StringReader(html), baseUri, this);
054    }
055
056    public Document parseInput(Reader inputHtml, String baseUri) {
057        return treeBuilder.parse(inputHtml, baseUri, this);
058    }
059
060    public List<Node> parseFragmentInput(String fragment, Element context, String baseUri) {
061        return treeBuilder.parseFragment(fragment, context, baseUri, this);
062    }
063    // gets & sets
064    /**
065     * Get the TreeBuilder currently in use.
066     * @return current TreeBuilder.
067     */
068    public TreeBuilder getTreeBuilder() {
069        return treeBuilder;
070    }
071
072    /**
073     * Update the TreeBuilder used when parsing content.
074     * @param treeBuilder new TreeBuilder
075     * @return this, for chaining
076     */
077    public Parser setTreeBuilder(TreeBuilder treeBuilder) {
078        this.treeBuilder = treeBuilder;
079        treeBuilder.parser = this;
080        return this;
081    }
082
083    /**
084     * Check if parse error tracking is enabled.
085     * @return current track error state.
086     */
087    public boolean isTrackErrors() {
088        return errors.getMaxSize() > 0;
089    }
090
091    /**
092     * Enable or disable parse error tracking for the next parse.
093     * @param maxErrors the maximum number of errors to track. Set to 0 to disable.
094     * @return this, for chaining
095     */
096    public Parser setTrackErrors(int maxErrors) {
097        errors = maxErrors > 0 ? ParseErrorList.tracking(maxErrors) : ParseErrorList.noTracking();
098        return this;
099    }
100
101    /**
102     * Retrieve the parse errors, if any, from the last parse.
103     * @return list of parse errors, up to the size of the maximum errors tracked.
104     * @see #setTrackErrors(int)
105     */
106    public ParseErrorList getErrors() {
107        return errors;
108    }
109
110    /**
111     Test if position tracking is enabled. If it is, Nodes will have a Position to track where in the original input
112     source they were created from. By default, tracking is not enabled.
113     * @return current track position setting
114     */
115    public boolean isTrackPosition() {
116        return trackPosition;
117    }
118
119    /**
120     Enable or disable source position tracking. If enabled, Nodes will have a Position to track where in the original
121     input source they were created from.
122     @param trackPosition position tracking setting; {@code true} to enable
123     @return this Parser, for chaining
124     */
125    public Parser setTrackPosition(boolean trackPosition) {
126        this.trackPosition = trackPosition;
127        return this;
128    }
129
130    /**
131     Update the ParseSettings of this Parser, to control the case sensitivity of tags and attributes.
132     * @param settings the new settings
133     * @return this Parser
134     */
135    public Parser settings(ParseSettings settings) {
136        this.settings = settings;
137        return this;
138    }
139
140    /**
141     Gets the current ParseSettings for this Parser
142     * @return current ParseSettings
143     */
144    public ParseSettings settings() {
145        return settings;
146    }
147
148    /**
149     (An internal method, visible for Element. For HTML parse, signals that script and style text should be treated as
150     Data Nodes).
151     */
152    public boolean isContentForTagData(String normalName) {
153        return getTreeBuilder().isContentForTagData(normalName);
154    }
155
156    public String defaultNamespace() {
157        return getTreeBuilder().defaultNamespace();
158    }
159
160    // static parse functions below
161    /**
162     * Parse HTML into a Document.
163     *
164     * @param html HTML to parse
165     * @param baseUri base URI of document (i.e. original fetch location), for resolving relative URLs.
166     *
167     * @return parsed Document
168     */
169    public static Document parse(String html, String baseUri) {
170        TreeBuilder treeBuilder = new HtmlTreeBuilder();
171        return treeBuilder.parse(new StringReader(html), baseUri, new Parser(treeBuilder));
172    }
173
174    /**
175     * Parse a fragment of HTML into a list of nodes. The context element, if supplied, supplies parsing context.
176     *
177     * @param fragmentHtml the fragment of HTML to parse
178     * @param context (optional) the element that this HTML fragment is being parsed for (i.e. for inner HTML). This
179     * provides stack context (for implicit element creation).
180     * @param baseUri base URI of document (i.e. original fetch location), for resolving relative URLs.
181     *
182     * @return list of nodes parsed from the input HTML. Note that the context element, if supplied, is not modified.
183     */
184    public static List<Node> parseFragment(String fragmentHtml, Element context, String baseUri) {
185        HtmlTreeBuilder treeBuilder = new HtmlTreeBuilder();
186        return treeBuilder.parseFragment(fragmentHtml, context, baseUri, new Parser(treeBuilder));
187    }
188
189    /**
190     * Parse a fragment of HTML into a list of nodes. The context element, if supplied, supplies parsing context.
191     *
192     * @param fragmentHtml the fragment of HTML to parse
193     * @param context (optional) the element that this HTML fragment is being parsed for (i.e. for inner HTML). This
194     * provides stack context (for implicit element creation).
195     * @param baseUri base URI of document (i.e. original fetch location), for resolving relative URLs.
196     * @param errorList list to add errors to
197     *
198     * @return list of nodes parsed from the input HTML. Note that the context element, if supplied, is not modified.
199     */
200    public static List<Node> parseFragment(String fragmentHtml, Element context, String baseUri, ParseErrorList errorList) {
201        HtmlTreeBuilder treeBuilder = new HtmlTreeBuilder();
202        Parser parser = new Parser(treeBuilder);
203        parser.errors = errorList;
204        return treeBuilder.parseFragment(fragmentHtml, context, baseUri, parser);
205    }
206
207    /**
208     * Parse a fragment of XML into a list of nodes.
209     *
210     * @param fragmentXml the fragment of XML to parse
211     * @param baseUri base URI of document (i.e. original fetch location), for resolving relative URLs.
212     * @return list of nodes parsed from the input XML.
213     */
214    public static List<Node> parseXmlFragment(String fragmentXml, String baseUri) {
215        XmlTreeBuilder treeBuilder = new XmlTreeBuilder();
216        return treeBuilder.parseFragment(fragmentXml, baseUri, new Parser(treeBuilder));
217    }
218
219    /**
220     * Parse a fragment of HTML into the {@code body} of a Document.
221     *
222     * @param bodyHtml fragment of HTML
223     * @param baseUri base URI of document (i.e. original fetch location), for resolving relative URLs.
224     *
225     * @return Document, with empty head, and HTML parsed into body
226     */
227    public static Document parseBodyFragment(String bodyHtml, String baseUri) {
228        Document doc = Document.createShell(baseUri);
229        Element body = doc.body();
230        List<Node> nodeList = parseFragment(bodyHtml, body, baseUri);
231        Node[] nodes = nodeList.toArray(new Node[0]); // the node list gets modified when re-parented
232        for (int i = nodes.length - 1; i > 0; i--) {
233            nodes[i].remove();
234        }
235        for (Node node : nodes) {
236            body.appendChild(node);
237        }
238        return doc;
239    }
240
241    /**
242     * Utility method to unescape HTML entities from a string
243     * @param string HTML escaped string
244     * @param inAttribute if the string is to be escaped in strict mode (as attributes are)
245     * @return an unescaped string
246     */
247    public static String unescapeEntities(String string, boolean inAttribute) {
248        Parser parser = Parser.htmlParser();
249        parser.treeBuilder.initialiseParse(new StringReader(string), "", parser);
250        Tokeniser tokeniser = new Tokeniser(parser.treeBuilder);
251        return tokeniser.unescapeEntities(inAttribute);
252    }
253
254    // builders
255
256    /**
257     * Create a new HTML parser. This parser treats input as HTML5, and enforces the creation of a normalised document,
258     * based on a knowledge of the semantics of the incoming tags.
259     * @return a new HTML parser.
260     */
261    public static Parser htmlParser() {
262        return new Parser(new HtmlTreeBuilder());
263    }
264
265    /**
266     * Create a new XML parser. This parser assumes no knowledge of the incoming tags and does not treat it as HTML,
267     * rather creates a simple tree directly from the input.
268     * @return a new simple XML parser.
269     */
270    public static Parser xmlParser() {
271        return new Parser(new XmlTreeBuilder());
272    }
273}