001package org.jsoup.parser;
002
003import org.jsoup.helper.Validate;
004import org.jsoup.nodes.CDataNode;
005import org.jsoup.nodes.Comment;
006import org.jsoup.nodes.Document;
007import org.jsoup.nodes.DocumentType;
008import org.jsoup.nodes.Element;
009import org.jsoup.nodes.Entities;
010import org.jsoup.nodes.LeafNode;
011import org.jsoup.nodes.Node;
012import org.jsoup.nodes.TextNode;
013import org.jsoup.nodes.XmlDeclaration;
014
015import java.io.Reader;
016import java.io.StringReader;
017import java.util.List;
018
019import static org.jsoup.parser.Parser.NamespaceXml;
020
021/**
022 * Use the {@code XmlTreeBuilder} when you want to parse XML without any of the HTML DOM rules being applied to the
023 * document.
024 * <p>Usage example: {@code Document xmlDoc = Jsoup.parse(html, baseUrl, Parser.xmlParser());}</p>
025 *
026 * @author Jonathan Hedley
027 */
028public class XmlTreeBuilder extends TreeBuilder {
029    @Override ParseSettings defaultSettings() {
030        return ParseSettings.preserveCase;
031    }
032
033    @Override
034    protected void initialiseParse(Reader input, String baseUri, Parser parser) {
035        super.initialiseParse(input, baseUri, parser);
036        stack.add(doc); // place the document onto the stack. differs from HtmlTreeBuilder (not on stack). Note not push()ed, so not onNodeInserted.
037        doc.outputSettings()
038            .syntax(Document.OutputSettings.Syntax.xml)
039            .escapeMode(Entities.EscapeMode.xhtml)
040            .prettyPrint(false); // as XML, we don't understand what whitespace is significant or not
041    }
042
043    Document parse(Reader input, String baseUri) {
044        return parse(input, baseUri, new Parser(this));
045    }
046
047    Document parse(String input, String baseUri) {
048        return parse(new StringReader(input), baseUri, new Parser(this));
049    }
050
051    @Override
052    XmlTreeBuilder newInstance() {
053        return new XmlTreeBuilder();
054    }
055
056    @Override public String defaultNamespace() {
057        return NamespaceXml;
058    }
059
060    @Override
061    protected boolean process(Token token) {
062        currentToken = token;
063
064        // start tag, end tag, doctype, comment, character, eof
065        switch (token.type) {
066            case StartTag:
067                insertElementFor(token.asStartTag());
068                break;
069            case EndTag:
070                popStackToClose(token.asEndTag());
071                break;
072            case Comment:
073                insertCommentFor(token.asComment());
074                break;
075            case Character:
076                insertCharacterFor(token.asCharacter());
077                break;
078            case Doctype:
079                insertDoctypeFor(token.asDoctype());
080                break;
081            case EOF: // could put some normalisation here if desired
082                break;
083            default:
084                Validate.fail("Unexpected token type: " + token.type);
085        }
086        return true;
087    }
088
089    void insertElementFor(Token.StartTag startTag) {
090        Tag tag = tagFor(startTag.name(), settings);
091        if (startTag.attributes != null)
092            startTag.attributes.deduplicate(settings);
093
094        Element el = new Element(tag, null, settings.normalizeAttributes(startTag.attributes));
095        currentElement().appendChild(el);
096        push(el);
097
098        if (startTag.isSelfClosing()) {
099            tag.setSelfClosing();
100            pop(); // push & pop ensures onNodeInserted & onNodeClosed
101        }
102    }
103
104    void insertLeafNode(LeafNode node) {
105        currentElement().appendChild(node);
106        onNodeInserted(node);
107    }
108
109    void insertCommentFor(Token.Comment commentToken) {
110        Comment comment = new Comment(commentToken.getData());
111        LeafNode insert = comment;
112        if (commentToken.bogus && comment.isXmlDeclaration()) {
113            // xml declarations are emitted as bogus comments (which is right for html, but not xml)
114            // so we do a bit of a hack and parse the data as an element to pull the attributes out
115            // todo - refactor this to parse more appropriately
116            XmlDeclaration decl = comment.asXmlDeclaration(); // else, we couldn't parse it as a decl, so leave as a comment
117            if (decl != null)
118                insert = decl;
119        }
120        insertLeafNode(insert);
121    }
122
123    void insertCharacterFor(Token.Character token) {
124        final String data = token.getData();
125        insertLeafNode(token.isCData() ? new CDataNode(data) : new TextNode(data));
126    }
127
128    void insertDoctypeFor(Token.Doctype token) {
129        DocumentType doctypeNode = new DocumentType(settings.normalizeTag(token.getName()), token.getPublicIdentifier(), token.getSystemIdentifier());
130        doctypeNode.setPubSysKey(token.getPubSysKey());
131        insertLeafNode(doctypeNode);
132    }
133
134    /** @deprecated unused and will be removed. */
135    @Deprecated
136    protected void insertNode(Node node) {
137        currentElement().appendChild(node);
138        onNodeInserted(node);
139    }
140
141    /** @deprecated unused and will be removed. */
142    @Deprecated
143    protected void insertNode(Node node, Token token) {
144        currentElement().appendChild(node);
145        onNodeInserted(node);
146    }
147
148    /**
149     * If the stack contains an element with this tag's name, pop up the stack to remove the first occurrence. If not
150     * found, skips.
151     *
152     * @param endTag tag to close
153     */
154    protected void popStackToClose(Token.EndTag endTag) {
155        // like in HtmlTreeBuilder - don't scan up forever for very (artificially) deeply nested stacks
156        String elName = settings.normalizeTag(endTag.tagName);
157        Element firstFound = null;
158
159        final int bottom = stack.size() - 1;
160        final int upper = bottom >= maxQueueDepth ? bottom - maxQueueDepth : 0;
161
162        for (int pos = stack.size() -1; pos >= upper; pos--) {
163            Element next = stack.get(pos);
164            if (next.nodeName().equals(elName)) {
165                firstFound = next;
166                break;
167            }
168        }
169        if (firstFound == null)
170            return; // not found, skip
171
172        for (int pos = stack.size() -1; pos >= 0; pos--) {
173            Element next = pop();
174            if (next == firstFound) {
175                break;
176            }
177        }
178    }
179    private static final int maxQueueDepth = 256; // an arbitrary tension point between real XML and crafted pain
180
181    List<Node> parseFragment(String inputFragment, String baseUri, Parser parser) {
182        initialiseParse(new StringReader(inputFragment), baseUri, parser);
183        runParser();
184        return doc.childNodes();
185    }
186
187    @Override List<Node> parseFragment(String inputFragment, Element context, String baseUri, Parser parser) {
188        return parseFragment(inputFragment, baseUri, parser);
189    }
190}