package org.jsoup.parser;

import org.jsoup.helper.Validate;
import org.jsoup.nodes.CDataNode;
import org.jsoup.nodes.Comment;
import org.jsoup.nodes.Document;
import org.jsoup.nodes.DocumentType;
import org.jsoup.nodes.Element;
import org.jsoup.nodes.Entities;
import org.jsoup.nodes.LeafNode;
import org.jsoup.nodes.Node;
import org.jsoup.nodes.TextNode;
import org.jsoup.nodes.XmlDeclaration;

import java.io.Reader;
import java.io.StringReader;
import java.util.List;

import static org.jsoup.parser.Parser.NamespaceXml;

/**
 * A tree builder for XML input: use the {@code XmlTreeBuilder} when you want to parse XML without any of the HTML DOM
 * rules being applied to the document.
 * <p>Usage example: {@code Document xmlDoc = Jsoup.parse(html, baseUrl, Parser.xmlParser());}</p>
 *
 * @author Jonathan Hedley
 */
public class XmlTreeBuilder extends TreeBuilder {
    /** Upper bound on how far up the stack an end tag is searched for; an arbitrary tension point between real XML and crafted pain. */
    private static final int maxQueueDepth = 256;

    /**
     * Supplies the parse settings used by default for this builder.
     *
     * @return {@link ParseSettings#preserveCase}
     */
    @Override
    ParseSettings defaultSettings() {
        return ParseSettings.preserveCase;
    }

    /**
     * Prepares this builder for a parse run: delegates to the superclass, seeds the stack with the document, and
     * switches the document's output settings to XML syntax, xhtml escaping, and no pretty-printing.
     *
     * @param input   the reader supplying the markup
     * @param baseUri the base URI handed on to the superclass
     * @param parser  the parser driving this builder
     */
    @Override
    protected void initialiseParse(Reader input, String baseUri, Parser parser) {
        super.initialiseParse(input, baseUri, parser);

        // The document sits on the stack here (HtmlTreeBuilder does not do that). It is add()ed rather than
        // push()ed, so onNodeInserted is not fired for it.
        stack.add(doc);

        // as XML, we don't understand what whitespace is significant or not, so pretty-printing is switched off
        doc.outputSettings()
            .syntax(Document.OutputSettings.Syntax.xml)
            .escapeMode(Entities.EscapeMode.xhtml)
            .prettyPrint(false);
    }

    /**
     * Parses the content of a reader, using a new {@link Parser} that wraps this builder.
     *
     * @param input   the reader supplying the markup
     * @param baseUri the base URI for the parse
     * @return the document produced by the three-argument {@code parse}
     */
    Document parse(Reader input, String baseUri) {
        Parser parser = new Parser(this);
        return parse(input, baseUri, parser);
    }

    /**
     * Parses a string, using a new {@link Parser} that wraps this builder.
     *
     * @param input   the markup to parse
     * @param baseUri the base URI for the parse
     * @return the document produced by the three-argument {@code parse}
     */
    Document parse(String input, String baseUri) {
        Reader reader = new StringReader(input);
        Parser parser = new Parser(this);
        return parse(reader, baseUri, parser);
    }

    /**
     * Creates a fresh builder of this type.
     *
     * @return a new {@code XmlTreeBuilder}
     */
    @Override
    XmlTreeBuilder newInstance() {
        return new XmlTreeBuilder();
    }

    /**
     * Reports the namespace this builder uses by default.
     *
     * @return {@link Parser#NamespaceXml}
     */
    @Override
    public String defaultNamespace() {
        return NamespaceXml;
    }

    /**
     * Records the token as the current token and dispatches it, by type, to the matching insert / close routine.
     * The handled types are: start tag, end tag, doctype, comment, character, and EOF (which does nothing). Any other
     * type is reported through {@link Validate#fail(String)}.
     *
     * @param token the token to handle
     * @return always {@code true}
     */
    @Override
    protected boolean process(Token token) {
        currentToken = token;

        // No case falls through into another, so the order of the cases is immaterial.
        switch (token.type) {
            case Doctype:
                insertDoctypeFor(token.asDoctype());
                break;
            case StartTag:
                insertElementFor(token.asStartTag());
                break;
            case EndTag:
                popStackToClose(token.asEndTag());
                break;
            case Character:
                insertCharacterFor(token.asCharacter());
                break;
            case Comment:
                insertCommentFor(token.asComment());
                break;
            case EOF:
                // nothing to do; could put some normalisation here if desired
                break;
            default:
                Validate.fail("Unexpected token type: " + token.type);
        }
        return true;
    }

    /**
     * Builds an element for a start tag, appends it to the current element and pushes it onto the stack. A
     * self-closing start tag marks its tag as self-closing and is popped again straight away.
     *
     * @param startTag the start tag token
     */
    void insertElementFor(Token.StartTag startTag) {
        Tag tag = tagFor(startTag.name(), settings);
        if (startTag.attributes != null) {
            startTag.attributes.deduplicate(settings);
        }

        Element element = new Element(tag, null, settings.normalizeAttributes(startTag.attributes));
        currentElement().appendChild(element);
        push(element);

        if (!startTag.isSelfClosing()) {
            return;
        }
        tag.setSelfClosing();
        pop(); // push & pop ensures onNodeInserted & onNodeClosed
    }

    /**
     * Appends a leaf node to the current element, then calls {@code onNodeInserted} for it.
     *
     * @param node the leaf node to insert
     */
    void insertLeafNode(LeafNode node) {
        Element parent = currentElement();
        parent.appendChild(node);
        onNodeInserted(node);
    }

    /**
     * Inserts a node for a comment token. A bogus comment that looks like an XML declaration is inserted as an
     * {@link XmlDeclaration} when it converts to one; everything else is inserted as a plain {@link Comment}.
     *
     * @param commentToken the comment token
     */
    void insertCommentFor(Token.Comment commentToken) {
        Comment comment = new Comment(commentToken.getData());

        if (commentToken.bogus && comment.isXmlDeclaration()) {
            // xml declarations are emitted as bogus comments (which is right for html, but not xml), so as a bit of
            // a hack the data is parsed as an element to pull the attributes out.
            // todo - refactor this to parse more appropriately
            XmlDeclaration declaration = comment.asXmlDeclaration();
            if (declaration != null) {
                insertLeafNode(declaration);
                return;
            }
            // else, it couldn't be parsed as a declaration, so it stays a comment
        }
        insertLeafNode(comment);
    }

    /**
     * Inserts a {@link CDataNode} when the character token is CDATA, and a {@link TextNode} otherwise.
     *
     * @param token the character token
     */
    void insertCharacterFor(Token.Character token) {
        final String data = token.getData();
        final LeafNode node;
        if (token.isCData()) {
            node = new CDataNode(data);
        } else {
            node = new TextNode(data);
        }
        insertLeafNode(node);
    }

    /**
     * Inserts a {@link DocumentType} node built from a doctype token; the name goes through the settings' tag
     * normalisation, and the token's pub/sys key is copied onto the node.
     *
     * @param token the doctype token
     */
    void insertDoctypeFor(Token.Doctype token) {
        String name = settings.normalizeTag(token.getName());
        DocumentType doctype = new DocumentType(name, token.getPublicIdentifier(), token.getSystemIdentifier());
        doctype.setPubSysKey(token.getPubSysKey());
        insertLeafNode(doctype);
    }

    /**
     * Appends a node to the current element, then calls {@code onNodeInserted} for it.
     *
     * @param node the node to insert
     * @deprecated unused and will be removed.
     */
    @Deprecated
    protected void insertNode(Node node) {
        Element parent = currentElement();
        parent.appendChild(node);
        onNodeInserted(node);
    }

    /**
     * Appends a node to the current element, then calls {@code onNodeInserted} for it. The token is not used.
     *
     * @param node  the node to insert
     * @param token ignored
     * @deprecated unused and will be removed.
     */
    @Deprecated
    protected void insertNode(Node node, Token token) {
        Element parent = currentElement();
        parent.appendChild(node);
        onNodeInserted(node);
    }

    /**
     * If the stack contains an element with this tag's name, pop up the stack to remove the first occurrence. If not
     * found, skips.
     *
     * @param endTag tag to close
     */
    protected void popStackToClose(Token.EndTag endTag) {
        String elName = settings.normalizeTag(endTag.tagName);

        // like in HtmlTreeBuilder - don't scan up forever for very (artificially) deeply nested stacks
        final int bottom = stack.size() - 1;
        int upper = 0;
        if (bottom >= maxQueueDepth) {
            upper = bottom - maxQueueDepth;
        }

        // Walk from the most recently added entry towards `upper`, stopping at the first name match.
        Element target = null;
        for (int pos = bottom; pos >= upper && target == null; pos--) {
            Element candidate = stack.get(pos);
            if (candidate.nodeName().equals(elName)) {
                target = candidate;
            }
        }
        if (target == null) {
            return; // not found, skip
        }

        // Pop until the match itself has been popped; the count is capped at the stack size on entry.
        for (int remaining = stack.size(); remaining > 0; remaining--) {
            if (pop() == target) {
                break;
            }
        }
    }

    /**
     * Parses a fragment by initialising a parse over the string, running the parser, and returning the document's
     * child nodes.
     *
     * @param inputFragment the markup to parse
     * @param baseUri       the base URI for the parse
     * @param parser        the parser driving this builder
     * @return the child nodes of the resulting document
     */
    List<Node> parseFragment(String inputFragment, String baseUri, Parser parser) {
        Reader reader = new StringReader(inputFragment);
        initialiseParse(reader, baseUri, parser);
        runParser();
        return doc.childNodes();
    }

    /**
     * Parses a fragment; the context element is not used, and the call is handed to
     * {@link #parseFragment(String, String, Parser)}.
     *
     * @param inputFragment the markup to parse
     * @param context       ignored
     * @param baseUri       the base URI for the parse
     * @param parser        the parser driving this builder
     * @return the child nodes of the resulting document
     */
    @Override
    List<Node> parseFragment(String inputFragment, Element context, String baseUri, Parser parser) {
        return parseFragment(inputFragment, baseUri, parser);
    }
}