package org.jsoup.helper;

import org.jsoup.internal.StringUtil;
import org.jsoup.nodes.Attribute;
import org.jsoup.nodes.Attributes;
import org.jsoup.parser.HtmlTreeBuilder;
import org.jsoup.parser.Parser;
import org.jsoup.select.NodeTraversor;
import org.jsoup.select.NodeVisitor;
import org.jsoup.select.Selector;
import org.w3c.dom.Comment;
import org.w3c.dom.DOMException;
import org.w3c.dom.DOMImplementation;
import org.w3c.dom.Document;
import org.w3c.dom.DocumentType;
import org.w3c.dom.Element;
import org.w3c.dom.Node;
import org.w3c.dom.NodeList;
import org.w3c.dom.Text;
import org.jspecify.annotations.Nullable;

import javax.xml.parsers.DocumentBuilder;
import javax.xml.parsers.DocumentBuilderFactory;
import javax.xml.parsers.ParserConfigurationException;
import javax.xml.transform.OutputKeys;
import javax.xml.transform.Transformer;
import javax.xml.transform.TransformerException;
import javax.xml.transform.TransformerFactory;
import javax.xml.transform.dom.DOMSource;
import javax.xml.transform.stream.StreamResult;
import javax.xml.xpath.XPathConstants;
import javax.xml.xpath.XPathExpression;
import javax.xml.xpath.XPathExpressionException;
import javax.xml.xpath.XPathFactory;
import javax.xml.xpath.XPathFactoryConfigurationException;
import java.io.StringWriter;
import java.util.ArrayList;
import java.util.HashMap;
import java.util.List;
import java.util.Map;
import java.util.Properties;
import java.util.Stack;

import static javax.xml.transform.OutputKeys.METHOD;
import static org.jsoup.nodes.Document.OutputSettings.Syntax;

/**
 * Helper class to transform a {@link org.jsoup.nodes.Document} to a {@link org.w3c.dom.Document org.w3c.dom.Document},
 * for integration with toolsets that use the W3C DOM.
 */
public class W3CDom {
    /** For W3C Documents created by this class, this property is set on each node to link back to the original jsoup node. */
    public static final String SourceProperty = "jsoupSource";
    private static final String ContextProperty = "jsoupContextSource"; // tracks the jsoup context element on w3c doc
    private static final String ContextNodeProperty = "jsoupContextNode"; // the w3c node used as the creating context

    /**
     To get support for XPath versions &gt; 1, set this property to the classname of an alternate XPathFactory
     implementation. (For e.g. {@code net.sf.saxon.xpath.XPathFactoryImpl}).
     */
    public static final String XPathFactoryProperty = "javax.xml.xpath.XPathFactory:jsoup";

    protected DocumentBuilderFactory factory;
    private boolean namespaceAware = true; // false when using selectXpath, for user's query convenience

    /**
     Creates a converter backed by a new, namespace aware {@link DocumentBuilderFactory}.
     */
    public W3CDom() {
        factory = DocumentBuilderFactory.newInstance();
        factory.setNamespaceAware(true);
    }

    /**
     Returns if this W3C DOM is namespace aware. By default, this will be {@code true}, but is disabled for simplicity
     when using XPath selectors in {@link org.jsoup.nodes.Element#selectXpath(String)}.
     @return the current namespace aware setting.
     */
    public boolean namespaceAware() {
        return namespaceAware;
    }

    /**
     Update the namespace aware setting. This impacts the factory that is used to create W3C nodes from jsoup nodes.
     <p>For HTML documents, controls if the document will be in the default {@code http://www.w3.org/1999/xhtml}
     namespace if otherwise unset.</p>
     @param namespaceAware the updated setting
     @return this W3CDom, for chaining.
     */
    public W3CDom namespaceAware(boolean namespaceAware) {
        this.namespaceAware = namespaceAware;
        factory.setNamespaceAware(namespaceAware);
        return this;
    }

    /**
     * Converts a jsoup DOM to a W3C DOM.
     *
     * @param in jsoup Document
     * @return W3C Document
     */
    public static Document convert(org.jsoup.nodes.Document in) {
        return (new W3CDom().fromJsoup(in));
    }

    /**
     * Serialize a W3C document to a String. Provide Properties to define output settings including if HTML or XML. If
     * you don't provide the properties ({@code null}), the output will be auto-detected based on the content of the
     * document.
     *
     * @param doc Document
     * @param properties (optional/nullable) the output properties to use. See {@link
     *     Transformer#setOutputProperties(Properties)} and {@link OutputKeys}
     * @return Document as string
     * @throws IllegalStateException if the transformer could not be created or the transformation failed
     * @see #OutputHtml()
     * @see #OutputXml()
     * @see OutputKeys#ENCODING
     * @see OutputKeys#OMIT_XML_DECLARATION
     * @see OutputKeys#STANDALONE
     * @see OutputKeys#DOCTYPE_SYSTEM
     * @see OutputKeys#DOCTYPE_PUBLIC
     * @see OutputKeys#CDATA_SECTION_ELEMENTS
     * @see OutputKeys#INDENT
     * @see OutputKeys#MEDIA_TYPE
     */
    public static String asString(Document doc, @Nullable Map<String, String> properties) {
        try {
            DOMSource domSource = new DOMSource(doc);
            StringWriter writer = new StringWriter();
            StreamResult result = new StreamResult(writer);
            TransformerFactory tf = TransformerFactory.newInstance();
            Transformer transformer = tf.newTransformer();
            if (properties != null) {
                transformer.setOutputProperties(propertiesFromMap(properties));
            }

            // Carry the doctype identifiers over as output properties; these are applied after (and so override)
            // any doctype properties supplied by the caller.
            DocumentType doctype = doc.getDoctype();
            if (doctype != null) {
                String publicId = doctype.getPublicId();
                String systemId = doctype.getSystemId();
                if (!StringUtil.isBlank(publicId)) {
                    transformer.setOutputProperty(OutputKeys.DOCTYPE_PUBLIC, publicId);
                }
                if (!StringUtil.isBlank(systemId)) {
                    transformer.setOutputProperty(OutputKeys.DOCTYPE_SYSTEM, systemId);
                } else if (doctype.getName().equalsIgnoreCase("html") && StringUtil.isBlank(publicId)) {
                    // handle <!doctype html> for legacy dom. TODO: nicer if <!doctype html>
                    transformer.setOutputProperty(OutputKeys.DOCTYPE_SYSTEM, "about:legacy-compat");
                }
            }

            transformer.transform(domSource, result);
            return writer.toString();
        } catch (TransformerException e) {
            throw new IllegalStateException(e);
        }
    }

    /** Copies the supplied map into a new {@link Properties} object. */
    static Properties propertiesFromMap(Map<String, String> map) {
        Properties props = new Properties();
        props.putAll(map);
        return props;
    }

    /** Canned default for HTML output. */
    public static HashMap<String, String> OutputHtml() {
        return methodMap("html");
    }

    /** Canned default for XML output. */
    public static HashMap<String, String> OutputXml() {
        return methodMap("xml");
    }

    /** Creates a new mutable map holding only the {@link OutputKeys#METHOD} property, set to the given method. */
    private static HashMap<String, String> methodMap(String method) {
        HashMap<String, String> map = new HashMap<>();
        map.put(METHOD, method);
        return map;
    }

    /**
     * Convert a jsoup Document to a W3C Document. The created nodes will link back to the original
     * jsoup nodes in the user property {@link #SourceProperty} (but after conversion, changes on one side will not
     * flow to the other).
     *
     * @param in jsoup doc
     * @return a W3C DOM Document representing the jsoup Document or Element contents.
     */
    public Document fromJsoup(org.jsoup.nodes.Document in) {
        // just method API backcompat
        return fromJsoup((org.jsoup.nodes.Element) in);
    }

    /**
     * Convert a jsoup DOM to a W3C Document. The created nodes will link back to the original
     * jsoup nodes in the user property {@link #SourceProperty} (but after conversion, changes on one side will not
     * flow to the other). The input Element is used as a context node, but the whole surrounding jsoup Document is
     * converted. (If you just want a subtree converted, use {@link #convert(org.jsoup.nodes.Element, Document)}.)
     *
     * @param in jsoup element or doc
     * @return a W3C DOM Document representing the jsoup Document or Element contents.
     * @throws IllegalStateException if a {@link DocumentBuilder} could not be created from the factory
     * @see #sourceNodes(NodeList, Class)
     * @see #contextNode(Document)
     */
    public Document fromJsoup(org.jsoup.nodes.Element in) {
        Validate.notNull(in);
        DocumentBuilder builder;
        try {
            builder = factory.newDocumentBuilder();
            DOMImplementation impl = builder.getDOMImplementation();
            Document out = builder.newDocument();
            org.jsoup.nodes.Document inDoc = in.ownerDocument();
            org.jsoup.nodes.DocumentType doctype = inDoc != null ? inDoc.documentType() : null;
            if (doctype != null) {
                try {
                    org.w3c.dom.DocumentType documentType = impl.createDocumentType(doctype.name(), doctype.publicId(), doctype.systemId());
                    out.appendChild(documentType);
                } catch (DOMException ignored) {
                    // invalid / empty doctype dropped
                }
            }
            out.setXmlStandalone(true);
            // if in is Document, use the root element, not the wrapping document, as the context:
            org.jsoup.nodes.Element context = (in instanceof org.jsoup.nodes.Document) ? in.firstElementChild() : in;
            out.setUserData(ContextProperty, context, null);
            convert(inDoc != null ? inDoc : in, out);
            return out;
        } catch (ParserConfigurationException e) {
            throw new IllegalStateException(e);
        }
    }

    /**
     * Converts a jsoup document into the provided W3C Document. If required, you can set options on the output
     * document before converting.
     *
     * @param in jsoup doc
     * @param out w3c doc
     * @see org.jsoup.helper.W3CDom#fromJsoup(org.jsoup.nodes.Element)
     */
    public void convert(org.jsoup.nodes.Document in, Document out) {
        // just provides method API backcompat
        convert((org.jsoup.nodes.Element) in, out);
    }

    /**
     * Converts a jsoup element into the provided W3C Document. If required, you can set options on the output
     * document before converting.
     * <p>If the output document does not already carry a context element (i.e. it was not prepared by
     * {@link #fromJsoup(org.jsoup.nodes.Element)}), the conversion root is recorded as the context.</p>
     *
     * @param in jsoup element
     * @param out w3c doc
     * @see org.jsoup.helper.W3CDom#fromJsoup(org.jsoup.nodes.Element)
     */
    public void convert(org.jsoup.nodes.Element in, Document out) {
        // skip the #root node if a Document
        org.jsoup.nodes.Element rootEl = in instanceof org.jsoup.nodes.Document ? in.firstElementChild() : in;

        // A caller-supplied W3C Document has no context recorded on it; without one the builder cannot find the
        // owning jsoup Document. Record the conversion root so this path behaves like fromJsoup.
        if (out.getUserData(ContextProperty) == null) {
            out.setUserData(ContextProperty, rootEl, null);
        }

        W3CBuilder builder = new W3CBuilder(out);
        builder.namespaceAware = namespaceAware;
        org.jsoup.nodes.Document inDoc = in.ownerDocument();
        if (inDoc != null) {
            if (!StringUtil.isBlank(inDoc.location())) {
                out.setDocumentURI(inDoc.location());
            }
            builder.syntax = inDoc.outputSettings().syntax();
        }
        NodeTraversor.traverse(builder, rootEl);
    }

    /**
     Evaluate an XPath query against the supplied document, and return the results.
     @param xpath an XPath query
     @param doc the document to evaluate against
     @return the matching nodes
     */
    public NodeList selectXpath(String xpath, Document doc) {
        return selectXpath(xpath, (Node) doc);
    }

    /**
     Evaluate an XPath query against the supplied context node, and return the results.
     @param xpath an XPath query
     @param contextNode the context node to evaluate against
     @return the matching nodes
     @throws Selector.SelectorParseException if the query could not be compiled or evaluated, or the configured
     XPath factory could not be loaded
     */
    public NodeList selectXpath(String xpath, Node contextNode) {
        Validate.notEmptyParam(xpath, "xpath");
        Validate.notNullParam(contextNode, "contextNode");

        NodeList nodeList;
        try {
            // if there is a configured XPath factory, use that instead of the Java base impl:
            String property = System.getProperty(XPathFactoryProperty);
            final XPathFactory xPathFactory = property != null ?
                XPathFactory.newInstance("jsoup") :
                XPathFactory.newInstance();

            XPathExpression expression = xPathFactory.newXPath().compile(xpath);
            nodeList = (NodeList) expression.evaluate(contextNode, XPathConstants.NODESET); // love the strong typing here /s
            Validate.notNull(nodeList);
        } catch (XPathExpressionException | XPathFactoryConfigurationException e) {
            throw new Selector.SelectorParseException(
                e, "Could not evaluate XPath query [%s]: %s", xpath, e.getMessage());
        }
        return nodeList;
    }

    /**
     Retrieves the original jsoup DOM nodes from a nodelist created by this converter. Nodes whose
     {@link #SourceProperty} is not an instance of the requested type are skipped.
     @param nodeList the W3C nodes to get the original jsoup nodes from
     @param nodeType the jsoup node type to retrieve (e.g. Element, DataNode, etc)
     @param <T> node type
     @return a list of the original nodes
     */
    public <T extends org.jsoup.nodes.Node> List<T> sourceNodes(NodeList nodeList, Class<T> nodeType) {
        Validate.notNull(nodeList);
        Validate.notNull(nodeType);
        List<T> nodes = new ArrayList<>(nodeList.getLength());

        for (int i = 0; i < nodeList.getLength(); i++) {
            org.w3c.dom.Node node = nodeList.item(i);
            Object source = node.getUserData(W3CDom.SourceProperty);
            if (nodeType.isInstance(source)) {
                nodes.add(nodeType.cast(source));
            }
        }

        return nodes;
    }

    /**
     For a Document populated by this class, retrieves the W3C context node.
     @param wDoc Document created by this class
     @return the corresponding W3C Node to the jsoup Element that was used as the creating context.
     */
    public Node contextNode(Document wDoc) {
        return (Node) wDoc.getUserData(ContextNodeProperty);
    }

    /**
     * Serialize a W3C document to a String. The output format will be XML or HTML depending on the content of the doc.
     *
     * @param doc Document
     * @return Document as string
     * @see W3CDom#asString(Document, Map)
     */
    public String asString(Document doc) {
        return asString(doc, null);
    }

    /**
     * Implements the conversion by walking the input.
     */
    protected static class W3CBuilder implements NodeVisitor {
        // TODO: move the namespace handling stuff into XmlTreeBuilder / HtmlTreeBuilder, now that Tags have namespaces
        private static final String xmlnsKey = "xmlns";
        private static final String xmlnsPrefix = "xmlns:";

        private final Document doc;
        private boolean namespaceAware = true;
        private final Stack<HashMap<String, String>> namespacesStack = new Stack<>(); // stack of namespaces, prefix => urn
        private Node dest;
        private Syntax syntax = Syntax.xml; // the syntax (to coerce attributes to). From the input doc if available.
        /*@Nullable*/ private final org.jsoup.nodes.Element contextElement; // todo - unsure why this can't be marked nullable?

        /**
         * Creates a builder that appends converted nodes to the supplied W3C Document.
         *
         * @param doc the W3C Document to populate. If it carries a jsoup context element, that element is used to
         *     find the owning jsoup Document and to record the matching W3C context node.
         */
        public W3CBuilder(Document doc) {
            this.doc = doc;
            namespacesStack.push(new HashMap<>());
            dest = doc;
            // Track the context jsoup Element, so we can save the corresponding w3c element. This is null when the
            // document was not prepared with a context, so it must not be dereferenced blindly.
            contextElement = (org.jsoup.nodes.Element) doc.getUserData(ContextProperty);
            final org.jsoup.nodes.Document inDoc = contextElement != null ? contextElement.ownerDocument() : null;
            // The namespaceAware flag is not consulted here: it is always true during construction (the caller
            // assigns the real setting afterwards), and head() ignores the namespace map when it is disabled.
            if (inDoc != null && inDoc.parser().getTreeBuilder() instanceof HtmlTreeBuilder) {
                // as per the WHATWG HTML5 spec § 2.1.3, elements are in the HTML namespace by default
                namespacesStack.peek().put("", Parser.NamespaceHtml);
            }
        }

        /**
         * Creates the W3C counterpart of the jsoup node and appends it to the current destination. Elements become
         * the new destination, so their children are appended beneath them.
         */
        @Override
        public void head(org.jsoup.nodes.Node source, int depth) {
            namespacesStack.push(new HashMap<>(namespacesStack.peek())); // inherit from above on the stack
            if (source instanceof org.jsoup.nodes.Element) {
                org.jsoup.nodes.Element sourceEl = (org.jsoup.nodes.Element) source;

                String prefix = updateNamespaces(sourceEl);
                String namespace = namespaceAware ? namespacesStack.peek().get(prefix) : null;
                String tagName = sourceEl.tagName();

                /* Tag names in XML are quite permissive, but less permissive than HTML. Rather than reimplement the validation,
                 we just try to use it as-is. If it fails, insert as a text node instead. We don't try to normalize the
                 tagname to something safe, because that isn't going to be meaningful downstream. This seems(?) to be
                 how browsers handle the situation, also. https://github.com/jhy/jsoup/issues/1093 */
                try {
                    // use an empty namespace if none is present but the tag name has a prefix
                    String imputedNamespace = namespace == null && tagName.contains(":") ? "" : namespace;
                    Element el = doc.createElementNS(imputedNamespace, tagName);
                    copyAttributes(sourceEl, el);
                    append(el, sourceEl);
                    if (sourceEl == contextElement) {
                        doc.setUserData(ContextNodeProperty, el, null);
                    }
                    dest = el; // descend
                } catch (DOMException e) {
                    // no descent here: the children of the invalid element are appended to the current destination
                    append(doc.createTextNode("<" + tagName + ">"), sourceEl);
                }
            } else if (source instanceof org.jsoup.nodes.TextNode) {
                org.jsoup.nodes.TextNode sourceText = (org.jsoup.nodes.TextNode) source;
                Text text = doc.createTextNode(sourceText.getWholeText());
                append(text, sourceText);
            } else if (source instanceof org.jsoup.nodes.Comment) {
                org.jsoup.nodes.Comment sourceComment = (org.jsoup.nodes.Comment) source;
                Comment comment = doc.createComment(sourceComment.getData());
                append(comment, sourceComment);
            } else if (source instanceof org.jsoup.nodes.DataNode) {
                org.jsoup.nodes.DataNode sourceData = (org.jsoup.nodes.DataNode) source;
                Text node = doc.createTextNode(sourceData.getWholeData());
                append(node, sourceData);
            }
            // other node types are unhandled. note that doctype is not handled here - rather it is used in the
            // initial doc creation
        }

        /** Links the W3C node back to its jsoup source, then appends it to the current destination. */
        private void append(Node append, org.jsoup.nodes.Node source) {
            append.setUserData(SourceProperty, source, null);
            dest.appendChild(append);
        }

        /**
         * Closes the node: moves the destination back up to the parent when leaving an element that was descended
         * into, and discards the namespace scope opened in {@link #head}.
         */
        @Override
        public void tail(org.jsoup.nodes.Node source, int depth) {
            // Only undescend if head() actually descended for this source. When element creation failed, a text
            // node was inserted instead and dest still belongs to the parent; ascending then would attach the
            // following siblings one level too high.
            if (source instanceof org.jsoup.nodes.Element
                && dest.getUserData(SourceProperty) == source
                && dest.getParentNode() instanceof Element) {
                dest = dest.getParentNode(); // undescend
            }
            namespacesStack.pop();
        }

        /** Copies the source attributes onto the W3C element, skipping keys that cannot be made valid for the syntax. */
        private void copyAttributes(org.jsoup.nodes.Node source, Element el) {
            for (Attribute attribute : source.attributes()) {
                String key = Attribute.getValidKey(attribute.getKey(), syntax);
                if (key != null) { // null if couldn't be coerced to validity
                    el.setAttribute(key, attribute.getValue());
                }
            }
        }

        /**
         * Finds any namespaces defined in this element. Returns any tag prefix.
         */
        private String updateNamespaces(org.jsoup.nodes.Element el) {
            // scan the element for namespace declarations
            // like: xmlns="blah" or xmlns:prefix="blah"
            Attributes attributes = el.attributes();
            for (Attribute attr : attributes) {
                String key = attr.getKey();
                String prefix;
                if (key.equals(xmlnsKey)) {
                    prefix = "";
                } else if (key.startsWith(xmlnsPrefix)) {
                    prefix = key.substring(xmlnsPrefix.length());
                } else {
                    continue;
                }
                namespacesStack.peek().put(prefix, attr.getValue());
            }

            // get the element prefix if any
            int pos = el.tagName().indexOf(':');
            return pos > 0 ? el.tagName().substring(0, pos) : "";
        }

    }
}