001package org.jsoup.helper;
002
003import org.jsoup.internal.StringUtil;
004import org.jsoup.nodes.Attribute;
005import org.jsoup.nodes.Attributes;
006import org.jsoup.parser.HtmlTreeBuilder;
007import org.jsoup.parser.Parser;
008import org.jsoup.select.NodeTraversor;
009import org.jsoup.select.NodeVisitor;
010import org.jsoup.select.Selector;
011import org.w3c.dom.Comment;
012import org.w3c.dom.DOMException;
013import org.w3c.dom.DOMImplementation;
014import org.w3c.dom.Document;
015import org.w3c.dom.DocumentType;
016import org.w3c.dom.Element;
017import org.w3c.dom.Node;
018import org.w3c.dom.NodeList;
019import org.w3c.dom.Text;
020import org.jspecify.annotations.Nullable;
021
022import javax.xml.parsers.DocumentBuilder;
023import javax.xml.parsers.DocumentBuilderFactory;
024import javax.xml.parsers.ParserConfigurationException;
025import javax.xml.transform.OutputKeys;
026import javax.xml.transform.Transformer;
027import javax.xml.transform.TransformerException;
028import javax.xml.transform.TransformerFactory;
029import javax.xml.transform.dom.DOMSource;
030import javax.xml.transform.stream.StreamResult;
031import javax.xml.xpath.XPathConstants;
032import javax.xml.xpath.XPathExpression;
033import javax.xml.xpath.XPathExpressionException;
034import javax.xml.xpath.XPathFactory;
035import javax.xml.xpath.XPathFactoryConfigurationException;
036import java.io.StringWriter;
037import java.util.ArrayList;
038import java.util.HashMap;
039import java.util.List;
040import java.util.Map;
041import java.util.Properties;
042import java.util.Stack;
043
044import static javax.xml.transform.OutputKeys.METHOD;
045import static org.jsoup.nodes.Document.OutputSettings.Syntax;
046
047/**
048 * Helper class to transform a {@link org.jsoup.nodes.Document} to a {@link org.w3c.dom.Document org.w3c.dom.Document},
049 * for integration with toolsets that use the W3C DOM.
050 */
051public class W3CDom {
052    /** For W3C Documents created by this class, this property is set on each node to link back to the original jsoup node. */
053    public static final String SourceProperty = "jsoupSource";
054    private static final String ContextProperty = "jsoupContextSource"; // tracks the jsoup context element on w3c doc
055    private static final String ContextNodeProperty = "jsoupContextNode"; // the w3c node used as the creating context
056
057    /**
058     To get support for XPath versions > 1, set this property to the classname of an alternate XPathFactory
059     implementation. (For e.g. {@code net.sf.saxon.xpath.XPathFactoryImpl}).
060     */
061    public static final String XPathFactoryProperty = "javax.xml.xpath.XPathFactory:jsoup";
062
063    protected DocumentBuilderFactory factory;
064    private boolean namespaceAware = true; // false when using selectXpath, for user's query convenience
065
066    public W3CDom() {
067        factory = DocumentBuilderFactory.newInstance();
068        factory.setNamespaceAware(true);
069    }
070
071    /**
072     Returns if this W3C DOM is namespace aware. By default, this will be {@code true}, but is disabled for simplicity
073     when using XPath selectors in {@link org.jsoup.nodes.Element#selectXpath(String)}.
074     @return the current namespace aware setting.
075     */
076    public boolean namespaceAware() {
077        return namespaceAware;
078    }
079
080    /**
081     Update the namespace aware setting. This impacts the factory that is used to create W3C nodes from jsoup nodes.
082     <p>For HTML documents, controls if the document will be in the default {@code http://www.w3.org/1999/xhtml}
083     namespace if otherwise unset.</p>.
084     @param namespaceAware the updated setting
085     @return this W3CDom, for chaining.
086     */
087    public W3CDom namespaceAware(boolean namespaceAware) {
088        this.namespaceAware = namespaceAware;
089        factory.setNamespaceAware(namespaceAware);
090        return this;
091    }
092
093    /**
094     * Converts a jsoup DOM to a W3C DOM.
095     *
096     * @param in jsoup Document
097     * @return W3C Document
098     */
099    public static Document convert(org.jsoup.nodes.Document in) {
100        return (new W3CDom().fromJsoup(in));
101    }
102
103    /**
104     * Serialize a W3C document to a String. Provide Properties to define output settings including if HTML or XML. If
105     * you don't provide the properties ({@code null}), the output will be auto-detected based on the content of the
106     * document.
107     *
108     * @param doc Document
109     * @param properties (optional/nullable) the output properties to use. See {@link
110     *     Transformer#setOutputProperties(Properties)} and {@link OutputKeys}
111     * @return Document as string
112     * @see #OutputHtml
113     * @see #OutputXml
114     * @see OutputKeys#ENCODING
115     * @see OutputKeys#OMIT_XML_DECLARATION
116     * @see OutputKeys#STANDALONE
117     * @see OutputKeys#STANDALONE
118     * @see OutputKeys#DOCTYPE_PUBLIC
119     * @see OutputKeys#CDATA_SECTION_ELEMENTS
120     * @see OutputKeys#INDENT
121     * @see OutputKeys#MEDIA_TYPE
122     */
123    public static String asString(Document doc, @Nullable Map<String, String> properties) {
124        try {
125            DOMSource domSource = new DOMSource(doc);
126            StringWriter writer = new StringWriter();
127            StreamResult result = new StreamResult(writer);
128            TransformerFactory tf = TransformerFactory.newInstance();
129            Transformer transformer = tf.newTransformer();
130            if (properties != null)
131                transformer.setOutputProperties(propertiesFromMap(properties));
132
133            if (doc.getDoctype() != null) {
134                DocumentType doctype = doc.getDoctype();
135                if (!StringUtil.isBlank(doctype.getPublicId()))
136                    transformer.setOutputProperty(OutputKeys.DOCTYPE_PUBLIC, doctype.getPublicId());
137                if (!StringUtil.isBlank(doctype.getSystemId()))
138                    transformer.setOutputProperty(OutputKeys.DOCTYPE_SYSTEM, doctype.getSystemId());
139                    // handle <!doctype html> for legacy dom. TODO: nicer if <!doctype html>
140                else if (doctype.getName().equalsIgnoreCase("html")
141                    && StringUtil.isBlank(doctype.getPublicId())
142                    && StringUtil.isBlank(doctype.getSystemId()))
143                    transformer.setOutputProperty(OutputKeys.DOCTYPE_SYSTEM, "about:legacy-compat");
144            }
145
146            transformer.transform(domSource, result);
147            return writer.toString();
148
149        } catch (TransformerException e) {
150            throw new IllegalStateException(e);
151        }
152    }
153
154    static Properties propertiesFromMap(Map<String, String> map) {
155        Properties props = new Properties();
156        props.putAll(map);
157        return props;
158    }
159
160    /** Canned default for HTML output. */
161    public static HashMap<String, String> OutputHtml() {
162        return methodMap("html");
163    }
164
165    /** Canned default for XML output. */
166    public static HashMap<String, String> OutputXml() {
167        return methodMap("xml");
168    }
169
170    private static HashMap<String, String> methodMap(String method) {
171        HashMap<String, String> map = new HashMap<>();
172        map.put(METHOD, method);
173        return map;
174    }
175
176    /**
177     * Convert a jsoup Document to a W3C Document. The created nodes will link back to the original
178     * jsoup nodes in the user property {@link #SourceProperty} (but after conversion, changes on one side will not
179     * flow to the other).
180     *
181     * @param in jsoup doc
182     * @return a W3C DOM Document representing the jsoup Document or Element contents.
183     */
184    public Document fromJsoup(org.jsoup.nodes.Document in) {
185        // just method API backcompat
186        return fromJsoup((org.jsoup.nodes.Element) in);
187    }
188
189    /**
190     * Convert a jsoup DOM to a W3C Document. The created nodes will link back to the original
191     * jsoup nodes in the user property {@link #SourceProperty} (but after conversion, changes on one side will not
192     * flow to the other). The input Element is used as a context node, but the whole surrounding jsoup Document is
193     * converted. (If you just want a subtree converted, use {@link #convert(org.jsoup.nodes.Element, Document)}.)
194     *
195     * @param in jsoup element or doc
196     * @return a W3C DOM Document representing the jsoup Document or Element contents.
197     * @see #sourceNodes(NodeList, Class)
198     * @see #contextNode(Document)
199     */
200    public Document fromJsoup(org.jsoup.nodes.Element in) {
201        Validate.notNull(in);
202        DocumentBuilder builder;
203        try {
204            builder = factory.newDocumentBuilder();
205            DOMImplementation impl = builder.getDOMImplementation();
206            Document out = builder.newDocument();
207            org.jsoup.nodes.Document inDoc = in.ownerDocument();
208            org.jsoup.nodes.DocumentType doctype = inDoc != null ? inDoc.documentType() : null;
209            if (doctype != null) {
210                try {
211                    org.w3c.dom.DocumentType documentType = impl.createDocumentType(doctype.name(), doctype.publicId(), doctype.systemId());
212                    out.appendChild(documentType);
213                } catch (DOMException ignored) {
214                    // invalid / empty doctype dropped
215                }
216            }
217            out.setXmlStandalone(true);
218            // if in is Document, use the root element, not the wrapping document, as the context:
219            org.jsoup.nodes.Element context = (in instanceof org.jsoup.nodes.Document) ? in.firstElementChild() : in;
220            out.setUserData(ContextProperty, context, null);
221            convert(inDoc != null ? inDoc : in, out);
222            return out;
223        } catch (ParserConfigurationException e) {
224            throw new IllegalStateException(e);
225        }
226    }
227
228    /**
229     * Converts a jsoup document into the provided W3C Document. If required, you can set options on the output
230     * document before converting.
231     *
232     * @param in jsoup doc
233     * @param out w3c doc
234     * @see org.jsoup.helper.W3CDom#fromJsoup(org.jsoup.nodes.Element)
235     */
236    public void convert(org.jsoup.nodes.Document in, Document out) {
237        // just provides method API backcompat
238        convert((org.jsoup.nodes.Element) in, out);
239    }
240
241    /**
242     * Converts a jsoup element into the provided W3C Document. If required, you can set options on the output
243     * document before converting.
244     *
245     * @param in jsoup element
246     * @param out w3c doc
247     * @see org.jsoup.helper.W3CDom#fromJsoup(org.jsoup.nodes.Element)
248     */
249    public void convert(org.jsoup.nodes.Element in, Document out) {
250        W3CBuilder builder = new W3CBuilder(out);
251        builder.namespaceAware = namespaceAware;
252        org.jsoup.nodes.Document inDoc = in.ownerDocument();
253        if (inDoc != null) {
254            if (!StringUtil.isBlank(inDoc.location())) {
255                out.setDocumentURI(inDoc.location());
256            }
257            builder.syntax = inDoc.outputSettings().syntax();
258        }
259        org.jsoup.nodes.Element rootEl = in instanceof org.jsoup.nodes.Document ? in.firstElementChild() : in; // skip the #root node if a Document
260        NodeTraversor.traverse(builder, rootEl);
261    }
262
263    /**
264     Evaluate an XPath query against the supplied document, and return the results.
265     @param xpath an XPath query
266     @param doc the document to evaluate against
267     @return the matches nodes
268     */
269    public NodeList selectXpath(String xpath, Document doc) {
270        return selectXpath(xpath, (Node) doc);
271    }
272
273    /**
274     Evaluate an XPath query against the supplied context node, and return the results.
275     @param xpath an XPath query
276     @param contextNode the context node to evaluate against
277     @return the matches nodes
278     */
279    public NodeList selectXpath(String xpath, Node contextNode) {
280        Validate.notEmptyParam(xpath, "xpath");
281        Validate.notNullParam(contextNode, "contextNode");
282
283        NodeList nodeList;
284        try {
285            // if there is a configured XPath factory, use that instead of the Java base impl:
286            String property = System.getProperty(XPathFactoryProperty);
287            final XPathFactory xPathFactory = property != null ?
288                XPathFactory.newInstance("jsoup") :
289                XPathFactory.newInstance();
290
291            XPathExpression expression = xPathFactory.newXPath().compile(xpath);
292            nodeList = (NodeList) expression.evaluate(contextNode, XPathConstants.NODESET); // love the strong typing here /s
293            Validate.notNull(nodeList);
294        } catch (XPathExpressionException | XPathFactoryConfigurationException e) {
295            throw new Selector.SelectorParseException(
296                e, "Could not evaluate XPath query [%s]: %s", xpath, e.getMessage());
297        }
298        return nodeList;
299    }
300
301    /**
302     Retrieves the original jsoup DOM nodes from a nodelist created by this convertor.
303     @param nodeList the W3C nodes to get the original jsoup nodes from
304     @param nodeType the jsoup node type to retrieve (e.g. Element, DataNode, etc)
305     @param <T> node type
306     @return a list of the original nodes
307     */
308    public <T extends org.jsoup.nodes.Node> List<T> sourceNodes(NodeList nodeList, Class<T> nodeType) {
309        Validate.notNull(nodeList);
310        Validate.notNull(nodeType);
311        List<T> nodes = new ArrayList<>(nodeList.getLength());
312
313        for (int i = 0; i < nodeList.getLength(); i++) {
314            org.w3c.dom.Node node = nodeList.item(i);
315            Object source = node.getUserData(W3CDom.SourceProperty);
316            if (nodeType.isInstance(source))
317                nodes.add(nodeType.cast(source));
318        }
319
320        return nodes;
321    }
322
323    /**
324     For a Document created by {@link #fromJsoup(org.jsoup.nodes.Element)}, retrieves the W3C context node.
325     @param wDoc Document created by this class
326     @return the corresponding W3C Node to the jsoup Element that was used as the creating context.
327     */
328    public Node contextNode(Document wDoc) {
329        return (Node) wDoc.getUserData(ContextNodeProperty);
330    }
331
332    /**
333     * Serialize a W3C document to a String. The output format will be XML or HTML depending on the content of the doc.
334     *
335     * @param doc Document
336     * @return Document as string
337     * @see W3CDom#asString(Document, Map)
338     */
339    public String asString(Document doc) {
340        return asString(doc, null);
341    }
342
343    /**
344     * Implements the conversion by walking the input.
345     */
346    protected static class W3CBuilder implements NodeVisitor {
347        // TODO: move the namespace handling stuff into XmlTreeBuilder / HtmlTreeBuilder, now that Tags have namespaces
348        private static final String xmlnsKey = "xmlns";
349        private static final String xmlnsPrefix = "xmlns:";
350
351        private final Document doc;
352        private boolean namespaceAware = true;
353        private final Stack<HashMap<String, String>> namespacesStack = new Stack<>(); // stack of namespaces, prefix => urn
354        private Node dest;
355        private Syntax syntax = Syntax.xml; // the syntax (to coerce attributes to). From the input doc if available.
356        /*@Nullable*/ private final org.jsoup.nodes.Element contextElement; // todo - unsure why this can't be marked nullable?
357
358        public W3CBuilder(Document doc) {
359            this.doc = doc;
360            namespacesStack.push(new HashMap<>());
361            dest = doc;
362            contextElement = (org.jsoup.nodes.Element) doc.getUserData(ContextProperty); // Track the context jsoup Element, so we can save the corresponding w3c element
363            final org.jsoup.nodes.Document inDoc = contextElement.ownerDocument();
364            if (namespaceAware && inDoc != null && inDoc.parser().getTreeBuilder() instanceof HtmlTreeBuilder) {
365              // as per the WHATWG HTML5 spec ยง 2.1.3, elements are in the HTML namespace by default
366              namespacesStack.peek().put("", Parser.NamespaceHtml);
367            }
368          }
369
370        public void head(org.jsoup.nodes.Node source, int depth) {
371            namespacesStack.push(new HashMap<>(namespacesStack.peek())); // inherit from above on the stack
372            if (source instanceof org.jsoup.nodes.Element) {
373                org.jsoup.nodes.Element sourceEl = (org.jsoup.nodes.Element) source;
374
375                String prefix = updateNamespaces(sourceEl);
376                String namespace = namespaceAware ? namespacesStack.peek().get(prefix) : null;
377                String tagName = sourceEl.tagName();
378
379                /* Tag names in XML are quite permissive, but less permissive than HTML. Rather than reimplement the validation,
380                we just try to use it as-is. If it fails, insert as a text node instead. We don't try to normalize the
381                tagname to something safe, because that isn't going to be meaningful downstream. This seems(?) to be
382                how browsers handle the situation, also. https://github.com/jhy/jsoup/issues/1093 */
383                try {
384                    // use an empty namespace if none is present but the tag name has a prefix
385                    String imputedNamespace = namespace == null && tagName.contains(":") ? "" : namespace;
386                    Element el = doc.createElementNS(imputedNamespace, tagName);
387                    copyAttributes(sourceEl, el);
388                    append(el, sourceEl);
389                    if (sourceEl == contextElement)
390                        doc.setUserData(ContextNodeProperty, el, null);
391                    dest = el; // descend
392                } catch (DOMException e) {
393                    append(doc.createTextNode("<" + tagName + ">"), sourceEl);
394                }
395            } else if (source instanceof org.jsoup.nodes.TextNode) {
396                org.jsoup.nodes.TextNode sourceText = (org.jsoup.nodes.TextNode) source;
397                Text text = doc.createTextNode(sourceText.getWholeText());
398                append(text, sourceText);
399            } else if (source instanceof org.jsoup.nodes.Comment) {
400                org.jsoup.nodes.Comment sourceComment = (org.jsoup.nodes.Comment) source;
401                Comment comment = doc.createComment(sourceComment.getData());
402                append(comment, sourceComment);
403            } else if (source instanceof org.jsoup.nodes.DataNode) {
404                org.jsoup.nodes.DataNode sourceData = (org.jsoup.nodes.DataNode) source;
405                Text node = doc.createTextNode(sourceData.getWholeData());
406                append(node, sourceData);
407            } else {
408                // unhandled. note that doctype is not handled here - rather it is used in the initial doc creation
409            }
410        }
411
412        private void append(Node append, org.jsoup.nodes.Node source) {
413            append.setUserData(SourceProperty, source, null);
414            dest.appendChild(append);
415        }
416
417        public void tail(org.jsoup.nodes.Node source, int depth) {
418            if (source instanceof org.jsoup.nodes.Element && dest.getParentNode() instanceof Element) {
419                dest = dest.getParentNode(); // undescend
420            }
421            namespacesStack.pop();
422        }
423
424        private void copyAttributes(org.jsoup.nodes.Node source, Element el) {
425            for (Attribute attribute : source.attributes()) {
426                String key = Attribute.getValidKey(attribute.getKey(), syntax);
427                if (key != null) { // null if couldn't be coerced to validity
428                    el.setAttribute(key, attribute.getValue());
429                }
430            }
431        }
432
433        /**
434         * Finds any namespaces defined in this element. Returns any tag prefix.
435         */
436        private String updateNamespaces(org.jsoup.nodes.Element el) {
437            // scan the element for namespace declarations
438            // like: xmlns="blah" or xmlns:prefix="blah"
439            Attributes attributes = el.attributes();
440            for (Attribute attr : attributes) {
441                String key = attr.getKey();
442                String prefix;
443                if (key.equals(xmlnsKey)) {
444                    prefix = "";
445                } else if (key.startsWith(xmlnsPrefix)) {
446                    prefix = key.substring(xmlnsPrefix.length());
447                } else {
448                    continue;
449                }
450                namespacesStack.peek().put(prefix, attr.getValue());
451            }
452
453            // get the element prefix if any
454            int pos = el.tagName().indexOf(':');
455            return pos > 0 ? el.tagName().substring(0, pos) : "";
456        }
457
458    }
459}