001package org.jsoup.safety;
002
003import org.jsoup.helper.Validate;
004import org.jsoup.nodes.Attribute;
005import org.jsoup.nodes.Attributes;
006import org.jsoup.nodes.DataNode;
007import org.jsoup.nodes.Document;
008import org.jsoup.nodes.Element;
009import org.jsoup.nodes.Node;
010import org.jsoup.nodes.TextNode;
011import org.jsoup.parser.ParseErrorList;
012import org.jsoup.parser.Parser;
013import org.jsoup.select.NodeTraversor;
014import org.jsoup.select.NodeVisitor;
015
016import java.util.List;
017
018/**
019 The safelist based HTML cleaner. Use to ensure that end-user provided HTML contains only the elements and attributes
020 that you are expecting; no junk, and no cross-site scripting attacks!
021 <p>
022 The HTML cleaner parses the input as HTML and then runs it through a safe-list, so the output HTML can only contain
023 HTML that is allowed by the safelist.
024 </p>
025 <p>
026 It is assumed that the input HTML is a body fragment; the clean methods only pull from the source's body, and the
027 canned safe-lists only allow body contained tags.
028 </p>
029 <p>
030 Rather than interacting directly with a Cleaner object, generally see the {@code clean} methods in {@link org.jsoup.Jsoup}.
031 </p>
032 */
033public class Cleaner {
034    private final Safelist safelist;
035
036    /**
037     Create a new cleaner, that sanitizes documents using the supplied safelist.
038     @param safelist safe-list to clean with
039     */
040    public Cleaner(Safelist safelist) {
041        Validate.notNull(safelist);
042        this.safelist = safelist;
043    }
044
045    /**
046     Creates a new, clean document, from the original dirty document, containing only elements allowed by the safelist.
047     The original document is not modified. Only elements from the dirty document's <code>body</code> are used. The
048     OutputSettings of the original document are cloned into the clean document.
049     @param dirtyDocument Untrusted base document to clean.
050     @return cleaned document.
051     */
052    public Document clean(Document dirtyDocument) {
053        Validate.notNull(dirtyDocument);
054
055        Document clean = Document.createShell(dirtyDocument.baseUri());
056        copySafeNodes(dirtyDocument.body(), clean.body());
057        clean.outputSettings(dirtyDocument.outputSettings().clone());
058
059        return clean;
060    }
061
062    /**
063     Determines if the input document's <b>body</b> is valid, against the safelist. It is considered valid if all the
064     tags and attributes in the input HTML are allowed by the safelist, and that there is no content in the
065     <code>head</code>.
066     <p>
067     This method is intended to be used in a user interface as a validator for user input. Note that regardless of the
068     output of this method, the input document <b>must always</b> be normalized using a method such as
069     {@link #clean(Document)}, and the result of that method used to store or serialize the document before later reuse
070     such as presentation to end users. This ensures that enforced attributes are set correctly, and that any
071     differences between how a given browser and how jsoup parses the input HTML are normalized.
072     </p>
073     <p>Example:
074     <pre>{@code
075     Document inputDoc = Jsoup.parse(inputHtml);
076     Cleaner cleaner = new Cleaner(Safelist.relaxed());
077     boolean isValid = cleaner.isValid(inputDoc);
078     Document normalizedDoc = cleaner.clean(inputDoc);
079     }</pre>
080     </p>
081     @param dirtyDocument document to test
082     @return true if no tags or attributes need to be removed; false if they do
083     */
084    public boolean isValid(Document dirtyDocument) {
085        Validate.notNull(dirtyDocument);
086
087        Document clean = Document.createShell(dirtyDocument.baseUri());
088        int numDiscarded = copySafeNodes(dirtyDocument.body(), clean.body());
089        return numDiscarded == 0
090            && dirtyDocument.head().childNodes().isEmpty(); // because we only look at the body, but we start from a shell, make sure there's nothing in the head
091    }
092
093    /**
094     Determines if the input document's <b>body HTML</b> is valid, against the safelist. It is considered valid if all
095     the tags and attributes in the input HTML are allowed by the safelist.
096     <p>
097     This method is intended to be used in a user interface as a validator for user input. Note that regardless of the
098     output of this method, the input document <b>must always</b> be normalized using a method such as
099     {@link #clean(Document)}, and the result of that method used to store or serialize the document before later reuse
100     such as presentation to end users. This ensures that enforced attributes are set correctly, and that any
101     differences between how a given browser and how jsoup parses the input HTML are normalized.
102     </p>
103     <p>Example:
104     <pre>{@code
105     Document inputDoc = Jsoup.parse(inputHtml);
106     Cleaner cleaner = new Cleaner(Safelist.relaxed());
107     boolean isValid = cleaner.isValidBodyHtml(inputHtml);
108     Document normalizedDoc = cleaner.clean(inputDoc);
109     }</pre>
110     </p>
111     @param bodyHtml HTML fragment to test
112     @return true if no tags or attributes need to be removed; false if they do
113     */
114    public boolean isValidBodyHtml(String bodyHtml) {
115        Document clean = Document.createShell("");
116        Document dirty = Document.createShell("");
117        ParseErrorList errorList = ParseErrorList.tracking(1);
118        List<Node> nodes = Parser.parseFragment(bodyHtml, dirty.body(), "", errorList);
119        dirty.body().insertChildren(0, nodes);
120        int numDiscarded = copySafeNodes(dirty.body(), clean.body());
121        return numDiscarded == 0 && errorList.isEmpty();
122    }
123
124    /**
125     Iterates the input and copies trusted nodes (tags, attributes, text) into the destination.
126     */
127    private final class CleaningVisitor implements NodeVisitor {
128        private int numDiscarded = 0;
129        private final Element root;
130        private Element destination; // current element to append nodes to
131
132        private CleaningVisitor(Element root, Element destination) {
133            this.root = root;
134            this.destination = destination;
135        }
136
137        @Override public void head(Node source, int depth) {
138            if (source instanceof Element) {
139                Element sourceEl = (Element) source;
140
141                if (safelist.isSafeTag(sourceEl.normalName())) { // safe, clone and copy safe attrs
142                    ElementMeta meta = createSafeElement(sourceEl);
143                    Element destChild = meta.el;
144                    destination.appendChild(destChild);
145
146                    numDiscarded += meta.numAttribsDiscarded;
147                    destination = destChild;
148                } else if (source != root) { // not a safe tag, so don't add. don't count root against discarded.
149                    numDiscarded++;
150                }
151            } else if (source instanceof TextNode) {
152                TextNode sourceText = (TextNode) source;
153                TextNode destText = new TextNode(sourceText.getWholeText());
154                destination.appendChild(destText);
155            } else if (source instanceof DataNode && safelist.isSafeTag(source.parent().normalName())) {
156              DataNode sourceData = (DataNode) source;
157              DataNode destData = new DataNode(sourceData.getWholeData());
158              destination.appendChild(destData);
159            } else { // else, we don't care about comments, xml proc instructions, etc
160                numDiscarded++;
161            }
162        }
163
164        @Override public void tail(Node source, int depth) {
165            if (source instanceof Element && safelist.isSafeTag(source.normalName())) {
166                destination = destination.parent(); // would have descended, so pop destination stack
167            }
168        }
169    }
170
171    private int copySafeNodes(Element source, Element dest) {
172        CleaningVisitor cleaningVisitor = new CleaningVisitor(source, dest);
173        NodeTraversor.traverse(cleaningVisitor, source);
174        return cleaningVisitor.numDiscarded;
175    }
176
177    private ElementMeta createSafeElement(Element sourceEl) {
178        Element dest = sourceEl.shallowClone(); // reuses tag, clones attributes and preserves any user data
179        String sourceTag = sourceEl.tagName();
180        Attributes destAttrs = dest.attributes();
181        dest.clearAttributes(); // clear all non-internal attributes, ready for safe copy
182
183        int numDiscarded = 0;
184        Attributes sourceAttrs = sourceEl.attributes();
185        for (Attribute sourceAttr : sourceAttrs) {
186            if (safelist.isSafeAttribute(sourceTag, sourceEl, sourceAttr))
187                destAttrs.put(sourceAttr);
188            else
189                numDiscarded++;
190        }
191        Attributes enforcedAttrs = safelist.getEnforcedAttributes(sourceTag);
192        destAttrs.addAll(enforcedAttrs);
193        dest.attributes().addAll(destAttrs); // re-attach, if removed in clear
194        return new ElementMeta(dest, numDiscarded);
195    }
196
197    private static class ElementMeta {
198        Element el;
199        int numAttribsDiscarded;
200
201        ElementMeta(Element el, int numAttribsDiscarded) {
202            this.el = el;
203            this.numAttribsDiscarded = numAttribsDiscarded;
204        }
205    }
206
207}