package org.jsoup.safety;

import org.jsoup.helper.Validate;
import org.jsoup.nodes.Attribute;
import org.jsoup.nodes.Attributes;
import org.jsoup.nodes.DataNode;
import org.jsoup.nodes.Document;
import org.jsoup.nodes.Element;
import org.jsoup.nodes.Node;
import org.jsoup.nodes.TextNode;
import org.jsoup.parser.ParseErrorList;
import org.jsoup.parser.Parser;
import org.jsoup.select.NodeTraversor;
import org.jsoup.select.NodeVisitor;

import java.util.List;

/**
 Safelist-driven HTML sanitizer. Given end-user supplied HTML, it produces output that contains only the elements and
 attributes the configured {@link Safelist} permits, which guards against unexpected markup and cross-site scripting.
 <p>
 Input is treated as a body fragment: the cleaning methods read only from the source document's {@code body}, and the
 result is assembled inside the {@code body} of a freshly created shell document.
 </p>
 <p>
 Most callers do not need to use this class directly; see the {@code clean} methods in {@link org.jsoup.Jsoup}.
 </p>
 */
public class Cleaner {
    private final Safelist safelist;

    /**
     Builds a cleaner that sanitizes documents against the given safelist.
     @param safelist the safelist to apply; must not be null
     */
    public Cleaner(Safelist safelist) {
        Validate.notNull(safelist);
        this.safelist = safelist;
    }

    /**
     Produces a new document holding only the safelist-approved content of the dirty document's {@code body}. The
     result is built in a new shell document that shares the dirty document's base URI, and receives a clone of the
     dirty document's OutputSettings.
     @param dirtyDocument untrusted document to sanitize; must not be null
     @return the sanitized document
     */
    public Document clean(Document dirtyDocument) {
        Validate.notNull(dirtyDocument);

        Document cleaned = Document.createShell(dirtyDocument.baseUri());
        copySafeNodes(dirtyDocument.body(), cleaned.body());
        cleaned.outputSettings(dirtyDocument.outputSettings().clone());

        return cleaned;
    }

    /**
     Tests whether the input document's <b>body</b> passes the safelist. The document passes when copying its body
     discards nothing (no disallowed tags, attributes, or other unsupported nodes), and its {@code head} has no child
     nodes.
     <p>
     This is meant as a user-interface validator for user input. Whatever this method returns, the input <b>must
     always</b> still be normalized with a method such as {@link #clean(Document)}, and it is that normalized result
     that should be stored or serialized for later presentation. Doing so makes sure enforced attributes are applied
     and smooths over differences between browser parsing and jsoup parsing.
     </p>
     <p>Example:</p>
     <pre>{@code
     Document inputDoc = Jsoup.parse(inputHtml);
     Cleaner cleaner = new Cleaner(Safelist.relaxed());
     boolean isValid = cleaner.isValid(inputDoc);
     Document normalizedDoc = cleaner.clean(inputDoc);
     }</pre>
     @param dirtyDocument document to test; must not be null
     @return true if nothing would need to be removed; false otherwise
     */
    public boolean isValid(Document dirtyDocument) {
        Validate.notNull(dirtyDocument);

        Document scratch = Document.createShell(dirtyDocument.baseUri());
        int discarded = copySafeNodes(dirtyDocument.body(), scratch.body());
        if (discarded != 0) {
            return false;
        }
        // Only the body was traversed, and the copy target is an empty shell, so separately require an empty head.
        return dirtyDocument.head().childNodes().isEmpty();
    }

    /**
     Tests whether a fragment of <b>body HTML</b> passes the safelist. The fragment passes when copying its parsed
     nodes discards nothing and the parser's tracked error list is empty.
     <p>
     This is meant as a user-interface validator for user input. Whatever this method returns, the input <b>must
     always</b> still be normalized with a method such as {@link #clean(Document)}, and it is that normalized result
     that should be stored or serialized for later presentation. Doing so makes sure enforced attributes are applied
     and smooths over differences between browser parsing and jsoup parsing.
     </p>
     <p>Example:</p>
     <pre>{@code
     Document inputDoc = Jsoup.parse(inputHtml);
     Cleaner cleaner = new Cleaner(Safelist.relaxed());
     boolean isValid = cleaner.isValidBodyHtml(inputHtml);
     Document normalizedDoc = cleaner.clean(inputDoc);
     }</pre>
     @param bodyHtml HTML fragment to test
     @return true if nothing would need to be removed; false otherwise
     */
    public boolean isValidBodyHtml(String bodyHtml) {
        Document scratch = Document.createShell("");
        Document dirty = Document.createShell("");
        // Any recorded parse error fails validation below, so the list is created with a tracking argument of 1.
        ParseErrorList parseErrors = ParseErrorList.tracking(1);

        // Parse in the context of the dirty shell's body, then attach the parsed nodes there for traversal.
        Element dirtyBody = dirty.body();
        List<Node> fragment = Parser.parseFragment(bodyHtml, dirtyBody, "", parseErrors);
        dirty.body().insertChildren(0, fragment);

        int discarded = copySafeNodes(dirty.body(), scratch.body());
        if (discarded != 0) {
            return false;
        }
        return parseErrors.isEmpty();
    }

    /**
     Walks the source tree and copies trusted nodes (tags, attributes, text) into the destination, counting whatever
     is left behind.
     */
    private final class CleaningVisitor implements NodeVisitor {
        private int numDiscarded = 0;
        private final Element root;
        private Element destination; // element that copied nodes are currently appended to

        private CleaningVisitor(Element root, Element destination) {
            this.root = root;
            this.destination = destination;
        }

        @Override
        public void head(Node source, int depth) {
            if (source instanceof Element) {
                enterElement((Element) source);
                return;
            }

            if (source instanceof TextNode) {
                String text = ((TextNode) source).getWholeText();
                destination.appendChild(new TextNode(text));
                return;
            }

            // Data nodes are copied only when the tag of their parent is itself allowed.
            boolean copyableData = source instanceof DataNode
                && safelist.isSafeTag(source.parent().normalName());
            if (copyableData) {
                String data = ((DataNode) source).getWholeData();
                destination.appendChild(new DataNode(data));
                return;
            }

            // Anything else (comments, xml processing instructions, etc) is dropped and counted.
            numDiscarded++;
        }

        /**
         Handles an element on the way down: a safe element is copied and becomes the new destination; an unsafe one
         is counted as discarded, unless it is the traversal root.
         */
        private void enterElement(Element sourceEl) {
            if (!safelist.isSafeTag(sourceEl.normalName())) {
                // Not a safe tag, so it is not added. The root itself is never counted against the input.
                if (sourceEl != root) {
                    numDiscarded++;
                }
                return;
            }

            ElementMeta meta = createSafeElement(sourceEl);
            Element copy = meta.el;
            destination.appendChild(copy);
            numDiscarded += meta.numAttribsDiscarded;
            destination = copy; // descend: children of the source are appended to the copy
        }

        @Override
        public void tail(Node source, int depth) {
            if (!(source instanceof Element)) {
                return;
            }
            if (!safelist.isSafeTag(source.normalName())) {
                return;
            }
            // head() descended into this element's copy, so step the destination back up to its parent.
            destination = destination.parent();
        }
    }

    /**
     Traverses {@code source} with a {@link CleaningVisitor}, copying safe nodes into {@code dest}.
     @return the number of nodes and attributes that were discarded
     */
    private int copySafeNodes(Element source, Element dest) {
        CleaningVisitor visitor = new CleaningVisitor(source, dest);
        NodeTraversor.traverse(visitor, source);
        return visitor.numDiscarded;
    }

    /**
     Creates the sanitized counterpart of a source element: a shallow clone carrying only the attributes the safelist
     accepts, plus any attributes the safelist enforces for the tag.
     @return the new element paired with the count of attributes that were rejected
     */
    private ElementMeta createSafeElement(Element sourceEl) {
        Element copy = sourceEl.shallowClone(); // reuses tag, clones attributes and preserves any user data
        String tagName = sourceEl.tagName();
        // Capture the attributes reference before clearing; it is re-attached to the copy at the end.
        Attributes keptAttrs = copy.attributes();
        copy.clearAttributes(); // clear all non-internal attributes, ready for safe copy

        int rejected = 0;
        for (Attribute candidate : sourceEl.attributes()) {
            if (!safelist.isSafeAttribute(tagName, sourceEl, candidate)) {
                rejected++;
                continue;
            }
            keptAttrs.put(candidate);
        }

        Attributes enforcedAttrs = safelist.getEnforcedAttributes(tagName);
        keptAttrs.addAll(enforcedAttrs);
        copy.attributes().addAll(keptAttrs); // re-attach, if removed in clear
        return new ElementMeta(copy, rejected);
    }

    /** Pairs a sanitized element with the number of attributes dropped while creating it. */
    private static class ElementMeta {
        final Element el;
        final int numAttribsDiscarded;

        ElementMeta(Element el, int numAttribsDiscarded) {
            this.el = el;
            this.numAttribsDiscarded = numAttribsDiscarded;
        }
    }

}