001package org.jsoup.parser; 002 003import org.jsoup.helper.Validate; 004import org.jsoup.internal.Normalizer; 005 006import java.util.HashMap; 007import java.util.Map; 008import java.util.function.Consumer; 009 010/** 011 * Tag capabilities. 012 * 013 * @author Jonathan Hedley, jonathan@hedley.net 014 */ 015public class Tag implements Cloneable { 016 private static final Map<String, Tag> Tags = new HashMap<>(); // map of known tags 017 018 private String tagName; 019 private final String normalName; // always the lower case version of this tag, regardless of case preservation mode 020 private String namespace; 021 private boolean isBlock = true; // block 022 private boolean formatAsBlock = true; // should be formatted as a block 023 private boolean empty = false; // can hold nothing; e.g. img 024 private boolean selfClosing = false; // can self close (<foo />). used for unknown tags that self close, without forcing them as empty. 025 private boolean preserveWhitespace = false; // for pre, textarea, script etc 026 private boolean formList = false; // a control that appears in forms: input, textarea, output etc 027 private boolean formSubmit = false; // a control that can be submitted in a form: input etc 028 029 private Tag(String tagName, String namespace) { 030 this.tagName = tagName; 031 normalName = Normalizer.lowerCase(tagName); 032 this.namespace = namespace; 033 } 034 035 /** 036 * Get this tag's name. 037 * 038 * @return the tag's name 039 */ 040 public String getName() { 041 return tagName; 042 } 043 044 /** 045 * Get this tag's normalized (lowercased) name. 046 * @return the tag's normal name. 047 */ 048 public String normalName() { 049 return normalName; 050 } 051 052 public String namespace() { 053 return namespace; 054 } 055 056 /** 057 * Get a Tag by name. If not previously defined (unknown), returns a new generic tag, that can do anything. 058 * <p> 059 * Pre-defined tags (p, div etc) will be ==, but unknown tags are not registered and will only .equals(). 060 * </p> 061 * 062 * @param tagName Name of tag, e.g. "p". Case-insensitive. 063 * @param namespace the namespace for the tag. 064 * @param settings used to control tag name sensitivity 065 * @return The tag, either defined or new generic. 066 */ 067 public static Tag valueOf(String tagName, String namespace, ParseSettings settings) { 068 Validate.notEmpty(tagName); 069 Validate.notNull(namespace); 070 Tag tag = Tags.get(tagName); 071 if (tag != null && tag.namespace.equals(namespace)) 072 return tag; 073 074 tagName = settings.normalizeTag(tagName); // the name we'll use 075 Validate.notEmpty(tagName); 076 String normalName = Normalizer.lowerCase(tagName); // the lower-case name to get tag settings off 077 tag = Tags.get(normalName); 078 if (tag != null && tag.namespace.equals(namespace)) { 079 if (settings.preserveTagCase() && !tagName.equals(normalName)) { 080 tag = tag.clone(); // get a new version vs the static one, so name update doesn't reset all 081 tag.tagName = tagName; 082 } 083 return tag; 084 } 085 086 // not defined: create default; go anywhere, do anything! (incl be inside a <p>) 087 tag = new Tag(tagName, namespace); 088 tag.isBlock = false; 089 090 return tag; 091 } 092 093 /** 094 * Get a Tag by name. If not previously defined (unknown), returns a new generic tag, that can do anything. 095 * <p> 096 * Pre-defined tags (P, DIV etc) will be ==, but unknown tags are not registered and will only .equals(). 097 * </p> 098 * 099 * @param tagName Name of tag, e.g. "p". <b>Case sensitive</b>. 100 * @return The tag, either defined or new generic. 101 * @see #valueOf(String tagName, String namespace, ParseSettings settings) 102 */ 103 public static Tag valueOf(String tagName) { 104 return valueOf(tagName, Parser.NamespaceHtml, ParseSettings.preserveCase); 105 } 106 107 /** 108 * Get a Tag by name. If not previously defined (unknown), returns a new generic tag, that can do anything. 109 * <p> 110 * Pre-defined tags (P, DIV etc) will be ==, but unknown tags are not registered and will only .equals(). 111 * </p> 112 * 113 * @param tagName Name of tag, e.g. "p". <b>Case sensitive</b>. 114 * @param settings used to control tag name sensitivity 115 * @return The tag, either defined or new generic. 116 * @see #valueOf(String tagName, String namespace, ParseSettings settings) 117 */ 118 public static Tag valueOf(String tagName, ParseSettings settings) { 119 return valueOf(tagName, Parser.NamespaceHtml, settings); 120 } 121 122 /** 123 * Gets if this is a block tag. 124 * 125 * @return if block tag 126 */ 127 public boolean isBlock() { 128 return isBlock; 129 } 130 131 /** 132 * Gets if this tag should be formatted as a block (or as inline) 133 * 134 * @return if should be formatted as block or inline 135 */ 136 public boolean formatAsBlock() { 137 return formatAsBlock; 138 } 139 140 /** 141 * Gets if this tag is an inline tag. 142 * 143 * @return if this tag is an inline tag. 144 */ 145 public boolean isInline() { 146 return !isBlock; 147 } 148 149 /** 150 * Get if this is an empty tag 151 * 152 * @return if this is an empty tag 153 */ 154 public boolean isEmpty() { 155 return empty; 156 } 157 158 /** 159 * Get if this tag is self-closing. 160 * 161 * @return if this tag should be output as self-closing. 162 */ 163 public boolean isSelfClosing() { 164 return empty || selfClosing; 165 } 166 167 /** 168 * Get if this is a pre-defined tag, or was auto created on parsing. 169 * 170 * @return if a known tag 171 */ 172 public boolean isKnownTag() { 173 return Tags.containsKey(tagName); 174 } 175 176 /** 177 * Check if this tagname is a known tag. 178 * 179 * @param tagName name of tag 180 * @return if known HTML tag 181 */ 182 public static boolean isKnownTag(String tagName) { 183 return Tags.containsKey(tagName); 184 } 185 186 /** 187 * Get if this tag should preserve whitespace within child text nodes. 188 * 189 * @return if preserve whitespace 190 */ 191 public boolean preserveWhitespace() { 192 return preserveWhitespace; 193 } 194 195 /** 196 * Get if this tag represents a control associated with a form. E.g. input, textarea, output 197 * @return if associated with a form 198 */ 199 public boolean isFormListed() { 200 return formList; 201 } 202 203 /** 204 * Get if this tag represents an element that should be submitted with a form. E.g. input, option 205 * @return if submittable with a form 206 */ 207 public boolean isFormSubmittable() { 208 return formSubmit; 209 } 210 211 Tag setSelfClosing() { 212 selfClosing = true; 213 return this; 214 } 215 216 @Override 217 public boolean equals(Object o) { 218 if (this == o) return true; 219 if (!(o instanceof Tag)) return false; 220 221 Tag tag = (Tag) o; 222 223 if (!tagName.equals(tag.tagName)) return false; 224 if (empty != tag.empty) return false; 225 if (formatAsBlock != tag.formatAsBlock) return false; 226 if (isBlock != tag.isBlock) return false; 227 if (preserveWhitespace != tag.preserveWhitespace) return false; 228 if (selfClosing != tag.selfClosing) return false; 229 if (formList != tag.formList) return false; 230 return formSubmit == tag.formSubmit; 231 } 232 233 @Override 234 public int hashCode() { 235 int result = tagName.hashCode(); 236 result = 31 * result + (isBlock ? 1 : 0); 237 result = 31 * result + (formatAsBlock ? 1 : 0); 238 result = 31 * result + (empty ? 1 : 0); 239 result = 31 * result + (selfClosing ? 1 : 0); 240 result = 31 * result + (preserveWhitespace ? 1 : 0); 241 result = 31 * result + (formList ? 1 : 0); 242 result = 31 * result + (formSubmit ? 1 : 0); 243 return result; 244 } 245 246 @Override 247 public String toString() { 248 return tagName; 249 } 250 251 @Override 252 protected Tag clone() { 253 try { 254 return (Tag) super.clone(); 255 } catch (CloneNotSupportedException e) { 256 throw new RuntimeException(e); 257 } 258 } 259 260 // internal static initialisers: 261 // prepped from http://www.w3.org/TR/REC-html40/sgml/dtd.html and other sources 262 private static final String[] blockTags = { 263 "html", "head", "body", "frameset", "script", "noscript", "style", "meta", "link", "title", "frame", 264 "noframes", "section", "nav", "aside", "hgroup", "header", "footer", "p", "h1", "h2", "h3", "h4", "h5", "h6", 265 "ul", "ol", "pre", "div", "blockquote", "hr", "address", "figure", "figcaption", "form", "fieldset", "ins", 266 "del", "dl", "dt", "dd", "li", "table", "caption", "thead", "tfoot", "tbody", "colgroup", "col", "tr", "th", 267 "td", "video", "audio", "canvas", "details", "menu", "plaintext", "template", "article", "main", 268 "svg", "math", "center", "template", 269 "dir", "applet", "marquee", "listing" // deprecated but still known / special handling 270 }; 271 private static final String[] inlineTags = { 272 "object", "base", "font", "tt", "i", "b", "u", "big", "small", "em", "strong", "dfn", "code", "samp", "kbd", 273 "var", "cite", "abbr", "time", "acronym", "mark", "ruby", "rt", "rp", "rtc", "a", "img", "br", "wbr", "map", "q", 274 "sub", "sup", "bdo", "iframe", "embed", "span", "input", "select", "textarea", "label", "button", "optgroup", 275 "option", "legend", "datalist", "keygen", "output", "progress", "meter", "area", "param", "source", "track", 276 "summary", "command", "device", "area", "basefont", "bgsound", "menuitem", "param", "source", "track", 277 "data", "bdi", "s", "strike", "nobr", 278 "rb", // deprecated but still known / special handling 279 "text", // in SVG NS 280 "mi", "mo", "msup", "mn", "mtext" // in MathML NS, to ensure inline 281 }; 282 private static final String[] emptyTags = { 283 "meta", "link", "base", "frame", "img", "br", "wbr", "embed", "hr", "input", "keygen", "col", "command", 284 "device", "area", "basefont", "bgsound", "menuitem", "param", "source", "track" 285 }; 286 // todo - rework this to format contents as inline; and update html emitter in Element. Same output, just neater. 287 private static final String[] formatAsInlineTags = { 288 "title", "a", "p", "h1", "h2", "h3", "h4", "h5", "h6", "pre", "address", "li", "th", "td", "script", "style", 289 "ins", "del", "s" 290 }; 291 private static final String[] preserveWhitespaceTags = { 292 "pre", "plaintext", "title", "textarea" 293 // script is not here as it is a data node, which always preserve whitespace 294 }; 295 // todo: I think we just need submit tags, and can scrub listed 296 private static final String[] formListedTags = { 297 "button", "fieldset", "input", "keygen", "object", "output", "select", "textarea" 298 }; 299 private static final String[] formSubmitTags = { 300 "input", "keygen", "object", "select", "textarea" 301 }; 302 303 private static final Map<String, String[]> namespaces = new HashMap<>(); 304 static { 305 namespaces.put(Parser.NamespaceMathml, new String[]{"math", "mi", "mo", "msup", "mn", "mtext"}); 306 namespaces.put(Parser.NamespaceSvg, new String[]{"svg", "text"}); 307 // We don't need absolute coverage here as other cases will be inferred by the HtmlTreeBuilder 308 } 309 310 private static void setupTags(String[] tagNames, Consumer<Tag> tagModifier) { 311 for (String tagName : tagNames) { 312 Tag tag = Tags.get(tagName); 313 if (tag == null) { 314 tag = new Tag(tagName, Parser.NamespaceHtml); 315 Tags.put(tag.tagName, tag); 316 } 317 tagModifier.accept(tag); 318 } 319 } 320 321 static { 322 setupTags(blockTags, tag -> { 323 tag.isBlock = true; 324 tag.formatAsBlock = true; 325 }); 326 327 setupTags(inlineTags, tag -> { 328 tag.isBlock = false; 329 tag.formatAsBlock = false; 330 }); 331 332 setupTags(emptyTags, tag -> tag.empty = true); 333 setupTags(formatAsInlineTags, tag -> tag.formatAsBlock = false); 334 setupTags(preserveWhitespaceTags, tag -> tag.preserveWhitespace = true); 335 setupTags(formListedTags, tag -> tag.formList = true); 336 setupTags(formSubmitTags, tag -> tag.formSubmit = true); 337 for (Map.Entry<String, String[]> ns : namespaces.entrySet()) { 338 setupTags(ns.getValue(), tag -> tag.namespace = ns.getKey()); 339 } 340 } 341}