package org.jsoup.nodes;

import org.jsoup.SerializationException;
import org.jsoup.internal.StringUtil;
import org.jsoup.helper.Validate;
import org.jsoup.nodes.Document.OutputSettings;
import org.jsoup.parser.CharacterReader;
import org.jsoup.parser.Parser;
import org.jspecify.annotations.Nullable;

import java.io.IOException;
import java.nio.charset.CharsetEncoder;
import java.util.Arrays;
import java.util.HashMap;

import static org.jsoup.nodes.Document.OutputSettings.*;
import static org.jsoup.nodes.Entities.EscapeMode.base;
import static org.jsoup.nodes.Entities.EscapeMode.extended;

/**
 * HTML entities, and escape routines. Source: <a href="http://www.w3.org/TR/html5/named-character-references.html#named-character-references">W3C
 * HTML named character references</a>.
 */
public class Entities {
    private static final int empty = -1; // sentinel: no codepoint found / no second codepoint
    private static final String emptyName = ""; // sentinel: no named entity for a codepoint
    static final int codepointRadix = 36; // the entity data tables encode their numbers in base 36
    private static final char[] codeDelims = {',', ';'};
    private static final HashMap<String, String> multipoints = new HashMap<>(); // name -> multiple character references

    /**
     * The set of named entities used when escaping. Each mode loads its own lookup tables (name to codepoint, and
     * codepoint to name) when the enum constant is constructed.
     */
    public enum EscapeMode {
        /**
         * Restricted entities suitable for XHTML output: lt, gt, amp, and quot only.
         */
        xhtml(EntitiesData.xmlPoints, 4),
        /**
         * Default HTML output entities.
         */
        base(EntitiesData.basePoints, 106),
        /**
         * Complete HTML entities.
         */
        extended(EntitiesData.fullPoints, 2125);

        // table of named references to their codepoints. sorted so we can binary search. built by BuildEntities.
        private String[] nameKeys;
        private int[] codeVals; // limitation is the few references with multiple characters; those go into multipoints.

        // table of codepoints to named entities.
        private int[] codeKeys; // we don't support multicodepoints to single named value currently
        private String[] nameVals;

        /**
         * @param file the encoded entity data, handed to {@link Entities#load} as its points data
         * @param size the number of entities expected in that data; validated once loading completes
         */
        EscapeMode(String file, int size) {
            load(this, file, size);
        }

        /**
         * Look up the (first) codepoint for a named entity in this mode.
         *
         * @param name the entity name, without the leading {@code &} or trailing {@code ;}
         * @return the codepoint, or {@code -1} if the name is not in this mode's table
         */
        int codepointForName(final String name) {
            int index = Arrays.binarySearch(nameKeys, name);
            return index >= 0 ? codeVals[index] : empty;
        }

        /**
         * Look up the entity name for a codepoint in this mode.
         *
         * @param codepoint the codepoint to find a name for
         * @return the entity name, or the empty string if this mode has no name for the codepoint
         */
        String nameForCodepoint(final int codepoint) {
            final int index = Arrays.binarySearch(codeKeys, codepoint);
            if (index >= 0) {
                // the results are ordered so lower case versions of same codepoint come after uppercase, and we prefer
                // to emit lower (and binary search for same item with multi results is undefined)
                return (index < nameVals.length - 1 && codeKeys[index + 1] == codepoint) ?
                    nameVals[index + 1] : nameVals[index];
            }
            return emptyName;
        }

        private int size() {
            return nameKeys.length;
        }
    }

    private Entities() {
    }

    /**
     * Check if the input is a known named entity
     *
     * @param name the possible entity name (e.g. "lt" or "amp")
     * @return true if a known named entity
     */
    public static boolean isNamedEntity(final String name) {
        return extended.codepointForName(name) != empty;
    }

    /**
     * Check if the input is a known named entity in the base entity set.
     *
     * @param name the possible entity name (e.g. "lt" or "amp")
     * @return true if a known named entity in the base set
     * @see #isNamedEntity(String)
     */
    public static boolean isBaseNamedEntity(final String name) {
        return base.codepointForName(name) != empty;
    }

    /**
     * Get the character(s) represented by the named entity
     *
     * @param name entity (e.g. "lt" or "amp")
     * @return the string value of the character(s) represented by this entity, or "" if not defined
     */
    public static String getByName(String name) {
        String val = multipoints.get(name);
        if (val != null)
            return val;
        int codepoint = extended.codepointForName(name);
        if (codepoint != empty)
            return new String(new int[]{codepoint}, 0, 1);
        return emptyName;
    }

    /**
     * Resolve a named entity into its codepoint(s), writing them into the supplied array.
     *
     * @param name the entity name (e.g. "lt" or "amp")
     * @param codepoints destination array; index 0 (and index 1 for a two-codepoint entity) is overwritten, so it
     * must have room for at least two entries
     * @return the number of codepoints written: 2 for a multi-codepoint entity, 1 for a single-codepoint entity,
     * or 0 if the name is not a known entity (in which case the array is left untouched)
     */
    public static int codepointsForName(final String name, final int[] codepoints) {
        String val = multipoints.get(name);
        if (val != null) {
            final int first = val.codePointAt(0);
            codepoints[0] = first;
            // step over the first codepoint by its real width, so a supplementary first codepoint can't be split
            codepoints[1] = val.codePointAt(Character.charCount(first));
            return 2;
        }
        int codepoint = extended.codepointForName(name);
        if (codepoint != empty) {
            codepoints[0] = codepoint;
            return 1;
        }
        return 0;
    }

    /**
     * HTML escape an input string. That is, {@code <} is returned as {@code &lt;}
     *
     * @param string the un-escaped string to escape
     * @param out the output settings to use
     * @return the escaped string
     */
    public static String escape(String string, OutputSettings out) {
        if (string == null)
            return "";
        StringBuilder accum = StringUtil.borrowBuilder();
        try {
            escape(accum, string, out, false, false, false, false);
        } catch (IOException e) {
            throw new SerializationException(e); // doesn't happen
        }
        return StringUtil.releaseBuilder(accum);
    }

    /**
     * HTML escape an input string, using the default settings (UTF-8, base entities). That is, {@code <} is returned as
     * {@code &lt;}
     *
     * @param string the un-escaped string to escape
     * @return the escaped string
     */
    public static String escape(String string) {
        if (DefaultOutput == null)
            DefaultOutput = new OutputSettings();
        return escape(string, DefaultOutput);
    }
    private static @Nullable OutputSettings DefaultOutput; // lazy-init, to break circular dependency with OutputSettings

    /**
     * Escape the input into the supplied accumulator, optionally normalising whitespace on the way through.
     *
     * @param accum destination for the escaped output
     * @param string the un-escaped input
     * @param out output settings; supplies the escape mode, the charset encoder, and the syntax
     * @param inAttribute true if the text is an attribute value (changes how {@code <}, {@code >} and {@code "}
     * are treated)
     * @param normaliseWhite if true, each run of whitespace is collapsed to a single space
     * @param stripLeadingWhite if true (and normalising), whitespace before the first non-whitespace is dropped
     * @param trimTrailing if true (and normalising), a space is only emitted once a later non-whitespace character
     * is seen, so trailing whitespace is dropped
     * @throws IOException if the accumulator throws on append
     */
    // this method does a lot, but other breakups cause rescanning and stringbuilder generations
    static void escape(Appendable accum, String string, OutputSettings out,
                       boolean inAttribute, boolean normaliseWhite, boolean stripLeadingWhite, boolean trimTrailing) throws IOException {

        boolean lastWasWhite = false;
        boolean reachedNonWhite = false;
        final EscapeMode escapeMode = out.escapeMode();
        final CharsetEncoder encoder = out.encoder();
        final CoreCharset coreCharset = out.coreCharset; // init in out.prepareEncoder()
        final int length = string.length();

        int codePoint;
        boolean skipped = false; // a whitespace run was held back (trimTrailing) and may still need emitting
        for (int offset = 0; offset < length; offset += Character.charCount(codePoint)) {
            codePoint = string.codePointAt(offset);

            if (normaliseWhite) {
                if (StringUtil.isWhitespace(codePoint)) {
                    if (stripLeadingWhite && !reachedNonWhite) continue;
                    if (lastWasWhite) continue;
                    if (trimTrailing) {
                        skipped = true;
                        continue;
                    }
                    accum.append(' ');
                    lastWasWhite = true;
                    continue;
                } else {
                    lastWasWhite = false;
                    reachedNonWhite = true;
                    if (skipped) {
                        accum.append(' '); // wasn't the end, so need to place a normalized space
                        skipped = false;
                    }
                }
            }
            // surrogate pairs, split implementation for efficiency on single char common case (saves creating strings, char[]):
            if (codePoint < Character.MIN_SUPPLEMENTARY_CODE_POINT) {
                final char c = (char) codePoint;
                // html specific and required escapes:
                switch (c) {
                    case '&':
                        accum.append("&amp;");
                        break;
                    case 0xA0:
                        // non-breaking space: named form unless in xhtml mode, where the numeric form is used
                        if (escapeMode != EscapeMode.xhtml)
                            accum.append("&nbsp;");
                        else
                            accum.append("&#xa0;");
                        break;
                    case '<':
                        // escape when in character data or when in a xml attribute val or XML syntax; not needed in html attr val
                        if (!inAttribute || escapeMode == EscapeMode.xhtml || out.syntax() == Syntax.xml)
                            accum.append("&lt;");
                        else
                            accum.append(c);
                        break;
                    case '>':
                        if (!inAttribute)
                            accum.append("&gt;");
                        else
                            accum.append(c);
                        break;
                    case '"':
                        if (inAttribute)
                            accum.append("&quot;");
                        else
                            accum.append(c);
                        break;
                    // we escape ascii control <x20 (other than tab, line-feed, carriage return) for XML compliance (required) and HTML ease of reading (not required) - https://www.w3.org/TR/xml/#charsets
                    case 0x9:
                    case 0xA:
                    case 0xD:
                        accum.append(c);
                        break;
                    default:
                        if (c < 0x20 || !canEncode(coreCharset, c, encoder))
                            appendEncoded(accum, escapeMode, codePoint);
                        else
                            accum.append(c);
                }
            } else {
                final String c = new String(Character.toChars(codePoint));
                if (encoder.canEncode(c)) // uses fallback encoder for simplicity
                    accum.append(c);
                else
                    appendEncoded(accum, escapeMode, codePoint);
            }
        }
    }

    /**
     * Append a codepoint as a character reference: the named form if the escape mode has a name for it, otherwise a
     * hexadecimal numeric reference.
     *
     * @param accum destination for the reference
     * @param escapeMode the mode whose name table is consulted
     * @param codePoint the codepoint to encode
     * @throws IOException if the accumulator throws on append
     */
    private static void appendEncoded(Appendable accum, EscapeMode escapeMode, int codePoint) throws IOException {
        final String name = escapeMode.nameForCodepoint(codePoint);
        if (!emptyName.equals(name)) // emptyName is the "no named entity" sentinel from nameForCodepoint
            accum.append('&').append(name).append(';');
        else
            accum.append("&#x").append(Integer.toHexString(codePoint)).append(';');
    }

    /**
     * Un-escape an HTML escaped string. That is, {@code &lt;} is returned as {@code <}.
     *
     * @param string the HTML string to un-escape
     * @return the unescaped string
     */
    public static String unescape(String string) {
        return unescape(string, false);
    }

    /**
     * Unescape the input string.
     *
     * @param string to un-HTML-escape
     * @param strict if "strict" (that is, requires trailing ';' char, otherwise that's optional)
     * @return unescaped string
     */
    static String unescape(String string, boolean strict) {
        return Parser.unescapeEntities(string, strict);
    }

    /*
     * Provides a fast-path for Encoder.canEncode, which drastically improves performance on Android post JellyBean.
     * After KitKat, the implementation of canEncode degrades to the point of being useless. For non ASCII or UTF,
     * performance may be bad. We can add more encoders for common character sets that are impacted by performance
     * issues on Android if required.
     *
     * Benchmarks:     *
     * OLD toHtml() impl v New (fastpath) in millis
     * Wiki: 1895, 16
     * CNN: 6378, 55
     * Alterslash: 3013, 28
     * Jsoup: 167, 2
     */
    private static boolean canEncode(final CoreCharset charset, final char c, final CharsetEncoder fallback) {
        // todo add more charset tests if impacted by Android's bad perf in canEncode
        switch (charset) {
            case ascii:
                return c < 0x80;
            case utf:
                return true; // real is:!(Character.isLowSurrogate(c) || Character.isHighSurrogate(c)); - but already check above
            default:
                return fallback.canEncode(c);
        }
    }

    /**
     * Coarse classification of an output charset, used to pick the {@code canEncode} fast path.
     */
    enum CoreCharset {
        ascii, utf, fallback;

        /**
         * Classify a charset by its name.
         *
         * @param name the charset name
         * @return {@code ascii} for exactly "US-ASCII", {@code utf} for any name starting with "UTF-", otherwise
         * {@code fallback}
         */
        static CoreCharset byName(final String name) {
            if (name.equals("US-ASCII"))
                return ascii;
            if (name.startsWith("UTF-")) // covers UTF-8, UTF-16, et al
                return utf;
            return fallback;
        }
    }

    /**
     * Populate an escape mode's lookup tables from its encoded entity data. Entities that carry a second codepoint
     * are additionally registered in the shared {@code multipoints} map.
     *
     * @param e the escape mode whose tables are (re)allocated and filled
     * @param pointsData the encoded entity records
     * @param size the expected number of records; loading fails validation if the count differs
     */
    private static void load(EscapeMode e, String pointsData, int size) {
        e.nameKeys = new String[size];
        e.codeVals = new int[size];
        e.codeKeys = new int[size];
        e.nameVals = new String[size];

        int i = 0;
        CharacterReader reader = new CharacterReader(pointsData);
        try {
            while (!reader.isEmpty()) {
                // record format is name=cp1[,cp2];index& with the numbers in base 36, e.g.:
                // NotNestedLessLess=10913,824;1887&

                final String name = reader.consumeTo('=');
                reader.advance();
                final int cp1 = Integer.parseInt(reader.consumeToAny(codeDelims), codepointRadix);
                final char codeDelim = reader.current();
                reader.advance();
                final int cp2;
                if (codeDelim == ',') { // a comma after the first codepoint means a second one follows
                    cp2 = Integer.parseInt(reader.consumeTo(';'), codepointRadix);
                    reader.advance();
                } else {
                    cp2 = empty;
                }
                final String indexS = reader.consumeTo('&');
                final int index = Integer.parseInt(indexS, codepointRadix);
                reader.advance();

                // records arrive in name order (slot i); index is this record's slot in the codepoint-keyed tables
                e.nameKeys[i] = name;
                e.codeVals[i] = cp1;
                e.codeKeys[index] = cp1;
                e.nameVals[index] = name;

                if (cp2 != empty) {
                    multipoints.put(name, new String(new int[]{cp1, cp2}, 0, 2));
                }
                i++;
            }

            Validate.isTrue(i == size, "Unexpected count of entities loaded");
        } finally {
            reader.close();
        }
    }
}