001package org.jsoup.nodes;
002
003import org.jsoup.SerializationException;
004import org.jsoup.internal.StringUtil;
005import org.jsoup.helper.Validate;
006import org.jsoup.nodes.Document.OutputSettings;
007import org.jsoup.parser.CharacterReader;
008import org.jsoup.parser.Parser;
009import org.jspecify.annotations.Nullable;
010
011import java.io.IOException;
012import java.nio.charset.CharsetEncoder;
013import java.util.Arrays;
014import java.util.HashMap;
015
016import static org.jsoup.nodes.Document.OutputSettings.*;
017import static org.jsoup.nodes.Entities.EscapeMode.base;
018import static org.jsoup.nodes.Entities.EscapeMode.extended;
019
020/**
021 * HTML entities, and escape routines. Source: <a href="http://www.w3.org/TR/html5/named-character-references.html#named-character-references">W3C
022 * HTML named character references</a>.
023 */
024public class Entities {
    // Sentinel codepoint meaning "no such entity"; EscapeMode.codepointForName returns it when the name is not found.
    private static final int empty = -1;
    // Sentinel name meaning "no named entity"; EscapeMode.nameForCodepoint returns it when the codepoint is not found.
    private static final String emptyName = "";
    // Radix in which codepoints and table indices are written in the packed entity data parsed by load().
    static final int codepointRadix = 36;
    // Characters that can end the first codepoint of a packed record: ',' (a second codepoint follows) or ';'.
    private static final char[] codeDelims = {',', ';'};
    private static final HashMap<String, String> multipoints = new HashMap<>(); // name -> multiple character references
030
031    public enum EscapeMode {
032        /**
033         * Restricted entities suitable for XHTML output: lt, gt, amp, and quot only.
034         */
035        xhtml(EntitiesData.xmlPoints, 4),
036        /**
037         * Default HTML output entities.
038         */
039        base(EntitiesData.basePoints, 106),
040        /**
041         * Complete HTML entities.
042         */
043        extended(EntitiesData.fullPoints, 2125);
044
045        // table of named references to their codepoints. sorted so we can binary search. built by BuildEntities.
046        private String[] nameKeys;
047        private int[] codeVals; // limitation is the few references with multiple characters; those go into multipoints.
048
049        // table of codepoints to named entities.
050        private int[] codeKeys; // we don't support multicodepoints to single named value currently
051        private String[] nameVals;
052
053        EscapeMode(String file, int size) {
054            load(this, file, size);
055        }
056
057        int codepointForName(final String name) {
058            int index = Arrays.binarySearch(nameKeys, name);
059            return index >= 0 ? codeVals[index] : empty;
060        }
061
062        String nameForCodepoint(final int codepoint) {
063            final int index = Arrays.binarySearch(codeKeys, codepoint);
064            if (index >= 0) {
065                // the results are ordered so lower case versions of same codepoint come after uppercase, and we prefer to emit lower
066                // (and binary search for same item with multi results is undefined
067                return (index < nameVals.length - 1 && codeKeys[index + 1] == codepoint) ?
068                    nameVals[index + 1] : nameVals[index];
069            }
070            return emptyName;
071        }
072
073        private int size() {
074            return nameKeys.length;
075        }
076    }
077
078    private Entities() {
079    }
080
081    /**
082     * Check if the input is a known named entity
083     *
084     * @param name the possible entity name (e.g. "lt" or "amp")
085     * @return true if a known named entity
086     */
087    public static boolean isNamedEntity(final String name) {
088        return extended.codepointForName(name) != empty;
089    }
090
091    /**
092     * Check if the input is a known named entity in the base entity set.
093     *
094     * @param name the possible entity name (e.g. "lt" or "amp")
095     * @return true if a known named entity in the base set
096     * @see #isNamedEntity(String)
097     */
098    public static boolean isBaseNamedEntity(final String name) {
099        return base.codepointForName(name) != empty;
100    }
101
102    /**
103     * Get the character(s) represented by the named entity
104     *
105     * @param name entity (e.g. "lt" or "amp")
106     * @return the string value of the character(s) represented by this entity, or "" if not defined
107     */
108    public static String getByName(String name) {
109        String val = multipoints.get(name);
110        if (val != null)
111            return val;
112        int codepoint = extended.codepointForName(name);
113        if (codepoint != empty)
114            return new String(new int[]{codepoint}, 0, 1);
115        return emptyName;
116    }
117
118    public static int codepointsForName(final String name, final int[] codepoints) {
119        String val = multipoints.get(name);
120        if (val != null) {
121            codepoints[0] = val.codePointAt(0);
122            codepoints[1] = val.codePointAt(1);
123            return 2;
124        }
125        int codepoint = extended.codepointForName(name);
126        if (codepoint != empty) {
127            codepoints[0] = codepoint;
128            return 1;
129        }
130        return 0;
131    }
132
133    /**
134     * HTML escape an input string. That is, {@code <} is returned as {@code &lt;}
135     *
136     * @param string the un-escaped string to escape
137     * @param out the output settings to use
138     * @return the escaped string
139     */
140    public static String escape(String string, OutputSettings out) {
141        if (string == null)
142            return "";
143        StringBuilder accum = StringUtil.borrowBuilder();
144        try {
145            escape(accum, string, out, false, false, false, false);
146        } catch (IOException e) {
147            throw new SerializationException(e); // doesn't happen
148        }
149        return StringUtil.releaseBuilder(accum);
150    }
151
152    /**
153     * HTML escape an input string, using the default settings (UTF-8, base entities). That is, {@code <} is returned as
154     * {@code &lt;}
155     *
156     * @param string the un-escaped string to escape
157     * @return the escaped string
158     */
159    public static String escape(String string) {
160        if (DefaultOutput == null)
161            DefaultOutput = new OutputSettings();
162        return escape(string, DefaultOutput);
163    }
164    private static @Nullable OutputSettings DefaultOutput; // lazy-init, to break circular dependency with OutputSettings
165
166    // this method does a lot, but other breakups cause rescanning and stringbuilder generations
167    static void escape(Appendable accum, String string, OutputSettings out,
168                       boolean inAttribute, boolean normaliseWhite, boolean stripLeadingWhite, boolean trimTrailing) throws IOException {
169
170        boolean lastWasWhite = false;
171        boolean reachedNonWhite = false;
172        final EscapeMode escapeMode = out.escapeMode();
173        final CharsetEncoder encoder = out.encoder();
174        final CoreCharset coreCharset = out.coreCharset; // init in out.prepareEncoder()
175        final int length = string.length();
176
177        int codePoint;
178        boolean skipped = false;
179        for (int offset = 0; offset < length; offset += Character.charCount(codePoint)) {
180            codePoint = string.codePointAt(offset);
181
182            if (normaliseWhite) {
183                if (StringUtil.isWhitespace(codePoint)) {
184                    if (stripLeadingWhite && !reachedNonWhite) continue;
185                    if (lastWasWhite) continue;
186                    if (trimTrailing) {
187                        skipped = true;
188                        continue;
189                    }
190                    accum.append(' ');
191                    lastWasWhite = true;
192                    continue;
193                } else {
194                    lastWasWhite = false;
195                    reachedNonWhite = true;
196                    if (skipped) {
197                        accum.append(' '); // wasn't the end, so need to place a normalized space
198                        skipped = false;
199                    }
200                }
201            }
202            // surrogate pairs, split implementation for efficiency on single char common case (saves creating strings, char[]):
203            if (codePoint < Character.MIN_SUPPLEMENTARY_CODE_POINT) {
204                final char c = (char) codePoint;
205                // html specific and required escapes:
206                switch (c) {
207                    case '&':
208                        accum.append("&amp;");
209                        break;
210                    case 0xA0:
211                        if (escapeMode != EscapeMode.xhtml)
212                            accum.append("&nbsp;");
213                        else
214                            accum.append("&#xa0;");
215                        break;
216                    case '<':
217                        // escape when in character data or when in a xml attribute val or XML syntax; not needed in html attr val
218                        if (!inAttribute || escapeMode == EscapeMode.xhtml || out.syntax() == Syntax.xml)
219                            accum.append("&lt;");
220                        else
221                            accum.append(c);
222                        break;
223                    case '>':
224                        if (!inAttribute)
225                            accum.append("&gt;");
226                        else
227                            accum.append(c);
228                        break;
229                    case '"':
230                        if (inAttribute)
231                            accum.append("&quot;");
232                        else
233                            accum.append(c);
234                        break;
235                    // we escape ascii control <x20 (other than tab, line-feed, carriage return)  for XML compliance (required) and HTML ease of reading (not required) - https://www.w3.org/TR/xml/#charsets
236                    case 0x9:
237                    case 0xA:
238                    case 0xD:
239                        accum.append(c);
240                        break;
241                    default:
242                        if (c < 0x20 || !canEncode(coreCharset, c, encoder))
243                            appendEncoded(accum, escapeMode, codePoint);
244                        else
245                            accum.append(c);
246                }
247            } else {
248                final String c = new String(Character.toChars(codePoint));
249                if (encoder.canEncode(c)) // uses fallback encoder for simplicity
250                    accum.append(c);
251                else
252                    appendEncoded(accum, escapeMode, codePoint);
253            }
254        }
255    }
256
257    private static void appendEncoded(Appendable accum, EscapeMode escapeMode, int codePoint) throws IOException {
258        final String name = escapeMode.nameForCodepoint(codePoint);
259        if (!emptyName.equals(name)) // ok for identity check
260            accum.append('&').append(name).append(';');
261        else
262            accum.append("&#x").append(Integer.toHexString(codePoint)).append(';');
263    }
264
265    /**
266     * Un-escape an HTML escaped string. That is, {@code &lt;} is returned as {@code <}.
267     *
268     * @param string the HTML string to un-escape
269     * @return the unescaped string
270     */
271    public static String unescape(String string) {
272        return unescape(string, false);
273    }
274
275    /**
276     * Unescape the input string.
277     *
278     * @param string to un-HTML-escape
279     * @param strict if "strict" (that is, requires trailing ';' char, otherwise that's optional)
280     * @return unescaped string
281     */
282    static String unescape(String string, boolean strict) {
283        return Parser.unescapeEntities(string, strict);
284    }
285
286    /*
287     * Provides a fast-path for Encoder.canEncode, which drastically improves performance on Android post JellyBean.
288     * After KitKat, the implementation of canEncode degrades to the point of being useless. For non ASCII or UTF,
289     * performance may be bad. We can add more encoders for common character sets that are impacted by performance
290     * issues on Android if required.
291     *
292     * Benchmarks:     *
293     * OLD toHtml() impl v New (fastpath) in millis
294     * Wiki: 1895, 16
295     * CNN: 6378, 55
296     * Alterslash: 3013, 28
297     * Jsoup: 167, 2
298     */
299    private static boolean canEncode(final CoreCharset charset, final char c, final CharsetEncoder fallback) {
300        // todo add more charset tests if impacted by Android's bad perf in canEncode
301        switch (charset) {
302            case ascii:
303                return c < 0x80;
304            case utf:
305                return true; // real is:!(Character.isLowSurrogate(c) || Character.isHighSurrogate(c)); - but already check above
306            default:
307                return fallback.canEncode(c);
308        }
309    }
310
311    enum CoreCharset {
312        ascii, utf, fallback;
313
314        static CoreCharset byName(final String name) {
315            if (name.equals("US-ASCII"))
316                return ascii;
317            if (name.startsWith("UTF-")) // covers UTF-8, UTF-16, et al
318                return utf;
319            return fallback;
320        }
321    }
322
323    private static void load(EscapeMode e, String pointsData, int size) {
324        e.nameKeys = new String[size];
325        e.codeVals = new int[size];
326        e.codeKeys = new int[size];
327        e.nameVals = new String[size];
328
329        int i = 0;
330        CharacterReader reader = new CharacterReader(pointsData);
331        try {
332            while (!reader.isEmpty()) {
333                // NotNestedLessLess=10913,824;1887&
334
335                final String name = reader.consumeTo('=');
336                reader.advance();
337                final int cp1 = Integer.parseInt(reader.consumeToAny(codeDelims), codepointRadix);
338                final char codeDelim = reader.current();
339                reader.advance();
340                final int cp2;
341                if (codeDelim == ',') {
342                    cp2 = Integer.parseInt(reader.consumeTo(';'), codepointRadix);
343                    reader.advance();
344                } else {
345                    cp2 = empty;
346                }
347                final String indexS = reader.consumeTo('&');
348                final int index = Integer.parseInt(indexS, codepointRadix);
349                reader.advance();
350
351                e.nameKeys[i] = name;
352                e.codeVals[i] = cp1;
353                e.codeKeys[index] = cp1;
354                e.nameVals[index] = name;
355
356                if (cp2 != empty) {
357                    multipoints.put(name, new String(new int[]{cp1, cp2}, 0, 2));
358                }
359                i++;
360            }
361
362            Validate.isTrue(i == size, "Unexpected count of entities loaded");
363        } finally {
364            reader.close();
365        }
366    }
367}