001package org.jsoup.parser;
002
003import org.jsoup.UncheckedIOException;
004import org.jsoup.helper.Validate;
005import org.jspecify.annotations.Nullable;
006
007import java.io.IOException;
008import java.io.Reader;
009import java.io.StringReader;
010import java.util.ArrayList;
011import java.util.Arrays;
012import java.util.Collections;
013import java.util.Locale;
014
015/**
016 CharacterReader consumes tokens off a string. Used internally by jsoup. API subject to changes.
017 */
018public final class CharacterReader {
019    static final char EOF = (char) -1;
020    private static final int maxStringCacheLen = 12;
021    static final int maxBufferLen = 1024 * 32; // visible for testing
022    static final int readAheadLimit = (int) (maxBufferLen * 0.75); // visible for testing
023    private static final int minReadAheadLen = 1024; // the minimum mark length supported. No HTML entities can be larger than this.
024
025    private char[] charBuf;
026    private Reader reader;
027    private int bufLength;
028    private int bufSplitPoint;
029    private int bufPos;
030    private int readerPos;
031    private int bufMark = -1;
032    private static final int stringCacheSize = 512;
033    private String[] stringCache = new String[stringCacheSize]; // holds reused strings in this doc, to lessen garbage
034
035    @Nullable private ArrayList<Integer> newlinePositions = null; // optionally track the pos() position of newlines - scans during bufferUp()
036    private int lineNumberOffset = 1; // line numbers start at 1; += newlinePosition[indexof(pos)]
037
038    public CharacterReader(Reader input, int sz) {
039        Validate.notNull(input);
040        Validate.isTrue(input.markSupported());
041        reader = input;
042        charBuf = new char[Math.min(sz, maxBufferLen)];
043        bufferUp();
044    }
045
046    public CharacterReader(Reader input) {
047        this(input, maxBufferLen);
048    }
049
050    public CharacterReader(String input) {
051        this(new StringReader(input), input.length());
052    }
053
054    public void close() {
055        if (reader == null)
056            return;
057        try {
058            reader.close();
059        } catch (IOException ignored) {
060        } finally {
061            reader = null;
062            charBuf = null;
063            stringCache = null;
064        }
065    }
066
067    private boolean readFully; // if the underlying stream has been completely read, no value in further buffering
068    private void bufferUp() {
069        if (readFully || bufPos < bufSplitPoint)
070            return;
071
072        final int pos;
073        final int offset;
074        if (bufMark != -1) {
075            pos = bufMark;
076            offset = bufPos - bufMark;
077        } else {
078            pos = bufPos;
079            offset = 0;
080        }
081
082        try {
083            final long skipped = reader.skip(pos);
084            reader.mark(maxBufferLen);
085            int read = 0;
086            while (read <= minReadAheadLen) {
087                int thisRead = reader.read(charBuf, read, charBuf.length - read);
088                if (thisRead == -1)
089                    readFully = true;
090                if (thisRead <= 0)
091                    break;
092                read += thisRead;
093            }
094            reader.reset();
095            if (read > 0) {
096                Validate.isTrue(skipped == pos); // Previously asserted that there is room in buf to skip, so this will be a WTF
097                bufLength = read;
098                readerPos += pos;
099                bufPos = offset;
100                if (bufMark != -1)
101                    bufMark = 0;
102                bufSplitPoint = Math.min(bufLength, readAheadLimit);
103            }
104        } catch (IOException e) {
105            throw new UncheckedIOException(e);
106        }
107        scanBufferForNewlines(); // if enabled, we index newline positions for line number tracking
108        lastIcSeq = null; // cache for last containsIgnoreCase(seq)
109    }
110
111    /**
112     * Gets the position currently read to in the content. Starts at 0.
113     * @return current position
114     */
115    public int pos() {
116        return readerPos + bufPos;
117    }
118
119    /** Tests if the buffer has been fully read. */
120    boolean readFully() {
121        return readFully;
122    }
123
124    /**
125     Enables or disables line number tracking. By default, will be <b>off</b>.Tracking line numbers improves the
126     legibility of parser error messages, for example. Tracking should be enabled before any content is read to be of
127     use.
128
129     @param track set tracking on|off
130     @since 1.14.3
131     */
132    public void trackNewlines(boolean track) {
133        if (track && newlinePositions == null) {
134            newlinePositions = new ArrayList<>(maxBufferLen / 80); // rough guess of likely count
135            scanBufferForNewlines(); // first pass when enabled; subsequently called during bufferUp
136        }
137        else if (!track)
138            newlinePositions = null;
139    }
140
141    /**
142     Check if the tracking of newlines is enabled.
143     @return the current newline tracking state
144     @since 1.14.3
145     */
146    public boolean isTrackNewlines() {
147        return newlinePositions != null;
148    }
149
150    /**
151     Get the current line number (that the reader has consumed to). Starts at line #1.
152     @return the current line number, or 1 if line tracking is not enabled.
153     @since 1.14.3
154     @see #trackNewlines(boolean)
155     */
156    public int lineNumber() {
157        return lineNumber(pos());
158    }
159
160    int lineNumber(int pos) {
161        // note that this impl needs to be called before the next buffer up or line numberoffset will be wrong. if that
162        // causes issues, can remove the reset of newlinepositions during buffer, at the cost of a larger tracking array
163        if (!isTrackNewlines())
164            return 1;
165
166        int i = lineNumIndex(pos);
167        if (i == -1)
168            return lineNumberOffset; // first line
169        return i + lineNumberOffset + 1;
170    }
171
172    /**
173     Get the current column number (that the reader has consumed to). Starts at column #1.
174     @return the current column number
175     @since 1.14.3
176     @see #trackNewlines(boolean)
177     */
178    public int columnNumber() {
179        return columnNumber(pos());
180    }
181
182    int columnNumber(int pos) {
183        if (!isTrackNewlines())
184            return pos + 1;
185
186        int i = lineNumIndex(pos);
187        if (i == -1)
188          return pos + 1;
189        return pos - newlinePositions.get(i) + 1;
190    }
191
192    /**
193     Get a formatted string representing the current line and column positions. E.g. <code>5:10</code> indicating line
194     number 5 and column number 10.
195     @return line:col position
196     @since 1.14.3
197     @see #trackNewlines(boolean)
198     */
199    String posLineCol() {
200        return lineNumber() + ":" + columnNumber();
201    }
202
203    private int lineNumIndex(int pos) {
204        if (!isTrackNewlines()) return 0;
205        int i = Collections.binarySearch(newlinePositions, pos);
206        if (i < -1) i = Math.abs(i) - 2;
207        return i;
208    }
209
210    /**
211     Scans the buffer for newline position, and tracks their location in newlinePositions.
212     */
213    private void scanBufferForNewlines() {
214        if (!isTrackNewlines())
215            return;
216
217        if (newlinePositions.size() > 0) {
218            // work out the line number that we have read up to (as we have likely scanned past this point)
219            int index = lineNumIndex(readerPos);
220            if (index == -1) index = 0; // first line
221            int linePos = newlinePositions.get(index);
222            lineNumberOffset += index; // the num lines we've read up to
223            newlinePositions.clear();
224            newlinePositions.add(linePos); // roll the last read pos to first, for cursor num after buffer
225        }
226
227        for (int i = bufPos; i < bufLength; i++) {
228            if (charBuf[i] == '\n')
229                newlinePositions.add(1 + readerPos + i);
230        }
231    }
232
233    /**
234     * Tests if all the content has been read.
235     * @return true if nothing left to read.
236     */
237    public boolean isEmpty() {
238        bufferUp();
239        return bufPos >= bufLength;
240    }
241
242    private boolean isEmptyNoBufferUp() {
243        return bufPos >= bufLength;
244    }
245
246    /**
247     * Get the char at the current position.
248     * @return char
249     */
250    public char current() {
251        bufferUp();
252        return isEmptyNoBufferUp() ? EOF : charBuf[bufPos];
253    }
254
255    char consume() {
256        bufferUp();
257        char val = isEmptyNoBufferUp() ? EOF : charBuf[bufPos];
258        bufPos++;
259        return val;
260    }
261
262    /**
263     Unconsume one character (bufPos--). MUST only be called directly after a consume(), and no chance of a bufferUp.
264     */
265    void unconsume() {
266        if (bufPos < 1)
267            throw new UncheckedIOException(new IOException("WTF: No buffer left to unconsume.")); // a bug if this fires, need to trace it.
268
269        bufPos--;
270    }
271
272    /**
273     * Moves the current position by one.
274     */
275    public void advance() {
276        bufPos++;
277    }
278
279    void mark() {
280        // make sure there is enough look ahead capacity
281        if (bufLength - bufPos < minReadAheadLen)
282            bufSplitPoint = 0;
283
284        bufferUp();
285        bufMark = bufPos;
286    }
287
288    void unmark() {
289        bufMark = -1;
290    }
291
292    void rewindToMark() {
293        if (bufMark == -1)
294            throw new UncheckedIOException(new IOException("Mark invalid"));
295
296        bufPos = bufMark;
297        unmark();
298    }
299
300    /**
301     * Returns the number of characters between the current position and the next instance of the input char
302     * @param c scan target
303     * @return offset between current position and next instance of target. -1 if not found.
304     */
305    int nextIndexOf(char c) {
306        // doesn't handle scanning for surrogates
307        bufferUp();
308        for (int i = bufPos; i < bufLength; i++) {
309            if (c == charBuf[i])
310                return i - bufPos;
311        }
312        return -1;
313    }
314
315    /**
316     * Returns the number of characters between the current position and the next instance of the input sequence
317     *
318     * @param seq scan target
319     * @return offset between current position and next instance of target. -1 if not found.
320     */
321    int nextIndexOf(CharSequence seq) {
322        bufferUp();
323        // doesn't handle scanning for surrogates
324        char startChar = seq.charAt(0);
325        for (int offset = bufPos; offset < bufLength; offset++) {
326            // scan to first instance of startchar:
327            if (startChar != charBuf[offset])
328                while(++offset < bufLength && startChar != charBuf[offset]) { /* empty */ }
329            int i = offset + 1;
330            int last = i + seq.length()-1;
331            if (offset < bufLength && last <= bufLength) {
332                for (int j = 1; i < last && seq.charAt(j) == charBuf[i]; i++, j++) { /* empty */ }
333                if (i == last) // found full sequence
334                    return offset - bufPos;
335            }
336        }
337        return -1;
338    }
339
340    /**
341     * Reads characters up to the specific char.
342     * @param c the delimiter
343     * @return the chars read
344     */
345    public String consumeTo(char c) {
346        int offset = nextIndexOf(c);
347        if (offset != -1) {
348            String consumed = cacheString(charBuf, stringCache, bufPos, offset);
349            bufPos += offset;
350            return consumed;
351        } else {
352            return consumeToEnd();
353        }
354    }
355
356    String consumeTo(String seq) {
357        int offset = nextIndexOf(seq);
358        if (offset != -1) {
359            String consumed = cacheString(charBuf, stringCache, bufPos, offset);
360            bufPos += offset;
361            return consumed;
362        } else if (bufLength - bufPos < seq.length()) {
363            // nextIndexOf() did a bufferUp(), so if the buffer is shorter than the search string, we must be at EOF
364            return consumeToEnd();
365        } else {
366            // the string we're looking for may be straddling a buffer boundary, so keep (length - 1) characters
367            // unread in case they contain the beginning of the search string
368            int endPos = bufLength - seq.length() + 1;
369            String consumed = cacheString(charBuf, stringCache, bufPos, endPos - bufPos);
370            bufPos = endPos;
371            return consumed;
372        }
373    }
374
375    /**
376     * Read characters until the first of any delimiters is found.
377     * @param chars delimiters to scan for
378     * @return characters read up to the matched delimiter.
379     */
380    public String consumeToAny(final char... chars) {
381        bufferUp();
382        int pos = bufPos;
383        final int start = pos;
384        final int remaining = bufLength;
385        final char[] val = charBuf;
386        final int charLen = chars.length;
387        int i;
388
389        OUTER: while (pos < remaining) {
390            for (i = 0; i < charLen; i++) {
391                if (val[pos] == chars[i])
392                    break OUTER;
393            }
394            pos++;
395        }
396
397        bufPos = pos;
398        return pos > start ? cacheString(charBuf, stringCache, start, pos -start) : "";
399    }
400
401    String consumeToAnySorted(final char... chars) {
402        bufferUp();
403        int pos = bufPos;
404        final int start = pos;
405        final int remaining = bufLength;
406        final char[] val = charBuf;
407
408        while (pos < remaining) {
409            if (Arrays.binarySearch(chars, val[pos]) >= 0)
410                break;
411            pos++;
412        }
413        bufPos = pos;
414        return bufPos > start ? cacheString(charBuf, stringCache, start, pos -start) : "";
415    }
416
417    String consumeData() {
418        // &, <, null
419        //bufferUp(); // no need to bufferUp, just called consume()
420        int pos = bufPos;
421        final int start = pos;
422        final int remaining = bufLength;
423        final char[] val = charBuf;
424
425        OUTER: while (pos < remaining) {
426            switch (val[pos]) {
427                case '&':
428                case '<':
429                case TokeniserState.nullChar:
430                    break OUTER;
431                default:
432                    pos++;
433            }
434        }
435        bufPos = pos;
436        return pos > start ? cacheString(charBuf, stringCache, start, pos -start) : "";
437    }
438
439    String consumeAttributeQuoted(final boolean single) {
440        // null, " or ', &
441        //bufferUp(); // no need to bufferUp, just called consume()
442        int pos = bufPos;
443        final int start = pos;
444        final int remaining = bufLength;
445        final char[] val = charBuf;
446
447        OUTER: while (pos < remaining) {
448            switch (val[pos]) {
449                case '&':
450                case TokeniserState.nullChar:
451                    break OUTER;
452                case '\'':
453                    if (single) break OUTER;
454                    break;
455                case '"':
456                    if (!single) break OUTER;
457                    break;
458            }
459            pos++;
460        }
461        bufPos = pos;
462        return pos > start ? cacheString(charBuf, stringCache, start, pos -start) : "";
463    }
464
465
466    String consumeRawData() {
467        // <, null
468        //bufferUp(); // no need to bufferUp, just called consume()
469        int pos = bufPos;
470        final int start = pos;
471        final int remaining = bufLength;
472        final char[] val = charBuf;
473
474        OUTER: while (pos < remaining) {
475            switch (val[pos]) {
476                case '<':
477                case TokeniserState.nullChar:
478                    break OUTER;
479                default:
480                    pos++;
481            }
482        }
483        bufPos = pos;
484        return pos > start ? cacheString(charBuf, stringCache, start, pos -start) : "";
485    }
486
487    String consumeTagName() {
488        // '\t', '\n', '\r', '\f', ' ', '/', '>'
489        // NOTE: out of spec, added '<' to fix common author bugs; does not stop and append on nullChar but eats
490        bufferUp();
491        int pos = bufPos;
492        final int start = pos;
493        final int remaining = bufLength;
494        final char[] val = charBuf;
495
496        OUTER: while (pos < remaining) {
497            switch (val[pos]) {
498                case '\t':
499                case '\n':
500                case '\r':
501                case '\f':
502                case ' ':
503                case '/':
504                case '>':
505                case '<':
506                    break OUTER;
507            }
508            pos++;
509        }
510
511        bufPos = pos;
512        return pos > start ? cacheString(charBuf, stringCache, start, pos -start) : "";
513    }
514
515    String consumeToEnd() {
516        bufferUp();
517        String data = cacheString(charBuf, stringCache, bufPos, bufLength - bufPos);
518        bufPos = bufLength;
519        return data;
520    }
521
522    String consumeLetterSequence() {
523        bufferUp();
524        int start = bufPos;
525        while (bufPos < bufLength) {
526            char c = charBuf[bufPos];
527            if ((c >= 'A' && c <= 'Z') || (c >= 'a' && c <= 'z') || Character.isLetter(c))
528                bufPos++;
529            else
530                break;
531        }
532
533        return cacheString(charBuf, stringCache, start, bufPos - start);
534    }
535
536    String consumeLetterThenDigitSequence() {
537        bufferUp();
538        int start = bufPos;
539        while (bufPos < bufLength) {
540            char c = charBuf[bufPos];
541            if ((c >= 'A' && c <= 'Z') || (c >= 'a' && c <= 'z') || Character.isLetter(c))
542                bufPos++;
543            else
544                break;
545        }
546        while (!isEmptyNoBufferUp()) {
547            char c = charBuf[bufPos];
548            if (c >= '0' && c <= '9')
549                bufPos++;
550            else
551                break;
552        }
553
554        return cacheString(charBuf, stringCache, start, bufPos - start);
555    }
556
557    String consumeHexSequence() {
558        bufferUp();
559        int start = bufPos;
560        while (bufPos < bufLength) {
561            char c = charBuf[bufPos];
562            if ((c >= '0' && c <= '9') || (c >= 'A' && c <= 'F') || (c >= 'a' && c <= 'f'))
563                bufPos++;
564            else
565                break;
566        }
567        return cacheString(charBuf, stringCache, start, bufPos - start);
568    }
569
570    String consumeDigitSequence() {
571        bufferUp();
572        int start = bufPos;
573        while (bufPos < bufLength) {
574            char c = charBuf[bufPos];
575            if (c >= '0' && c <= '9')
576                bufPos++;
577            else
578                break;
579        }
580        return cacheString(charBuf, stringCache, start, bufPos - start);
581    }
582
583    boolean matches(char c) {
584        return !isEmpty() && charBuf[bufPos] == c;
585
586    }
587
588    boolean matches(String seq) {
589        bufferUp();
590        int scanLength = seq.length();
591        if (scanLength > bufLength - bufPos)
592            return false;
593
594        for (int offset = 0; offset < scanLength; offset++)
595            if (seq.charAt(offset) != charBuf[bufPos +offset])
596                return false;
597        return true;
598    }
599
600    boolean matchesIgnoreCase(String seq) {
601        bufferUp();
602        int scanLength = seq.length();
603        if (scanLength > bufLength - bufPos)
604            return false;
605
606        for (int offset = 0; offset < scanLength; offset++) {
607            char upScan = Character.toUpperCase(seq.charAt(offset));
608            char upTarget = Character.toUpperCase(charBuf[bufPos + offset]);
609            if (upScan != upTarget)
610                return false;
611        }
612        return true;
613    }
614
615    boolean matchesAny(char... seq) {
616        if (isEmpty())
617            return false;
618
619        bufferUp();
620        char c = charBuf[bufPos];
621        for (char seek : seq) {
622            if (seek == c)
623                return true;
624        }
625        return false;
626    }
627
628    boolean matchesAnySorted(char[] seq) {
629        bufferUp();
630        return !isEmpty() && Arrays.binarySearch(seq, charBuf[bufPos]) >= 0;
631    }
632
633    boolean matchesLetter() {
634        if (isEmpty())
635            return false;
636        char c = charBuf[bufPos];
637        return (c >= 'A' && c <= 'Z') || (c >= 'a' && c <= 'z') || Character.isLetter(c);
638    }
639
640    /**
641     Checks if the current pos matches an ascii alpha (A-Z a-z) per https://infra.spec.whatwg.org/#ascii-alpha
642     @return if it matches or not
643     */
644    boolean matchesAsciiAlpha() {
645        if (isEmpty())
646            return false;
647        char c = charBuf[bufPos];
648        return (c >= 'A' && c <= 'Z') || (c >= 'a' && c <= 'z');
649    }
650
651    boolean matchesDigit() {
652        if (isEmpty())
653            return false;
654        char c = charBuf[bufPos];
655        return (c >= '0' && c <= '9');
656    }
657
658    boolean matchConsume(String seq) {
659        bufferUp();
660        if (matches(seq)) {
661            bufPos += seq.length();
662            return true;
663        } else {
664            return false;
665        }
666    }
667
668    boolean matchConsumeIgnoreCase(String seq) {
669        if (matchesIgnoreCase(seq)) {
670            bufPos += seq.length();
671            return true;
672        } else {
673            return false;
674        }
675    }
676
677    // we maintain a cache of the previously scanned sequence, and return that if applicable on repeated scans.
678    // that improves the situation where there is a sequence of <p<p<p<p<p<p<p...</title> and we're bashing on the <p
679    // looking for the </title>. Resets in bufferUp()
680    @Nullable private String lastIcSeq; // scan cache
681    private int lastIcIndex; // nearest found indexOf
682
683    /** Used to check presence of </title>, </style> when we're in RCData and see a <xxx. Only finds consistent case. */
684    boolean containsIgnoreCase(String seq) {
685        if (seq.equals(lastIcSeq)) {
686            if (lastIcIndex == -1) return false;
687            if (lastIcIndex >= bufPos) return true;
688        }
689        lastIcSeq = seq;
690
691        String loScan = seq.toLowerCase(Locale.ENGLISH);
692        int lo = nextIndexOf(loScan);
693        if (lo > -1) {
694            lastIcIndex = bufPos + lo; return true;
695        }
696
697        String hiScan = seq.toUpperCase(Locale.ENGLISH);
698        int hi = nextIndexOf(hiScan);
699        boolean found = hi > -1;
700        lastIcIndex = found ? bufPos + hi : -1; // we don't care about finding the nearest, just that buf contains
701        return found;
702    }
703
704    @Override
705    public String toString() {
706        if (bufLength - bufPos < 0)
707            return "";
708        return new String(charBuf, bufPos, bufLength - bufPos);
709    }
710
711    /**
712     * Caches short strings, as a flyweight pattern, to reduce GC load. Just for this doc, to prevent leaks.
713     * <p />
714     * Simplistic, and on hash collisions just falls back to creating a new string, vs a full HashMap with Entry list.
715     * That saves both having to create objects as hash keys, and running through the entry list, at the expense of
716     * some more duplicates.
717     */
718    private static String cacheString(final char[] charBuf, final String[] stringCache, final int start, final int count) {
719        // limit (no cache):
720        if (count > maxStringCacheLen)
721            return new String(charBuf, start, count);
722        if (count < 1)
723            return "";
724
725        // calculate hash:
726        int hash = 0;
727        for (int i = 0; i < count; i++) {
728            hash = 31 * hash + charBuf[start + i];
729        }
730
731        // get from cache
732        final int index = hash & stringCacheSize - 1;
733        String cached = stringCache[index];
734
735        if (cached != null && rangeEquals(charBuf, start, count, cached)) // positive hit
736            return cached;
737        else {
738            cached = new String(charBuf, start, count);
739            stringCache[index] = cached; // add or replace, assuming most recently used are most likely to recur next
740        }
741
742        return cached;
743    }
744
745    /**
746     * Check if the value of the provided range equals the string.
747     */
748    static boolean rangeEquals(final char[] charBuf, final int start, int count, final String cached) {
749        if (count == cached.length()) {
750            int i = start;
751            int j = 0;
752            while (count-- != 0) {
753                if (charBuf[i++] != cached.charAt(j++))
754                    return false;
755            }
756            return true;
757        }
758        return false;
759    }
760
761    // just used for testing
762    boolean rangeEquals(final int start, final int count, final String cached) {
763        return rangeEquals(charBuf, start, count, cached);
764    }
765}