001package org.jsoup.parser;
002
003import org.jsoup.internal.StringUtil;
004import org.jsoup.helper.Validate;
005
006/**
007 * A character queue with parsing helpers.
008 *
009 * @author Jonathan Hedley
010 */
011public class TokenQueue {
012    private String queue;
013    private int pos = 0;
014    
015    private static final char ESC = '\\'; // escape char for chomp balanced.
016
017    /**
018     Create a new TokenQueue.
019     @param data string of data to back queue.
020     */
021    public TokenQueue(String data) {
022        Validate.notNull(data);
023        queue = data;
024    }
025
026    /**
027     * Is the queue empty?
028     * @return true if no data left in queue.
029     */
030    public boolean isEmpty() {
031        return remainingLength() == 0;
032    }
033    
034    private int remainingLength() {
035        return queue.length() - pos;
036    }
037
038    /**
039     Add a string to the start of the queue.
040     @param seq string to add.
041     */
042    public void addFirst(String seq) {
043        // not very performant, but an edge case
044        queue = seq + queue.substring(pos);
045        pos = 0;
046    }
047
048    /**
049     * Tests if the next characters on the queue match the sequence. Case insensitive.
050     * @param seq String to check queue for.
051     * @return true if the next characters match.
052     */
053    public boolean matches(String seq) {
054        return queue.regionMatches(true, pos, seq, 0, seq.length());
055    }
056
057    /**
058     Tests if the next characters match any of the sequences. Case insensitive.
059     @param seq list of strings to case insensitively check for
060     @return true of any matched, false if none did
061     */
062    public boolean matchesAny(String... seq) {
063        for (String s : seq) {
064            if (matches(s))
065                return true;
066        }
067        return false;
068    }
069
070    public boolean matchesAny(char... seq) {
071        if (isEmpty())
072            return false;
073
074        for (char c: seq) {
075            if (queue.charAt(pos) == c)
076                return true;
077        }
078        return false;
079    }
080
081    /**
082     * Tests if the queue matches the sequence (as with match), and if they do, removes the matched string from the
083     * queue.
084     * @param seq String to search for, and if found, remove from queue.
085     * @return true if found and removed, false if not found.
086     */
087    public boolean matchChomp(String seq) {
088        if (matches(seq)) {
089            pos += seq.length();
090            return true;
091        } else {
092            return false;
093        }
094    }
095
096    /**
097     Tests if queue starts with a whitespace character.
098     @return if starts with whitespace
099     */
100    public boolean matchesWhitespace() {
101        return !isEmpty() && StringUtil.isWhitespace(queue.charAt(pos));
102    }
103
104    /**
105     Test if the queue matches a word character (letter or digit).
106     @return if matches a word character
107     */
108    public boolean matchesWord() {
109        return !isEmpty() && Character.isLetterOrDigit(queue.charAt(pos));
110    }
111
112    /**
113     * Drops the next character off the queue.
114     */
115    public void advance() {
116        if (!isEmpty()) pos++;
117    }
118
119    /**
120     * Consume one character off queue.
121     * @return first character on queue.
122     */
123    public char consume() {
124        return queue.charAt(pos++);
125    }
126
127    /**
128     * Consumes the supplied sequence of the queue. If the queue does not start with the supplied sequence, will
129     * throw an illegal state exception -- but you should be running match() against that condition.
130     <p>
131     Case insensitive.
132     * @param seq sequence to remove from head of queue.
133     */
134    public void consume(String seq) {
135        if (!matches(seq))
136            throw new IllegalStateException("Queue did not match expected sequence");
137        int len = seq.length();
138        if (len > remainingLength())
139            throw new IllegalStateException("Queue not long enough to consume sequence");
140        
141        pos += len;
142    }
143
144    /**
145     * Pulls a string off the queue, up to but exclusive of the match sequence, or to the queue running out.
146     * @param seq String to end on (and not include in return, but leave on queue). <b>Case sensitive.</b>
147     * @return The matched data consumed from queue.
148     */
149    public String consumeTo(String seq) {
150        int offset = queue.indexOf(seq, pos);
151        if (offset != -1) {
152            String consumed = queue.substring(pos, offset);
153            pos += consumed.length();
154            return consumed;
155        } else {
156            return remainder();
157        }
158    }
159    
160    public String consumeToIgnoreCase(String seq) {
161        int start = pos;
162        String first = seq.substring(0, 1);
163        boolean canScan = first.toLowerCase().equals(first.toUpperCase()); // if first is not cased, use index of
164        while (!isEmpty()) {
165            if (matches(seq))
166                break;
167            
168            if (canScan) {
169                int skip = queue.indexOf(first, pos) - pos;
170                if (skip == 0) // this char is the skip char, but not match, so force advance of pos
171                    pos++;
172                else if (skip < 0) // no chance of finding, grab to end
173                    pos = queue.length();
174                else
175                    pos += skip;
176            }
177            else
178                pos++;
179        }
180
181        return queue.substring(start, pos);
182    }
183
184    /**
185     Consumes to the first sequence provided, or to the end of the queue. Leaves the terminator on the queue.
186     @param seq any number of terminators to consume to. <b>Case insensitive.</b>
187     @return consumed string   
188     */
189    // todo: method name. not good that consumeTo cares for case, and consume to any doesn't. And the only use for this
190    // is a case sensitive time...
191    public String consumeToAny(String... seq) {
192        int start = pos;
193        while (!isEmpty() && !matchesAny(seq)) {
194            pos++;
195        }
196
197        return queue.substring(start, pos);
198    }
199
200    /**
201     * Pulls a string off the queue (like consumeTo), and then pulls off the matched string (but does not return it).
202     * <p>
203     * If the queue runs out of characters before finding the seq, will return as much as it can (and queue will go
204     * isEmpty() == true).
205     * @param seq String to match up to, and not include in return, and to pull off queue. <b>Case sensitive.</b>
206     * @return Data matched from queue.
207     */
208    public String chompTo(String seq) {
209        String data = consumeTo(seq);
210        matchChomp(seq);
211        return data;
212    }
213    
214    public String chompToIgnoreCase(String seq) {
215        String data = consumeToIgnoreCase(seq); // case insensitive scan
216        matchChomp(seq);
217        return data;
218    }
219
220    /**
221     * Pulls a balanced string off the queue. E.g. if queue is "(one (two) three) four", (,) will return "one (two) three",
222     * and leave " four" on the queue. Unbalanced openers and closers can be quoted (with ' or ") or escaped (with \). Those escapes will be left
223     * in the returned string, which is suitable for regexes (where we need to preserve the escape), but unsuitable for
224     * contains text strings; use unescape for that.
225     * @param open opener
226     * @param close closer
227     * @return data matched from the queue
228     */
229    public String chompBalanced(char open, char close) {
230        int start = -1;
231        int end = -1;
232        int depth = 0;
233        char last = 0;
234        boolean inSingleQuote = false;
235        boolean inDoubleQuote = false;
236        boolean inRegexQE = false; // regex \Q .. \E escapes from Pattern.quote()
237
238        do {
239            if (isEmpty()) break;
240            char c = consume();
241            if (last != ESC) {
242                if (c == '\'' && c != open && !inDoubleQuote)
243                    inSingleQuote = !inSingleQuote;
244                else if (c == '"' && c != open && !inSingleQuote)
245                    inDoubleQuote = !inDoubleQuote;
246                if (inSingleQuote || inDoubleQuote || inRegexQE){
247                    last = c;
248                    continue;
249                }
250
251                if (c == open) {
252                    depth++;
253                    if (start == -1)
254                        start = pos;
255                }
256                else if (c == close)
257                    depth--;
258            } else if (c == 'Q') {
259                inRegexQE = true;
260            } else if (c == 'E') {
261                inRegexQE = false;
262            }
263
264            if (depth > 0 && last != 0)
265                end = pos; // don't include the outer match pair in the return
266            last = c;
267        } while (depth > 0);
268        final String out = (end >= 0) ? queue.substring(start, end) : "";
269        if (depth > 0) {// ran out of queue before seeing enough )
270            Validate.fail("Did not find balanced marker at '" + out + "'");
271        }
272        return out;
273    }
274    
275    /**
276     * Unescape a \ escaped string.
277     * @param in backslash escaped string
278     * @return unescaped string
279     */
280    public static String unescape(String in) {
281        StringBuilder out = StringUtil.borrowBuilder();
282        char last = 0;
283        for (char c : in.toCharArray()) {
284            if (c == ESC) {
285                if (last == ESC) {
286                    out.append(c);
287                    c = 0;
288                }
289            }
290            else 
291                out.append(c);
292            last = c;
293        }
294        return StringUtil.releaseBuilder(out);
295    }
296
297    /*
298    Given a CSS identifier (such as a tag, ID, or class), escape any CSS special characters that would otherwise not be
299    valid in a selector.
300     */
301    public static String escapeCssIdentifier(String in) {
302        StringBuilder out = StringUtil.borrowBuilder();
303        TokenQueue q = new TokenQueue(in);
304        while (!q.isEmpty()) {
305            if (q.matchesCssIdentifier(ElementSelectorChars)) {
306                out.append(q.consume());
307            } else {
308                out.append(ESC).append(q.consume());
309            }
310        }
311        return StringUtil.releaseBuilder(out);
312    }
313
314    /**
315     * Pulls the next run of whitespace characters of the queue.
316     * @return Whether consuming whitespace or not
317     */
318    public boolean consumeWhitespace() {
319        boolean seen = false;
320        while (matchesWhitespace()) {
321            pos++;
322            seen = true;
323        }
324        return seen;
325    }
326
327    /**
328     * Retrieves the next run of word type (letter or digit) off the queue.
329     * @return String of word characters from queue, or empty string if none.
330     */
331    public String consumeWord() {
332        int start = pos;
333        while (matchesWord())
334            pos++;
335        return queue.substring(start, pos);
336    }
337
338    
339    /**
340     * Consume a CSS element selector (tag name, but | instead of : for namespaces (or *| for wildcard namespace), to not conflict with :pseudo selects).
341     * 
342     * @return tag name
343     */
344    public String consumeElementSelector() {
345        return consumeEscapedCssIdentifier(ElementSelectorChars);
346    }
347    private static final String[] ElementSelectorChars = {"*|", "|", "_", "-"};
348
349    /**
350     Consume a CSS identifier (ID or class) off the queue (letter, digit, -, _)
351     http://www.w3.org/TR/CSS2/syndata.html#value-def-identifier
352     @return identifier
353     */
354    public String consumeCssIdentifier() {
355        return consumeEscapedCssIdentifier(CssIdentifierChars);
356    }
357    private static final String[] CssIdentifierChars = {"-", "_"};
358
359
360    private String consumeEscapedCssIdentifier(String... matches) {
361        int start = pos;
362        boolean escaped = false;
363        while (!isEmpty()) {
364            if (queue.charAt(pos) == ESC && remainingLength() >1 ) {
365                escaped = true;
366                pos+=2; // skip the escape and the escaped
367            } else if (matchesCssIdentifier(matches)) {
368                pos++;
369            } else {
370                break;
371            }
372        }
373
374        String consumed = queue.substring(start, pos);
375        return escaped ? unescape(consumed) : consumed;
376    }
377
378    private boolean matchesCssIdentifier(String... matches) {
379        return matchesWord() || matchesAny(matches);
380    }
381
382    /**
383     Consume and return whatever is left on the queue.
384     @return remained of queue.
385     */
386    public String remainder() {
387        final String remainder = queue.substring(pos);
388        pos = queue.length();
389        return remainder;
390    }
391    
392    @Override
393    public String toString() {
394        return queue.substring(pos);
395    }
396}