001package org.jsoup.select;
002
003import org.jsoup.internal.StringUtil;
004import org.jsoup.helper.Validate;
005import org.jsoup.parser.TokenQueue;
006
007import java.util.ArrayList;
008import java.util.List;
009import java.util.regex.Matcher;
010import java.util.regex.Pattern;
011
012import static org.jsoup.select.StructuralEvaluator.ImmediateParentRun;
013import static org.jsoup.internal.Normalizer.normalize;
014
015/**
016 * Parses a CSS selector into an Evaluator tree.
017 */
018public class QueryParser {
019    private final static char[] Combinators = {',', '>', '+', '~', ' '};
020    private final static String[] AttributeEvals = new String[]{"=", "!=", "^=", "$=", "*=", "~="};
021
022    private final TokenQueue tq;
023    private final String query;
024    private final List<Evaluator> evals = new ArrayList<>();
025
026    /**
027     * Create a new QueryParser.
028     * @param query CSS query
029     */
030    private QueryParser(String query) {
031        Validate.notEmpty(query);
032        query = query.trim();
033        this.query = query;
034        this.tq = new TokenQueue(query);
035    }
036
037    /**
038     * Parse a CSS query into an Evaluator.
039     * @param query CSS query
040     * @return Evaluator
041     * @see Selector selector query syntax
042     */
043    public static Evaluator parse(String query) {
044        try {
045            QueryParser p = new QueryParser(query);
046            return p.parse();
047        } catch (IllegalArgumentException e) {
048            throw new Selector.SelectorParseException(e.getMessage());
049        }
050    }
051
052    /**
053     * Parse the query
054     * @return Evaluator
055     */
056    Evaluator parse() {
057        tq.consumeWhitespace();
058
059        if (tq.matchesAny(Combinators)) { // if starts with a combinator, use root as elements
060            evals.add(new StructuralEvaluator.Root());
061            combinator(tq.consume());
062        } else {
063            evals.add(consumeEvaluator());
064        }
065
066        while (!tq.isEmpty()) {
067            // hierarchy and extras
068            boolean seenWhite = tq.consumeWhitespace();
069
070            if (tq.matchesAny(Combinators)) {
071                combinator(tq.consume());
072            } else if (seenWhite) {
073                combinator(' ');
074            } else { // E.class, E#id, E[attr] etc. AND
075                evals.add(consumeEvaluator()); // take next el, #. etc off queue
076            }
077        }
078
079        if (evals.size() == 1)
080            return evals.get(0);
081
082        return new CombiningEvaluator.And(evals);
083    }
084
085    private void combinator(char combinator) {
086        tq.consumeWhitespace();
087        String subQuery = consumeSubQuery(); // support multi > childs
088
089        Evaluator rootEval; // the new topmost evaluator
090        Evaluator currentEval; // the evaluator the new eval will be combined to. could be root, or rightmost or.
091        Evaluator newEval = parse(subQuery); // the evaluator to add into target evaluator
092        boolean replaceRightMost = false;
093
094        if (evals.size() == 1) {
095            rootEval = currentEval = evals.get(0);
096            // make sure OR (,) has precedence:
097            if (rootEval instanceof CombiningEvaluator.Or && combinator != ',') {
098                currentEval = ((CombiningEvaluator.Or) currentEval).rightMostEvaluator();
099                assert currentEval != null; // rightMost signature can return null (if none set), but always will have one by this point
100                replaceRightMost = true;
101            }
102        }
103        else {
104            rootEval = currentEval = new CombiningEvaluator.And(evals);
105        }
106        evals.clear();
107
108        // for most combinators: change the current eval into an AND of the current eval and the new eval
109        switch (combinator) {
110            case '>':
111                ImmediateParentRun run = currentEval instanceof ImmediateParentRun ?
112                        (ImmediateParentRun) currentEval : new ImmediateParentRun(currentEval);
113                run.add(newEval);
114                currentEval = run;
115                break;
116            case ' ':
117                currentEval = new CombiningEvaluator.And(new StructuralEvaluator.Parent(currentEval), newEval);
118                break;
119            case '+':
120                currentEval = new CombiningEvaluator.And(new StructuralEvaluator.ImmediatePreviousSibling(currentEval), newEval);
121                break;
122            case '~':
123                currentEval = new CombiningEvaluator.And(new StructuralEvaluator.PreviousSibling(currentEval), newEval);
124                break;
125            case ',':
126                CombiningEvaluator.Or or;
127                if (currentEval instanceof CombiningEvaluator.Or) {
128                    or = (CombiningEvaluator.Or) currentEval;
129                } else {
130                    or = new CombiningEvaluator.Or();
131                    or.add(currentEval);
132                }
133                or.add(newEval);
134                currentEval = or;
135                break;
136            default:
137                throw new Selector.SelectorParseException("Unknown combinator '%s'", combinator);
138        }
139
140        if (replaceRightMost)
141            ((CombiningEvaluator.Or) rootEval).replaceRightMostEvaluator(currentEval);
142        else rootEval = currentEval;
143        evals.add(rootEval);
144    }
145
146    private String consumeSubQuery() {
147        StringBuilder sq = StringUtil.borrowBuilder();
148        boolean seenClause = false; // eat until we hit a combinator after eating something else
149        while (!tq.isEmpty()) {
150            if (tq.matchesAny(Combinators)) {
151                if (seenClause)
152                    break;
153                sq.append(tq.consume());
154                continue;
155            }
156            seenClause = true;
157            if (tq.matches("("))
158                sq.append("(").append(tq.chompBalanced('(', ')')).append(")");
159            else if (tq.matches("["))
160                sq.append("[").append(tq.chompBalanced('[', ']')).append("]");
161            else
162                sq.append(tq.consume());
163        }
164        return StringUtil.releaseBuilder(sq);
165    }
166
167    private Evaluator consumeEvaluator() {
168        if (tq.matchChomp("#"))
169            return byId();
170        else if (tq.matchChomp("."))
171            return byClass();
172        else if (tq.matchesWord() || tq.matches("*|"))
173            return byTag();
174        else if (tq.matches("["))
175            return byAttribute();
176        else if (tq.matchChomp("*"))
177            return new Evaluator.AllElements();
178        else if (tq.matchChomp(":"))
179            return parsePseudoSelector();
180                else // unhandled
181            throw new Selector.SelectorParseException("Could not parse query '%s': unexpected token at '%s'", query, tq.remainder());
182    }
183
184    private Evaluator parsePseudoSelector() {
185        final String pseudo = tq.consumeCssIdentifier();
186        switch (pseudo) {
187            case "lt":
188                return new Evaluator.IndexLessThan(consumeIndex());
189            case "gt":
190                return new Evaluator.IndexGreaterThan(consumeIndex());
191            case "eq":
192                return new Evaluator.IndexEquals(consumeIndex());
193            case "has":
194                return has();
195            case "is":
196                return is();
197            case "contains":
198                return contains(false);
199            case "containsOwn":
200                return contains(true);
201            case "containsWholeText":
202                return containsWholeText(false);
203            case "containsWholeOwnText":
204                return containsWholeText(true);
205            case "containsData":
206                return containsData();
207            case "matches":
208                return matches(false);
209            case "matchesOwn":
210                return matches(true);
211            case "matchesWholeText":
212                return matchesWholeText(false);
213            case "matchesWholeOwnText":
214                return matchesWholeText(true);
215            case "not":
216                return not();
217            case "nth-child":
218                return cssNthChild(false, false);
219            case "nth-last-child":
220                return cssNthChild(true, false);
221            case "nth-of-type":
222                return cssNthChild(false, true);
223            case "nth-last-of-type":
224                return cssNthChild(true, true);
225            case "first-child":
226                return new Evaluator.IsFirstChild();
227            case "last-child":
228                return new Evaluator.IsLastChild();
229            case "first-of-type":
230                return new Evaluator.IsFirstOfType();
231            case "last-of-type":
232                return new Evaluator.IsLastOfType();
233            case "only-child":
234                return new Evaluator.IsOnlyChild();
235            case "only-of-type":
236                return new Evaluator.IsOnlyOfType();
237            case "empty":
238                return new Evaluator.IsEmpty();
239            case "root":
240                return new Evaluator.IsRoot();
241            case "matchText":
242                return new Evaluator.MatchText();
243            default:
244                throw new Selector.SelectorParseException("Could not parse query '%s': unexpected token at '%s'", query, tq.remainder());
245        }
246    }
247
248    private Evaluator byId() {
249        String id = tq.consumeCssIdentifier();
250        Validate.notEmpty(id);
251        return new Evaluator.Id(id);
252    }
253
254    private Evaluator byClass() {
255        String className = tq.consumeCssIdentifier();
256        Validate.notEmpty(className);
257        return new Evaluator.Class(className.trim());
258    }
259
260    private Evaluator byTag() {
261        // todo - these aren't dealing perfectly with case sensitivity. For case sensitive parsers, we should also make
262        // the tag in the selector case-sensitive (and also attribute names). But for now, normalize (lower-case) for
263        // consistency - both the selector and the element tag
264        String tagName = normalize(tq.consumeElementSelector());
265        Validate.notEmpty(tagName);
266        final Evaluator eval;
267
268        // namespaces: wildcard match equals(tagName) or ending in ":"+tagName
269        if (tagName.startsWith("*|")) {
270            String plainTag = tagName.substring(2); // strip *|
271            eval = new CombiningEvaluator.Or(
272                new Evaluator.Tag(plainTag),
273                new Evaluator.TagEndsWith(tagName.replace("*|", ":"))
274            );
275        } else {
276            // namespaces: if element name is "abc:def", selector must be "abc|def", so flip:
277            if (tagName.contains("|"))
278                tagName = tagName.replace("|", ":");
279
280            eval = new Evaluator.Tag(tagName);
281        }
282        return eval;
283    }
284
285    private Evaluator byAttribute() {
286        TokenQueue cq = new TokenQueue(tq.chompBalanced('[', ']')); // content queue
287        String key = cq.consumeToAny(AttributeEvals); // eq, not, start, end, contain, match, (no val)
288        Validate.notEmpty(key);
289        cq.consumeWhitespace();
290        final Evaluator eval;
291
292        if (cq.isEmpty()) {
293            if (key.startsWith("^"))
294                eval = new Evaluator.AttributeStarting(key.substring(1));
295            else if (key.equals("*")) // any attribute
296                eval = new Evaluator.AttributeStarting("");
297            else
298                eval = new Evaluator.Attribute(key);
299        } else {
300            if (cq.matchChomp("="))
301                eval = new Evaluator.AttributeWithValue(key, cq.remainder());
302            else if (cq.matchChomp("!="))
303                eval = new Evaluator.AttributeWithValueNot(key, cq.remainder());
304            else if (cq.matchChomp("^="))
305                eval = new Evaluator.AttributeWithValueStarting(key, cq.remainder());
306            else if (cq.matchChomp("$="))
307                eval = new Evaluator.AttributeWithValueEnding(key, cq.remainder());
308            else if (cq.matchChomp("*="))
309                eval = new Evaluator.AttributeWithValueContaining(key, cq.remainder());
310            else if (cq.matchChomp("~="))
311                eval = new Evaluator.AttributeWithValueMatching(key, Pattern.compile(cq.remainder()));
312            else
313                throw new Selector.SelectorParseException("Could not parse attribute query '%s': unexpected token at '%s'", query, cq.remainder());
314        }
315        return eval;
316    }
317
318    //pseudo selectors :first-child, :last-child, :nth-child, ...
319    private static final Pattern NTH_AB = Pattern.compile("(([+-])?(\\d+)?)n(\\s*([+-])?\\s*\\d+)?", Pattern.CASE_INSENSITIVE);
320    private static final Pattern NTH_B  = Pattern.compile("([+-])?(\\d+)");
321
322        private Evaluator cssNthChild(boolean backwards, boolean ofType) {
323                String arg = normalize(consumeParens());
324                Matcher mAB = NTH_AB.matcher(arg);
325                Matcher mB = NTH_B.matcher(arg);
326                final int a, b;
327                if ("odd".equals(arg)) {
328                        a = 2;
329                        b = 1;
330                } else if ("even".equals(arg)) {
331                        a = 2;
332                        b = 0;
333                } else if (mAB.matches()) {
334                        a = mAB.group(3) != null ? Integer.parseInt(mAB.group(1).replaceFirst("^\\+", "")) : 1;
335                        b = mAB.group(4) != null ? Integer.parseInt(mAB.group(4).replaceFirst("^\\+", "")) : 0;
336                } else if (mB.matches()) {
337                        a = 0;
338                        b = Integer.parseInt(mB.group().replaceFirst("^\\+", ""));
339                } else {
340                        throw new Selector.SelectorParseException("Could not parse nth-index '%s': unexpected format", arg);
341                }
342
343        final Evaluator eval;
344                if (ofType)
345                        if (backwards)
346                                eval = new Evaluator.IsNthLastOfType(a, b);
347                        else
348                                eval = new Evaluator.IsNthOfType(a, b);
349                else {
350                        if (backwards)
351                                eval = (new Evaluator.IsNthLastChild(a, b));
352                        else
353                                eval = new Evaluator.IsNthChild(a, b);
354                }
355        return eval;
356        }
357
358    private String consumeParens() {
359        return tq.chompBalanced('(', ')');
360    }
361
362    private int consumeIndex() {
363        String index = consumeParens().trim();
364        Validate.isTrue(StringUtil.isNumeric(index), "Index must be numeric");
365        return Integer.parseInt(index);
366    }
367
368    // pseudo selector :has(el)
369    private Evaluator has() {
370        String subQuery = consumeParens();
371        Validate.notEmpty(subQuery, ":has(selector) sub-select must not be empty");
372        return new StructuralEvaluator.Has(parse(subQuery));
373    }
374
375    // psuedo selector :is()
376    private Evaluator is() {
377        String subQuery = consumeParens();
378        Validate.notEmpty(subQuery, ":is(selector) sub-select must not be empty");
379        return new StructuralEvaluator.Is(parse(subQuery));
380    }
381
382    // pseudo selector :contains(text), containsOwn(text)
383    private Evaluator contains(boolean own) {
384        String query = own ? ":containsOwn" : ":contains";
385        String searchText = TokenQueue.unescape(consumeParens());
386        Validate.notEmpty(searchText, query + "(text) query must not be empty");
387        return own
388            ? new Evaluator.ContainsOwnText(searchText)
389            : new Evaluator.ContainsText(searchText);
390    }
391
392    private Evaluator containsWholeText(boolean own) {
393        String query = own ? ":containsWholeOwnText" : ":containsWholeText";
394        String searchText = TokenQueue.unescape(consumeParens());
395        Validate.notEmpty(searchText, query + "(text) query must not be empty");
396        return own
397            ? new Evaluator.ContainsWholeOwnText(searchText)
398            : new Evaluator.ContainsWholeText(searchText);
399    }
400
401    // pseudo selector :containsData(data)
402    private Evaluator containsData() {
403        String searchText = TokenQueue.unescape(consumeParens());
404        Validate.notEmpty(searchText, ":containsData(text) query must not be empty");
405        return new Evaluator.ContainsData(searchText);
406    }
407
408    // :matches(regex), matchesOwn(regex)
409    private Evaluator matches(boolean own) {
410        String query = own ? ":matchesOwn" : ":matches";
411        String regex = consumeParens(); // don't unescape, as regex bits will be escaped
412        Validate.notEmpty(regex, query + "(regex) query must not be empty");
413
414        return own
415            ? new Evaluator.MatchesOwn(Pattern.compile(regex))
416            : new Evaluator.Matches(Pattern.compile(regex));
417    }
418
419    // :matches(regex), matchesOwn(regex)
420    private Evaluator matchesWholeText(boolean own) {
421        String query = own ? ":matchesWholeOwnText" : ":matchesWholeText";
422        String regex = consumeParens(); // don't unescape, as regex bits will be escaped
423        Validate.notEmpty(regex, query + "(regex) query must not be empty");
424
425        return own
426            ? new Evaluator.MatchesWholeOwnText(Pattern.compile(regex))
427            : new Evaluator.MatchesWholeText(Pattern.compile(regex));
428    }
429
430    // :not(selector)
431    private Evaluator not() {
432        String subQuery = consumeParens();
433        Validate.notEmpty(subQuery, ":not(selector) subselect must not be empty");
434
435        return new StructuralEvaluator.Not(parse(subQuery));
436    }
437
438    @Override
439    public String toString() {
440        return query;
441    }
442}