001package org.jsoup.select; 002 003import org.jsoup.internal.StringUtil; 004import org.jsoup.helper.Validate; 005import org.jsoup.parser.TokenQueue; 006 007import java.util.ArrayList; 008import java.util.List; 009import java.util.regex.Matcher; 010import java.util.regex.Pattern; 011 012import static org.jsoup.select.StructuralEvaluator.ImmediateParentRun; 013import static org.jsoup.internal.Normalizer.normalize; 014 015/** 016 * Parses a CSS selector into an Evaluator tree. 017 */ 018public class QueryParser { 019 private final static char[] Combinators = {',', '>', '+', '~', ' '}; 020 private final static String[] AttributeEvals = new String[]{"=", "!=", "^=", "$=", "*=", "~="}; 021 022 private final TokenQueue tq; 023 private final String query; 024 private final List<Evaluator> evals = new ArrayList<>(); 025 026 /** 027 * Create a new QueryParser. 028 * @param query CSS query 029 */ 030 private QueryParser(String query) { 031 Validate.notEmpty(query); 032 query = query.trim(); 033 this.query = query; 034 this.tq = new TokenQueue(query); 035 } 036 037 /** 038 * Parse a CSS query into an Evaluator. 039 * @param query CSS query 040 * @return Evaluator 041 * @see Selector selector query syntax 042 */ 043 public static Evaluator parse(String query) { 044 try { 045 QueryParser p = new QueryParser(query); 046 return p.parse(); 047 } catch (IllegalArgumentException e) { 048 throw new Selector.SelectorParseException(e.getMessage()); 049 } 050 } 051 052 /** 053 * Parse the query 054 * @return Evaluator 055 */ 056 Evaluator parse() { 057 tq.consumeWhitespace(); 058 059 if (tq.matchesAny(Combinators)) { // if starts with a combinator, use root as elements 060 evals.add(new StructuralEvaluator.Root()); 061 combinator(tq.consume()); 062 } else { 063 evals.add(consumeEvaluator()); 064 } 065 066 while (!tq.isEmpty()) { 067 // hierarchy and extras 068 boolean seenWhite = tq.consumeWhitespace(); 069 070 if (tq.matchesAny(Combinators)) { 071 combinator(tq.consume()); 072 } else if (seenWhite) { 073 combinator(' '); 074 } else { // E.class, E#id, E[attr] etc. AND 075 evals.add(consumeEvaluator()); // take next el, #. etc off queue 076 } 077 } 078 079 if (evals.size() == 1) 080 return evals.get(0); 081 082 return new CombiningEvaluator.And(evals); 083 } 084 085 private void combinator(char combinator) { 086 tq.consumeWhitespace(); 087 String subQuery = consumeSubQuery(); // support multi > childs 088 089 Evaluator rootEval; // the new topmost evaluator 090 Evaluator currentEval; // the evaluator the new eval will be combined to. could be root, or rightmost or. 091 Evaluator newEval = parse(subQuery); // the evaluator to add into target evaluator 092 boolean replaceRightMost = false; 093 094 if (evals.size() == 1) { 095 rootEval = currentEval = evals.get(0); 096 // make sure OR (,) has precedence: 097 if (rootEval instanceof CombiningEvaluator.Or && combinator != ',') { 098 currentEval = ((CombiningEvaluator.Or) currentEval).rightMostEvaluator(); 099 assert currentEval != null; // rightMost signature can return null (if none set), but always will have one by this point 100 replaceRightMost = true; 101 } 102 } 103 else { 104 rootEval = currentEval = new CombiningEvaluator.And(evals); 105 } 106 evals.clear(); 107 108 // for most combinators: change the current eval into an AND of the current eval and the new eval 109 switch (combinator) { 110 case '>': 111 ImmediateParentRun run = currentEval instanceof ImmediateParentRun ? 112 (ImmediateParentRun) currentEval : new ImmediateParentRun(currentEval); 113 run.add(newEval); 114 currentEval = run; 115 break; 116 case ' ': 117 currentEval = new CombiningEvaluator.And(new StructuralEvaluator.Parent(currentEval), newEval); 118 break; 119 case '+': 120 currentEval = new CombiningEvaluator.And(new StructuralEvaluator.ImmediatePreviousSibling(currentEval), newEval); 121 break; 122 case '~': 123 currentEval = new CombiningEvaluator.And(new StructuralEvaluator.PreviousSibling(currentEval), newEval); 124 break; 125 case ',': 126 CombiningEvaluator.Or or; 127 if (currentEval instanceof CombiningEvaluator.Or) { 128 or = (CombiningEvaluator.Or) currentEval; 129 } else { 130 or = new CombiningEvaluator.Or(); 131 or.add(currentEval); 132 } 133 or.add(newEval); 134 currentEval = or; 135 break; 136 default: 137 throw new Selector.SelectorParseException("Unknown combinator '%s'", combinator); 138 } 139 140 if (replaceRightMost) 141 ((CombiningEvaluator.Or) rootEval).replaceRightMostEvaluator(currentEval); 142 else rootEval = currentEval; 143 evals.add(rootEval); 144 } 145 146 private String consumeSubQuery() { 147 StringBuilder sq = StringUtil.borrowBuilder(); 148 boolean seenClause = false; // eat until we hit a combinator after eating something else 149 while (!tq.isEmpty()) { 150 if (tq.matchesAny(Combinators)) { 151 if (seenClause) 152 break; 153 sq.append(tq.consume()); 154 continue; 155 } 156 seenClause = true; 157 if (tq.matches("(")) 158 sq.append("(").append(tq.chompBalanced('(', ')')).append(")"); 159 else if (tq.matches("[")) 160 sq.append("[").append(tq.chompBalanced('[', ']')).append("]"); 161 else 162 sq.append(tq.consume()); 163 } 164 return StringUtil.releaseBuilder(sq); 165 } 166 167 private Evaluator consumeEvaluator() { 168 if (tq.matchChomp("#")) 169 return byId(); 170 else if (tq.matchChomp(".")) 171 return byClass(); 172 else if (tq.matchesWord() || tq.matches("*|")) 173 return byTag(); 174 else if (tq.matches("[")) 175 return byAttribute(); 176 else if (tq.matchChomp("*")) 177 return new Evaluator.AllElements(); 178 else if (tq.matchChomp(":")) 179 return parsePseudoSelector(); 180 else // unhandled 181 throw new Selector.SelectorParseException("Could not parse query '%s': unexpected token at '%s'", query, tq.remainder()); 182 } 183 184 private Evaluator parsePseudoSelector() { 185 final String pseudo = tq.consumeCssIdentifier(); 186 switch (pseudo) { 187 case "lt": 188 return new Evaluator.IndexLessThan(consumeIndex()); 189 case "gt": 190 return new Evaluator.IndexGreaterThan(consumeIndex()); 191 case "eq": 192 return new Evaluator.IndexEquals(consumeIndex()); 193 case "has": 194 return has(); 195 case "is": 196 return is(); 197 case "contains": 198 return contains(false); 199 case "containsOwn": 200 return contains(true); 201 case "containsWholeText": 202 return containsWholeText(false); 203 case "containsWholeOwnText": 204 return containsWholeText(true); 205 case "containsData": 206 return containsData(); 207 case "matches": 208 return matches(false); 209 case "matchesOwn": 210 return matches(true); 211 case "matchesWholeText": 212 return matchesWholeText(false); 213 case "matchesWholeOwnText": 214 return matchesWholeText(true); 215 case "not": 216 return not(); 217 case "nth-child": 218 return cssNthChild(false, false); 219 case "nth-last-child": 220 return cssNthChild(true, false); 221 case "nth-of-type": 222 return cssNthChild(false, true); 223 case "nth-last-of-type": 224 return cssNthChild(true, true); 225 case "first-child": 226 return new Evaluator.IsFirstChild(); 227 case "last-child": 228 return new Evaluator.IsLastChild(); 229 case "first-of-type": 230 return new Evaluator.IsFirstOfType(); 231 case "last-of-type": 232 return new Evaluator.IsLastOfType(); 233 case "only-child": 234 return new Evaluator.IsOnlyChild(); 235 case "only-of-type": 236 return new Evaluator.IsOnlyOfType(); 237 case "empty": 238 return new Evaluator.IsEmpty(); 239 case "root": 240 return new Evaluator.IsRoot(); 241 case "matchText": 242 return new Evaluator.MatchText(); 243 default: 244 throw new Selector.SelectorParseException("Could not parse query '%s': unexpected token at '%s'", query, tq.remainder()); 245 } 246 } 247 248 private Evaluator byId() { 249 String id = tq.consumeCssIdentifier(); 250 Validate.notEmpty(id); 251 return new Evaluator.Id(id); 252 } 253 254 private Evaluator byClass() { 255 String className = tq.consumeCssIdentifier(); 256 Validate.notEmpty(className); 257 return new Evaluator.Class(className.trim()); 258 } 259 260 private Evaluator byTag() { 261 // todo - these aren't dealing perfectly with case sensitivity. For case sensitive parsers, we should also make 262 // the tag in the selector case-sensitive (and also attribute names). But for now, normalize (lower-case) for 263 // consistency - both the selector and the element tag 264 String tagName = normalize(tq.consumeElementSelector()); 265 Validate.notEmpty(tagName); 266 final Evaluator eval; 267 268 // namespaces: wildcard match equals(tagName) or ending in ":"+tagName 269 if (tagName.startsWith("*|")) { 270 String plainTag = tagName.substring(2); // strip *| 271 eval = new CombiningEvaluator.Or( 272 new Evaluator.Tag(plainTag), 273 new Evaluator.TagEndsWith(tagName.replace("*|", ":")) 274 ); 275 } else { 276 // namespaces: if element name is "abc:def", selector must be "abc|def", so flip: 277 if (tagName.contains("|")) 278 tagName = tagName.replace("|", ":"); 279 280 eval = new Evaluator.Tag(tagName); 281 } 282 return eval; 283 } 284 285 private Evaluator byAttribute() { 286 TokenQueue cq = new TokenQueue(tq.chompBalanced('[', ']')); // content queue 287 String key = cq.consumeToAny(AttributeEvals); // eq, not, start, end, contain, match, (no val) 288 Validate.notEmpty(key); 289 cq.consumeWhitespace(); 290 final Evaluator eval; 291 292 if (cq.isEmpty()) { 293 if (key.startsWith("^")) 294 eval = new Evaluator.AttributeStarting(key.substring(1)); 295 else if (key.equals("*")) // any attribute 296 eval = new Evaluator.AttributeStarting(""); 297 else 298 eval = new Evaluator.Attribute(key); 299 } else { 300 if (cq.matchChomp("=")) 301 eval = new Evaluator.AttributeWithValue(key, cq.remainder()); 302 else if (cq.matchChomp("!=")) 303 eval = new Evaluator.AttributeWithValueNot(key, cq.remainder()); 304 else if (cq.matchChomp("^=")) 305 eval = new Evaluator.AttributeWithValueStarting(key, cq.remainder()); 306 else if (cq.matchChomp("$=")) 307 eval = new Evaluator.AttributeWithValueEnding(key, cq.remainder()); 308 else if (cq.matchChomp("*=")) 309 eval = new Evaluator.AttributeWithValueContaining(key, cq.remainder()); 310 else if (cq.matchChomp("~=")) 311 eval = new Evaluator.AttributeWithValueMatching(key, Pattern.compile(cq.remainder())); 312 else 313 throw new Selector.SelectorParseException("Could not parse attribute query '%s': unexpected token at '%s'", query, cq.remainder()); 314 } 315 return eval; 316 } 317 318 //pseudo selectors :first-child, :last-child, :nth-child, ... 319 private static final Pattern NTH_AB = Pattern.compile("(([+-])?(\\d+)?)n(\\s*([+-])?\\s*\\d+)?", Pattern.CASE_INSENSITIVE); 320 private static final Pattern NTH_B = Pattern.compile("([+-])?(\\d+)"); 321 322 private Evaluator cssNthChild(boolean backwards, boolean ofType) { 323 String arg = normalize(consumeParens()); 324 Matcher mAB = NTH_AB.matcher(arg); 325 Matcher mB = NTH_B.matcher(arg); 326 final int a, b; 327 if ("odd".equals(arg)) { 328 a = 2; 329 b = 1; 330 } else if ("even".equals(arg)) { 331 a = 2; 332 b = 0; 333 } else if (mAB.matches()) { 334 a = mAB.group(3) != null ? Integer.parseInt(mAB.group(1).replaceFirst("^\\+", "")) : 1; 335 b = mAB.group(4) != null ? Integer.parseInt(mAB.group(4).replaceFirst("^\\+", "")) : 0; 336 } else if (mB.matches()) { 337 a = 0; 338 b = Integer.parseInt(mB.group().replaceFirst("^\\+", "")); 339 } else { 340 throw new Selector.SelectorParseException("Could not parse nth-index '%s': unexpected format", arg); 341 } 342 343 final Evaluator eval; 344 if (ofType) 345 if (backwards) 346 eval = new Evaluator.IsNthLastOfType(a, b); 347 else 348 eval = new Evaluator.IsNthOfType(a, b); 349 else { 350 if (backwards) 351 eval = (new Evaluator.IsNthLastChild(a, b)); 352 else 353 eval = new Evaluator.IsNthChild(a, b); 354 } 355 return eval; 356 } 357 358 private String consumeParens() { 359 return tq.chompBalanced('(', ')'); 360 } 361 362 private int consumeIndex() { 363 String index = consumeParens().trim(); 364 Validate.isTrue(StringUtil.isNumeric(index), "Index must be numeric"); 365 return Integer.parseInt(index); 366 } 367 368 // pseudo selector :has(el) 369 private Evaluator has() { 370 String subQuery = consumeParens(); 371 Validate.notEmpty(subQuery, ":has(selector) sub-select must not be empty"); 372 return new StructuralEvaluator.Has(parse(subQuery)); 373 } 374 375 // psuedo selector :is() 376 private Evaluator is() { 377 String subQuery = consumeParens(); 378 Validate.notEmpty(subQuery, ":is(selector) sub-select must not be empty"); 379 return new StructuralEvaluator.Is(parse(subQuery)); 380 } 381 382 // pseudo selector :contains(text), containsOwn(text) 383 private Evaluator contains(boolean own) { 384 String query = own ? ":containsOwn" : ":contains"; 385 String searchText = TokenQueue.unescape(consumeParens()); 386 Validate.notEmpty(searchText, query + "(text) query must not be empty"); 387 return own 388 ? new Evaluator.ContainsOwnText(searchText) 389 : new Evaluator.ContainsText(searchText); 390 } 391 392 private Evaluator containsWholeText(boolean own) { 393 String query = own ? ":containsWholeOwnText" : ":containsWholeText"; 394 String searchText = TokenQueue.unescape(consumeParens()); 395 Validate.notEmpty(searchText, query + "(text) query must not be empty"); 396 return own 397 ? new Evaluator.ContainsWholeOwnText(searchText) 398 : new Evaluator.ContainsWholeText(searchText); 399 } 400 401 // pseudo selector :containsData(data) 402 private Evaluator containsData() { 403 String searchText = TokenQueue.unescape(consumeParens()); 404 Validate.notEmpty(searchText, ":containsData(text) query must not be empty"); 405 return new Evaluator.ContainsData(searchText); 406 } 407 408 // :matches(regex), matchesOwn(regex) 409 private Evaluator matches(boolean own) { 410 String query = own ? ":matchesOwn" : ":matches"; 411 String regex = consumeParens(); // don't unescape, as regex bits will be escaped 412 Validate.notEmpty(regex, query + "(regex) query must not be empty"); 413 414 return own 415 ? new Evaluator.MatchesOwn(Pattern.compile(regex)) 416 : new Evaluator.Matches(Pattern.compile(regex)); 417 } 418 419 // :matches(regex), matchesOwn(regex) 420 private Evaluator matchesWholeText(boolean own) { 421 String query = own ? ":matchesWholeOwnText" : ":matchesWholeText"; 422 String regex = consumeParens(); // don't unescape, as regex bits will be escaped 423 Validate.notEmpty(regex, query + "(regex) query must not be empty"); 424 425 return own 426 ? new Evaluator.MatchesWholeOwnText(Pattern.compile(regex)) 427 : new Evaluator.MatchesWholeText(Pattern.compile(regex)); 428 } 429 430 // :not(selector) 431 private Evaluator not() { 432 String subQuery = consumeParens(); 433 Validate.notEmpty(subQuery, ":not(selector) subselect must not be empty"); 434 435 return new StructuralEvaluator.Not(parse(subQuery)); 436 } 437 438 @Override 439 public String toString() { 440 return query; 441 } 442}