001package org.jsoup.parser; 002 003import org.jsoup.internal.StringUtil; 004import org.jsoup.helper.Validate; 005 006/** 007 * A character queue with parsing helpers. 008 * 009 * @author Jonathan Hedley 010 */ 011public class TokenQueue { 012 private String queue; 013 private int pos = 0; 014 015 private static final char ESC = '\\'; // escape char for chomp balanced. 016 017 /** 018 Create a new TokenQueue. 019 @param data string of data to back queue. 020 */ 021 public TokenQueue(String data) { 022 Validate.notNull(data); 023 queue = data; 024 } 025 026 /** 027 * Is the queue empty? 028 * @return true if no data left in queue. 029 */ 030 public boolean isEmpty() { 031 return remainingLength() == 0; 032 } 033 034 private int remainingLength() { 035 return queue.length() - pos; 036 } 037 038 /** 039 Add a string to the start of the queue. 040 @param seq string to add. 041 */ 042 public void addFirst(String seq) { 043 // not very performant, but an edge case 044 queue = seq + queue.substring(pos); 045 pos = 0; 046 } 047 048 /** 049 * Tests if the next characters on the queue match the sequence. Case insensitive. 050 * @param seq String to check queue for. 051 * @return true if the next characters match. 052 */ 053 public boolean matches(String seq) { 054 return queue.regionMatches(true, pos, seq, 0, seq.length()); 055 } 056 057 /** 058 Tests if the next characters match any of the sequences. Case insensitive. 059 @param seq list of strings to case insensitively check for 060 @return true of any matched, false if none did 061 */ 062 public boolean matchesAny(String... seq) { 063 for (String s : seq) { 064 if (matches(s)) 065 return true; 066 } 067 return false; 068 } 069 070 public boolean matchesAny(char... seq) { 071 if (isEmpty()) 072 return false; 073 074 for (char c: seq) { 075 if (queue.charAt(pos) == c) 076 return true; 077 } 078 return false; 079 } 080 081 /** 082 * Tests if the queue matches the sequence (as with match), and if they do, removes the matched string from the 083 * queue. 084 * @param seq String to search for, and if found, remove from queue. 085 * @return true if found and removed, false if not found. 086 */ 087 public boolean matchChomp(String seq) { 088 if (matches(seq)) { 089 pos += seq.length(); 090 return true; 091 } else { 092 return false; 093 } 094 } 095 096 /** 097 Tests if queue starts with a whitespace character. 098 @return if starts with whitespace 099 */ 100 public boolean matchesWhitespace() { 101 return !isEmpty() && StringUtil.isWhitespace(queue.charAt(pos)); 102 } 103 104 /** 105 Test if the queue matches a word character (letter or digit). 106 @return if matches a word character 107 */ 108 public boolean matchesWord() { 109 return !isEmpty() && Character.isLetterOrDigit(queue.charAt(pos)); 110 } 111 112 /** 113 * Drops the next character off the queue. 114 */ 115 public void advance() { 116 if (!isEmpty()) pos++; 117 } 118 119 /** 120 * Consume one character off queue. 121 * @return first character on queue. 122 */ 123 public char consume() { 124 return queue.charAt(pos++); 125 } 126 127 /** 128 * Consumes the supplied sequence of the queue. If the queue does not start with the supplied sequence, will 129 * throw an illegal state exception -- but you should be running match() against that condition. 130 <p> 131 Case insensitive. 132 * @param seq sequence to remove from head of queue. 133 */ 134 public void consume(String seq) { 135 if (!matches(seq)) 136 throw new IllegalStateException("Queue did not match expected sequence"); 137 int len = seq.length(); 138 if (len > remainingLength()) 139 throw new IllegalStateException("Queue not long enough to consume sequence"); 140 141 pos += len; 142 } 143 144 /** 145 * Pulls a string off the queue, up to but exclusive of the match sequence, or to the queue running out. 146 * @param seq String to end on (and not include in return, but leave on queue). <b>Case sensitive.</b> 147 * @return The matched data consumed from queue. 148 */ 149 public String consumeTo(String seq) { 150 int offset = queue.indexOf(seq, pos); 151 if (offset != -1) { 152 String consumed = queue.substring(pos, offset); 153 pos += consumed.length(); 154 return consumed; 155 } else { 156 return remainder(); 157 } 158 } 159 160 public String consumeToIgnoreCase(String seq) { 161 int start = pos; 162 String first = seq.substring(0, 1); 163 boolean canScan = first.toLowerCase().equals(first.toUpperCase()); // if first is not cased, use index of 164 while (!isEmpty()) { 165 if (matches(seq)) 166 break; 167 168 if (canScan) { 169 int skip = queue.indexOf(first, pos) - pos; 170 if (skip == 0) // this char is the skip char, but not match, so force advance of pos 171 pos++; 172 else if (skip < 0) // no chance of finding, grab to end 173 pos = queue.length(); 174 else 175 pos += skip; 176 } 177 else 178 pos++; 179 } 180 181 return queue.substring(start, pos); 182 } 183 184 /** 185 Consumes to the first sequence provided, or to the end of the queue. Leaves the terminator on the queue. 186 @param seq any number of terminators to consume to. <b>Case insensitive.</b> 187 @return consumed string 188 */ 189 // todo: method name. not good that consumeTo cares for case, and consume to any doesn't. And the only use for this 190 // is a case sensitive time... 191 public String consumeToAny(String... seq) { 192 int start = pos; 193 while (!isEmpty() && !matchesAny(seq)) { 194 pos++; 195 } 196 197 return queue.substring(start, pos); 198 } 199 200 /** 201 * Pulls a string off the queue (like consumeTo), and then pulls off the matched string (but does not return it). 202 * <p> 203 * If the queue runs out of characters before finding the seq, will return as much as it can (and queue will go 204 * isEmpty() == true). 205 * @param seq String to match up to, and not include in return, and to pull off queue. <b>Case sensitive.</b> 206 * @return Data matched from queue. 207 */ 208 public String chompTo(String seq) { 209 String data = consumeTo(seq); 210 matchChomp(seq); 211 return data; 212 } 213 214 public String chompToIgnoreCase(String seq) { 215 String data = consumeToIgnoreCase(seq); // case insensitive scan 216 matchChomp(seq); 217 return data; 218 } 219 220 /** 221 * Pulls a balanced string off the queue. E.g. if queue is "(one (two) three) four", (,) will return "one (two) three", 222 * and leave " four" on the queue. Unbalanced openers and closers can be quoted (with ' or ") or escaped (with \). Those escapes will be left 223 * in the returned string, which is suitable for regexes (where we need to preserve the escape), but unsuitable for 224 * contains text strings; use unescape for that. 225 * @param open opener 226 * @param close closer 227 * @return data matched from the queue 228 */ 229 public String chompBalanced(char open, char close) { 230 int start = -1; 231 int end = -1; 232 int depth = 0; 233 char last = 0; 234 boolean inSingleQuote = false; 235 boolean inDoubleQuote = false; 236 boolean inRegexQE = false; // regex \Q .. \E escapes from Pattern.quote() 237 238 do { 239 if (isEmpty()) break; 240 char c = consume(); 241 if (last != ESC) { 242 if (c == '\'' && c != open && !inDoubleQuote) 243 inSingleQuote = !inSingleQuote; 244 else if (c == '"' && c != open && !inSingleQuote) 245 inDoubleQuote = !inDoubleQuote; 246 if (inSingleQuote || inDoubleQuote || inRegexQE){ 247 last = c; 248 continue; 249 } 250 251 if (c == open) { 252 depth++; 253 if (start == -1) 254 start = pos; 255 } 256 else if (c == close) 257 depth--; 258 } else if (c == 'Q') { 259 inRegexQE = true; 260 } else if (c == 'E') { 261 inRegexQE = false; 262 } 263 264 if (depth > 0 && last != 0) 265 end = pos; // don't include the outer match pair in the return 266 last = c; 267 } while (depth > 0); 268 final String out = (end >= 0) ? queue.substring(start, end) : ""; 269 if (depth > 0) {// ran out of queue before seeing enough ) 270 Validate.fail("Did not find balanced marker at '" + out + "'"); 271 } 272 return out; 273 } 274 275 /** 276 * Unescape a \ escaped string. 277 * @param in backslash escaped string 278 * @return unescaped string 279 */ 280 public static String unescape(String in) { 281 StringBuilder out = StringUtil.borrowBuilder(); 282 char last = 0; 283 for (char c : in.toCharArray()) { 284 if (c == ESC) { 285 if (last == ESC) { 286 out.append(c); 287 c = 0; 288 } 289 } 290 else 291 out.append(c); 292 last = c; 293 } 294 return StringUtil.releaseBuilder(out); 295 } 296 297 /* 298 Given a CSS identifier (such as a tag, ID, or class), escape any CSS special characters that would otherwise not be 299 valid in a selector. 300 */ 301 public static String escapeCssIdentifier(String in) { 302 StringBuilder out = StringUtil.borrowBuilder(); 303 TokenQueue q = new TokenQueue(in); 304 while (!q.isEmpty()) { 305 if (q.matchesCssIdentifier(ElementSelectorChars)) { 306 out.append(q.consume()); 307 } else { 308 out.append(ESC).append(q.consume()); 309 } 310 } 311 return StringUtil.releaseBuilder(out); 312 } 313 314 /** 315 * Pulls the next run of whitespace characters of the queue. 316 * @return Whether consuming whitespace or not 317 */ 318 public boolean consumeWhitespace() { 319 boolean seen = false; 320 while (matchesWhitespace()) { 321 pos++; 322 seen = true; 323 } 324 return seen; 325 } 326 327 /** 328 * Retrieves the next run of word type (letter or digit) off the queue. 329 * @return String of word characters from queue, or empty string if none. 330 */ 331 public String consumeWord() { 332 int start = pos; 333 while (matchesWord()) 334 pos++; 335 return queue.substring(start, pos); 336 } 337 338 339 /** 340 * Consume a CSS element selector (tag name, but | instead of : for namespaces (or *| for wildcard namespace), to not conflict with :pseudo selects). 341 * 342 * @return tag name 343 */ 344 public String consumeElementSelector() { 345 return consumeEscapedCssIdentifier(ElementSelectorChars); 346 } 347 private static final String[] ElementSelectorChars = {"*|", "|", "_", "-"}; 348 349 /** 350 Consume a CSS identifier (ID or class) off the queue (letter, digit, -, _) 351 http://www.w3.org/TR/CSS2/syndata.html#value-def-identifier 352 @return identifier 353 */ 354 public String consumeCssIdentifier() { 355 return consumeEscapedCssIdentifier(CssIdentifierChars); 356 } 357 private static final String[] CssIdentifierChars = {"-", "_"}; 358 359 360 private String consumeEscapedCssIdentifier(String... matches) { 361 int start = pos; 362 boolean escaped = false; 363 while (!isEmpty()) { 364 if (queue.charAt(pos) == ESC && remainingLength() >1 ) { 365 escaped = true; 366 pos+=2; // skip the escape and the escaped 367 } else if (matchesCssIdentifier(matches)) { 368 pos++; 369 } else { 370 break; 371 } 372 } 373 374 String consumed = queue.substring(start, pos); 375 return escaped ? unescape(consumed) : consumed; 376 } 377 378 private boolean matchesCssIdentifier(String... matches) { 379 return matchesWord() || matchesAny(matches); 380 } 381 382 /** 383 Consume and return whatever is left on the queue. 384 @return remained of queue. 385 */ 386 public String remainder() { 387 final String remainder = queue.substring(pos); 388 pos = queue.length(); 389 return remainder; 390 } 391 392 @Override 393 public String toString() { 394 return queue.substring(pos); 395 } 396}