001package org.jsoup.parser; 002 003import org.jsoup.helper.Validate; 004import org.jsoup.internal.Normalizer; 005import org.jsoup.internal.StringUtil; 006import org.jsoup.nodes.Attributes; 007import org.jsoup.nodes.CDataNode; 008import org.jsoup.nodes.Comment; 009import org.jsoup.nodes.DataNode; 010import org.jsoup.nodes.Document; 011import org.jsoup.nodes.Element; 012import org.jsoup.nodes.FormElement; 013import org.jsoup.nodes.Node; 014import org.jsoup.nodes.TextNode; 015import org.jspecify.annotations.Nullable; 016 017import java.io.Reader; 018import java.io.StringReader; 019import java.util.ArrayList; 020import java.util.List; 021 022import static org.jsoup.internal.StringUtil.inSorted; 023import static org.jsoup.parser.HtmlTreeBuilderState.Constants.InTableFoster; 024import static org.jsoup.parser.HtmlTreeBuilderState.ForeignContent; 025import static org.jsoup.parser.Parser.NamespaceHtml; 026 027/** 028 * HTML Tree Builder; creates a DOM from Tokens. 029 */ 030public class HtmlTreeBuilder extends TreeBuilder { 031 // tag searches. must be sorted, used in inSorted. HtmlTreeBuilderTest validates they're sorted. 032 static final String[] TagsSearchInScope = new String[]{"applet", "caption", "html", "marquee", "object", "table", "td", "th"}; 033 static final String[] TagSearchList = new String[]{"ol", "ul"}; 034 static final String[] TagSearchButton = new String[]{"button"}; 035 static final String[] TagSearchTableScope = new String[]{"html", "table"}; 036 static final String[] TagSearchSelectScope = new String[]{"optgroup", "option"}; 037 static final String[] TagSearchEndTags = new String[]{"dd", "dt", "li", "optgroup", "option", "p", "rb", "rp", "rt", "rtc"}; 038 static final String[] TagThoroughSearchEndTags = new String[]{"caption", "colgroup", "dd", "dt", "li", "optgroup", "option", "p", "rb", "rp", "rt", "rtc", "tbody", "td", "tfoot", "th", "thead", "tr"}; 039 static final String[] TagSearchSpecial = new String[]{"address", "applet", "area", "article", "aside", "base", "basefont", "bgsound", 040 "blockquote", "body", "br", "button", "caption", "center", "col", "colgroup", "command", "dd", 041 "details", "dir", "div", "dl", "dt", "embed", "fieldset", "figcaption", "figure", "footer", "form", 042 "frame", "frameset", "h1", "h2", "h3", "h4", "h5", "h6", "head", "header", "hgroup", "hr", "html", 043 "iframe", "img", "input", "isindex", "li", "link", "listing", "marquee", "menu", "meta", "nav", 044 "noembed", "noframes", "noscript", "object", "ol", "p", "param", "plaintext", "pre", "script", 045 "section", "select", "style", "summary", "table", "tbody", "td", "textarea", "tfoot", "th", "thead", 046 "title", "tr", "ul", "wbr", "xmp"}; 047 static final String[] TagMathMlTextIntegration = new String[]{"mi", "mn", "mo", "ms", "mtext"}; 048 static final String[] TagSvgHtmlIntegration = new String[]{"desc", "foreignObject", "title"}; 049 050 public static final int MaxScopeSearchDepth = 100; // prevents the parser bogging down in exceptionally broken pages 051 052 private HtmlTreeBuilderState state; // the current state 053 private HtmlTreeBuilderState originalState; // original / marked state 054 055 private boolean baseUriSetFromDoc; 056 private @Nullable Element headElement; // the current head element 057 private @Nullable FormElement formElement; // the current form element 058 private @Nullable Element contextElement; // fragment parse context -- could be null even if fragment parsing 059 private ArrayList<Element> formattingElements; // active (open) formatting elements 060 private ArrayList<HtmlTreeBuilderState> tmplInsertMode; // stack of Template Insertion modes 061 private List<Token.Character> pendingTableCharacters; // chars in table to be shifted out 062 private Token.EndTag emptyEnd; // reused empty end tag 063 064 private boolean framesetOk; // if ok to go into frameset 065 private boolean fosterInserts; // if next inserts should be fostered 066 private boolean fragmentParsing; // if parsing a fragment of html 067 068 @Override ParseSettings defaultSettings() { 069 return ParseSettings.htmlDefault; 070 } 071 072 @Override 073 HtmlTreeBuilder newInstance() { 074 return new HtmlTreeBuilder(); 075 } 076 077 @Override 078 protected void initialiseParse(Reader input, String baseUri, Parser parser) { 079 super.initialiseParse(input, baseUri, parser); 080 081 // this is a bit mucky. todo - probably just create new parser objects to ensure all reset. 082 state = HtmlTreeBuilderState.Initial; 083 originalState = null; 084 baseUriSetFromDoc = false; 085 headElement = null; 086 formElement = null; 087 contextElement = null; 088 formattingElements = new ArrayList<>(); 089 tmplInsertMode = new ArrayList<>(); 090 pendingTableCharacters = new ArrayList<>(); 091 emptyEnd = new Token.EndTag(this); 092 framesetOk = true; 093 fosterInserts = false; 094 fragmentParsing = false; 095 } 096 097 @Override List<Node> parseFragment(String inputFragment, @Nullable Element context, String baseUri, Parser parser) { 098 // context may be null 099 state = HtmlTreeBuilderState.Initial; 100 initialiseParse(new StringReader(inputFragment), baseUri, parser); 101 contextElement = context; 102 fragmentParsing = true; 103 Element root = null; 104 105 if (context != null) { 106 if (context.ownerDocument() != null) // quirks setup: 107 doc.quirksMode(context.ownerDocument().quirksMode()); 108 109 // initialise the tokeniser state: 110 String contextTag = context.normalName(); 111 switch (contextTag) { 112 case "title": 113 case "textarea": 114 tokeniser.transition(TokeniserState.Rcdata); 115 break; 116 case "iframe": 117 case "noembed": 118 case "noframes": 119 case "style": 120 case "xmp": 121 tokeniser.transition(TokeniserState.Rawtext); 122 break; 123 case "script": 124 tokeniser.transition(TokeniserState.ScriptData); 125 break; 126 case "plaintext": 127 tokeniser.transition(TokeniserState.PLAINTEXT); 128 break; 129 case "template": 130 tokeniser.transition(TokeniserState.Data); 131 pushTemplateMode(HtmlTreeBuilderState.InTemplate); 132 break; 133 default: 134 tokeniser.transition(TokeniserState.Data); 135 } 136 root = new Element(tagFor(contextTag, settings), baseUri); 137 doc.appendChild(root); 138 push(root); 139 resetInsertionMode(); 140 141 // setup form element to nearest form on context (up ancestor chain). ensures form controls are associated 142 // with form correctly 143 Element formSearch = context; 144 while (formSearch != null) { 145 if (formSearch instanceof FormElement) { 146 formElement = (FormElement) formSearch; 147 break; 148 } 149 formSearch = formSearch.parent(); 150 } 151 } 152 153 runParser(); 154 if (context != null) { 155 // depending on context and the input html, content may have been added outside of the root el 156 // e.g. context=p, input=div, the div will have been pushed out. 157 List<Node> nodes = root.siblingNodes(); 158 if (!nodes.isEmpty()) 159 root.insertChildren(-1, nodes); 160 return root.childNodes(); 161 } 162 else 163 return doc.childNodes(); 164 } 165 166 @Override 167 protected boolean process(Token token) { 168 HtmlTreeBuilderState dispatch = useCurrentOrForeignInsert(token) ? this.state : ForeignContent; 169 return dispatch.process(token, this); 170 } 171 172 boolean useCurrentOrForeignInsert(Token token) { 173 // https://html.spec.whatwg.org/multipage/parsing.html#tree-construction 174 // If the stack of open elements is empty 175 if (stack.isEmpty()) 176 return true; 177 final Element el = currentElement(); 178 final String ns = el.tag().namespace(); 179 180 // If the adjusted current node is an element in the HTML namespace 181 if (NamespaceHtml.equals(ns)) 182 return true; 183 184 // If the adjusted current node is a MathML text integration point and the token is a start tag whose tag name is neither "mglyph" nor "malignmark" 185 // If the adjusted current node is a MathML text integration point and the token is a character token 186 if (isMathmlTextIntegration(el)) { 187 if (token.isStartTag() 188 && !"mglyph".equals(token.asStartTag().normalName) 189 && !"malignmark".equals(token.asStartTag().normalName)) 190 return true; 191 if (token.isCharacter()) 192 return true; 193 } 194 // If the adjusted current node is a MathML annotation-xml element and the token is a start tag whose tag name is "svg" 195 if (Parser.NamespaceMathml.equals(ns) 196 && el.nameIs("annotation-xml") 197 && token.isStartTag() 198 && "svg".equals(token.asStartTag().normalName)) 199 return true; 200 201 // If the adjusted current node is an HTML integration point and the token is a start tag 202 // If the adjusted current node is an HTML integration point and the token is a character token 203 if (isHtmlIntegration(el) 204 && (token.isStartTag() || token.isCharacter())) 205 return true; 206 207 // If the token is an end-of-file token 208 return token.isEOF(); 209 } 210 211 static boolean isMathmlTextIntegration(Element el) { 212 /* 213 A node is a MathML text integration point if it is one of the following elements: 214 A MathML mi element 215 A MathML mo element 216 A MathML mn element 217 A MathML ms element 218 A MathML mtext element 219 */ 220 return (Parser.NamespaceMathml.equals(el.tag().namespace()) 221 && StringUtil.inSorted(el.normalName(), TagMathMlTextIntegration)); 222 } 223 224 static boolean isHtmlIntegration(Element el) { 225 /* 226 A node is an HTML integration point if it is one of the following elements: 227 A MathML annotation-xml element whose start tag token had an attribute with the name "encoding" whose value was an ASCII case-insensitive match for the string "text/html" 228 A MathML annotation-xml element whose start tag token had an attribute with the name "encoding" whose value was an ASCII case-insensitive match for the string "application/xhtml+xml" 229 An SVG foreignObject element 230 An SVG desc element 231 An SVG title element 232 */ 233 if (Parser.NamespaceMathml.equals(el.tag().namespace()) 234 && el.nameIs("annotation-xml")) { 235 String encoding = Normalizer.normalize(el.attr("encoding")); 236 if (encoding.equals("text/html") || encoding.equals("application/xhtml+xml")) 237 return true; 238 } 239 if (Parser.NamespaceSvg.equals(el.tag().namespace()) 240 && StringUtil.in(el.tagName(), TagSvgHtmlIntegration)) // note using .tagName for case-sensitive hit here of foreignObject 241 return true; 242 243 return false; 244 } 245 246 boolean process(Token token, HtmlTreeBuilderState state) { 247 return state.process(token, this); 248 } 249 250 void transition(HtmlTreeBuilderState state) { 251 this.state = state; 252 } 253 254 HtmlTreeBuilderState state() { 255 return state; 256 } 257 258 void markInsertionMode() { 259 originalState = state; 260 } 261 262 HtmlTreeBuilderState originalState() { 263 return originalState; 264 } 265 266 void framesetOk(boolean framesetOk) { 267 this.framesetOk = framesetOk; 268 } 269 270 boolean framesetOk() { 271 return framesetOk; 272 } 273 274 Document getDocument() { 275 return doc; 276 } 277 278 String getBaseUri() { 279 return baseUri; 280 } 281 282 void maybeSetBaseUri(Element base) { 283 if (baseUriSetFromDoc) // only listen to the first <base href> in parse 284 return; 285 286 String href = base.absUrl("href"); 287 if (href.length() != 0) { // ignore <base target> etc 288 baseUri = href; 289 baseUriSetFromDoc = true; 290 doc.setBaseUri(href); // set on the doc so doc.createElement(Tag) will get updated base, and to update all descendants 291 } 292 } 293 294 boolean isFragmentParsing() { 295 return fragmentParsing; 296 } 297 298 void error(HtmlTreeBuilderState state) { 299 if (parser.getErrors().canAddError()) 300 parser.getErrors().add(new ParseError(reader, "Unexpected %s token [%s] when in state [%s]", 301 currentToken.tokenType(), currentToken, state)); 302 } 303 304 Element createElementFor(Token.StartTag startTag, String namespace, boolean forcePreserveCase) { 305 // dedupe and normalize the attributes: 306 Attributes attributes = startTag.attributes; 307 if (!forcePreserveCase) 308 attributes = settings.normalizeAttributes(attributes); 309 if (attributes != null && !attributes.isEmpty()) { 310 int dupes = attributes.deduplicate(settings); 311 if (dupes > 0) { 312 error("Dropped duplicate attribute(s) in tag [%s]", startTag.normalName); 313 } 314 } 315 316 Tag tag = tagFor(startTag.tagName, namespace, 317 forcePreserveCase ? ParseSettings.preserveCase : settings); 318 319 return (tag.normalName().equals("form")) ? 320 new FormElement(tag, null, attributes) : 321 new Element(tag, null, attributes); 322 } 323 324 /** Inserts an HTML element for the given tag) */ 325 Element insertElementFor(final Token.StartTag startTag) { 326 Element el = createElementFor(startTag, NamespaceHtml, false); 327 doInsertElement(el, startTag); 328 329 // handle self-closing tags. when the spec expects an empty tag, will directly hit insertEmpty, so won't generate this fake end tag. 330 if (startTag.isSelfClosing()) { 331 Tag tag = el.tag(); 332 if (tag.isKnownTag()) { 333 if (!tag.isEmpty()) 334 tokeniser.error("Tag [%s] cannot be self closing; not a void tag", tag.normalName()); 335 // else: ok 336 } 337 else { // unknown tag: remember this is self-closing, for output 338 tag.setSelfClosing(); 339 } 340 341 // effectively a pop, but fiddles with the state. handles empty style, title etc which would otherwise leave us in data state 342 tokeniser.transition(TokeniserState.Data); // handles <script />, otherwise needs breakout steps from script data 343 tokeniser.emit(emptyEnd.reset().name(el.tagName())); // ensure we get out of whatever state we are in. emitted for yielded processing 344 } 345 346 return el; 347 } 348 349 /** 350 Inserts a foreign element. Preserves the case of the tag name and of the attributes. 351 */ 352 Element insertForeignElementFor(final Token.StartTag startTag, String namespace) { 353 Element el = createElementFor(startTag, namespace, true); 354 doInsertElement(el, startTag); 355 356 if (startTag.isSelfClosing()) { 357 el.tag().setSelfClosing(); // remember this is self-closing for output 358 pop(); 359 } 360 361 return el; 362 } 363 364 Element insertEmptyElementFor(Token.StartTag startTag) { 365 Element el = createElementFor(startTag, NamespaceHtml, false); 366 doInsertElement(el, startTag); 367 pop(); 368 return el; 369 } 370 371 FormElement insertFormElement(Token.StartTag startTag, boolean onStack, boolean checkTemplateStack) { 372 FormElement el = (FormElement) createElementFor(startTag, NamespaceHtml, false); 373 374 if (checkTemplateStack) { 375 if(!onStack("template")) 376 setFormElement(el); 377 } else 378 setFormElement(el); 379 380 doInsertElement(el, startTag); 381 if (!onStack) pop(); 382 return el; 383 } 384 385 /** Inserts the Element onto the stack. All element inserts must run through this method. Performs any general 386 tests on the Element before insertion. 387 * @param el the Element to insert and make the current element 388 * @param token the token this element was parsed from. If null, uses a zero-width current token as intrinsic insert 389 */ 390 private void doInsertElement(Element el, @Nullable Token token) { 391 if (el.tag().isFormListed() && formElement != null) 392 formElement.addElement(el); // connect form controls to their form element 393 394 // in HTML, the xmlns attribute if set must match what the parser set the tag's namespace to 395 if (el.hasAttr("xmlns") && !el.attr("xmlns").equals(el.tag().namespace())) 396 error("Invalid xmlns attribute [%s] on tag [%s]", el.attr("xmlns"), el.tagName()); 397 398 if (isFosterInserts() && StringUtil.inSorted(currentElement().normalName(), InTableFoster)) 399 insertInFosterParent(el); 400 else 401 currentElement().appendChild(el); 402 403 push(el); 404 } 405 406 void insertCommentNode(Token.Comment token) { 407 Comment node = new Comment(token.getData()); 408 currentElement().appendChild(node); 409 onNodeInserted(node); 410 } 411 412 /** Inserts the provided character token into the current element. */ 413 void insertCharacterNode(Token.Character characterToken) { 414 Element el = currentElement(); // will be doc if no current element; allows for whitespace to be inserted into the doc root object (not on the stack) 415 insertCharacterToElement(characterToken, el); 416 } 417 418 /** Inserts the provided character token into the provided element. */ 419 void insertCharacterToElement(Token.Character characterToken, Element el) { 420 final Node node; 421 final String tagName = el.normalName(); 422 final String data = characterToken.getData(); 423 424 if (characterToken.isCData()) 425 node = new CDataNode(data); 426 else if (isContentForTagData(tagName)) 427 node = new DataNode(data); 428 else 429 node = new TextNode(data); 430 el.appendChild(node); // doesn't use insertNode, because we don't foster these; and will always have a stack. 431 onNodeInserted(node); 432 } 433 434 ArrayList<Element> getStack() { 435 return stack; 436 } 437 438 boolean onStack(Element el) { 439 return onStack(stack, el); 440 } 441 442 /** Checks if there is an HTML element with the given name on the stack. */ 443 boolean onStack(String elName) { 444 return getFromStack(elName) != null; 445 } 446 447 private static final int maxQueueDepth = 256; // an arbitrary tension point between real HTML and crafted pain 448 private static boolean onStack(ArrayList<Element> queue, Element element) { 449 final int bottom = queue.size() - 1; 450 final int upper = bottom >= maxQueueDepth ? bottom - maxQueueDepth : 0; 451 for (int pos = bottom; pos >= upper; pos--) { 452 Element next = queue.get(pos); 453 if (next == element) { 454 return true; 455 } 456 } 457 return false; 458 } 459 460 /** Gets the nearest (lowest) HTML element with the given name from the stack. */ 461 @Nullable 462 Element getFromStack(String elName) { 463 final int bottom = stack.size() - 1; 464 final int upper = bottom >= maxQueueDepth ? bottom - maxQueueDepth : 0; 465 for (int pos = bottom; pos >= upper; pos--) { 466 Element next = stack.get(pos); 467 if (next.elementIs(elName, NamespaceHtml)) { 468 return next; 469 } 470 } 471 return null; 472 } 473 474 boolean removeFromStack(Element el) { 475 for (int pos = stack.size() -1; pos >= 0; pos--) { 476 Element next = stack.get(pos); 477 if (next == el) { 478 stack.remove(pos); 479 onNodeClosed(el); 480 return true; 481 } 482 } 483 return false; 484 } 485 486 /** Pops the stack until the given HTML element is removed. */ 487 @Nullable 488 Element popStackToClose(String elName) { 489 for (int pos = stack.size() -1; pos >= 0; pos--) { 490 Element el = pop(); 491 if (el.elementIs(elName, NamespaceHtml)) { 492 return el; 493 } 494 } 495 return null; 496 } 497 498 /** Pops the stack until an element with the supplied name is removed, irrespective of namespace. */ 499 @Nullable 500 Element popStackToCloseAnyNamespace(String elName) { 501 for (int pos = stack.size() -1; pos >= 0; pos--) { 502 Element el = pop(); 503 if (el.nameIs(elName)) { 504 return el; 505 } 506 } 507 return null; 508 } 509 510 /** Pops the stack until one of the given HTML elements is removed. */ 511 void popStackToClose(String... elNames) { // elnames is sorted, comes from Constants 512 for (int pos = stack.size() -1; pos >= 0; pos--) { 513 Element el = pop(); 514 if (inSorted(el.normalName(), elNames) && NamespaceHtml.equals(el.tag().namespace())) { 515 break; 516 } 517 } 518 } 519 520 void clearStackToTableContext() { 521 clearStackToContext("table", "template"); 522 } 523 524 void clearStackToTableBodyContext() { 525 clearStackToContext("tbody", "tfoot", "thead", "template"); 526 } 527 528 void clearStackToTableRowContext() { 529 clearStackToContext("tr", "template"); 530 } 531 532 /** Removes elements from the stack until one of the supplied HTML elements is removed. */ 533 private void clearStackToContext(String... nodeNames) { 534 for (int pos = stack.size() -1; pos >= 0; pos--) { 535 Element next = stack.get(pos); 536 if (NamespaceHtml.equals(next.tag().namespace()) && 537 (StringUtil.in(next.normalName(), nodeNames) || next.nameIs("html"))) 538 break; 539 else 540 pop(); 541 } 542 } 543 544 @Nullable Element aboveOnStack(Element el) { 545 assert onStack(el); 546 for (int pos = stack.size() -1; pos >= 0; pos--) { 547 Element next = stack.get(pos); 548 if (next == el) { 549 return stack.get(pos-1); 550 } 551 } 552 return null; 553 } 554 555 void insertOnStackAfter(Element after, Element in) { 556 int i = stack.lastIndexOf(after); 557 Validate.isTrue(i != -1); 558 stack.add(i+1, in); 559 } 560 561 void replaceOnStack(Element out, Element in) { 562 replaceInQueue(stack, out, in); 563 } 564 565 private static void replaceInQueue(ArrayList<Element> queue, Element out, Element in) { 566 int i = queue.lastIndexOf(out); 567 Validate.isTrue(i != -1); 568 queue.set(i, in); 569 } 570 571 /** 572 * Reset the insertion mode, by searching up the stack for an appropriate insertion mode. The stack search depth 573 * is limited to {@link #maxQueueDepth}. 574 * @return true if the insertion mode was actually changed. 575 */ 576 boolean resetInsertionMode() { 577 // https://html.spec.whatwg.org/multipage/parsing.html#the-insertion-mode 578 boolean last = false; 579 final int bottom = stack.size() - 1; 580 final int upper = bottom >= maxQueueDepth ? bottom - maxQueueDepth : 0; 581 final HtmlTreeBuilderState origState = this.state; 582 583 if (stack.size() == 0) { // nothing left of stack, just get to body 584 transition(HtmlTreeBuilderState.InBody); 585 } 586 587 LOOP: for (int pos = bottom; pos >= upper; pos--) { 588 Element node = stack.get(pos); 589 if (pos == upper) { 590 last = true; 591 if (fragmentParsing) 592 node = contextElement; 593 } 594 String name = node != null ? node.normalName() : ""; 595 if (!NamespaceHtml.equals(node.tag().namespace())) 596 continue; // only looking for HTML elements here 597 598 switch (name) { 599 case "select": 600 transition(HtmlTreeBuilderState.InSelect); 601 // todo - should loop up (with some limit) and check for table or template hits 602 break LOOP; 603 case "td": 604 case "th": 605 if (!last) { 606 transition(HtmlTreeBuilderState.InCell); 607 break LOOP; 608 } 609 break; 610 case "tr": 611 transition(HtmlTreeBuilderState.InRow); 612 break LOOP; 613 case "tbody": 614 case "thead": 615 case "tfoot": 616 transition(HtmlTreeBuilderState.InTableBody); 617 break LOOP; 618 case "caption": 619 transition(HtmlTreeBuilderState.InCaption); 620 break LOOP; 621 case "colgroup": 622 transition(HtmlTreeBuilderState.InColumnGroup); 623 break LOOP; 624 case "table": 625 transition(HtmlTreeBuilderState.InTable); 626 break LOOP; 627 case "template": 628 HtmlTreeBuilderState tmplState = currentTemplateMode(); 629 Validate.notNull(tmplState, "Bug: no template insertion mode on stack!"); 630 transition(tmplState); 631 break LOOP; 632 case "head": 633 if (!last) { 634 transition(HtmlTreeBuilderState.InHead); 635 break LOOP; 636 } 637 break; 638 case "body": 639 transition(HtmlTreeBuilderState.InBody); 640 break LOOP; 641 case "frameset": 642 transition(HtmlTreeBuilderState.InFrameset); 643 break LOOP; 644 case "html": 645 transition(headElement == null ? HtmlTreeBuilderState.BeforeHead : HtmlTreeBuilderState.AfterHead); 646 break LOOP; 647 } 648 if (last) { 649 transition(HtmlTreeBuilderState.InBody); 650 break; 651 } 652 } 653 return state != origState; 654 } 655 656 /** Places the body back onto the stack and moves to InBody, for cases in AfterBody / AfterAfterBody when more content comes */ 657 void resetBody() { 658 if (!onStack("body")) { 659 stack.add(doc.body()); // not onNodeInserted, as already seen 660 } 661 transition(HtmlTreeBuilderState.InBody); 662 } 663 664 // todo: tidy up in specific scope methods 665 private final String[] specificScopeTarget = {null}; 666 667 private boolean inSpecificScope(String targetName, String[] baseTypes, String[] extraTypes) { 668 specificScopeTarget[0] = targetName; 669 return inSpecificScope(specificScopeTarget, baseTypes, extraTypes); 670 } 671 672 private boolean inSpecificScope(String[] targetNames, String[] baseTypes, @Nullable String[] extraTypes) { 673 // https://html.spec.whatwg.org/multipage/parsing.html#has-an-element-in-the-specific-scope 674 final int bottom = stack.size() -1; 675 final int top = bottom > MaxScopeSearchDepth ? bottom - MaxScopeSearchDepth : 0; 676 // don't walk too far up the tree 677 678 for (int pos = bottom; pos >= top; pos--) { 679 Element el = stack.get(pos); 680 if (!el.tag().namespace().equals(NamespaceHtml)) continue; 681 682 final String elName = el.normalName(); 683 if (inSorted(elName, targetNames)) 684 return true; 685 if (inSorted(elName, baseTypes)) 686 return false; 687 if (extraTypes != null && inSorted(elName, extraTypes)) 688 return false; 689 } 690 //Validate.fail("Should not be reachable"); // would end up false because hitting 'html' at root (basetypes) 691 return false; 692 } 693 694 boolean inScope(String[] targetNames) { 695 return inSpecificScope(targetNames, TagsSearchInScope, null); 696 } 697 698 boolean inScope(String targetName) { 699 return inScope(targetName, null); 700 } 701 702 boolean inScope(String targetName, String[] extras) { 703 return inSpecificScope(targetName, TagsSearchInScope, extras); 704 // todo: in mathml namespace: mi, mo, mn, ms, mtext annotation-xml 705 // todo: in svg namespace: forignOjbect, desc, title 706 } 707 708 boolean inListItemScope(String targetName) { 709 return inScope(targetName, TagSearchList); 710 } 711 712 boolean inButtonScope(String targetName) { 713 return inScope(targetName, TagSearchButton); 714 } 715 716 boolean inTableScope(String targetName) { 717 return inSpecificScope(targetName, TagSearchTableScope, null); 718 } 719 720 boolean inSelectScope(String targetName) { 721 for (int pos = stack.size() -1; pos >= 0; pos--) { 722 Element el = stack.get(pos); 723 String elName = el.normalName(); 724 if (elName.equals(targetName)) 725 return true; 726 if (!inSorted(elName, TagSearchSelectScope)) // all elements except 727 return false; 728 } 729 Validate.fail("Should not be reachable"); 730 return false; 731 } 732 733 /** Tests if there is some element on the stack that is not in the provided set. */ 734 boolean onStackNot(String[] allowedTags) { 735 final int bottom = stack.size() -1; 736 final int top = bottom > MaxScopeSearchDepth ? bottom - MaxScopeSearchDepth : 0; 737 // don't walk too far up the tree 738 739 for (int pos = bottom; pos >= top; pos--) { 740 final String elName = stack.get(pos).normalName(); 741 if (!inSorted(elName, allowedTags)) 742 return true; 743 } 744 return false; 745 } 746 747 void setHeadElement(Element headElement) { 748 this.headElement = headElement; 749 } 750 751 Element getHeadElement() { 752 return headElement; 753 } 754 755 boolean isFosterInserts() { 756 return fosterInserts; 757 } 758 759 void setFosterInserts(boolean fosterInserts) { 760 this.fosterInserts = fosterInserts; 761 } 762 763 @Nullable FormElement getFormElement() { 764 return formElement; 765 } 766 767 void setFormElement(FormElement formElement) { 768 this.formElement = formElement; 769 } 770 771 void resetPendingTableCharacters() { 772 pendingTableCharacters.clear(); 773 } 774 775 List<Token.Character> getPendingTableCharacters() { 776 return pendingTableCharacters; 777 } 778 779 void addPendingTableCharacters(Token.Character c) { 780 // make a clone of the token to maintain its state (as Tokens are otherwise reset) 781 Token.Character clone = c.clone(); 782 pendingTableCharacters.add(clone); 783 } 784 785 /** 786 13.2.6.3 Closing elements that have implied end tags 787 When the steps below require the UA to generate implied end tags, then, while the current node is a dd element, a dt element, an li element, an optgroup element, an option element, a p element, an rb element, an rp element, an rt element, or an rtc element, the UA must pop the current node off the stack of open elements. 788 789 If a step requires the UA to generate implied end tags but lists an element to exclude from the process, then the UA must perform the above steps as if that element was not in the above list. 790 791 When the steps below require the UA to generate all implied end tags thoroughly, then, while the current node is a caption element, a colgroup element, a dd element, a dt element, an li element, an optgroup element, an option element, a p element, an rb element, an rp element, an rt element, an rtc element, a tbody element, a td element, a tfoot element, a th element, a thead element, or a tr element, the UA must pop the current node off the stack of open elements. 792 793 @param excludeTag If a step requires the UA to generate implied end tags but lists an element to exclude from the 794 process, then the UA must perform the above steps as if that element was not in the above list. 795 */ 796 void generateImpliedEndTags(String excludeTag) { 797 while (inSorted(currentElement().normalName(), TagSearchEndTags)) { 798 if (excludeTag != null && currentElementIs(excludeTag)) 799 break; 800 pop(); 801 } 802 } 803 804 void generateImpliedEndTags() { 805 generateImpliedEndTags(false); 806 } 807 808 /** 809 Pops HTML elements off the stack according to the implied end tag rules 810 @param thorough if we are thorough (includes table elements etc) or not 811 */ 812 void generateImpliedEndTags(boolean thorough) { 813 final String[] search = thorough ? TagThoroughSearchEndTags : TagSearchEndTags; 814 while (NamespaceHtml.equals(currentElement().tag().namespace()) 815 && inSorted(currentElement().normalName(), search)) { 816 pop(); 817 } 818 } 819 820 void closeElement(String name) { 821 generateImpliedEndTags(name); 822 if (!name.equals(currentElement().normalName())) error(state()); 823 popStackToClose(name); 824 } 825 826 static boolean isSpecial(Element el) { 827 // todo: mathml's mi, mo, mn 828 // todo: svg's foreigObject, desc, title 829 String name = el.normalName(); 830 return inSorted(name, TagSearchSpecial); 831 } 832 833 Element lastFormattingElement() { 834 return formattingElements.size() > 0 ? formattingElements.get(formattingElements.size()-1) : null; 835 } 836 837 int positionOfElement(Element el){ 838 for (int i = 0; i < formattingElements.size(); i++){ 839 if (el == formattingElements.get(i)) 840 return i; 841 } 842 return -1; 843 } 844 845 Element removeLastFormattingElement() { 846 int size = formattingElements.size(); 847 if (size > 0) 848 return formattingElements.remove(size-1); 849 else 850 return null; 851 } 852 853 // active formatting elements 854 void pushActiveFormattingElements(Element in) { 855 checkActiveFormattingElements(in); 856 formattingElements.add(in); 857 } 858 859 void pushWithBookmark(Element in, int bookmark){ 860 checkActiveFormattingElements(in); 861 // catch any range errors and assume bookmark is incorrect - saves a redundant range check. 862 try { 863 formattingElements.add(bookmark, in); 864 } catch (IndexOutOfBoundsException e) { 865 formattingElements.add(in); 866 } 867 } 868 869 void checkActiveFormattingElements(Element in){ 870 int numSeen = 0; 871 final int size = formattingElements.size() -1; 872 int ceil = size - maxUsedFormattingElements; if (ceil <0) ceil = 0; 873 874 for (int pos = size; pos >= ceil; pos--) { 875 Element el = formattingElements.get(pos); 876 if (el == null) // marker 877 break; 878 879 if (isSameFormattingElement(in, el)) 880 numSeen++; 881 882 if (numSeen == 3) { 883 formattingElements.remove(pos); 884 break; 885 } 886 } 887 } 888 889 private static boolean isSameFormattingElement(Element a, Element b) { 890 // same if: same namespace, tag, and attributes. Element.equals only checks tag, might in future check children 891 return a.normalName().equals(b.normalName()) && 892 // a.namespace().equals(b.namespace()) && 893 a.attributes().equals(b.attributes()); 894 // todo: namespaces 895 } 896 897 void reconstructFormattingElements() { 898 if (stack.size() > maxQueueDepth) 899 return; 900 Element last = lastFormattingElement(); 901 if (last == null || onStack(last)) 902 return; 903 904 Element entry = last; 905 int size = formattingElements.size(); 906 int ceil = size - maxUsedFormattingElements; if (ceil <0) ceil = 0; 907 int pos = size - 1; 908 boolean skip = false; 909 while (true) { 910 if (pos == ceil) { // step 4. if none before, skip to 8 911 skip = true; 912 break; 913 } 914 entry = formattingElements.get(--pos); // step 5. one earlier than entry 915 if (entry == null || onStack(entry)) // step 6 - neither marker nor on stack 916 break; // jump to 8, else continue back to 4 917 } 918 while(true) { 919 if (!skip) // step 7: on later than entry 920 entry = formattingElements.get(++pos); 921 Validate.notNull(entry); // should not occur, as we break at last element 922 923 // 8. create new element from element, 9 insert into current node, onto stack 924 skip = false; // can only skip increment from 4. 925 Element newEl = new Element(tagFor(entry.normalName(), settings), null, entry.attributes().clone()); 926 doInsertElement(newEl, null); 927 928 // 10. replace entry with new entry 929 formattingElements.set(pos, newEl); 930 931 // 11 932 if (pos == size-1) // if not last entry in list, jump to 7 933 break; 934 } 935 } 936 private static final int maxUsedFormattingElements = 12; // limit how many elements get recreated 937 938 void clearFormattingElementsToLastMarker() { 939 while (!formattingElements.isEmpty()) { 940 Element el = removeLastFormattingElement(); 941 if (el == null) 942 break; 943 } 944 } 945 946 void removeFromActiveFormattingElements(Element el) { 947 for (int pos = formattingElements.size() -1; pos >= 0; pos--) { 948 Element next = formattingElements.get(pos); 949 if (next == el) { 950 formattingElements.remove(pos); 951 break; 952 } 953 } 954 } 955 956 boolean isInActiveFormattingElements(Element el) { 957 return onStack(formattingElements, el); 958 } 959 960 @Nullable 961 Element getActiveFormattingElement(String nodeName) { 962 for (int pos = formattingElements.size() -1; pos >= 0; pos--) { 963 Element next = formattingElements.get(pos); 964 if (next == null) // scope marker 965 break; 966 else if (next.nameIs(nodeName)) 967 return next; 968 } 969 return null; 970 } 971 972 void replaceActiveFormattingElement(Element out, Element in) { 973 replaceInQueue(formattingElements, out, in); 974 } 975 976 void insertMarkerToFormattingElements() { 977 formattingElements.add(null); 978 } 979 980 void insertInFosterParent(Node in) { 981 Element fosterParent; 982 Element lastTable = getFromStack("table"); 983 boolean isLastTableParent = false; 984 if (lastTable != null) { 985 if (lastTable.parent() != null) { 986 fosterParent = lastTable.parent(); 987 isLastTableParent = true; 988 } else 989 fosterParent = aboveOnStack(lastTable); 990 } else { // no table == frag 991 fosterParent = stack.get(0); 992 } 993 994 if (isLastTableParent) { 995 Validate.notNull(lastTable); // last table cannot be null by this point. 996 lastTable.before(in); 997 } 998 else 999 fosterParent.appendChild(in); 1000 } 1001 1002 // Template Insertion Mode stack 1003 void pushTemplateMode(HtmlTreeBuilderState state) { 1004 tmplInsertMode.add(state); 1005 } 1006 1007 @Nullable HtmlTreeBuilderState popTemplateMode() { 1008 if (tmplInsertMode.size() > 0) { 1009 return tmplInsertMode.remove(tmplInsertMode.size() -1); 1010 } else { 1011 return null; 1012 } 1013 } 1014 1015 int templateModeSize() { 1016 return tmplInsertMode.size(); 1017 } 1018 1019 @Nullable HtmlTreeBuilderState currentTemplateMode() { 1020 return (tmplInsertMode.size() > 0) ? tmplInsertMode.get(tmplInsertMode.size() -1) : null; 1021 } 1022 1023 @Override 1024 public String toString() { 1025 return "TreeBuilder{" + 1026 "currentToken=" + currentToken + 1027 ", state=" + state + 1028 ", currentElement=" + currentElement() + 1029 '}'; 1030 } 1031 1032 @Override protected boolean isContentForTagData(final String normalName) { 1033 return (normalName.equals("script") || normalName.equals("style")); 1034 } 1035}