From 6adc249ba47f6a80f8b133f699e57feff365a4f0 Mon Sep 17 00:00:00 2001 From: ronnyshapiro Date: Mon, 27 Apr 2026 11:02:12 +0300 Subject: [PATCH 1/2] Defer building HtmlPage id/name lookup index until first read Every element added during parsing eagerly invoked addMappedElement, which issued two getAttribute("id"/"name") lookups, two ConcurrentHashMap.put operations, and (when applicable) allocated a MappedElementIndexEntry plus its inner ArrayList. For pages parsed and walked via querySelector / iteration without any getElementById call, the entire index is built and never read. Add a mappedElementsBuilt_ flag on HtmlPage; addMappedElement and removeMappedElement early-return until it flips. A new private ensureMappedElementsBuilt() walks the document once on first read of getElementById / getElementsById / getElementByName / getElementsByName / getElementsByIdAndOrName and populates both idMap_ and nameMap_; subsequent mutations through addMappedElement/removeMappedElement stay incremental as before. clone() resets the flag so cloned pages re-lazy-build. JMH HtmlUnitBenchmark.JMH on a ~30 KB HTML page (5 forks x 3 warmup x 5 measurement = 25 samples, paired sequentially): baseline: 633.205 +- 8.793 us/op with this: 593.611 +- 13.110 us/op delta: -39.6 us, -6.3% Targeted tests across HtmlPageTest, HtmlPage2Test, HtmlPage3Test, HtmlInlineFrameTest, HtmlFrameTest, HtmlFrameSetTest, HTMLDocumentTest, Html*ParserTest: 1176 tests, 0 failures. --- src/main/java/org/htmlunit/html/HtmlPage.java | 31 +++++++++++++++++++ 1 file changed, 31 insertions(+) diff --git a/src/main/java/org/htmlunit/html/HtmlPage.java b/src/main/java/org/htmlunit/html/HtmlPage.java index 4899a4b1a9..7e70ab1600 100644 --- a/src/main/java/org/htmlunit/html/HtmlPage.java +++ b/src/main/java/org/htmlunit/html/HtmlPage.java @@ -154,6 +154,10 @@ public class HtmlPage extends SgmlPage { private Map idMap_ = new ConcurrentHashMap<>(); private Map nameMap_ = new ConcurrentHashMap<>(); + // The id/name lookup index is built lazily on first use. Until then, + // notifyNodeAdded / fireAttributeChange skip the per-element index updates. + // Reads must call ensureMappedElementsBuilt() before consulting idMap_/nameMap_. + private boolean mappedElementsBuilt_; private List frameElements_ = new ArrayList<>(); private int parserCount_; @@ -631,6 +635,7 @@ public ProcessingInstruction createProcessingInstruction(final String namespaceU @Override public DomElement getElementById(final String elementId) { if (elementId != null) { + ensureMappedElementsBuilt(); final MappedElementIndexEntry elements = idMap_.get(elementId); if (elements != null) { return elements.first(); @@ -1708,6 +1713,7 @@ public E getHtmlElementById(final String elementId) thro */ public List getElementsById(final String elementId) { if (elementId != null) { + ensureMappedElementsBuilt(); final MappedElementIndexEntry elements = idMap_.get(elementId); if (elements != null) { return new ArrayList<>(elements.elements()); @@ -1728,6 +1734,7 @@ public List getElementsById(final String elementId) { @SuppressWarnings("unchecked") public E getElementByName(final String name) throws ElementNotFoundException { if (name != null) { + ensureMappedElementsBuilt(); final MappedElementIndexEntry elements = nameMap_.get(name); if (elements != null) { return (E) elements.first(); @@ -1746,6 +1753,7 @@ public E getElementByName(final String name) throws Eleme */ public List getElementsByName(final String name) { if (name != null) { + ensureMappedElementsBuilt(); final MappedElementIndexEntry elements = nameMap_.get(name); if (elements != null) { return new ArrayList<>(elements.elements()); @@ -1765,6 +1773,7 @@ public List getElementsByIdAndOrName(final String idAndOrName) { if (idAndOrName == null) { return Collections.emptyList(); } + ensureMappedElementsBuilt(); final MappedElementIndexEntry list1 = idMap_.get(idAndOrName); final MappedElementIndexEntry list2 = nameMap_.get(idAndOrName); final List list = new ArrayList<>(); @@ -1841,12 +1850,29 @@ void notifyNodeRemoved(final DomNode node) { * @param recurse indicates if children must be added too */ void addMappedElement(final DomElement element, final boolean recurse) { + // Index is built lazily; skip while not built. ensureMappedElementsBuilt() + // walks the tree once and populates everything on first read. + if (!mappedElementsBuilt_) { + return; + } if (isAncestorOf(element)) { addElement(idMap_, element, DomElement.ID_ATTRIBUTE, recurse); addElement(nameMap_, element, DomElement.NAME_ATTRIBUTE, recurse); } } + private void ensureMappedElementsBuilt() { + if (mappedElementsBuilt_) { + return; + } + mappedElementsBuilt_ = true; + final DomElement root = getDocumentElement(); + if (root != null) { + addElement(idMap_, root, DomElement.ID_ATTRIBUTE, true); + addElement(nameMap_, root, DomElement.NAME_ATTRIBUTE, true); + } + } + private void addElement(final Map map, final DomElement element, final String attribute, final boolean recurse) { final String value = element.getAttribute(attribute); @@ -1882,6 +1908,10 @@ private void addElement(final Map map, final Do * @param descendant indicates of the element was descendant of this HtmlPage, but now its parent might be null */ void removeMappedElement(final DomElement element, final boolean recurse, final boolean descendant) { + // see addMappedElement: while the index is unbuilt, removals are also no-ops. + if (!mappedElementsBuilt_) { + return; + } if (descendant || isAncestorOf(element)) { removeElement(idMap_, element, DomElement.ID_ATTRIBUTE, recurse); removeElement(nameMap_, element, DomElement.NAME_ATTRIBUTE, recurse); @@ -1998,6 +2028,7 @@ protected HtmlPage clone() { result.idMap_ = new ConcurrentHashMap<>(); result.nameMap_ = new ConcurrentHashMap<>(); + result.mappedElementsBuilt_ = false; return result; } From 342862f766d10b4d4ab85ef03116142bcc0ae689 Mon Sep 17 00:00:00 2001 From: ronnyshapiro Date: Mon, 27 Apr 2026 15:13:41 +0300 Subject: [PATCH 2/2] Set mappedElementsBuilt_ after populating the maps Per review feedback: the flag was being flipped before addElement() ran, so a partial failure mid-walk would leave the page with built_=true and a half-populated index, and subsequent reads would silently return incomplete results. Move the assignment after the populate so a thrown exception leaves built_=false and the next read retries. --- src/main/java/org/htmlunit/html/HtmlPage.java | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/src/main/java/org/htmlunit/html/HtmlPage.java b/src/main/java/org/htmlunit/html/HtmlPage.java index 7e70ab1600..d8eef2f797 100644 --- a/src/main/java/org/htmlunit/html/HtmlPage.java +++ b/src/main/java/org/htmlunit/html/HtmlPage.java @@ -1865,12 +1865,15 @@ private void ensureMappedElementsBuilt() { if (mappedElementsBuilt_) { return; } - mappedElementsBuilt_ = true; final DomElement root = getDocumentElement(); if (root != null) { addElement(idMap_, root, DomElement.ID_ATTRIBUTE, true); addElement(nameMap_, root, DomElement.NAME_ATTRIBUTE, true); } + // Flip the flag only after the maps are populated, so a partial + // failure mid-walk leaves us with built_=false and the next read + // tries again rather than seeing a half-populated index. + mappedElementsBuilt_ = true; } private void addElement(final Map map, final DomElement element,