From a0c9a448dcc9ea6aec8e144144a289c55c4bd47c Mon Sep 17 00:00:00 2001
From: Simba Zhang <solderzzc@gmail.com>
Date: Wed, 18 Mar 2026 13:31:24 -0700
Subject: [PATCH 01/11] feat: emit open_report event for Aegis embedded browser

---
 .../scripts/run-benchmark.cjs                 | 19 +++++++++++++------
 1 file changed, 13 insertions(+), 6 deletions(-)
diff --git a/skills/analysis/home-security-benchmark/scripts/run-benchmark.cjs b/skills/analysis/home-security-benchmark/scripts/run-benchmark.cjs
index 3193e7f8..ed61d218 100644
--- a/skills/analysis/home-security-benchmark/scripts/run-benchmark.cjs
+++ b/skills/analysis/home-security-benchmark/scripts/run-benchmark.cjs
@@ -560,12 +560,19 @@ function saveLiveProgress(startedAt, suitesCompleted, totalSuites, nextSuiteName
         });
 
         // Open browser on first save (so user sees live progress from the start)
-        if (!_liveReportOpened && !NO_OPEN && !IS_SKILL_MODE && reportPath) {
-            try {
-                const openCmd = process.platform === 'darwin' ? 'open' : 'xdg-open';
-                execSync(`${openCmd} "${reportPath}"`, { stdio: 'ignore' });
-                log('  📊 Live report opened in browser (auto-refreshes every 5s)');
-            } catch { }
+        if (!_liveReportOpened && !NO_OPEN && reportPath) {
+            if (IS_SKILL_MODE) {
+                // Ask Aegis to open in its embedded browser window
+                emit({ event: 'open_report', reportPath });
+                log('  📊 Requested Aegis to open live report');
+            } else {
+                // Standalone: open in system browser
+                try {
+                    const openCmd = process.platform === 'darwin' ? 'open' : 'xdg-open';
+                    execSync(`${openCmd} "${reportPath}"`, { stdio: 'ignore' });
+                    log('  📊 Live report opened in browser (auto-refreshes every 5s)');
+                } catch { }
+            }
             _liveReportOpened = true;
         }
     } catch (err) {

From 5d001e8df3e96e876078cd92919d62b274cab67c Mon Sep 17 00:00:00 2001
From: Simba Zhang <solderzzc@gmail.com>
Date: Wed, 18 Mar 2026 13:38:19 -0700
Subject: [PATCH 02/11] feat: per-test live progress updates in commander
 center

- saveLiveProgress() called after each test, not just each suite
- Include in-progress suite in live data for Quality/Vision tabs
- Skip fixture image embedding in live mode (~43MB savings per regeneration)
- Enhanced live banner with test name and test count
---
 .../scripts/generate-report.cjs               | 11 +++---
 .../scripts/run-benchmark.cjs                 | 36 +++++++++++++++----
 2 files changed, 36 insertions(+), 11 deletions(-)

diff --git a/skills/analysis/home-security-benchmark/scripts/generate-report.cjs b/skills/analysis/home-security-benchmark/scripts/generate-report.cjs
index cb44bc41..556ac5e0 100644
--- a/skills/analysis/home-security-benchmark/scripts/generate-report.cjs
+++ b/skills/analysis/home-security-benchmark/scripts/generate-report.cjs
@@ -61,8 +61,9 @@ function generateReport(resultsDir = RESULTS_DIR, opts = {}) {
     }).filter(r => r.data);
 
     // Load fixture images for Vision tab (base64)
+    // Skip in live mode — saves ~43MB of base64 per regeneration, making per-test updates instant
     const fixtureImages = {};
-    if (fs.existsSync(FIXTURES_DIR)) {
+    if (!liveMode && fs.existsSync(FIXTURES_DIR)) {
         try {
             const frames = fs.readdirSync(FIXTURES_DIR).filter(f => /\.(png|jpg|jpeg)$/i.test(f));
             for (const f of frames) {
@@ -865,15 +866,17 @@ function buildLiveBanner(status) {
     if (!status) {
         return `<div class="live-banner"><span class="live-dot"></span> Benchmark starting\u2026</div>`;
     }
-    const { suitesCompleted = 0, totalSuites = 0, currentSuite = '', startedAt = '' } = status;
+    const { suitesCompleted = 0, totalSuites = 0, currentSuite = '', currentTest = '', testsCompleted = 0, startedAt = '' } = status;
     const pct = totalSuites > 0 ? Math.round((suitesCompleted / totalSuites) * 100) : 0;
     const elapsed = startedAt ? Math.round((Date.now() - new Date(startedAt).getTime()) / 1000) : 0;
     const elapsedStr = elapsed > 60 ? Math.floor(elapsed / 60) + 'm ' + (elapsed % 60) + 's' : elapsed + 's';
+    const testInfo = currentTest ? ` — ✅ <em>${escHtml(currentTest)}</em>` : '';
     return `<div class="live-banner">
         <span class="live-dot"></span>
         <strong>LIVE</strong> — Suite ${suitesCompleted}/${totalSuites} (${pct}%)
-        ${currentSuite ? ' — <em>' + currentSuite + '</em>' : ''}
-        <span style="margin-left:auto;font-size:0.78rem">${elapsedStr} elapsed</span>
+        ${currentSuite ? ' — 🔧 <em>' + escHtml(currentSuite) + '</em>' : ''}
+        ${testInfo}
+        <span style="margin-left:auto;font-size:0.78rem">${testsCompleted} tests · ${elapsedStr} elapsed</span>
         <div class="live-progress"><div class="live-progress-bar" style="width:${pct}%"></div></div>
     </div>`;
 }
diff --git a/skills/analysis/home-security-benchmark/scripts/run-benchmark.cjs b/skills/analysis/home-security-benchmark/scripts/run-benchmark.cjs
index ed61d218..4f50e56e 100644
--- a/skills/analysis/home-security-benchmark/scripts/run-benchmark.cjs
+++ b/skills/analysis/home-security-benchmark/scripts/run-benchmark.cjs
@@ -507,21 +507,31 @@ function assert(condition, msg) {
 
 // ─── Live progress: intermediate saves + report regeneration ────────────────
 let _liveReportOpened = false;
+let _runStartedAt = null;     // Set when runSuites() begins
+let _currentTestName = null;  // Set during test execution for live banner
+let _currentSuiteIndex = 0;   // Current suite index for live progress
+let _totalSuites = 0;         // Total number of suites
 
 /**
  * Save the current (in-progress) results to disk and regenerate the live report.
- * Called after each suite completes so the browser auto-refreshes with updated data.
+ * Called after each test completes so the browser auto-refreshes with updated data.
  */
-function saveLiveProgress(startedAt, suitesCompleted, totalSuites, nextSuiteName) {
+function saveLiveProgress(startedAt, suitesCompleted, totalSuites, nextSuiteName, currentTest) {
     try {
         fs.mkdirSync(RESULTS_DIR, { recursive: true });
 
         // Save current results as a live file (will be overwritten each time)
         const liveFile = path.join(RESULTS_DIR, '_live_progress.json');
+        // Include the in-progress suite so Quality/Vision tabs can render partial data
+        const liveSuites = [...results.suites];
+        if (currentSuite && currentSuite.tests.length > 0 && !results.suites.includes(currentSuite)) {
+            liveSuites.push(currentSuite);
+        }
         const liveResults = {
             ...results,
+            suites: liveSuites,
             _live: true,
-            _progress: { suitesCompleted, totalSuites, startedAt },
+            _progress: { suitesCompleted, totalSuites, startedAt, currentTest: currentTest || null },
         };
         fs.writeFileSync(liveFile, JSON.stringify(liveResults, null, 2));
 
@@ -549,12 +559,16 @@ function saveLiveProgress(startedAt, suitesCompleted, totalSuites, nextSuiteName
         // Clear require cache to pick up any code changes
         delete require.cache[require.resolve(reportScript)];
         const { generateReport } = require(reportScript);
+        const testsCompleted = liveSuites.reduce((n, s) => n + s.tests.length, 0);
+        // Only the completed count is passed to the banner; no up-front total test count is computed here.
         const reportPath = generateReport(RESULTS_DIR, {
             liveMode: true,
             liveStatus: {
                 suitesCompleted,
                 totalSuites,
-                currentSuite: nextSuiteName || 'Finishing...',
+                currentSuite: currentSuite?.name || nextSuiteName || 'Finishing...',
+                currentTest: currentTest || null,
+                testsCompleted,
                 startedAt,
             },
         });
@@ -582,9 +596,11 @@ function saveLiveProgress(startedAt, suitesCompleted, totalSuites, nextSuiteName
 }
 
 async function runSuites() {
-    const startedAt = new Date().toISOString();
+    _runStartedAt = new Date().toISOString();
+    _totalSuites = suites.length;
     for (let si = 0; si < suites.length; si++) {
         const s = suites[si];
+        _currentSuiteIndex = si;
         currentSuite = { name: s.name, tests: [], passed: 0, failed: 0, skipped: 0, timeMs: 0 };
         log(`\n${'─'.repeat(60)}`);
         log(`  ${s.name}`);
@@ -601,8 +617,8 @@ async function runSuites() {
 
         emit({ event: 'suite_end', suite: s.name, passed: currentSuite.passed, failed: currentSuite.failed, skipped: currentSuite.skipped, timeMs: currentSuite.timeMs });
 
-        // Live progress: save intermediate results + regenerate report after each suite
-        saveLiveProgress(startedAt, si + 1, suites.length, si + 1 < suites.length ? suites[si + 1]?.name : null);
+        // Live progress: save after suite (also saved per-test, but suite boundary is a clean checkpoint)
+        saveLiveProgress(_runStartedAt, si + 1, suites.length, si + 1 < suites.length ? suites[si + 1]?.name : null);
     }
 }
 
@@ -648,6 +664,12 @@ async function test(name, fn) {
     currentSuite.timeMs += testResult.timeMs;
     currentSuite.tests.push(testResult);
     emit({ event: 'test_result', suite: currentSuite.name, test: name, status: testResult.status, timeMs: testResult.timeMs, detail: testResult.detail.slice(0, 120), tokens: testResult.tokens, perf: testResult.perf });
+
+    // Live progress: save after each test for real-time updates in commander center
+    if (_runStartedAt) {
+        _currentTestName = null; // Test just completed
+        saveLiveProgress(_runStartedAt, _currentSuiteIndex, _totalSuites, null, name);
+    }
 }
 
 function skip(name, reason) {

From 2309e54582cdb8312ca6f81adc82cd9c3a20bfe5 Mon Sep 17 00:00:00 2001
From: Simba Zhang <solderzzc@gmail.com>
Date: Wed, 18 Mar 2026 13:48:25 -0700
Subject: [PATCH 03/11] fix: syntax error in collapsed toggle + stateful live
 reload

- Use HTML entities (&#39;) for quotes in onclick to avoid multi-level escaping
- Replace <meta http-equiv=refresh> with JS setTimeout for stateful reload
- Preserve active tab + scroll position across refreshes via sessionStorage
---
 .../scripts/generate-report.cjs               | 39 +++++++++++++++++--
 1 file changed, 36 insertions(+), 3 deletions(-)

diff --git a/skills/analysis/home-security-benchmark/scripts/generate-report.cjs b/skills/analysis/home-security-benchmark/scripts/generate-report.cjs
index 556ac5e0..af5d886e 100644
--- a/skills/analysis/home-security-benchmark/scripts/generate-report.cjs
+++ b/skills/analysis/home-security-benchmark/scripts/generate-report.cjs
@@ -132,8 +132,8 @@ function buildHTML(allResults, fixtureImages, { liveMode = false, liveStatus = n
 
     const fixtureJSON = JSON.stringify(fixtureImages);
 
-    // Live mode: auto-refresh meta tag
-    const refreshMeta = liveMode ? '<meta http-equiv="refresh" content="5">' : '';
+    // Live mode: JS-based reload (stateful, preserves active tab + scroll)
+    const refreshMeta = '';
     const liveBannerHTML = liveMode ? buildLiveBanner(liveStatus) : '';
 
     return `<!DOCTYPE html>
@@ -435,7 +435,7 @@ function buildSidebar() {
     let html = '';
     for (const [family, runs] of Object.entries(groups)) {
         html += '<div class="model-group">';
-        html += '<div class="model-group-label" onclick="this.parentElement.classList.toggle(\'collapsed\')"><span class="arrow">▾</span> ' + esc(family) + ' <span style="color:var(--text-muted);font-weight:400">(' + runs.length + ')</span></div>';
+        html += '<div class="model-group-label" onclick="this.parentElement.classList.toggle(&#39;collapsed&#39;)"><span class="arrow">▾</span> ' + esc(family) + ' <span style="color:var(--text-muted);font-weight:400">(' + runs.length + ')</span></div>';
         html += '<div class="run-list">';
         for (const r of runs.reverse()) {
             const sel = selectedIndices.has(r._idx);
@@ -838,6 +838,38 @@ function refresh() {
     renderActiveTab();
 }
 
+// ═══════════════════════════════════════════════════════════════════════════════
+// LIVE RELOAD (stateful — preserves tab + scroll)
+// ═══════════════════════════════════════════════════════════════════════════════
+const IS_LIVE = ${liveMode ? 'true' : 'false'};
+
+function saveState() {
+    try {
+        sessionStorage.setItem('_bench_tab', getActiveTab());
+        sessionStorage.setItem('_bench_scroll', String(window.scrollY));
+    } catch {}
+}
+
+function restoreState() {
+    try {
+        const tab = sessionStorage.getItem('_bench_tab');
+        if (tab && tab !== 'performance') {
+            document.querySelectorAll('.tab').forEach(t => t.classList.remove('active'));
+            document.querySelectorAll('.tab-panel').forEach(p => p.classList.remove('active'));
+            const tabEl = document.querySelector('.tab[data-tab="' + tab + '"]');
+            if (tabEl) tabEl.classList.add('active');
+            const panel = document.getElementById('tab-' + tab);
+            if (panel) panel.classList.add('active');
+        }
+        const scroll = parseInt(sessionStorage.getItem('_bench_scroll') || '0');
+        if (scroll > 0) setTimeout(() => window.scrollTo(0, scroll), 50);
+    } catch {}
+}
+
+if (IS_LIVE) {
+    setTimeout(() => { saveState(); location.reload(); }, 5000);
+}
+
 // ═══════════════════════════════════════════════════════════════════════════════
 // INIT
 // ═══════════════════════════════════════════════════════════════════════════════
@@ -847,6 +879,7 @@ document.getElementById('btn-compare').addEventListener('click', () => {
     if (selectedIndices.size > 1) renderActiveTab();
 });
 
+restoreState();
 refresh();
 </script>
 </body>

From e46f6a5bab5a4289920ac33619244c9cc145c39a Mon Sep 17 00:00:00 2001
From: Simba Zhang <solderzzc@gmail.com>
Date: Wed, 18 Mar 2026 13:54:27 -0700
Subject: [PATCH 04/11] fix: live performance metrics + collapsed syntax error
 + stateful reload

- Compute live perfSummary from accumulated TTFT/decode arrays
- TTFT, Decode Speed, Server Prefill/Decode now update in real-time
- Fix SyntaxError: use HTML entities for collapsed toggle onclick
- Replace meta refresh with JS setTimeout + sessionStorage state
---
 .../scripts/run-benchmark.cjs                 | 23 ++++++++++++++++++-
 1 file changed, 22 insertions(+), 1 deletion(-)

diff --git a/skills/analysis/home-security-benchmark/scripts/run-benchmark.cjs b/skills/analysis/home-security-benchmark/scripts/run-benchmark.cjs
index 4f50e56e..fd9217a1 100644
--- a/skills/analysis/home-security-benchmark/scripts/run-benchmark.cjs
+++ b/skills/analysis/home-security-benchmark/scripts/run-benchmark.cjs
@@ -537,6 +537,27 @@ function saveLiveProgress(startedAt, suitesCompleted, totalSuites, nextSuiteName
 
         // Build a temporary index with just the live file
         const indexFile = path.join(RESULTS_DIR, 'index.json');
+
+        // Compute live performance summary from accumulated data
+        const ttftArr = [...results.perfTotals.ttftMs];
+        const decArr = [...results.perfTotals.decodeTokensPerSec];
+        const livePerfSummary = (ttftArr.length > 0 || decArr.length > 0) ? {
+            ttft: ttftArr.length > 0 ? {
+                avgMs: Math.round(ttftArr.reduce((a, b) => a + b, 0) / ttftArr.length),
+                p50Ms: [...ttftArr].sort((a, b) => a - b)[Math.floor(ttftArr.length * 0.5)],
+                p95Ms: [...ttftArr].sort((a, b) => a - b)[Math.floor(ttftArr.length * 0.95)],
+                samples: ttftArr.length,
+            } : null,
+            decode: decArr.length > 0 ? {
+                avgTokensPerSec: parseFloat((decArr.reduce((a, b) => a + b, 0) / decArr.length).toFixed(1)),
+                samples: decArr.length,
+            } : null,
+            server: {
+                prefillTokensPerSec: results.perfTotals.prefillTokensPerSec,
+                decodeTokensPerSec: results.perfTotals.serverDecodeTokensPerSec,
+            },
+        } : null;
+
         const liveIndex = [{
             file: '_live_progress.json',
             model: results.model.name || 'loading...',
@@ -550,7 +571,7 @@ function saveLiveProgress(startedAt, suitesCompleted, totalSuites, nextSuiteName
             vlmPassed: 0, vlmTotal: 0,
             timeMs: Date.now() - new Date(startedAt).getTime(),
             tokens: results.tokenTotals.total,
-            perfSummary: null,
+            perfSummary: livePerfSummary,
         }];
         fs.writeFileSync(indexFile, JSON.stringify(liveIndex, null, 2));
 

From c59668ee25fff6f260c5c608d1577a6670f41a76 Mon Sep 17 00:00:00 2001
From: Simba Zhang <solderzzc@gmail.com>
Date: Wed, 18 Mar 2026 13:58:56 -0700
Subject: [PATCH 05/11] fix: scrape server metrics after each suite for live
 prefill/decode stats

---
 .../analysis/home-security-benchmark/scripts/run-benchmark.cjs | 3 +++
 1 file changed, 3 insertions(+)

diff --git a/skills/analysis/home-security-benchmark/scripts/run-benchmark.cjs b/skills/analysis/home-security-benchmark/scripts/run-benchmark.cjs
index fd9217a1..45004982 100644
--- a/skills/analysis/home-security-benchmark/scripts/run-benchmark.cjs
+++ b/skills/analysis/home-security-benchmark/scripts/run-benchmark.cjs
@@ -638,6 +638,9 @@ async function runSuites() {
 
         emit({ event: 'suite_end', suite: s.name, passed: currentSuite.passed, failed: currentSuite.failed, skipped: currentSuite.skipped, timeMs: currentSuite.timeMs });
 
+        // Scrape server metrics after each suite so live perf cards update
+        await scrapeServerMetrics();
+
         // Live progress: save after suite (also saved per-test, but suite boundary is a clean checkpoint)
         saveLiveProgress(_runStartedAt, si + 1, suites.length, si + 1 < suites.length ? suites[si + 1]?.name : null);
     }

From 8f4334283ff3c1a0ec5205bc3eb4e9c56478c0cb Mon Sep 17 00:00:00 2001
From: Simba Zhang <solderzzc@gmail.com>
Date: Wed, 18 Mar 2026 14:07:03 -0700
Subject: [PATCH 06/11] fix: preserve previous runs in live index for
 comparison sidebar

---
 .../home-security-benchmark/scripts/run-benchmark.cjs    | 9 ++++++---
 1 file changed, 6 insertions(+), 3 deletions(-)

diff --git a/skills/analysis/home-security-benchmark/scripts/run-benchmark.cjs b/skills/analysis/home-security-benchmark/scripts/run-benchmark.cjs
index 45004982..b41a3a52 100644
--- a/skills/analysis/home-security-benchmark/scripts/run-benchmark.cjs
+++ b/skills/analysis/home-security-benchmark/scripts/run-benchmark.cjs
@@ -558,7 +558,10 @@ function saveLiveProgress(startedAt, suitesCompleted, totalSuites, nextSuiteName
             },
         } : null;
 
-        const liveIndex = [{
+        // Preserve previous runs in index for comparison sidebar
+        let existingIndex = [];
+        try { existingIndex = JSON.parse(fs.readFileSync(indexFile, 'utf8')).filter(e => e.file !== '_live_progress.json'); } catch { }
+        const liveEntry = {
             file: '_live_progress.json',
             model: results.model.name || 'loading...',
             vlm: results.model.vlm || null,
@@ -572,8 +575,8 @@ function saveLiveProgress(startedAt, suitesCompleted, totalSuites, nextSuiteName
             timeMs: Date.now() - new Date(startedAt).getTime(),
             tokens: results.tokenTotals.total,
             perfSummary: livePerfSummary,
-        }];
-        fs.writeFileSync(indexFile, JSON.stringify(liveIndex, null, 2));
+        };
+        fs.writeFileSync(indexFile, JSON.stringify([...existingIndex, liveEntry], null, 2));
 
         // Regenerate report in live mode
         const reportScript = path.join(__dirname, 'generate-report.cjs');

From 74c03678b5adeb573e8383acd4bec19bb4194132 Mon Sep 17 00:00:00 2001
From: Simba Zhang <solderzzc@gmail.com>
Date: Wed, 18 Mar 2026 14:13:36 -0700
Subject: [PATCH 07/11] feat: GPU utilization + memory metrics in live
 commander center

- sampleResourceMetrics() parses ioreg for Apple Silicon MPS stats
- GPU utilization, renderer %, GPU memory, system memory tracked
- Sampled after each suite, included in live perfSummary
- 3 new hero cards: GPU Utilization, GPU Memory, System Memory
---
 .../scripts/generate-report.cjs               |  8 +++
 .../scripts/run-benchmark.cjs                 | 53 +++++++++++++++++++
 2 files changed, 61 insertions(+)

diff --git a/skills/analysis/home-security-benchmark/scripts/generate-report.cjs b/skills/analysis/home-security-benchmark/scripts/generate-report.cjs
index af5d886e..32eb86c1 100644
--- a/skills/analysis/home-security-benchmark/scripts/generate-report.cjs
+++ b/skills/analysis/home-security-benchmark/scripts/generate-report.cjs
@@ -509,6 +509,14 @@ function renderPerformance() {
     html += statCard('Server Decode', fmt(srvDecode), 'tok/s', 'From llama-server /metrics');
     html += statCard('Total Time', fmt(totalTime / 1000), 's', run.total + ' tests');
     html += statCard('Throughput', fmt(tokPerSec), 'tok/s', fmtK(run.tokens || 0) + ' total tokens');
+
+    // GPU & Memory cards (from resource samples)
+    const res = perf?.resource;
+    if (res) {
+        html += statCard('GPU Utilization', res.gpu ? res.gpu.util + '' : '—', '%', res.gpu ? 'Renderer: ' + res.gpu.renderer + '% · Tiler: ' + res.gpu.tiler + '%' : 'MPS not available');
+        html += statCard('GPU Memory', res.gpu?.memUsedGB != null ? fmt(res.gpu.memUsedGB) : '—', 'GB', res.gpu?.memAllocGB != null ? 'Alloc: ' + fmt(res.gpu.memAllocGB) + ' GB' : 'MPS not available');
+        html += statCard('System Memory', fmt(res.sys?.usedGB), 'GB', 'of ' + fmt(res.sys?.totalGB) + ' GB total · Free: ' + fmt(res.sys?.freeGB) + ' GB');
+    }
     html += '</div>';
 
     // Comparison table if multiple selected
diff --git a/skills/analysis/home-security-benchmark/scripts/run-benchmark.cjs b/skills/analysis/home-security-benchmark/scripts/run-benchmark.cjs
index b41a3a52..53301fdf 100644
--- a/skills/analysis/home-security-benchmark/scripts/run-benchmark.cjs
+++ b/skills/analysis/home-security-benchmark/scripts/run-benchmark.cjs
@@ -157,6 +157,7 @@ const results = {
     totals: { passed: 0, failed: 0, skipped: 0, total: 0, timeMs: 0 },
     tokenTotals: { prompt: 0, completion: 0, total: 0 },
     perfTotals: { ttftMs: [], decodeTokensPerSec: [], prefillTokensPerSec: null, serverDecodeTokensPerSec: null },
+    resourceSamples: [],  // GPU/memory snapshots taken after each suite
 };
 
 async function llmCall(messages, opts = {}) {
@@ -505,6 +506,52 @@ function assert(condition, msg) {
     if (!condition) throw new Error(msg || 'Assertion failed');
 }
 
+// ─── Resource Metrics (GPU/MPS + Memory) ─────────────────────────────────────
+
+/**
+ * Sample GPU (Apple Silicon MPS) utilization and system memory.
+ * Uses `ioreg` for GPU stats (no sudo needed).
+ */
+function sampleResourceMetrics() {
+    const os = require('os');
+    const BYTES_PER_GIB = 1073741824;
+    const toGB = (bytes) => parseFloat((bytes / BYTES_PER_GIB).toFixed(1));
+    const totalBytes = os.totalmem();
+    const freeBytes = os.freemem(); // read once so usedGB is derived from the same reading as freeGB
+    const sample = {
+        timestamp: new Date().toISOString(),
+        sys: { totalGB: toGB(totalBytes), freeGB: toGB(freeBytes), usedGB: toGB(totalBytes - freeBytes) },
+        process: { rssMB: parseFloat((process.memoryUsage().rss / 1048576).toFixed(0)) },
+        gpu: null,
+    };
+
+    // Apple Silicon GPU via ioreg (macOS only); gpu stays null on other platforms or if ioreg fails
+    if (process.platform === 'darwin') {
+        try {
+            const out = execSync('ioreg -r -c AGXAccelerator 2>/dev/null', { encoding: 'utf8', timeout: 3000 });
+            // Integer value of the first "<key>"=<digits> entry in the ioreg output, or null when absent
+            const m = (key) => { const match = out.match(new RegExp('"' + key + '"=(\\d+)')); return match ? Number.parseInt(match[1], 10) : null; };
+            const deviceUtil = m('Device Utilization %');
+            const rendererUtil = m('Renderer Utilization %');
+            const tilerUtil = m('Tiler Utilization %');
+            const memUsed = m('In use system memory');
+            const memAlloc = m('Alloc system memory');
+            if (deviceUtil !== null) {
+                sample.gpu = {
+                    util: deviceUtil,
+                    renderer: rendererUtil,
+                    tiler: tilerUtil,
+                    // Explicit null checks: a parsed value of 0 is real data, not a missing reading
+                    memUsedGB: memUsed !== null ? toGB(memUsed) : null,
+                    memAllocGB: memAlloc !== null ? toGB(memAlloc) : null,
+                };
+            }
+        } catch { /* ioreg not available or timed out */ }
+    }
+
+    return sample;
+}
+
 // ─── Live progress: intermediate saves + report regeneration ────────────────
 let _liveReportOpened = false;
 let _runStartedAt = null;     // Set when runSuites() begins
@@ -556,6 +603,7 @@ function saveLiveProgress(startedAt, suitesCompleted, totalSuites, nextSuiteName
                 prefillTokensPerSec: results.perfTotals.prefillTokensPerSec,
                 decodeTokensPerSec: results.perfTotals.serverDecodeTokensPerSec,
             },
+            resource: results.resourceSamples.length > 0 ? results.resourceSamples[results.resourceSamples.length - 1] : null,
         } : null;
 
         // Preserve previous runs in index for comparison sidebar
@@ -641,6 +689,11 @@ async function runSuites() {
 
         emit({ event: 'suite_end', suite: s.name, passed: currentSuite.passed, failed: currentSuite.failed, skipped: currentSuite.skipped, timeMs: currentSuite.timeMs });
 
+        // Sample resource metrics (GPU + memory) after each suite
+        const resourceSample = sampleResourceMetrics();
+        resourceSample.suite = s.name;
+        results.resourceSamples.push(resourceSample);
+
         // Scrape server metrics after each suite so live perf cards update
         await scrapeServerMetrics();
 

From 36ac255140d22cab120bc11374699a86cee95a51 Mon Sep 17 00:00:00 2001
From: Simba Zhang <solderzzc@gmail.com>
Date: Wed, 18 Mar 2026 14:24:36 -0700
Subject: [PATCH 08/11] fix: error handling for tab rendering + resource data
 in final index

---
 .../scripts/generate-report.cjs                      | 12 +++++++++---
 .../scripts/run-benchmark.cjs                        |  5 ++++-
 2 files changed, 13 insertions(+), 4 deletions(-)

diff --git a/skills/analysis/home-security-benchmark/scripts/generate-report.cjs b/skills/analysis/home-security-benchmark/scripts/generate-report.cjs
index 32eb86c1..8e003929 100644
--- a/skills/analysis/home-security-benchmark/scripts/generate-report.cjs
+++ b/skills/analysis/home-security-benchmark/scripts/generate-report.cjs
@@ -832,9 +832,15 @@ function getActiveTab() {
 
 function renderActiveTab() {
     const tab = getActiveTab();
-    if (tab === 'performance') renderPerformance();
-    else if (tab === 'quality') renderQuality();
-    else if (tab === 'vision') renderVision();
+    try {
+        if (tab === 'performance') renderPerformance();
+        else if (tab === 'quality') renderQuality();
+        else if (tab === 'vision') renderVision();
+    } catch (e) { // Show the failure inside the tab panel; the text is escaped because it is assigned via innerHTML
+        const panel = document.getElementById('tab-' + tab);
+        if (panel) panel.innerHTML = '<div style="color:var(--red);padding:2rem"><strong>Render error:</strong> ' + esc(String(e.message)) + '<br><pre>' + esc(String(e.stack || '')) + '</pre></div>';
+        console.error('Tab render error:', e);
+    }
 }
 
 // ═══════════════════════════════════════════════════════════════════════════════
diff --git a/skills/analysis/home-security-benchmark/scripts/run-benchmark.cjs b/skills/analysis/home-security-benchmark/scripts/run-benchmark.cjs
index 53301fdf..8598be17 100644
--- a/skills/analysis/home-security-benchmark/scripts/run-benchmark.cjs
+++ b/skills/analysis/home-security-benchmark/scripts/run-benchmark.cjs
@@ -2466,7 +2466,10 @@ async function main() {
         vlmPassed, vlmTotal,
         timeMs,
         tokens: results.tokenTotals.total,
-        perfSummary: results.perfSummary || null,
+        perfSummary: {
+            ...(results.perfSummary || {}),
+            resource: results.resourceSamples?.length > 0 ? results.resourceSamples[results.resourceSamples.length - 1] : null,
+        },
     });
     fs.writeFileSync(indexFile, JSON.stringify(index, null, 2));
 

From 6ea146391c6a243722629947e67b83d01c561907 Mon Sep 17 00:00:00 2001
From: Simba Zhang <solderzzc@gmail.com>
Date: Wed, 18 Mar 2026 14:28:47 -0700
Subject: [PATCH 09/11] fix: persist selection and primary index across live
 reloads via sessionStorage

---
 .../scripts/generate-report.cjs                    | 14 ++++++++++++++
 1 file changed, 14 insertions(+)

diff --git a/skills/analysis/home-security-benchmark/scripts/generate-report.cjs b/skills/analysis/home-security-benchmark/scripts/generate-report.cjs
index 8e003929..365163ec 100644
--- a/skills/analysis/home-security-benchmark/scripts/generate-report.cjs
+++ b/skills/analysis/home-security-benchmark/scripts/generate-report.cjs
@@ -861,11 +861,25 @@ function saveState() {
     try {
         sessionStorage.setItem('_bench_tab', getActiveTab());
         sessionStorage.setItem('_bench_scroll', String(window.scrollY));
+        sessionStorage.setItem('_bench_selected', JSON.stringify([...selectedIndices]));
+        sessionStorage.setItem('_bench_primary', String(primaryIndex));
     } catch {}
 }
 
 function restoreState() {
     try {
+        // Restore selection
+        const savedSel = sessionStorage.getItem('_bench_selected');
+        if (savedSel) {
+            const arr = JSON.parse(savedSel).filter(i => i >= 0 && i < ALL_RUNS.length);
+            if (arr.length > 0) { selectedIndices = new Set(arr); }
+        }
+        const savedPrimary = sessionStorage.getItem('_bench_primary');
+        if (savedPrimary != null) {
+            const pi = parseInt(savedPrimary);
+            if (pi >= 0 && pi < ALL_RUNS.length) primaryIndex = pi;
+        }
+        // Restore tab
         const tab = sessionStorage.getItem('_bench_tab');
         if (tab && tab !== 'performance') {
             document.querySelectorAll('.tab').forEach(t => t.classList.remove('active'));

From 90e11c41a8a25a097003c155b7479f5529559faa Mon Sep 17 00:00:00 2001
From: Simba Zhang <solderzzc@gmail.com>
Date: Wed, 18 Mar 2026 14:33:36 -0700
Subject: [PATCH 10/11] feat: high-level quality comparison table (pass rate,
 LLM/VLM, time, throughput)

---
 .../scripts/generate-report.cjs               | 30 ++++++++++++++++++-
 1 file changed, 29 insertions(+), 1 deletion(-)

diff --git a/skills/analysis/home-security-benchmark/scripts/generate-report.cjs b/skills/analysis/home-security-benchmark/scripts/generate-report.cjs
index 365163ec..f625d166 100644
--- a/skills/analysis/home-security-benchmark/scripts/generate-report.cjs
+++ b/skills/analysis/home-security-benchmark/scripts/generate-report.cjs
@@ -620,7 +620,35 @@ function renderQuality() {
 
     // Multi-run comparison
     if (sel.length > 1) {
-        html += '<div class="section-title">Quality Comparison</div>';
+        // High-level summary comparison
+        html += '<div class="section-title">Overall Comparison</div>';
+        html += '<div class="table-wrap"><table class="compare-table"><thead><tr><th>Metric</th>';
+        for (const r of sel) html += '<th class="model-col">' + esc(modelShort(r.model)) + '<br><span style="font-weight:400;font-size:0.68rem">' + shortDate(r.timestamp) + '</span></th>';
+        html += '</tr></thead><tbody>';
+        const hiRows = [
+            ['Pass Rate', r => r.total > 0 ? pct(r.passed, r.total) + '%' : '—'],
+            ['Score', r => r.passed + '/' + r.total],
+            ['LLM Score', r => r.llmTotal > 0 ? (r.llmPassed || 0) + '/' + (r.llmTotal || 0) : '—'],
+            ['VLM Score', r => r.vlmTotal > 0 ? (r.vlmPassed || 0) + '/' + (r.vlmTotal || 0) : '—'],
+            ['Failed', r => String(r.failed)],
+            ['Time', r => fmt(r.timeMs / 1000) + 's'],
+            ['Throughput', r => r.timeMs > 0 && r.tokens ? fmt(r.tokens / (r.timeMs / 1000)) + ' tok/s' : '—'],
+        ];
+        for (const [label, fn] of hiRows) {
+            html += '<tr><td>' + label + '</td>';
+            // Highlight the best cell: fewest failures, or the highest pass rate computed from each run's own total
+            const vals = sel.map(fn);
+            const bestRate = Math.max(...sel.map(r => r.total > 0 ? r.passed / r.total : -1));
+            for (let i = 0; i < sel.length; i++) {
+                const isBest = label === 'Failed' ? vals[i] === String(Math.min(...sel.map(r => r.failed))) : label === 'Pass Rate' && sel[i].total > 0 && sel[i].passed / sel[i].total === bestRate;
+                html += '<td' + (isBest && sel.length > 1 ? ' style="color:var(--green);font-weight:600"' : '') + '>' + vals[i] + '</td>';
+            }
+            html += '</tr>';
+        }
+        html += '</tbody></table></div>';
+
+        // Per-suite breakdown
+        html += '<div class="section-title">Suite Comparison</div>';
         html += '<div class="table-wrap"><table class="compare-table"><thead><tr><th>Suite</th>';
         for (const r of sel) html += '<th class="model-col">' + esc(modelShort(r.model)) + '</th>';
         html += '</tr></thead><tbody>';

From 40d5f64c24a21cd0b1f5bd0c3fcff5e4e32a033c Mon Sep 17 00:00:00 2001
From: Simba Zhang <solderzzc@gmail.com>
Date: Wed, 18 Mar 2026 14:34:48 -0700
Subject: [PATCH 11/11] fix: hide VLM Score row when no runs have VLM data

---
 .../home-security-benchmark/scripts/generate-report.cjs        | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

diff --git a/skills/analysis/home-security-benchmark/scripts/generate-report.cjs b/skills/analysis/home-security-benchmark/scripts/generate-report.cjs
index f625d166..d5dda66d 100644
--- a/skills/analysis/home-security-benchmark/scripts/generate-report.cjs
+++ b/skills/analysis/home-security-benchmark/scripts/generate-report.cjs
@@ -625,11 +625,12 @@ function renderQuality() {
         html += '<div class="table-wrap"><table class="compare-table"><thead><tr><th>Metric</th>';
         for (const r of sel) html += '<th class="model-col">' + esc(modelShort(r.model)) + '<br><span style="font-weight:400;font-size:0.68rem">' + shortDate(r.timestamp) + '</span></th>';
         html += '</tr></thead><tbody>';
+        const hasVlm = sel.some(r => r.vlmTotal > 0);
         const hiRows = [
             ['Pass Rate', r => r.total > 0 ? pct(r.passed, r.total) + '%' : '—'],
             ['Score', r => r.passed + '/' + r.total],
             ['LLM Score', r => r.llmTotal > 0 ? (r.llmPassed || 0) + '/' + (r.llmTotal || 0) : '—'],
-            ['VLM Score', r => r.vlmTotal > 0 ? (r.vlmPassed || 0) + '/' + (r.vlmTotal || 0) : '—'],
+            ...(hasVlm ? [['VLM Score', r => r.vlmTotal > 0 ? (r.vlmPassed || 0) + '/' + (r.vlmTotal || 0) : '—']] : []),
             ['Failed', r => String(r.failed)],
             ['Time', r => fmt(r.timeMs / 1000) + 's'],
             ['Throughput', r => r.timeMs > 0 && r.tokens ? fmt(r.tokens / (r.timeMs / 1000)) + ' tok/s' : '—'],