From a0c9a448dcc9ea6aec8e144144a289c55c4bd47c Mon Sep 17 00:00:00 2001 From: Simba Zhang Date: Wed, 18 Mar 2026 13:31:24 -0700 Subject: [PATCH 01/11] feat: emit open_report event for Aegis embedded browser --- .../scripts/run-benchmark.cjs | 19 +++++++++++++------ 1 file changed, 13 insertions(+), 6 deletions(-) diff --git a/skills/analysis/home-security-benchmark/scripts/run-benchmark.cjs b/skills/analysis/home-security-benchmark/scripts/run-benchmark.cjs index 3193e7f8..ed61d218 100644 --- a/skills/analysis/home-security-benchmark/scripts/run-benchmark.cjs +++ b/skills/analysis/home-security-benchmark/scripts/run-benchmark.cjs @@ -560,12 +560,19 @@ function saveLiveProgress(startedAt, suitesCompleted, totalSuites, nextSuiteName }); // Open browser on first save (so user sees live progress from the start) - if (!_liveReportOpened && !NO_OPEN && !IS_SKILL_MODE && reportPath) { - try { - const openCmd = process.platform === 'darwin' ? 'open' : 'xdg-open'; - execSync(`${openCmd} "${reportPath}"`, { stdio: 'ignore' }); - log(' ๐Ÿ“Š Live report opened in browser (auto-refreshes every 5s)'); - } catch { } + if (!_liveReportOpened && !NO_OPEN && reportPath) { + if (IS_SKILL_MODE) { + // Ask Aegis to open in its embedded browser window + emit({ event: 'open_report', reportPath }); + log(' ๐Ÿ“Š Requested Aegis to open live report'); + } else { + // Standalone: open in system browser + try { + const openCmd = process.platform === 'darwin' ? 
'open' : 'xdg-open'; + execSync(`${openCmd} "${reportPath}"`, { stdio: 'ignore' }); + log(' ๐Ÿ“Š Live report opened in browser (auto-refreshes every 5s)'); + } catch { } + } _liveReportOpened = true; } } catch (err) { From 5d001e8df3e96e876078cd92919d62b274cab67c Mon Sep 17 00:00:00 2001 From: Simba Zhang Date: Wed, 18 Mar 2026 13:38:19 -0700 Subject: [PATCH 02/11] feat: per-test live progress updates in commander center - saveLiveProgress() called after each test, not just each suite - Include in-progress suite in live data for Quality/Vision tabs - Skip fixture image embedding in live mode (~43MB savings per regeneration) - Enhanced live banner with test name and test count --- .../scripts/generate-report.cjs | 11 +++--- .../scripts/run-benchmark.cjs | 36 +++++++++++++++---- 2 files changed, 36 insertions(+), 11 deletions(-) diff --git a/skills/analysis/home-security-benchmark/scripts/generate-report.cjs b/skills/analysis/home-security-benchmark/scripts/generate-report.cjs index cb44bc41..556ac5e0 100644 --- a/skills/analysis/home-security-benchmark/scripts/generate-report.cjs +++ b/skills/analysis/home-security-benchmark/scripts/generate-report.cjs @@ -61,8 +61,9 @@ function generateReport(resultsDir = RESULTS_DIR, opts = {}) { }).filter(r => r.data); // Load fixture images for Vision tab (base64) + // Skip in live mode โ€” saves ~43MB of base64 per regeneration, making per-test updates instant const fixtureImages = {}; - if (fs.existsSync(FIXTURES_DIR)) { + if (!liveMode && fs.existsSync(FIXTURES_DIR)) { try { const frames = fs.readdirSync(FIXTURES_DIR).filter(f => /\.(png|jpg|jpeg)$/i.test(f)); for (const f of frames) { @@ -865,15 +866,17 @@ function buildLiveBanner(status) { if (!status) { return `
Benchmark starting\u2026
`; } - const { suitesCompleted = 0, totalSuites = 0, currentSuite = '', startedAt = '' } = status; + const { suitesCompleted = 0, totalSuites = 0, currentSuite = '', currentTest = '', testsCompleted = 0, startedAt = '' } = status; const pct = totalSuites > 0 ? Math.round((suitesCompleted / totalSuites) * 100) : 0; const elapsed = startedAt ? Math.round((Date.now() - new Date(startedAt).getTime()) / 1000) : 0; const elapsedStr = elapsed > 60 ? Math.floor(elapsed / 60) + 'm ' + (elapsed % 60) + 's' : elapsed + 's'; + const testInfo = currentTest ? ` โ€” โœ… ${escHtml(currentTest)}` : ''; return `
LIVE — Suite ${suitesCompleted}/${totalSuites} (${pct}%) - ${currentSuite ? ' — ' + currentSuite + '' : ''} - ${elapsedStr} elapsed + ${currentSuite ? ' — 🔧 ' + escHtml(currentSuite) + '' : ''} + ${testInfo} + ${testsCompleted} tests · ${elapsedStr} elapsed
`; } diff --git a/skills/analysis/home-security-benchmark/scripts/run-benchmark.cjs b/skills/analysis/home-security-benchmark/scripts/run-benchmark.cjs index ed61d218..4f50e56e 100644 --- a/skills/analysis/home-security-benchmark/scripts/run-benchmark.cjs +++ b/skills/analysis/home-security-benchmark/scripts/run-benchmark.cjs @@ -507,21 +507,31 @@ function assert(condition, msg) { // โ”€โ”€โ”€ Live progress: intermediate saves + report regeneration โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€ let _liveReportOpened = false; +let _runStartedAt = null; // Set when runSuites() begins +let _currentTestName = null; // Set during test execution for live banner +let _currentSuiteIndex = 0; // Current suite index for live progress +let _totalSuites = 0; // Total number of suites /** * Save the current (in-progress) results to disk and regenerate the live report. - * Called after each suite completes so the browser auto-refreshes with updated data. + * Called after each test completes so the browser auto-refreshes with updated data. 
*/ -function saveLiveProgress(startedAt, suitesCompleted, totalSuites, nextSuiteName) { +function saveLiveProgress(startedAt, suitesCompleted, totalSuites, nextSuiteName, currentTest) { try { fs.mkdirSync(RESULTS_DIR, { recursive: true }); // Save current results as a live file (will be overwritten each time) const liveFile = path.join(RESULTS_DIR, '_live_progress.json'); + // Include the in-progress suite so Quality/Vision tabs can render partial data + const liveSuites = [...results.suites]; + if (currentSuite && currentSuite.tests.length > 0 && !results.suites.includes(currentSuite)) { + liveSuites.push(currentSuite); + } const liveResults = { ...results, + suites: liveSuites, _live: true, - _progress: { suitesCompleted, totalSuites, startedAt }, + _progress: { suitesCompleted, totalSuites, startedAt, currentTest: currentTest || null }, }; fs.writeFileSync(liveFile, JSON.stringify(liveResults, null, 2)); @@ -549,12 +559,16 @@ function saveLiveProgress(startedAt, suitesCompleted, totalSuites, nextSuiteName // Clear require cache to pick up any code changes delete require.cache[require.resolve(reportScript)]; const { generateReport } = require(reportScript); + const testsCompleted = liveSuites.reduce((n, s) => n + s.tests.length, 0); + const testsTotal = liveSuites.reduce((n, s) => n + s.tests.length, 0) + (currentTest ? 
0 : 0); const reportPath = generateReport(RESULTS_DIR, { liveMode: true, liveStatus: { suitesCompleted, totalSuites, - currentSuite: nextSuiteName || 'Finishing...', + currentSuite: currentSuite?.name || nextSuiteName || 'Finishing...', + currentTest: currentTest || null, + testsCompleted, startedAt, }, }); @@ -582,9 +596,11 @@ function saveLiveProgress(startedAt, suitesCompleted, totalSuites, nextSuiteName } async function runSuites() { - const startedAt = new Date().toISOString(); + _runStartedAt = new Date().toISOString(); + _totalSuites = suites.length; for (let si = 0; si < suites.length; si++) { const s = suites[si]; + _currentSuiteIndex = si; currentSuite = { name: s.name, tests: [], passed: 0, failed: 0, skipped: 0, timeMs: 0 }; log(`\n${'โ”€'.repeat(60)}`); log(` ${s.name}`); @@ -601,8 +617,8 @@ async function runSuites() { emit({ event: 'suite_end', suite: s.name, passed: currentSuite.passed, failed: currentSuite.failed, skipped: currentSuite.skipped, timeMs: currentSuite.timeMs }); - // Live progress: save intermediate results + regenerate report after each suite - saveLiveProgress(startedAt, si + 1, suites.length, si + 1 < suites.length ? suites[si + 1]?.name : null); + // Live progress: save after suite (also saved per-test, but suite boundary is a clean checkpoint) + saveLiveProgress(_runStartedAt, si + 1, suites.length, si + 1 < suites.length ? 
suites[si + 1]?.name : null); } } @@ -648,6 +664,12 @@ async function test(name, fn) { currentSuite.timeMs += testResult.timeMs; currentSuite.tests.push(testResult); emit({ event: 'test_result', suite: currentSuite.name, test: name, status: testResult.status, timeMs: testResult.timeMs, detail: testResult.detail.slice(0, 120), tokens: testResult.tokens, perf: testResult.perf }); + + // Live progress: save after each test for real-time updates in commander center + if (_runStartedAt) { + _currentTestName = null; // Test just completed + saveLiveProgress(_runStartedAt, _currentSuiteIndex, _totalSuites, null, name); + } } function skip(name, reason) { From 2309e54582cdb8312ca6f81adc82cd9c3a20bfe5 Mon Sep 17 00:00:00 2001 From: Simba Zhang Date: Wed, 18 Mar 2026 13:48:25 -0700 Subject: [PATCH 03/11] fix: syntax error in collapsed toggle + stateful live reload - Use HTML entities (') for quotes in onclick to avoid multi-level escaping - Replace with JS setTimeout for stateful reload - Preserve active tab + scroll position across refreshes via sessionStorage --- .../scripts/generate-report.cjs | 39 +++++++++++++++++-- 1 file changed, 36 insertions(+), 3 deletions(-) diff --git a/skills/analysis/home-security-benchmark/scripts/generate-report.cjs b/skills/analysis/home-security-benchmark/scripts/generate-report.cjs index 556ac5e0..af5d886e 100644 --- a/skills/analysis/home-security-benchmark/scripts/generate-report.cjs +++ b/skills/analysis/home-security-benchmark/scripts/generate-report.cjs @@ -132,8 +132,8 @@ function buildHTML(allResults, fixtureImages, { liveMode = false, liveStatus = n const fixtureJSON = JSON.stringify(fixtureImages); - // Live mode: auto-refresh meta tag - const refreshMeta = liveMode ? '' : ''; + // Live mode: JS-based reload (stateful, preserves active tab + scroll) + const refreshMeta = ''; const liveBannerHTML = liveMode ? 
buildLiveBanner(liveStatus) : ''; return ` @@ -435,7 +435,7 @@ function buildSidebar() { let html = ''; for (const [family, runs] of Object.entries(groups)) { html += '
'; - html += '
▾ ' + esc(family) + ' (' + runs.length + ')
'; + html += '
▾ ' + esc(family) + ' (' + runs.length + ')
'; html += '
'; for (const r of runs.reverse()) { const sel = selectedIndices.has(r._idx); @@ -838,6 +838,38 @@ function refresh() { renderActiveTab(); } +// โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ• +// LIVE RELOAD (stateful โ€” preserves tab + scroll) +// โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ• +const IS_LIVE = ${liveMode ? 'true' : 'false'}; + +function saveState() { + try { + sessionStorage.setItem('_bench_tab', getActiveTab()); + sessionStorage.setItem('_bench_scroll', String(window.scrollY)); + } catch {} +} + +function restoreState() { + try { + const tab = sessionStorage.getItem('_bench_tab'); + if (tab && tab !== 'performance') { + document.querySelectorAll('.tab').forEach(t => t.classList.remove('active')); + document.querySelectorAll('.tab-panel').forEach(p => p.classList.remove('active')); + const tabEl = document.querySelector('.tab[data-tab="' + tab + '"]'); + if (tabEl) tabEl.classList.add('active'); + const panel = document.getElementById('tab-' + tab); + if (panel) panel.classList.add('active'); + } + const scroll = parseInt(sessionStorage.getItem('_bench_scroll') || '0'); + if (scroll > 0) setTimeout(() => window.scrollTo(0, scroll), 50); + } catch {} +} + +if (IS_LIVE) { + setTimeout(() => { saveState(); location.reload(); }, 5000); +} + // โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ• // INIT // 
โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ• @@ -847,6 +879,7 @@ document.getElementById('btn-compare').addEventListener('click', () => { if (selectedIndices.size > 1) renderActiveTab(); }); +restoreState(); refresh(); From e46f6a5bab5a4289920ac33619244c9cc145c39a Mon Sep 17 00:00:00 2001 From: Simba Zhang Date: Wed, 18 Mar 2026 13:54:27 -0700 Subject: [PATCH 04/11] fix: live performance metrics + collapsed syntax error + stateful reload - Compute live perfSummary from accumulated TTFT/decode arrays - TTFT, Decode Speed, Server Prefill/Decode now update in real-time - Fix SyntaxError: use HTML entities for collapsed toggle onclick - Replace meta refresh with JS setTimeout + sessionStorage state --- .../scripts/run-benchmark.cjs | 23 ++++++++++++++++++- 1 file changed, 22 insertions(+), 1 deletion(-) diff --git a/skills/analysis/home-security-benchmark/scripts/run-benchmark.cjs b/skills/analysis/home-security-benchmark/scripts/run-benchmark.cjs index 4f50e56e..fd9217a1 100644 --- a/skills/analysis/home-security-benchmark/scripts/run-benchmark.cjs +++ b/skills/analysis/home-security-benchmark/scripts/run-benchmark.cjs @@ -537,6 +537,27 @@ function saveLiveProgress(startedAt, suitesCompleted, totalSuites, nextSuiteName // Build a temporary index with just the live file const indexFile = path.join(RESULTS_DIR, 'index.json'); + + // Compute live performance summary from accumulated data + const ttftArr = [...results.perfTotals.ttftMs]; + const decArr = [...results.perfTotals.decodeTokensPerSec]; + const livePerfSummary = (ttftArr.length > 0 || decArr.length > 0) ? { + ttft: ttftArr.length > 0 ? 
{ + avgMs: Math.round(ttftArr.reduce((a, b) => a + b, 0) / ttftArr.length), + p50Ms: [...ttftArr].sort((a, b) => a - b)[Math.floor(ttftArr.length * 0.5)], + p95Ms: [...ttftArr].sort((a, b) => a - b)[Math.floor(ttftArr.length * 0.95)], + samples: ttftArr.length, + } : null, + decode: decArr.length > 0 ? { + avgTokensPerSec: parseFloat((decArr.reduce((a, b) => a + b, 0) / decArr.length).toFixed(1)), + samples: decArr.length, + } : null, + server: { + prefillTokensPerSec: results.perfTotals.prefillTokensPerSec, + decodeTokensPerSec: results.perfTotals.serverDecodeTokensPerSec, + }, + } : null; + const liveIndex = [{ file: '_live_progress.json', model: results.model.name || 'loading...', @@ -550,7 +571,7 @@ function saveLiveProgress(startedAt, suitesCompleted, totalSuites, nextSuiteName vlmPassed: 0, vlmTotal: 0, timeMs: Date.now() - new Date(startedAt).getTime(), tokens: results.tokenTotals.total, - perfSummary: null, + perfSummary: livePerfSummary, }]; fs.writeFileSync(indexFile, JSON.stringify(liveIndex, null, 2)); From c59668ee25fff6f260c5c608d1577a6670f41a76 Mon Sep 17 00:00:00 2001 From: Simba Zhang Date: Wed, 18 Mar 2026 13:58:56 -0700 Subject: [PATCH 05/11] fix: scrape server metrics after each suite for live prefill/decode stats --- .../analysis/home-security-benchmark/scripts/run-benchmark.cjs | 3 +++ 1 file changed, 3 insertions(+) diff --git a/skills/analysis/home-security-benchmark/scripts/run-benchmark.cjs b/skills/analysis/home-security-benchmark/scripts/run-benchmark.cjs index fd9217a1..45004982 100644 --- a/skills/analysis/home-security-benchmark/scripts/run-benchmark.cjs +++ b/skills/analysis/home-security-benchmark/scripts/run-benchmark.cjs @@ -638,6 +638,9 @@ async function runSuites() { emit({ event: 'suite_end', suite: s.name, passed: currentSuite.passed, failed: currentSuite.failed, skipped: currentSuite.skipped, timeMs: currentSuite.timeMs }); + // Scrape server metrics after each suite so live perf cards update + await scrapeServerMetrics(); + 
// Live progress: save after suite (also saved per-test, but suite boundary is a clean checkpoint) saveLiveProgress(_runStartedAt, si + 1, suites.length, si + 1 < suites.length ? suites[si + 1]?.name : null); } From 8f4334283ff3c1a0ec5205bc3eb4e9c56478c0cb Mon Sep 17 00:00:00 2001 From: Simba Zhang Date: Wed, 18 Mar 2026 14:07:03 -0700 Subject: [PATCH 06/11] fix: preserve previous runs in live index for comparison sidebar --- .../home-security-benchmark/scripts/run-benchmark.cjs | 9 ++++++--- 1 file changed, 6 insertions(+), 3 deletions(-) diff --git a/skills/analysis/home-security-benchmark/scripts/run-benchmark.cjs b/skills/analysis/home-security-benchmark/scripts/run-benchmark.cjs index 45004982..b41a3a52 100644 --- a/skills/analysis/home-security-benchmark/scripts/run-benchmark.cjs +++ b/skills/analysis/home-security-benchmark/scripts/run-benchmark.cjs @@ -558,7 +558,10 @@ function saveLiveProgress(startedAt, suitesCompleted, totalSuites, nextSuiteName }, } : null; - const liveIndex = [{ + // Preserve previous runs in index for comparison sidebar + let existingIndex = []; + try { existingIndex = JSON.parse(fs.readFileSync(indexFile, 'utf8')).filter(e => e.file !== '_live_progress.json'); } catch { } + const liveEntry = { file: '_live_progress.json', model: results.model.name || 'loading...', vlm: results.model.vlm || null, @@ -572,8 +575,8 @@ function saveLiveProgress(startedAt, suitesCompleted, totalSuites, nextSuiteName timeMs: Date.now() - new Date(startedAt).getTime(), tokens: results.tokenTotals.total, perfSummary: livePerfSummary, - }]; - fs.writeFileSync(indexFile, JSON.stringify(liveIndex, null, 2)); + }; + fs.writeFileSync(indexFile, JSON.stringify([...existingIndex, liveEntry], null, 2)); // Regenerate report in live mode const reportScript = path.join(__dirname, 'generate-report.cjs'); From 74c03678b5adeb573e8383acd4bec19bb4194132 Mon Sep 17 00:00:00 2001 From: Simba Zhang Date: Wed, 18 Mar 2026 14:13:36 -0700 Subject: [PATCH 07/11] feat: GPU 
utilization + memory metrics in live commander center - sampleResourceMetrics() parses ioreg for Apple Silicon MPS stats - GPU utilization, renderer %, GPU memory, system memory tracked - Sampled after each suite, included in live perfSummary - 3 new hero cards: GPU Utilization, GPU Memory, System Memory --- .../scripts/generate-report.cjs | 8 +++ .../scripts/run-benchmark.cjs | 53 +++++++++++++++++++ 2 files changed, 61 insertions(+) diff --git a/skills/analysis/home-security-benchmark/scripts/generate-report.cjs b/skills/analysis/home-security-benchmark/scripts/generate-report.cjs index af5d886e..32eb86c1 100644 --- a/skills/analysis/home-security-benchmark/scripts/generate-report.cjs +++ b/skills/analysis/home-security-benchmark/scripts/generate-report.cjs @@ -509,6 +509,14 @@ function renderPerformance() { html += statCard('Server Decode', fmt(srvDecode), 'tok/s', 'From llama-server /metrics'); html += statCard('Total Time', fmt(totalTime / 1000), 's', run.total + ' tests'); html += statCard('Throughput', fmt(tokPerSec), 'tok/s', fmtK(run.tokens || 0) + ' total tokens'); + + // GPU & Memory cards (from resource samples) + const res = perf?.resource; + if (res) { + html += statCard('GPU Utilization', res.gpu ? res.gpu.util + '' : 'โ€”', '%', res.gpu ? 'Renderer: ' + res.gpu.renderer + '% ยท Tiler: ' + res.gpu.tiler + '%' : 'MPS not available'); + html += statCard('GPU Memory', res.gpu?.memUsedGB != null ? fmt(res.gpu.memUsedGB) : 'โ€”', 'GB', res.gpu?.memAllocGB != null ? 'Alloc: ' + fmt(res.gpu.memAllocGB) + ' GB' : 'MPS not available'); + html += statCard('System Memory', fmt(res.sys?.usedGB), 'GB', 'of ' + fmt(res.sys?.totalGB) + ' GB total ยท Free: ' + fmt(res.sys?.freeGB) + ' GB'); + } html += '
'; // Comparison table if multiple selected diff --git a/skills/analysis/home-security-benchmark/scripts/run-benchmark.cjs b/skills/analysis/home-security-benchmark/scripts/run-benchmark.cjs index b41a3a52..53301fdf 100644 --- a/skills/analysis/home-security-benchmark/scripts/run-benchmark.cjs +++ b/skills/analysis/home-security-benchmark/scripts/run-benchmark.cjs @@ -157,6 +157,7 @@ const results = { totals: { passed: 0, failed: 0, skipped: 0, total: 0, timeMs: 0 }, tokenTotals: { prompt: 0, completion: 0, total: 0 }, perfTotals: { ttftMs: [], decodeTokensPerSec: [], prefillTokensPerSec: null, serverDecodeTokensPerSec: null }, + resourceSamples: [], // GPU/memory snapshots taken after each suite }; async function llmCall(messages, opts = {}) { @@ -505,6 +506,52 @@ function assert(condition, msg) { if (!condition) throw new Error(msg || 'Assertion failed'); } +// โ”€โ”€โ”€ Resource Metrics (GPU/MPS + Memory) โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€ + +/** + * Sample GPU (Apple Silicon MPS) utilization and system memory. + * Uses `ioreg` for GPU stats (no sudo needed). + */ +function sampleResourceMetrics() { + const os = require('os'); + const sample = { + timestamp: new Date().toISOString(), + sys: { + totalGB: parseFloat((os.totalmem() / 1073741824).toFixed(1)), + freeGB: parseFloat((os.freemem() / 1073741824).toFixed(1)), + usedGB: parseFloat(((os.totalmem() - os.freemem()) / 1073741824).toFixed(1)), + }, + process: { + rssMB: parseFloat((process.memoryUsage().rss / 1048576).toFixed(0)), + }, + gpu: null, + }; + + // Apple Silicon GPU via ioreg (macOS only) + if (process.platform === 'darwin') { + try { + const out = execSync('ioreg -r -c AGXAccelerator 2>/dev/null', { encoding: 'utf8', timeout: 3000 }); + const m = (key) => { const r = new RegExp('"' + key + '"=(\\d+)'); const match = out.match(r); return match ? 
parseInt(match[1]) : null; }; + const deviceUtil = m('Device Utilization %'); + const rendererUtil = m('Renderer Utilization %'); + const tilerUtil = m('Tiler Utilization %'); + const memUsed = m('In use system memory'); + const memAlloc = m('Alloc system memory'); + if (deviceUtil !== null) { + sample.gpu = { + util: deviceUtil, + renderer: rendererUtil, + tiler: tilerUtil, + memUsedGB: memUsed ? parseFloat((memUsed / 1073741824).toFixed(1)) : null, + memAllocGB: memAlloc ? parseFloat((memAlloc / 1073741824).toFixed(1)) : null, + }; + } + } catch { /* ioreg not available or timed out */ } + } + + return sample; +} + // โ”€โ”€โ”€ Live progress: intermediate saves + report regeneration โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€ let _liveReportOpened = false; let _runStartedAt = null; // Set when runSuites() begins @@ -556,6 +603,7 @@ function saveLiveProgress(startedAt, suitesCompleted, totalSuites, nextSuiteName prefillTokensPerSec: results.perfTotals.prefillTokensPerSec, decodeTokensPerSec: results.perfTotals.serverDecodeTokensPerSec, }, + resource: results.resourceSamples.length > 0 ? 
results.resourceSamples[results.resourceSamples.length - 1] : null, } : null; // Preserve previous runs in index for comparison sidebar @@ -641,6 +689,11 @@ async function runSuites() { emit({ event: 'suite_end', suite: s.name, passed: currentSuite.passed, failed: currentSuite.failed, skipped: currentSuite.skipped, timeMs: currentSuite.timeMs }); + // Sample resource metrics (GPU + memory) after each suite + const resourceSample = sampleResourceMetrics(); + resourceSample.suite = s.name; + results.resourceSamples.push(resourceSample); + // Scrape server metrics after each suite so live perf cards update await scrapeServerMetrics(); From 36ac255140d22cab120bc11374699a86cee95a51 Mon Sep 17 00:00:00 2001 From: Simba Zhang Date: Wed, 18 Mar 2026 14:24:36 -0700 Subject: [PATCH 08/11] fix: error handling for tab rendering + resource data in final index --- .../scripts/generate-report.cjs | 12 +++++++++--- .../scripts/run-benchmark.cjs | 5 ++++- 2 files changed, 13 insertions(+), 4 deletions(-) diff --git a/skills/analysis/home-security-benchmark/scripts/generate-report.cjs b/skills/analysis/home-security-benchmark/scripts/generate-report.cjs index 32eb86c1..8e003929 100644 --- a/skills/analysis/home-security-benchmark/scripts/generate-report.cjs +++ b/skills/analysis/home-security-benchmark/scripts/generate-report.cjs @@ -832,9 +832,15 @@ function getActiveTab() { function renderActiveTab() { const tab = getActiveTab(); - if (tab === 'performance') renderPerformance(); - else if (tab === 'quality') renderQuality(); - else if (tab === 'vision') renderVision(); + try { + if (tab === 'performance') renderPerformance(); + else if (tab === 'quality') renderQuality(); + else if (tab === 'vision') renderVision(); + } catch (e) { + const panel = document.getElementById('tab-' + tab); + if (panel) panel.innerHTML = '
Render error: ' + e.message + '
' + e.stack + '
'; + console.error('Tab render error:', e); + } } // โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ•โ• diff --git a/skills/analysis/home-security-benchmark/scripts/run-benchmark.cjs b/skills/analysis/home-security-benchmark/scripts/run-benchmark.cjs index 53301fdf..8598be17 100644 --- a/skills/analysis/home-security-benchmark/scripts/run-benchmark.cjs +++ b/skills/analysis/home-security-benchmark/scripts/run-benchmark.cjs @@ -2466,7 +2466,10 @@ async function main() { vlmPassed, vlmTotal, timeMs, tokens: results.tokenTotals.total, - perfSummary: results.perfSummary || null, + perfSummary: { + ...(results.perfSummary || {}), + resource: results.resourceSamples?.length > 0 ? results.resourceSamples[results.resourceSamples.length - 1] : null, + }, }); fs.writeFileSync(indexFile, JSON.stringify(index, null, 2)); From 6ea146391c6a243722629947e67b83d01c561907 Mon Sep 17 00:00:00 2001 From: Simba Zhang Date: Wed, 18 Mar 2026 14:28:47 -0700 Subject: [PATCH 09/11] fix: persist selection and primary index across live reloads via sessionStorage --- .../scripts/generate-report.cjs | 14 ++++++++++++++ 1 file changed, 14 insertions(+) diff --git a/skills/analysis/home-security-benchmark/scripts/generate-report.cjs b/skills/analysis/home-security-benchmark/scripts/generate-report.cjs index 8e003929..365163ec 100644 --- a/skills/analysis/home-security-benchmark/scripts/generate-report.cjs +++ b/skills/analysis/home-security-benchmark/scripts/generate-report.cjs @@ -861,11 +861,25 @@ function saveState() { try { sessionStorage.setItem('_bench_tab', getActiveTab()); sessionStorage.setItem('_bench_scroll', String(window.scrollY)); + sessionStorage.setItem('_bench_selected', JSON.stringify([...selectedIndices])); + sessionStorage.setItem('_bench_primary', String(primaryIndex)); } catch {} } function 
restoreState() { try { + // Restore selection + const savedSel = sessionStorage.getItem('_bench_selected'); + if (savedSel) { + const arr = JSON.parse(savedSel).filter(i => i >= 0 && i < ALL_RUNS.length); + if (arr.length > 0) { selectedIndices = new Set(arr); } + } + const savedPrimary = sessionStorage.getItem('_bench_primary'); + if (savedPrimary != null) { + const pi = parseInt(savedPrimary); + if (pi >= 0 && pi < ALL_RUNS.length) primaryIndex = pi; + } + // Restore tab const tab = sessionStorage.getItem('_bench_tab'); if (tab && tab !== 'performance') { document.querySelectorAll('.tab').forEach(t => t.classList.remove('active')); From 90e11c41a8a25a097003c155b7479f5529559faa Mon Sep 17 00:00:00 2001 From: Simba Zhang Date: Wed, 18 Mar 2026 14:33:36 -0700 Subject: [PATCH 10/11] feat: high-level quality comparison table (pass rate, LLM/VLM, time, throughput) --- .../scripts/generate-report.cjs | 30 ++++++++++++++++++- 1 file changed, 29 insertions(+), 1 deletion(-) diff --git a/skills/analysis/home-security-benchmark/scripts/generate-report.cjs b/skills/analysis/home-security-benchmark/scripts/generate-report.cjs index 365163ec..f625d166 100644 --- a/skills/analysis/home-security-benchmark/scripts/generate-report.cjs +++ b/skills/analysis/home-security-benchmark/scripts/generate-report.cjs @@ -620,7 +620,35 @@ function renderQuality() { // Multi-run comparison if (sel.length > 1) { - html += '
Quality Comparison
'; + // High-level summary comparison + html += '
Overall Comparison
'; + html += '
'; + for (const r of sel) html += ''; + html += ''; + const hiRows = [ + ['Pass Rate', r => r.total > 0 ? pct(r.passed, r.total) + '%' : 'โ€”'], + ['Score', r => r.passed + '/' + r.total], + ['LLM Score', r => r.llmTotal > 0 ? (r.llmPassed || 0) + '/' + (r.llmTotal || 0) : 'โ€”'], + ['VLM Score', r => r.vlmTotal > 0 ? (r.vlmPassed || 0) + '/' + (r.vlmTotal || 0) : 'โ€”'], + ['Failed', r => String(r.failed)], + ['Time', r => fmt(r.timeMs / 1000) + 's'], + ['Throughput', r => r.timeMs > 0 && r.tokens ? fmt(r.tokens / (r.timeMs / 1000)) + ' tok/s' : 'โ€”'], + ]; + for (const [label, fn] of hiRows) { + html += ''; + // Find best value for highlighting + const vals = sel.map(fn); + for (let i = 0; i < sel.length; i++) { + const isBest = label === 'Failed' ? vals[i] === String(Math.min(...sel.map(r => r.failed))) : + label === 'Pass Rate' ? vals[i] === pct(Math.max(...sel.map(r => r.passed)), sel[0].total) + '%' : false; + html += ' 1 ? ' style="color:var(--green);font-weight:600"' : '') + '>' + vals[i] + ''; + } + html += ''; + } + html += '
Metric' + esc(modelShort(r.model)) + '
' + shortDate(r.timestamp) + '
' + label + '
'; + + // Per-suite breakdown + html += '
Suite Comparison
'; html += '
'; for (const r of sel) html += ''; html += ''; From 40d5f64c24a21cd0b1f5bd0c3fcff5e4e32a033c Mon Sep 17 00:00:00 2001 From: Simba Zhang Date: Wed, 18 Mar 2026 14:34:48 -0700 Subject: [PATCH 11/11] fix: hide VLM Score row when no runs have VLM data --- .../home-security-benchmark/scripts/generate-report.cjs | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/skills/analysis/home-security-benchmark/scripts/generate-report.cjs b/skills/analysis/home-security-benchmark/scripts/generate-report.cjs index f625d166..d5dda66d 100644 --- a/skills/analysis/home-security-benchmark/scripts/generate-report.cjs +++ b/skills/analysis/home-security-benchmark/scripts/generate-report.cjs @@ -625,11 +625,12 @@ function renderQuality() { html += '
Suite' + esc(modelShort(r.model)) + '
'; for (const r of sel) html += ''; html += ''; + const hasVlm = sel.some(r => r.vlmTotal > 0); const hiRows = [ ['Pass Rate', r => r.total > 0 ? pct(r.passed, r.total) + '%' : 'โ€”'], ['Score', r => r.passed + '/' + r.total], ['LLM Score', r => r.llmTotal > 0 ? (r.llmPassed || 0) + '/' + (r.llmTotal || 0) : 'โ€”'], - ['VLM Score', r => r.vlmTotal > 0 ? (r.vlmPassed || 0) + '/' + (r.vlmTotal || 0) : 'โ€”'], + ...(hasVlm ? [['VLM Score', r => r.vlmTotal > 0 ? (r.vlmPassed || 0) + '/' + (r.vlmTotal || 0) : 'โ€”']] : []), ['Failed', r => String(r.failed)], ['Time', r => fmt(r.timeMs / 1000) + 's'], ['Throughput', r => r.timeMs > 0 && r.tokens ? fmt(r.tokens / (r.timeMs / 1000)) + ' tok/s' : 'โ€”'],
Metric' + esc(modelShort(r.model)) + '
' + shortDate(r.timestamp) + '