diff --git a/skills/analysis/home-security-benchmark/scripts/generate-report.cjs b/skills/analysis/home-security-benchmark/scripts/generate-report.cjs index e78da13..cb44bc4 100644 --- a/skills/analysis/home-security-benchmark/scripts/generate-report.cjs +++ b/skills/analysis/home-security-benchmark/scripts/generate-report.cjs @@ -1,14 +1,17 @@ #!/usr/bin/env node /** - * HTML Report Generator for Home Security AI Benchmark + * HomeSec-Bench Operations Center — Report Generator * - * Reads JSON result files from the benchmarks directory and generates - * a self-contained HTML report with: - * - Pass/fail scorecard per suite - * - Latency charts (inline SVG) - * - Token usage breakdown - * - Historical comparison table - * - System configuration + * Generates a self-contained HTML dashboard with three views: + * ⚡ Performance — TTFT, decode tok/s, server metrics, trend charts + * ✅ Quality — Suite pass/fail, test details, comparison tables + * 🖼️ Vision — VLM image grid with pass/fail overlays and model responses + * + * Features: + * - Run picker sidebar with model-grouped history + multi-select + * - Side-by-side comparison tables across selected runs + * - Export to Markdown for community sharing + * - Embeds all data into a single offline-capable HTML file * * Usage: * node generate-report.cjs [results-dir] @@ -21,260 +24,830 @@ const os = require('os'); const RESULTS_DIR = process.argv[2] || path.join(os.homedir(), '.aegis-ai', 'benchmarks'); -function generateReport(resultsDir = RESULTS_DIR) { +// ─── Fixture image directory (for Vision tab) ────────────────────────────────── +const FIXTURES_DIR = path.join(__dirname, '..', 'fixtures', 'frames'); + +/** + * Generate the report HTML. + * @param {string} resultsDir - Directory containing benchmark results + * @param {object} opts - Options + * @param {boolean} opts.liveMode - If true, adds auto-refresh (5s) and a live progress banner + * @param {object} opts.liveStatus - Live status info: { suitesCompleted, totalSuites, currentSuite, startedAt } + */ +function generateReport(resultsDir = RESULTS_DIR, opts = {}) { const dir = resultsDir || RESULTS_DIR; + const { liveMode = false, liveStatus = null } = opts; - // Load all result files + // Load index — gracefully handle missing/empty for live mode const indexFile = path.join(dir, 'index.json'); - if (!fs.existsSync(indexFile)) { - console.error(`No index.json found in ${dir}. Run the benchmark first.`); - process.exit(1); - } + let index = []; + try { + if (fs.existsSync(indexFile)) { + index = JSON.parse(fs.readFileSync(indexFile, 'utf8')); + } + } catch { } - const index = JSON.parse(fs.readFileSync(indexFile, 'utf8')); - if (index.length === 0) { - console.error('No benchmark results found.'); + if (index.length === 0 && !liveMode) { + console.error(`No benchmark results found in ${dir}. Run the benchmark first.`); process.exit(1); } - // Load the latest result for detailed view - const latestEntry = index[index.length - 1]; - const latestFile = path.join(dir, latestEntry.file); - const latest = JSON.parse(fs.readFileSync(latestFile, 'utf8')); - - // Load all results for comparison + // Load all result files with full data const allResults = index.map(entry => { try { const data = JSON.parse(fs.readFileSync(path.join(dir, entry.file), 'utf8')); return { ...entry, data }; - } catch { return entry; } - }); + } catch { return { ...entry, data: null }; } + }).filter(r => r.data); - const html = buildHTML(latest, allResults); + // Load fixture images for Vision tab (base64) + const fixtureImages = {}; + if (fs.existsSync(FIXTURES_DIR)) { + try { + const frames = fs.readdirSync(FIXTURES_DIR).filter(f => /\.(png|jpg|jpeg)$/i.test(f)); + for (const f of frames) { + const imgPath = path.join(FIXTURES_DIR, f); + const ext = f.split('.').pop().toLowerCase(); + const mime = ext === 'png' ? 'image/png' : 'image/jpeg'; + const b64 = fs.readFileSync(imgPath).toString('base64'); + fixtureImages[f] = `data:${mime};base64,${b64}`; + } + } catch (e) { + console.warn(' ⚠️ Could not load fixture images:', e.message); + } + } + + const html = buildHTML(allResults, fixtureImages, { liveMode, liveStatus }); const reportPath = path.join(dir, 'report.html'); fs.writeFileSync(reportPath, html); - console.log(` Report saved: ${reportPath}`); - - // Try to open in browser - try { - const { execSync } = require('child_process'); - if (process.platform === 'darwin') execSync(`open "${reportPath}"`); - else if (process.platform === 'linux') execSync(`xdg-open "${reportPath}"`); - else if (process.platform === 'win32') execSync(`start "" "${reportPath}"`); - } catch { } + // Suppress log noise during live updates + if (!liveMode) console.log(` Report saved: ${reportPath}`); return reportPath; } -function buildHTML(latest, allResults) { - const { totals, tokenTotals, model, system, suites } = latest; - const passRate = totals.total > 0 ? ((totals.passed / totals.total) * 100).toFixed(0) : 0; - const tokPerSec = totals.timeMs > 0 ? (tokenTotals.total / (totals.timeMs / 1000)).toFixed(1) : '?'; - - // Build suite rows - const suiteRows = suites.map(s => { - const pct = s.tests.length > 0 ? ((s.passed / s.tests.length) * 100).toFixed(0) : 0; - const color = s.failed === 0 ? '#22c55e' : s.passed > s.failed ? '#f59e0b' : '#ef4444'; - return ` - ${s.name} - ${s.passed}/${s.tests.length} - ${(s.timeMs / 1000).toFixed(1)}s -
- `; - }).join('\n'); - - // Build test detail rows - const testRows = suites.flatMap(s => - s.tests.map(t => { - const icon = t.status === 'pass' ? '✅' : t.status === 'fail' ? '❌' : '⏭️'; - const cls = t.status === 'fail' ? 'fail-row' : ''; - return ` - ${icon} - ${s.name} - ${t.name} - ${t.timeMs}ms - ${escHtml(t.detail.slice(0, 120))} - `; - }) - ).join('\n'); - - // Build latency chart data (SVG bar chart) - const allTests = suites.flatMap(s => s.tests.filter(t => t.status !== 'skip')); - const maxLatency = Math.max(...allTests.map(t => t.timeMs), 1); - const barHeight = 22; - const chartHeight = allTests.length * (barHeight + 4) + 40; - const chartBars = allTests.map((t, i) => { - const w = (t.timeMs / maxLatency) * 500; - const y = i * (barHeight + 4) + 30; - const color = t.status === 'pass' ? '#22c55e' : '#ef4444'; - const label = t.name.length > 30 ? t.name.slice(0, 28) + '…' : t.name; - return ` - ${escHtml(label)} - ${t.timeMs}ms`; - }).join('\n'); - - // Build historical comparison table - const historyRows = allResults.slice().reverse().map(r => { - const ts = new Date(r.timestamp).toLocaleDateString() + ' ' + new Date(r.timestamp).toLocaleTimeString(); - const isCurrent = r.file === (allResults[allResults.length - 1]?.file); - const vlmModel = r.vlm || (r.data?.model?.vlm) || ''; - const modelLabel = (r.model || '?') + (vlmModel ? `
VLM: ${vlmModel}` : ''); - // LLM/VLM split (fallback for older runs without split data) - const hasLlmVlm = r.llmTotal !== undefined; - const llmLabel = hasLlmVlm ? `${r.llmPassed}/${r.llmTotal}` : `${r.passed}/${r.total}`; - const llmPct = hasLlmVlm && r.llmTotal > 0 ? ((r.llmPassed / r.llmTotal) * 100).toFixed(0) + '%' : (r.total > 0 ? ((r.passed / r.total) * 100).toFixed(0) + '%' : '—'); - const vlmLabel = hasLlmVlm && r.vlmTotal > 0 ? `${r.vlmPassed}/${r.vlmTotal}` : '—'; - const vlmPct = hasLlmVlm && r.vlmTotal > 0 ? ((r.vlmPassed / r.vlmTotal) * 100).toFixed(0) + '%' : '—'; - return ` - ${ts}${isCurrent ? ' ⬅️' : ''} - ${modelLabel} - ${llmLabel} - ${llmPct} - ${vlmLabel} - ${vlmPct} - ${(r.timeMs / 1000).toFixed(1)}s - ${r.tokens || '?'} - `; - }).join('\n'); +function esc(str) { + return String(str || '').replace(/&/g, '&').replace(//g, '>').replace(/"/g, '"').replace(/'/g, '''); +} + +function buildHTML(allResults, fixtureImages, { liveMode = false, liveStatus = null } = {}) { + // Serialize data for embedded JS + const embeddedData = JSON.stringify(allResults.map(r => ({ + file: r.file, + model: r.model, + vlm: r.vlm || r.data?.model?.vlm || null, + timestamp: r.timestamp || r.data?.timestamp, + passed: r.passed, + failed: r.failed, + total: r.total, + llmPassed: r.llmPassed, + llmTotal: r.llmTotal, + vlmPassed: r.vlmPassed, + vlmTotal: r.vlmTotal, + timeMs: r.timeMs, + tokens: r.tokens || r.data?.tokenTotals?.total, + perfSummary: r.perfSummary || r.data?.perfSummary || null, + system: r.data?.system || {}, + tokenTotals: r.data?.tokenTotals || {}, + suites: (r.data?.suites || []).map(s => ({ + name: s.name, + passed: s.passed, + failed: s.failed, + skipped: s.skipped, + timeMs: s.timeMs, + tests: s.tests.map(t => ({ + name: t.name, + status: t.status, + timeMs: t.timeMs, + detail: (t.detail || '').slice(0, 200), + tokens: t.tokens || {}, + perf: t.perf || {}, + fixture: t.fixture || null, + vlmResponse: t.vlmResponse || null, + vlmPrompt: t.vlmPrompt || null, + })), + })), + }))); + + const fixtureJSON = JSON.stringify(fixtureImages); + + // Live mode: auto-refresh meta tag + const refreshMeta = liveMode ? '' : ''; + const liveBannerHTML = liveMode ? buildLiveBanner(liveStatus) : ''; return ` -Home Security AI Benchmark — ${model.name || 'Report'} +${refreshMeta} +HomeSec-Bench ${liveMode ? '🔴 LIVE' : 'Operations Center'} + + -
+${liveBannerHTML} +
-

🛡️ Home Security AI Benchmark

-

${new Date(latest.timestamp).toLocaleDateString()} ${new Date(latest.timestamp).toLocaleTimeString()}

- -
-
-
Pass Rate
-
${passRate}%
-
${totals.passed}/${totals.total} tests passed
-
-
-
Total Time
-
${(totals.timeMs / 1000).toFixed(1)}s
-
${suites.length} suites
+ + -

Suite Summary

- - - ${suiteRows} -
SuiteResultTimePass Rate
- -

Latency Chart

- - Response Latency per Test (ms) - ${chartBars} - - -

Test Details

- - - ${testRows} -
SuiteTestTimeDetail
- -

Token Usage

-
-
-
Prompt Tokens
-
${tokenTotals.prompt.toLocaleString()}
-
-
-
Completion Tokens
-
${tokenTotals.completion.toLocaleString()}
+ +
+
+
⚡ Performance
+
✅ Quality
+
🖼️ Vision
-
-
Total Tokens
-
${tokenTotals.total.toLocaleString()}
-
-
-
Throughput
-
${tokPerSec}
-
tokens/second
+ +
+ +
+ + +
+ + +
-
-${allResults.length > 1 ? `

Historical Comparison

- - - ${historyRows} -
DateModelLLMLLM %VLMVLM %TimeTokens
` : ''} - -

System Configuration

-
-
OS${system.os || '?'}
-
CPU${system.cpu || '?'}
-
Cores${system.cpuCores || '?'}
-
RAM${system.totalMemoryGB || '?'} GB total
-
Free RAM${system.freeMemoryGB || '?'} GB
-
Node${system.nodeVersion || '?'}
-
Process RSS${system.processMemoryMB?.rss || '?'} MB
-
Heap Used${system.processMemoryMB?.heapUsed || '?'} MB
+
+ Home Security AI Benchmark Suite • DeepCamera / SharpAI • Generated ${new Date().toISOString().slice(0, 19)} +
+
-
- Home Security AI Benchmark Suite • DeepCamera / SharpAI • Generated ${new Date().toISOString()} -
+
-
+ `; } @@ -288,4 +861,21 @@ if (require.main === module) { generateReport(); } +function buildLiveBanner(status) { + if (!status) { + return `
Benchmark starting\u2026
`; + } + const { suitesCompleted = 0, totalSuites = 0, currentSuite = '', startedAt = '' } = status; + const pct = totalSuites > 0 ? Math.round((suitesCompleted / totalSuites) * 100) : 0; + const elapsed = startedAt ? Math.round((Date.now() - new Date(startedAt).getTime()) / 1000) : 0; + const elapsedStr = elapsed > 60 ? Math.floor(elapsed / 60) + 'm ' + (elapsed % 60) + 's' : elapsed + 's'; + return `
+ + LIVE — Suite ${suitesCompleted}/${totalSuites} (${pct}%) + ${currentSuite ? ' — ' + currentSuite + '' : ''} + ${elapsedStr} elapsed +
+
`; +} + module.exports = { generateReport }; diff --git a/skills/analysis/home-security-benchmark/scripts/run-benchmark.cjs b/skills/analysis/home-security-benchmark/scripts/run-benchmark.cjs index c9bd3be..3193e7f 100644 --- a/skills/analysis/home-security-benchmark/scripts/run-benchmark.cjs +++ b/skills/analysis/home-security-benchmark/scripts/run-benchmark.cjs @@ -505,8 +505,79 @@ function assert(condition, msg) { if (!condition) throw new Error(msg || 'Assertion failed'); } +// ─── Live progress: intermediate saves + report regeneration ──────────────── +let _liveReportOpened = false; + +/** + * Save the current (in-progress) results to disk and regenerate the live report. + * Called after each suite completes so the browser auto-refreshes with updated data. + */ +function saveLiveProgress(startedAt, suitesCompleted, totalSuites, nextSuiteName) { + try { + fs.mkdirSync(RESULTS_DIR, { recursive: true }); + + // Save current results as a live file (will be overwritten each time) + const liveFile = path.join(RESULTS_DIR, '_live_progress.json'); + const liveResults = { + ...results, + _live: true, + _progress: { suitesCompleted, totalSuites, startedAt }, + }; + fs.writeFileSync(liveFile, JSON.stringify(liveResults, null, 2)); + + // Build a temporary index with just the live file + const indexFile = path.join(RESULTS_DIR, 'index.json'); + const liveIndex = [{ + file: '_live_progress.json', + model: results.model.name || 'loading...', + vlm: results.model.vlm || null, + timestamp: results.timestamp, + passed: results.totals.passed, + failed: results.totals.failed, + total: results.totals.total, + llmPassed: results.totals.passed, // Simplified for live view + llmTotal: results.totals.total, + vlmPassed: 0, vlmTotal: 0, + timeMs: Date.now() - new Date(startedAt).getTime(), + tokens: results.tokenTotals.total, + perfSummary: null, + }]; + fs.writeFileSync(indexFile, JSON.stringify(liveIndex, null, 2)); + + // Regenerate report in live mode + const reportScript = path.join(__dirname, 'generate-report.cjs'); + // Clear require cache to pick up any code changes + delete require.cache[require.resolve(reportScript)]; + const { generateReport } = require(reportScript); + const reportPath = generateReport(RESULTS_DIR, { + liveMode: true, + liveStatus: { + suitesCompleted, + totalSuites, + currentSuite: nextSuiteName || 'Finishing...', + startedAt, + }, + }); + + // Open browser on first save (so user sees live progress from the start) + if (!_liveReportOpened && !NO_OPEN && !IS_SKILL_MODE && reportPath) { + try { + const openCmd = process.platform === 'darwin' ? 'open' : 'xdg-open'; + execSync(`${openCmd} "${reportPath}"`, { stdio: 'ignore' }); + log(' 📊 Live report opened in browser (auto-refreshes every 5s)'); + } catch { } + _liveReportOpened = true; + } + } catch (err) { + // Non-fatal — live progress is a nice-to-have + log(` ⚠️ Live progress update failed: ${err.message}`); + } +} + async function runSuites() { - for (const s of suites) { + const startedAt = new Date().toISOString(); + for (let si = 0; si < suites.length; si++) { + const s = suites[si]; currentSuite = { name: s.name, tests: [], passed: 0, failed: 0, skipped: 0, timeMs: 0 }; log(`\n${'─'.repeat(60)}`); log(` ${s.name}`); @@ -522,12 +593,16 @@ async function runSuites() { results.totals.total += currentSuite.tests.length; emit({ event: 'suite_end', suite: s.name, passed: currentSuite.passed, failed: currentSuite.failed, skipped: currentSuite.skipped, timeMs: currentSuite.timeMs }); + + // Live progress: save intermediate results + regenerate report after each suite + saveLiveProgress(startedAt, si + 1, suites.length, si + 1 < suites.length ? suites[si + 1]?.name : null); } } // ─── Per-test token + perf accumulators (set by test(), read by llmCall) ────── let _currentTestTokens = null; let _currentTestPerf = null; +let _vlmTestMeta = null; // VLM fixture metadata (set during VLM tests, read after test() completes) async function test(name, fn) { const testResult = { name, status: 'pass', timeMs: 0, detail: '', tokens: { prompt: 0, completion: 0, total: 0 }, perf: {} }; @@ -2025,18 +2100,37 @@ suite('📸 VLM Scene Analysis', async () => { const framePath = path.join(FIXTURES_DIR, 'frames', t.file); if (!fs.existsSync(framePath)) { skip(t.name, `File missing: ${t.file}`); return; } const desc = await vlmAnalyze(framePath, t.prompt); - if (t.expect === null) { - // Just check we got a meaningful response - assert(desc.length > 20, `Response too short: ${desc.length} chars`); - return `${desc.length} chars ✓`; - } - const lower = desc.toLowerCase(); - const matched = t.expect.some(term => lower.includes(term)); - assert(matched, - `Expected one of [${t.expect.slice(0, 4).join(', ')}...] in: "${desc.slice(0, 80)}"`); - const hits = t.expect.filter(term => lower.includes(term)); - return `${desc.length} chars, matched: ${hits.join(', ')} ✓`; + + // Save fixture filename + VLM response for Vision tab in report + const lastTest = currentSuite.tests.length > 0 ? null : undefined; // will be set after push + // Attach after test() pushes — use a post-hook via the return + const result = (() => { + if (t.expect === null) { + assert(desc.length > 20, `Response too short: ${desc.length} chars`); + return `${desc.length} chars ✓`; + } + const lower = desc.toLowerCase(); + const matched = t.expect.some(term => lower.includes(term)); + assert(matched, + `Expected one of [${t.expect.slice(0, 4).join(', ')}...] in: "${desc.slice(0, 80)}"`); + const hits = t.expect.filter(term => lower.includes(term)); + return `${desc.length} chars, matched: ${hits.join(', ')} ✓`; + })(); + + // Stash fixture + response on the test result (test() pushes to currentSuite.tests) + // We set it as a closure-accessible value; the test() function reads the return value. + // After test() completes, we patch the last test entry with VLM metadata. + _vlmTestMeta = { fixture: t.file, vlmResponse: desc.slice(0, 300), prompt: t.prompt }; + return result; }); + // Patch the last pushed test with VLM metadata (fixture filename + response preview) + if (_vlmTestMeta && currentSuite.tests.length > 0) { + const lastTest = currentSuite.tests[currentSuite.tests.length - 1]; + lastTest.fixture = _vlmTestMeta.fixture; + lastTest.vlmResponse = _vlmTestMeta.vlmResponse; + lastTest.vlmPrompt = _vlmTestMeta.prompt; + _vlmTestMeta = null; + } } }); @@ -2233,16 +2327,18 @@ async function main() { // Save results fs.mkdirSync(RESULTS_DIR, { recursive: true }); + // Clean up live progress file (replaced by final results) + try { fs.unlinkSync(path.join(RESULTS_DIR, '_live_progress.json')); } catch { } const modelSlug = (results.model.name || 'unknown').replace(/[^a-zA-Z0-9_.-]/g, '_'); const ts = new Date().toISOString().replace(/[:.]/g, '-').slice(0, 19); const resultFile = path.join(RESULTS_DIR, `${modelSlug}_${ts}.json`); fs.writeFileSync(resultFile, JSON.stringify(results, null, 2)); log(`\n Results saved: ${resultFile}`); - // Update index + // Update index (filter out any live progress entries) const indexFile = path.join(RESULTS_DIR, 'index.json'); let index = []; - try { index = JSON.parse(fs.readFileSync(indexFile, 'utf8')); } catch { } + try { index = JSON.parse(fs.readFileSync(indexFile, 'utf8')).filter(e => e.file !== '_live_progress.json'); } catch { } // Compute LLM vs VLM split (only count image analysis suites as VLM) const isVlmImageSuite = (name) => name.includes('VLM Scene') || name.includes('📸'); const vlmSuites = results.suites.filter(s => isVlmImageSuite(s.name)); @@ -2265,16 +2361,19 @@ async function main() { }); fs.writeFileSync(indexFile, JSON.stringify(index, null, 2)); - // Always generate report (skip only on explicit --no-open with no --report flag) + // Always generate final report (without live mode) let reportPath = null; log('\n Generating HTML report...'); try { const reportScript = path.join(__dirname, 'generate-report.cjs'); + // Clear require cache to get latest version + delete require.cache[require.resolve(reportScript)]; reportPath = require(reportScript).generateReport(RESULTS_DIR); log(` ✅ Report: ${reportPath}`); // Auto-open in browser — only in standalone mode (Aegis handles its own opening) - if (!NO_OPEN && !IS_SKILL_MODE && reportPath) { + // Skip if live mode already opened the browser earlier + if (!_liveReportOpened && !NO_OPEN && !IS_SKILL_MODE && reportPath) { try { const openCmd = process.platform === 'darwin' ? 'open' : 'xdg-open'; execSync(`${openCmd} "${reportPath}"`, { stdio: 'ignore' });