diff --git a/docs/paper/home-security-benchmark.tex b/docs/paper/home-security-benchmark.tex index d35943c..1cc1655 100644 --- a/docs/paper/home-security-benchmark.tex +++ b/docs/paper/home-security-benchmark.tex @@ -75,20 +75,22 @@ preprocessing, tool use, security classification, prompt injection resistance, knowledge injection, and event deduplication, plus an optional multimodal VLM scene analysis suite (35~additional tests). We present results across -\textbf{seven model configurations}: four local Qwen3.5 variants -(9B~Q4\_K\_M, 27B~Q4\_K\_M, 35B-MoE~Q4\_K\_L, 122B-MoE~IQ1\_M) and three -OpenAI cloud models (GPT-5.4, GPT-5.4-mini, GPT-5.4-nano), all evaluated -on a single Apple M5~Pro consumer laptop (64~GB unified memory). Our -findings reveal that (1)~the best local model (Qwen3.5-9B) achieves -93.8\% accuracy vs.\ 97.9\% for GPT-5.4---a gap of only 4.1~percentage -points---with complete data privacy and zero API cost; (2)~the -Qwen3.5-35B-MoE variant produces lower first-token latency (435~ms) -than any OpenAI cloud endpoint tested (508~ms for GPT-5.4-nano); -(3)~security threat classification is universally robust across all -eight model sizes; and (4)~event deduplication across camera views -remains the hardest task, with only GPT-5.4 achieving a perfect 8/8 -score. HomeSec-Bench is released as an open-source DeepCamera skill, -enabling reproducible evaluation of any OpenAI-compatible endpoint. +\textbf{sixteen model configurations} spanning five model families: Qwen3.5 +(six variants from 9B to 122B-MoE), Mistral Small~4 (119B, two quants), +NVIDIA Nemotron-3-Nano (4B and 30B), Liquid LFM2 (1.2B and 24B), and +four OpenAI cloud models (GPT-5.4, GPT-5.4-mini, GPT-5.4-nano, GPT-5-mini), all +evaluated on a single Apple M5~Pro consumer laptop (64~GB unified memory). 
+Our findings reveal that (1)~the best local model (Qwen3.5-27B~Q8) achieves +95.8\% accuracy vs.\ 97.9\% for GPT-5.4---a gap of only 2.1~percentage +points---with complete data privacy and zero API cost; (2)~Mistral +Small~4 (119B) at Q2\_K\_XL quantization scores 89.6\%, establishing +that 119B-class thinking models can run on consumer hardware with +proper thinking-mode suppression; (3)~security threat classification +is universally robust across all model sizes; and (4)~event deduplication +across camera views remains the hardest task, with only GPT-5.4 +achieving a perfect 8/8 score. HomeSec-Bench is released as an +open-source DeepCamera skill, enabling reproducible evaluation of any +OpenAI-compatible endpoint. \end{abstract} \begin{IEEEkeywords} @@ -731,39 +733,56 @@ \section{Experimental Setup} \subsection{Models Under Test} -We evaluate seven model configurations spanning local and cloud -deployments. Local models run via \texttt{llama-server} with Metal -Performance Shaders (MPS/CoreML) acceleration. Cloud models route -through the OpenAI API. +We evaluate sixteen model configurations spanning five model families +across local and cloud deployments. Local models run via +\texttt{llama-server} (llama.cpp build b8416) with Metal Performance +Shaders acceleration on Apple M5~Pro. Cloud models route through the +OpenAI API. 
\begin{table}[h] \centering -\caption{Model Configurations Under Test} +\caption{Model Configurations Under Test (16 Models)} \label{tab:models} \small -\begin{tabular}{p{2.8cm}p{1.3cm}p{1.7cm}} +\begin{tabular}{p{3.4cm}p{1.0cm}p{2.0cm}} \toprule \textbf{Model} & \textbf{Type} & \textbf{Quant / Size} \\ \midrule +\multicolumn{3}{l}{\textit{Qwen3.5 Family}} \\ Qwen3.5-9B & Local & Q4\_K\_M, 13.8~GB \\ +Qwen3.5-9B & Local & BF16, 18.5~GB \\ Qwen3.5-27B & Local & Q4\_K\_M, 24.9~GB \\ +Qwen3.5-27B & Local & Q8\_K\_XL, 30.2~GB \\ Qwen3.5-35B-MoE & Local & Q4\_K\_L, 27.2~GB \\ Qwen3.5-122B-MoE & Local & IQ1\_M, 40.8~GB \\ +\multicolumn{3}{l}{\textit{Mistral Family}} \\ +Mistral-Small-4-119B & Local & IQ1\_M, 29.0~GB \\ +Mistral-Small-4-119B & Local & Q2\_K\_XL, 42.9~GB \\ +\multicolumn{3}{l}{\textit{NVIDIA Nemotron}} \\ +Nemotron-3-Nano-4B & Local & Q4\_K\_M, 2.5~GB \\ +Nemotron-3-Nano-30B & Local & Q8\_0, 31.5~GB \\ +\multicolumn{3}{l}{\textit{Liquid LFM}} \\ +LFM2.5-1.2B & Local & BF16, 2.4~GB \\ +LFM2-24B-MoE & Local & Q8\_0, 25.6~GB \\ +\multicolumn{3}{l}{\textit{OpenAI Cloud}} \\ GPT-5.4 & Cloud & API \\ GPT-5.4-mini & Cloud & API \\ GPT-5.4-nano & Cloud & API \\ +GPT-5-mini (2025) & Cloud & API \\ \bottomrule \end{tabular} \end{table} -All local models are GGUF variants served by \texttt{llama-server} -(llama.cpp). The MoE variants (35B and 122B) activate only a fraction -of parameters per token---approximately 3B active for the 35B -variant---enabling surprisingly low latency relative to parameter count. -GPT-5.4-mini exhibited API-level restrictions on non-default temperature -values; affected suites (using \texttt{temperature}$\neq$1.0) returned -blanket failures, so GPT-5.4-mini results should be interpreted as a -lower bound of true capability. +All local models are GGUF variants served by \texttt{llama-server}. 
+The MoE variants (Qwen3.5-35B, 122B; LFM2-24B) activate only a +fraction of parameters per token---approximately 3B active for the +35B variant---enabling surprisingly low latency relative to parameter +count. Mistral Small~4 is a thinking model; we suppress reasoning +tokens via \texttt{--chat-template-kwargs \{"reasoning\_effort":"none"\}} +and \texttt{--parallel 1} to prevent KV cache memory exhaustion on +64~GB hardware. GPT-5-mini (2025) rejected non-default temperature +values; affected suites returned blanket 400 errors, so its results +represent a lower bound. \subsection{Hardware} @@ -795,33 +814,45 @@ \subsection{Overall Scorecard (LLM-Only, 96 Tests)} \begin{table}[h] \centering -\caption{Overall LLM Benchmark Results — 96 Tests} +\caption{Overall LLM Benchmark Results — 96 Tests, 16 Models} \label{tab:overall} \small -\begin{tabular}{p{2.5cm}cccc} +\begin{tabular}{p{3.2cm}cccc} \toprule \textbf{Model} & \textbf{Pass} & \textbf{Fail} & \textbf{Rate} & \textbf{Time} \\ \midrule GPT-5.4 & \textbf{94} & 2 & \textbf{97.9\%} & 2m 22s \\ GPT-5.4-mini & 92 & 4 & 95.8\% & 1m 17s \\ -Qwen3.5-9B & 90 & 6 & 93.8\% & 5m 23s \\ -Qwen3.5-27B & 90 & 6 & 93.8\% & 15m 8s \\ +Qwen3.5-27B Q8\_K\_XL & 92 & 4 & 95.8\% & --- \\ +Qwen3.5-9B BF16 & 91 & 5 & 94.8\% & --- \\ +Qwen3.5-27B Q4\_K\_M & 90 & 6 & 93.8\% & 15m 8s \\ +Mistral-119B Q2\_K\_XL & 86 & 10 & 89.6\% & --- \\ Qwen3.5-122B-MoE & 89 & 7 & 92.7\% & 8m 26s \\ GPT-5.4-nano & 89 & 7 & 92.7\% & 1m 34s \\ +Qwen3.5-9B Q4\_K\_M & 88 & 8 & 91.7\% & 5m 23s \\ Qwen3.5-35B-MoE & 88 & 8 & 91.7\% & 3m 30s \\ +Nemotron-4B$^\ddagger$ & 84 & 12 & 87.5\% & --- \\ +Mistral-119B IQ1\_M & 79 & 17 & 82.3\% & --- \\ +Nemotron-30B$^\ddagger$ & 78 & 18 & 81.3\% & --- \\ +LFM2-24B-MoE$^\ddagger$ & 72 & 24 & 75.0\% & --- \\ +LFM2.5-1.2B & 62 & 34 & 64.6\% & --- \\ GPT-5-mini (2025)$^\dagger$ & 60 & 36 & 62.5\% & 7m 38s \\ \midrule -\multicolumn{5}{l}{\footnotesize $^\dagger$API rejected non-default temperature; see §\ref{sec:limitations}.} 
+\multicolumn{5}{l}{\footnotesize $^\dagger$API rejected non-default temperature; see §\ref{sec:limitations}.} \\ +\multicolumn{5}{l}{\footnotesize $^\ddagger$Temperature restriction failures inflate fail count; see §\ref{sec:limitations}.} \end{tabular} \end{table} -The \textbf{Qwen3.5-9B} running entirely on a consumer laptop scores -\textbf{93.8\%}---only 4.1~percentage points below GPT-5.4, and within -2~points of GPT-5.4-mini. Strikingly, the Qwen3.5-35B-MoE model -(88/96) ranks last among valid local models despite having 4$\times$ -more parameters than the 9B variant; this is primarily attributed to -quantization-induced precision loss at IQ-level quants and higher -memory bandwidth contention on long reasoning chains. +The expanded 16-model evaluation reveals several new findings. +\textbf{Qwen3.5-27B at Q8\_K\_XL} quantization achieves \textbf{95.8\%}---tying +GPT-5.4-mini and closing to within 2.1~points of GPT-5.4. Higher-precision +quantization (Q8 vs.\ Q4) provides a 2-point lift for the 27B model. +\textbf{Mistral Small~4} (119B) at Q2\_K\_XL scores \textbf{89.6\%}, +demonstrating that 119B-class thinking models can produce competitive +results on consumer hardware when thinking-mode is properly suppressed. +Nemotron and LFM2 models are penalized by temperature-restriction errors +(\texttt{temperature=0.1} unsupported); their true capability is higher +than reported scores suggest. \subsection{Inference Performance} @@ -860,15 +891,13 @@ \subsection{Inference Performance} choice for threat triage, preserving privacy for the most sensitivity-relevant task. -\textbf{Key finding 3: 9B local model closes the cloud gap.} -Qwen3.5-9B ties with Qwen3.5-27B at 93.8\%---a larger model provides -no accuracy benefit at 3.7$\times$ the inference time (5m23s vs. -15m8s for a full 96-test run). 
The 9B variant represents the -Pareto-optimal local configuration: -{ -\small -$$\text{Qwen3.5-9B}: \frac{93.8\%}{5\text{m23s}} = 17.4\%/\text{min} \quad\text{vs}\quad \text{27B}: \frac{93.8\%}{15\text{m8s}} = 6.2\%/\text{min}$$ -} +\textbf{Key finding 3: Quantization precision matters more than parameter count.} +Qwen3.5-27B at Q8\_K\_XL (95.8\%) outperforms the same model at Q4\_K\_M +(93.8\%)---a 2-point lift from higher-precision quantization alone. +Similarly, Mistral-119B at Q2\_K\_XL (89.6\%) outperforms its IQ1\_M +variant (82.3\%) by 7.3~points. For accuracy-critical deployments, +allocating more memory to higher-precision quants yields better results +than increasing parameter count at aggressive quantization. \textbf{Key finding 4: Context preprocessing remains universally challenging.} All models---local and cloud---fail at least one context deduplication @@ -978,7 +1007,7 @@ \section{Discussion} \subsection{Deployment Decision Matrix} -Based on our seven-model evaluation, we propose the following guidance: +Based on our sixteen-model evaluation, we propose the following guidance: \begin{table}[h] \centering @@ -1085,16 +1114,20 @@ \section{Conclusion} multi-turn contextual reasoning---providing a standardized, reproducible framework for comparing model suitability in video surveillance deployments. -Evaluating seven model configurations on a single Apple~M5~Pro laptop -reveals a fundamentally different landscape than the established -consensus that cloud models are required for production AI accuracy. -The \textbf{Qwen3.5-9B} achieves \textbf{93.8\%}---within 4.1 points -of GPT-5.4 (97.9\%)---while running entirely locally with 13.8~GB of -unified memory, zero API cost, and complete data privacy. 
The -Qwen3.5-35B-MoE variant produces \textbf{lower first-token latency} -(435~ms) than any cloud endpoint we tested (508~ms for GPT-5.4-nano), -demonstrating that sparse MoE activation is a compelling architectural -choice for latency-sensitive security alerting on consumer hardware. +Evaluating sixteen model configurations across five model families on a +single Apple~M5~Pro laptop reveals a fundamentally different landscape +than the established consensus that cloud models are required for +production AI accuracy. The \textbf{Qwen3.5-27B at Q8} achieves +\textbf{95.8\%}---within 2.1~points of GPT-5.4 (97.9\%)---while running +entirely locally with 30.2~GB of unified memory, zero API cost, and +complete data privacy. \textbf{Mistral Small~4} (119B) at Q2\_K\_XL +scores \textbf{89.6\%}, establishing that 119B-class thinking models +can serve as effective security assistants on consumer hardware when +reasoning tokens are suppressed. The Qwen3.5-35B-MoE variant produces +\textbf{lower first-token latency} (435~ms) than any cloud endpoint +tested (508~ms for GPT-5.4-nano), demonstrating that sparse MoE +activation is a compelling architectural choice for latency-sensitive +security alerting. Security classification is universally robust (100\% across all models), validating local inference for the most consequence-heavy task. diff --git a/skills/analysis/home-security-benchmark/scripts/run-benchmark.cjs b/skills/analysis/home-security-benchmark/scripts/run-benchmark.cjs index 8598be1..7f90b8d 100644 --- a/skills/analysis/home-security-benchmark/scripts/run-benchmark.cjs +++ b/skills/analysis/home-security-benchmark/scripts/run-benchmark.cjs @@ -120,6 +120,80 @@ const vlmClient = VLM_URL ? new OpenAI({ baseURL: `${strip(VLM_URL)}/v1`, }) : null; +// ─── Model Family Capabilities Config ──────────────────────────────────────── +// +// Different model families require different per-request params to control +// thinking/reasoning behavior. 
This table centralizes those differences so +// llmCall() can dispatch them automatically. +// +// Fields: +// match — fn(modelName: string) → bool +// apiParams — extra params merged into every chat/completions request +// serverFlags — llama-server startup flags needed for full control +// (documentation only — llmCall is a client and cannot set these) +// +// ┌─────────────────────┬──────────────────────────────┬──────────────────────────────────────────┐ +// │ Family │ Per-request param │ llama-server startup flag │ +// ├─────────────────────┼──────────────────────────────┼──────────────────────────────────────────┤ +// │ Mistral Small 4+ │ reasoning_effort: 'none' │ --reasoning-budget 0 │ +// │ Qwen3.5 (thinking) │ (none needed — handled by │ --chat-template-kwargs │ +// │ │ /no_think prompt suffix and │ '{"enable_thinking":false}' │ +// │ │ 500-token reasoning abort) │ │ +// │ GPT / Claude │ (none — cloud API, no local │ N/A │ +// │ │ thinking tokens) │ │ +// └─────────────────────┴──────────────────────────────┴──────────────────────────────────────────┘ +// +// To add a new model family: append an entry to MODEL_FAMILIES. +// The match fn receives the lower-cased model name/filename. + +const MODEL_FAMILIES = [ + { + name: 'Mistral', + // Covers: Mistral-Small-4, Mistral-*, Magistral-*, Mixtral-* + match: (m) => m.includes('mistral') || m.includes('magistral') || m.includes('mixtral'), + // reasoning_effort=none disables thinking and routes all output to delta.content. + // Supported by both Mistral cloud API and llama-server (forwarded as chat template kwarg). + // Without this Mistral routes ALL output to delta.thinking, causing 30s idle timeouts. 
+ apiParams: { reasoning_effort: 'none' }, + serverFlags: '--chat-template-kwargs {"reasoning_effort":"none"} --parallel 1', + }, + { + name: 'Nemotron', + // NVIDIA Nemotron-3-Nano (4B, 30B) — rejects temperature < 1.0 with HTTP 400: + // "Unsupported value: 'temperature' does not support 0.1 with this model" + match: (m) => m.includes('nemotron'), + apiParams: {}, + minTemperature: 1.0, + }, + { + name: 'LFM', + // Liquid LFM2 / LFM2.5 — same temperature restriction as Nemotron + match: (m) => m.includes('lfm'), + apiParams: {}, + minTemperature: 1.0, + }, + // Qwen3.5 thinking is handled via prompt-level /no_think and the 500-token reasoning + // abort in llmCall — no extra per-request params needed. +]; + +/** + * Return the matched MODEL_FAMILIES entry for the given model name. + * Returns {} if the model is not in any known family. + */ +function getModelFamily(modelName) { + if (!modelName) return {}; + const lower = modelName.toLowerCase(); + for (const family of MODEL_FAMILIES) { + if (family.match(lower)) return family; + } + return {}; +} + +/** Return extra API params for the model (e.g. reasoning_effort for Mistral). */ +function getModelApiParams(modelName) { + return getModelFamily(modelName).apiParams || {}; +} + // ─── Skill Protocol: JSON lines on stdout, human text on stderr ────────────── /** @@ -226,6 +300,20 @@ async function llmCall(messages, opts = {}) { // Sending max_tokens to thinking models (Qwen3.5) starves actual output since // reasoning_content counts against the limit. + // Lookup model-family-specific config (e.g. reasoning_effort for Mistral, + // minTemperature for Nemotron/LFM2). + // VLM calls skip the LLM family table — VLM models are always local llava-compatible. + const modelFamily = opts.vlm ? {} : getModelFamily(model || LLM_MODEL); + const modelFamilyParams = modelFamily.apiParams || {}; + + // Resolve temperature: apply model-specific minimum if needed. + // Nemotron and LFM2 reject temperature < 1.0 with HTTP 400. 
+ let temperature = opts.temperature; + if (temperature === undefined && opts.expectJSON) temperature = 0.7; + if (temperature !== undefined && modelFamily.minTemperature !== undefined) { + temperature = Math.max(temperature, modelFamily.minTemperature); + } + // Build request params const params = { messages, @@ -234,10 +322,12 @@ async function llmCall(messages, opts = {}) { // llama-server crashes with "Failed to parse input" when stream_options is present) ...(isCloudApi && { stream_options: { include_usage: true } }), ...(model && { model }), - ...(opts.temperature !== undefined && { temperature: opts.temperature }), - ...(opts.expectJSON && opts.temperature === undefined && { temperature: 0.7 }), + ...(temperature !== undefined && { temperature }), ...(opts.expectJSON && { top_p: 0.8 }), ...(opts.tools && { tools: opts.tools }), + // Model-family-specific params (e.g. reasoning_effort:'none' for Mistral). + // These are merged last so they take precedence over defaults. + ...modelFamilyParams, }; // Use an AbortController with idle timeout that resets on each streamed chunk. @@ -297,7 +387,11 @@ async function llmCall(messages, opts = {}) { const delta = chunk.choices?.[0]?.delta; if (delta?.content) content += delta.content; if (delta?.reasoning_content) reasoningContent += delta.reasoning_content; - if (delta?.content || delta?.reasoning_content) { + // Fallback: Mistral Small 4 in llama-server may route thinking tokens through + // `delta.thinking` even when reasoning_effort=none is requested (llama.cpp + // compatibility varies by version). Capture it so the idle timer resets. 
+ if (delta?.thinking) reasoningContent += delta.thinking; + if (delta?.content || delta?.reasoning_content || delta?.thinking) { tokenCount++; // Capture TTFT on first content/reasoning token if (!firstTokenTime) firstTokenTime = Date.now(); @@ -2347,8 +2441,61 @@ async function main() { emit({ event: 'error', message: `Cannot reach LLM endpoint: ${err.message}` }); process.exit(IS_SKILL_MODE ? 0 : 1); } + // ── Streaming sanity check ──────────────────────────────────────────────── + // Fires a tiny streaming call to verify the model actually produces content. + // Catches the Mistral "token-loop" bug: server started with a Qwen-specific + // --chat-template-kwargs flag causes Mistral to emit only empty token ID 31 + // on every chunk, giving 0 content tokens for every test. + // + // This check saves ~30 minutes of doomed benchmark runs by failing fast. + log('\n 🔍 Streaming sanity check (10 tokens)...'); + try { + const warmupParams = { + ...(LLM_MODEL && { model: LLM_MODEL }), + messages: [{ role: 'user', content: 'Reply with just the word: hello' }], + stream: true, + max_tokens: 10, + ...getModelApiParams(LLM_MODEL), + }; + const warmupStream = await llmClient.chat.completions.create(warmupParams); + let warmupContent = ''; + let warmupChunks = 0; + // Abort via the SDK stream's own controller; a detached AbortController would never cancel the iteration. + const warmupTimeout = setTimeout(() => warmupStream.controller.abort(), 15000); + try { + for await (const chunk of warmupStream) { + warmupChunks++; + const d = chunk.choices?.[0]?.delta; + if (d?.content) warmupContent += d.content; + if (d?.reasoning_content) warmupContent += d.reasoning_content; + if (d?.thinking) warmupContent += d.thinking; + if (warmupChunks >= 30) break; // enough chunks to decide + } + } finally { + clearTimeout(warmupTimeout); + } + + if (warmupContent.trim().length === 0) { + // Model produced chunks but zero content — server is in a bad state + const modelName = results.model.name || LLM_MODEL || 'current model'; + log(`\n ❌ STREAMING SANITY 
CHECK FAILED`); + log(` The model (${modelName}) produced ${warmupChunks} stream chunks but 0 content tokens.`); + log(` This usually means the llama-server was started with an incompatible`); + log(` --chat-template-kwargs flag (e.g. Qwen's enable_thinking:false applied to Mistral).`); + log(`\n ➡ Fix: Reload the model in Aegis-AI to restart the llama-server with`); + log(` the correct flags for this model family.`); + log(` Mistral requires: --reasoning-budget 0`); + log(` Qwen requires: --chat-template-kwargs '{"enable_thinking":false}'\n`); + emit({ event: 'error', message: `Streaming sanity failed: ${warmupChunks} chunks, 0 content tokens. Reload the model in Aegis-AI to fix.` }); + process.exit(IS_SKILL_MODE ? 0 : 1); + } + + log(` ✅ Streaming OK — ${warmupContent.trim().split(/\s+/).length} words, ${warmupChunks} chunks`); + } catch (err) { + // Non-fatal — if warmup errors, let the benchmark try; individual tests will surface the issue + log(` ⚠️ Streaming warmup error (non-fatal): ${err.message}`); + } - // Collect system info results.system = collectSystemInfo(); log(` System: ${results.system.cpu} (${results.system.cpuCores} cores)`); log(` Memory: ${results.system.freeMemoryGB}GB free / ${results.system.totalMemoryGB}GB total`); diff --git a/skills/analysis/home-security-benchmark/scripts/test-model-config.cjs b/skills/analysis/home-security-benchmark/scripts/test-model-config.cjs new file mode 100644 index 0000000..e273ccb --- /dev/null +++ b/skills/analysis/home-security-benchmark/scripts/test-model-config.cjs @@ -0,0 +1,237 @@ +#!/usr/bin/env node +/** + * Unit tests for MODEL_FAMILIES / getModelFamily / getModelApiParams logic. + * + * Tests the model-family detection, per-request param injection, + * and temperature clamping without needing a running LLM server. 
+ * + * Usage: + * node scripts/test-model-config.cjs + */ + +// ── Inline the config under test ───────────────────────────────────────────── +// (Kept in sync with run-benchmark.cjs MODEL_FAMILIES section) + +const MODEL_FAMILIES = [ + { + name: 'Mistral', + match: (m) => m.includes('mistral') || m.includes('magistral') || m.includes('mixtral'), + apiParams: { reasoning_effort: 'none' }, + serverFlags: '--chat-template-kwargs {"reasoning_effort":"none"} --parallel 1', + }, + { + name: 'Nemotron', + match: (m) => m.includes('nemotron'), + apiParams: {}, + minTemperature: 1.0, + }, + { + name: 'LFM', + match: (m) => m.includes('lfm'), + apiParams: {}, + minTemperature: 1.0, + }, +]; + +function getModelFamily(modelName) { + if (!modelName) return {}; + const lower = modelName.toLowerCase(); + for (const family of MODEL_FAMILIES) { + if (family.match(lower)) return family; + } + return {}; +} + +function getModelApiParams(modelName) { + return getModelFamily(modelName).apiParams || {}; +} + +/** Simulate the temperature clamping logic from llmCall(). */ +function resolveTemperature(modelName, requestedTemp, expectJSON) { + const family = getModelFamily(modelName); + let temperature = requestedTemp; + if (temperature === undefined && expectJSON) temperature = 0.7; + if (temperature !== undefined && family.minTemperature !== undefined) { + temperature = Math.max(temperature, family.minTemperature); + } + return temperature; +} + +// ── Mirror the server-manager detection ────────────────────────────────────── +function getServerFlags(modelFilePath) { + const lower = modelFilePath.toLowerCase(); + const isMistralFamily = lower.includes('mistral') || + lower.includes('magistral') || + lower.includes('mixtral'); + return isMistralFamily + ? 
{ flag: '--chat-template-kwargs', value: '{"reasoning_effort":"none"}' } + : { flag: '--chat-template-kwargs', value: '{"enable_thinking":false}' }; +} + +// ── Test harness ───────────────────────────────────────────────────────────── + +let passed = 0; +let failed = 0; + +function test(name, fn) { + try { + fn(); + console.log(` ✅ ${name}`); + passed++; + } catch (err) { + console.log(` ❌ ${name}: ${err.message}`); + failed++; + } +} + +function assert(condition, msg) { + if (!condition) throw new Error(msg || 'Assertion failed'); +} + +function assertDeepEqual(a, b, msg) { + const as = JSON.stringify(a), bs = JSON.stringify(b); + if (as !== bs) throw new Error(`${msg || 'Not equal'}: got ${as}, expected ${bs}`); +} + +// ── Tests ──────────────────────────────────────────────────────────────────── + +console.log('\n=== MODEL_FAMILIES / getModelApiParams ===\n'); + +// ── Mistral detection ───────────────────────────────────────────────────────── +test('Mistral-Small-4-119B GGUF → reasoning_effort:none', () => { + assertDeepEqual(getModelApiParams('Mistral-Small-4-119B-2603-UD-IQ1_M.gguf'), { reasoning_effort: 'none' }); +}); + +test('Mistral-Small-4 Q2_K_XL → reasoning_effort:none', () => { + assertDeepEqual(getModelApiParams('Mistral-Small-4-119B-2603-UD-Q2_K_XL.gguf'), { reasoning_effort: 'none' }); +}); + +test('Magistral model → reasoning_effort:none', () => { + assertDeepEqual(getModelApiParams('magistral-medium-2506.gguf'), { reasoning_effort: 'none' }); +}); + +test('Mixtral-8x7B → reasoning_effort:none', () => { + assertDeepEqual(getModelApiParams('Mixtral-8x7B-Instruct-v0.1.Q4_K_M.gguf'), { reasoning_effort: 'none' }); +}); + +test('Mistral cloud API model ID → reasoning_effort:none', () => { + assertDeepEqual(getModelApiParams('mistral-small-latest'), { reasoning_effort: 'none' }); +}); + +// ── Nemotron detection ──────────────────────────────────────────────────────── +test('Nemotron-4B → no extra apiParams', () => { + 
assertDeepEqual(getModelApiParams('NVIDIA-Nemotron-3-Nano-4B-Q4_K_M.gguf'), {}); +}); + +test('Nemotron-30B → no extra apiParams', () => { + assertDeepEqual(getModelApiParams('NVIDIA-Nemotron-3-Nano-30B-A3B-Q8_0.gguf'), {}); +}); + +test('Nemotron-30B → minTemperature = 1.0', () => { + const f = getModelFamily('NVIDIA-Nemotron-3-Nano-30B-A3B-Q8_0.gguf'); + assert(f.minTemperature === 1.0, `Expected 1.0, got ${f.minTemperature}`); +}); + +// ── LFM detection ───────────────────────────────────────────────────────────── +test('LFM2-24B → no extra apiParams', () => { + assertDeepEqual(getModelApiParams('LFM2-24B-A2B-Q8_0.gguf'), {}); +}); + +test('LFM2.5-1.2B → no extra apiParams', () => { + assertDeepEqual(getModelApiParams('LFM2.5-1.2B-Instruct-BF16.gguf'), {}); +}); + +test('LFM2-24B → minTemperature = 1.0', () => { + const f = getModelFamily('LFM2-24B-A2B-Q8_0.gguf'); + assert(f.minTemperature === 1.0, `Expected 1.0, got ${f.minTemperature}`); +}); + +// ── Non-matching: should get no family config ───────────────────────────────── +test('Qwen3.5-9B → no extra params (handled by prompt)', () => { + assertDeepEqual(getModelApiParams('Qwen3.5-9B-Q4_K_M.gguf'), {}); +}); + +test('GPT-5.4 → no extra params', () => { + assertDeepEqual(getModelApiParams('gpt-5.4-2026-03-05'), {}); +}); + +test('Empty model name → no extra params', () => { + assertDeepEqual(getModelApiParams(''), {}); +}); + +test('Undefined model name → no extra params', () => { + assertDeepEqual(getModelApiParams(undefined), {}); +}); + +// ── Temperature clamping ────────────────────────────────────────────────────── +console.log('\n=== Temperature clamping ===\n'); + +test('Nemotron + temp 0.1 → clamped to 1.0', () => { + const t = resolveTemperature('NVIDIA-Nemotron-3-Nano-30B-A3B-Q8_0.gguf', 0.1, false); + assert(t === 1.0, `Expected 1.0, got ${t}`); +}); + +test('LFM2 + temp 0.1 → clamped to 1.0', () => { + const t = resolveTemperature('LFM2-24B-A2B-Q8_0.gguf', 0.1, false); + assert(t === 1.0, 
`Expected 1.0, got ${t}`); +}); + +test('LFM2 + temp 0.7 (expectJSON) → clamped to 1.0', () => { + const t = resolveTemperature('LFM2-24B-A2B-Q8_0.gguf', 0.7, true); + assert(t === 1.0, `Expected 1.0, got ${t}`); +}); + +test('LFM2 + temp undefined + expectJSON → clamped from 0.7 to 1.0', () => { + const t = resolveTemperature('LFM2-24B-A2B-Q8_0.gguf', undefined, true); + assert(t === 1.0, `Expected 1.0, got ${t}`); +}); + +test('LFM2 + temp 1.5 → kept at 1.5 (above min)', () => { + const t = resolveTemperature('LFM2-24B-A2B-Q8_0.gguf', 1.5, false); + assert(t === 1.5, `Expected 1.5, got ${t}`); +}); + +test('Qwen + temp 0.1 → kept at 0.1 (no clamp)', () => { + const t = resolveTemperature('Qwen3.5-9B-Q4_K_M.gguf', 0.1, false); + assert(t === 0.1, `Expected 0.1, got ${t}`); +}); + +test('Mistral + temp 0.1 → kept at 0.1 (no minTemperature)', () => { + const t = resolveTemperature('Mistral-Small-4-119B-2603-UD-Q2_K_XL.gguf', 0.1, false); + assert(t === 0.1, `Expected 0.1, got ${t}`); +}); + +test('Qwen + temp undefined + no expectJSON → stays undefined', () => { + const t = resolveTemperature('Qwen3.5-9B-Q4_K_M.gguf', undefined, false); + assert(t === undefined, `Expected undefined, got ${t}`); +}); + +test('Nemotron + temp undefined + no expectJSON → stays undefined', () => { + const t = resolveTemperature('NVIDIA-Nemotron-3-Nano-30B-A3B-Q8_0.gguf', undefined, false); + assert(t === undefined, `Expected undefined, got ${t}`); +}); + +// ── Server-manager flags ───────────────────────────────────────────────────── +console.log('\n=== Server-manager startup flags ===\n'); + +test('Mistral GGUF path → chat-template-kwargs with reasoning_effort:none', () => { + const f = getServerFlags('/models/Mistral-Small-4-119B-2603-UD-IQ1_M.gguf'); + assert(f.flag === '--chat-template-kwargs', `Expected --chat-template-kwargs, got ${f.flag}`); + assert(f.value.includes('reasoning_effort'), `Expected reasoning_effort in value`); +}); + +test('Qwen path → chat-template-kwargs with 
enable_thinking:false', () => { + const f = getServerFlags('/models/Qwen3.5-9B-Q4_K_M.gguf'); + assert(f.flag === '--chat-template-kwargs'); + assert(f.value.includes('enable_thinking')); +}); + +test('Nemotron path → chat-template-kwargs (non-Mistral default)', () => { + const f = getServerFlags('/models/NVIDIA-Nemotron-3-Nano-30B-A3B-Q8_0.gguf'); + assert(f.flag === '--chat-template-kwargs'); +}); + +// ── Summary ────────────────────────────────────────────────────────────────── + +console.log(`\n${passed + failed} tests: ${passed} passed, ${failed} failed\n`); +process.exit(failed > 0 ? 1 : 0);