diff --git a/.beads/issues.jsonl b/.beads/issues.jsonl
index 7f02b1a..5866e43 100644
--- a/.beads/issues.jsonl
+++ b/.beads/issues.jsonl
@@ -13,5 +13,5 @@
 {"id":"openadapt-evals-hvm","title":"VL model fix PR #18 ready to merge","notes":"2026-02-08: openadapt-ml PR #18 was already merged on 2026-01-29. VL model fix is done.","status":"closed","priority":0,"issue_type":"task","owner":"richard.abrich@gmail.com","created_at":"2026-01-29T16:17:03.491938-05:00","created_by":"Richard Abrich","updated_at":"2026-02-08T12:55:19.233249-05:00","closed_at":"2026-02-08T12:55:19.233249-05:00","close_reason":"PR #18 already merged 2026-01-29"}
 {"id":"openadapt-evals-mx8","title":"Analyze evaluation results and publish findings","description":"After demo-conditioned evaluation completes, analyze results: success rates, failure modes, demo impact. Create data-driven roadmap for improvements.","status":"open","priority":1,"issue_type":"task","owner":"richard.abrich@gmail.com","created_at":"2026-02-14T12:23:06.328838-05:00","created_by":"Richard Abrich","updated_at":"2026-02-14T12:23:06.328838-05:00"}
 {"id":"openadapt-evals-sz4","title":"RCA: Windows product key prompt recurring issue","status":"closed","priority":0,"issue_type":"task","owner":"richard.abrich@gmail.com","created_at":"2026-01-20T18:59:36.266286-05:00","created_by":"Richard Abrich","updated_at":"2026-01-20T20:32:06.493102-05:00","closed_at":"2026-01-20T20:32:06.493102-05:00","close_reason":"RCA complete - root cause is VERSION mismatch (CLI=11, Dockerfile=11e). Fix documented in RECURRING_ISSUES.md and WINDOWS_PRODUCT_KEY_RCA.md"}
-{"id":"openadapt-evals-vcb","title":"Run demo-conditioned WAA evaluation","description":"Once demos are recorded, run WAA evaluation with demo-conditioned agents (RetrievalAugmentedAgent with real demos). Target: measure improvement over zero-shot baseline. Requires real demos from recording task.","notes":"Feb 24: Fixed Docker data-root (ephemeral /mnt -\u003e persistent /home/azureuser/docker) in PRs #37+#38 (v0.4.2). Verified pool-pause/resume cycle works. Created fresh pool (waa-pool-00, 172.173.66.131, D8ds_v4, centralus). Running zero-shot eval on 12 harder tasks with api-claude (claude-sonnet-4-5). 10/12 tasks completed, all scoring 0.00 as expected. Results in benchmark_results/zs_harder_12_zs_*. Next: complete ZS eval, then record demos + annotate + run DC eval.","status":"open","priority":0,"issue_type":"task","owner":"richard.abrich@gmail.com","created_at":"2026-02-14T12:23:04.624305-05:00","created_by":"Richard Abrich","updated_at":"2026-02-24T13:51:58.517565-05:00"}
+{"id":"openadapt-evals-vcb","title":"Run demo-conditioned WAA evaluation","description":"Once demos are recorded, run WAA evaluation with demo-conditioned agents (RetrievalAugmentedAgent with real demos). Target: measure improvement over zero-shot baseline. Requires real demos from recording task.","notes":"Feb 24: Fixed Docker data-root (ephemeral /mnt -\u003e persistent /home/azureuser/docker) in PRs #37+#38 (v0.4.2). Verified pool-pause/resume cycle works. Created fresh pool (waa-pool-00, 172.173.66.131, D8ds_v4, centralus). Running zero-shot eval on 12 harder tasks with api-claude (claude-sonnet-4-5). 10/12 tasks completed, all scoring 0.00 as expected. Results in benchmark_results/zs_harder_12_zs_*. Next: complete ZS eval, then record demos + annotate + run DC eval.","status":"open","priority":0,"issue_type":"task","owner":"richard.abrich@gmail.com","created_at":"2026-02-14T12:23:04.624305-05:00","created_by":"Richard Abrich","updated_at":"2026-02-24T16:31:54.323876-05:00"}
 {"id":"openadapt-evals-wis","title":"Add pre-flight check to detect Windows install issues","status":"closed","priority":1,"issue_type":"task","owner":"richard.abrich@gmail.com","created_at":"2026-01-20T18:59:36.865052-05:00","created_by":"Richard Abrich","updated_at":"2026-01-20T20:32:06.757261-05:00","closed_at":"2026-01-20T20:32:06.757261-05:00","close_reason":"Duplicate of openadapt-evals-0dt"}
diff --git a/openadapt_evals/adapters/base.py b/openadapt_evals/adapters/base.py
index ce57554..f67c976 100644
--- a/openadapt_evals/adapters/base.py
+++ b/openadapt_evals/adapters/base.py
@@ -116,7 +116,7 @@ class BenchmarkAction:
         raw_action: Original benchmark action (lossless).
     """
 
-    type: str  # "click", "type", "scroll", "key", "drag", "answer", "done"
+    type: str  # "click", "type", "scroll", "key", "drag", "answer", "done", "error"
 
     # Pointer actions - coordinates
     x: float | None = None  # Normalized [0,1] or pixel
@@ -176,6 +176,7 @@ class BenchmarkResult:
     # Diagnostics
     error: str | None = None
     reason: str | None = None  # Why success/fail
+    error_type: str | None = None  # "infrastructure", "agent", "evaluation", or None
 
     # Timing
     total_time_seconds: float = 0.0
diff --git a/openadapt_evals/adapters/waa/live.py b/openadapt_evals/adapters/waa/live.py
index c58c98b..9f4a27b 100644
--- a/openadapt_evals/adapters/waa/live.py
+++ b/openadapt_evals/adapters/waa/live.py
@@ -431,9 +431,9 @@ def step(
         # Wait for UI to settle
         time.sleep(self.config.action_delay)
 
-        # Check if done
+        # Check if done (error actions are also terminal)
         done = (
-            action.type == "done" or
+            action.type in ("done", "error") or
             self._step_count >= self.config.max_steps
         )
 
@@ -530,11 +530,14 @@ def evaluate(self, task: BenchmarkTask) -> BenchmarkResult:
                 score=0.0,
                 num_steps=self._step_count,
                 reason="Evaluation timed out",
+                error_type="infrastructure",
             )
 
         except requests.RequestException as e:
             logger.error(f"Evaluation request error: {e}")
-            return self._evaluate_fallback(task)
+            result = self._evaluate_fallback(task)
+            result.error_type = "infrastructure"
+            return result
 
     def _evaluate_fallback(self, task: BenchmarkTask) -> BenchmarkResult:
         """Fallback when proper evaluation unavailable - returns failure.
@@ -602,7 +605,7 @@ def _get_observation(self) -> BenchmarkObservation:
         try:
             resp = requests.get(
                 f"{self.config.server_url}/screenshot",
-                timeout=30.0
+                timeout=self.config.timeout
             )
             if resp.status_code == 200:
                 screenshot = resp.content
@@ -626,7 +629,7 @@ def _get_observation(self) -> BenchmarkObservation:
             resp = requests.get(
                 f"{self.config.server_url}/accessibility",
                 params={"backend": self.config.a11y_backend},
-                timeout=30.0
+                timeout=self.config.timeout
             )
             if resp.status_code == 200:
                 result = resp.json()
@@ -856,7 +859,7 @@ def _translate_action(self, action: BenchmarkAction) -> str | None:
             Python command string to execute via /execute_windows endpoint,
             or None for actions that don't need execution.
         """
-        if action.type == "done":
+        if action.type in ("done", "error"):
             return None
 
         if action.type == "wait":
diff --git a/openadapt_evals/adapters/waa/mock.py b/openadapt_evals/adapters/waa/mock.py
index 550c2c0..60f80fa 100644
--- a/openadapt_evals/adapters/waa/mock.py
+++ b/openadapt_evals/adapters/waa/mock.py
@@ -640,7 +640,7 @@ def step(
         if action.type == "type" and action.text:
             self._text_entered = action.text
 
-        done = action.type == "done" or self._step_count >= 15
+        done = action.type in ("done", "error") or self._step_count >= 15
         return self._mock_observation(), done, {"step": self._step_count}
 
     def evaluate(self, task: BenchmarkTask) -> BenchmarkResult:
@@ -700,13 +700,11 @@ def evaluate(self, task: BenchmarkTask) -> BenchmarkResult:
         # Success criteria:
         # 1. Clicked Submit (ID 4) - primary success path
         # 2. Typed something AND clicked OK (ID 1) - form submission path
-        # 3. Called DONE after at least 2 actions - reasonable completion
         clicked_submit = "4" in clicked_ids
         clicked_ok = "1" in clicked_ids
         form_submitted = typed_text and clicked_ok
-        reasonable_completion = called_done and len(self._actions) >= 2
 
-        success = clicked_submit or form_submitted or reasonable_completion
+        success = clicked_submit or form_submitted
 
         # Calculate partial credit score
         score = 0.0
diff --git a/openadapt_evals/agents/claude_computer_use_agent.py b/openadapt_evals/agents/claude_computer_use_agent.py
index 194675b..bcb807e 100644
--- a/openadapt_evals/agents/claude_computer_use_agent.py
+++ b/openadapt_evals/agents/claude_computer_use_agent.py
@@ -160,6 +160,13 @@ def act(
         self._step_count += 1
         screenshot_b64 = self._encode_screenshot(observation)
 
+        if screenshot_b64 is None:
+            logger.warning("No screenshot available from environment")
+            return BenchmarkAction(
+                type="error",
+                raw_action={"reason": "no_screenshot", "error_type": "infrastructure"},
+            )
+
         if self._step_count == 1:
             # First step: send task instruction + initial screenshot
             self._messages = self._build_initial_messages(
@@ -179,7 +186,8 @@ def act(
             response = self._call_api()
             if response is None:
                 return BenchmarkAction(
-                    type="done", raw_action={"error": "API call failed"}
+                    type="error",
+                    raw_action={"reason": "api_call_failed", "error_type": "infrastructure"},
                 )
 
             # Add assistant response to conversation
@@ -205,14 +213,14 @@ def act(
             # Real action — return to runner
             return self._process_response(response, observation)
 
-        # Exhausted retries on screenshot/wait — return done
+        # Exhausted retries on screenshot/wait — return error (not done)
         logger.warning(
             f"Exhausted {self.MAX_INTERNAL_RETRIES} internal retries on "
             "screenshot/wait actions"
         )
         return BenchmarkAction(
-            type="done",
-            raw_action={"reason": "max_internal_retries_exceeded"},
+            type="error",
+            raw_action={"reason": "max_internal_retries_exceeded", "error_type": "infrastructure"},
         )
 
     def _build_initial_messages(
diff --git a/openadapt_evals/agents/policy_agent.py b/openadapt_evals/agents/policy_agent.py
index 7284397..8bc2fda 100644
--- a/openadapt_evals/agents/policy_agent.py
+++ b/openadapt_evals/agents/policy_agent.py
@@ -34,42 +34,52 @@ class PolicyAgent(BenchmarkAgent):
     benchmark evaluation. The model is expected to be trained using
     the openadapt-ml training pipeline.
 
+    Prompt format is aligned with convert_demos.py training data.
+
     Args:
-        checkpoint_path: Path to model checkpoint.
-        model_name: Name of the model architecture (default: qwen3-vl).
+        checkpoint_path: Path to LoRA adapter weights.
+        model_name: HuggingFace model name (must contain 'Qwen3-VL' or 'Qwen2.5-VL').
         device: Device to run on ('cuda' or 'cpu').
-        use_accessibility_tree: Whether to include a11y tree in prompts.
+        use_thinking: Whether to include <think> instruction in prompts.
     """
 
     def __init__(
         self,
         checkpoint_path: str | None = None,
-        model_name: str = "qwen3-vl",
+        model_name: str = "Qwen/Qwen3-VL-8B-Instruct",
         device: str = "cuda",
-        use_accessibility_tree: bool = True,
+        use_thinking: bool = True,
     ):
         self.checkpoint_path = checkpoint_path
         self.model_name = model_name
         self.device = device
-        self.use_accessibility_tree = use_accessibility_tree
+        self.use_thinking = use_thinking
 
         # Lazy load model to avoid import overhead
         self._model = None
         self._processor = None
+        self._previous_actions: list[str] = []
+        self._temp_files: list[str] = []
 
     def _load_model(self) -> None:
-        """Load the model from checkpoint."""
+        """Load the model adapter from checkpoint."""
         if self._model is not None:
             return
 
         try:
-            # Import from openadapt-ml
-            from openadapt_ml.vlm import load_model_and_processor
-
-            self._model, self._processor = load_model_and_processor(
+            import torch
+            from openadapt_ml.models.qwen_vl import QwenVLAdapter
+
+            device = torch.device(self.device) if isinstance(self.device, str) else self.device
+            lora_config = (
+                {"weights_path": self.checkpoint_path}
+                if self.checkpoint_path
+                else None
+            )
+            self._model = QwenVLAdapter.from_pretrained(
                 model_name=self.model_name,
-                checkpoint_path=self.checkpoint_path,
-                device=self.device,
+                lora_config=lora_config,
+                device=device,
             )
             logger.info(f"PolicyAgent loaded model from {self.checkpoint_path}")
         except ImportError as e:
@@ -99,13 +109,17 @@ def act(
         # Ensure model is loaded
         self._load_model()
 
-        # Build prompt
+        # Build prompt (aligned with training format)
         prompt = self._build_prompt(observation, task, history)
 
         # Get model prediction
         try:
             response = self._run_inference(observation, prompt)
             action = self._parse_response(response, observation)
+
+            # Track action in training format for "Previous actions" section
+            self._previous_actions.append(self._format_action_qwen(action))
+
             return action
         except Exception as e:
             logger.error(f"Inference failed: {e}")
@@ -117,7 +131,19 @@ def _build_prompt(
         task: BenchmarkTask,
         history: list[tuple[BenchmarkObservation, BenchmarkAction]] | None = None,
     ) -> str:
-        """Build prompt for the model.
+        """Build user-turn text aligned with convert_demos.convert_step.
+
+        Format matches training data exactly::
+
+            <image>
+            Instruction: {instruction}
+
+            Previous actions:
+              Step 0: {action}
+              Step 1: {action}
+
+            First reason about what you see in <think>...</think> tags,
+            then output exactly one action.
 
         Args:
             observation: Current observation.
@@ -127,59 +153,112 @@ def _build_prompt(
         Returns:
             Prompt string.
         """
-        parts = [f"TASK: {task.instruction}"]
-
-        # Add context
-        if observation.window_title:
-            parts.append(f"Current window: {observation.window_title}")
-
-        # Add accessibility tree if enabled
-        if self.use_accessibility_tree and observation.accessibility_tree:
-            from openadapt_evals.agents.base import format_accessibility_tree
-            tree_str = format_accessibility_tree(observation.accessibility_tree)
-            if len(tree_str) > 4000:
-                tree_str = tree_str[:4000] + "\n... (truncated)"
-            parts.append(f"UI Elements:\n{tree_str}")
-
-        # Add history
-        if history:
-            from openadapt_evals.agents.base import action_to_string
-            history_str = "\n".join(
-                f"Step {i+1}: {action_to_string(action)}"
-                for i, (_, action) in enumerate(history[-5:])
+        parts = ["<image>"]
+        parts.append(f"Instruction: {task.instruction}")
+
+        # Previous actions (matches training format)
+        if self._previous_actions:
+            parts.append("")
+            parts.append("Previous actions:")
+            for i, act in enumerate(self._previous_actions):
+                parts.append(f"  Step {i}: {act}")
+
+        # Tail instruction
+        parts.append("")
+        if self.use_thinking:
+            parts.append(
+                "First reason about what you see in <think>...</think> "
+                "tags, then output exactly one action."
             )
-            parts.append(f"Previous actions:\n{history_str}")
+        else:
+            parts.append("Output exactly one action.")
+
+        return "\n".join(parts)
 
-        parts.append("\nWhat is the next action?")
-        return "\n\n".join(parts)
+    @staticmethod
+    def _format_action_qwen(action: BenchmarkAction) -> str:
+        """Format action matching convert_demos._format_action_qwen training format.
+
+        Uses [0, 1000] coordinate range and lowercase function-call style
+        to match what the model was trained on.
+        """
+        def _to_1000(v: float | None) -> int:
+            return round((v or 0.0) * 1000)
+
+        if action.type == "click":
+            return f"click(x={_to_1000(action.x)}, y={_to_1000(action.y)})"
+        if action.type == "double_click":
+            return f"double_click(x={_to_1000(action.x)}, y={_to_1000(action.y)})"
+        if action.type == "right_click":
+            return f"right_click(x={_to_1000(action.x)}, y={_to_1000(action.y)})"
+        if action.type == "type":
+            return f'type(text="{action.text or ""}")'
+        if action.type == "key":
+            keys = (action.modifiers or []) + ([action.key] if action.key else [])
+            keys_fmt = ", ".join(f'"{k}"' for k in keys)
+            return f"press(keys=[{keys_fmt}])"
+        if action.type == "scroll":
+            return f'scroll(direction="{action.scroll_direction or "down"}", amount=3)'
+        if action.type == "drag":
+            return (
+                f"drag(from_coord=[{_to_1000(action.x)}, {_to_1000(action.y)}], "
+                f"to_coord=[{_to_1000(action.end_x)}, {_to_1000(action.end_y)}])"
+            )
+        if action.type == "done":
+            return "finished()"
+        return f"# unknown: {action.type}"
+
+    def _build_sample(self, observation: BenchmarkObservation, prompt: str) -> dict:
+        """Build SFT-style sample matching training format from convert_demos.py.
+
+        NOTE: No system message is included here because
+        ``QwenVLAdapter.generate()`` only extracts the user role message
+        and drops any system role.  The model was trained under the same
+        conditions (no system prompt), so omitting it at inference keeps
+        behaviour consistent.
+
+        Args:
+            observation: Observation with screenshot.
+            prompt: User-turn prompt text (from _build_prompt).
+
+        Returns:
+            SFT sample dict with messages and optional images.
+        """
+        sample = {
+            "messages": [
+                {"role": "user", "content": prompt},
+            ],
+        }
+        if observation.screenshot_path:
+            sample["images"] = [observation.screenshot_path]
+        return sample
 
     def _run_inference(self, observation: BenchmarkObservation, prompt: str) -> str:
-        """Run model inference.
+        """Run model inference using SFT-style message format.
 
         Args:
             observation: Observation with screenshot.
-            prompt: Prompt text.
+            prompt: User-turn prompt text.
 
         Returns:
             Model response text.
         """
-        from openadapt_ml.vlm import run_inference
-
-        # Get screenshot as PIL Image
-        if observation.screenshot:
-            from PIL import Image
-            from io import BytesIO
-            image = Image.open(BytesIO(observation.screenshot))
-        else:
+        if not observation.screenshot and not observation.screenshot_path:
             raise ValueError("No screenshot in observation")
 
-        response = run_inference(
-            model=self._model,
-            processor=self._processor,
-            image=image,
-            prompt=prompt,
-            device=self.device,
-        )
+        sample = self._build_sample(observation, prompt)
+
+        # If screenshot_path is missing but bytes are available, write to temp file
+        if "images" not in sample and observation.screenshot:
+            import tempfile
+            tmp = tempfile.NamedTemporaryFile(suffix=".png", delete=False)
+            tmp.write(observation.screenshot)
+            tmp.close()
+            sample["images"] = [tmp.name]
+            self._temp_files.append(tmp.name)
+
+        # Use the adapter's generate method (works with both local and remote)
+        response = self._model.generate(sample)
         return response
 
     def _parse_response(
@@ -187,6 +266,12 @@ def _parse_response(
     ) -> BenchmarkAction:
         """Parse model response into BenchmarkAction.
 
+        Uses ``parse_qwen_action`` from ``qwen3vl_agent`` which handles the
+        lowercase keyword format the trained model outputs (e.g.
+        ``click(x=500, y=300)``).  The base ``parse_action_response`` only
+        recognises UPPERCASE format (``CLICK(0.5, 0.3)``), so every Qwen
+        model output would silently fall through to ``type="done"``.
+
         Args:
             response: Model response text.
             observation: Observation for coordinate normalization.
@@ -194,10 +279,17 @@ def _parse_response(
         Returns:
             Parsed action.
         """
-        from openadapt_evals.agents.base import parse_action_response
-        return parse_action_response(response, observation)
+        from openadapt_evals.agents.qwen3vl_agent import parse_qwen_action
+        return parse_qwen_action(response, observation.viewport)
 
     def reset(self) -> None:
         """Reset agent state between tasks."""
-        # Nothing to reset for policy agent
-        pass
+        import os
+
+        self._previous_actions = []
+        for path in self._temp_files:
+            try:
+                os.unlink(path)
+            except OSError:
+                pass
+        self._temp_files = []
diff --git a/openadapt_evals/agents/qwen3vl_agent.py b/openadapt_evals/agents/qwen3vl_agent.py
index a459ba2..fc95a02 100644
--- a/openadapt_evals/agents/qwen3vl_agent.py
+++ b/openadapt_evals/agents/qwen3vl_agent.py
@@ -204,7 +204,7 @@ def parse_qwen_action(
     # --- wait() ---
     if _RE_WAIT.search(action_str):
         raw["is_wait"] = True
-        return BenchmarkAction(type="done", raw_action=raw)
+        return BenchmarkAction(type="wait", raw_action=raw)
 
     # --- double_click(x=<int>, y=<int>) --- (check before click)
     m = _RE_DOUBLE_CLICK.search(action_str)
@@ -734,8 +734,9 @@ def _run_remote_inference(
                 with open(adapter_config) as f:
                     cfg = json.load(f)
                 base_model = cfg.get("base_model_name_or_path", DEFAULT_MODEL)
-                # Adapter is assumed to be uploaded to volume at /training/results/final
-                adapter_path = "/training/results/final"
+                # Adapter is uploaded to volume at /adapter by upload_adapter_to_volume()
+                # Volume mounts at /training, so full path is /training/adapter
+                adapter_path = "/training/adapter"
             else:
                 base_model = self.model_path
 
diff --git a/openadapt_evals/benchmarks/runner.py b/openadapt_evals/benchmarks/runner.py
index ab8a8e6..d295929 100644
--- a/openadapt_evals/benchmarks/runner.py
+++ b/openadapt_evals/benchmarks/runner.py
@@ -292,6 +292,7 @@ def _run_single_task(
 
         done = False
         steps = 0
+        action = None
         max_steps = task.time_limit_steps or config.max_steps
 
         while not done and steps < max_steps:
@@ -337,8 +338,11 @@ def _run_single_task(
                 config.on_step(obs, action, steps)
 
             # Check for terminal action
-            if action.type == "done":
-                logger.info(f"Step {steps}: Agent signaled task completion")
+            if action.type in ("done", "error"):
+                if action.type == "error":
+                    logger.error(f"Step {steps}: Agent error: {action.raw_action}")
+                else:
+                    logger.info(f"Step {steps}: Agent signaled task completion")
                 done = True
                 break
 
@@ -365,6 +369,11 @@ def _run_single_task(
         logger.info("Evaluating task result")
         result = adapter.evaluate(task)
 
+        # Propagate error_type from agent error action
+        if action is not None and action.type == "error" and action.raw_action:
+            result.error_type = action.raw_action.get("error_type", "agent")
+            result.error = result.error or action.raw_action.get("reason")
+
         # Update result with trajectory info
         result.steps = history if config.save_trajectories else []
         result.num_steps = steps
diff --git a/openadapt_evals/infrastructure/pool.py b/openadapt_evals/infrastructure/pool.py
index d64f055..05c8a67 100644
--- a/openadapt_evals/infrastructure/pool.py
+++ b/openadapt_evals/infrastructure/pool.py
@@ -134,7 +134,15 @@ class PoolRunResult:
 WAA_START_SCRIPT = """
 # Check if container already running
 if docker ps --format '{{.Names}}' | grep -q '^winarena$'; then
-    echo "ALREADY_RUNNING"
+    # Container is up, but socat proxy may be dead (e.g. after docker restart).
+    # Ensure the evaluate-server proxy is running.
+    if ! pgrep -f 'socat.*5051' >/dev/null 2>&1; then
+        which socat >/dev/null 2>&1 || sudo DEBIAN_FRONTEND=noninteractive apt-get install -y -qq socat
+        nohup socat TCP-LISTEN:5051,fork,reuseaddr EXEC:"docker exec -i winarena socat - TCP\\:127.0.0.1\\:5050" > /dev/null 2>&1 &
+        echo "ALREADY_RUNNING (socat proxy restarted)"
+    else
+        echo "ALREADY_RUNNING"
+    fi
     exit 0
 fi
 
diff --git a/openadapt_evals/server/evaluate_endpoint.py b/openadapt_evals/server/evaluate_endpoint.py
index 6126838..747a676 100644
--- a/openadapt_evals/server/evaluate_endpoint.py
+++ b/openadapt_evals/server/evaluate_endpoint.py
@@ -365,6 +365,14 @@ def evaluate_task_state(
             "success": score >= 1.0,
         })
 
+    # Guard empty metric_results (e.g. func_spec was [])
+    if not metric_results:
+        return {
+            "success": False,
+            "score": 0.0,
+            "reason": "No metrics could be computed (empty func spec)",
+        }
+
     # Combine results based on conjunction
     if conjunction == "or":
         final_score = max(r["score"] for r in metric_results)
diff --git a/scripts/run_dc_eval.py b/scripts/run_dc_eval.py
new file mode 100644
index 0000000..0001123
--- /dev/null
+++ b/scripts/run_dc_eval.py
@@ -0,0 +1,317 @@
+#!/usr/bin/env python3
+"""Run demo-conditioned evaluation with auto-recovery.
+
+Runs each task individually with its matching demo file. Includes:
+- SSH tunnel health check + reconnect before each task
+- WAA server health check with container restart if dead
+- Retry on transient failures
+
+Usage:
+    # 3-task validation with ClaudeComputerUseAgent
+    python scripts/run_dc_eval.py \
+      --agent api-claude-cu \
+      --tasks 0e763496,70745df8,fba2c100
+
+    # All 12 tasks
+    python scripts/run_dc_eval.py --agent api-claude-cu
+"""
+
+from __future__ import annotations
+
+import argparse
+import subprocess
+import sys
+import time
+from pathlib import Path
+
+import requests
+
+HARDER_TASK_IDS = [
+    "04d9aeaf-7bed-4024-bedb-e10e6f00eb7f-WOS",
+    "0a0faba3-5580-44df-965d-f562a99b291c-WOS",
+    "0bf05a7d-b28b-44d2-955a-50b41e24012a-WOS",
+    "0e763496-b6bb-4508-a427-fad0b6c3e195-WOS",
+    "4bcb1253-a636-4df4-8cb0-a35c04dfef31-WOS",
+    "70745df8-f2f5-42bd-8074-fbc10334fcc5-2-WOS",
+    "8b1ce5f2-59d2-4dcc-b0b0-666a714b9a14-WOS",
+    "e2b5e914-ffe1-44d2-8e92-58f8c5d92bb2-WOS",
+    "ec71221e-ac43-46f9-89b8-ee7d80f7e1c5-WOS",
+    "fba2c100-79e8-42df-ae74-b592418d54f4-WOS",
+    "INF-0d95d28a-9587-433b-a805-1fbe5467d598-WOS",
+    "INF-5ac2891a-eacd-4954-b339-98abba077adb-WOS",
+]
+
+
+def short_id(task_id: str) -> str:
+    return task_id[:8]
+
+
+def _kill_tunnels():
+    subprocess.run(
+        "ps aux | grep 'ssh.*5001' | grep -v grep | awk '{print $2}' | xargs kill 2>/dev/null",
+        shell=True, capture_output=True,
+    )
+
+
+def _start_tunnel(vm_user: str, vm_ip: str) -> bool:
+    cmd = [
+        "ssh", "-f", "-N",
+        "-o", "StrictHostKeyChecking=no",
+        "-o", "ServerAliveInterval=15",
+        "-o", "ServerAliveCountMax=3",
+        "-o", "TCPKeepAlive=yes",
+        "-o", "ExitOnForwardFailure=yes",
+        "-L", "5001:localhost:5000",
+        "-L", "5050:localhost:5051",
+        "-L", "8006:localhost:8006",
+        f"{vm_user}@{vm_ip}",
+    ]
+    result = subprocess.run(cmd, capture_output=True)
+    return result.returncode == 0
+
+
+def _probe(server: str, timeout: int = 10) -> bool:
+    try:
+        resp = requests.get(f"{server}/probe", timeout=timeout)
+        return resp.ok
+    except Exception:
+        return False
+
+
+def _setup_eval_proxy(vm_user: str, vm_ip: str) -> bool:
+    """(Re-)establish socat proxy for the evaluate server on the VM.
+
+    Docker port forwarding for port 5050 is broken due to QEMU's custom
+    bridge networking (--cap-add NET_ADMIN).  Work around it by running
+    socat on the VM host that pipes through ``docker exec`` into the
+    container's localhost:5050.  The SSH tunnel maps local 5050 → VM 5051.
+    """
+    script = (
+        "killall socat 2>/dev/null || true; sleep 1; "
+        "which socat >/dev/null 2>&1 "
+        "|| sudo DEBIAN_FRONTEND=noninteractive apt-get install -y -qq socat; "
+        "nohup socat TCP-LISTEN:5051,fork,reuseaddr "
+        "'EXEC:docker exec -i winarena socat - TCP\\:127.0.0.1\\:5050' "
+        "</dev/null >/dev/null 2>&1 &"
+    )
+    result = subprocess.run(
+        ["ssh", "-o", "StrictHostKeyChecking=no", f"{vm_user}@{vm_ip}", script],
+        capture_output=True, timeout=30,
+    )
+    if result.returncode != 0:
+        print(f"  socat proxy setup failed: {result.stderr.decode()}")
+        return False
+    print("  socat proxy for evaluate server established (VM:5051 → container:5050)")
+    return True
+
+
+def _restart_container(vm_user: str, vm_ip: str) -> bool:
+    """Restart the WAA container and re-establish the evaluate proxy."""
+    print("  Restarting WAA container (Windows will reboot)...")
+    result = subprocess.run(
+        ["ssh", "-o", "StrictHostKeyChecking=no", f"{vm_user}@{vm_ip}",
+         "docker restart winarena"],
+        capture_output=True, timeout=120,
+    )
+    if result.returncode != 0:
+        print(f"  Container restart failed: {result.stderr.decode()}")
+        return False
+    print("  Container restarted, re-establishing evaluate proxy...")
+    _setup_eval_proxy(vm_user, vm_ip)
+    return True
+
+
+def ensure_waa_ready(
+    server: str,
+    vm_user: str,
+    vm_ip: str,
+    max_wait: int = 420,
+    evaluate_url: str | None = None,
+) -> bool:
+    """Ensure WAA is reachable, recovering via tunnel reconnect or container restart.
+
+    Recovery sequence:
+    1. Probe → OK: return True
+    2. Reconnect tunnel → Probe → OK: return True
+    3. Restart container → Reconnect tunnel → Wait for probe: return True/False
+
+    If evaluate_url is provided, both the WAA server and evaluate server must respond.
+    """
+    # Step 1: Quick probe
+    if _probe(server) and (evaluate_url is None or _probe(evaluate_url)):
+        return True
+
+    # Step 2: Reconnect tunnel + ensure socat proxy
+    print("  WAA unreachable, reconnecting tunnel...")
+    _kill_tunnels()
+    time.sleep(1)
+    _setup_eval_proxy(vm_user, vm_ip)
+    if _start_tunnel(vm_user, vm_ip):
+        time.sleep(3)
+        if _probe(server) and (evaluate_url is None or _probe(evaluate_url)):
+            print("  Tunnel reconnected, WAA ready!")
+            return True
+
+    # Step 3: Tunnel up but WAA not responding → container restart
+    print("  Tunnel OK but WAA server dead, restarting container...")
+    _kill_tunnels()
+    if not _restart_container(vm_user, vm_ip):
+        return False
+
+    # Wait for Windows to boot + Flask server to start
+    time.sleep(10)
+    _kill_tunnels()
+    time.sleep(1)
+    if not _start_tunnel(vm_user, vm_ip):
+        print("  Failed to reconnect tunnel after restart")
+        return False
+
+    deadline = time.time() + max_wait
+    last_print = 0
+    while time.time() < deadline:
+        elapsed = int(time.time() - (deadline - max_wait))
+        if elapsed - last_print >= 15:
+            print(f"  [{elapsed}s] Waiting for WAA server...")
+            last_print = elapsed
+        if _probe(server, timeout=10) and (evaluate_url is None or _probe(evaluate_url, timeout=10)):
+            print(f"  WAA ready after {elapsed}s!")
+            return True
+        time.sleep(10)
+
+    print(f"  TIMEOUT: WAA not ready after {max_wait}s")
+    return False
+
+
+def main() -> int:
+    parser = argparse.ArgumentParser(description="Run DC eval with auto-recovery")
+    parser.add_argument("--agent", default="api-claude-cu", help="Agent type")
+    parser.add_argument("--demo-dir", default="annotated_demos", help="Demo directory")
+    parser.add_argument("--server", default="http://localhost:5001")
+    parser.add_argument("--evaluate-url", default="http://localhost:5050")
+    parser.add_argument("--max-steps", type=int, default=15)
+    parser.add_argument("--output", default="benchmark_results")
+    parser.add_argument("--tasks", help="Comma-separated task IDs or prefixes (default: all 12)")
+    parser.add_argument("--start-from", type=int, default=0, help="Task index to start from")
+    parser.add_argument("--vm-ip", default="172.173.66.131", help="VM IP")
+    parser.add_argument("--vm-user", default="azureuser", help="VM SSH user")
+    parser.add_argument("--zs-only", action="store_true", help="Run zero-shot only (no demo)")
+    parser.add_argument("--dc-only", action="store_true", help="Run demo-conditioned only")
+    args = parser.parse_args()
+
+    demo_dir = Path(args.demo_dir)
+    output_dir = Path(args.output)
+
+    # Resolve task IDs (support prefix matching)
+    if args.tasks:
+        raw_ids = [t.strip() for t in args.tasks.split(",")]
+        task_ids = []
+        for raw in raw_ids:
+            matches = [tid for tid in HARDER_TASK_IDS if tid.startswith(raw)]
+            if len(matches) == 1:
+                task_ids.append(matches[0])
+            elif len(matches) > 1:
+                print(f"Ambiguous prefix '{raw}': {matches}")
+                return 1
+            else:
+                task_ids.append(raw)  # Use as-is
+    else:
+        task_ids = HARDER_TASK_IDS
+
+    # Build run conditions
+    conditions = []  # (task_id, run_name, demo_path_or_None)
+    for tid in task_ids:
+        sid = short_id(tid)
+        if not args.dc_only:
+            conditions.append((tid, f"val_zs_{sid}", None))
+        if not args.zs_only:
+            demo_path = demo_dir / f"{tid}.txt"
+            if not demo_path.exists():
+                demo_path = demo_dir / f"{tid}.json"
+            if not demo_path.exists():
+                print(f"WARNING: No demo for {tid}, skipping DC condition")
+                continue
+            conditions.append((tid, f"val_dc_{sid}", demo_path))
+
+    print(f"Eval: {len(conditions)} runs ({len(task_ids)} tasks) with {args.agent}")
+    print(f"VM: {args.vm_ip}")
+    print()
+
+    # Verify initial WAA health
+    if not ensure_waa_ready(args.server, args.vm_user, args.vm_ip, evaluate_url=args.evaluate_url):
+        print("ERROR: Cannot reach WAA server or evaluate server")
+        return 1
+
+    results = {}
+    start_time = time.time()
+
+    for i, (tid, run_name, demo_path) in enumerate(conditions):
+        if i < args.start_from:
+            print(f"[{i+1}/{len(conditions)}] Skipping {run_name}")
+            continue
+
+        # Health check before each run
+        if not ensure_waa_ready(args.server, args.vm_user, args.vm_ip, evaluate_url=args.evaluate_url):
+            print(f"  Skipping {run_name} - WAA unreachable after recovery")
+            results[run_name] = {"status": "SKIP", "returncode": -1, "elapsed_s": 0}
+            continue
+
+        cond_label = "DC" if demo_path else "ZS"
+        print(f"{'=' * 60}")
+        print(f"[{i+1}/{len(conditions)}] {cond_label}: {tid}")
+        if demo_path:
+            print(f"  Demo: {demo_path.name} ({demo_path.stat().st_size} bytes)")
+        print(f"  Run: {run_name}")
+        print(f"{'=' * 60}")
+
+        task_start = time.time()
+
+        cmd = [
+            sys.executable, "-m", "openadapt_evals.benchmarks.cli",
+            "run",
+            "--agent", args.agent,
+            "--tasks", tid,
+            "--server", args.server,
+            "--evaluate-url", args.evaluate_url,
+            "--max-steps", str(args.max_steps),
+            "--output", str(output_dir),
+            "--run-name", run_name,
+        ]
+        if demo_path:
+            cmd.extend(["--demo", str(demo_path.resolve())])
+
+        result = subprocess.run(cmd)
+        elapsed = time.time() - task_start
+
+        status = "OK" if result.returncode == 0 else f"FAIL (rc={result.returncode})"
+        results[run_name] = {
+            "status": status,
+            "returncode": result.returncode,
+            "elapsed_s": elapsed,
+            "task_id": tid,
+            "condition": cond_label,
+        }
+
+        print(f"\n  -> {status} ({elapsed:.0f}s)\n")
+
+    # Summary
+    total_time = time.time() - start_time
+    ok = sum(1 for r in results.values() if r["returncode"] == 0)
+
+    print(f"\n{'=' * 60}")
+    print(f"EVAL SUMMARY ({args.agent})")
+    print(f"{'=' * 60}")
+    print(f"  Runs: {ok}/{len(results)} completed")
+    print(f"  Total time: {total_time:.0f}s ({total_time/60:.1f}min)")
+    print()
+
+    for name, r in results.items():
+        cond = r.get("condition", "?")
+        print(f"  {name:30s}  {cond:2s}  {r['status']:15s}  {r['elapsed_s']:.0f}s")
+
+    print(f"\n  Results in: {output_dir}/val_*/")
+
+    return 0 if ok == len(results) else 1
+
+
+if __name__ == "__main__":
+    raise SystemExit(main())
diff --git a/tests/test_claude_computer_use_agent.py b/tests/test_claude_computer_use_agent.py
index 0953124..a56325b 100644
--- a/tests/test_claude_computer_use_agent.py
+++ b/tests/test_claude_computer_use_agent.py
@@ -228,8 +228,8 @@ def test_no_tool_use_returns_done(self, agent, mock_anthropic_client):
         assert action.type == "done"
         assert "no_tool_use" in action.raw_action.get("reason", "")
 
-    def test_screenshot_action_returns_done(self, agent, mock_anthropic_client):
-        """Screenshot action maps to done (next step sends screenshot back)."""
+    def test_screenshot_action_returns_error_on_exhaustion(self, agent, mock_anthropic_client):
+        """Screenshot action returns error after exhausting retries."""
         response = create_mock_response(
             create_tool_use_block("screenshot")
         )
@@ -237,10 +237,11 @@ def test_screenshot_action_returns_done(self, agent, mock_anthropic_client):
 
         action = agent.act(make_observation(), make_task())
 
-        assert action.type == "done"
+        assert action.type == "error"
+        assert action.raw_action["reason"] == "max_internal_retries_exceeded"
 
-    def test_wait_action_returns_done(self, agent, mock_anthropic_client):
-        """Wait action maps to done."""
+    def test_wait_action_returns_error_on_exhaustion(self, agent, mock_anthropic_client):
+        """Wait action returns error after exhausting retries."""
         response = create_mock_response(
             create_tool_use_block("wait", duration=1.0)
         )
@@ -248,7 +249,8 @@ def test_wait_action_returns_done(self, agent, mock_anthropic_client):
 
         action = agent.act(make_observation(), make_task())
 
-        assert action.type == "done"
+        assert action.type == "error"
+        assert action.raw_action["reason"] == "max_internal_retries_exceeded"
 
 
 class TestConversationManagement:
@@ -431,30 +433,28 @@ def test_viewport_updates_display_dimensions(self, agent, mock_anthropic_client)
         assert abs(action.x - 0.5) < 0.01  # 960/1920 = 0.5
         assert abs(action.y - 0.5) < 0.01  # 540/1080 = 0.5
 
-    def test_no_screenshot_still_works(self, agent, mock_anthropic_client):
-        """Agent works even without a screenshot."""
-        response = create_mock_response(
-            create_text_block("I cannot see the screen.")
-        )
-        mock_anthropic_client.beta.messages.create.return_value = response
-
+    def test_no_screenshot_returns_error(self, agent, mock_anthropic_client):
+        """Agent returns error when no screenshot is available."""
         obs = BenchmarkObservation(screenshot=None)
         action = agent.act(obs, make_task())
 
-        assert action.type == "done"
+        assert action.type == "error"
+        assert action.raw_action["reason"] == "no_screenshot"
+        assert action.raw_action["error_type"] == "infrastructure"
 
 
 class TestEdgeCases:
     """Test edge cases and error handling."""
 
-    def test_api_error_returns_done(self, agent, mock_anthropic_client):
-        """API error results in done action."""
+    def test_api_error_returns_error(self, agent, mock_anthropic_client):
+        """API error results in error action."""
         mock_anthropic_client.beta.messages.create.side_effect = Exception("API error")
 
         action = agent.act(make_observation(), make_task())
 
-        assert action.type == "done"
-        assert "error" in (action.raw_action or {})
+        assert action.type == "error"
+        assert action.raw_action["reason"] == "api_call_failed"
+        assert action.raw_action["error_type"] == "infrastructure"
 
     def test_unknown_action_returns_done(self, agent, mock_anthropic_client):
         """Unknown action type returns done."""
diff --git a/tests/test_mock_adapter.py b/tests/test_mock_adapter.py
index 12ee40a..1004d23 100644
--- a/tests/test_mock_adapter.py
+++ b/tests/test_mock_adapter.py
@@ -288,18 +288,18 @@ def test_evaluate_success_on_type_and_ok(self, mock_adapter):
         assert result.success is True
         assert result.score == 1.0
 
-    def test_evaluate_success_on_done_with_actions(self, mock_adapter):
-        """Test that calling DONE after 2+ actions results in success."""
+    def test_evaluate_no_false_positive_on_done_with_actions(self, mock_adapter):
+        """Test that calling DONE after non-target actions does NOT count as success."""
         task = mock_adapter.list_tasks()[0]
         mock_adapter.reset(task)
 
-        # Take two meaningful actions
+        # Click random area (not Submit/OK) and call done
         mock_adapter.step(BenchmarkAction(type="click", x=0.5, y=0.5))
         mock_adapter.step(BenchmarkAction(type="done"))
 
         result = mock_adapter.evaluate(task)
-        assert result.success is True
-        assert result.score == 1.0
+        assert result.success is False
+        assert result.score < 1.0
 
     def test_evaluate_partial_score_on_some_actions(self, mock_adapter):
         """Test that partial actions get partial credit."""
diff --git a/tests/test_qwen3vl_agent.py b/tests/test_qwen3vl_agent.py
index 7018986..0223d09 100644
--- a/tests/test_qwen3vl_agent.py
+++ b/tests/test_qwen3vl_agent.py
@@ -158,7 +158,7 @@ def test_drag_stores_qwen_coords(self):
 
     def test_wait(self):
         action = parse_qwen_action("wait()")
-        assert action.type == "done"
+        assert action.type == "wait"
         assert action.raw_action.get("is_wait") is True
 
     def test_finished(self):