OpenAdaptAI · abrichr · Feb 25, 2026 · Feb 25, 2026 · Feb 25, 2026 · Feb 25, 2026
diff --git a/.beads/issues.jsonl b/.beads/issues.jsonl
@@ -13,5 +13,5 @@
 {"id":"openadapt-evals-hvm","title":"VL model fix PR #18 ready to merge","notes":"2026-02-08: openadapt-ml PR #18 was already merged on 2026-01-29. VL model fix is done.","status":"closed","priority":0,"issue_type":"task","owner":"richard.abrich@gmail.com","created_at":"2026-01-29T16:17:03.491938-05:00","created_by":"Richard Abrich","updated_at":"2026-02-08T12:55:19.233249-05:00","closed_at":"2026-02-08T12:55:19.233249-05:00","close_reason":"PR #18 already merged 2026-01-29"}
 {"id":"openadapt-evals-mx8","title":"Analyze evaluation results and publish findings","description":"After demo-conditioned evaluation completes, analyze results: success rates, failure modes, demo impact. Create data-driven roadmap for improvements.","status":"open","priority":1,"issue_type":"task","owner":"richard.abrich@gmail.com","created_at":"2026-02-14T12:23:06.328838-05:00","created_by":"Richard Abrich","updated_at":"2026-02-14T12:23:06.328838-05:00"}
 {"id":"openadapt-evals-sz4","title":"RCA: Windows product key prompt recurring issue","status":"closed","priority":0,"issue_type":"task","owner":"richard.abrich@gmail.com","created_at":"2026-01-20T18:59:36.266286-05:00","created_by":"Richard Abrich","updated_at":"2026-01-20T20:32:06.493102-05:00","closed_at":"2026-01-20T20:32:06.493102-05:00","close_reason":"RCA complete - root cause is VERSION mismatch (CLI=11, Dockerfile=11e). Fix documented in RECURRING_ISSUES.md and WINDOWS_PRODUCT_KEY_RCA.md"}
-{"id":"openadapt-evals-vcb","title":"Run demo-conditioned WAA evaluation","description":"Once demos are recorded, run WAA evaluation with demo-conditioned agents (RetrievalAugmentedAgent with real demos). Target: measure improvement over zero-shot baseline. Requires real demos from recording task.","notes":"Feb 24: Fixed Docker data-root (ephemeral /mnt -\u003e persistent /home/azureuser/docker) in PRs #37+#38 (v0.4.2). Verified pool-pause/resume cycle works. Created fresh pool (waa-pool-00, 172.173.66.131, D8ds_v4, centralus). Running zero-shot eval on 12 harder tasks with api-claude (claude-sonnet-4-5). 10/12 tasks completed, all scoring 0.00 as expected. Results in benchmark_results/zs_harder_12_zs_*. Next: complete ZS eval, then record demos + annotate + run DC eval.","status":"open","priority":0,"issue_type":"task","owner":"richard.abrich@gmail.com","created_at":"2026-02-14T12:23:04.624305-05:00","created_by":"Richard Abrich","updated_at":"2026-02-24T13:51:58.517565-05:00"}
+{"id":"openadapt-evals-vcb","title":"Run demo-conditioned WAA evaluation","description":"Once demos are recorded, run WAA evaluation with demo-conditioned agents (RetrievalAugmentedAgent with real demos). Target: measure improvement over zero-shot baseline. Requires real demos from recording task.","notes":"Feb 24: Fixed Docker data-root (ephemeral /mnt -\u003e persistent /home/azureuser/docker) in PRs #37+#38 (v0.4.2). Verified pool-pause/resume cycle works. Created fresh pool (waa-pool-00, 172.173.66.131, D8ds_v4, centralus). Running zero-shot eval on 12 harder tasks with api-claude (claude-sonnet-4-5). 10/12 tasks completed, all scoring 0.00 as expected. Results in benchmark_results/zs_harder_12_zs_*. Next: complete ZS eval, then record demos + annotate + run DC eval.","status":"open","priority":0,"issue_type":"task","owner":"richard.abrich@gmail.com","created_at":"2026-02-14T12:23:04.624305-05:00","created_by":"Richard Abrich","updated_at":"2026-02-24T16:31:54.323876-05:00"}
 {"id":"openadapt-evals-wis","title":"Add pre-flight check to detect Windows install issues","status":"closed","priority":1,"issue_type":"task","owner":"richard.abrich@gmail.com","created_at":"2026-01-20T18:59:36.865052-05:00","created_by":"Richard Abrich","updated_at":"2026-01-20T20:32:06.757261-05:00","closed_at":"2026-01-20T20:32:06.757261-05:00","close_reason":"Duplicate of openadapt-evals-0dt"}
diff --git a/openadapt_evals/adapters/base.py b/openadapt_evals/adapters/base.py
@@ -116,7 +116,7 @@ class BenchmarkAction:
         raw_action: Original benchmark action (lossless).
     """
 
-    type: str  # "click", "type", "scroll", "key", "drag", "answer", "done"
+    type: str  # "click", "type", "scroll", "key", "drag", "wait", "answer", "done", "error"
 
     # Pointer actions - coordinates
     x: float | None = None  # Normalized [0,1] or pixel
@@ -176,6 +176,7 @@ class BenchmarkResult:
     # Diagnostics
     error: str | None = None
     reason: str | None = None  # Why success/fail
+    error_type: str | None = None  # "infrastructure", "agent", "evaluation", or None
 
     # Timing
     total_time_seconds: float = 0.0

diff --git a/openadapt_evals/adapters/waa/live.py b/openadapt_evals/adapters/waa/live.py
@@ -431,9 +431,9 @@ def step(
         # Wait for UI to settle
         time.sleep(self.config.action_delay)
 
-        # Check if done
+        # Check if done (error actions are also terminal)
         done = (
-            action.type == "done" or
+            action.type in ("done", "error") or
             self._step_count >= self.config.max_steps
         )
 
@@ -530,11 +530,14 @@ def evaluate(self, task: BenchmarkTask) -> BenchmarkResult:
                 score=0.0,
                 num_steps=self._step_count,
                 reason="Evaluation timed out",
+                error_type="infrastructure",
             )
 
         except requests.RequestException as e:
             logger.error(f"Evaluation request error: {e}")
-            return self._evaluate_fallback(task)
+            result = self._evaluate_fallback(task)
+            result.error_type = "infrastructure"
+            return result
 
     def _evaluate_fallback(self, task: BenchmarkTask) -> BenchmarkResult:
         """Fallback when proper evaluation unavailable - returns failure.
@@ -602,7 +605,7 @@ def _get_observation(self) -> BenchmarkObservation:
         try:
             resp = requests.get(
                 f"{self.config.server_url}/screenshot",
-                timeout=30.0
+                timeout=self.config.timeout
             )
             if resp.status_code == 200:
                 screenshot = resp.content
@@ -626,7 +629,7 @@ def _get_observation(self) -> BenchmarkObservation:
             resp = requests.get(
                 f"{self.config.server_url}/accessibility",
                 params={"backend": self.config.a11y_backend},
-                timeout=30.0
+                timeout=self.config.timeout
             )
             if resp.status_code == 200:
                 result = resp.json()
@@ -856,7 +859,7 @@ def _translate_action(self, action: BenchmarkAction) -> str | None:
             Python command string to execute via /execute_windows endpoint,
             or None for actions that don't need execution.
         """
-        if action.type == "done":
+        if action.type in ("done", "error"):
             return None
 
         if action.type == "wait":

diff --git a/openadapt_evals/adapters/waa/mock.py b/openadapt_evals/adapters/waa/mock.py
@@ -640,7 +640,7 @@ def step(
         if action.type == "type" and action.text:
             self._text_entered = action.text
 
-        done = action.type == "done" or self._step_count >= 15
+        done = action.type in ("done", "error") or self._step_count >= 15
         return self._mock_observation(), done, {"step": self._step_count}
 
     def evaluate(self, task: BenchmarkTask) -> BenchmarkResult:
@@ -700,13 +700,11 @@ def evaluate(self, task: BenchmarkTask) -> BenchmarkResult:
         # Success criteria:
         # 1. Clicked Submit (ID 4) - primary success path
         # 2. Typed something AND clicked OK (ID 1) - form submission path
-        # 3. Called DONE after at least 2 actions - reasonable completion
         clicked_submit = "4" in clicked_ids
         clicked_ok = "1" in clicked_ids
         form_submitted = typed_text and clicked_ok
-        reasonable_completion = called_done and len(self._actions) >= 2
 
-        success = clicked_submit or form_submitted or reasonable_completion
+        success = clicked_submit or form_submitted
 
         # Calculate partial credit score
         score = 0.0

diff --git a/openadapt_evals/agents/claude_computer_use_agent.py b/openadapt_evals/agents/claude_computer_use_agent.py
@@ -160,6 +160,13 @@ def act(
         self._step_count += 1
         screenshot_b64 = self._encode_screenshot(observation)
 
+        if screenshot_b64 is None:
+            logger.warning("No screenshot available from environment")
+            return BenchmarkAction(
+                type="error",
+                raw_action={"reason": "no_screenshot", "error_type": "infrastructure"},
+            )
+
         if self._step_count == 1:
             # First step: send task instruction + initial screenshot
             self._messages = self._build_initial_messages(
@@ -179,7 +186,8 @@ def act(
             response = self._call_api()
             if response is None:
                 return BenchmarkAction(
-                    type="done", raw_action={"error": "API call failed"}
+                    type="error",
+                    raw_action={"reason": "api_call_failed", "error_type": "infrastructure"},
                 )
 
             # Add assistant response to conversation
@@ -205,14 +213,14 @@ def act(
             # Real action — return to runner
             return self._process_response(response, observation)
 
-        # Exhausted retries on screenshot/wait — return done
+        # Exhausted retries on screenshot/wait — return error (not done)
         logger.warning(
             f"Exhausted {self.MAX_INTERNAL_RETRIES} internal retries on "
             "screenshot/wait actions"
         )
         return BenchmarkAction(
-            type="done",
-            raw_action={"reason": "max_internal_retries_exceeded"},
+            type="error",
+            raw_action={"reason": "max_internal_retries_exceeded", "error_type": "infrastructure"},
         )
 
     def _build_initial_messages(