Skip to content
2 changes: 1 addition & 1 deletion .beads/issues.jsonl
Original file line number Diff line number Diff line change
Expand Up @@ -13,5 +13,5 @@
{"id":"openadapt-evals-hvm","title":"VL model fix PR #18 ready to merge","notes":"2026-02-08: openadapt-ml PR #18 was already merged on 2026-01-29. VL model fix is done.","status":"closed","priority":0,"issue_type":"task","owner":"richard.abrich@gmail.com","created_at":"2026-01-29T16:17:03.491938-05:00","created_by":"Richard Abrich","updated_at":"2026-02-08T12:55:19.233249-05:00","closed_at":"2026-02-08T12:55:19.233249-05:00","close_reason":"PR #18 already merged 2026-01-29"}
{"id":"openadapt-evals-mx8","title":"Analyze evaluation results and publish findings","description":"After demo-conditioned evaluation completes, analyze results: success rates, failure modes, demo impact. Create data-driven roadmap for improvements.","status":"open","priority":1,"issue_type":"task","owner":"richard.abrich@gmail.com","created_at":"2026-02-14T12:23:06.328838-05:00","created_by":"Richard Abrich","updated_at":"2026-02-14T12:23:06.328838-05:00"}
{"id":"openadapt-evals-sz4","title":"RCA: Windows product key prompt recurring issue","status":"closed","priority":0,"issue_type":"task","owner":"richard.abrich@gmail.com","created_at":"2026-01-20T18:59:36.266286-05:00","created_by":"Richard Abrich","updated_at":"2026-01-20T20:32:06.493102-05:00","closed_at":"2026-01-20T20:32:06.493102-05:00","close_reason":"RCA complete - root cause is VERSION mismatch (CLI=11, Dockerfile=11e). Fix documented in RECURRING_ISSUES.md and WINDOWS_PRODUCT_KEY_RCA.md"}
{"id":"openadapt-evals-vcb","title":"Run demo-conditioned WAA evaluation","description":"Once demos are recorded, run WAA evaluation with demo-conditioned agents (RetrievalAugmentedAgent with real demos). Target: measure improvement over zero-shot baseline. Requires real demos from recording task.","notes":"Feb 24: Fixed Docker data-root (ephemeral /mnt -\u003e persistent /home/azureuser/docker) in PRs #37+#38 (v0.4.2). Verified pool-pause/resume cycle works. Created fresh pool (waa-pool-00, 172.173.66.131, D8ds_v4, centralus). Running zero-shot eval on 12 harder tasks with api-claude (claude-sonnet-4-5). 10/12 tasks completed, all scoring 0.00 as expected. Results in benchmark_results/zs_harder_12_zs_*. Next: complete ZS eval, then record demos + annotate + run DC eval.","status":"open","priority":0,"issue_type":"task","owner":"richard.abrich@gmail.com","created_at":"2026-02-14T12:23:04.624305-05:00","created_by":"Richard Abrich","updated_at":"2026-02-24T13:51:58.517565-05:00"}
{"id":"openadapt-evals-vcb","title":"Run demo-conditioned WAA evaluation","description":"Once demos are recorded, run WAA evaluation with demo-conditioned agents (RetrievalAugmentedAgent with real demos). Target: measure improvement over zero-shot baseline. Requires real demos from recording task.","notes":"Feb 24: Fixed Docker data-root (ephemeral /mnt -\u003e persistent /home/azureuser/docker) in PRs #37+#38 (v0.4.2). Verified pool-pause/resume cycle works. Created fresh pool (waa-pool-00, 172.173.66.131, D8ds_v4, centralus). Running zero-shot eval on 12 harder tasks with api-claude (claude-sonnet-4-5). 10/12 tasks completed, all scoring 0.00 as expected. Results in benchmark_results/zs_harder_12_zs_*. Next: complete ZS eval, then record demos + annotate + run DC eval.","status":"open","priority":0,"issue_type":"task","owner":"richard.abrich@gmail.com","created_at":"2026-02-14T12:23:04.624305-05:00","created_by":"Richard Abrich","updated_at":"2026-02-24T16:31:54.323876-05:00"}
{"id":"openadapt-evals-wis","title":"Add pre-flight check to detect Windows install issues","status":"closed","priority":1,"issue_type":"task","owner":"richard.abrich@gmail.com","created_at":"2026-01-20T18:59:36.865052-05:00","created_by":"Richard Abrich","updated_at":"2026-01-20T20:32:06.757261-05:00","closed_at":"2026-01-20T20:32:06.757261-05:00","close_reason":"Duplicate of openadapt-evals-0dt"}
3 changes: 2 additions & 1 deletion openadapt_evals/adapters/base.py
Original file line number Diff line number Diff line change
Expand Up @@ -116,7 +116,7 @@ class BenchmarkAction:
raw_action: Original benchmark action (lossless).
"""

type: str # "click", "type", "scroll", "key", "drag", "answer", "done"
type: str # "click", "type", "scroll", "key", "drag", "answer", "done", "error"

# Pointer actions - coordinates
x: float | None = None # Normalized [0,1] or pixel
Expand Down Expand Up @@ -176,6 +176,7 @@ class BenchmarkResult:
# Diagnostics
error: str | None = None
reason: str | None = None # Why success/fail
error_type: str | None = None # "infrastructure", "agent", "evaluation", or None

# Timing
total_time_seconds: float = 0.0
Expand Down
15 changes: 9 additions & 6 deletions openadapt_evals/adapters/waa/live.py
Original file line number Diff line number Diff line change
Expand Up @@ -431,9 +431,9 @@ def step(
# Wait for UI to settle
time.sleep(self.config.action_delay)

# Check if done
# Check if done (error actions are also terminal)
done = (
action.type == "done" or
action.type in ("done", "error") or
self._step_count >= self.config.max_steps
)

Expand Down Expand Up @@ -530,11 +530,14 @@ def evaluate(self, task: BenchmarkTask) -> BenchmarkResult:
score=0.0,
num_steps=self._step_count,
reason="Evaluation timed out",
error_type="infrastructure",
)

except requests.RequestException as e:
logger.error(f"Evaluation request error: {e}")
return self._evaluate_fallback(task)
result = self._evaluate_fallback(task)
result.error_type = "infrastructure"
return result

def _evaluate_fallback(self, task: BenchmarkTask) -> BenchmarkResult:
"""Fallback when proper evaluation unavailable - returns failure.
Expand Down Expand Up @@ -602,7 +605,7 @@ def _get_observation(self) -> BenchmarkObservation:
try:
resp = requests.get(
f"{self.config.server_url}/screenshot",
timeout=30.0
timeout=self.config.timeout
)
if resp.status_code == 200:
screenshot = resp.content
Expand All @@ -626,7 +629,7 @@ def _get_observation(self) -> BenchmarkObservation:
resp = requests.get(
f"{self.config.server_url}/accessibility",
params={"backend": self.config.a11y_backend},
timeout=30.0
timeout=self.config.timeout
)
if resp.status_code == 200:
result = resp.json()
Expand Down Expand Up @@ -856,7 +859,7 @@ def _translate_action(self, action: BenchmarkAction) -> str | None:
Python command string to execute via /execute_windows endpoint,
or None for actions that don't need execution.
"""
if action.type == "done":
if action.type in ("done", "error"):
return None

if action.type == "wait":
Expand Down
6 changes: 2 additions & 4 deletions openadapt_evals/adapters/waa/mock.py
Original file line number Diff line number Diff line change
Expand Up @@ -640,7 +640,7 @@ def step(
if action.type == "type" and action.text:
self._text_entered = action.text

done = action.type == "done" or self._step_count >= 15
done = action.type in ("done", "error") or self._step_count >= 15
return self._mock_observation(), done, {"step": self._step_count}

def evaluate(self, task: BenchmarkTask) -> BenchmarkResult:
Expand Down Expand Up @@ -700,13 +700,11 @@ def evaluate(self, task: BenchmarkTask) -> BenchmarkResult:
# Success criteria:
# 1. Clicked Submit (ID 4) - primary success path
# 2. Typed something AND clicked OK (ID 1) - form submission path
# 3. Called DONE after at least 2 actions - reasonable completion
clicked_submit = "4" in clicked_ids
clicked_ok = "1" in clicked_ids
form_submitted = typed_text and clicked_ok
reasonable_completion = called_done and len(self._actions) >= 2

success = clicked_submit or form_submitted or reasonable_completion
success = clicked_submit or form_submitted

# Calculate partial credit score
score = 0.0
Expand Down
16 changes: 12 additions & 4 deletions openadapt_evals/agents/claude_computer_use_agent.py
Original file line number Diff line number Diff line change
Expand Up @@ -160,6 +160,13 @@ def act(
self._step_count += 1
screenshot_b64 = self._encode_screenshot(observation)

if screenshot_b64 is None:
logger.warning("No screenshot available from environment")
return BenchmarkAction(
type="error",
raw_action={"reason": "no_screenshot", "error_type": "infrastructure"},
)

if self._step_count == 1:
# First step: send task instruction + initial screenshot
self._messages = self._build_initial_messages(
Expand All @@ -179,7 +186,8 @@ def act(
response = self._call_api()
if response is None:
return BenchmarkAction(
type="done", raw_action={"error": "API call failed"}
type="error",
raw_action={"reason": "api_call_failed", "error_type": "infrastructure"},
)

# Add assistant response to conversation
Expand All @@ -205,14 +213,14 @@ def act(
# Real action — return to runner
return self._process_response(response, observation)

# Exhausted retries on screenshot/wait — return done
# Exhausted retries on screenshot/wait — return error (not done)
logger.warning(
f"Exhausted {self.MAX_INTERNAL_RETRIES} internal retries on "
"screenshot/wait actions"
)
return BenchmarkAction(
type="done",
raw_action={"reason": "max_internal_retries_exceeded"},
type="error",
raw_action={"reason": "max_internal_retries_exceeded", "error_type": "infrastructure"},
)

def _build_initial_messages(
Expand Down
Loading