diff --git a/.beads/issues.jsonl b/.beads/issues.jsonl index 7f02b1a..5866e43 100644 --- a/.beads/issues.jsonl +++ b/.beads/issues.jsonl @@ -13,5 +13,5 @@ {"id":"openadapt-evals-hvm","title":"VL model fix PR #18 ready to merge","notes":"2026-02-08: openadapt-ml PR #18 was already merged on 2026-01-29. VL model fix is done.","status":"closed","priority":0,"issue_type":"task","owner":"richard.abrich@gmail.com","created_at":"2026-01-29T16:17:03.491938-05:00","created_by":"Richard Abrich","updated_at":"2026-02-08T12:55:19.233249-05:00","closed_at":"2026-02-08T12:55:19.233249-05:00","close_reason":"PR #18 already merged 2026-01-29"} {"id":"openadapt-evals-mx8","title":"Analyze evaluation results and publish findings","description":"After demo-conditioned evaluation completes, analyze results: success rates, failure modes, demo impact. Create data-driven roadmap for improvements.","status":"open","priority":1,"issue_type":"task","owner":"richard.abrich@gmail.com","created_at":"2026-02-14T12:23:06.328838-05:00","created_by":"Richard Abrich","updated_at":"2026-02-14T12:23:06.328838-05:00"} {"id":"openadapt-evals-sz4","title":"RCA: Windows product key prompt recurring issue","status":"closed","priority":0,"issue_type":"task","owner":"richard.abrich@gmail.com","created_at":"2026-01-20T18:59:36.266286-05:00","created_by":"Richard Abrich","updated_at":"2026-01-20T20:32:06.493102-05:00","closed_at":"2026-01-20T20:32:06.493102-05:00","close_reason":"RCA complete - root cause is VERSION mismatch (CLI=11, Dockerfile=11e). Fix documented in RECURRING_ISSUES.md and WINDOWS_PRODUCT_KEY_RCA.md"} -{"id":"openadapt-evals-vcb","title":"Run demo-conditioned WAA evaluation","description":"Once demos are recorded, run WAA evaluation with demo-conditioned agents (RetrievalAugmentedAgent with real demos). Target: measure improvement over zero-shot baseline. Requires real demos from recording task.","notes":"Feb 24: Fixed Docker data-root (ephemeral /mnt -\u003e persistent /home/azureuser/docker) in PRs #37+#38 (v0.4.2). Verified pool-pause/resume cycle works. Created fresh pool (waa-pool-00, 172.173.66.131, D8ds_v4, centralus). Running zero-shot eval on 12 harder tasks with api-claude (claude-sonnet-4-5). 10/12 tasks completed, all scoring 0.00 as expected. Results in benchmark_results/zs_harder_12_zs_*. Next: complete ZS eval, then record demos + annotate + run DC eval.","status":"open","priority":0,"issue_type":"task","owner":"richard.abrich@gmail.com","created_at":"2026-02-14T12:23:04.624305-05:00","created_by":"Richard Abrich","updated_at":"2026-02-24T13:51:58.517565-05:00"} +{"id":"openadapt-evals-vcb","title":"Run demo-conditioned WAA evaluation","description":"Once demos are recorded, run WAA evaluation with demo-conditioned agents (RetrievalAugmentedAgent with real demos). Target: measure improvement over zero-shot baseline. Requires real demos from recording task.","notes":"Feb 24: Fixed Docker data-root (ephemeral /mnt -\u003e persistent /home/azureuser/docker) in PRs #37+#38 (v0.4.2). Verified pool-pause/resume cycle works. Created fresh pool (waa-pool-00, 172.173.66.131, D8ds_v4, centralus). Running zero-shot eval on 12 harder tasks with api-claude (claude-sonnet-4-5). 10/12 tasks completed, all scoring 0.00 as expected. Results in benchmark_results/zs_harder_12_zs_*. Next: complete ZS eval, then record demos + annotate + run DC eval.","status":"open","priority":0,"issue_type":"task","owner":"richard.abrich@gmail.com","created_at":"2026-02-14T12:23:04.624305-05:00","created_by":"Richard Abrich","updated_at":"2026-02-24T16:31:54.323876-05:00"} {"id":"openadapt-evals-wis","title":"Add pre-flight check to detect Windows install issues","status":"closed","priority":1,"issue_type":"task","owner":"richard.abrich@gmail.com","created_at":"2026-01-20T18:59:36.865052-05:00","created_by":"Richard Abrich","updated_at":"2026-01-20T20:32:06.757261-05:00","closed_at":"2026-01-20T20:32:06.757261-05:00","close_reason":"Duplicate of openadapt-evals-0dt"} diff --git a/openadapt_evals/adapters/base.py b/openadapt_evals/adapters/base.py index ce57554..f67c976 100644 --- a/openadapt_evals/adapters/base.py +++ b/openadapt_evals/adapters/base.py @@ -116,7 +116,7 @@ class BenchmarkAction: raw_action: Original benchmark action (lossless). """ - type: str # "click", "type", "scroll", "key", "drag", "answer", "done" + type: str # "click", "type", "scroll", "key", "drag", "answer", "done", "error" # Pointer actions - coordinates x: float | None = None # Normalized [0,1] or pixel @@ -176,6 +176,7 @@ class BenchmarkResult: # Diagnostics error: str | None = None reason: str | None = None # Why success/fail + error_type: str | None = None # "infrastructure", "agent", "evaluation", or None # Timing total_time_seconds: float = 0.0 diff --git a/openadapt_evals/adapters/waa/live.py b/openadapt_evals/adapters/waa/live.py index c58c98b..9f4a27b 100644 --- a/openadapt_evals/adapters/waa/live.py +++ b/openadapt_evals/adapters/waa/live.py @@ -431,9 +431,9 @@ def step( # Wait for UI to settle time.sleep(self.config.action_delay) - # Check if done + # Check if done (error actions are also terminal) done = ( - action.type == "done" or + action.type in ("done", "error") or self._step_count >= self.config.max_steps ) @@ -530,11 +530,14 @@ def evaluate(self, task: BenchmarkTask) -> BenchmarkResult: score=0.0, num_steps=self._step_count, reason="Evaluation timed out", + error_type="infrastructure", ) except requests.RequestException as e: logger.error(f"Evaluation request error: {e}") - return self._evaluate_fallback(task) + result = self._evaluate_fallback(task) + result.error_type = "infrastructure" + return result def _evaluate_fallback(self, task: BenchmarkTask) -> BenchmarkResult: """Fallback when proper evaluation unavailable - returns failure. @@ -602,7 +605,7 @@ def _get_observation(self) -> BenchmarkObservation: try: resp = requests.get( f"{self.config.server_url}/screenshot", - timeout=30.0 + timeout=self.config.timeout ) if resp.status_code == 200: screenshot = resp.content @@ -626,7 +629,7 @@ def _get_observation(self) -> BenchmarkObservation: resp = requests.get( f"{self.config.server_url}/accessibility", params={"backend": self.config.a11y_backend}, - timeout=30.0 + timeout=self.config.timeout ) if resp.status_code == 200: result = resp.json() @@ -856,7 +859,7 @@ def _translate_action(self, action: BenchmarkAction) -> str | None: Python command string to execute via /execute_windows endpoint, or None for actions that don't need execution. """ - if action.type == "done": + if action.type in ("done", "error"): return None if action.type == "wait": diff --git a/openadapt_evals/adapters/waa/mock.py b/openadapt_evals/adapters/waa/mock.py index 550c2c0..60f80fa 100644 --- a/openadapt_evals/adapters/waa/mock.py +++ b/openadapt_evals/adapters/waa/mock.py @@ -640,7 +640,7 @@ def step( if action.type == "type" and action.text: self._text_entered = action.text - done = action.type == "done" or self._step_count >= 15 + done = action.type in ("done", "error") or self._step_count >= 15 return self._mock_observation(), done, {"step": self._step_count} def evaluate(self, task: BenchmarkTask) -> BenchmarkResult: @@ -700,13 +700,11 @@ def evaluate(self, task: BenchmarkTask) -> BenchmarkResult: # Success criteria: # 1. Clicked Submit (ID 4) - primary success path # 2. Typed something AND clicked OK (ID 1) - form submission path - # 3. Called DONE after at least 2 actions - reasonable completion clicked_submit = "4" in clicked_ids clicked_ok = "1" in clicked_ids form_submitted = typed_text and clicked_ok - reasonable_completion = called_done and len(self._actions) >= 2 - success = clicked_submit or form_submitted or reasonable_completion + success = clicked_submit or form_submitted # Calculate partial credit score score = 0.0 diff --git a/openadapt_evals/agents/claude_computer_use_agent.py b/openadapt_evals/agents/claude_computer_use_agent.py index 194675b..bcb807e 100644 --- a/openadapt_evals/agents/claude_computer_use_agent.py +++ b/openadapt_evals/agents/claude_computer_use_agent.py @@ -160,6 +160,13 @@ def act( self._step_count += 1 screenshot_b64 = self._encode_screenshot(observation) + if screenshot_b64 is None: + logger.warning("No screenshot available from environment") + return BenchmarkAction( + type="error", + raw_action={"reason": "no_screenshot", "error_type": "infrastructure"}, + ) + if self._step_count == 1: # First step: send task instruction + initial screenshot self._messages = self._build_initial_messages( @@ -179,7 +186,8 @@ def act( response = self._call_api() if response is None: return BenchmarkAction( - type="done", raw_action={"error": "API call failed"} + type="error", + raw_action={"reason": "api_call_failed", "error_type": "infrastructure"}, ) # Add assistant response to conversation @@ -205,14 +213,14 @@ def act( # Real action — return to runner return self._process_response(response, observation) - # Exhausted retries on screenshot/wait — return done + # Exhausted retries on screenshot/wait — return error (not done) logger.warning( f"Exhausted {self.MAX_INTERNAL_RETRIES} internal retries on " "screenshot/wait actions" ) return BenchmarkAction( - type="done", - raw_action={"reason": "max_internal_retries_exceeded"}, + type="error", + raw_action={"reason": "max_internal_retries_exceeded", "error_type": "infrastructure"}, ) def _build_initial_messages( diff --git a/openadapt_evals/agents/policy_agent.py b/openadapt_evals/agents/policy_agent.py index 7284397..8bc2fda 100644 --- a/openadapt_evals/agents/policy_agent.py +++ b/openadapt_evals/agents/policy_agent.py @@ -34,42 +34,52 @@ class PolicyAgent(BenchmarkAgent): benchmark evaluation. The model is expected to be trained using the openadapt-ml training pipeline. + Prompt format is aligned with convert_demos.py training data. + Args: - checkpoint_path: Path to model checkpoint. - model_name: Name of the model architecture (default: qwen3-vl). + checkpoint_path: Path to LoRA adapter weights. + model_name: HuggingFace model name (must contain 'Qwen3-VL' or 'Qwen2.5-VL'). device: Device to run on ('cuda' or 'cpu'). - use_accessibility_tree: Whether to include a11y tree in prompts. + use_thinking: Whether to include instruction in prompts. """ def __init__( self, checkpoint_path: str | None = None, - model_name: str = "qwen3-vl", + model_name: str = "Qwen/Qwen3-VL-8B-Instruct", device: str = "cuda", - use_accessibility_tree: bool = True, + use_thinking: bool = True, ): self.checkpoint_path = checkpoint_path self.model_name = model_name self.device = device - self.use_accessibility_tree = use_accessibility_tree + self.use_thinking = use_thinking # Lazy load model to avoid import overhead self._model = None self._processor = None + self._previous_actions: list[str] = [] + self._temp_files: list[str] = [] def _load_model(self) -> None: - """Load the model from checkpoint.""" + """Load the model adapter from checkpoint.""" if self._model is not None: return try: - # Import from openadapt-ml - from openadapt_ml.vlm import load_model_and_processor - - self._model, self._processor = load_model_and_processor( + import torch + from openadapt_ml.models.qwen_vl import QwenVLAdapter + + device = torch.device(self.device) if isinstance(self.device, str) else self.device + lora_config = ( + {"weights_path": self.checkpoint_path} + if self.checkpoint_path + else None + ) + self._model = QwenVLAdapter.from_pretrained( model_name=self.model_name, - checkpoint_path=self.checkpoint_path, - device=self.device, + lora_config=lora_config, + device=device, ) logger.info(f"PolicyAgent loaded model from {self.checkpoint_path}") except ImportError as e: @@ -99,13 +109,17 @@ def act( # Ensure model is loaded self._load_model() - # Build prompt + # Build prompt (aligned with training format) prompt = self._build_prompt(observation, task, history) # Get model prediction try: response = self._run_inference(observation, prompt) action = self._parse_response(response, observation) + + # Track action in training format for "Previous actions" section + self._previous_actions.append(self._format_action_qwen(action)) + return action except Exception as e: logger.error(f"Inference failed: {e}") @@ -117,7 +131,19 @@ def _build_prompt( task: BenchmarkTask, history: list[tuple[BenchmarkObservation, BenchmarkAction]] | None = None, ) -> str: - """Build prompt for the model. + """Build user-turn text aligned with convert_demos.convert_step. + + Format matches training data exactly:: + + + Instruction: {instruction} + + Previous actions: + Step 0: {action} + Step 1: {action} + + First reason about what you see in ... tags, + then output exactly one action. Args: observation: Current observation. @@ -127,59 +153,112 @@ def _build_prompt( Returns: Prompt string. """ - parts = [f"TASK: {task.instruction}"] - - # Add context - if observation.window_title: - parts.append(f"Current window: {observation.window_title}") - - # Add accessibility tree if enabled - if self.use_accessibility_tree and observation.accessibility_tree: - from openadapt_evals.agents.base import format_accessibility_tree - tree_str = format_accessibility_tree(observation.accessibility_tree) - if len(tree_str) > 4000: - tree_str = tree_str[:4000] + "\n... (truncated)" - parts.append(f"UI Elements:\n{tree_str}") - - # Add history - if history: - from openadapt_evals.agents.base import action_to_string - history_str = "\n".join( - f"Step {i+1}: {action_to_string(action)}" - for i, (_, action) in enumerate(history[-5:]) + parts = [""] + parts.append(f"Instruction: {task.instruction}") + + # Previous actions (matches training format) + if self._previous_actions: + parts.append("") + parts.append("Previous actions:") + for i, act in enumerate(self._previous_actions): + parts.append(f" Step {i}: {act}") + + # Tail instruction + parts.append("") + if self.use_thinking: + parts.append( + "First reason about what you see in ... " + "tags, then output exactly one action." ) - parts.append(f"Previous actions:\n{history_str}") + else: + parts.append("Output exactly one action.") + + return "\n".join(parts) - parts.append("\nWhat is the next action?") - return "\n\n".join(parts) + @staticmethod + def _format_action_qwen(action: BenchmarkAction) -> str: + """Format action matching convert_demos._format_action_qwen training format. + + Uses [0, 1000] coordinate range and lowercase function-call style + to match what the model was trained on. + """ + def _to_1000(v: float | None) -> int: + return round((v or 0.0) * 1000) + + if action.type == "click": + return f"click(x={_to_1000(action.x)}, y={_to_1000(action.y)})" + if action.type == "double_click": + return f"double_click(x={_to_1000(action.x)}, y={_to_1000(action.y)})" + if action.type == "right_click": + return f"right_click(x={_to_1000(action.x)}, y={_to_1000(action.y)})" + if action.type == "type": + return f'type(text="{action.text or ""}")' + if action.type == "key": + keys = (action.modifiers or []) + ([action.key] if action.key else []) + keys_fmt = ", ".join(f'"{k}"' for k in keys) + return f"press(keys=[{keys_fmt}])" + if action.type == "scroll": + return f'scroll(direction="{action.scroll_direction or "down"}", amount=3)' + if action.type == "drag": + return ( + f"drag(from_coord=[{_to_1000(action.x)}, {_to_1000(action.y)}], " + f"to_coord=[{_to_1000(action.end_x)}, {_to_1000(action.end_y)}])" + ) + if action.type == "done": + return "finished()" + return f"# unknown: {action.type}" + + def _build_sample(self, observation: BenchmarkObservation, prompt: str) -> dict: + """Build SFT-style sample matching training format from convert_demos.py. + + NOTE: No system message is included here because + ``QwenVLAdapter.generate()`` only extracts the user role message + and drops any system role. The model was trained under the same + conditions (no system prompt), so omitting it at inference keeps + behaviour consistent. + + Args: + observation: Observation with screenshot. + prompt: User-turn prompt text (from _build_prompt). + + Returns: + SFT sample dict with messages and optional images. + """ + sample = { + "messages": [ + {"role": "user", "content": prompt}, + ], + } + if observation.screenshot_path: + sample["images"] = [observation.screenshot_path] + return sample def _run_inference(self, observation: BenchmarkObservation, prompt: str) -> str: - """Run model inference. + """Run model inference using SFT-style message format. Args: observation: Observation with screenshot. - prompt: Prompt text. + prompt: User-turn prompt text. Returns: Model response text. """ - from openadapt_ml.vlm import run_inference - - # Get screenshot as PIL Image - if observation.screenshot: - from PIL import Image - from io import BytesIO - image = Image.open(BytesIO(observation.screenshot)) - else: + if not observation.screenshot and not observation.screenshot_path: raise ValueError("No screenshot in observation") - response = run_inference( - model=self._model, - processor=self._processor, - image=image, - prompt=prompt, - device=self.device, - ) + sample = self._build_sample(observation, prompt) + + # If screenshot_path is missing but bytes are available, write to temp file + if "images" not in sample and observation.screenshot: + import tempfile + tmp = tempfile.NamedTemporaryFile(suffix=".png", delete=False) + tmp.write(observation.screenshot) + tmp.close() + sample["images"] = [tmp.name] + self._temp_files.append(tmp.name) + + # Use the adapter's generate method (works with both local and remote) + response = self._model.generate(sample) return response def _parse_response( @@ -187,6 +266,12 @@ def _parse_response( ) -> BenchmarkAction: """Parse model response into BenchmarkAction. + Uses ``parse_qwen_action`` from ``qwen3vl_agent`` which handles the + lowercase keyword format the trained model outputs (e.g. + ``click(x=500, y=300)``). The base ``parse_action_response`` only + recognises UPPERCASE format (``CLICK(0.5, 0.3)``), so every Qwen + model output would silently fall through to ``type="done"``. + Args: response: Model response text. observation: Observation for coordinate normalization. @@ -194,10 +279,17 @@ def _parse_response( Returns: Parsed action. """ - from openadapt_evals.agents.base import parse_action_response - return parse_action_response(response, observation) + from openadapt_evals.agents.qwen3vl_agent import parse_qwen_action + return parse_qwen_action(response, observation.viewport) def reset(self) -> None: """Reset agent state between tasks.""" - # Nothing to reset for policy agent - pass + import os + + self._previous_actions = [] + for path in self._temp_files: + try: + os.unlink(path) + except OSError: + pass + self._temp_files = [] diff --git a/openadapt_evals/agents/qwen3vl_agent.py b/openadapt_evals/agents/qwen3vl_agent.py index a459ba2..fc95a02 100644 --- a/openadapt_evals/agents/qwen3vl_agent.py +++ b/openadapt_evals/agents/qwen3vl_agent.py @@ -204,7 +204,7 @@ def parse_qwen_action( # --- wait() --- if _RE_WAIT.search(action_str): raw["is_wait"] = True - return BenchmarkAction(type="done", raw_action=raw) + return BenchmarkAction(type="wait", raw_action=raw) # --- double_click(x=, y=) --- (check before click) m = _RE_DOUBLE_CLICK.search(action_str) @@ -734,8 +734,9 @@ def _run_remote_inference( with open(adapter_config) as f: cfg = json.load(f) base_model = cfg.get("base_model_name_or_path", DEFAULT_MODEL) - # Adapter is assumed to be uploaded to volume at /training/results/final - adapter_path = "/training/results/final" + # Adapter is uploaded to volume at /adapter by upload_adapter_to_volume() + # Volume mounts at /training, so full path is /training/adapter + adapter_path = "/training/adapter" else: base_model = self.model_path diff --git a/openadapt_evals/benchmarks/runner.py b/openadapt_evals/benchmarks/runner.py index ab8a8e6..d295929 100644 --- a/openadapt_evals/benchmarks/runner.py +++ b/openadapt_evals/benchmarks/runner.py @@ -292,6 +292,7 @@ def _run_single_task( done = False steps = 0 + action = None max_steps = task.time_limit_steps or config.max_steps while not done and steps < max_steps: @@ -337,8 +338,11 @@ def _run_single_task( config.on_step(obs, action, steps) # Check for terminal action - if action.type == "done": - logger.info(f"Step {steps}: Agent signaled task completion") + if action.type in ("done", "error"): + if action.type == "error": + logger.error(f"Step {steps}: Agent error: {action.raw_action}") + else: + logger.info(f"Step {steps}: Agent signaled task completion") done = True break @@ -365,6 +369,11 @@ def _run_single_task( logger.info("Evaluating task result") result = adapter.evaluate(task) + # Propagate error_type from agent error action + if action is not None and action.type == "error" and action.raw_action: + result.error_type = action.raw_action.get("error_type", "agent") + result.error = result.error or action.raw_action.get("reason") + # Update result with trajectory info result.steps = history if config.save_trajectories else [] result.num_steps = steps diff --git a/openadapt_evals/infrastructure/pool.py b/openadapt_evals/infrastructure/pool.py index d64f055..05c8a67 100644 --- a/openadapt_evals/infrastructure/pool.py +++ b/openadapt_evals/infrastructure/pool.py @@ -134,7 +134,15 @@ class PoolRunResult: WAA_START_SCRIPT = """ # Check if container already running if docker ps --format '{{.Names}}' | grep -q '^winarena$'; then - echo "ALREADY_RUNNING" + # Container is up, but socat proxy may be dead (e.g. after docker restart). + # Ensure the evaluate-server proxy is running. + if ! pgrep -f 'socat.*5051' >/dev/null 2>&1; then + which socat >/dev/null 2>&1 || sudo DEBIAN_FRONTEND=noninteractive apt-get install -y -qq socat + nohup socat TCP-LISTEN:5051,fork,reuseaddr EXEC:"docker exec -i winarena socat - TCP\\:127.0.0.1\\:5050" > /dev/null 2>&1 & + echo "ALREADY_RUNNING (socat proxy restarted)" + else + echo "ALREADY_RUNNING" + fi exit 0 fi diff --git a/openadapt_evals/server/evaluate_endpoint.py b/openadapt_evals/server/evaluate_endpoint.py index 6126838..747a676 100644 --- a/openadapt_evals/server/evaluate_endpoint.py +++ b/openadapt_evals/server/evaluate_endpoint.py @@ -365,6 +365,14 @@ def evaluate_task_state( "success": score >= 1.0, }) + # Guard empty metric_results (e.g. func_spec was []) + if not metric_results: + return { + "success": False, + "score": 0.0, + "reason": "No metrics could be computed (empty func spec)", + } + # Combine results based on conjunction if conjunction == "or": final_score = max(r["score"] for r in metric_results) diff --git a/scripts/run_dc_eval.py b/scripts/run_dc_eval.py new file mode 100644 index 0000000..0001123 --- /dev/null +++ b/scripts/run_dc_eval.py @@ -0,0 +1,317 @@ +#!/usr/bin/env python3 +"""Run demo-conditioned evaluation with auto-recovery. + +Runs each task individually with its matching demo file. Includes: +- SSH tunnel health check + reconnect before each task +- WAA server health check with container restart if dead +- Retry on transient failures + +Usage: + # 3-task validation with ClaudeComputerUseAgent + python scripts/run_dc_eval.py \ + --agent api-claude-cu \ + --tasks 0e763496,70745df8,fba2c100 + + # All 12 tasks + python scripts/run_dc_eval.py --agent api-claude-cu +""" + +from __future__ import annotations + +import argparse +import subprocess +import sys +import time +from pathlib import Path + +import requests + +HARDER_TASK_IDS = [ + "04d9aeaf-7bed-4024-bedb-e10e6f00eb7f-WOS", + "0a0faba3-5580-44df-965d-f562a99b291c-WOS", + "0bf05a7d-b28b-44d2-955a-50b41e24012a-WOS", + "0e763496-b6bb-4508-a427-fad0b6c3e195-WOS", + "4bcb1253-a636-4df4-8cb0-a35c04dfef31-WOS", + "70745df8-f2f5-42bd-8074-fbc10334fcc5-2-WOS", + "8b1ce5f2-59d2-4dcc-b0b0-666a714b9a14-WOS", + "e2b5e914-ffe1-44d2-8e92-58f8c5d92bb2-WOS", + "ec71221e-ac43-46f9-89b8-ee7d80f7e1c5-WOS", + "fba2c100-79e8-42df-ae74-b592418d54f4-WOS", + "INF-0d95d28a-9587-433b-a805-1fbe5467d598-WOS", + "INF-5ac2891a-eacd-4954-b339-98abba077adb-WOS", +] + + +def short_id(task_id: str) -> str: + return task_id[:8] + + +def _kill_tunnels(): + subprocess.run( + "ps aux | grep 'ssh.*5001' | grep -v grep | awk '{print $2}' | xargs kill 2>/dev/null", + shell=True, capture_output=True, + ) + + +def _start_tunnel(vm_user: str, vm_ip: str) -> bool: + cmd = [ + "ssh", "-f", "-N", + "-o", "StrictHostKeyChecking=no", + "-o", "ServerAliveInterval=15", + "-o", "ServerAliveCountMax=3", + "-o", "TCPKeepAlive=yes", + "-o", "ExitOnForwardFailure=yes", + "-L", "5001:localhost:5000", + "-L", "5050:localhost:5051", + "-L", "8006:localhost:8006", + f"{vm_user}@{vm_ip}", + ] + result = subprocess.run(cmd, capture_output=True) + return result.returncode == 0 + + +def _probe(server: str, timeout: int = 10) -> bool: + try: + resp = requests.get(f"{server}/probe", timeout=timeout) + return resp.ok + except Exception: + return False + + +def _setup_eval_proxy(vm_user: str, vm_ip: str) -> bool: + """(Re-)establish socat proxy for the evaluate server on the VM. + + Docker port forwarding for port 5050 is broken due to QEMU's custom + bridge networking (--cap-add NET_ADMIN). Work around it by running + socat on the VM host that pipes through ``docker exec`` into the + container's localhost:5050. The SSH tunnel maps local 5050 → VM 5051. + """ + script = ( + "killall socat 2>/dev/null || true; sleep 1; " + "which socat >/dev/null 2>&1 " + "|| sudo DEBIAN_FRONTEND=noninteractive apt-get install -y -qq socat; " + "nohup socat TCP-LISTEN:5051,fork,reuseaddr " + "'EXEC:docker exec -i winarena socat - TCP\\:127.0.0.1\\:5050' " + "/dev/null 2>&1 &" + ) + result = subprocess.run( + ["ssh", "-o", "StrictHostKeyChecking=no", f"{vm_user}@{vm_ip}", script], + capture_output=True, timeout=30, + ) + if result.returncode != 0: + print(f" socat proxy setup failed: {result.stderr.decode()}") + return False + print(" socat proxy for evaluate server established (VM:5051 → container:5050)") + return True + + +def _restart_container(vm_user: str, vm_ip: str) -> bool: + """Restart the WAA container and re-establish the evaluate proxy.""" + print(" Restarting WAA container (Windows will reboot)...") + result = subprocess.run( + ["ssh", "-o", "StrictHostKeyChecking=no", f"{vm_user}@{vm_ip}", + "docker restart winarena"], + capture_output=True, timeout=120, + ) + if result.returncode != 0: + print(f" Container restart failed: {result.stderr.decode()}") + return False + print(" Container restarted, re-establishing evaluate proxy...") + _setup_eval_proxy(vm_user, vm_ip) + return True + + +def ensure_waa_ready( + server: str, + vm_user: str, + vm_ip: str, + max_wait: int = 420, + evaluate_url: str | None = None, +) -> bool: + """Ensure WAA is reachable, recovering via tunnel reconnect or container restart. + + Recovery sequence: + 1. Probe → OK: return True + 2. Reconnect tunnel → Probe → OK: return True + 3. Restart container → Reconnect tunnel → Wait for probe: return True/False + + If evaluate_url is provided, both the WAA server and evaluate server must respond. + """ + # Step 1: Quick probe + if _probe(server) and (evaluate_url is None or _probe(evaluate_url)): + return True + + # Step 2: Reconnect tunnel + ensure socat proxy + print(" WAA unreachable, reconnecting tunnel...") + _kill_tunnels() + time.sleep(1) + _setup_eval_proxy(vm_user, vm_ip) + if _start_tunnel(vm_user, vm_ip): + time.sleep(3) + if _probe(server) and (evaluate_url is None or _probe(evaluate_url)): + print(" Tunnel reconnected, WAA ready!") + return True + + # Step 3: Tunnel up but WAA not responding → container restart + print(" Tunnel OK but WAA server dead, restarting container...") + _kill_tunnels() + if not _restart_container(vm_user, vm_ip): + return False + + # Wait for Windows to boot + Flask server to start + time.sleep(10) + _kill_tunnels() + time.sleep(1) + if not _start_tunnel(vm_user, vm_ip): + print(" Failed to reconnect tunnel after restart") + return False + + deadline = time.time() + max_wait + last_print = 0 + while time.time() < deadline: + elapsed = int(time.time() - (deadline - max_wait)) + if elapsed - last_print >= 15: + print(f" [{elapsed}s] Waiting for WAA server...") + last_print = elapsed + if _probe(server, timeout=10) and (evaluate_url is None or _probe(evaluate_url, timeout=10)): + print(f" WAA ready after {elapsed}s!") + return True + time.sleep(10) + + print(f" TIMEOUT: WAA not ready after {max_wait}s") + return False + + +def main() -> int: + parser = argparse.ArgumentParser(description="Run DC eval with auto-recovery") + parser.add_argument("--agent", default="api-claude-cu", help="Agent type") + parser.add_argument("--demo-dir", default="annotated_demos", help="Demo directory") + parser.add_argument("--server", default="http://localhost:5001") + parser.add_argument("--evaluate-url", default="http://localhost:5050") + parser.add_argument("--max-steps", type=int, default=15) + parser.add_argument("--output", default="benchmark_results") + parser.add_argument("--tasks", help="Comma-separated task IDs or prefixes (default: all 12)") + parser.add_argument("--start-from", type=int, default=0, help="Task index to start from") + parser.add_argument("--vm-ip", default="172.173.66.131", help="VM IP") + parser.add_argument("--vm-user", default="azureuser", help="VM SSH user") + parser.add_argument("--zs-only", action="store_true", help="Run zero-shot only (no demo)") + parser.add_argument("--dc-only", action="store_true", help="Run demo-conditioned only") + args = parser.parse_args() + + demo_dir = Path(args.demo_dir) + output_dir = Path(args.output) + + # Resolve task IDs (support prefix matching) + if args.tasks: + raw_ids = [t.strip() for t in args.tasks.split(",")] + task_ids = [] + for raw in raw_ids: + matches = [tid for tid in HARDER_TASK_IDS if tid.startswith(raw)] + if len(matches) == 1: + task_ids.append(matches[0]) + elif len(matches) > 1: + print(f"Ambiguous prefix '{raw}': {matches}") + return 1 + else: + task_ids.append(raw) # Use as-is + else: + task_ids = HARDER_TASK_IDS + + # Build run conditions + conditions = [] # (task_id, run_name, demo_path_or_None) + for tid in task_ids: + sid = short_id(tid) + if not args.dc_only: + conditions.append((tid, f"val_zs_{sid}", None)) + if not args.zs_only: + demo_path = demo_dir / f"{tid}.txt" + if not demo_path.exists(): + demo_path = demo_dir / f"{tid}.json" + if not demo_path.exists(): + print(f"WARNING: No demo for {tid}, skipping DC condition") + continue + conditions.append((tid, f"val_dc_{sid}", demo_path)) + + print(f"Eval: {len(conditions)} runs ({len(task_ids)} tasks) with {args.agent}") + print(f"VM: {args.vm_ip}") + print() + + # Verify initial WAA health + if not ensure_waa_ready(args.server, args.vm_user, args.vm_ip, evaluate_url=args.evaluate_url): + print("ERROR: Cannot reach WAA server or evaluate server") + return 1 + + results = {} + start_time = time.time() + + for i, (tid, run_name, demo_path) in enumerate(conditions): + if i < args.start_from: + print(f"[{i+1}/{len(conditions)}] Skipping {run_name}") + continue + + # Health check before each run + if not ensure_waa_ready(args.server, args.vm_user, args.vm_ip, evaluate_url=args.evaluate_url): + print(f" Skipping {run_name} - WAA unreachable after recovery") + results[run_name] = {"status": "SKIP", "returncode": -1, "elapsed_s": 0} + continue + + cond_label = "DC" if demo_path else "ZS" + print(f"{'=' * 60}") + print(f"[{i+1}/{len(conditions)}] {cond_label}: {tid}") + if demo_path: + print(f" Demo: {demo_path.name} ({demo_path.stat().st_size} bytes)") + print(f" Run: {run_name}") + print(f"{'=' * 60}") + + task_start = time.time() + + cmd = [ + sys.executable, "-m", "openadapt_evals.benchmarks.cli", + "run", + "--agent", args.agent, + "--tasks", tid, + "--server", args.server, + "--evaluate-url", args.evaluate_url, + "--max-steps", str(args.max_steps), + "--output", str(output_dir), + "--run-name", run_name, + ] + if demo_path: + cmd.extend(["--demo", str(demo_path.resolve())]) + + result = subprocess.run(cmd) + elapsed = time.time() - task_start + + status = "OK" if result.returncode == 0 else f"FAIL (rc={result.returncode})" + results[run_name] = { + "status": status, + "returncode": result.returncode, + "elapsed_s": elapsed, + "task_id": tid, + "condition": cond_label, + } + + print(f"\n -> {status} ({elapsed:.0f}s)\n") + + # Summary + total_time = time.time() - start_time + ok = sum(1 for r in results.values() if r["returncode"] == 0) + + print(f"\n{'=' * 60}") + print(f"EVAL SUMMARY ({args.agent})") + print(f"{'=' * 60}") + print(f" Runs: {ok}/{len(results)} completed") + print(f" Total time: {total_time:.0f}s ({total_time/60:.1f}min)") + print() + + for name, r in results.items(): + cond = r.get("condition", "?") + print(f" {name:30s} {cond:2s} {r['status']:15s} {r['elapsed_s']:.0f}s") + + print(f"\n Results in: {output_dir}/val_*/") + + return 0 if ok == len(results) else 1 + + +if __name__ == "__main__": + raise SystemExit(main()) diff --git a/tests/test_claude_computer_use_agent.py b/tests/test_claude_computer_use_agent.py index 0953124..a56325b 100644 --- a/tests/test_claude_computer_use_agent.py +++ b/tests/test_claude_computer_use_agent.py @@ -228,8 +228,8 @@ def test_no_tool_use_returns_done(self, agent, mock_anthropic_client): assert action.type == "done" assert "no_tool_use" in action.raw_action.get("reason", "") - def test_screenshot_action_returns_done(self, agent, mock_anthropic_client): - """Screenshot action maps to done (next step sends screenshot back).""" + def test_screenshot_action_returns_error_on_exhaustion(self, agent, mock_anthropic_client): + """Screenshot action returns error after exhausting retries.""" response = create_mock_response( create_tool_use_block("screenshot") ) @@ -237,10 +237,11 @@ def test_screenshot_action_returns_done(self, agent, mock_anthropic_client): action = agent.act(make_observation(), make_task()) - assert action.type == "done" + assert action.type == "error" + assert action.raw_action["reason"] == "max_internal_retries_exceeded" - def test_wait_action_returns_done(self, agent, mock_anthropic_client): - """Wait action maps to done.""" + def test_wait_action_returns_error_on_exhaustion(self, agent, mock_anthropic_client): + """Wait action returns error after exhausting retries.""" response = create_mock_response( create_tool_use_block("wait", duration=1.0) ) @@ -248,7 +249,8 @@ def test_wait_action_returns_done(self, agent, mock_anthropic_client): action = agent.act(make_observation(), make_task()) - assert action.type == "done" + assert action.type == "error" + assert action.raw_action["reason"] == "max_internal_retries_exceeded" class TestConversationManagement: @@ -431,30 +433,28 @@ def test_viewport_updates_display_dimensions(self, agent, mock_anthropic_client) assert abs(action.x - 0.5) < 0.01 # 960/1920 = 0.5 assert abs(action.y - 0.5) < 0.01 # 540/1080 = 0.5 - def test_no_screenshot_still_works(self, agent, mock_anthropic_client): - """Agent works even without a screenshot.""" - response = create_mock_response( - create_text_block("I cannot see the screen.") - ) - mock_anthropic_client.beta.messages.create.return_value = response - + def test_no_screenshot_returns_error(self, agent, mock_anthropic_client): + """Agent returns error when no screenshot is available.""" obs = BenchmarkObservation(screenshot=None) action = agent.act(obs, make_task()) - assert action.type == "done" + assert action.type == "error" + assert action.raw_action["reason"] == "no_screenshot" + assert action.raw_action["error_type"] == "infrastructure" class TestEdgeCases: """Test edge cases and error handling.""" - def test_api_error_returns_done(self, agent, mock_anthropic_client): - """API error results in done action.""" + def test_api_error_returns_error(self, agent, mock_anthropic_client): + """API error results in error action.""" mock_anthropic_client.beta.messages.create.side_effect = Exception("API error") action = agent.act(make_observation(), make_task()) - assert action.type == "done" - assert "error" in (action.raw_action or {}) + assert action.type == "error" + assert action.raw_action["reason"] == "api_call_failed" + assert action.raw_action["error_type"] == "infrastructure" def test_unknown_action_returns_done(self, agent, mock_anthropic_client): """Unknown action type returns done.""" diff --git a/tests/test_mock_adapter.py b/tests/test_mock_adapter.py index 12ee40a..1004d23 100644 --- a/tests/test_mock_adapter.py +++ b/tests/test_mock_adapter.py @@ -288,18 +288,18 @@ def test_evaluate_success_on_type_and_ok(self, mock_adapter): assert result.success is True assert result.score == 1.0 - def test_evaluate_success_on_done_with_actions(self, mock_adapter): - """Test that calling DONE after 2+ actions results in success.""" + def test_evaluate_no_false_positive_on_done_with_actions(self, mock_adapter): + """Test that calling DONE after non-target actions does NOT count as success.""" task = mock_adapter.list_tasks()[0] mock_adapter.reset(task) - # Take two meaningful actions + # Click random area (not Submit/OK) and call done mock_adapter.step(BenchmarkAction(type="click", x=0.5, y=0.5)) mock_adapter.step(BenchmarkAction(type="done")) result = mock_adapter.evaluate(task) - assert result.success is True - assert result.score == 1.0 + assert result.success is False + assert result.score < 1.0 def test_evaluate_partial_score_on_some_actions(self, mock_adapter): """Test that partial actions get partial credit.""" diff --git a/tests/test_qwen3vl_agent.py b/tests/test_qwen3vl_agent.py index 7018986..0223d09 100644 --- a/tests/test_qwen3vl_agent.py +++ b/tests/test_qwen3vl_agent.py @@ -158,7 +158,7 @@ def test_drag_stores_qwen_coords(self): def test_wait(self): action = parse_qwen_action("wait()") - assert action.type == "done" + assert action.type == "wait" assert action.raw_action.get("is_wait") is True def test_finished(self):