183 changes: 183 additions & 0 deletions libs/python/agent/agent/benchmark.py
@@ -0,0 +1,183 @@
#!/usr/bin/env python3
"""
Offline OSWorld-G evaluation runner.

Implements run_offline_dataset(dataset, split, **agent_kwargs) that evaluates a ComputerAgent
against an offline dataset by wiring a mock computer tool that surfaces the row image via
screenshot() and tracks clicks against the provided bounding box.

Dataset default: "MMInstruction/OSWorld-G"
Each row is expected to have keys:
- "instruction": str # instruction text
- "image": PIL.Image.Image # screenshot image
- "image_size": Tuple[int, int] # (width, height)
- "box_coordinates": Tuple[int,int,int,int] # (x, y, w, h)

The mock tool exposes methods: screenshot, click, get_dimensions, get_environment.
We run agent.run(instruction) and consume up to max_turns steps (default 5).
We mark success if any click landed within the target bbox.
"""

from __future__ import annotations

import asyncio
import base64
import io
from dataclasses import dataclass
from typing import Any, Dict, List, Optional, Tuple

from datasets import load_dataset
from PIL import Image

# Import ComputerAgent (v0.4.x style)
from agent import ComputerAgent # type: ignore

@dataclass
class RowContext:
    image: Image.Image
    image_size: Tuple[int, int]
    bbox_xywh: Tuple[int, int, int, int]


def make_offline_computer(row_ctx: RowContext) -> Tuple[Dict[str, Any], Dict[str, Any]]:
    """
    Create a dict-based mock computer tool compatible with CustomComputerHandler.

    Returns a tuple of (functions_dict, state_dict) where state_dict captures
    click_history and clicked_in_bbox for post-run evaluation.
    """
    state: Dict[str, Any] = {
        "click_history": [],
        "clicked_in_bbox": False,
    }

    # Functions expected by CustomComputerHandler
    def screenshot_func() -> Image.Image:
        # Return a PIL Image; the handler converts it to base64
        return row_ctx.image

    def click_func(x: int, y: int, button: str = "left") -> None:
        state["click_history"].append((int(x), int(y)))
        bx, by, bw, bh = row_ctx.bbox_xywh
        if bx <= x <= bx + bw and by <= y <= by + bh:
            state["clicked_in_bbox"] = True

    def get_environment_func() -> str:
        return "linux"

    def get_dimensions_func() -> Tuple[int, int]:
        return (int(row_ctx.image_size[0]), int(row_ctx.image_size[1]))

    functions: Dict[str, Any] = {
        "screenshot": screenshot_func,
        # For getters, CustomComputerHandler checks both 'get_{attr}' and '{attr}'
        "environment": get_environment_func,
        "dimensions": get_dimensions_func,
        "click": click_func,
    }

    return functions, state


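# --- Illustrative sketch (not part of the original diff): how the mock computer
# records clicks and flags a hit inside the target bbox. The 100x100 image and
# the bbox values below are made up purely for demonstration; this function is
# never called by the benchmark itself.
def _example_mock_usage() -> None:
    row_ctx = RowContext(
        image=Image.new("RGB", (100, 100)),
        image_size=(100, 100),
        bbox_xywh=(40, 40, 20, 20),  # x, y, w, h
    )
    functions, state = make_offline_computer(row_ctx)
    functions["click"](10, 10)  # outside the bbox -> no success yet
    functions["click"](50, 50)  # inside the bbox  -> success recorded
    assert state["click_history"] == [(10, 10), (50, 50)]
    assert state["clicked_in_bbox"] is True
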
async def run_offline_dataset(
    dataset: str = "MMInstruction/OSWorld-G",
    split: str = "test",
    max_turns: int = 5,
    **agent_kwargs: Any,
) -> Dict[str, Any]:
    """
    Evaluate a ComputerAgent on an offline dataset with a mock computer tool.

    Args:
        dataset: HF dataset path or local dataset identifier (default: "MMInstruction/OSWorld-G")
        split: dataset split to use (default: "test")
        max_turns: maximum number of agent steps to consume per example (default: 5)
        **agent_kwargs: any ComputerAgent kwargs (e.g., model="...", etc.)
            If 'tools' is provided, the mock computer will be appended.

    Returns:
        A dict with summary metrics and per-row results.
    """
    # Load dataset
    ds = load_dataset(dataset, split=split)

    # Prepare results
    total = 0
    successes = 0
    per_row: List[Dict[str, Any]] = []

    for idx, row in enumerate(ds):
        total += 1
        # Extract expected fields
        # Some datasets package image as PIL already; if not, convert
        img: Image.Image = row["image"]
        if not isinstance(img, Image.Image):
            # Attempt to convert if the dataset returns arrays
            img = Image.fromarray(img)
        img_size: Tuple[int, int] = tuple(row["image_size"])  # (w, h)
        bbox_xywh: Tuple[int, int, int, int] = tuple(row["box_coordinates"])  # (x, y, w, h)
        instruction: str = row["instruction"]

        row_ctx = RowContext(
            image=img,
            image_size=(int(img_size[0]), int(img_size[1])),
            bbox_xywh=(int(bbox_xywh[0]), int(bbox_xywh[1]), int(bbox_xywh[2]), int(bbox_xywh[3])),
        )
        functions, state = make_offline_computer(row_ctx)

        # Build tools list from kwargs + our mock
        tools = list(agent_kwargs.get("tools", []))
        tools.append(functions)

        # Build agent kwargs: ensure tools
        ak = dict(agent_kwargs)
        ak["tools"] = tools

        # Instantiate agent
        agent = ComputerAgent(**ak)  # expects model=..., tools=[...]

        # Run the agent for up to max_turns steps.
        # The API supports passing a plain string or a message list; follow the user's instruction style.
        step_count = 0
        async for _ in agent.run(instruction):  # type: ignore
            step_count += 1
            if step_count >= max_turns:
                break

        success = bool(state["clicked_in_bbox"])  # type: ignore
        if success:
            successes += 1

        per_row.append({
            "index": idx,
            "instruction": instruction,
            "clicked_in_bbox": success,
            "num_turns": step_count,
            "clicks": list(state["click_history"]),  # type: ignore
            "bbox_xywh": list(bbox_xywh),
            "image_size": list(img_size),
        })

    accuracy = successes / total if total else 0.0
    summary = {
        "dataset": dataset,
        "split": split,
        "total": total,
        "successes": successes,
        "accuracy": accuracy,
    }

    return {"summary": summary, "results": per_row}


async def _demo():
    """Simple demo entrypoint for quick local testing via `python benchmark.py`."""
    # Example: minimal required kwargs include a model string
    out = await run_offline_dataset(
        dataset="MMInstruction/OSWorld-G",
        split="test",
        model="anthropic/claude-3-5-sonnet-20241022",
    )
    print(out["summary"])  # noqa: T201


if __name__ == "__main__":
    asyncio.run(_demo())
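
For quick local runs outside the notebook, a minimal standalone script might look like the sketch below. It is not part of this diff; the split slice, model string, and output filename are placeholders, and it relies only on run_offline_dataset returning the JSON-serializable summary/results dict shown above.

# run_osworld_offline.py -- illustrative sketch, not part of this PR
import asyncio
import json

from agent.benchmark import run_offline_dataset


async def main() -> None:
    out = await run_offline_dataset(
        dataset="MMInstruction/OSWorld-G",
        split="test[:10]",                    # small slice for a smoke test
        model="openai/computer-use-preview",  # placeholder model string
    )
    # Persist per-row results for later inspection; the returned dict is JSON-serializable.
    with open("osworld_offline_results.json", "w") as f:
        json.dump(out, f, indent=2)
    print(out["summary"])


asyncio.run(main())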
58 changes: 57 additions & 1 deletion notebooks/hud_hackathon.ipynb
@@ -161,6 +161,9 @@
" max_concurrent=20, # Tune to your infra\n",
" max_steps=50, # Safety cap per task\n",
" split=\"train[:3]\" # Limit to just 3 tasks\n",
" # instructions=\"...\" # Set a custom system prompt\n",
" # callbacks=[], # Set custom callbacks\n",
" # tools=[your_python_func], # Add custom tools\n",
")\n",
"\n",
"# results is a list from hud.datasets.run_dataset; inspect/aggregate as needed\n",
@@ -169,6 +172,45 @@
"pprint(results[:3])"
]
},
{
"cell_type": "markdown",
"id": "920a09ed",
"metadata": {},
"source": [
"If HUD is not available, you can also test your agent offline on the OSWorld-G evaluation set"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "207d1b96",
"metadata": {},
"outputs": [],
"source": [
"from agent.benchmark import run_offline_dataset\n",
"import json\n",
"import logging\n",
"\n",
"# Choose a model and any ComputerAgent kwargs you want\n",
"# Examples:\n",
"# model=\"openai/computer-use-preview\"\n",
"# model=\"anthropic/claude-3-5-sonnet-20241022\"\n",
"# You can also pass other ComputerAgent kwargs like only_n_most_recent_images, callbacks, etc.\n",
"results = await run_offline_dataset(\n",
" dataset=\"MMInstruction/OSWorld-G\",\n",
" split=\"test[:50]\",\n",
" model=\"openai/computer-use-preview\",\n",
" only_n_most_recent_images=3,\n",
" verbosity=logging.INFO, # Print agent outputs \n",
" # instructions=\"...\" # Set a custom system prompt\n",
" # callbacks=[], # Set custom callbacks\n",
" # tools=[your_python_func], # Add custom tools\n",
")\n",
"\n",
"print(\"Summary:\", json.dumps(results[\"summary\"], indent=2))\n",
"print(\"First 3 results:\", json.dumps(results[\"results\"][:3], indent=2))"
]
},
{
"cell_type": "markdown",
"id": "5b89a103",
@@ -185,8 +227,22 @@
}
],
"metadata": {
"kernelspec": {
"display_name": ".venv",
"language": "python",
"name": "python3"
},
"language_info": {
"name": "python"
"codemirror_mode": {
"name": "ipython",
"version": 3
},
"file_extension": ".py",
"mimetype": "text/x-python",
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.12.11"
}
},
"nbformat": 4,