Compare commits
35 Commits
fix/browse
...
exp/click_
| Author | SHA1 | Date | |
|---|---|---|---|
|
|
74b7ec397e | ||
|
|
6dec35475b | ||
|
|
f9c56546cc | ||
|
|
670b8d9745 | ||
|
|
76b6869219 | ||
|
|
95303f4374 | ||
|
|
bccccce0a7 | ||
|
|
cda9965927 | ||
|
|
31eb93bdf8 | ||
|
|
3386f0a5ce | ||
|
|
b8ed09eaba | ||
|
|
907f10e7c8 | ||
|
|
db8bec9b59 | ||
|
|
2879574219 | ||
|
|
7125029049 | ||
|
|
e6c6c29472 | ||
|
|
d35c02b223 | ||
|
|
b0c5383407 | ||
|
|
bf9fc96f42 | ||
|
|
307c02c44a | ||
|
|
d865931db3 | ||
|
|
4e55be8b9f | ||
|
|
a39dfa52f3 | ||
|
|
a4584142a1 | ||
|
|
f538615a4f | ||
|
|
08968ff16e | ||
|
|
dca8b8555f | ||
|
|
98cc128d3b | ||
|
|
a6ae8bba56 | ||
|
|
a5c3769e4e | ||
|
|
4051fe189b | ||
|
|
eb08cac743 | ||
|
|
144a10946d | ||
|
|
e30f29dd06 | ||
|
|
b4e08d3a13 |
1
.gitignore
vendored
@@ -33,3 +33,4 @@ packages/browseros/build/tools/
|
||||
# AI SDK DevTools traces
|
||||
.devtools/
|
||||
.omc/
|
||||
packages/browseros-agent/tools/dogfood/browseros-dogfood
|
||||
|
||||
@@ -6,7 +6,7 @@ build-backend = "setuptools.build_meta"
|
||||
name = "browseros"
|
||||
version = "0.0.1"
|
||||
description = "BrowserOS Build System"
|
||||
requires-python = ">=3.12"
|
||||
requires-python = ">=3.11"
|
||||
dependencies = [
|
||||
"click>=8.0.0",
|
||||
"typer>=0.12.0",
|
||||
|
||||
8
prototypes/click_eval/.gitignore
vendored
Normal file
@@ -0,0 +1,8 @@
|
||||
/.venv/
|
||||
/.env
|
||||
/runs/
|
||||
/.pytest_cache/
|
||||
/.ruff_cache/
|
||||
/src/*.egg-info/
|
||||
__pycache__/
|
||||
*.pyc
|
||||
302
prototypes/click_eval/README.md
Normal file
@@ -0,0 +1,302 @@
|
||||
# Click Eval Prototype
|
||||
|
||||
Tiny VLM click-point evaluation harness.
|
||||
|
||||
## Layout
|
||||
|
||||
- `src/click_eval/`: runtime package and CLI
|
||||
- `tests/`: fixture-based tests with no network calls
|
||||
- `examples/`: default task/model config files and sample screenshot
|
||||
- `runs/`: suggested output location
|
||||
|
||||
## Input
|
||||
|
||||
Create a JSONL task file:
|
||||
|
||||
```jsonl
|
||||
{"task_id":"chat_1","image_path":"screenshots/page.png","instruction":"click the chat button"}
|
||||
{"task_id":"send_1","image_path":"screenshots/page.png","instruction":"click send","gt_point":[510,742]}
|
||||
```
|
||||
|
||||
`image_path` is resolved relative to the task file. Configured judge model(s)
|
||||
are called and cached in the run output. If `gt_point` is absent, the task is
|
||||
left unscored: judge outputs are recorded for inspection, but no median fallback
|
||||
GT is created and no green `GT` marker is rendered. If `gt_point` is present,
|
||||
that provided point is the scoring GT.
|
||||
|
||||
The bundled Hacker News sample screenshot is captured at `1920x1152` DPR 1. That
|
||||
is a higher-resolution desktop sample while still being much smaller than a 4K
|
||||
screenshot for local GUI models.
|
||||
|
||||
The default model config is `examples/models.json`. The abbreviated cloud/API
|
||||
portion is:
|
||||
|
||||
```json
|
||||
{
|
||||
"judge_models": [
|
||||
{
|
||||
"name": "openai-computer-use-judge",
|
||||
"provider": "openai_computer_use",
|
||||
"model": "computer-use-preview"
|
||||
},
|
||||
{
|
||||
"name": "claude-opus-4.7-judge",
|
||||
"provider": "openrouter",
|
||||
"model": "anthropic/claude-opus-4.7"
|
||||
},
|
||||
{
|
||||
"name": "gpt-5.5-judge",
|
||||
"provider": "openrouter",
|
||||
"model": "openai/gpt-5.5"
|
||||
},
|
||||
{
|
||||
"name": "gemini-3.1-pro-judge",
|
||||
"provider": "openrouter",
|
||||
"model": "google/gemini-3.1-pro-preview"
|
||||
}
|
||||
],
|
||||
"candidate_models": [
|
||||
{
|
||||
"name": "qwen3-vl-8b-instruct",
|
||||
"provider": "openrouter",
|
||||
"model": "qwen/qwen3-vl-8b-instruct"
|
||||
},
|
||||
{
|
||||
"name": "qwen3-vl-8b-thinking",
|
||||
"provider": "openrouter",
|
||||
"model": "qwen/qwen3-vl-8b-thinking"
|
||||
},
|
||||
{
|
||||
"name": "ui-tars-1.5-7b",
|
||||
"provider": "openrouter",
|
||||
"model": "bytedance/ui-tars-1.5-7b"
|
||||
},
|
||||
{"name": "glm-4.5v", "provider": "openrouter", "model": "z-ai/glm-4.5v"},
|
||||
{"name": "glm-4.6v", "provider": "openrouter", "model": "z-ai/glm-4.6v"},
|
||||
{
|
||||
"name": "glm-5v-turbo",
|
||||
"provider": "openrouter",
|
||||
"model": "z-ai/glm-5v-turbo"
|
||||
},
|
||||
{"name": "moondream", "provider": "moondream", "model": "moondream-cloud"},
|
||||
{
|
||||
"name": "gemini-3.1-pro",
|
||||
"provider": "openrouter",
|
||||
"model": "google/gemini-3.1-pro-preview"
|
||||
}
|
||||
]
|
||||
}
|
||||
```
|
||||
|
||||
The `name` is only the short label shown in plots and summary files. OpenRouter
|
||||
is the default provider, but the examples keep it explicit for routability
|
||||
audits. The default judge IDs were checked against
|
||||
`https://openrouter.ai/api/v1/models` on 2026-04-27 and are:
|
||||
|
||||
- `anthropic/claude-opus-4.7`
|
||||
- `openai/gpt-5.5`
|
||||
- `google/gemini-3.1-pro-preview`
|
||||
|
||||
The active OpenRouter click-model shortlist was checked against
|
||||
`https://openrouter.ai/api/v1/models` on 2026-04-26 and includes:
|
||||
|
||||
- `qwen/qwen3-vl-8b-instruct`
|
||||
- `qwen/qwen3-vl-8b-thinking`
|
||||
- `bytedance/ui-tars-1.5-7b`
|
||||
- `z-ai/glm-4.5v`
|
||||
- `z-ai/glm-4.6v`
|
||||
- `z-ai/glm-5v-turbo`
|
||||
|
||||
Shortlist models not found in the current OpenRouter catalog are documented
|
||||
below as `local_hf` candidates. They are included in `examples/models.json`, but
|
||||
the provider checks for a CUDA/NVIDIA GPU before importing local inference
|
||||
dependencies or downloading weights. If no usable CUDA GPU is present, they are
|
||||
recorded as skipped with the CUDA detection reason.
|
||||
|
||||
| Model | Hosting | Setup needed |
|
||||
| --- | --- | --- |
|
||||
| `Qwen/Qwen3-VL-2B-Instruct` | Hugging Face | Included as `local_hf`; small generic Qwen3-VL baseline. |
|
||||
| `Qwen/Qwen3-VL-2B-Thinking` | Hugging Face | Included as `local_hf`; small generic Qwen3-VL thinking baseline. |
|
||||
| `Qwen/Qwen2.5-VL-3B-Instruct` | Hugging Face | Included as `local_hf`; generic Qwen2.5-VL baseline using a relative point prompt. |
|
||||
| `mPLUG/GUI-Owl-1.5-2B-Instruct` | Hugging Face | Included as `local_hf`; Qwen3-VL GUI-agent adapter. |
|
||||
| `mPLUG/GUI-Owl-1.5-4B-Instruct` | Hugging Face | Included as `local_hf`; Qwen3-VL GUI-agent adapter. |
|
||||
| `mPLUG/GUI-Owl-1.5-8B-Instruct` | Hugging Face | Included as `local_hf`; Qwen3-VL GUI-agent adapter. |
|
||||
| `vocaela/KV-Ground-8B-BaseGuiOwl1.5-0315` | Hugging Face | Included as `local_hf`; high-performing ScreenSpot-Pro GUI grounder, non-commercial license. |
|
||||
| `inclusionAI/UI-Venus-1.5-2B` | Hugging Face | Included as `local_hf`; small Qwen3-VL GUI agent. |
|
||||
| `inclusionAI/UI-Venus-1.5-8B` | Hugging Face | Included as `local_hf`; strong Apache-2.0 GUI agent/grounder. |
|
||||
| `Hcompany/Holo2-4B` | Hugging Face | Included as `local_hf`; Qwen3-VL computer-use model. |
|
||||
| `Hcompany/Holo2-8B` | Hugging Face | Included as `local_hf`; Qwen3-VL computer-use model. |
|
||||
| `Salesforce/GTA1-7B` | Hugging Face | Included as `local_hf`; outputs `pyautogui.click(...)` coordinates after Qwen smart resize. |
|
||||
| `xlangai/OpenCUA-7B` | Hugging Face | Included as `local_hf`; outputs `pyautogui.click(...)` coordinates after Qwen smart resize. |
|
||||
| `InfiX-ai/InfiGUI-G1-3B` | Hugging Face | Included as `local_hf`; outputs JSON `point_2d` coordinates after Qwen smart resize. |
|
||||
| `InfiX-ai/InfiGUI-G1-7B` | Hugging Face | Included as `local_hf`; outputs JSON `point_2d` coordinates after Qwen smart resize. |
|
||||
| `tencent/POINTS-GUI-G` | Hugging Face | Included as `local_hf`; outputs normalized `(x, y)` coordinates and needs `WePOINTS`. |
|
||||
| `Tongyi-MAI/MAI-UI-8B` | Hugging Face | Included as `local_hf`; Qwen3-VL GUI agent, may need `HF_TOKEN` depending on access. |
|
||||
| `allenai/MolmoPoint-GUI-8B` | Hugging Face | Included as `local_hf`; outputs pointing tokens, so model-specific parser tuning may improve results. |
|
||||
| `microsoft/Fara-7B` | Hugging Face and Microsoft Foundry | Included as `local_hf`; Foundry use would need endpoint credentials and a separate adapter. |
|
||||
| `ServiceNow/GroundNext-7B-V0` | Hugging Face and Azure AI Foundry | Included as `local_hf`; Azure use would need endpoint credentials and a separate adapter. |
|
||||
| `osunlp/UGround-V1-2B` | Hugging Face | Included as `local_hf`; smaller UGround model using normalized coordinates. |
|
||||
| `osunlp/UGround-V1-7B` | Hugging Face | Included as `local_hf`; the model card also documents vLLM OpenAI-compatible serving. |
|
||||
| `ByteDance-Seed/UI-TARS-2B-SFT` | Hugging Face | Included as `local_hf`; small UI-TARS model using the normalized point adapter. |
|
||||
| `zonghanHZH/ZonUI-3B` | Hugging Face | Included as `local_hf`; lightweight Qwen2.5-VL GUI grounding model. |
|
||||
| `Yuqi-Zhou/GUI-G1-3B-v1` | Hugging Face | Included as `local_hf`; 3B GUI grounding model using JSON `point_2d` output. |
|
||||
| `xlangai/Jedi-3B-1080p` | Hugging Face | Included as `local_hf`; OSWorld-G Qwen2.5-VL click/tool-call model. |
|
||||
| `xlangai/Jedi-7B-1080p` | Hugging Face | Included as `local_hf`; larger Jedi click/tool-call model. |
|
||||
| `Tongyi-MiA/UI-Ins-7B` | Hugging Face | Included as `local_hf`; GUI grounding model using tool-call coordinates. |
|
||||
| `osunlp/GUI-Drag-7B` | Hugging Face | Included as `local_hf`; drag-focused GUI model with preserved click behavior. |
|
||||
| `OS-Copilot/OS-Atlas-Base-4B` | Hugging Face | Included as `local_hf`; outputs normalized coordinates/boxes, so parser tuning may improve results. |
|
||||
| `OS-Copilot/OS-Atlas-Base-7B` | Hugging Face | Included as `local_hf`; outputs normalized coordinates/boxes, so parser tuning may improve results. |
|
||||
| `showlab/ShowUI-2B` | Hugging Face | Included as `local_hf`; parser tuning may be needed for action-dictionary outputs. |
|
||||
| `Qwen/Qwen3-VL-4B-Instruct` | Hugging Face | Included as `local_hf`; not currently routable through OpenRouter. |
|
||||
| `Qwen/Qwen3-VL-4B-Thinking` | Hugging Face | Included as `local_hf`; not currently routable through OpenRouter. |
|
||||
|
||||
The 2026-04-26 pass also found promising custom-head local models such as
|
||||
`microsoft/GUI-Actor-3B-Qwen2.5-VL`, `inclusionAI/V2P-7B`, and
|
||||
`TESS-Computer/qwen-click-dit`. Those are not in the default list yet because
|
||||
their model cards require custom Python model classes or action heads beyond
|
||||
plain `transformers` loading.
|
||||
|
||||
For HF-local models, install optional local dependencies with:
|
||||
|
||||
```bash
|
||||
uv sync --extra local
|
||||
```
|
||||
|
||||
This installs `torch`, `torchvision`, `transformers`, `accelerate`, `einops`,
|
||||
`qwen-vl-utils`, `safetensors`, `timm`, `sentencepiece`, `protobuf`,
|
||||
`requests`, `tiktoken`, and `WePOINTS`. `torch>=2.6` is required for models
|
||||
that still ship PyTorch `.bin` weights because older PyTorch releases are
|
||||
blocked by the CVE-2025-32434 `torch.load` guard. MolmoPoint also expects
|
||||
`einops`, and the Qwen-derived GUI models use `qwen-vl-utils` for image
|
||||
preprocessing. POINTS-GUI-G requires FlashAttention 2 at runtime, but it is not
|
||||
installed by the local extra because its native build must match the active
|
||||
Python, PyTorch, and CUDA environment.
|
||||
|
||||
The local provider is intentionally conservative: it only runs when PyTorch can
|
||||
use CUDA, and it skips non-offloaded models whose estimated VRAM exceeds the
|
||||
detected GPU memory. Models marked `allow_cpu_offload` use Transformers
|
||||
`device_map="auto"`; other local models load directly onto `cuda:0`. This
|
||||
means that, in a misconfigured container where `nvidia-smi` works but `torch.cuda` does
|
||||
not, local models are skipped instead of silently running an 8B model on CPU. Local
|
||||
generation uses the CLI `--timeout` value as the Transformers `max_time` budget.
|
||||
Several model-specific adapters are included for MolmoPoint, GroundNext,
|
||||
UGround, OS-Atlas, ShowUI, Qwen3-VL/MAI-UI, OpenCUA, GTA1, InfiGUI, and POINTS-GUI-G.
|
||||
Local model configs use `fp16` and CPU offload for the larger checkpoints
|
||||
instead of quantization. MolmoPoint is the
|
||||
exception: its official inference path uses BF16 autocast, and FP16 overflows in
|
||||
its pointing-token generation path. Timing for offloaded models will include
|
||||
CPU-GPU transfer overhead. The local runner keeps one HF model loaded while it
|
||||
runs every task for that model, then unloads it and clears the CUDA cache before
|
||||
the next local model. For gated/private downloads, set `HF_TOKEN`. For
|
||||
Azure/Foundry-hosted variants, expect an endpoint URL plus API key and a
|
||||
dedicated provider adapter.
|
||||
|
||||
Moondream candidates use a provider-qualified entry. OpenAI Computer Use judges
|
||||
or candidates use `provider: "openai_computer_use"` and require
|
||||
`OPENAI_API_KEY`. The default Gemini candidate uses OpenRouter, so it only needs
|
||||
`OPENROUTER_API_KEY`.
|
||||
|
||||
```json
|
||||
{
|
||||
"candidate_models": [
|
||||
{
|
||||
"name": "openai-computer-use-judge",
|
||||
"provider": "openai_computer_use",
|
||||
"model": "computer-use-preview"
|
||||
},
|
||||
{
|
||||
"name": "moondream",
|
||||
"provider": "moondream",
|
||||
"model": "moondream-cloud"
|
||||
},
|
||||
{
|
||||
"name": "gemini-3.1-pro",
|
||||
"provider": "openrouter",
|
||||
"model": "google/gemini-3.1-pro-preview"
|
||||
}
|
||||
]
|
||||
}
|
||||
```
|
||||
|
||||
## Run
|
||||
|
||||
```bash
|
||||
cd prototypes/click_eval
|
||||
uv sync
|
||||
export OPENROUTER_API_KEY=...
|
||||
# Optional, for Moondream candidates:
|
||||
export MOONDREAM_API_KEY=...
|
||||
# Optional, for OpenAI Computer Use judges/candidates:
|
||||
export OPENAI_API_KEY=...
|
||||
uv run click-eval run
|
||||
```
|
||||
|
||||
Without `uv`, use:
|
||||
|
||||
```bash
|
||||
cd prototypes/click_eval
|
||||
python -m pip install -r requirements.txt
|
||||
python -m click_eval run
|
||||
```
|
||||
|
||||
On an interactive terminal, `run` shows tqdm progress bars for tasks and model
|
||||
calls. In non-interactive output, it prints plain status lines instead. Use
|
||||
`--no-progress` to suppress both.
|
||||
Use `--limit N` to run only the first N tasks, and `--model-limit N` to run
|
||||
only the first N candidate models.
|
||||
|
||||
The CLI also loads `OPENAI_API_KEY`, `MOONDREAM_API_KEY`, `GEMINI_API_KEY`,
|
||||
`GOOGLE_API_KEY`, and `OPENROUTER_API_KEY` from a local `.env` file in
|
||||
`prototypes/click_eval/` or the current working directory. `GEMINI_API_KEY` is
|
||||
only needed if you manually add a native `provider: "gemini"` entry.
|
||||
Moondream calls use `POST https://api.moondream.ai/v1/point` with the screenshot
|
||||
as a base64 data URL and the click instruction converted to an object query.
|
||||
OpenRouter Claude calls resize screenshots client-side before upload when the
|
||||
image exceeds Claude's no-resize long-edge limit, then remap parsed coordinates
|
||||
from the resized image back to original screenshot pixels. Claude Opus 4.7 uses a
|
||||
2576 px long-edge target; older Claude models use 1568 px.
|
||||
OpenAI Computer Use calls use the Responses API with `computer_use_preview`,
|
||||
request `detail: "original"`, send the original screenshot dimensions, and
|
||||
return a `click` action in that display coordinate space.
|
||||
The default Gemini candidate uses OpenRouter's regular multimodal chat API and
|
||||
the same JSON point prompt as the other OpenRouter VLMs. Native Gemini Computer
|
||||
Use support remains available for manually configured `provider: "gemini"`
|
||||
entries.
|
||||
|
||||
During a run, the CLI first resolves task GT/judge overlays, then runs
|
||||
candidate predictions model-major: one model is evaluated across all tasks
|
||||
before moving to the next model. It also prints compact status lines for GT
|
||||
resolution, provider/model calls, prediction failures, and the output directory.
|
||||
|
||||
OpenRouter GT judges are sent concurrently in bounded batches of 4. OpenRouter
|
||||
candidate calls are also concurrent across tasks for the current model in
|
||||
batches of 4. Local HF/GPU candidates stay synchronous and serial per model to
|
||||
avoid GPU memory contention; Moondream and native Gemini provider calls remain
|
||||
synchronous.
|
||||
|
||||
Outputs:
|
||||
|
||||
- `resolved_tasks.jsonl`: task manifest with `gt_point` when provided
|
||||
- `predictions.jsonl`: raw candidate responses and parsed points
|
||||
- `scores.csv`: per-task L2 distances when a scoring GT is available
|
||||
- `summary.json`: aggregate metrics per model
|
||||
- `annotated/*.png`: screenshot overlays with the scoring `GT` when present,
|
||||
judge points (`GT1`, `GT2`, ...), and candidate predictions
|
||||
|
||||
`predictions.jsonl`, `scores.csv`, and `summary.json` include per-model
|
||||
`duration_seconds` timing fields. Skipped local models are marked with
|
||||
`skipped=true` and an error message explaining the skip reason.
|
||||
|
||||
By default, `click-eval run` uses:
|
||||
|
||||
- `examples/tasks.jsonl`
|
||||
- `examples/models.json`
|
||||
- `runs/<timestamp>`
|
||||
|
||||
## Development
|
||||
|
||||
```bash
|
||||
cd prototypes/click_eval
|
||||
uv run pytest
|
||||
uv run ruff check .
|
||||
```
|
||||
440
prototypes/click_eval/examples/models.json
Normal file
@@ -0,0 +1,440 @@
|
||||
{
|
||||
"judge_models": [
|
||||
{
|
||||
"name": "openai-computer-use-judge",
|
||||
"provider": "openai_computer_use",
|
||||
"model": "computer-use-preview"
|
||||
},
|
||||
{
|
||||
"name": "claude-opus-4.7-judge",
|
||||
"provider": "openrouter",
|
||||
"model": "anthropic/claude-opus-4.7"
|
||||
},
|
||||
{
|
||||
"name": "gpt-5.5-judge",
|
||||
"provider": "openrouter",
|
||||
"model": "openai/gpt-5.5"
|
||||
},
|
||||
{
|
||||
"name": "gemini-3.1-pro-judge",
|
||||
"provider": "openrouter",
|
||||
"model": "google/gemini-3.1-pro-preview"
|
||||
}
|
||||
],
|
||||
"candidate_models": [
|
||||
{
|
||||
"name": "qwen3-vl-8b-instruct",
|
||||
"provider": "openrouter",
|
||||
"model": "qwen/qwen3-vl-8b-instruct"
|
||||
},
|
||||
{
|
||||
"name": "qwen3-vl-8b-thinking",
|
||||
"provider": "openrouter",
|
||||
"model": "qwen/qwen3-vl-8b-thinking"
|
||||
},
|
||||
{
|
||||
"name": "ui-tars-1.5-7b",
|
||||
"provider": "openrouter",
|
||||
"model": "bytedance/ui-tars-1.5-7b"
|
||||
},
|
||||
{
|
||||
"name": "glm-4.5v",
|
||||
"provider": "openrouter",
|
||||
"model": "z-ai/glm-4.5v"
|
||||
},
|
||||
{
|
||||
"name": "glm-4.6v",
|
||||
"provider": "openrouter",
|
||||
"model": "z-ai/glm-4.6v"
|
||||
},
|
||||
{
|
||||
"name": "glm-5v-turbo",
|
||||
"provider": "openrouter",
|
||||
"model": "z-ai/glm-5v-turbo"
|
||||
},
|
||||
{
|
||||
"name": "moondream",
|
||||
"provider": "moondream",
|
||||
"model": "moondream-cloud"
|
||||
},
|
||||
{
|
||||
"name": "gemini-3.1-pro",
|
||||
"provider": "openrouter",
|
||||
"model": "google/gemini-3.1-pro-preview"
|
||||
},
|
||||
{
|
||||
"name": "qwen3-vl-2b-instruct-local",
|
||||
"provider": "local_hf",
|
||||
"model": "Qwen/Qwen3-VL-2B-Instruct",
|
||||
"adapter": "qwen3_vl",
|
||||
"dtype": "fp16",
|
||||
"allow_cpu_offload": true,
|
||||
"estimated_vram_gb": 8,
|
||||
"max_pixels": 1310720,
|
||||
"max_new_tokens": 128
|
||||
},
|
||||
{
|
||||
"name": "qwen3-vl-2b-thinking-local",
|
||||
"provider": "local_hf",
|
||||
"model": "Qwen/Qwen3-VL-2B-Thinking",
|
||||
"adapter": "qwen3_vl",
|
||||
"dtype": "fp16",
|
||||
"allow_cpu_offload": true,
|
||||
"estimated_vram_gb": 8,
|
||||
"max_pixels": 1310720,
|
||||
"max_new_tokens": 1024
|
||||
},
|
||||
{
|
||||
"name": "qwen3-vl-4b-instruct-local",
|
||||
"provider": "local_hf",
|
||||
"model": "Qwen/Qwen3-VL-4B-Instruct",
|
||||
"adapter": "qwen3_vl",
|
||||
"dtype": "fp16",
|
||||
"estimated_vram_gb": 12,
|
||||
"max_pixels": 1310720,
|
||||
"max_new_tokens": 128
|
||||
},
|
||||
{
|
||||
"name": "qwen3-vl-4b-thinking-local",
|
||||
"provider": "local_hf",
|
||||
"model": "Qwen/Qwen3-VL-4B-Thinking",
|
||||
"adapter": "qwen3_vl",
|
||||
"dtype": "fp16",
|
||||
"allow_cpu_offload": true,
|
||||
"estimated_vram_gb": 12,
|
||||
"max_pixels": 1310720,
|
||||
"max_new_tokens": 1024
|
||||
},
|
||||
{
|
||||
"name": "qwen2.5-vl-3b-instruct-local",
|
||||
"provider": "local_hf",
|
||||
"model": "Qwen/Qwen2.5-VL-3B-Instruct",
|
||||
"adapter": "qwen25_point_1000",
|
||||
"dtype": "fp16",
|
||||
"allow_cpu_offload": true,
|
||||
"estimated_vram_gb": 10,
|
||||
"min_pixels": 3136,
|
||||
"max_pixels": 1003520,
|
||||
"max_new_tokens": 128
|
||||
},
|
||||
{
|
||||
"name": "gui-owl-1.5-2b-local",
|
||||
"provider": "local_hf",
|
||||
"model": "mPLUG/GUI-Owl-1.5-2B-Instruct",
|
||||
"adapter": "qwen3_vl",
|
||||
"dtype": "fp16",
|
||||
"estimated_vram_gb": 8,
|
||||
"max_pixels": 1310720,
|
||||
"max_new_tokens": 128
|
||||
},
|
||||
{
|
||||
"name": "gui-owl-1.5-4b-local",
|
||||
"provider": "local_hf",
|
||||
"model": "mPLUG/GUI-Owl-1.5-4B-Instruct",
|
||||
"adapter": "qwen3_vl",
|
||||
"dtype": "fp16",
|
||||
"allow_cpu_offload": true,
|
||||
"estimated_vram_gb": 12,
|
||||
"max_pixels": 1310720,
|
||||
"max_new_tokens": 128
|
||||
},
|
||||
{
|
||||
"name": "gui-owl-1.5-8b-local",
|
||||
"provider": "local_hf",
|
||||
"model": "mPLUG/GUI-Owl-1.5-8B-Instruct",
|
||||
"adapter": "qwen3_vl",
|
||||
"dtype": "fp16",
|
||||
"allow_cpu_offload": true,
|
||||
"estimated_vram_gb": 24,
|
||||
"max_pixels": 1310720,
|
||||
"max_new_tokens": 128
|
||||
},
|
||||
{
|
||||
"name": "kv-ground-8b-local",
|
||||
"provider": "local_hf",
|
||||
"model": "vocaela/KV-Ground-8B-BaseGuiOwl1.5-0315",
|
||||
"adapter": "qwen3_vl",
|
||||
"dtype": "fp16",
|
||||
"allow_cpu_offload": true,
|
||||
"estimated_vram_gb": 24,
|
||||
"max_pixels": 1310720,
|
||||
"max_new_tokens": 128
|
||||
},
|
||||
{
|
||||
"name": "ui-venus-1.5-2b-local",
|
||||
"provider": "local_hf",
|
||||
"model": "inclusionAI/UI-Venus-1.5-2B",
|
||||
"adapter": "qwen3_vl",
|
||||
"dtype": "fp16",
|
||||
"estimated_vram_gb": 8,
|
||||
"max_pixels": 1310720,
|
||||
"max_new_tokens": 128
|
||||
},
|
||||
{
|
||||
"name": "ui-venus-1.5-8b-local",
|
||||
"provider": "local_hf",
|
||||
"model": "inclusionAI/UI-Venus-1.5-8B",
|
||||
"adapter": "qwen3_vl",
|
||||
"dtype": "fp16",
|
||||
"allow_cpu_offload": true,
|
||||
"estimated_vram_gb": 24,
|
||||
"max_pixels": 1310720,
|
||||
"max_new_tokens": 128
|
||||
},
|
||||
{
|
||||
"name": "holo2-4b-local",
|
||||
"provider": "local_hf",
|
||||
"model": "Hcompany/Holo2-4B",
|
||||
"adapter": "holo2",
|
||||
"dtype": "fp16",
|
||||
"allow_cpu_offload": true,
|
||||
"estimated_vram_gb": 12,
|
||||
"max_pixels": 1310720,
|
||||
"max_new_tokens": 64
|
||||
},
|
||||
{
|
||||
"name": "holo2-8b-local",
|
||||
"provider": "local_hf",
|
||||
"model": "Hcompany/Holo2-8B",
|
||||
"adapter": "holo2",
|
||||
"dtype": "fp16",
|
||||
"allow_cpu_offload": true,
|
||||
"estimated_vram_gb": 24,
|
||||
"max_pixels": 1310720,
|
||||
"max_new_tokens": 64
|
||||
},
|
||||
{
|
||||
"name": "gta1-7b-local",
|
||||
"provider": "local_hf",
|
||||
"model": "Salesforce/GTA1-7B",
|
||||
"adapter": "gta1",
|
||||
"dtype": "fp16",
|
||||
"allow_cpu_offload": true,
|
||||
"estimated_vram_gb": 24,
|
||||
"min_pixels": 3136,
|
||||
"max_pixels": 1003520,
|
||||
"max_new_tokens": 512
|
||||
},
|
||||
{
|
||||
"name": "opencua-7b-local",
|
||||
"provider": "local_hf",
|
||||
"model": "xlangai/OpenCUA-7B",
|
||||
"adapter": "opencua",
|
||||
"dtype": "fp16",
|
||||
"allow_cpu_offload": true,
|
||||
"estimated_vram_gb": 24,
|
||||
"min_pixels": 3136,
|
||||
"max_pixels": 1003520,
|
||||
"max_new_tokens": 512
|
||||
},
|
||||
{
|
||||
"name": "infigui-g1-3b-local",
|
||||
"provider": "local_hf",
|
||||
"model": "InfiX-ai/InfiGUI-G1-3B",
|
||||
"adapter": "infigui",
|
||||
"dtype": "fp16",
|
||||
"allow_cpu_offload": true,
|
||||
"estimated_vram_gb": 12,
|
||||
"min_pixels": 3136,
|
||||
"max_pixels": 1003520,
|
||||
"max_new_tokens": 512
|
||||
},
|
||||
{
|
||||
"name": "infigui-g1-7b-local",
|
||||
"provider": "local_hf",
|
||||
"model": "InfiX-ai/InfiGUI-G1-7B",
|
||||
"adapter": "infigui",
|
||||
"dtype": "fp16",
|
||||
"allow_cpu_offload": true,
|
||||
"estimated_vram_gb": 24,
|
||||
"min_pixels": 3136,
|
||||
"max_pixels": 1003520,
|
||||
"max_new_tokens": 512
|
||||
},
|
||||
{
|
||||
"name": "points-gui-g-local",
|
||||
"provider": "local_hf",
|
||||
"model": "tencent/POINTS-GUI-G",
|
||||
"adapter": "points_gui_g",
|
||||
"dtype": "fp16",
|
||||
"allow_cpu_offload": true,
|
||||
"estimated_vram_gb": 24,
|
||||
"max_new_tokens": 128
|
||||
},
|
||||
{
|
||||
"name": "mai-ui-8b-local",
|
||||
"provider": "local_hf",
|
||||
"model": "Tongyi-MAI/MAI-UI-8B",
|
||||
"adapter": "qwen3_vl",
|
||||
"dtype": "fp16",
|
||||
"allow_cpu_offload": true,
|
||||
"estimated_vram_gb": 24,
|
||||
"max_pixels": 1310720,
|
||||
"max_new_tokens": 128
|
||||
},
|
||||
{
|
||||
"name": "molmopoint-gui-8b-local",
|
||||
"provider": "local_hf",
|
||||
"model": "allenai/MolmoPoint-GUI-8B",
|
||||
"adapter": "molmopoint",
|
||||
"dtype": "bf16",
|
||||
"allow_cpu_offload": true,
|
||||
"estimated_vram_gb": 24,
|
||||
"max_new_tokens": 200
|
||||
},
|
||||
{
|
||||
"name": "fara-7b-local",
|
||||
"provider": "local_hf",
|
||||
"model": "microsoft/Fara-7B",
|
||||
"adapter": "qwen25_tool_absolute",
|
||||
"dtype": "fp16",
|
||||
"allow_cpu_offload": true,
|
||||
"estimated_vram_gb": 20,
|
||||
"min_pixels": 3136,
|
||||
"max_pixels": 1003520,
|
||||
"max_new_tokens": 512
|
||||
},
|
||||
{
|
||||
"name": "groundnext-7b-local",
|
||||
"provider": "local_hf",
|
||||
"model": "ServiceNow/GroundNext-7B-V0",
|
||||
"adapter": "groundnext",
|
||||
"dtype": "fp16",
|
||||
"allow_cpu_offload": true,
|
||||
"estimated_vram_gb": 24,
|
||||
"max_pixels": 1003520,
|
||||
"max_new_tokens": 128
|
||||
},
|
||||
{
|
||||
"name": "uground-v1-7b-local",
|
||||
"provider": "local_hf",
|
||||
"model": "osunlp/UGround-V1-7B",
|
||||
"adapter": "uground",
|
||||
"dtype": "fp16",
|
||||
"allow_cpu_offload": true,
|
||||
"estimated_vram_gb": 24,
|
||||
"max_pixels": 1003520
|
||||
},
|
||||
{
|
||||
"name": "uground-v1-2b-local",
|
||||
"provider": "local_hf",
|
||||
"model": "osunlp/UGround-V1-2B",
|
||||
"adapter": "uground",
|
||||
"dtype": "fp16",
|
||||
"allow_cpu_offload": true,
|
||||
"estimated_vram_gb": 8,
|
||||
"max_pixels": 1003520
|
||||
},
|
||||
{
|
||||
"name": "ui-tars-2b-sft-local",
|
||||
"provider": "local_hf",
|
||||
"model": "ByteDance-Seed/UI-TARS-2B-SFT",
|
||||
"adapter": "uground",
|
||||
"dtype": "fp16",
|
||||
"allow_cpu_offload": true,
|
||||
"estimated_vram_gb": 8,
|
||||
"max_pixels": 1003520
|
||||
},
|
||||
{
|
||||
"name": "zonui-3b-local",
|
||||
"provider": "local_hf",
|
||||
"model": "zonghanHZH/ZonUI-3B",
|
||||
"adapter": "zonui",
|
||||
"dtype": "fp16",
|
||||
"allow_cpu_offload": true,
|
||||
"estimated_vram_gb": 10,
|
||||
"min_pixels": 3136,
|
||||
"max_pixels": 1003520,
|
||||
"max_new_tokens": 64
|
||||
},
|
||||
{
|
||||
"name": "gui-g1-3b-local",
|
||||
"provider": "local_hf",
|
||||
"model": "Yuqi-Zhou/GUI-G1-3B-v1",
|
||||
"adapter": "qwen25_point_1000",
|
||||
"dtype": "fp16",
|
||||
"allow_cpu_offload": true,
|
||||
"estimated_vram_gb": 10,
|
||||
"min_pixels": 3136,
|
||||
"max_pixels": 1003520,
|
||||
"max_new_tokens": 128
|
||||
},
|
||||
{
|
||||
"name": "jedi-3b-1080p-local",
|
||||
"provider": "local_hf",
|
||||
"model": "xlangai/Jedi-3B-1080p",
|
||||
"adapter": "qwen25_tool_absolute",
|
||||
"dtype": "fp16",
|
||||
"allow_cpu_offload": true,
|
||||
"estimated_vram_gb": 10,
|
||||
"min_pixels": 3136,
|
||||
"max_pixels": 1003520,
|
||||
"max_new_tokens": 128
|
||||
},
|
||||
{
|
||||
"name": "jedi-7b-1080p-local",
|
||||
"provider": "local_hf",
|
||||
"model": "xlangai/Jedi-7B-1080p",
|
||||
"adapter": "qwen25_tool_absolute",
|
||||
"dtype": "fp16",
|
||||
"allow_cpu_offload": true,
|
||||
"estimated_vram_gb": 24,
|
||||
"min_pixels": 3136,
|
||||
"max_pixels": 1003520,
|
||||
"max_new_tokens": 128
|
||||
},
|
||||
{
|
||||
"name": "ui-ins-7b-local",
|
||||
"provider": "local_hf",
|
||||
"model": "Tongyi-MiA/UI-Ins-7B",
|
||||
"adapter": "qwen25_tool_absolute",
|
||||
"dtype": "fp16",
|
||||
"allow_cpu_offload": true,
|
||||
"estimated_vram_gb": 24,
|
||||
"min_pixels": 3136,
|
||||
"max_pixels": 1003520,
|
||||
"max_new_tokens": 256
|
||||
},
|
||||
{
|
||||
"name": "gui-drag-7b-local",
|
||||
"provider": "local_hf",
|
||||
"model": "osunlp/GUI-Drag-7B",
|
||||
"adapter": "gui_drag",
|
||||
"dtype": "fp16",
|
||||
"allow_cpu_offload": true,
|
||||
"estimated_vram_gb": 24,
|
||||
"min_pixels": 12544,
|
||||
"max_pixels": 2116800,
|
||||
"max_new_tokens": 1024
|
||||
},
|
||||
{
|
||||
"name": "os-atlas-base-4b-local",
|
||||
"provider": "local_hf",
|
||||
"model": "OS-Copilot/OS-Atlas-Base-4B",
|
||||
"adapter": "os_atlas_4b",
|
||||
"dtype": "fp16",
|
||||
"allow_cpu_offload": true,
|
||||
"estimated_vram_gb": 12,
|
||||
"max_new_tokens": 128
|
||||
},
|
||||
{
|
||||
"name": "os-atlas-base-7b-local",
|
||||
"provider": "local_hf",
|
||||
"model": "OS-Copilot/OS-Atlas-Base-7B",
|
||||
"adapter": "os_atlas_7b",
|
||||
"dtype": "fp16",
|
||||
"allow_cpu_offload": true,
|
||||
"estimated_vram_gb": 24,
|
||||
"max_pixels": 1003520
|
||||
},
|
||||
{
|
||||
"name": "showui-2b-local",
|
||||
"provider": "local_hf",
|
||||
"model": "showlab/ShowUI-2B",
|
||||
"adapter": "showui",
|
||||
"dtype": "fp16",
|
||||
"estimated_vram_gb": 8,
|
||||
"max_pixels": 1053696
|
||||
}
|
||||
]
|
||||
}
|
||||
@@ -0,0 +1,49 @@
|
||||
{
|
||||
"url": "https://www.aliexpress.com/",
|
||||
"screenshot": "prototypes/click_eval/examples/screenshots/aliexpress_search_by_image.png",
|
||||
"image_size": {
|
||||
"width": 1920,
|
||||
"height": 1152
|
||||
},
|
||||
"device_scale_factor": 1,
|
||||
"target": {
|
||||
"label": "Search by image",
|
||||
"selector": "img[alt=\"Search by image\"]",
|
||||
"rect": {
|
||||
"x": 1189.171875,
|
||||
"y": 25.5,
|
||||
"width": 24,
|
||||
"height": 29
|
||||
},
|
||||
"image_rect": {
|
||||
"x": 1189.171875,
|
||||
"y": 25.5,
|
||||
"width": 24,
|
||||
"height": 24
|
||||
},
|
||||
"center": {
|
||||
"x": 1201.171875,
|
||||
"y": 40
|
||||
}
|
||||
},
|
||||
"viewport": {
|
||||
"width": 1920,
|
||||
"height": 1152,
|
||||
"dpr": 1,
|
||||
"scrollX": 0,
|
||||
"scrollY": 0,
|
||||
"url": "https://www.aliexpress.com/",
|
||||
"title": "AliExpress - Affordable Chinese Stores & Free Shipping - Online Shopping"
|
||||
},
|
||||
"captured_at_utc": "2026-04-29T15:07:12.753Z",
|
||||
"dismissed_overlays": [
|
||||
"Free gift modal",
|
||||
"Cookie banner",
|
||||
"Notification prompt"
|
||||
],
|
||||
"caveats": [
|
||||
"Live AliExpress homepage; products, placeholder text, and layout can change upstream.",
|
||||
"Popups were dismissed before capture so the search by image button is unobstructed.",
|
||||
"Animations/transitions were reduced before capture for determinism."
|
||||
]
|
||||
}
|
||||
|
After Width: | Height: | Size: 760 KiB |
@@ -0,0 +1,289 @@
|
||||
{
|
||||
"url": "https://www.bustracker.ie/?q=208",
|
||||
"screenshot": "prototypes/click_eval/examples/screenshots/bustracker_208_cork_city_centre.png",
|
||||
"image_size": {
|
||||
"width": 1920,
|
||||
"height": 1152
|
||||
},
|
||||
"device_scale_factor": 1,
|
||||
"target": {
|
||||
"label": "Route 208 bus closest to Cork city centre",
|
||||
"selector": ".leaflet-marker-icon.route-rectangle",
|
||||
"center": {
|
||||
"x": 1068,
|
||||
"y": 511
|
||||
},
|
||||
"rect": {
|
||||
"x": 1038.0191650390625,
|
||||
"y": 490.70770263671875,
|
||||
"width": 59.961669921875,
|
||||
"height": 40.5845947265625,
|
||||
"top": 490.70770263671875,
|
||||
"left": 1038.0191650390625,
|
||||
"right": 1097.9808349609375,
|
||||
"bottom": 531.2922973632812
|
||||
},
|
||||
"text": "208",
|
||||
"latLon": {
|
||||
"lat": 51.89778723113112,
|
||||
"lon": -8.476724624633789
|
||||
},
|
||||
"distanceToCorkCentreM": 110.6165039682958,
|
||||
"markerIndex": 4
|
||||
},
|
||||
"viewport": {
|
||||
"width": 1920,
|
||||
"height": 1152,
|
||||
"dpr": 1,
|
||||
"scrollX": 0,
|
||||
"scrollY": 0,
|
||||
"url": "https://www.bustracker.ie/?q=208",
|
||||
"title": "bustracker"
|
||||
},
|
||||
"mapRect": {
|
||||
"x": 0,
|
||||
"y": 64,
|
||||
"width": 1920,
|
||||
"height": 1088,
|
||||
"top": 64,
|
||||
"left": 0,
|
||||
"right": 1920,
|
||||
"bottom": 1152
|
||||
},
|
||||
"updated": "Updated 31 seconds ago",
|
||||
"corkCityCentre": {
|
||||
"lat": 51.8985,
|
||||
"lon": -8.4756
|
||||
},
|
||||
"tileRef": {
|
||||
"src": "https://a.tile.openstreetmap.org/14/7805/5419.png",
|
||||
"z": 14,
|
||||
"xTile": 7805,
|
||||
"yTile": 5419,
|
||||
"rect": {
|
||||
"x": 757,
|
||||
"y": 401,
|
||||
"width": 256,
|
||||
"height": 256,
|
||||
"top": 401,
|
||||
"left": 757,
|
||||
"right": 1013,
|
||||
"bottom": 657
|
||||
}
|
||||
},
|
||||
"rankedMarkers": [
|
||||
{
|
||||
"index": 4,
|
||||
"center": {
|
||||
"x": 1068,
|
||||
"y": 511
|
||||
},
|
||||
"rect": {
|
||||
"x": 1038.0191650390625,
|
||||
"y": 490.70770263671875,
|
||||
"width": 59.961669921875,
|
||||
"height": 40.5845947265625,
|
||||
"top": 490.70770263671875,
|
||||
"left": 1038.0191650390625,
|
||||
"right": 1097.9808349609375,
|
||||
"bottom": 531.2922973632812
|
||||
},
|
||||
"text": "208",
|
||||
"latLon": {
|
||||
"lat": 51.89778723113112,
|
||||
"lon": -8.476724624633789
|
||||
},
|
||||
"distanceToCorkCentreM": 110.6165039682958
|
||||
},
|
||||
{
|
||||
"index": 2,
|
||||
"center": {
|
||||
"x": 1199,
|
||||
"y": 547
|
||||
},
|
||||
"rect": {
|
||||
"x": 1171.151611328125,
|
||||
"y": 531.351318359375,
|
||||
"width": 55.69677734375,
|
||||
"height": 31.29736328125,
|
||||
"top": 531.351318359375,
|
||||
"left": 1171.151611328125,
|
||||
"right": 1226.848388671875,
|
||||
"bottom": 562.648681640625
|
||||
},
|
||||
"text": "208",
|
||||
"latLon": {
|
||||
"lat": 51.895880514665095,
|
||||
"lon": -8.46548080444336
|
||||
},
|
||||
"distanceToCorkCentreM": 752.9542979626465
|
||||
},
|
||||
{
|
||||
"index": 5,
|
||||
"center": {
|
||||
"x": 1316,
|
||||
"y": 304.99998474121094
|
||||
},
|
||||
"rect": {
|
||||
"x": 1290.1763916015625,
|
||||
"y": 273.68878173828125,
|
||||
"width": 51.647216796875,
|
||||
"height": 62.622406005859375,
|
||||
"top": 273.68878173828125,
|
||||
"left": 1290.1763916015625,
|
||||
"right": 1341.8236083984375,
|
||||
"bottom": 336.3111877441406
|
||||
},
|
||||
"text": "208",
|
||||
"latLon": {
|
||||
"lat": 51.9086963310864,
|
||||
"lon": -8.455438613891602
|
||||
},
|
||||
"distanceToCorkCentreM": 1788.4811355104382
|
||||
},
|
||||
{
|
||||
"index": 7,
|
||||
"center": {
|
||||
"x": 719,
|
||||
"y": 690
|
||||
},
|
||||
"rect": {
|
||||
"x": 703.7605590820312,
|
||||
"y": 662.3701171875,
|
||||
"width": 30.4788818359375,
|
||||
"height": 55.259765625,
|
||||
"top": 662.3701171875,
|
||||
"left": 703.7605590820312,
|
||||
"right": 734.2394409179688,
|
||||
"bottom": 717.6298828125
|
||||
},
|
||||
"text": "208",
|
||||
"latLon": {
|
||||
"lat": 51.88830581406175,
|
||||
"lon": -8.50667953491211
|
||||
},
|
||||
"distanceToCorkCentreM": 2415.2448491301698
|
||||
},
|
||||
{
|
||||
"index": 3,
|
||||
"center": {
|
||||
"x": 720,
|
||||
"y": 697
|
||||
},
|
||||
"rect": {
|
||||
"x": 704.7605590820312,
|
||||
"y": 669.3701171875,
|
||||
"width": 30.4788818359375,
|
||||
"height": 55.259765625,
|
||||
"top": 669.3701171875,
|
||||
"left": 704.7605590820312,
|
||||
"right": 735.2394409179688,
|
||||
"bottom": 724.6298828125
|
||||
},
|
||||
"text": "208",
|
||||
"latLon": {
|
||||
"lat": 51.88793499174197,
|
||||
"lon": -8.506593704223633
|
||||
},
|
||||
"distanceToCorkCentreM": 2429.719568050658
|
||||
},
|
||||
{
|
||||
"index": 1,
|
||||
"center": {
|
||||
"x": 1623,
|
||||
"y": 201.00000762939453
|
||||
},
|
||||
"rect": {
|
||||
"x": 1607.5078125,
|
||||
"y": 173.23468017578125,
|
||||
"width": 30.984375,
|
||||
"height": 55.53065490722656,
|
||||
"top": 173.23468017578125,
|
||||
"left": 1607.5078125,
|
||||
"right": 1638.4921875,
|
||||
"bottom": 228.7653350830078
|
||||
},
|
||||
"text": "208",
|
||||
"latLon": {
|
||||
"lat": 51.91420282988519,
|
||||
"lon": -8.429088592529297
|
||||
},
|
||||
"distanceToCorkCentreM": 3637.2647634017035
|
||||
},
|
||||
{
|
||||
"index": 8,
|
||||
"center": {
|
||||
"x": 1623,
|
||||
"y": 200.00000762939453
|
||||
},
|
||||
"rect": {
|
||||
"x": 1607.5078125,
|
||||
"y": 172.23468017578125,
|
||||
"width": 30.984375,
|
||||
"height": 55.53065490722656,
|
||||
"top": 172.23468017578125,
|
||||
"left": 1607.5078125,
|
||||
"right": 1638.4921875,
|
||||
"bottom": 227.7653350830078
|
||||
},
|
||||
"text": "208",
|
||||
"latLon": {
|
||||
"lat": 51.91425577372331,
|
||||
"lon": -8.429088592529297
|
||||
},
|
||||
"distanceToCorkCentreM": 3640.0928836883722
|
||||
},
|
||||
{
|
||||
"index": 6,
|
||||
"center": {
|
||||
"x": 386,
|
||||
"y": 985
|
||||
},
|
||||
"rect": {
|
||||
"x": 354.7040710449219,
|
||||
"y": 957.9580078125,
|
||||
"width": 62.59185791015625,
|
||||
"height": 54.083984375,
|
||||
"top": 957.9580078125,
|
||||
"left": 354.7040710449219,
|
||||
"right": 417.2959289550781,
|
||||
"bottom": 1012.0419921875
|
||||
},
|
||||
"text": "208",
|
||||
"latLon": {
|
||||
"lat": 51.872675649487675,
|
||||
"lon": -8.535261154174805
|
||||
},
|
||||
"distanceToCorkCentreM": 5001.261433750701
|
||||
},
|
||||
{
|
||||
"index": 0,
|
||||
"center": {
|
||||
"x": 297,
|
||||
"y": 1016
|
||||
},
|
||||
"rect": {
|
||||
"x": 268.0628662109375,
|
||||
"y": 998.1650390625,
|
||||
"width": 57.874267578125,
|
||||
"height": 35.669921875,
|
||||
"top": 998.1650390625,
|
||||
"left": 268.0628662109375,
|
||||
"right": 325.9371337890625,
|
||||
"bottom": 1033.8349609375
|
||||
},
|
||||
"text": "208",
|
||||
"latLon": {
|
||||
"lat": 51.87103284208375,
|
||||
"lon": -8.542900085449219
|
||||
},
|
||||
"distanceToCorkCentreM": 5537.540585075056
|
||||
}
|
||||
],
|
||||
"captured_at_utc": "2026-04-29T15:11:01.299Z",
|
||||
"caveats": [
|
||||
"BusTracker is live data; bus marker positions and closest vehicle can change between captures.",
|
||||
"Closest marker was selected by projecting visible Leaflet marker centers to OSM tile lat/lon and comparing to Cork city centre lat/lon.",
|
||||
"The GT point is the center of the selected route-rectangle marker in viewport/screenshot coordinates."
|
||||
]
|
||||
}
|
||||
|
After Width: | Height: | Size: 2.4 MiB |
@@ -0,0 +1,44 @@
|
||||
{
|
||||
"url": "https://www.formula1.com/en/results/2026/drivers",
|
||||
"screenshot": "prototypes/click_eval/examples/screenshots/f1_2026_drivers_fia_logo.png",
|
||||
"image_size": {
|
||||
"width": 1920,
|
||||
"height": 1152
|
||||
},
|
||||
"target": {
|
||||
"selector": "a[href=\"https://www.fia.com/\"] svg[role=\"img\"]",
|
||||
"label": "FIA",
|
||||
"href": "https://www.fia.com/",
|
||||
"rect": {
|
||||
"x": 1707.328125,
|
||||
"y": 64,
|
||||
"width": 45.171875,
|
||||
"height": 32
|
||||
},
|
||||
"center": {
|
||||
"x": 1729.9140625,
|
||||
"y": 80
|
||||
},
|
||||
"occludingElement": {
|
||||
"tag": "path",
|
||||
"text": "",
|
||||
"id": null,
|
||||
"class": null
|
||||
},
|
||||
"viewport": {
|
||||
"width": 1920,
|
||||
"height": 1152,
|
||||
"dpr": 1,
|
||||
"scrollX": 0,
|
||||
"scrollY": 0,
|
||||
"url": "https://www.formula1.com/en/results/2026/drivers",
|
||||
"title": "2026 DRIVERS' STANDINGS"
|
||||
}
|
||||
},
|
||||
"consent_dismissed": true,
|
||||
"captured_at_utc": "2026-04-28T11:22:17.556Z",
|
||||
"caveats": [
|
||||
"Live Formula 1 page; content can change upstream.",
|
||||
"Animations/transitions were reduced before capture for determinism."
|
||||
]
|
||||
}
|
||||
|
After Width: | Height: | Size: 456 KiB |
|
After Width: | Height: | Size: 296 KiB |
@@ -0,0 +1,66 @@
|
||||
{
|
||||
"url": "https://izz.ie/pages/table-booking",
|
||||
"screenshot": "prototypes/click_eval/examples/screenshots/izz_table_booking_footer.png",
|
||||
"image_size": {
|
||||
"width": 1920,
|
||||
"height": 1152
|
||||
},
|
||||
"device_scale_factor": 1,
|
||||
"targets": {
|
||||
"mastercard": {
|
||||
"label": "Mastercard logo",
|
||||
"selector": "footer svg[aria-labelledby=\"pi-master\"]",
|
||||
"rect": {
|
||||
"x": 1343,
|
||||
"y": 961.1875,
|
||||
"width": 38,
|
||||
"height": 24,
|
||||
"top": 961.1875,
|
||||
"left": 1343,
|
||||
"right": 1381,
|
||||
"bottom": 985.1875
|
||||
},
|
||||
"center": {
|
||||
"x": 1362,
|
||||
"y": 973.1875
|
||||
},
|
||||
"text": "Mastercard"
|
||||
},
|
||||
"tiktok": {
|
||||
"label": "TikTok footer logo",
|
||||
"selector": "footer a[href*=\"tiktok.com\"]",
|
||||
"rect": {
|
||||
"x": 1420,
|
||||
"y": 1059.5,
|
||||
"width": 30,
|
||||
"height": 30,
|
||||
"top": 1059.5,
|
||||
"left": 1420,
|
||||
"right": 1450,
|
||||
"bottom": 1089.5
|
||||
},
|
||||
"center": {
|
||||
"x": 1435,
|
||||
"y": 1074.5
|
||||
},
|
||||
"href": "https://www.tiktok.com/@izzcafeireland",
|
||||
"text": "TikTok"
|
||||
},
|
||||
"viewport": {
|
||||
"width": 1920,
|
||||
"height": 1152,
|
||||
"dpr": 1,
|
||||
"scrollX": 0,
|
||||
"scrollY": 5322,
|
||||
"url": "https://izz.ie/pages/table-booking",
|
||||
"title": "Table Booking – Izz Cafe",
|
||||
"docHeight": 6474
|
||||
}
|
||||
},
|
||||
"captured_at_utc": "2026-04-29T15:10:52.819Z",
|
||||
"caveats": [
|
||||
"Footer capture after scrolling to the bottom of the live Shopify page.",
|
||||
"The TikTok target is the footer Follow-us row icon, not the duplicated announcement-bar/social icon.",
|
||||
"A small cookie banner remains visible near the bottom but does not cover the Mastercard payment logo or the selected footer TikTok logo."
|
||||
]
|
||||
}
|
||||
|
After Width: | Height: | Size: 319 KiB |
@@ -0,0 +1,45 @@
|
||||
{
|
||||
"leave": {
|
||||
"text": "leave",
|
||||
"selector": ".subscribe-button .option.remove.active",
|
||||
"rect": {
|
||||
"x": 1605,
|
||||
"y": 308,
|
||||
"width": 43.890625,
|
||||
"height": 16
|
||||
}
|
||||
},
|
||||
"sixthPermalink": {
|
||||
"text": "permalink",
|
||||
"href": "https://old.reddit.com/r/pics/comments/1sx21hf/someone_paid_8700_dollars_for_this_in_1995_i_paid/oimt6n0/",
|
||||
"selector": ".comment:nth-of-type(6) a:has-text(\"permalink\")",
|
||||
"commentId": "thing_t1_oimt6n0",
|
||||
"author": "DizzyTelevision09",
|
||||
"bodyPreview": "You've been paying off student loans for the past 30 years? That's crazy.",
|
||||
"rect": {
|
||||
"x": 166,
|
||||
"y": 1125.265625,
|
||||
"width": 58.546875,
|
||||
"height": 12
|
||||
}
|
||||
},
|
||||
"title": "Someone paid $8700 dollars for this in 1995. I paid $15 today. : pics",
|
||||
"pageUrl": "https://old.reddit.com/r/pics/comments/1sx21hf/someone_paid_8700_dollars_for_this_in_1995_i_paid/",
|
||||
"scrollHeight": 17771,
|
||||
"imageSize": {
|
||||
"width": 1920,
|
||||
"height": 1600
|
||||
},
|
||||
"deviceScaleFactor": 1,
|
||||
"targets": {
|
||||
"reddit_pics_leave_button": [
|
||||
1626.9,
|
||||
316
|
||||
],
|
||||
"reddit_pics_6th_comment_permalink": [
|
||||
195.3,
|
||||
1131.3
|
||||
]
|
||||
},
|
||||
"caveat": "Fresh Old Reddit session is logged out, so the real visible button is join. The hidden leave link exists in the DOM; screenshot flips the subscribe widget into its subscribed/leave visual state for this requested target."
|
||||
}
|
||||
|
After Width: | Height: | Size: 627 KiB |
@@ -0,0 +1,26 @@
|
||||
{
|
||||
"url": "https://www.temu.com/",
|
||||
"screenshot": "prototypes/click_eval/examples/screenshots/temu_feedback.png",
|
||||
"image_size": {
|
||||
"width": 1920,
|
||||
"height": 1152
|
||||
},
|
||||
"device_scale_factor": 1,
|
||||
"target": {
|
||||
"label": "Feedback",
|
||||
"selector": "#mainToolbar [role=\"button\"]",
|
||||
"rect": {
|
||||
"x": 1860,
|
||||
"y": 1016,
|
||||
"width": 56,
|
||||
"height": 56
|
||||
},
|
||||
"center": {
|
||||
"x": 1888,
|
||||
"y": 1044
|
||||
}
|
||||
},
|
||||
"caveats": [
|
||||
"Temu displayed a privacy/cookie settings dialog that dims the page; the bottom-right Feedback button remains visible in the screenshot."
|
||||
]
|
||||
}
|
||||
BIN
prototypes/click_eval/examples/screenshots/temu_feedback.png
Normal file
|
After Width: | Height: | Size: 1.2 MiB |
11
prototypes/click_eval/examples/tasks.jsonl
Normal file
@@ -0,0 +1,11 @@
|
||||
{"task_id":"hn_second_comments","image_path":"screenshots/hacker_news_browser.png","instruction":"Click the comments link under the second item.","gt_point":[442.8,101.5]}
|
||||
{"task_id":"hn_7th_title_link","image_path":"screenshots/hacker_news_browser.png","instruction":"Click the title link of the 7th post.","gt_point":[359.2,261.5]}
|
||||
{"task_id":"hn_19th_hide","image_path":"screenshots/hacker_news_browser.png","instruction":"Click the hide link on the 19th post.","gt_point":[354.9,696.5]}
|
||||
{"task_id":"temu_feedback","image_path":"screenshots/temu_feedback.png","instruction":"Click the Feedback button on the bottom right.","gt_point":[1888.0,1044.0]}
|
||||
{"task_id":"f1_2026_drivers_fia_logo","image_path":"screenshots/f1_2026_drivers_fia_logo.png","instruction":"Click the FIA logo on the top right.","gt_point":[1729.914,80.0]}
|
||||
{"task_id":"reddit_pics_leave_button","image_path":"screenshots/reddit_pics_leave_permalink.png","instruction":"Click the leave button near the top of the right sidebar.","gt_point":[1626.9,316.0]}
|
||||
{"task_id":"reddit_pics_6th_comment_permalink","image_path":"screenshots/reddit_pics_leave_permalink.png","instruction":"Click the permalink button for the 6th comment.","gt_point":[195.3,1131.3]}
|
||||
{"task_id":"izz_table_booking_footer_mastercard","image_path":"screenshots/izz_table_booking_footer.png","instruction":"Click the Mastercard logo at the very bottom of the page.","gt_point":[1362.0,973.2]}
|
||||
{"task_id":"izz_table_booking_footer_tiktok","image_path":"screenshots/izz_table_booking_footer.png","instruction":"Click the TikTok logo in the footer social row.","gt_point":[1435.0,1074.5]}
|
||||
{"task_id":"aliexpress_search_by_image","image_path":"screenshots/aliexpress_search_by_image.png","instruction":"Click the search by image button.","gt_point":[1201.171875,40]}
|
||||
{"task_id":"bustracker_208_cork_city_centre_bus","image_path":"screenshots/bustracker_208_cork_city_centre.png","instruction":"Click the bus closest to Cork city centre.","gt_point":[1068.0,511.0]}
|
||||
56
prototypes/click_eval/pyproject.toml
Normal file
@@ -0,0 +1,56 @@
|
||||
[build-system]
|
||||
requires = ["setuptools>=68"]
|
||||
build-backend = "setuptools.build_meta"
|
||||
|
||||
[project]
|
||||
name = "click-eval"
|
||||
version = "0.1.0"
|
||||
description = "Quick OpenRouter VLM click-point evaluation prototype"
|
||||
requires-python = ">=3.12"
|
||||
dependencies = [
|
||||
"google-genai>=1.0.0",
|
||||
"Pillow>=10.0.0",
|
||||
"tqdm>=4.67.3",
|
||||
]
|
||||
|
||||
[project.optional-dependencies]
|
||||
local = [
|
||||
"accelerate>=1.0.0",
|
||||
"decord2>=0.2.0",
|
||||
"einops>=0.8.0",
|
||||
"hf_transfer>=0.1.9",
|
||||
"protobuf>=4.25.0",
|
||||
"qwen-vl-utils>=0.0.14",
|
||||
"requests>=2.32.0",
|
||||
"safetensors>=0.4.3",
|
||||
"sentencepiece>=0.2.0",
|
||||
"timm>=1.0.0",
|
||||
"torch>=2.6.0",
|
||||
"torchvision>=0.21.0",
|
||||
"transformers>=4.57.1,<5.0.0",
|
||||
"WePOINTS @ git+https://github.com/WePOINTS/WePOINTS.git",
|
||||
"tiktoken>=0.7.0",
|
||||
]
|
||||
|
||||
[project.scripts]
|
||||
click-eval = "click_eval.cli:main"
|
||||
|
||||
[dependency-groups]
|
||||
dev = [
|
||||
"basedpyright>=1.39.3",
|
||||
"pylsp-rope>=0.1.17",
|
||||
"pytest>=8.0.0",
|
||||
"python-lsp-server>=1.14.0",
|
||||
"ruff>=0.14.0",
|
||||
]
|
||||
|
||||
[tool.setuptools.packages.find]
|
||||
where = ["src"]
|
||||
|
||||
[tool.pytest.ini_options]
|
||||
testpaths = ["tests"]
|
||||
pythonpath = ["src"]
|
||||
|
||||
[tool.ruff]
|
||||
line-length = 88
|
||||
target-version = "py312"
|
||||
1
prototypes/click_eval/requirements.txt
Normal file
@@ -0,0 +1 @@
|
||||
-e .[local]
|
||||
5
prototypes/click_eval/src/click_eval/__init__.py
Normal file
@@ -0,0 +1,5 @@
|
||||
"""Quick VLM click-point evaluation prototype."""
|
||||
|
||||
__all__ = ["__version__"]
|
||||
|
||||
__version__ = "0.1.0"
|
||||
5
prototypes/click_eval/src/click_eval/__main__.py
Normal file
@@ -0,0 +1,5 @@
|
||||
from .cli import main


# Module entry point for `python -m click_eval`; SystemExit carries
# main()'s integer return value out as the process exit code.
if __name__ == "__main__":
    raise SystemExit(main())
|
||||
128
prototypes/click_eval/src/click_eval/cli.py
Normal file
@@ -0,0 +1,128 @@
|
||||
from __future__ import annotations
|
||||
|
||||
import argparse
|
||||
import sys
|
||||
from datetime import datetime
|
||||
from pathlib import Path
|
||||
|
||||
from .env import load_dotenv
|
||||
from .harness import RunOptions, run_eval
|
||||
from .providers import ProviderClient
|
||||
|
||||
PROJECT_ROOT = Path(__file__).resolve().parents[2]
|
||||
DEFAULT_TASKS = PROJECT_ROOT / "examples" / "tasks.jsonl"
|
||||
DEFAULT_MODELS = PROJECT_ROOT / "examples" / "models.json"
|
||||
|
||||
|
||||
def main(argv: list[str] | None = None) -> int:
    """CLI entry point for the click-point eval harness.

    Parses arguments (currently only the `run` subcommand), loads .env files,
    runs the evaluation via run_eval, prints a summary table, and returns a
    process exit code (0 on success).
    """
    parser = argparse.ArgumentParser(description="Quick VLM click-point eval harness")
    subparsers = parser.add_subparsers(dest="command", required=True)

    run_parser = subparsers.add_parser("run", help="run a click-point evaluation")
    run_parser.add_argument(
        "--tasks",
        type=Path,
        default=DEFAULT_TASKS,
        help=f"JSONL task manifest (default: {DEFAULT_TASKS})",
    )
    run_parser.add_argument(
        "--models",
        type=Path,
        default=DEFAULT_MODELS,
        help=f"model config JSON (default: {DEFAULT_MODELS})",
    )
    run_parser.add_argument(
        "--out",
        type=Path,
        help="output run directory (default: runs/<timestamp>)",
    )
    run_parser.add_argument("--no-annotate", action="store_true", help="skip annotated PNGs")
    run_parser.add_argument("--fail-fast", action="store_true", help="stop on first GT error")
    run_parser.add_argument("--no-progress", action="store_true", help="hide progress/log output")
    run_parser.add_argument("--limit", type=int, help="only run the first N tasks")
    run_parser.add_argument(
        "--model-limit",
        type=int,
        help="only run the first N candidate models",
    )
    run_parser.add_argument(
        "--timeout",
        type=int,
        default=60 * 4,
        help="API timeout seconds; also used as local HF generation max_time",
    )

    args = parser.parse_args(argv)
    if args.command == "run":
        # Load project-level then cwd-level .env; load_dotenv never overrides
        # variables that are already set in the environment.
        load_dotenv(PROJECT_ROOT / ".env")
        load_dotenv(Path.cwd() / ".env")
        out_dir = args.out or _default_out_dir()
        client = ProviderClient(
            timeout_seconds=args.timeout,
            log_callback=None if args.no_progress else _stderr_log,
        )
        summary = run_eval(
            RunOptions(
                tasks_path=args.tasks,
                models_path=args.models,
                out_dir=out_dir,
                annotate=not args.no_annotate,
                fail_fast=args.fail_fast,
                limit=args.limit,
                model_limit=args.model_limit,
                progress=not args.no_progress,
            ),
            client.predict_point,
        )
        print(f"Wrote run to {out_dir}")
        result_rows = summary.get("result_rows", [])
        if isinstance(result_rows, list):
            print(_format_result_table(result_rows))
        return 0

    # Unreachable: subparsers are required, but keep a defensive error path.
    parser.error(f"unknown command: {args.command}")
    return 2
|
||||
|
||||
|
||||
def _default_out_dir() -> Path:
    """Return a fresh timestamped directory path under the project runs/ folder."""
    stamp = f"{datetime.now():%Y%m%d-%H%M%S}"
    return PROJECT_ROOT.joinpath("runs", stamp)
|
||||
|
||||
|
||||
def _stderr_log(message: str) -> None:
|
||||
print(message, file=sys.stderr)
|
||||
|
||||
|
||||
def _format_result_table(rows: list[dict[str, object]]) -> str:
    """Render per-model result rows as a fixed-width text table headed 'Results'."""
    headers = ["Model", "Status", "L2", "Duration", "Reason"]
    body: list[list[str]] = []
    for row in rows:
        body.append(
            [
                str(row.get("model", "")),
                str(row.get("status", "")),
                _format_l2(row.get("l2")),
                _format_duration(row.get("duration_seconds")),
                str(row.get("reason") or ""),
            ]
        )
    # Each column is as wide as its widest cell, header included.
    widths: list[int] = []
    for column, header in enumerate(headers):
        cell_lengths = [len(header)] + [len(cells[column]) for cells in body]
        widths.append(max(cell_lengths))
    separator = ["-" * width for width in widths]
    lines = [
        "Results",
        _format_table_line(headers, widths),
        _format_table_line(separator, widths),
    ]
    for cells in body:
        lines.append(_format_table_line(cells, widths))
    return "\n".join(lines)
||||
|
||||
|
||||
def _format_table_line(cells: list[str], widths: list[int]) -> str:
|
||||
return " ".join(cell.ljust(widths[index]) for index, cell in enumerate(cells))
|
||||
|
||||
|
||||
def _format_l2(value: object) -> str:
|
||||
return "n/a" if value is None else f"{float(value):.2f}px"
|
||||
|
||||
|
||||
def _format_duration(value: object) -> str:
|
||||
return "n/a" if value is None else f"{float(value):.2f}s"
|
||||
59
prototypes/click_eval/src/click_eval/contracts.py
Normal file
@@ -0,0 +1,59 @@
|
||||
from __future__ import annotations
|
||||
|
||||
from dataclasses import dataclass
|
||||
from pathlib import Path
|
||||
from typing import Any
|
||||
|
||||
|
||||
@dataclass(frozen=True)
class Point:
    """Immutable 2D point in screenshot pixel coordinates."""

    x: float
    y: float

    def as_list(self) -> list[float]:
        """Return the point as a JSON-friendly [x, y] list."""
        return list((self.x, self.y))
|
||||
|
||||
|
||||
@dataclass(frozen=True)
class ClickTask:
    """One evaluation task: follow `instruction` on the screenshot, graded against `gt_point`."""

    task_id: str
    image_path: Path  # path used to open the screenshot
    image_path_text: str  # the path string as written in the task record
    instruction: str
    # Ground-truth click point; presumably None when the task is unlabelled — TODO confirm.
    gt_point: Point | None
    raw: dict[str, Any]  # full parsed task record kept for passthrough/debugging
|
||||
|
||||
|
||||
@dataclass(frozen=True)
class ModelSpec:
    """Declarative description of one model under evaluation."""

    name: str  # config-level display name
    model_id: str  # provider-specific model identifier
    provider: str = "openrouter"  # e.g. "openrouter", "local_hf", "gemini"
    estimated_vram_gb: float | None = None  # VRAM estimate for local models
    adapter: str | None = None  # local-HF adapter key, e.g. "showui", "os_atlas_7b"
    quantization: str | None = None
    allow_cpu_offload: bool = False
    dtype: str | None = None  # e.g. "fp16"
    attn_implementation: str | None = None
    # min/max_pixels presumably bound the vision-processor resize — TODO confirm.
    min_pixels: int | None = None
    max_pixels: int | None = None
    max_new_tokens: int | None = None
    revision: str | None = None  # model repo revision to pin
    use_safetensors: bool | None = None
|
||||
|
||||
|
||||
@dataclass(frozen=True)
class ModelReply:
    """Raw model output for one prediction request."""

    text: str  # the model's textual reply (often JSON carrying x/y fields)
    raw: dict[str, Any] | None = None  # provider-specific raw payload, when available
|
||||
|
||||
|
||||
class ModelSkipped(RuntimeError):
    """Raised when a model is intentionally skipped for environment reasons."""
    # NOTE(review): subclasses RuntimeError, so generic handlers still catch it;
    # callers that treat skips specially should catch ModelSkipped first.
|
||||
|
||||
|
||||
@dataclass(frozen=True)
class ParsedPoint:
    """Result of parsing a model reply into a click point."""

    point: Point | None  # extracted point; None when parsing failed
    reason: str | None = None  # presumably the model-provided rationale — TODO confirm
    error: str | None = None  # parse failure description, when no point was extracted
|
||||
27
prototypes/click_eval/src/click_eval/env.py
Normal file
@@ -0,0 +1,27 @@
|
||||
from __future__ import annotations
|
||||
|
||||
import os
|
||||
from pathlib import Path
|
||||
|
||||
|
||||
def load_dotenv(path: Path) -> None:
    """Load KEY=VALUE pairs from a dotenv file into os.environ.

    Variables already present in the environment are never overridden.
    Blank lines, '#' comments, and lines without '=' are ignored; values
    may be wrapped in matching single or double quotes.
    """
    if not path.exists():
        return

    for raw_line in path.read_text(encoding="utf-8").splitlines():
        entry = raw_line.strip()
        if not entry or entry.startswith("#") or "=" not in entry:
            continue

        name, _, rest = entry.partition("=")
        name = name.strip()
        if name and name not in os.environ:
            os.environ[name] = _unquote(rest.strip())
|
||||
|
||||
|
||||
def _unquote(value: str) -> str:
|
||||
if len(value) >= 2 and value[0] == value[-1] and value[0] in {"'", '"'}:
|
||||
return value[1:-1]
|
||||
return value
|
||||
241
prototypes/click_eval/src/click_eval/gemini.py
Normal file
@@ -0,0 +1,241 @@
|
||||
from __future__ import annotations
|
||||
|
||||
import json
|
||||
import os
|
||||
from pathlib import Path
|
||||
from typing import Any
|
||||
|
||||
from .contracts import ModelReply, Point
|
||||
from .image_utils import image_size
|
||||
|
||||
|
||||
class GeminiComputerUseClient:
    """Gemini 'Computer Use' client that elicits a single `click_at` action.

    The screenshot plus instruction are sent with the Computer Use tool enabled
    (browser environment); the first returned function call is converted into a
    pixel click point.
    """

    def __init__(
        self,
        api_key: str | None = None,
        timeout_seconds: int = 90,
    ) -> None:
        # An explicit key wins; otherwise fall back to the standard env vars.
        self.api_key = (
            api_key
            or os.environ.get("GEMINI_API_KEY")
            or os.environ.get("GOOGLE_API_KEY")
        )
        if not self.api_key:
            raise RuntimeError("GEMINI_API_KEY or GOOGLE_API_KEY is required")
        # NOTE(review): stored but never forwarded to the SDK in this class —
        # confirm whether the google-genai client should receive a timeout.
        self.timeout_seconds = timeout_seconds

    def predict_point(
        self,
        model_id: str,
        image_path: Path,
        instruction: str,
        purpose: str,
    ) -> ModelReply:
        """Ask `model_id` for the click point on `image_path` for `instruction`.

        `purpose` is "ground_truth" for judge calls; any other value yields the
        candidate-prediction prompt wording. Returns a ModelReply whose text is
        JSON with pixel x/y when a click_at function call was produced, or the
        plain response text otherwise.
        """
        # Lazy import so the rest of the package works without the SDK installed.
        try:
            from google import genai
            from google.genai import types
            from google.genai.types import Content, Part
        except ImportError as exc:
            raise RuntimeError(
                "google-genai is required for provider=gemini; run `uv sync`"
            ) from exc

        width, height = image_size(image_path)
        client = genai.Client(api_key=self.api_key)
        contents = [
            Content(
                role="user",
                parts=[
                    Part(text=_computer_use_prompt(instruction, purpose)),
                    Part.from_bytes(
                        data=image_path.read_bytes(),
                        mime_type="image/png",
                    ),
                ],
            )
        ]
        # Restrict the Computer Use tool to clicking: navigation/typing/scroll
        # actions are excluded, and temperature 0 keeps decoding stable.
        config = types.GenerateContentConfig(
            tools=[
                types.Tool(
                    computer_use=types.ComputerUse(
                        environment=types.Environment.ENVIRONMENT_BROWSER,
                        excluded_predefined_functions=_excluded_functions(),
                    )
                )
            ],
            temperature=0,
        )
        _set_high_media_resolution(config, types)
        response = client.models.generate_content(
            model=model_id,
            contents=contents,
            config=config,
        )
        call = _first_function_call(response)
        raw = _raw_response(response)
        if call is None:
            # No function call at all: surface whatever text the model produced.
            return ModelReply(text=_response_text(response), raw=raw)

        point = _point_from_call(call)
        if point is None:
            # A function call arrived but without usable numeric x/y args.
            return ModelReply(text=_response_text(response), raw=raw)

        # Convert model-space coordinates to screenshot pixels (model-dependent).
        scaled = _scale_computer_use_point(model_id, point, width, height)
        return ModelReply(
            text=json.dumps(
                {
                    "x": scaled.x,
                    "y": scaled.y,
                    "reason": f"Gemini Computer Use function_call {call['name']}",
                    "display_x": point.x,
                    "display_y": point.y,
                }
            ),
            raw=raw,
        )
|
||||
|
||||
|
||||
def _computer_use_prompt(instruction: str, purpose: str) -> str:
|
||||
role_line = (
|
||||
"Choose the ground-truth click point for this instruction."
|
||||
if purpose == "ground_truth"
|
||||
else "Predict the click point for this instruction."
|
||||
)
|
||||
return (
|
||||
f"{role_line}\n\n"
|
||||
"Use the screenshot and emit exactly one Computer Use `click_at` action. "
|
||||
"Do not navigate, type, scroll, hover, or wait. Choose the center of the "
|
||||
"target UI element when possible.\n\n"
|
||||
f"Instruction: {instruction}"
|
||||
)
|
||||
|
||||
|
||||
def _excluded_functions() -> list[str]:
|
||||
return [
|
||||
"open_web_browser",
|
||||
"wait_5_seconds",
|
||||
"go_back",
|
||||
"go_forward",
|
||||
"search",
|
||||
"navigate",
|
||||
"hover_at",
|
||||
"type_text_at",
|
||||
"key_combination",
|
||||
"scroll_document",
|
||||
"drag_and_drop",
|
||||
]
|
||||
|
||||
|
||||
def _set_high_media_resolution(config, types) -> None:
|
||||
media_resolution = getattr(types, "MediaResolution", None)
|
||||
if media_resolution is None:
|
||||
return
|
||||
value = (
|
||||
getattr(media_resolution, "MEDIA_RESOLUTION_HIGH", None)
|
||||
or getattr(media_resolution, "HIGH", None)
|
||||
)
|
||||
if value is not None:
|
||||
try:
|
||||
config.media_resolution = value
|
||||
except (AttributeError, TypeError, ValueError):
|
||||
return
|
||||
|
||||
|
||||
def _first_function_call(response) -> dict[str, Any] | None:
    """Return the first function call in the response, preferring the raw dict form."""
    from_raw = _first_function_call_from_dict(_raw_response(response))
    if from_raw is not None:
        return from_raw

    # Fall back to walking the typed SDK candidate/content/part objects.
    for candidate in getattr(response, "candidates", None) or []:
        content = getattr(candidate, "content", None)
        for part in getattr(content, "parts", None) or []:
            call = getattr(part, "function_call", None)
            if call is not None:
                return _function_call_dict(call)
    return None
|
||||
|
||||
|
||||
def _function_call_dict(function_call) -> dict[str, Any]:
    """Normalize an SDK FunctionCall object into a plain {'name', 'args'} dict."""
    raw_name = getattr(function_call, "name", None)
    raw_args = getattr(function_call, "args", None)
    return {"name": str(raw_name or ""), "args": _plain_dict(raw_args)}
|
||||
|
||||
|
||||
def _first_function_call_from_dict(value: Any) -> dict[str, Any] | None:
    """Depth-first search a raw response tree for the first function-call node."""
    if isinstance(value, dict):
        call = value.get("functionCall") or value.get("function_call")
        if isinstance(call, dict):
            return {
                "name": str(call.get("name") or ""),
                "args": _plain_dict(call.get("args")),
            }
        children = list(value.values())
    elif isinstance(value, list):
        children = value
    else:
        return None
    for child in children:
        found = _first_function_call_from_dict(child)
        if found is not None:
            return found
    return None
|
||||
|
||||
|
||||
def _point_from_call(call: dict[str, Any]) -> Point | None:
    """Extract (x, y) from a function call's args; None when absent or malformed."""
    args = call.get("args")
    if not isinstance(args, dict):
        return None
    try:
        x_value = float(args["x"])
        y_value = float(args["y"])
    except (KeyError, TypeError, ValueError):
        return None
    return Point(x=x_value, y=y_value)
|
||||
|
||||
|
||||
def _scale_computer_use_point(
    model_id: str, point: Point, width: int, height: int
) -> Point:
    """Map a Computer Use coordinate to screenshot pixels.

    gemini-3-* models pass through unchanged; other models presumably emit
    coordinates on a 0-1000 grid — confirm against the Gemini Computer Use docs.
    """
    if model_id.startswith("gemini-3-"):
        return point
    return Point(
        x=point.x / 1000 * width,
        y=point.y / 1000 * height,
    )
|
||||
|
||||
|
||||
def _response_text(response) -> str:
    """Concatenate all text parts; fall back to a JSON dump of the raw response."""
    chunks: list[str] = []
    for candidate in getattr(response, "candidates", None) or []:
        content = getattr(candidate, "content", None)
        for part in getattr(content, "parts", None) or []:
            text = getattr(part, "text", None)
            if text:
                chunks.append(str(text))
    if chunks:
        return "\n".join(chunks)
    return json.dumps(_raw_response(response), ensure_ascii=False)
|
||||
|
||||
|
||||
def _raw_response(response) -> dict[str, Any]:
    """Serialize an SDK response into plain JSON-safe dicts, best effort."""
    for attr in ("to_json_dict", "model_dump", "dict"):
        serializer = getattr(response, attr, None)
        if serializer is None:
            continue
        try:
            dumped = serializer()
        except TypeError:
            # Serializer needed arguments we don't have; try the next one.
            continue
        if isinstance(dumped, dict):
            return _plain_dict(dumped)
    return {"repr": repr(response)}
|
||||
|
||||
|
||||
def _plain_dict(value: Any) -> Any:
|
||||
if isinstance(value, dict):
|
||||
return {str(key): _plain_dict(item) for key, item in value.items()}
|
||||
if isinstance(value, list):
|
||||
return [_plain_dict(item) for item in value]
|
||||
if isinstance(value, tuple):
|
||||
return [_plain_dict(item) for item in value]
|
||||
if isinstance(value, (str, int, float, bool)) or value is None:
|
||||
return value
|
||||
try:
|
||||
return dict(value)
|
||||
except (TypeError, ValueError):
|
||||
return repr(value)
|
||||
653
prototypes/click_eval/src/click_eval/harness.py
Normal file
@@ -0,0 +1,653 @@
|
||||
from __future__ import annotations
|
||||
|
||||
import csv
|
||||
import concurrent.futures
|
||||
import json
|
||||
import math
|
||||
import statistics
|
||||
import sys
|
||||
import time
|
||||
from contextlib import nullcontext
|
||||
from dataclasses import dataclass
|
||||
from pathlib import Path
|
||||
from typing import Callable
|
||||
|
||||
from tqdm import tqdm
|
||||
|
||||
from .contracts import ClickTask, ModelReply, ModelSkipped, ModelSpec, Point
|
||||
from .io import load_model_config, load_tasks, write_jsonl
|
||||
from .parsing import parse_point_response, parse_point_value
|
||||
from .scoring import SCORE_FIELDNAMES, score_point, summarize_scores
|
||||
from .viz import annotate_image
|
||||
|
||||
PredictPoint = Callable[[ModelSpec, Path, str, str], ModelReply]
|
||||
OPENROUTER_CANDIDATE_CONCURRENCY = 4
|
||||
|
||||
|
||||
@dataclass(frozen=True)
class RunOptions:
    """Immutable configuration for a single evaluation run."""

    tasks_path: Path  # JSONL file of click tasks
    models_path: Path  # JSON config with judge/candidate model specs
    out_dir: Path  # directory that receives all result artifacts
    annotate: bool = True  # write annotated screenshots per task
    fail_fast: bool = False  # re-raise ground-truth errors instead of recording them
    limit: int | None = None  # optional cap on number of tasks
    model_limit: int | None = None  # optional cap on number of candidate models
    progress: bool = True  # emit logs / progress bars to stderr
|
||||
|
||||
|
||||
@dataclass
class TaskRunState:
    """Mutable per-task accumulator threaded through a run."""

    task: ClickTask  # the task being evaluated
    gt_point: Point | None  # resolved ground-truth point, if any
    judge_annotations: list[dict[str, object]]  # judge overlay rows for the visualizer
    annotations: list[dict[str, object]]  # candidate prediction overlays, filled during scoring
|
||||
|
||||
|
||||
def run_eval(options: RunOptions, predict_point: PredictPoint) -> dict[str, object]:
    """Run the full click evaluation: resolve GT, predict, score, write artifacts.

    Args:
        options: run configuration (paths, limits, output flags).
        predict_point: callable that queries one model for a click point.

    Returns:
        The summary dict that is also written to ``summary.json``.
    """
    judges, candidates, config = load_model_config(options.models_path)
    tasks = load_tasks(options.tasks_path)
    if options.limit is not None:
        tasks = tasks[: options.limit]
    if options.model_limit is not None:
        if options.model_limit < 1:
            raise ValueError("model_limit must be at least 1")
        candidates = candidates[: options.model_limit]

    options.out_dir.mkdir(parents=True, exist_ok=True)
    _log(
        options,
        f"Loaded {len(tasks)} task(s), {len(candidates)} candidate model(s). "
        f"Output: {options.out_dir}",
    )
    resolved_rows: list[dict[str, object]] = []
    prediction_rows: list[dict[str, object]] = []
    score_rows: list[dict[str, object]] = []
    task_states: list[TaskRunState] = []

    # Phase 1: resolve ground truth (and judge overlays) per task.
    task_iter = _progress(options, tasks, desc="GT", unit="task")
    for task in task_iter:
        if task.gt_point is not None and judges:
            _log(options, _judge_overlay_log_message(task.task_id, judges))
        elif task.gt_point is not None:
            _log(options, f"[{task.task_id}] Using provided GT")
        elif judges:
            _log(options, _judge_overlay_without_gt_log_message(task.task_id, judges))
        else:
            _log(options, f"[{task.task_id}] No GT; scoring will be n/a")
        try:
            gt_point, resolved = _resolve_ground_truth(task, judges, predict_point)
        except Exception as exc:
            # A GT failure drops the task from scoring but is still recorded.
            _log(options, f"[{task.task_id}] GT failed: {exc}")
            if options.fail_fast:
                raise
            resolved = dict(task.raw)
            resolved["ground_truth_error"] = str(exc)
            resolved_rows.append(resolved)
            continue

        if gt_point is None:
            _log(options, f"[{task.task_id}] GT: n/a")
        else:
            _log(options, f"[{task.task_id}] GT: ({gt_point.x:.1f}, {gt_point.y:.1f})")
        _log_judge_statuses(options, task.task_id, resolved)
        resolved_rows.append(resolved)
        judge_annotations = _judge_annotations(resolved, gt_point)
        task_states.append(
            TaskRunState(
                task=task,
                gt_point=gt_point,
                judge_annotations=judge_annotations,
                annotations=[],
            )
        )

    # Phase 2: run every candidate over every task and score each prediction.
    for model, state, prediction in _predict_candidates_for_tasks(
        options, task_states, candidates, predict_point
    ):
        prediction_rows.append(prediction)
        # "_point" is a transient Point object; the serializable copy is "point".
        parsed_point = prediction.get("_point")
        point = parsed_point if isinstance(parsed_point, Point) else None
        score = score_point(
            state.task.task_id,
            model.name,
            state.gt_point,
            point,
            error=str(prediction.get("error") or ""),
        )
        score["duration_seconds"] = prediction.get("duration_seconds", "")
        score["skipped"] = bool(prediction.get("skipped"))
        score_rows.append(score)
        if point is not None:
            state.annotations.append(
                {
                    "model": model.name,
                    "point": point,
                    "l2": score["l2"] if score["l2"] != "" else None,
                }
            )

    # Phase 3: optional annotated screenshots, one per task.
    for state in task_states:
        if options.annotate:
            annotate_image(
                state.task.image_path,
                options.out_dir / "annotated" / f"{state.task.task_id}.png",
                state.gt_point,
                state.annotations,
                judge_points=state.judge_annotations,
            )

    # Strip the non-serializable Point objects before writing JSONL.
    for row in prediction_rows:
        row.pop("_point", None)

    summary = {
        "tasks": len(tasks),
        "models": [model.name for model in candidates],
        "judge_models": [judge.model_id for judge in judges],
        "config": {
            key: value
            for key, value in config.items()
            if key not in {"candidate_models", "judge_model", "judge_models"}
        },
        "summary": summarize_scores(score_rows),
        "result_rows": _build_result_rows(score_rows),
    }

    # Phase 4: persist all artifacts.
    write_jsonl(options.out_dir / "resolved_tasks.jsonl", resolved_rows)
    write_jsonl(options.out_dir / "predictions.jsonl", prediction_rows)
    _write_scores_csv(options.out_dir / "scores.csv", score_rows)
    (options.out_dir / "summary.json").write_text(
        json.dumps(summary, indent=2, ensure_ascii=False) + "\n", encoding="utf-8"
    )
    _log(options, f"Wrote results to {options.out_dir}")
    return summary
|
||||
|
||||
|
||||
def _predict_candidates_for_tasks(
    options: RunOptions,
    task_states: list[TaskRunState],
    candidates: list[ModelSpec],
    predict_point: PredictPoint,
) -> list[tuple[ModelSpec, TaskRunState, dict[str, object]]]:
    """Run every candidate model over every task and collect prediction rows.

    Models run one at a time inside an optional per-model context supplied
    by the backend (see ``_model_run_context``). OpenRouter models fan
    their tasks out concurrently; other providers run sequentially.
    """
    predictions: list[tuple[ModelSpec, TaskRunState, dict[str, object]]] = []
    progress_bar = _candidate_progress(
        options, len(task_states) * len(candidates), "Candidates"
    )
    try:
        for model in candidates:
            with _model_run_context(predict_point, model):
                if model.provider.lower() == "openrouter":
                    # Concurrent path: one batch of futures for this model.
                    for state, prediction in _predict_openrouter_model_for_tasks(
                        options, model, task_states, predict_point
                    ):
                        predictions.append((model, state, prediction))
                        _log_prediction_status(
                            options, state.task.task_id, model, prediction
                        )
                        _update_progress(progress_bar)
                    continue

                # Sequential path for local/other providers.
                for state in task_states:
                    _log_running(options, state.task.task_id, model)
                    prediction = _predict_candidate(
                        state.task, model, predict_point
                    )
                    predictions.append((model, state, prediction))
                    _log_prediction_status(
                        options, state.task.task_id, model, prediction
                    )
                    _update_progress(progress_bar)
    finally:
        # Close even on failure so the terminal is left clean.
        if progress_bar is not None:
            progress_bar.close()

    return predictions
|
||||
|
||||
|
||||
def _predict_openrouter_model_for_tasks(
    options: RunOptions,
    model: ModelSpec,
    task_states: list[TaskRunState],
    predict_point: PredictPoint,
) -> list[tuple[TaskRunState, dict[str, object]]]:
    """Predict one OpenRouter model over all tasks using a small thread pool.

    Results are returned in task order (futures are zipped back to their
    states), not completion order.
    """
    if not task_states:
        return []

    for state in task_states:
        _log_running(options, state.task.task_id, model)

    max_workers = min(OPENROUTER_CANDIDATE_CONCURRENCY, len(task_states))
    with concurrent.futures.ThreadPoolExecutor(max_workers=max_workers) as executor:
        futures = [
            executor.submit(_predict_candidate, state.task, model, predict_point)
            for state in task_states
        ]
        return [
            (state, future.result())
            for state, future in zip(task_states, futures, strict=True)
        ]
|
||||
|
||||
|
||||
def _model_run_context(predict_point: PredictPoint, model: ModelSpec):
|
||||
owner = getattr(predict_point, "__self__", None)
|
||||
context_factory = getattr(owner, "model_run_context", None)
|
||||
if context_factory is None:
|
||||
return nullcontext()
|
||||
return context_factory(model)
|
||||
|
||||
|
||||
def _resolve_ground_truth(
|
||||
task, judges: list[ModelSpec], predict_point: PredictPoint
|
||||
) -> tuple[Point | None, dict[str, object]]:
|
||||
resolved = dict(task.raw)
|
||||
if task.gt_point is not None:
|
||||
resolved["gt_point"] = task.gt_point.as_list()
|
||||
if judges:
|
||||
judge_rows, successful = _resolve_judges(task, judges, predict_point)
|
||||
resolved["gt_judges"] = judge_rows
|
||||
resolved["gt_models"] = [
|
||||
judge.model_id for judge, _point, _reason, _raw in successful
|
||||
]
|
||||
resolved["gt_model"] = "provided"
|
||||
resolved["gt_reason"] = "provided gt_point; judges recorded for overlay"
|
||||
return task.gt_point, resolved
|
||||
|
||||
resolved["gt_point"] = None
|
||||
resolved["gt_model"] = None
|
||||
resolved["gt_reason"] = "missing gt_point; candidates are unscored"
|
||||
if judges:
|
||||
judge_rows, successful = _resolve_judges(task, judges, predict_point)
|
||||
resolved["gt_judges"] = judge_rows
|
||||
resolved["gt_models"] = [
|
||||
judge.model_id for judge, _point, _reason, _raw in successful
|
||||
]
|
||||
resolved["gt_reason"] = (
|
||||
"missing gt_point; judges recorded for overlay only"
|
||||
)
|
||||
return None, resolved
|
||||
|
||||
|
||||
def _resolve_judges(
    task,
    judges: list[ModelSpec],
    predict_point: PredictPoint,
) -> tuple[list[dict[str, object]], list[tuple[ModelSpec, Point, str | None, str]]]:
    """Run every judge on *task*.

    Returns:
        (rows for all judges, (judge, point, reason, raw_text) tuples for
        judges that produced a point).
    """
    outcomes = _predict_judges_for_task(task, judges, predict_point)
    rows = [outcome[0] for outcome in outcomes]
    hits: list[tuple[ModelSpec, Point, str | None, str]] = []
    for judge, outcome in zip(judges, outcomes, strict=True):
        _row, point, reason, raw_text = outcome
        if point is not None:
            hits.append((judge, point, reason, raw_text))
    return rows, hits
|
||||
|
||||
|
||||
def _predict_judge(
    task,
    judge: ModelSpec,
    predict_point: PredictPoint,
) -> tuple[dict[str, object], Point | None, str | None, str]:
    """Query one judge model for a ground-truth point.

    Never raises: skips and failures are recorded on the returned row so
    remaining judges still run.

    Returns:
        (row for resolved_tasks.jsonl, parsed point or None, reason, raw text)
    """
    row: dict[str, object] = {
        "name": judge.name,
        "provider": judge.provider,
        "model_id": judge.model_id,
        "point": None,
        "reason": None,
        "raw_text": None,
        "error": None,
        "skipped": False,
        "duration_seconds": None,
    }
    started = time.perf_counter()
    try:
        reply = predict_point(judge, task.image_path, task.instruction, "ground_truth")
    except ModelSkipped as exc:
        # Backend declined to run this judge (e.g. unsupported hardware/model).
        row["duration_seconds"] = time.perf_counter() - started
        row["skipped"] = True
        row["error"] = str(exc)
        return row, None, None, ""
    except Exception as exc:
        # Any other failure is captured on the row instead of propagating.
        row["duration_seconds"] = time.perf_counter() - started
        row["error"] = str(exc)
        return row, None, None, ""

    row["duration_seconds"] = time.perf_counter() - started
    parsed = parse_point_response(reply.text)
    row["raw_text"] = reply.text
    row["reason"] = parsed.reason
    if parsed.point is None:
        # The model answered but the reply did not contain a usable point.
        row["error"] = parsed.error
        return row, None, parsed.reason, reply.text

    row["point"] = parsed.point.as_list()
    return row, parsed.point, parsed.reason, reply.text
|
||||
|
||||
|
||||
def _judge_overlay_log_message(task_id: str, judges: list[ModelSpec]) -> str:
|
||||
names = ", ".join(judge.name for judge in judges)
|
||||
return (
|
||||
f"[{task_id}] Using provided GT and resolving "
|
||||
f"{len(judges)} judge overlay(s): {names}"
|
||||
)
|
||||
|
||||
|
||||
def _judge_overlay_without_gt_log_message(task_id: str, judges: list[ModelSpec]) -> str:
|
||||
names = ", ".join(judge.name for judge in judges)
|
||||
return (
|
||||
f"[{task_id}] No GT; resolving {len(judges)} judge overlay(s) "
|
||||
f"without scoring fallback: {names}"
|
||||
)
|
||||
|
||||
|
||||
def _judge_annotations(
    resolved: dict[str, object], gt_point: Point | None
) -> list[dict[str, object]]:
    """Convert recorded judge rows into overlay annotations for the visualizer."""
    judge_rows = resolved.get("gt_judges")
    if not isinstance(judge_rows, list):
        return []

    overlay: list[dict[str, object]] = []
    for position, judge_row in enumerate(judge_rows, start=1):
        if not isinstance(judge_row, dict):
            continue
        judge_point = parse_point_value(judge_row.get("point"))
        if judge_point is None or gt_point is None:
            distance = None
        else:
            distance = math.hypot(
                judge_point.x - gt_point.x, judge_point.y - gt_point.y
            )
        display_name = judge_row.get("name") or judge_row.get("model_id")
        overlay.append(
            {
                "label": f"GT{position}",
                "model": str(display_name or f"judge-{position}"),
                "point": judge_point,
                "l2": distance,
                "error": judge_row.get("error"),
                "skipped": judge_row.get("skipped"),
            }
        )
    return overlay
|
||||
|
||||
|
||||
def _log_judge_statuses(
    options: RunOptions, task_id: str, resolved: dict[str, object]
) -> None:
    """Log one line per recorded judge: skipped, failed, no point, or its point."""
    rows = resolved.get("gt_judges")
    if not isinstance(rows, list):
        return

    for index, row in enumerate(rows, start=1):
        if not isinstance(row, dict):
            continue
        name = str(row.get("name") or row.get("model_id") or f"judge-{index}")
        duration = row.get("duration_seconds")
        # Only show timing when a numeric duration was recorded.
        duration_text = (
            f" in {float(duration):.2f}s"
            if isinstance(duration, int | float)
            else ""
        )
        error = row.get("error")
        if row.get("skipped"):
            _log(options, f"[{task_id}] GT{index} {name} skipped{duration_text}: {error}")
            continue
        if error:
            _log(options, f"[{task_id}] GT{index} {name} failed{duration_text}: {error}")
            continue
        point = parse_point_value(row.get("point"))
        if point is None:
            _log(options, f"[{task_id}] GT{index} {name} finished{duration_text}: no point")
            continue
        _log(
            options,
            f"[{task_id}] GT{index} {name} finished{duration_text}: "
            f"({point.x:.1f}, {point.y:.1f})",
        )
|
||||
|
||||
|
||||
def _predict_judges_for_task(
    task,
    judges: list[ModelSpec],
    predict_point: PredictPoint,
) -> list[tuple[dict[str, object], Point | None, str | None, str]]:
    """Run all judges for one task, batching consecutive OpenRouter judges.

    Consecutive OpenRouter judges are dispatched concurrently; all other
    providers run one at a time. Output order matches the judge order.
    """
    ordered: list[tuple[dict[str, object], Point | None, str | None, str]] = []
    cursor = 0
    total = len(judges)
    while cursor < total:
        if judges[cursor].provider.lower() != "openrouter":
            ordered.append(_predict_judge(task, judges[cursor], predict_point))
            cursor += 1
            continue
        # Extend the run of consecutive OpenRouter judges, then batch it.
        group_end = cursor + 1
        while group_end < total and judges[group_end].provider.lower() == "openrouter":
            group_end += 1
        ordered.extend(
            _predict_openrouter_judges(task, judges[cursor:group_end], predict_point)
        )
        cursor = group_end
    return ordered
|
||||
|
||||
|
||||
def _predict_openrouter_judges(
    task,
    judges: list[ModelSpec],
    predict_point: PredictPoint,
) -> list[tuple[dict[str, object], Point | None, str | None, str]]:
    """Query a batch of OpenRouter judges concurrently, preserving order."""
    worker_count = min(OPENROUTER_CANDIDATE_CONCURRENCY, len(judges))
    with concurrent.futures.ThreadPoolExecutor(max_workers=worker_count) as pool:
        pending = [
            pool.submit(_predict_judge, task, judge, predict_point)
            for judge in judges
        ]
        return [pending_future.result() for pending_future in pending]
|
||||
|
||||
|
||||
def _predict_candidate(
    task, model: ModelSpec, predict_point: PredictPoint
) -> dict[str, object]:
    """Run one candidate model on one task and build its prediction row.

    Never raises: skips and failures are recorded on the row. On success the
    row carries both the serializable "point" list and a transient "_point"
    Point object (stripped before the row is written to JSONL).
    """
    base: dict[str, object] = {
        "task_id": task.task_id,
        "image_path": task.image_path_text,
        "instruction": task.instruction,
        "model": model.name,
        "model_id": model.model_id,
        "point": None,
        "reason": None,
        "raw_text": None,
        "error": None,
        "skipped": False,
        "duration_seconds": None,
    }
    started = time.perf_counter()
    try:
        reply = predict_point(model, task.image_path, task.instruction, "candidate")
    except ModelSkipped as exc:
        # Backend declined to run this model; record and move on.
        base["duration_seconds"] = time.perf_counter() - started
        base["skipped"] = True
        base["error"] = str(exc)
        return base
    except Exception as exc:
        # Any other failure is captured so remaining calls still run.
        base["duration_seconds"] = time.perf_counter() - started
        base["error"] = str(exc)
        return base

    base["duration_seconds"] = time.perf_counter() - started
    parsed = parse_point_response(reply.text)
    base["raw_text"] = reply.text
    base["reason"] = parsed.reason
    if parsed.point is None:
        # The model answered but no usable point could be parsed.
        base["error"] = parsed.error
        return base

    base["point"] = parsed.point.as_list()
    base["_point"] = parsed.point
    return base
|
||||
|
||||
|
||||
def _log_running(options: RunOptions, task_id: str, model: ModelSpec) -> None:
    """Announce that *model* is about to run on *task_id*."""
    message = f"[{task_id}] Running {model.name} ({model.provider}/{model.model_id})"
    _log(options, message)
|
||||
|
||||
|
||||
def _log_prediction_status(
    options: RunOptions,
    task_id: str,
    model: ModelSpec,
    prediction: dict[str, object],
) -> None:
    """Log one line describing how a candidate prediction ended.

    Reports skipped / failed / finished, with elapsed time when the row
    carries a numeric ``duration_seconds``.
    """
    duration = prediction.get("duration_seconds")
    # Accept int as well as float, matching _log_judge_statuses (previously
    # only float was accepted, silently dropping int durations).
    duration_text = (
        f" in {float(duration):.2f}s" if isinstance(duration, (int, float)) else ""
    )
    if prediction.get("skipped"):
        _log(
            options,
            f"[{task_id}] {model.name} skipped{duration_text}: {prediction['error']}",
        )
    elif prediction.get("error"):
        _log(
            options,
            f"[{task_id}] {model.name} failed{duration_text}: {prediction['error']}",
        )
    else:
        _log(options, f"[{task_id}] {model.name} finished{duration_text}")
|
||||
|
||||
|
||||
def _write_scores_csv(path: Path, rows: list[dict[str, object]]) -> None:
    """Write score rows to *path* as CSV with the canonical score columns."""
    path.parent.mkdir(parents=True, exist_ok=True)
    with path.open("w", encoding="utf-8", newline="") as out_file:
        csv_writer = csv.DictWriter(out_file, fieldnames=SCORE_FIELDNAMES)
        csv_writer.writeheader()
        csv_writer.writerows(rows)
|
||||
|
||||
|
||||
def _build_result_rows(score_rows: list[dict[str, object]]) -> list[dict[str, object]]:
    """Aggregate per-call scores into one ranked summary row per model."""
    by_model: dict[str, list[dict[str, object]]] = {}
    for score in score_rows:
        by_model.setdefault(str(score["model"]), []).append(score)

    summary_rows: list[dict[str, object]] = []
    for model_name, model_scores in by_model.items():
        distances = [
            float(score["l2"]) for score in model_scores if score.get("l2") != ""
        ]
        durations = [
            float(score["duration_seconds"])
            for score in model_scores
            if score.get("duration_seconds") not in {"", None}
        ]
        skip_count = sum(1 for score in model_scores if score.get("skipped") is True)
        error_count = sum(
            1
            for score in model_scores
            if score.get("error") and score.get("skipped") is not True
        )
        status = _result_status(
            len(model_scores), len(distances), error_count, skip_count
        )
        summary_rows.append(
            {
                "model": model_name,
                "status": status,
                "l2": statistics.fmean(distances) if distances else None,
                "duration_seconds": (
                    statistics.fmean(durations) if durations else None
                ),
                "reason": _result_reason(model_scores, status),
            }
        )

    return sorted(summary_rows, key=_result_sort_key)
|
||||
|
||||
|
||||
def _result_status(total: int, scored: int, errors: int, skipped: int) -> str:
|
||||
if scored == total and errors == 0 and skipped == 0:
|
||||
return "ok"
|
||||
if scored > 0 and (errors or skipped):
|
||||
return "partial"
|
||||
if errors:
|
||||
return "error"
|
||||
if skipped:
|
||||
return "skipped"
|
||||
if scored == 0:
|
||||
return "unscored"
|
||||
return "ok"
|
||||
|
||||
|
||||
def _result_reason(rows: list[dict[str, object]], status: str) -> str:
|
||||
if status == "ok":
|
||||
return ""
|
||||
if status == "partial":
|
||||
return _partial_result_reason(rows)
|
||||
for row in rows:
|
||||
if status == "skipped" and row.get("skipped") is not True:
|
||||
continue
|
||||
reason = row.get("error")
|
||||
if reason:
|
||||
return str(reason)
|
||||
return "no score"
|
||||
|
||||
|
||||
def _partial_result_reason(rows: list[dict[str, object]]) -> str:
|
||||
errors = [
|
||||
str(row.get("error"))
|
||||
for row in rows
|
||||
if row.get("error") and row.get("skipped") is not True
|
||||
]
|
||||
skipped = [row for row in rows if row.get("skipped") is True]
|
||||
parts: list[str] = []
|
||||
if errors:
|
||||
parts.append(f"{len(errors)} error(s)")
|
||||
if skipped:
|
||||
parts.append(f"{len(skipped)} skipped")
|
||||
prefix = ", ".join(parts) if parts else "partial"
|
||||
return f"{prefix}; first: {errors[0]}" if errors else prefix
|
||||
|
||||
|
||||
def _result_sort_key(row: dict[str, object]) -> tuple[int, float, str]:
|
||||
l2 = row.get("l2")
|
||||
is_ranked = l2 is not None
|
||||
return (
|
||||
0 if is_ranked else 1,
|
||||
float(l2) if l2 is not None else float("inf"),
|
||||
str(row["model"]),
|
||||
)
|
||||
|
||||
|
||||
def _progress(options: RunOptions, items, **kwargs):
    """Wrap *items* in a tqdm bar when progress display is enabled."""
    if _show_progress(options):
        return tqdm(items, dynamic_ncols=True, **kwargs)
    return items
|
||||
|
||||
|
||||
def _candidate_progress(options: RunOptions, total: int, task_id: str):
    """Create a transient tqdm bar for candidate calls, or None when disabled."""
    if not _show_progress(options):
        return None
    bar_label = task_id if task_id == "Candidates" else f"{task_id} candidates"
    return tqdm(
        total=total,
        desc=bar_label,
        unit="call",
        leave=False,
        dynamic_ncols=True,
    )
|
||||
|
||||
|
||||
def _update_progress(progress_bar) -> None:
|
||||
if progress_bar is not None:
|
||||
progress_bar.update(1)
|
||||
|
||||
|
||||
def _log(options: RunOptions, message: str) -> None:
    """Emit a status line to stderr, routing through tqdm when a bar is active."""
    if not options.progress:
        return
    if _show_progress(options):
        # tqdm.write keeps the message from corrupting an active bar.
        tqdm.write(message, file=sys.stderr)
    else:
        print(message, file=sys.stderr)
|
||||
|
||||
|
||||
def _show_progress(options: RunOptions) -> bool:
|
||||
return options.progress and sys.stderr.isatty()
|
||||
30
prototypes/click_eval/src/click_eval/image_utils.py
Normal file
@@ -0,0 +1,30 @@
|
||||
from __future__ import annotations
|
||||
|
||||
import base64
|
||||
import mimetypes
|
||||
from pathlib import Path
|
||||
|
||||
|
||||
def require_pillow():
    """Lazily import Pillow, failing with an actionable install hint.

    Returns:
        The (Image, ImageDraw, ImageFont) modules from PIL.

    Raises:
        RuntimeError: when Pillow is not installed.
    """
    try:
        from PIL import Image, ImageDraw, ImageFont
    except ImportError as exc:
        raise RuntimeError(
            "Pillow is required for image dimensions and annotations. "
            "Install prototypes/click_eval/requirements.txt or run with "
            "`uv run --with pillow ...`."
        ) from exc

    return Image, ImageDraw, ImageFont
|
||||
|
||||
|
||||
def image_size(path: Path) -> tuple[int, int]:
    """Return the (width, height) of the image at *path* in pixels."""
    pil_image, _draw, _font = require_pillow()
    with pil_image.open(path) as opened:
        return opened.size
|
||||
|
||||
|
||||
def image_data_url(path: Path) -> str:
    """Encode the file at *path* as a base64 ``data:`` URL.

    Falls back to ``image/png`` when the MIME type cannot be guessed from
    the file name.
    """
    guessed_type = mimetypes.guess_type(path.name)[0]
    mime_type = guessed_type or "image/png"
    payload = base64.b64encode(path.read_bytes()).decode("ascii")
    return f"data:{mime_type};base64,{payload}"
|
||||
145
prototypes/click_eval/src/click_eval/io.py
Normal file
@@ -0,0 +1,145 @@
|
||||
from __future__ import annotations
|
||||
|
||||
import json
|
||||
from pathlib import Path
|
||||
from typing import Any, Iterable
|
||||
|
||||
from .contracts import ClickTask, ModelSpec
|
||||
from .parsing import parse_point_value
|
||||
|
||||
|
||||
def read_jsonl(path: Path) -> list[dict[str, Any]]:
    """Parse a JSONL file, skipping blank lines.

    Raises:
        ValueError: with file/line location when a line is not valid JSON.
    """
    parsed_rows: list[dict[str, Any]] = []
    with path.open("r", encoding="utf-8") as source:
        for line_number, raw_line in enumerate(source, start=1):
            text = raw_line.strip()
            if not text:
                continue
            try:
                parsed = json.loads(text)
            except json.JSONDecodeError as exc:
                raise ValueError(f"{path}:{line_number}: invalid JSONL: {exc}") from exc
            parsed_rows.append(parsed)
    return parsed_rows
|
||||
|
||||
|
||||
def write_jsonl(path: Path, rows: Iterable[dict[str, Any]]) -> None:
    """Serialize *rows* to *path* as UTF-8 JSONL, creating parent directories."""
    path.parent.mkdir(parents=True, exist_ok=True)
    lines = (json.dumps(row, ensure_ascii=False) + "\n" for row in rows)
    with path.open("w", encoding="utf-8") as sink:
        sink.writelines(lines)
|
||||
|
||||
|
||||
def load_tasks(path: Path) -> list[ClickTask]:
    """Load ClickTask records from a JSONL file.

    Relative image paths are resolved against the task file's directory.

    Raises:
        ValueError: when a row is missing task_id/image_path/instruction.
    """
    loaded: list[ClickTask] = []
    task_dir = path.parent
    for row_number, row in enumerate(read_jsonl(path), start=1):
        try:
            task_id = str(row["task_id"])
            image_path_text = str(row["image_path"])
            instruction = str(row["instruction"])
        except KeyError as exc:
            raise ValueError(f"{path}:{row_number}: missing required field {exc}") from exc

        image_path = Path(image_path_text)
        if not image_path.is_absolute():
            image_path = task_dir / image_path

        loaded.append(
            ClickTask(
                task_id=task_id,
                image_path=image_path,
                image_path_text=image_path_text,
                instruction=instruction,
                gt_point=parse_point_value(row.get("gt_point")),
                raw=dict(row),
            )
        )
    return loaded
|
||||
|
||||
|
||||
def load_model_config(
    path: Path,
) -> tuple[list[ModelSpec], list[ModelSpec], dict[str, Any]]:
    """Load (judges, candidates, raw config) from a JSON model-config file.

    Raises:
        ValueError: when candidate_models is missing or empty.
    """
    config = json.loads(path.read_text(encoding="utf-8"))
    # Resolve judges before validating candidates to keep error ordering.
    judges = _judge_specs(config)
    entries = config.get("candidate_models") or []
    candidates = [_model_spec(entry) for entry in entries]
    if not candidates:
        raise ValueError(f"{path}: candidate_models must contain at least one model")
    return judges, candidates, config
|
||||
|
||||
|
||||
def _judge_specs(config: dict[str, Any]) -> list[ModelSpec]:
    """Build judge ModelSpecs from ``judge_models`` (preferred) or ``judge_model``."""
    if "judge_models" in config:
        entries = config.get("judge_models") or []
        if isinstance(entries, (str, dict)):
            # A single entry is accepted and wrapped into a list.
            entries = [entries]
        if not isinstance(entries, list):
            raise ValueError("judge_models must be a model entry or list of entries")
        return [
            _model_spec(entry, default_name=f"judge-{position}")
            for position, entry in enumerate(entries, start=1)
        ]

    legacy_entry = config.get("judge_model")
    if legacy_entry:
        return [_model_spec(legacy_entry, default_name="judge")]
    return []
|
||||
|
||||
|
||||
def _model_spec(entry: Any, default_name: str | None = None) -> ModelSpec:
    """Normalize a config entry (bare model-id string or dict) into a ModelSpec.

    Raises:
        ValueError: for dict entries without model/id, or entries that are
            neither str nor dict.
    """
    if isinstance(entry, str):
        # Bare string: the model id doubles as the display name.
        return ModelSpec(name=default_name or entry, model_id=entry)
    if isinstance(entry, dict):
        model_id = str(entry.get("model") or entry.get("id") or "")
        if not model_id:
            raise ValueError(f"model entry is missing model/id: {entry}")
        return ModelSpec(
            name=str(entry.get("name") or default_name or model_id),
            model_id=model_id,
            provider=str(entry.get("provider") or "openrouter"),
            # NOTE(review): the `or`-chained fallbacks below treat 0 as
            # missing (e.g. estimated_vram_gb=0 falls through to vram_gb,
            # min_pixels=0 to image_min_pixels) -- confirm zero is never a
            # meaningful configured value for these keys.
            estimated_vram_gb=_optional_float(
                entry.get("estimated_vram_gb") or entry.get("vram_gb")
            ),
            adapter=_optional_string(entry.get("adapter")),
            quantization=_optional_string(entry.get("quantization")),
            allow_cpu_offload=_optional_bool(entry.get("allow_cpu_offload")) or False,
            dtype=_optional_string(entry.get("dtype")),
            attn_implementation=_optional_string(entry.get("attn_implementation")),
            min_pixels=_optional_int(entry.get("min_pixels") or entry.get("image_min_pixels")),
            max_pixels=_optional_int(entry.get("max_pixels") or entry.get("image_max_pixels")),
            max_new_tokens=_optional_int(entry.get("max_new_tokens")),
            revision=_optional_string(entry.get("revision")),
            use_safetensors=_optional_bool(entry.get("use_safetensors")),
        )
    raise ValueError(f"invalid model entry: {entry!r}")
|
||||
|
||||
|
||||
def _optional_float(value: Any) -> float | None:
|
||||
if value is None or value == "":
|
||||
return None
|
||||
return float(value)
|
||||
|
||||
|
||||
def _optional_int(value: Any) -> int | None:
|
||||
if value is None or value == "":
|
||||
return None
|
||||
return int(value)
|
||||
|
||||
|
||||
def _optional_string(value: Any) -> str | None:
|
||||
if value is None or value == "":
|
||||
return None
|
||||
return str(value)
|
||||
|
||||
|
||||
def _optional_bool(value: Any) -> bool | None:
|
||||
if value is None or value == "":
|
||||
return None
|
||||
if isinstance(value, str):
|
||||
return value.strip().lower() in {"1", "true", "yes", "on"}
|
||||
return bool(value)
|
||||
1972
prototypes/click_eval/src/click_eval/local_hf.py
Normal file
96
prototypes/click_eval/src/click_eval/moondream.py
Normal file
@@ -0,0 +1,96 @@
|
||||
from __future__ import annotations
|
||||
|
||||
import json
|
||||
import os
|
||||
import re
|
||||
import urllib.error
|
||||
import urllib.request
|
||||
from pathlib import Path
|
||||
from typing import Any
|
||||
|
||||
from .contracts import ModelReply
|
||||
from .image_utils import image_data_url, image_size
|
||||
|
||||
MOONDREAM_POINT_URL = "https://api.moondream.ai/v1/point"
|
||||
|
||||
|
||||
class MoondreamClient:
    """Thin HTTP client for the Moondream point API."""

    def __init__(
        self,
        api_key: str | None = None,
        base_url: str = MOONDREAM_POINT_URL,
        timeout_seconds: int = 90,
    ) -> None:
        """Read the API key from the argument or the MOONDREAM_API_KEY env var.

        Raises:
            RuntimeError: when no API key is available.
        """
        self.api_key = api_key or os.environ.get("MOONDREAM_API_KEY")
        if not self.api_key:
            raise RuntimeError("MOONDREAM_API_KEY is required")
        self.base_url = base_url
        self.timeout_seconds = timeout_seconds

    def predict_point(
        self,
        model_id: str,
        image_path: Path,
        instruction: str,
        purpose: str,
    ) -> ModelReply:
        """Ask the point API where to click for *instruction* on the screenshot.

        Moondream returns normalized coordinates; they are converted to
        pixels using the image's actual size. *purpose* is accepted for
        interface parity with the other clients but is not sent to the API.
        """
        width, height = image_size(image_path)
        payload = {
            "image_url": image_data_url(image_path),
            "object": _object_query(instruction),
        }
        raw = self._post(payload)
        # Record which model produced the reply without clobbering an
        # API-provided value.
        raw.setdefault("model", model_id)
        return ModelReply(text=_point_text(raw, width, height), raw=raw)

    def _post(self, payload: dict[str, Any]) -> dict[str, Any]:
        """POST *payload* as JSON and decode the JSON response body.

        Raises:
            RuntimeError: on HTTP errors (including the response body) or
                network failures.
        """
        request = urllib.request.Request(
            self.base_url,
            data=json.dumps(payload).encode("utf-8"),
            headers={
                "X-Moondream-Auth": self.api_key,
                "Content-Type": "application/json",
                "User-Agent": "click-eval/0.1",
            },
            method="POST",
        )
        try:
            with urllib.request.urlopen(
                request, timeout=self.timeout_seconds
            ) as response:
                body = response.read().decode("utf-8")
        except urllib.error.HTTPError as exc:
            # Include the server's error body to make failures diagnosable.
            detail = exc.read().decode("utf-8", errors="replace")
            raise RuntimeError(f"Moondream HTTP {exc.code}: {detail}") from exc
        except urllib.error.URLError as exc:
            raise RuntimeError(f"Moondream request failed: {exc}") from exc

        return json.loads(body)
|
||||
|
||||
|
||||
def _point_text(raw: dict[str, Any], width: int, height: int) -> str:
|
||||
try:
|
||||
first_point = raw["points"][0]
|
||||
x = float(first_point["x"]) * width
|
||||
y = float(first_point["y"]) * height
|
||||
except (KeyError, IndexError, TypeError, ValueError) as exc:
|
||||
raise RuntimeError(f"Unexpected Moondream response shape: {raw}") from exc
|
||||
|
||||
return json.dumps(
|
||||
{
|
||||
"x": x,
|
||||
"y": y,
|
||||
"reason": "first point returned by Moondream point API",
|
||||
}
|
||||
)
|
||||
|
||||
|
||||
def _object_query(instruction: str) -> str:
|
||||
stripped = instruction.strip().rstrip(".")
|
||||
query = re.sub(
|
||||
r"^(?:please\s+)?(?:click|tap|press|select|choose|open)\s+(?:on\s+)?",
|
||||
"",
|
||||
stripped,
|
||||
flags=re.IGNORECASE,
|
||||
).strip()
|
||||
return query or stripped
|
||||
191
prototypes/click_eval/src/click_eval/openai_cu.py
Normal file
@@ -0,0 +1,191 @@
|
||||
from __future__ import annotations
|
||||
|
||||
import base64
|
||||
import io
|
||||
import json
|
||||
import os
|
||||
import urllib.error
|
||||
import urllib.request
|
||||
from pathlib import Path
|
||||
from typing import Any
|
||||
|
||||
from .contracts import ModelReply, Point
|
||||
from .image_utils import image_size, require_pillow
|
||||
|
||||
OPENAI_RESPONSES_URL = "https://api.openai.com/v1/responses"
|
||||
|
||||
|
||||
class OpenAIComputerUseClient:
    """Client for OpenAI's Responses API using the computer_use_preview tool.

    Sends the full-resolution screenshot and asks for exactly one click
    action; the resulting click is converted back to a point reply.
    """

    def __init__(
        self,
        api_key: str | None = None,
        base_url: str = OPENAI_RESPONSES_URL,
        timeout_seconds: int = 90,
        max_output_tokens: int = 1024,
    ) -> None:
        # Fall back to the environment so configs need not embed secrets.
        self.api_key = api_key or os.environ.get("OPENAI_API_KEY")
        if not self.api_key:
            raise RuntimeError("OPENAI_API_KEY is required")
        self.base_url = base_url
        self.timeout_seconds = timeout_seconds
        self.max_output_tokens = max_output_tokens

    def predict_point(
        self,
        model_id: str,
        image_path: Path,
        instruction: str,
        purpose: str,
    ) -> ModelReply:
        """Ask the computer-use model for one click on the screenshot.

        The screenshot is re-encoded at its original size and declared as
        the tool's display surface. When the model does not emit a click
        action, the raw message text is returned for downstream parsing.
        """
        original_width, original_height = image_size(image_path)
        screenshot = _original_screenshot(image_path)
        raw = self._post(
            {
                "model": model_id,
                "max_output_tokens": self.max_output_tokens,
                "truncation": "auto",
                "tools": [
                    {
                        "type": "computer_use_preview",
                        # Display size matches the encoded screenshot so click
                        # coordinates come back in its pixel space.
                        "display_width": screenshot.width,
                        "display_height": screenshot.height,
                        "environment": "browser",
                    }
                ],
                "input": [
                    {
                        "role": "user",
                        "content": [
                            {
                                "type": "input_text",
                                "text": _computer_use_prompt(instruction, purpose),
                            },
                            {
                                "type": "input_image",
                                "image_url": screenshot.data_url,
                                "detail": "original",
                            },
                        ],
                    }
                ],
            }
        )
        point = _point_from_computer_response(raw)
        if point is None:
            # No click action emitted; surface whatever text the model wrote.
            return ModelReply(text=_raw_text(raw), raw=raw)
        return _reply_from_point(
            point, screenshot, original_width, original_height, raw
        )

    def _post(self, payload: dict[str, Any]) -> dict[str, Any]:
        """POST *payload* as JSON to the Responses endpoint and decode the reply.

        Raises RuntimeError with the response detail on HTTP errors and on
        transport failures.
        """
        request = urllib.request.Request(
            self.base_url,
            data=json.dumps(payload).encode("utf-8"),
            headers={
                "Authorization": f"Bearer {self.api_key}",
                "Content-Type": "application/json",
            },
            method="POST",
        )
        try:
            with urllib.request.urlopen(
                request, timeout=self.timeout_seconds
            ) as response:
                body = response.read().decode("utf-8")
        except urllib.error.HTTPError as exc:
            detail = exc.read().decode("utf-8", errors="replace")
            raise RuntimeError(f"OpenAI HTTP {exc.code}: {detail}") from exc
        except urllib.error.URLError as exc:
            raise RuntimeError(f"OpenAI request failed: {exc}") from exc
        return json.loads(body)
|
||||
|
||||
|
||||
def _computer_use_prompt(instruction: str, purpose: str) -> str:
|
||||
role_line = (
|
||||
"Choose the ground-truth click point for this instruction."
|
||||
if purpose == "ground_truth"
|
||||
else "Predict the click point for this instruction."
|
||||
)
|
||||
return (
|
||||
f"{role_line}\n\n"
|
||||
"Use the computer tool and emit exactly one left-button click action at "
|
||||
"the center of the target UI element. Do not type, scroll, navigate, "
|
||||
"wait, or use any other action.\n\n"
|
||||
f"Instruction: {instruction}"
|
||||
)
|
||||
|
||||
|
||||
class _ScaledScreenshot:
    """Encoded screenshot payload plus the pixel dimensions it encodes."""

    def __init__(self, width: int, height: int, data_url: str) -> None:
        # Pixel dimensions of the encoded image.
        self.width = width
        self.height = height
        # data: URL carrying the base64 PNG bytes, ready to embed in a request.
        self.data_url = data_url
|
||||
|
||||
|
||||
def _original_screenshot(path: Path) -> _ScaledScreenshot:
    """Load the screenshot at *path* and re-encode it as a PNG data URL.

    The image is not resized: the returned width/height are those of the
    decoded source image (after RGB conversion).
    """
    Image, _, _ = require_pillow()
    with Image.open(path) as source:
        # convert() yields an independent copy, so the file can be closed.
        image = source.convert("RGB")
    buffer = io.BytesIO()
    image.save(buffer, format="PNG")
    encoded = base64.b64encode(buffer.getvalue()).decode("ascii")
    return _ScaledScreenshot(
        width=image.width,
        height=image.height,
        data_url=f"data:image/png;base64,{encoded}",
    )
|
||||
|
||||
|
||||
def _point_from_computer_response(raw: dict[str, Any]) -> Point | None:
|
||||
for item in raw.get("output", []):
|
||||
if not isinstance(item, dict) or item.get("type") != "computer_call":
|
||||
continue
|
||||
action = item.get("action")
|
||||
if not isinstance(action, dict) or action.get("type") != "click":
|
||||
continue
|
||||
try:
|
||||
return Point(x=float(action["x"]), y=float(action["y"]))
|
||||
except (KeyError, TypeError, ValueError):
|
||||
return None
|
||||
return None
|
||||
|
||||
|
||||
def _reply_from_point(
    point: Point,
    screenshot: _ScaledScreenshot,
    original_width: int,
    original_height: int,
    raw: dict[str, Any],
) -> ModelReply:
    """Map a display-space click back to original-screenshot pixels.

    The reply text records both coordinate spaces so downstream analysis
    can audit the rescaling.
    """
    scaled_x = point.x * original_width / screenshot.width
    scaled_y = point.y * original_height / screenshot.height
    reply_fields = {
        "x": scaled_x,
        "y": scaled_y,
        "reason": "OpenAI Computer Use click action",
        "display_x": point.x,
        "display_y": point.y,
        "display_width": screenshot.width,
        "display_height": screenshot.height,
        "original_width": original_width,
        "original_height": original_height,
    }
    return ModelReply(text=json.dumps(reply_fields), raw=raw)
|
||||
|
||||
|
||||
def _raw_text(raw: dict[str, Any]) -> str:
|
||||
text_parts = []
|
||||
for item in raw.get("output", []):
|
||||
if not isinstance(item, dict):
|
||||
continue
|
||||
if item.get("type") == "message":
|
||||
for content in item.get("content", []):
|
||||
if isinstance(content, dict) and "text" in content:
|
||||
text_parts.append(str(content["text"]))
|
||||
return "\n".join(text_parts) if text_parts else json.dumps(raw, ensure_ascii=False)
|
||||
319
prototypes/click_eval/src/click_eval/openrouter.py
Normal file
@@ -0,0 +1,319 @@
|
||||
from __future__ import annotations
|
||||
|
||||
import base64
|
||||
import io
|
||||
import json
|
||||
import os
|
||||
import urllib.error
|
||||
import urllib.request
|
||||
from pathlib import Path
|
||||
from typing import Any
|
||||
|
||||
from .contracts import ModelReply
|
||||
from .image_utils import image_data_url, image_size, require_pillow
|
||||
from .parsing import parse_point_response
|
||||
|
||||
# OpenRouter's OpenAI-compatible chat completions endpoint.
OPENROUTER_URL = "https://openrouter.ai/api/v1/chat/completions"
# Token-budget floor applied to OpenAI / GPT-5 point requests.
GPT_POINT_MAX_TOKENS = 8192
# Larger budget used to retry a call that came back length-truncated
# with null message content.
LENGTH_RETRY_MAX_TOKENS = 16384
|
||||
|
||||
|
||||
class OpenRouterClient:
    """Chat-completions client that asks OpenRouter-hosted VLMs for a click point."""

    def __init__(
        self,
        api_key: str | None = None,
        base_url: str = OPENROUTER_URL,
        timeout_seconds: int = 90,
        temperature: float = 0.0,
        max_tokens: int = 512,
    ) -> None:
        # Fall back to the environment so configs need not embed secrets.
        self.api_key = api_key or os.environ.get("OPENROUTER_API_KEY")
        if not self.api_key:
            raise RuntimeError("OPENROUTER_API_KEY is required")
        self.base_url = base_url
        self.timeout_seconds = timeout_seconds
        self.temperature = temperature
        self.max_tokens = max_tokens

    def predict_point(
        self,
        model_id: str,
        image_path: Path,
        instruction: str,
        purpose: str,
    ) -> ModelReply:
        """Request a single click point for *instruction* on the screenshot.

        Builds a model-specific payload (image resized for Claude models,
        JSON response_format / reasoning overrides where applicable),
        retries once with a larger token budget when the reply is
        length-truncated with null content, and rescales coordinates back
        to the original screenshot when the image was resized client-side.
        """
        width, height = image_size(image_path)
        image_payload = _image_payload_for_model(model_id, image_path, width, height)
        prompt = _point_prompt(
            instruction,
            image_payload.width,
            image_payload.height,
            purpose,
            original_width=width,
            original_height=height,
            resized=image_payload.resized,
        )
        payload = {
            "model": model_id,
            "temperature": self.temperature,
            "max_tokens": _max_tokens_for_point_call(model_id, self.max_tokens),
            "messages": [
                {
                    "role": "system",
                    "content": (
                        "You identify a single click coordinate in screenshots. "
                        "Return only valid JSON."
                    ),
                },
                {
                    "role": "user",
                    "content": [
                        {"type": "text", "text": prompt},
                        {
                            "type": "image_url",
                            "image_url": {"url": image_payload.data_url},
                        },
                    ],
                },
            ],
        }
        # Provider-specific knobs are added only when applicable.
        reasoning = _reasoning_for_point_call(model_id)
        if reasoning is not None:
            payload["reasoning"] = reasoning
        if _force_json_response_format(model_id):
            payload["response_format"] = {"type": "json_object"}
        raw = self._post(payload)
        try:
            text = _message_text(raw)
        except RuntimeError:
            # Retry only the known-recoverable case: truncated reply with
            # null content. Any other shape error propagates.
            if not _is_null_content_length_response(raw):
                raise
            payload["max_tokens"] = max(
                int(payload.get("max_tokens") or 0), LENGTH_RETRY_MAX_TOKENS
            )
            raw = self._post(payload)
            text = _message_text(raw)
        if image_payload.resized:
            return _rescaled_reply(text, raw, image_payload, width, height)
        return ModelReply(text=text, raw=raw)

    def _post(self, payload: dict[str, Any]) -> dict[str, Any]:
        """POST *payload* as JSON to OpenRouter and decode the reply.

        Optional attribution headers are taken from the environment.
        Raises RuntimeError with response detail on HTTP errors and on
        transport failures.
        """
        headers = {
            "Authorization": f"Bearer {self.api_key}",
            "Content-Type": "application/json",
        }
        referer = os.environ.get("OPENROUTER_HTTP_REFERER")
        title = os.environ.get("OPENROUTER_TITLE", "BrowserOS click eval")
        if referer:
            headers["HTTP-Referer"] = referer
        if title:
            headers["X-Title"] = title

        request = urllib.request.Request(
            self.base_url,
            data=json.dumps(payload).encode("utf-8"),
            headers=headers,
            method="POST",
        )
        try:
            with urllib.request.urlopen(
                request, timeout=self.timeout_seconds
            ) as response:
                body = response.read().decode("utf-8")
        except urllib.error.HTTPError as exc:
            detail = exc.read().decode("utf-8", errors="replace")
            raise RuntimeError(f"OpenRouter HTTP {exc.code}: {detail}") from exc
        except urllib.error.URLError as exc:
            raise RuntimeError(f"OpenRouter request failed: {exc}") from exc

        return json.loads(body)
|
||||
|
||||
|
||||
def _point_prompt(
|
||||
instruction: str,
|
||||
width: int,
|
||||
height: int,
|
||||
purpose: str,
|
||||
*,
|
||||
original_width: int,
|
||||
original_height: int,
|
||||
resized: bool,
|
||||
) -> str:
|
||||
role_line = (
|
||||
"Choose the ground-truth click point for this instruction."
|
||||
if purpose == "ground_truth"
|
||||
else "Predict where this model should click for this instruction."
|
||||
)
|
||||
resize_line = ""
|
||||
if resized:
|
||||
resize_line = (
|
||||
"The attached image was resized client-side from the original "
|
||||
f"{original_width}x{original_height} screenshot. Return coordinates "
|
||||
"in the attached image's pixel coordinate space; the harness will "
|
||||
"map them back to the original screenshot.\n"
|
||||
)
|
||||
return (
|
||||
f"{role_line}\n\n"
|
||||
f"Screenshot size: {width}x{height} pixels.\n"
|
||||
f"{resize_line}"
|
||||
"Coordinate system: x increases left to right, y increases top to bottom, "
|
||||
"origin is the top-left pixel of the screenshot.\n\n"
|
||||
f"Instruction: {instruction}\n\n"
|
||||
"Choose the center of the target UI element. If your natural output is "
|
||||
"a bounding box, convert it to its center point. Always estimate a point; "
|
||||
"do not answer with a label, description, placeholder, or bounding box.\n\n"
|
||||
"The requested target is present in the screenshot. Never answer that no "
|
||||
"target exists; choose the closest matching visible UI element if uncertain.\n\n"
|
||||
"Return only this JSON shape with numeric pixel coordinates, no markdown:\n"
|
||||
'{"x": 123, "y": 456, "reason": "short reason"}'
|
||||
)
|
||||
|
||||
|
||||
def _message_text(raw: dict[str, Any]) -> str:
|
||||
try:
|
||||
choice = raw["choices"][0]
|
||||
message = choice["message"]
|
||||
content = message["content"]
|
||||
except (KeyError, IndexError, TypeError) as exc:
|
||||
raise RuntimeError(f"Unexpected OpenRouter response shape: {raw}") from exc
|
||||
|
||||
if isinstance(content, str):
|
||||
return content
|
||||
if isinstance(content, list):
|
||||
parts = []
|
||||
for item in content:
|
||||
if isinstance(item, dict) and item.get("type") == "text":
|
||||
parts.append(str(item.get("text", "")))
|
||||
return "\n".join(parts)
|
||||
|
||||
if content is None:
|
||||
finish_reason = choice.get("finish_reason")
|
||||
has_reasoning = bool(message.get("reasoning"))
|
||||
raise RuntimeError(
|
||||
"OpenRouter returned null message.content "
|
||||
f"(finish_reason={finish_reason}, has_reasoning={has_reasoning})"
|
||||
)
|
||||
|
||||
return str(content)
|
||||
|
||||
|
||||
def _reasoning_for_point_call(model_id: str) -> dict[str, object] | None:
|
||||
lowered = model_id.lower()
|
||||
if lowered.startswith("z-ai/glm-"):
|
||||
return {"effort": "none", "exclude": True}
|
||||
if lowered.startswith("openai/") or "gpt-5" in lowered:
|
||||
effort = os.environ.get("OPENROUTER_GPT_POINT_REASONING_EFFORT", "low")
|
||||
return {"effort": effort, "exclude": True}
|
||||
return None
|
||||
|
||||
|
||||
def _force_json_response_format(model_id: str) -> bool:
|
||||
lowered = model_id.lower()
|
||||
if lowered.startswith("z-ai/glm-5v"):
|
||||
return False
|
||||
return lowered.startswith("z-ai/glm-") or lowered.startswith("openai/")
|
||||
|
||||
|
||||
def _max_tokens_for_point_call(model_id: str, default: int) -> int:
|
||||
lowered = model_id.lower()
|
||||
if lowered.startswith("openai/") or "gpt-5" in lowered:
|
||||
return max(default, GPT_POINT_MAX_TOKENS)
|
||||
return default
|
||||
|
||||
|
||||
def _is_null_content_length_response(raw: dict[str, Any]) -> bool:
|
||||
try:
|
||||
choice = raw["choices"][0]
|
||||
return (
|
||||
choice.get("finish_reason") == "length"
|
||||
and choice.get("message", {}).get("content") is None
|
||||
)
|
||||
except (KeyError, IndexError, TypeError, AttributeError):
|
||||
return False
|
||||
|
||||
|
||||
class _ImagePayload:
    """Image attachment for a chat request, with its (possibly resized) size."""

    def __init__(
        self,
        width: int,
        height: int,
        data_url: str,
        resized: bool,
    ) -> None:
        # Pixel dimensions of the encoded image as sent to the model.
        self.width = width
        self.height = height
        # data: URL carrying the image bytes for the image_url content part.
        self.data_url = data_url
        # True when the image was downscaled before encoding.
        self.resized = resized
|
||||
|
||||
|
||||
def _image_payload_for_model(
    model_id: str, image_path: Path, width: int, height: int
) -> _ImagePayload:
    """Encode the screenshot for *model_id*, downscaling for Claude models.

    Claude models get a long-edge cap (see _claude_max_long_edge); any
    other model receives the untouched file as a data URL.
    """
    max_long_edge = _claude_max_long_edge(model_id)
    if max_long_edge is None or max(width, height) <= max_long_edge:
        # No cap, or already small enough: send the original bytes.
        return _ImagePayload(
            width=width,
            height=height,
            data_url=image_data_url(image_path),
            resized=False,
        )

    Image, _, _ = require_pillow()
    with Image.open(image_path) as source:
        # convert() yields an independent copy, so the file can be closed.
        image = source.convert("RGB")
    # Uniform scale so the long edge lands exactly on the cap.
    scale = max_long_edge / max(width, height)
    image = image.resize(
        (max(1, round(width * scale)), max(1, round(height * scale))),
        Image.Resampling.LANCZOS,
    )
    buffer = io.BytesIO()
    image.save(buffer, format="PNG")
    encoded = base64.b64encode(buffer.getvalue()).decode("ascii")
    return _ImagePayload(
        width=image.width,
        height=image.height,
        data_url=f"data:image/png;base64,{encoded}",
        resized=True,
    )
|
||||
|
||||
|
||||
def _claude_max_long_edge(model_id: str) -> int | None:
|
||||
lowered = model_id.lower()
|
||||
if not lowered.startswith("anthropic/claude"):
|
||||
return None
|
||||
if "opus-4.7" in lowered or "opus-4-7" in lowered:
|
||||
return 2576
|
||||
return 1568
|
||||
|
||||
|
||||
def _rescaled_reply(
    text: str,
    raw: dict[str, Any],
    image_payload: _ImagePayload,
    original_width: int,
    original_height: int,
) -> ModelReply:
    """Map a point predicted on a resized image back to original pixels.

    If the model's text cannot be parsed into a point, it is passed through
    unchanged so the caller's normal parse-failure handling applies. The
    rewritten reply records both coordinate spaces and the raw text.
    """
    parsed = parse_point_response(text)
    point = parsed.point
    if point is None:
        return ModelReply(text=text, raw=raw)

    reply_fields = {
        "x": point.x * original_width / image_payload.width,
        "y": point.y * original_height / image_payload.height,
        "reason": parsed.reason or "OpenRouter image resized client-side",
        "display_x": point.x,
        "display_y": point.y,
        "display_width": image_payload.width,
        "display_height": image_payload.height,
        "original_width": original_width,
        "original_height": original_height,
        "raw_text": text,
    }
    return ModelReply(text=json.dumps(reply_fields), raw=raw)
|
||||
539
prototypes/click_eval/src/click_eval/parsing.py
Normal file
@@ -0,0 +1,539 @@
|
||||
from __future__ import annotations
|
||||
|
||||
import ast
|
||||
import json
|
||||
import math
|
||||
import re
|
||||
from typing import Any
|
||||
|
||||
from .contracts import ParsedPoint, Point
|
||||
|
||||
# Matches one signed integer or decimal number (no exponent notation).
_NUMBER_PATTERN = r"[-+]?(?:\d+(?:\.\d*)?|\.\d+)"
# Key names models commonly use to label a point/box value, optionally quoted.
_POINT_KEY_PATTERN = (
    r"['\"]?(?:click_point|point_2d|POINT_2D|POINT|bbox_2d|coordinates|"
    r"coordinate|coords|xy|screen_point|click_position|click_coordinates|"
    r"cursor_position|target_position|start_box|position|bbox|box|target|"
    r"point|center|location)['\"]?"
)
# Tokens that signal the surrounding text describes a click/tool action.
_ACTION_CONTEXT_PATTERN = (
    r"(?:tool_call|computer_use|pyautogui|left_click|right_click|double_click|"
    r"middle_click|mouse_move|tap|click|Action\.[A-Za-z_]+)"
)
|
||||
|
||||
|
||||
def parse_point_value(value: Any) -> Point | None:
    """Coerce an arbitrary parsed value (str/sequence/dict) into a Point.

    Handles bare coordinate pairs, four-number bounding boxes (reduced to
    their center), nested sequences, two corner points (midpoint), and
    dicts keyed by any of the common point/box key names. Returns None
    when no numeric point can be recovered.
    """
    if isinstance(value, Point):
        return value

    if isinstance(value, str):
        point = _point_from_keyed_text(value)
        if point is not None:
            return point
        return _point_from_standalone_numeric_text(value)

    if isinstance(value, (list, tuple)):
        if len(value) == 1:
            # Single wrapped value: unwrap and retry.
            return parse_point_value(value[0])
        if len(value) >= 4:
            # Four numbers are treated as a bbox; use its center.
            x1 = _to_float(value[0])
            y1 = _to_float(value[1])
            x2 = _to_float(value[2])
            y2 = _to_float(value[3])
            if None not in {x1, y1, x2, y2}:
                return Point(x=(x1 + x2) / 2, y=(y1 + y2) / 2)
        if len(value) >= 2:
            direct_point = _point_from_numbers(value[0], value[1])
            if direct_point is not None:
                return direct_point
        if len(value) >= 2 and isinstance(value[0], (list, tuple)):
            # Two corner points -> midpoint.
            first = parse_point_value(value[0])
            second = parse_point_value(value[1])
            if first is not None and second is not None:
                return Point(x=(first.x + second.x) / 2, y=(first.y + second.y) / 2)
        # Last resort: first parseable element wins.
        for item in value:
            point = parse_point_value(item)
            if point is not None:
                return point

    if isinstance(value, dict):
        if "x" in value and "y" in value:
            point = _point_from_numbers(value["x"], value["y"])
            if point is not None:
                return point
            # x/y may themselves hold nested point-like values.
            for key in ("x", "y"):
                point = parse_point_value(value[key])
                if point is not None:
                    return point
        # Probe known point/box keys in priority order.
        for key in (
            "point",
            "POINT",
            "click_point",
            "clickPoint",
            "target_point",
            "coordinate",
            "Coordinate",
            "coordinates",
            "coords",
            "xy",
            "point_2d",
            "POINT_2D",
            "bbox_2d",
            "click_position",
            "click_coordinates",
            "cursor_position",
            "target_position",
            "position",
            "bbox",
            "box",
            "start_box",
            "arguments",
        ):
            if key in value:
                point = parse_point_value(value[key])
                if point is not None:
                    return point

    return None
|
||||
|
||||
|
||||
def _strip_thinking(text: str) -> str:
|
||||
marker = "</think>"
|
||||
if marker in text:
|
||||
return text.split(marker, 1)[1].strip()
|
||||
return text
|
||||
|
||||
|
||||
def parse_point_response(text: str) -> ParsedPoint:
    """Parse a model's free-form reply into a ParsedPoint.

    Strategy: strip any <think> preamble, then for each structured
    candidate span (code fences, tool-call tags, balanced brackets) try a
    JSON/Python-literal parse followed by regex extraction; finally fall
    back to regex extraction over the whole text. The first error seen is
    kept for the failure diagnostic.
    """
    text = _strip_thinking(text)
    fallback_error: str | None = None

    for value_text in _structured_value_candidates(text):
        obj, error = _parse_structured_value(value_text)
        if error is not None:
            # Remember only the first structured-parse failure.
            fallback_error = fallback_error or error
            point = _point_from_text_fragment(value_text)
            if point is not None:
                return ParsedPoint(point=point)
            continue

        point = parse_point_value(obj)
        if point is not None:
            # Carry the model's own rationale through when it supplied one.
            reason = obj.get("reason") if isinstance(obj, dict) else None
            return ParsedPoint(
                point=point, reason=str(reason) if reason is not None else None
            )

        point = _point_from_text_fragment(value_text)
        if point is not None:
            return ParsedPoint(point=point)
        fallback_error = fallback_error or "response did not contain numeric x/y"

    # Last resort: regex scan of the full reply.
    point = _point_from_text_fragment(text)
    if point is not None:
        return ParsedPoint(point=point)

    return ParsedPoint(
        point=None,
        error=_parse_failure_message(text, fallback_error),
    )
|
||||
|
||||
|
||||
def _point_from_numbers(x_value: Any, y_value: Any) -> Point | None:
|
||||
x = _to_float(x_value)
|
||||
y = _to_float(y_value)
|
||||
if x is None or y is None:
|
||||
return None
|
||||
|
||||
if not math.isfinite(x) or not math.isfinite(y):
|
||||
return None
|
||||
|
||||
return Point(x=x, y=y)
|
||||
|
||||
|
||||
def _to_float(value: Any) -> float | None:
|
||||
try:
|
||||
return float(value)
|
||||
except (TypeError, ValueError):
|
||||
return None
|
||||
|
||||
|
||||
def _point_from_text_fragment(text: str) -> Point | None:
    """Try each textual extraction strategy in priority order.

    Keyed labels beat action context, which beats prose coordinates, which
    beats a bare standalone numeric string.
    """
    strategies = (
        _point_from_keyed_text,
        _point_from_action_context,
        _point_from_coordinate_context,
        _point_from_standalone_numeric_text,
    )
    for strategy in strategies:
        candidate = strategy(text)
        if candidate is not None:
            return candidate
    return None
|
||||
|
||||
|
||||
def _point_from_keyed_text(text: str) -> Point | None:
    """Extract a point from text using key/label-driven regex heuristics.

    Tried in order: pyautogui-style click(...) calls, a known point key
    followed by numbers, a keyed "[x, y]" sequence, a keyed bare pair,
    separate x:/y: assignments, a malformed '"x": 1, 2' shape, and an
    action verb followed by a coordinate pair.
    """
    # pyautogui.click(x=.., y=..) or plain click(.., ..).
    click_call = re.search(
        rf"(?:pyautogui\.)?click\s*\(\s*(?:x\s*=\s*)?({_NUMBER_PATTERN})"
        rf"\s*,\s*(?:y\s*=\s*)?({_NUMBER_PATTERN})",
        text,
        flags=re.IGNORECASE,
    )
    if click_call:
        return _point_from_numbers(click_call.group(1), click_call.group(2))

    keyed_numbers = _point_from_keyed_numeric_tail(text)
    if keyed_numbers is not None:
        return keyed_numbers

    # key: [1, 2] / key = (1, 2, 3, 4) — 2 to 4 numbers after a known key.
    keyed_sequence = re.search(
        rf"(?<![A-Za-z0-9_]){_POINT_KEY_PATTERN}"
        rf"(?![A-Za-z0-9_])\s*(?::|=)?\s*['\"]?[\[(]\s*"
        rf"({_NUMBER_PATTERN}(?:\s*,\s*{_NUMBER_PATTERN}){{1,3}})",
        text,
        flags=re.IGNORECASE,
    )
    if keyed_sequence:
        numbers = re.findall(_NUMBER_PATTERN, keyed_sequence.group(1))
        point = parse_point_value(numbers[:4])
        if point is not None:
            return point

    # key: 12, 34 — unbracketed pair right after a known key.
    keyed_pair = re.search(
        rf"(?<![A-Za-z0-9_]){_POINT_KEY_PATTERN}"
        rf"(?![A-Za-z0-9_])\s*(?::|=)?\s*"
        rf"({_NUMBER_PATTERN})\s*,\s*({_NUMBER_PATTERN})",
        text,
        flags=re.IGNORECASE,
    )
    if keyed_pair:
        return _point_from_numbers(keyed_pair.group(1), keyed_pair.group(2))

    xy_point = _point_from_xy_text(text)
    if xy_point is not None:
        return xy_point

    # Malformed JSON like {"x": 12, 34} — take the two numbers after "x".
    malformed_x_pair = re.search(
        rf"(?<![A-Za-z0-9_])['\"]?x['\"]?(?![A-Za-z0-9_])"
        rf"\s*(?::|=)\s*({_NUMBER_PATTERN})\s*,\s*['\"]?\s*({_NUMBER_PATTERN})"
        rf"(?=\s*(?:['\"]?\s*,|\}}|\]|\)|$))",
        text,
        flags=re.IGNORECASE,
    )
    if malformed_x_pair:
        return _point_from_numbers(malformed_x_pair.group(1), malformed_x_pair.group(2))

    # "click at 12, 34" style prose with a short gap before the numbers.
    action_pair = re.search(
        rf"(?:left_click|double_click|right_click|mouse_move|click|tap)"
        rf"\s*(?:at|to|\(|coordinate|coordinates)\s*[^0-9+\-.]{{0,80}}"
        rf"({_NUMBER_PATTERN})\s*[, ]\s*({_NUMBER_PATTERN})",
        text,
        flags=re.IGNORECASE,
    )
    if action_pair:
        return _point_from_numbers(action_pair.group(1), action_pair.group(2))

    return None
|
||||
|
||||
|
||||
def _point_from_action_context(text: str) -> Point | None:
    """Extract a point from text that mentions a click/tool action.

    Bails out early when no action keyword is present, so bare number
    pairs elsewhere in prose are not misread as coordinates.
    """
    if re.search(_ACTION_CONTEXT_PATTERN, text, flags=re.IGNORECASE) is None:
        return None

    for strategy in (
        _point_from_action_windows,
        _point_from_tagged_tool_calls,
        _point_from_bracketed_pairs,
    ):
        candidate = strategy(text)
        if candidate is not None:
            return candidate
    return None
|
||||
|
||||
|
||||
def _point_from_action_windows(text: str) -> Point | None:
    """Scan a 260-char window after each action keyword for coordinates.

    Click-specific keywords are tried before the broader action-context
    pattern; within each window, keyed numbers beat bracketed pairs, which
    beat colon pairs, which beat separate x/y assignments.
    """
    patterns = (
        r"(?:left_click|right_click|double_click|middle_click|tap|click|"
        r"Action\.[A-Za-z_]*CLICK[A-Za-z_]*)",
        _ACTION_CONTEXT_PATTERN,
    )
    for pattern in patterns:
        for match in re.finditer(pattern, text, flags=re.IGNORECASE):
            # Look only just past the keyword so unrelated numbers are ignored.
            window = text[match.start() : match.end() + 260]
            point = _point_from_keyed_numeric_tail(window)
            if point is not None:
                return point
            point = _point_from_bracketed_pairs(window)
            if point is not None:
                return point
            point = _point_from_colon_numeric_pair(window)
            if point is not None:
                return point
            point = _point_from_xy_text(window)
            if point is not None:
                return point
    return None
|
||||
|
||||
|
||||
def _point_from_tagged_tool_calls(text: str) -> Point | None:
    """Look inside <tool_call>...</tool_call> spans for coordinates."""
    tool_call_pattern = re.compile(
        r"<tool_call>\s*(.*?)\s*</tool_call>", re.DOTALL | re.IGNORECASE
    )
    for tagged in tool_call_pattern.finditer(text):
        inner = tagged.group(1)
        for extract in (_point_from_keyed_numeric_tail, _point_from_bracketed_pairs):
            candidate = extract(inner)
            if candidate is not None:
                return candidate
    return None
|
||||
|
||||
|
||||
def _point_from_bracketed_pairs(text: str) -> Point | None:
    """Interpret the first parseable "[x, y]" / "(x, y, ...)" group as a point."""
    group_pattern = (
        rf"[\[(]\s*({_NUMBER_PATTERN}(?:\s*[, ]\s*{_NUMBER_PATTERN}){{1,3}})"
    )
    for match in re.finditer(group_pattern, text):
        numbers = re.findall(_NUMBER_PATTERN, match.group(1))
        candidate = parse_point_value(numbers[:4])
        if candidate is not None:
            return candidate
    return None
|
||||
|
||||
|
||||
def _point_from_colon_numeric_pair(text: str) -> Point | None:
    """Match "...: 12, 34" or "(12, 34" pairs terminated by a closer or comma."""
    pair_pattern = (
        rf"(?::|[\[(])\s*({_NUMBER_PATTERN})\s*,\s*({_NUMBER_PATTERN})"
        rf"(?=\s*(?:[\]}}\)]|,|$))"
    )
    for match in re.finditer(pair_pattern, text):
        candidate = _point_from_numbers(match.group(1), match.group(2))
        if candidate is not None:
            return candidate
    return None
|
||||
|
||||
|
||||
def _point_from_coordinate_context(text: str) -> Point | None:
    """Extract a pair from prose like "the coordinates are (120, 340)".

    First pattern: a coordinate-ish noun, a short gap, a relation word or
    ":"/"=", then the pair. Second pattern: a locative word ("at", "near",
    ...) directly followed by a bracketed pair.
    """
    context = (
        r"(?:coordinates?|coords?|point|position|center|centre|location|"
        r"target|click(?:\s+(?:point|position|coordinate|location))?)"
    )
    relation = (
        r"(?:(?:is|are|at|to|around|approximately|approx\.?|about|near|"
        r"would\s+be|should\s+be)\b|=|:)"
    )
    patterns = (
        rf"\b{context}\b[^\n\r.;]{{0,120}}\b{relation}\s*"
        rf"[\[(]?\s*(?:x\s*(?:=|:)\s*)?({_NUMBER_PATTERN})\s*,\s*"
        rf"(?:y\s*(?:=|:)\s*)?({_NUMBER_PATTERN})",
        rf"\b(?:at|around|approximately|approx\.?|about|near)\b"
        rf"[^\n\r.;]{{0,40}}[\[(]\s*({_NUMBER_PATTERN})\s*,\s*"
        rf"({_NUMBER_PATTERN})",
    )
    for pattern in patterns:
        for match in re.finditer(pattern, text, flags=re.IGNORECASE):
            point = _point_from_numbers(match.group(1), match.group(2))
            if point is not None:
                return point
    return None
|
||||
|
||||
|
||||
def _point_from_keyed_numeric_tail(text: str) -> Point | None:
    """After each known point key, parse numbers from the following 220 chars.

    The key must be followed by an assignment-ish token (":", "=", "(",
    "is", "are", "at") so stray key words in prose are skipped.
    """
    for match in re.finditer(
        rf"(?<![A-Za-z0-9_]){_POINT_KEY_PATTERN}(?![A-Za-z0-9_])",
        text,
        flags=re.IGNORECASE,
    ):
        tail = text[match.end() : match.end() + 220]
        assigned = re.match(r"\s*(?::|=|\(|is\b|are\b|at\b)\s*", tail)
        if assigned is None:
            continue
        point = _point_from_numeric_text(tail[assigned.end() :])
        if point is not None:
            return point
    return None
|
||||
|
||||
|
||||
def _point_from_standalone_numeric_text(text: str) -> Point | None:
|
||||
stripped = text.strip().strip("`")
|
||||
if not re.fullmatch(r"[\s\[\]\(\)\{\},;:'\"xXyY=.+\-0-9]+", stripped):
|
||||
return None
|
||||
return _point_from_numeric_text(stripped)
|
||||
|
||||
|
||||
def _point_from_numeric_text(text: str) -> Point | None:
    """Pull a point out of a mostly-numeric fragment (first 220 chars).

    Preference order: explicit x/y assignments, constructor-wrapped
    numbers (e.g. "float(3)"), a bracketed number sequence, then any bare
    numbers — four or more are treated as a bbox, two or more as a pair.
    """
    segment = text[:220]
    xy_point = _point_from_xy_text(segment)
    if xy_point is not None:
        return xy_point

    wrapped_numbers = _constructor_wrapped_numbers(segment)
    if len(wrapped_numbers) >= 2:
        return parse_point_value(wrapped_numbers[:4])

    # Prefer numbers grouped by an opening bracket over scattered numbers.
    sequence = re.search(
        rf"[\[(]\s*({_NUMBER_PATTERN}(?:\s*[, ]\s*{_NUMBER_PATTERN}){{1,3}})",
        segment,
    )
    numbers = (
        re.findall(_NUMBER_PATTERN, sequence.group(1))
        if sequence
        else re.findall(_NUMBER_PATTERN, segment)
    )
    if len(numbers) >= 4:
        return parse_point_value(numbers[:4])
    if len(numbers) >= 2:
        return parse_point_value(numbers[:2])
    return None
|
||||
|
||||
|
||||
def _constructor_wrapped_numbers(text: str) -> list[str]:
    """Collect single numbers wrapped in constructor-like calls.

    Matches shapes such as "float(3)" or "np.float64(12.5)" and returns
    the inner number strings in order of appearance.
    """
    call_pattern = re.compile(
        rf"(?:[A-Za-z_][\w]*\.)?[A-Za-z_][\w]*\(\s*({_NUMBER_PATTERN})\s*\)"
    )
    return [match.group(1) for match in call_pattern.finditer(text)]
|
||||
|
||||
|
||||
def _point_from_xy_text(text: str) -> Point | None:
    """Find separate x and y assignments ("x: 12 ... y = 34") anywhere in text.

    Each coordinate may be optionally quoted and optionally wrapped in a
    constructor-like call; both must be present for a point to result.
    """
    x_match = re.search(
        rf"(?<![A-Za-z0-9_])['\"]?x['\"]?(?![A-Za-z0-9_])"
        rf"\s*(?::|=|is\b)\s*(?:[A-Za-z_][\w.]*\(\s*)?({_NUMBER_PATTERN})",
        text,
        flags=re.IGNORECASE,
    )
    y_match = re.search(
        rf"(?<![A-Za-z0-9_])['\"]?y['\"]?(?![A-Za-z0-9_])"
        rf"\s*(?::|=|is\b)\s*(?:[A-Za-z_][\w.]*\(\s*)?({_NUMBER_PATTERN})",
        text,
        flags=re.IGNORECASE,
    )
    if x_match and y_match:
        return _point_from_numbers(x_match.group(1), y_match.group(1))
    return None
|
||||
|
||||
|
||||
def _parse_structured_value(text: str) -> tuple[Any | None, str | None]:
|
||||
try:
|
||||
return json.loads(text), None
|
||||
except json.JSONDecodeError as json_error:
|
||||
try:
|
||||
return ast.literal_eval(text), None
|
||||
except (SyntaxError, ValueError) as literal_error:
|
||||
return (
|
||||
None,
|
||||
f"invalid JSON/Python literal: {json_error.msg}; {literal_error}",
|
||||
)
|
||||
|
||||
|
||||
def _parse_failure_message(text: str, structured_error: str | None) -> str:
    """Summarize why no point was extracted, with a compact raw preview."""
    parts = ["response did not contain a numeric point value"]
    if structured_error:
        parts.append("structured JSON/Python parse failed")
    return f"{'; '.join(parts)}; raw preview: {_raw_preview(text)}"
|
||||
|
||||
|
||||
def _raw_preview(text: str, max_chars: int = 240) -> str:
|
||||
compact = " ".join(text.strip().split())
|
||||
if len(compact) <= max_chars:
|
||||
return compact
|
||||
return f"{compact[: max_chars - 1]}..."
|
||||
|
||||
|
||||
def _structured_value_candidates(text: str) -> list[str]:
    """List stripped, de-duplicated spans likely to hold a structured value.

    In priority order: fenced code blocks, <tool_call> spans, answer/final/
    json tags, point/box sentinel tokens, point/box tags, then every
    balanced {...}, [...], and (...) span in the text.
    """
    candidates: list[str] = []
    seen: set[str] = set()

    def add(value: str | None) -> None:
        # Keep first occurrence only; skip empty spans.
        if value is None:
            return
        stripped = value.strip()
        if not stripped or stripped in seen:
            return
        seen.add(stripped)
        candidates.append(stripped)

    for fenced in re.finditer(r"```(?:json)?\s*(.*?)\s*```", text, flags=re.DOTALL):
        add(fenced.group(1))

    for tagged in re.finditer(
        r"<tool_call>\s*(.*?)\s*</tool_call>",
        text,
        flags=re.DOTALL | re.IGNORECASE,
    ):
        add(tagged.group(1))

    for tagged in re.finditer(
        r"<(?:answer|final|json)[^>]*>\s*(.*?)\s*</(?:answer|final|json)>",
        text,
        flags=re.DOTALL | re.IGNORECASE,
    ):
        add(tagged.group(1))

    for tagged in re.finditer(
        r"<\|(?:point|box)_start\|>\s*(.*?)\s*<\|(?:point|box)_end\|>",
        text,
        flags=re.DOTALL | re.IGNORECASE,
    ):
        add(tagged.group(1))

    for tagged in re.finditer(
        r"<(?:point|box)[^>]*>\s*(.*?)\s*</(?:point|box)>",
        text,
        flags=re.DOTALL | re.IGNORECASE,
    ):
        add(tagged.group(1))

    for opener, closer in (("{", "}"), ("[", "]"), ("(", ")")):
        for value in _balanced_values(text, opener, closer):
            add(value)

    return candidates
|
||||
|
||||
|
||||
def _balanced_values(text: str, opener: str, closer: str) -> list[str]:
    """Return every balanced ``opener``...``closer`` span found in *text*, left to right."""
    spans: list[str] = []
    cursor = 0
    while (start := text.find(opener, cursor)) != -1:
        span = _balanced_from(text, start, opener, closer)
        if span is None:
            # Unmatched opener: step past it and keep scanning.
            cursor = start + 1
        else:
            spans.append(span)
            cursor = start + len(span)
    return spans
|
||||
|
||||
|
||||
def _balanced_from(text: str, start: int, opener: str, closer: str) -> str | None:
|
||||
depth = 0
|
||||
in_string = False
|
||||
escaped = False
|
||||
string_quote = ""
|
||||
|
||||
for index in range(start, len(text)):
|
||||
char = text[index]
|
||||
|
||||
if in_string:
|
||||
if escaped:
|
||||
escaped = False
|
||||
elif char == "\\":
|
||||
escaped = True
|
||||
elif char == string_quote:
|
||||
in_string = False
|
||||
continue
|
||||
|
||||
if char in {"'", '"'}:
|
||||
in_string = True
|
||||
string_quote = char
|
||||
elif char == opener:
|
||||
depth += 1
|
||||
elif char == closer:
|
||||
depth -= 1
|
||||
if depth == 0:
|
||||
return text[start : index + 1]
|
||||
|
||||
return None
|
||||
93
prototypes/click_eval/src/click_eval/providers.py
Normal file
@@ -0,0 +1,93 @@
|
||||
from __future__ import annotations
|
||||
|
||||
from contextlib import nullcontext
|
||||
from pathlib import Path
|
||||
from typing import Callable
|
||||
|
||||
from .contracts import ModelReply, ModelSpec
|
||||
from .gemini import GeminiComputerUseClient
|
||||
from .local_hf import LocalHFClient
|
||||
from .moondream import MoondreamClient
|
||||
from .openai_cu import OpenAIComputerUseClient
|
||||
from .openrouter import OpenRouterClient
|
||||
|
||||
|
||||
class ProviderClient:
    """Routes point-prediction requests to the right provider backend.

    Backend clients are created lazily on first use and then cached, so a run
    that never touches e.g. Gemini never constructs that client.
    """

    def __init__(
        self,
        timeout_seconds: int = 90,
        log_callback: Callable[[str], None] | None = None,
    ) -> None:
        self.timeout_seconds = timeout_seconds
        self._log_callback = log_callback
        # Lazily-built backend clients, one slot per provider.
        self._openrouter: OpenRouterClient | None = None
        self._moondream: MoondreamClient | None = None
        self._gemini: GeminiComputerUseClient | None = None
        self._openai_cu: OpenAIComputerUseClient | None = None
        self._local_hf: LocalHFClient | None = None

    def predict_point(
        self,
        model: ModelSpec,
        image_path: Path,
        instruction: str,
        purpose: str,
    ) -> ModelReply:
        """Ask *model* for a click point.

        Raises:
            RuntimeError: if ``model.provider`` is not a known provider name.
        """
        provider = model.provider.lower()
        if provider == "local_hf":
            # local_hf receives the full spec rather than just the model id.
            return self._local_hf_client().predict_point(
                model, image_path, instruction, purpose
            )
        remote_clients = {
            "openrouter": self._openrouter_client,
            "moondream": self._moondream_client,
            "gemini": self._gemini_client,
            "openai_computer_use": self._openai_cu_client,
        }
        client_factory = remote_clients.get(provider)
        if client_factory is None:
            raise RuntimeError(f"Unsupported model provider: {model.provider}")
        return client_factory().predict_point(
            model.model_id, image_path, instruction, purpose
        )

    def model_run_context(self, model: ModelSpec):
        """Context manager scoping one model's run; keeps local HF weights resident."""
        if model.provider.lower() == "local_hf":
            return self._local_hf_client().retain_loaded_models()
        return nullcontext()

    def _openrouter_client(self) -> OpenRouterClient:
        # Each _*_client helper builds its backend once and reuses it.
        if self._openrouter is None:
            self._openrouter = OpenRouterClient(timeout_seconds=self.timeout_seconds)
        return self._openrouter

    def _moondream_client(self) -> MoondreamClient:
        if self._moondream is None:
            self._moondream = MoondreamClient(timeout_seconds=self.timeout_seconds)
        return self._moondream

    def _gemini_client(self) -> GeminiComputerUseClient:
        if self._gemini is None:
            self._gemini = GeminiComputerUseClient(timeout_seconds=self.timeout_seconds)
        return self._gemini

    def _openai_cu_client(self) -> OpenAIComputerUseClient:
        if self._openai_cu is None:
            self._openai_cu = OpenAIComputerUseClient(
                timeout_seconds=self.timeout_seconds
            )
        return self._openai_cu

    def _local_hf_client(self) -> LocalHFClient:
        if self._local_hf is None:
            self._local_hf = LocalHFClient(
                timeout_seconds=self.timeout_seconds,
                log_callback=self._log_callback,
            )
        return self._local_hf
|
||||
101
prototypes/click_eval/src/click_eval/scoring.py
Normal file
@@ -0,0 +1,101 @@
|
||||
from __future__ import annotations
|
||||
|
||||
import math
|
||||
import statistics
|
||||
from collections import defaultdict
|
||||
from typing import Any
|
||||
|
||||
from .contracts import Point
|
||||
|
||||
# Column order for the per-task score CSV.
# NOTE(review): "duration_seconds" and "skipped" are not populated by
# score_point below; presumably the run loop adds them to each row before
# writing — confirm against the caller.
SCORE_FIELDNAMES = [
    "task_id",
    "model",
    "gt_x",
    "gt_y",
    "pred_x",
    "pred_y",
    "dx",
    "dy",
    "l2",
    "duration_seconds",
    "skipped",
    "error",
]
|
||||
|
||||
|
||||
def score_point(
    task_id: str,
    model_name: str,
    gt: Point | None,
    pred: Point | None,
    error: str | None = None,
) -> dict[str, Any]:
    """Build one score row comparing a predicted point against ground truth.

    Coordinate and distance fields stay empty strings when the corresponding
    point is missing; when both points are present the row carries the signed
    deltas and the Euclidean (L2) distance.
    """
    row: dict[str, Any] = {
        "task_id": task_id,
        "model": model_name,
        "gt_x": "" if gt is None else gt.x,
        "gt_y": "" if gt is None else gt.y,
        "pred_x": "",
        "pred_y": "",
        "dx": "",
        "dy": "",
        "l2": "",
        "error": error or "",
    }

    if pred is None:
        return row

    row["pred_x"] = pred.x
    row["pred_y"] = pred.y
    if gt is None:
        return row

    delta_x = pred.x - gt.x
    delta_y = pred.y - gt.y
    row["dx"] = delta_x
    row["dy"] = delta_y
    row["l2"] = math.hypot(delta_x, delta_y)
    return row
|
||||
|
||||
|
||||
def summarize_scores(score_rows: list[dict[str, Any]]) -> dict[str, Any]:
    """Aggregate per-task score rows into per-model summary statistics.

    For each model: total row count, scored/skipped/error counts, and the
    mean/median of the L2 distance and request duration (``None`` when no
    data was available for that statistic).
    """
    by_model: dict[str, list[dict[str, Any]]] = defaultdict(list)
    for row in score_rows:
        by_model[str(row["model"])].append(row)

    summary: dict[str, Any] = {}
    for model_name, model_rows in by_model.items():
        l2_values = [float(row["l2"]) for row in model_rows if row["l2"] != ""]
        duration_values = [
            float(row["duration_seconds"])
            for row in model_rows
            if row.get("duration_seconds") not in {"", None}
        ]
        skipped_count = sum(1 for row in model_rows if row.get("skipped") is True)
        # Skipped rows are excluded from the error count even if they carry a message.
        error_count = sum(
            1
            for row in model_rows
            if row.get("error") and row.get("skipped") is not True
        )
        summary[model_name] = {
            "total": len(model_rows),
            "scored": len(l2_values),
            "skipped": skipped_count,
            "errors": error_count,
            "mean_l2": statistics.fmean(l2_values) if l2_values else None,
            "median_l2": statistics.median(l2_values) if l2_values else None,
            "mean_duration_seconds": (
                statistics.fmean(duration_values) if duration_values else None
            ),
            "median_duration_seconds": (
                statistics.median(duration_values) if duration_values else None
            ),
        }

    return summary
|
||||
116
prototypes/click_eval/src/click_eval/viz.py
Normal file
@@ -0,0 +1,116 @@
|
||||
from __future__ import annotations
|
||||
|
||||
from pathlib import Path
|
||||
from typing import Any
|
||||
|
||||
from .contracts import Point
|
||||
from .image_utils import require_pillow
|
||||
|
||||
# Marker palette for model predictions; cycled via index % len(COLORS) when
# there are more models than colors.
COLORS = [
    (56, 132, 255),
    (255, 149, 0),
    (175, 82, 222),
    (255, 45, 85),
    (90, 200, 250),
    (255, 204, 0),
]
# Color of the ground-truth ("GT") marker.
GT_COLOR = (20, 180, 70)
# Palette for judge-provided reference points; cycled like COLORS.
JUDGE_COLORS = [
    (0, 128, 128),
    (180, 90, 0),
    (120, 80, 220),
]
|
||||
|
||||
|
||||
def annotate_image(
    image_path: Path,
    output_path: Path,
    gt_point: Point | None,
    predictions: list[dict[str, Any]],
    judge_points: list[dict[str, Any]] | None = None,
) -> None:
    """Render GT, judge, and prediction markers onto a copy of the screenshot.

    The annotated RGB image is written to *output_path* (parent directories
    are created). A legend box in the top-left lists every marker with its L2
    distance, or the error text when a point is missing. Predictions are
    drawn in ascending-L2 order so the best result gets label "1".
    """
    Image, ImageDraw, ImageFont = require_pillow()
    with Image.open(image_path) as source:
        canvas = source.convert("RGB")

    draw = ImageDraw.Draw(canvas)
    font = ImageFont.load_default()
    legend: list[tuple[str, tuple[int, int, int]]] = []

    if gt_point is not None:
        legend.append(("GT", GT_COLOR))
        _draw_marker(draw, gt_point, GT_COLOR, "GT")

    for position, judge in enumerate(judge_points or []):
        color = JUDGE_COLORS[position % len(JUDGE_COLORS)]
        label = str(judge.get("label") or f"GT{position + 1}")
        judge_model = str(judge.get("model") or f"judge-{position + 1}")
        judge_point = judge.get("point")
        judge_l2 = judge.get("l2")
        if not isinstance(judge_point, Point):
            reason = str(judge.get("error") or "no point")
            legend.append((f"{label} {judge_model}: {_short_text(reason)}", color))
            continue
        _draw_marker(draw, judge_point, color, label)
        distance = "n/a" if judge_l2 is None else f"{float(judge_l2):.1f}px"
        legend.append((f"{label} {judge_model}: {distance}", color))

    for position, prediction in enumerate(_sort_predictions_by_l2(predictions)):
        color = COLORS[position % len(COLORS)]
        pred_model = str(prediction["model"])
        pred_point = prediction.get("point")
        pred_l2 = prediction.get("l2")
        if not isinstance(pred_point, Point):
            legend.append((f"{position + 1} {pred_model}: error", color))
            continue
        _draw_marker(draw, pred_point, color, str(position + 1))
        distance = "n/a" if pred_l2 is None else f"{float(pred_l2):.1f}px"
        legend.append((f"{position + 1} {pred_model}: {distance}", color))

    _draw_legend(draw, legend, font)
    output_path.parent.mkdir(parents=True, exist_ok=True)
    canvas.save(output_path)
|
||||
|
||||
|
||||
def _sort_predictions_by_l2(predictions: list[dict[str, Any]]) -> list[dict[str, Any]]:
|
||||
def sort_key(item: tuple[int, dict[str, Any]]) -> tuple[int, float, int]:
|
||||
index, prediction = item
|
||||
l2 = prediction.get("l2")
|
||||
if l2 is None:
|
||||
return (1, 0.0, index)
|
||||
try:
|
||||
return (0, float(l2), index)
|
||||
except (TypeError, ValueError):
|
||||
return (1, 0.0, index)
|
||||
|
||||
return [prediction for _, prediction in sorted(enumerate(predictions), key=sort_key)]
|
||||
|
||||
|
||||
def _draw_marker(draw, point: Point, color: tuple[int, int, int], label: str) -> None:
|
||||
x = int(round(point.x))
|
||||
y = int(round(point.y))
|
||||
radius = 8
|
||||
draw.ellipse((x - radius, y - radius, x + radius, y + radius), outline=color, width=3)
|
||||
draw.line((x - 12, y, x + 12, y), fill=color, width=2)
|
||||
draw.line((x, y - 12, x, y + 12), fill=color, width=2)
|
||||
draw.text((x + 10, y + 8), label, fill=color)
|
||||
|
||||
|
||||
def _draw_legend(draw, lines, font) -> None:
|
||||
if not lines:
|
||||
return
|
||||
padding = 6
|
||||
line_height = 14
|
||||
width = max(90, max(len(text) for text, _ in lines) * 7 + padding * 2)
|
||||
height = padding * 2 + line_height * len(lines)
|
||||
draw.rectangle((0, 0, width, height), fill=(255, 255, 255), outline=(30, 30, 30))
|
||||
for index, (text, color) in enumerate(lines):
|
||||
y = padding + index * line_height
|
||||
draw.text((padding, y), text, fill=color, font=font)
|
||||
|
||||
|
||||
def _short_text(text: str, max_chars: int = 80) -> str:
|
||||
compact = " ".join(text.split())
|
||||
if len(compact) <= max_chars:
|
||||
return compact
|
||||
return compact[: max_chars - 1] + "..."
|
||||
53
prototypes/click_eval/tests/test_local_hf.py
Normal file
@@ -0,0 +1,53 @@
|
||||
from types import SimpleNamespace
|
||||
|
||||
from click_eval.local_hf import (
|
||||
_patch_qwen25_mrope_section,
|
||||
_qwen25_manual_prompt_text,
|
||||
)
|
||||
|
||||
|
||||
def test_qwen25_manual_prompt_text_includes_image_placeholder():
    """The manual prompt embeds the vision placeholder and ends at the assistant turn."""
    messages = [
        {"role": "system", "content": "system prompt"},
        {
            "role": "user",
            "content": [
                {"type": "image", "image": object()},
                {"type": "text", "text": "click the button"},
            ],
        },
    ]

    prompt = _qwen25_manual_prompt_text(messages)

    assert "<|vision_start|><|image_pad|><|vision_end|>" in prompt
    assert prompt.endswith("<|im_start|>assistant\n")
    assert "click the button" in prompt
|
||||
|
||||
|
||||
def test_patch_qwen25_mrope_section_adds_missing_default():
    """Patching fills in the Qwen2.5 default mrope_section when it is absent."""
    rope_config = SimpleNamespace(rope_scaling={"type": "default"})
    stub_model = SimpleNamespace(
        config=SimpleNamespace(text_config=rope_config),
        model=None,
        language_model=None,
    )

    _patch_qwen25_mrope_section(stub_model)

    assert rope_config.rope_scaling["mrope_section"] == [16, 24, 24]
|
||||
|
||||
|
||||
def test_patch_qwen25_mrope_section_preserves_existing_value():
    """Patching must not overwrite an mrope_section that is already configured."""
    rope_config = SimpleNamespace(
        rope_scaling={"type": "default", "mrope_section": [1, 2, 3]}
    )
    stub_model = SimpleNamespace(
        config=SimpleNamespace(text_config=rope_config),
        model=None,
        language_model=None,
    )

    _patch_qwen25_mrope_section(stub_model)

    assert rope_config.rope_scaling["mrope_section"] == [1, 2, 3]
|
||||