Skip to content

Commit e166298

Browse files
committed
Add VLM-based element locator (Anthropic/OpenAI)
1 parent 519d28f commit e166298

17 files changed

Lines changed: 742 additions & 0 deletions

File tree

je_auto_control/__init__.py

Lines changed: 6 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -45,6 +45,10 @@
4545
click_accessibility_element, find_accessibility_element,
4646
list_accessibility_elements,
4747
)
48+
# VLM element locator (headless)
49+
from je_auto_control.utils.vision import (
50+
VLMNotAvailableError, click_by_description, locate_by_description,
51+
)
4852
# Clipboard (headless)
4953
from je_auto_control.utils.clipboard.clipboard import (
5054
get_clipboard, set_clipboard,
@@ -221,6 +225,8 @@ def start_autocontrol_gui(*args, **kwargs):
221225
"AccessibilityElement", "AccessibilityNotAvailableError",
222226
"list_accessibility_elements", "find_accessibility_element",
223227
"click_accessibility_element",
228+
# VLM locator
229+
"VLMNotAvailableError", "locate_by_description", "click_by_description",
224230
"generate_html", "generate_html_report", "generate_json", "generate_json_report", "generate_xml",
225231
"generate_xml_report", "get_dir_files_as_list", "create_project_dir", "start_autocontrol_socket_server",
226232
"callback_executor", "package_manager", "ShellManager", "default_shell_manager",

je_auto_control/gui/language_wrapper/english.py

Lines changed: 14 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -22,6 +22,7 @@
2222
"tab_report": "Report",
2323
"tab_run_history": "Run History",
2424
"tab_accessibility": "Accessibility",
25+
"tab_vlm": "AI Locator",
2526

2627
# Auto Click Tab
2728
"interval_time": "Interval (ms):",
@@ -299,6 +300,19 @@
299300
"a11y_no_selection": "Select a row first",
300301
"a11y_click_not_found": "No matching element to click",
301302

303+
# VLM (AI Locator) Tab
304+
"vlm_desc_label": "Describe:",
305+
"vlm_desc_placeholder": "e.g. the green Submit button",
306+
"vlm_model_label": "Model:",
307+
"vlm_model_placeholder": "optional override (e.g. claude-opus-4-7)",
308+
"vlm_locate": "Locate",
309+
"vlm_click": "Locate & click",
310+
"vlm_result": "Match at ({x}, {y})",
311+
"vlm_ok": "OK",
312+
"vlm_not_found": "No match found",
313+
"vlm_error": "Error",
314+
"vlm_desc_required": "Describe the target first",
315+
302316
# Menu bar
303317
"menu_file": "File",
304318
"menu_file_open_script": "Open Script...",

je_auto_control/gui/language_wrapper/japanese.py

Lines changed: 14 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -21,6 +21,7 @@
2121
"tab_report": "レポート",
2222
"tab_run_history": "実行履歴",
2323
"tab_accessibility": "アクセシビリティ",
24+
"tab_vlm": "AI ロケーター",
2425

2526
# Auto Click Tab
2627
"interval_time": "間隔 (ms):",
@@ -298,6 +299,19 @@
298299
"a11y_no_selection": "行を選択してください",
299300
"a11y_click_not_found": "クリック可能な要素が見つかりません",
300301

302+
# VLM (AI Locator) Tab
303+
"vlm_desc_label": "説明:",
304+
"vlm_desc_placeholder": "例: 緑色の送信ボタン",
305+
"vlm_model_label": "モデル:",
306+
"vlm_model_placeholder": "任意 (例: claude-opus-4-7)",
307+
"vlm_locate": "検索",
308+
"vlm_click": "検索してクリック",
309+
"vlm_result": "位置: ({x}, {y})",
310+
"vlm_ok": "完了",
311+
"vlm_not_found": "一致する要素が見つかりません",
312+
"vlm_error": "エラー",
313+
"vlm_desc_required": "まず対象の説明を入力してください",
314+
301315
# Menu bar
302316
"menu_file": "ファイル",
303317
"menu_file_open_script": "スクリプトを開く...",

je_auto_control/gui/language_wrapper/simplified_chinese.py

Lines changed: 14 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -21,6 +21,7 @@
2121
"tab_report": "报告生成",
2222
"tab_run_history": "执行记录",
2323
"tab_accessibility": "无障碍树",
24+
"tab_vlm": "AI 定位",
2425

2526
# Auto Click Tab
2627
"interval_time": "间隔时间 (ms):",
@@ -298,6 +299,19 @@
298299
"a11y_no_selection": "请先选择一行",
299300
"a11y_click_not_found": "找不到可点击的元素",
300301

302+
# VLM (AI Locator) Tab
303+
"vlm_desc_label": "描述:",
304+
"vlm_desc_placeholder": "例如:绿色的提交按钮",
305+
"vlm_model_label": "模型:",
306+
"vlm_model_placeholder": "选填(例如 claude-opus-4-7)",
307+
"vlm_locate": "定位",
308+
"vlm_click": "定位并点击",
309+
"vlm_result": "位置:({x}, {y})",
310+
"vlm_ok": "完成",
311+
"vlm_not_found": "找不到匹配元素",
312+
"vlm_error": "错误",
313+
"vlm_desc_required": "请先输入目标描述",
314+
301315
# Menu bar
302316
"menu_file": "文件",
303317
"menu_file_open_script": "打开脚本...",

je_auto_control/gui/language_wrapper/traditional_chinese.py

Lines changed: 14 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -22,6 +22,7 @@
2222
"tab_report": "報告產生",
2323
"tab_run_history": "執行紀錄",
2424
"tab_accessibility": "無障礙樹",
25+
"tab_vlm": "AI 定位",
2526

2627
# Auto Click Tab
2728
"interval_time": "間隔時間 (ms):",
@@ -299,6 +300,19 @@
299300
"a11y_no_selection": "請先選擇一列",
300301
"a11y_click_not_found": "找不到可點擊的元素",
301302

303+
# VLM (AI Locator) Tab
304+
"vlm_desc_label": "描述:",
305+
"vlm_desc_placeholder": "例如:綠色的送出按鈕",
306+
"vlm_model_label": "模型:",
307+
"vlm_model_placeholder": "選填(例如 claude-opus-4-7)",
308+
"vlm_locate": "定位",
309+
"vlm_click": "定位並點擊",
310+
"vlm_result": "位置:({x}, {y})",
311+
"vlm_ok": "完成",
312+
"vlm_not_found": "找不到符合的元素",
313+
"vlm_error": "錯誤",
314+
"vlm_desc_required": "請先輸入目標描述",
315+
302316
# Menu bar
303317
"menu_file": "檔案",
304318
"menu_file_open_script": "開啟腳本...",

je_auto_control/gui/main_widget.py

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -23,6 +23,7 @@
2323
from je_auto_control.gui.script_builder import ScriptBuilderTab
2424
from je_auto_control.gui.selector import crop_template_to_file, open_region_selector
2525
from je_auto_control.gui.triggers_tab import TriggersTab
26+
from je_auto_control.gui.vlm_tab import VLMTab
2627
from je_auto_control.gui.window_tab import WindowManagerTab
2728
from je_auto_control.wrapper.auto_control_screen import screen_size, screenshot, get_pixel
2829
from je_auto_control.wrapper.auto_control_image import locate_all_image, locate_image_center, locate_and_click
@@ -85,6 +86,7 @@ def __init__(self, parent=None):
8586
self._add_tab("triggers", "tab_triggers", TriggersTab())
8687
self._add_tab("run_history", "tab_run_history", RunHistoryTab())
8788
self._add_tab("accessibility", "tab_accessibility", AccessibilityTab())
89+
self._add_tab("vlm", "tab_vlm", VLMTab())
8890
self._add_tab("plugins", "tab_plugins", PluginsTab())
8991
layout.addWidget(self.tabs)
9092

je_auto_control/gui/vlm_tab.py

Lines changed: 111 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,111 @@
1+
"""VLM tab: describe a UI element in words and have a model find it."""
2+
from typing import Optional
3+
4+
from PySide6.QtWidgets import (
5+
QHBoxLayout, QLabel, QLineEdit, QMessageBox, QPushButton,
6+
QVBoxLayout, QWidget,
7+
)
8+
9+
from je_auto_control.gui._i18n_helpers import TranslatableMixin
10+
from je_auto_control.gui.language_wrapper.multi_language_wrapper import (
11+
language_wrapper,
12+
)
13+
from je_auto_control.utils.vision.backends.base import VLMNotAvailableError
14+
from je_auto_control.utils.vision.vlm_api import (
15+
click_by_description, locate_by_description,
16+
)
17+
18+
19+
def _t(key: str) -> str:
    """Look up ``key`` through the shared ``language_wrapper``.

    The key itself is passed as the second argument to ``translate``
    (presumably the text used when no translation exists - confirm against
    ``language_wrapper.translate``).
    """
    default_text = key
    return language_wrapper.translate(key, default_text)
21+
22+
23+
class VLMTab(TranslatableMixin, QWidget):
    """Drive a vision-language model to locate or click described elements.

    The tab holds a description field, an optional model-override field and
    two buttons ("locate" and "locate & click"). Outcomes are reported in two
    labels: ``_last_result`` for the matched position and ``_status`` for the
    status or error text.
    """

    # Returned by ``_run_action`` when no usable result was produced. A
    # dedicated sentinel is required because ``None`` and ``False`` are
    # legitimate "nothing matched" results from the backend calls.
    _NO_RESULT = object()

    def __init__(self, parent: Optional[QWidget] = None) -> None:
        """Create the input widgets and assemble the layout."""
        super().__init__(parent)
        self._tr_init()
        self._description = QLineEdit()
        self._model = QLineEdit()
        self._status = QLabel()
        self._last_result = QLabel()
        self._apply_placeholders()
        self._build_layout()

    def retranslate(self) -> None:
        """Re-apply translations, including the line-edit placeholders."""
        TranslatableMixin.retranslate(self)
        # Placeholders are set by hand (not through ``_tr``), so they have to
        # be refreshed explicitly here.
        self._apply_placeholders()

    def _apply_placeholders(self) -> None:
        """Set the translated placeholder text on both line edits."""
        self._description.setPlaceholderText(_t("vlm_desc_placeholder"))
        self._model.setPlaceholderText(_t("vlm_model_placeholder"))

    def _build_layout(self) -> None:
        """Lay out the description row, model row, buttons and result labels."""
        root = QVBoxLayout(self)

        desc_row = QHBoxLayout()
        desc_row.addWidget(self._tr(QLabel(), "vlm_desc_label"))
        desc_row.addWidget(self._description, stretch=1)
        root.addLayout(desc_row)

        model_row = QHBoxLayout()
        model_row.addWidget(self._tr(QLabel(), "vlm_model_label"))
        model_row.addWidget(self._model, stretch=1)
        root.addLayout(model_row)

        btn_row = QHBoxLayout()
        locate_btn = self._tr(QPushButton(), "vlm_locate")
        locate_btn.clicked.connect(self._on_locate)
        click_btn = self._tr(QPushButton(), "vlm_click")
        click_btn.clicked.connect(self._on_click)
        btn_row.addWidget(locate_btn)
        btn_row.addWidget(click_btn)
        btn_row.addStretch()
        root.addLayout(btn_row)

        root.addWidget(self._last_result)
        root.addWidget(self._status)
        root.addStretch()

    def _reset_feedback(self) -> None:
        """Blank both feedback labels so a new action never shows stale text."""
        self._last_result.setText("")
        self._status.setText("")

    def _collect_inputs(self) -> Optional[tuple]:
        """Read and validate the form fields.

        :return: ``(description, model)`` where ``model`` is ``None`` when the
            override field is blank, or ``None`` when the description is empty
            (in which case the status label explains what is missing).
        """
        description = self._description.text().strip()
        if not description:
            self._status.setText(_t("vlm_desc_required"))
            return None
        model = self._model.text().strip() or None
        return description, model

    def _run_action(self, action, title_key: str):
        """Validate the inputs and invoke ``action(description, model=...)``.

        Shared by both button handlers so that validation, feedback reset and
        error reporting stay identical for "locate" and "locate & click".

        :param action: callable taking ``(description, model=...)``.
        :param title_key: translation key used as the warning-dialog title.
        :return: whatever ``action`` returns, or ``_NO_RESULT`` when the
            inputs were invalid or the call raised (the user has already been
            informed in that case).
        """
        self._reset_feedback()
        inputs = self._collect_inputs()
        if inputs is None:
            return self._NO_RESULT
        description, model = inputs
        # NOTE(review): the call runs synchronously inside the Qt slot, so the
        # UI presumably stays unresponsive until the model answers - consider
        # moving it to a worker thread.
        try:
            return action(description, model=model)
        except VLMNotAvailableError as error:
            QMessageBox.warning(self, _t(title_key), str(error))
        except (OSError, ValueError, RuntimeError) as error:
            self._status.setText(f"{_t('vlm_error')}: {error}")
        return self._NO_RESULT

    def _on_locate(self) -> None:
        """Locate the described element and display its coordinates."""
        coords = self._run_action(locate_by_description, "vlm_locate")
        if coords is self._NO_RESULT:
            return
        if coords is None:
            self._status.setText(_t("vlm_not_found"))
            return
        # ``str.replace`` (rather than ``str.format``) keeps working even if a
        # translation omits a placeholder or contains other braces.
        message = _t("vlm_result")
        message = message.replace("{x}", str(coords[0]))
        message = message.replace("{y}", str(coords[1]))
        self._last_result.setText(message)
        self._status.setText(_t("vlm_ok"))

    def _on_click(self) -> None:
        """Locate the described element, click it, and report the outcome."""
        ok = self._run_action(click_by_description, "vlm_click")
        if ok is self._NO_RESULT:
            return
        self._status.setText(_t("vlm_ok") if ok else _t("vlm_not_found"))

je_auto_control/utils/executor/action_executor.py

Lines changed: 17 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -12,6 +12,9 @@
1212
from je_auto_control.utils.accessibility.accessibility_api import (
1313
click_accessibility_element, find_accessibility_element,
1414
)
15+
from je_auto_control.utils.vision.vlm_api import (
16+
click_by_description, locate_by_description,
17+
)
1518
from je_auto_control.utils.clipboard.clipboard import (
1619
get_clipboard, set_clipboard,
1720
)
@@ -76,6 +79,16 @@ def _a11y_find_as_dict(name: Optional[str] = None,
7679
return None if element is None else element.to_dict()
7780

7881

82+
def _vlm_locate_as_list(description: str,
                        screen_region: Optional[List[int]] = None,
                        model: Optional[str] = None) -> Optional[List[int]]:
    """Executor adapter around ``locate_by_description``.

    Forwards all arguments unchanged and repackages the first two items of
    the result as a plain two-element list; ``None`` is passed through when
    nothing was located.
    """
    located = locate_by_description(
        description, screen_region=screen_region, model=model,
    )
    if located is None:
        return None
    x_pos = located[0]
    y_pos = located[1]
    return [x_pos, y_pos]
90+
91+
7992
def _history_list_as_dicts(limit: int = 100,
8093
source_type: Optional[str] = None) -> List[dict]:
8194
"""Executor adapter: list run history as plain dicts (JSON-friendly)."""
@@ -191,6 +204,10 @@ def __init__(self):
191204
"AC_a11y_list": _a11y_list_as_dicts,
192205
"AC_a11y_find": _a11y_find_as_dict,
193206
"AC_a11y_click": click_accessibility_element,
207+
208+
# VLM-based element locator
209+
"AC_vlm_locate": _vlm_locate_as_list,
210+
"AC_vlm_click": click_by_description,
194211
}
195212

196213
def known_commands(self) -> set:
Lines changed: 10 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,10 @@
1+
"""AI-vision element locator (VLM-backed)."""
2+
from je_auto_control.utils.vision.vlm_api import (
3+
VLMNotAvailableError, click_by_description, locate_by_description,
4+
)
5+
6+
__all__ = [
7+
"VLMNotAvailableError",
8+
"locate_by_description",
9+
"click_by_description",
10+
]
Lines changed: 69 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,69 @@
1+
"""VLM backend factory."""
2+
import os
3+
4+
from je_auto_control.utils.vision.backends.base import (
5+
VLMBackend, VLMNotAvailableError,
6+
)
7+
from je_auto_control.utils.vision.backends.null_backend import NullVLMBackend
8+
9+
# Module-level cache for the backend: starts as None, is filled by
# get_backend() on first use and set back to None by reset_backend_cache().
_cached_backend: VLMBackend = None # type: ignore[assignment]
10+
11+
12+
def get_backend() -> VLMBackend:
    """Return the cached VLM backend, building it on first use.

    The backend is created by ``_build_backend()`` only while the module
    cache is empty; later calls return the same object.
    """
    global _cached_backend
    if _cached_backend is None:
        _cached_backend = _build_backend()
    return _cached_backend
19+
20+
21+
def reset_backend_cache() -> None:
    """Drop the cached backend.

    After this call the next ``get_backend()`` builds a fresh backend
    instead of returning the previously cached one.
    """
    global _cached_backend
    # ``None`` is the "not built yet" marker that ``get_backend()`` checks.
    _cached_backend = None  # type: ignore[assignment]
25+
26+
27+
def _build_backend() -> VLMBackend:
    """Pick the first usable backend, or a ``NullVLMBackend`` if none is.

    Candidates are tried in the order given by ``_preference_order`` for the
    lower-cased ``AUTOCONTROL_VLM_BACKEND`` environment variable. A candidate
    is accepted when it was built and reports ``available``.
    """
    requested = os.environ.get("AUTOCONTROL_VLM_BACKEND", "").lower()
    # ``map`` is lazy, so later candidates are only constructed when the
    # earlier ones were rejected.
    built = map(_try_build, _preference_order(requested))
    usable = next(
        (item for item in built if item is not None and item.available),
        None,
    )
    if usable is not None:
        return usable
    return NullVLMBackend(
        "no VLM backend ready; set ANTHROPIC_API_KEY or OPENAI_API_KEY "
        "and install the matching SDK (anthropic / openai)",
    )
38+
39+
40+
def _preference_order(preferred: str):
    """Return the backend names to try, most preferred first.

    An explicit ``preferred`` of ``"anthropic"`` or ``"openai"`` wins.
    Otherwise the order follows whichever API key is set in the environment,
    with Anthropic first when both or neither are present.
    """
    anthropic_first = ("anthropic", "openai")
    openai_first = ("openai", "anthropic")

    explicit = {"anthropic": anthropic_first, "openai": openai_first}
    if preferred in explicit:
        return explicit[preferred]

    # OpenAI only leads when its key is the sole one configured.
    has_anthropic_key = bool(os.environ.get("ANTHROPIC_API_KEY"))
    has_openai_key = bool(os.environ.get("OPENAI_API_KEY"))
    if has_openai_key and not has_anthropic_key:
        return openai_first
    return anthropic_first
50+
51+
52+
def _try_build(name: str):
    """Instantiate the backend called ``name``, or return ``None``.

    Backend modules are imported inside the matching branch, so only the
    module for the requested backend is imported. Unknown names yield
    ``None``.
    """
    if name == "openai":
        from je_auto_control.utils.vision.backends.openai_backend import (
            OpenAIVLMBackend,
        )
        return OpenAIVLMBackend()
    if name != "anthropic":
        return None
    from je_auto_control.utils.vision.backends.anthropic_backend import (
        AnthropicVLMBackend,
    )
    return AnthropicVLMBackend()
64+
65+
66+
__all__ = [
67+
"VLMBackend", "VLMNotAvailableError", "NullVLMBackend",
68+
"get_backend", "reset_backend_cache",
69+
]

0 commit comments

Comments
 (0)