diff --git a/.gitignore b/.gitignore
index 5af6f509..d69e7943 100644
--- a/.gitignore
+++ b/.gitignore
@@ -16,6 +16,7 @@ venv/
# IDEs
.vscode/
.idea/
+.claude/
# Build files
dist/
diff --git a/CHANGELOG.md b/CHANGELOG.md
index 20f09e66..fcdc904a 100644
--- a/CHANGELOG.md
+++ b/CHANGELOG.md
@@ -4,6 +4,18 @@ All notable changes to Murphy will be documented in this file.
The format is based on [Keep a Changelog](https://keepachangelog.com/en/1.1.0/).
+## [1.2.0] - 2026-06-02
+
+### Added
+- Lite mode (`--lite` CLI flag / `lite: true` in the REST API) for a faster, simpler run aimed at quick product feedback: Murphy builds a compact persona plan directly from the goal or available analysis, then runs a lighter browser-agent prompt
+- Lite runs return a structured `LiteResult` per scenario with a 1–10 `grade` plus `flaws`, `improvements`, `fixes`, and `other_feedback`, summarised in a dedicated terminal output
+- `lite` field on the `/generate-plan`, `/evaluate`, and `/execute` REST API request models
+- `LITE_MODE.md` documentation describing the mode, what it skips, and how to run it
+
+### Changed
+- Lite mode skips LLM test generation, the Murphy judge, full JSON/Markdown report generation, and interactive review pauses
+- Disabled the unused `write_file` tool in Murphy runs
+
## [1.1.0] - 2026-04-07
### Added
diff --git a/LITE_MODE.md b/LITE_MODE.md
new file mode 100644
index 00000000..1c4eda01
--- /dev/null
+++ b/LITE_MODE.md
@@ -0,0 +1,54 @@
+# Murphy Lite Mode
+
+Lite mode is a faster, simpler Murphy run for quick product feedback. It is enabled with `--lite` in the CLI or `lite: true` in the REST API.
+
+## What It Skips
+
+- LLM test generation
+- Interactive feature and test-plan review pauses
+- Murphy judge calls
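+- Full JSON and Markdown report generation when no site analysis is available (CLI runs that load `--features`, or that analyze the site because no `--goal` was given, still write reports)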
+
+## What It Returns
+
+Each scenario returns a structured `LiteResult`:
+
+- `grade`: 1-10 overall experience score
+- `flaws`: concrete problems or blockers
+- `improvements`: product or UX improvements
+- `fixes`: implementation-level fixes
+- `other_feedback`: additional useful observations
+
+## CLI
+
+```bash
+uv run murphy --url https://example.com --goal "Test agent creation flow" --lite
+```
+
+You can still use `--max-tests`, `--parallel`, `--provider`, `--model`, `--auth`, `--no-auth`, `--features`, and `--plan`.
+
+## REST
+
+Set `lite: true` on `/generate-plan`, `/evaluate`, or `/execute`. For example, an `/evaluate` request body (`/generate-plan` additionally requires `analysis`):
+
+```json
+{
+ "url": "https://example.com",
+ "goal": "Test agent creation flow",
+ "max_tests": 1,
+ "lite": true
+}
+```
+
+## Speed Experiment
+
+Use the manual experiment runner:
+
+```bash
+uv run python exp_2/lite_speed/run_compare.py \
+ --url https://work.toqan.ai \
+ --goal "Test agent creation flow" \
+ --max-tests 1 \
+ --parallel 1 \
+ --repetitions 1
+```
diff --git a/README.md b/README.md
index f75c8d57..e27e3a83 100644
--- a/README.md
+++ b/README.md
@@ -78,6 +78,9 @@ uv run murphy --url https://example.com
# Goal-directed: explores with focus, skips feature discovery, generates plan directly
uv run murphy --url https://example.com --goal "test the checkout flow"
+# Lite mode: faster, simpler run that skips test generation, judge, and reports
+uv run murphy --url https://example.com --goal "test the checkout flow" --lite
+
# Site requires login — opens browser for manual auth first (local only, not Docker)
uv run murphy --url https://example.com --auth
@@ -115,6 +118,8 @@ An AI agent explores the site with the given goal in mind, then synthesizes a te
**Execution (both strategies):** An AI agent runs each test scenario in a real browser, and a separate judge LLM evaluates pass/fail. Saves `evaluation_report.json` and `evaluation_report.md`.
+**Lite mode (`--lite`):** Murphy builds a compact persona plan directly from the goal or available analysis, then runs a lighter browser-agent prompt that returns a 1–10 `grade` plus structured `flaws`, `improvements`, `fixes`, and `other_feedback`. It skips LLM test generation, the judge, report generation, and interactive review pauses.
+
You can resume from any point by passing `--features` or `--plan` with a previously generated (and optionally edited) file.
## Output
diff --git a/murphy/api/cli.py b/murphy/api/cli.py
index 2d5db9a0..52c7da67 100644
--- a/murphy/api/cli.py
+++ b/murphy/api/cli.py
@@ -7,6 +7,7 @@
murphy --url https://example.com --features features.md # skip analysis, load features from file
murphy --url https://example.com --plan plan.yaml # skip analysis + generation, load test plan
murphy --url https://example.com --goal "test the checkout flow"
+ murphy --url https://example.com --goal "test the checkout flow" --lite
"""
from __future__ import annotations
@@ -26,7 +27,7 @@
if TYPE_CHECKING:
from murphy.api.server import ServerState
- from murphy.models import TestPlan, TestResult
+ from murphy.models import TestPlan, TestResult, TokenUsage, WebsiteAnalysis
load_dotenv()
@@ -40,6 +41,35 @@
)
+def _write_reports_and_log_results(
+ url: str,
+ analysis: WebsiteAnalysis | None,
+ results: list[TestResult],
+ output_dir: Path,
+ *,
+ use_lite: bool,
+ persona_discovery_tokens: TokenUsage | None = None,
+ murphy_tokens: TokenUsage | None = None,
+) -> None:
+ """Write report artifacts when analysis context exists, then log the mode-specific terminal summary."""
+ # NOTE(review): this branch does not look at use_lite, so a lite run that has an analysis
+ # (e.g. --features, or no --goal) still calls write_reports_and_print — confirm this matches
+ # the documentation's statement that lite mode skips report generation.
+ if analysis:
+ from murphy.core.summary import write_reports_and_print
+
+ write_reports_and_print(
+ url,
+ analysis,
+ results,
+ output_dir,
+ persona_discovery_tokens=persona_discovery_tokens,
+ murphy_tokens=murphy_tokens,
+ )
+ # Without an analysis, only non-lite runs get the generic results summary.
+ elif not use_lite:
+ _log_results_summary(results)
+
+ # Lite runs always finish with the per-scenario lite summary, whether or not reports were written.
+ if use_lite:
+ _log_lite_summary(results)
+
+
def main() -> int:
parser = argparse.ArgumentParser(
prog='murphy',
@@ -51,6 +81,7 @@ def main() -> int:
parser.add_argument('--no-auth', action='store_true', help='Skip auth detection entirely, treat site as public')
parser.add_argument('--features', help='Path to existing features markdown (skips analysis, goes to test generation)')
parser.add_argument('--plan', help='Path to existing YAML test plan (skips analysis + test generation)')
+ parser.add_argument('--lite', action='store_true', help='Run faster lite mode: skip test generation and judge')
parser.add_argument('--max-tests', type=int, default=None, help='Max test scenarios (default: number of personas)')
parser.add_argument(
'--provider', default='openai', help='LLM provider (default: openai). e.g. google, anthropic, azure, mistral'
@@ -119,7 +150,7 @@ async def _async_main(args: argparse.Namespace) -> None:
from murphy.browser.patches import apply as apply_patches
from murphy.core.analysis import analyze_website
from murphy.core.execution import execute_tests_with_session
- from murphy.core.generation import explore_and_generate_plan, generate_tests
+ from murphy.core.generation import explore_and_generate_plan, generate_tests, make_lite_plan
from murphy.core.summary import build_summary, write_reports_and_print
from murphy.io.features_io import read_features_markdown, write_features_markdown
from murphy.io.fixtures import ensure_dummy_fixture_files
@@ -222,6 +253,7 @@ async def _async_main(args: argparse.Namespace) -> None:
# ── Phase 1–2: Discover features & generate plan ──
use_exploration_first = bool(args.goal and not args.features and not args.plan)
+ use_lite = bool(args.lite)
if args.plan:
# Skip both analysis and test generation
@@ -231,6 +263,27 @@ async def _async_main(args: argparse.Namespace) -> None:
if url != args.url:
logger.warning('Plan URL (%s) differs from --url (%s). Using --url.', url, args.url)
logger.info('Loaded %d scenarios from %s', len(test_plan.scenarios), plan_path)
+ elif use_lite:
+ if args.features:
+ features_path = Path(args.features)
+ assert features_path.exists(), f'Features file not found: {features_path}'
+ analysis = read_features_markdown(features_path)
+ logger.info('Loaded %d features from %s', len(analysis.features), features_path)
+ elif not args.goal:
+ analysis = await analyze_website(args.url, llm, goal=args.goal, browser_session=browser_session)
+ features_path = write_features_markdown(analysis, output_dir)
+ logger.info('\n Features saved: %s', features_path)
+
+ test_plan = make_lite_plan(
+ args.url,
+ goal=args.goal,
+ analysis=analysis,
+ max_tests=args.max_tests,
+ discovered_personas=discovered_personas,
+ )
+ plan_path = save_test_plan(args.url, test_plan, output_dir)
+ logger.info('\n Lite plan saved: %s', plan_path)
+ logger.info(' Using %d lite scenarios.\n', len(test_plan.scenarios))
elif use_exploration_first:
# Exploration-first path: explore → summarize → synthesize plan
test_plan = await explore_and_generate_plan(
@@ -350,18 +403,18 @@ def _on_test_complete(results: list[TestResult]) -> None:
judge_llm=judge_llm,
output_dir=output_dir,
discovered_personas=discovered_personas,
+ use_lite=use_lite,
+ analysis=analysis,
+ )
+ _write_reports_and_log_results(
+ args.url,
+ analysis,
+ results,
+ output_dir,
+ use_lite=use_lite,
+ persona_discovery_tokens=persona_discovery_tokens,
+ murphy_tokens=_get_murphy_tokens(),
)
- if analysis:
- write_reports_and_print(
- args.url,
- analysis,
- results,
- output_dir,
- persona_discovery_tokens=persona_discovery_tokens,
- murphy_tokens=_get_murphy_tokens(),
- )
- else:
- _log_results_summary(results)
return
# ── Server UI mode (--ui) ──
@@ -384,6 +437,8 @@ async def _execute_fn(plan: TestPlan, state: ServerState) -> list[TestResult]:
judge_llm=judge_llm,
output_dir=output_dir,
discovered_personas=discovered_personas,
+ use_lite=use_lite,
+ analysis=analysis,
)
state = ServerState(
@@ -402,17 +457,15 @@ async def _execute_fn(plan: TestPlan, state: ServerState) -> list[TestResult]:
while True:
await asyncio.sleep(1)
if state.done and state.results and not getattr(state, '_reports_written', False):
- if analysis:
- write_reports_and_print(
- args.url,
- analysis,
- state.results,
- output_dir,
- persona_discovery_tokens=persona_discovery_tokens,
- murphy_tokens=_get_murphy_tokens(),
- )
- else:
- _log_results_summary(state.results)
+ _write_reports_and_log_results(
+ args.url,
+ analysis,
+ state.results,
+ output_dir,
+ use_lite=use_lite,
+ persona_discovery_tokens=persona_discovery_tokens,
+ murphy_tokens=_get_murphy_tokens(),
+ )
state._reports_written = True # type: ignore[attr-defined]
except KeyboardInterrupt:
pass
@@ -467,5 +520,24 @@ def _log_results_summary(results: list[TestResult]) -> None:
logger.info('\n Pass rate: %s%% (%d/%d)', summary.pass_rate, summary.passed, summary.total)
+def _log_lite_summary(results: list[TestResult]) -> None:
+ """Log a lite-mode terminal summary: a banner, then one line per scenario with its grade and feedback counts."""
+ logger.info('\n%s', '=' * 60)
+ logger.info('Lite Mode Complete — %d scenario(s)', len(results))
+ logger.info('%s', '=' * 60)
+ for result in results:
+ lite_result = result.lite_result
+ # No structured LiteResult attached to this result: log its reason text instead of counts.
+ if lite_result is None:
+ logger.info(' [%s] no lite result — %s', result.scenario.test_persona, result.reason)
+ continue
+ # other_feedback is not counted in this line; only flaws, improvements and fixes are.
+ logger.info(
+ ' [%s] grade=%d flaws=%d improvements=%d fixes=%d',
+ result.scenario.test_persona,
+ lite_result.grade,
+ len(lite_result.flaws),
+ len(lite_result.improvements),
+ len(lite_result.fixes),
+ )
+
+
if __name__ == '__main__':
sys.exit(main())
diff --git a/murphy/api/request_models.py b/murphy/api/request_models.py
index 497511a9..c9abe31e 100644
--- a/murphy/api/request_models.py
+++ b/murphy/api/request_models.py
@@ -36,6 +36,7 @@ class GeneratePlanRequest(BaseModel):
analysis: Annotated[WebsiteAnalysis, BeforeValidator(_parse_json_string)]
max_tests: int = 8
goal: str | None = None
+ lite: bool = False
provider: str = 'openai'
model: str = 'gpt-5-mini'
webhook_url: str | None = None
@@ -55,6 +56,7 @@ class ExecuteRequest(BaseModel):
judge_model: str | None = None
max_steps: int = 15
max_concurrent: int = 3
+ lite: bool = False
webhook_url: str | None = None
async_mode: bool = Field(False, alias='async')
@@ -65,6 +67,7 @@ class EvaluateRequest(BaseModel):
url: str
goal: str | None = None
max_tests: int = 8
+ lite: bool = False
provider: str = 'openai'
model: str = 'gpt-5-mini'
judge_provider: str | None = None
diff --git a/murphy/api/rest.py b/murphy/api/rest.py
index 3d2c6d8a..c111f255 100644
--- a/murphy/api/rest.py
+++ b/murphy/api/rest.py
@@ -70,7 +70,7 @@ async def _core_generate_plan(req: GeneratePlanRequest) -> dict[str, Any]:
from murphy.core.pipeline import run_generate_plan
test_plan = await run_generate_plan(
- req.url, req.analysis, req.model, provider=req.provider, max_tests=req.max_tests, goal=req.goal
+ req.url, req.analysis, req.model, provider=req.provider, max_tests=req.max_tests, goal=req.goal, lite=req.lite
)
return test_plan.model_dump()
@@ -101,6 +101,7 @@ async def _core_execute(req: ExecuteRequest) -> dict[str, Any]:
goal=req.goal,
max_steps=req.max_steps,
max_concurrent=req.max_concurrent,
+ lite=req.lite,
)
return ExecuteResult(results=results, summary=summary).model_dump()
@@ -109,7 +110,9 @@ async def _core_evaluate(req: EvaluateRequest) -> dict[str, Any]:
"""Run exploration-first evaluation: explore site → generate test plan."""
from murphy.core.pipeline import run_evaluate
- test_plan = await run_evaluate(req.url, req.model, provider=req.provider, max_tests=req.max_tests, goal=req.goal)
+ test_plan = await run_evaluate(
+ req.url, req.model, provider=req.provider, max_tests=req.max_tests, goal=req.goal, lite=req.lite
+ )
return test_plan.model_dump()
diff --git a/murphy/api/templates.py b/murphy/api/templates.py
index 8d1baf18..3f516313 100644
--- a/murphy/api/templates.py
+++ b/murphy/api/templates.py
@@ -45,6 +45,7 @@
.badge-medium { background: var(--text); color: #fff; }
.badge-low { background: var(--gray); color: #fff; }
.badge-pass { background: var(--green); color: #fff; }
+.badge-fail { background: var(--red); color: #fff; }
.badge-fail-website { background: var(--red); color: #fff; }
.badge-fail-test { background: var(--orange); color: #fff; }
.test-name { font-weight: 600; flex: 1; font-size: .95rem; }
@@ -381,6 +382,7 @@ def render_results_html(
sections = [
('Passed', [r for r in results if r.success]),
+ ('Failed', [r for r in results if r.success is not True and r.failure_category is None]),
('Failed — Website Issue', [r for r in results if r.failure_category == 'website_issue']),
('Failed — Test Limitation', [r for r in results if r.failure_category == 'test_limitation']),
]
@@ -395,6 +397,9 @@ def render_results_html(
if r.success:
badge_cls = 'badge-pass'
badge_text = 'PASS'
+ elif r.failure_category is None:
+ badge_cls = 'badge-fail'
+ badge_text = 'FAILED'
elif r.failure_category == 'website_issue':
badge_cls = 'badge-fail-website'
badge_text = 'WEBSITE ISSUE'
diff --git a/murphy/browser/cleanup.py b/murphy/browser/cleanup.py
index e92209a4..07370891 100644
--- a/murphy/browser/cleanup.py
+++ b/murphy/browser/cleanup.py
@@ -140,6 +140,8 @@ def _find_stale_browser_pids() -> list[int]:
if not cmdline:
continue
if any(marker in cmdline for marker in profile_markers) and any(marker in cmdline for marker in browser_markers):
- pids.add(proc.pid)
+ pid = proc.info.get('pid')
+ if isinstance(pid, int):
+ pids.add(pid)
return sorted(pids)
diff --git a/murphy/core/execution.py b/murphy/core/execution.py
index f891701a..64b569fb 100644
--- a/murphy/core/execution.py
+++ b/murphy/core/execution.py
@@ -17,13 +17,15 @@
from murphy.core.summary import classify_failure
from murphy.io.report_helpers import _slugify
from murphy.models import (
+ LiteResult,
ScenarioExecutionVerdict,
TestPlan,
TestResult,
TestScenario,
+ WebsiteAnalysis,
)
from murphy.personas.pipeline_models import PersonaResult, TraitSchema
-from murphy.prompts import build_execution_prompt
+from murphy.prompts import build_execution_prompt, build_lite_prompt
logger = logging.getLogger(__name__)
@@ -32,9 +34,7 @@
# ─── Structured output parsing ────────────────────────────────────────────────
-def _parse_structured_output(
- history: AgentHistoryList, model_cls: type[ScenarioExecutionVerdict]
-) -> ScenarioExecutionVerdict | None:
+def _parse_structured_output(history: AgentHistoryList, model_cls: type[Any]) -> Any | None:
"""Safely parse structured output from agent history."""
result = history.final_result()
if not result:
@@ -89,6 +89,92 @@ def _extract_urls_from_texts(texts: list[str]) -> list[str]:
return urls
+# Keywords/phrases that mark a lite scenario as interactive. `_lite_scenario_requires_interaction`
+# tests each entry as a substring of the lowercased scenario text.
+# NOTE(review): the entries duplicate `_INTERACTIVE_GOAL_KEYWORDS` in murphy/core/generation.py —
+# keep the two tuples in sync or share one definition.
+_INTERACTIVE_SCENARIO_KEYWORDS = (
+ 'add',
+ 'book',
+ 'buy',
+ 'change',
+ 'checkout',
+ 'complete',
+ 'configure',
+ 'create',
+ 'delete',
+ 'disable',
+ 'download',
+ 'edit',
+ 'enable',
+ 'filter',
+ 'login',
+ 'order',
+ 'purchase',
+ 'save',
+ 'search',
+ 'select',
+ 'send',
+ 'set up',
+ 'setup',
+ 'sign in',
+ 'sign up',
+ 'submit',
+ 'switch',
+ 'test',
+ 'toggle',
+ 'try',
+ 'update',
+ 'upload',
+ 'use',
+)
+
+# Action-dict keys that `_has_meaningful_lite_interaction` counts as a real in-app interaction
+# (presumably browser-use action names — TODO confirm against the registered tools).
+_MEANINGFUL_LITE_ACTIONS = {
+ 'click',
+ 'click_element',
+ 'drag_drop',
+ 'input',
+ 'input_text',
+ 'press_key',
+ 'select_dropdown_option',
+ 'send_keys',
+ 'upload_file',
+}
+
+
+def _lite_scenario_requires_interaction(scenario: TestScenario) -> bool:
+ """Return True when a lite scenario describes an interactive objective."""
+ # All five text fields are joined and lowercased before the keyword scan.
+ scenario_text = ' '.join(
+ [
+ scenario.name,
+ scenario.description,
+ scenario.target_feature,
+ scenario.steps_description,
+ scenario.success_criteria,
+ ]
+ ).lower()
+ # NOTE(review): this is plain substring matching, so 'use' also matches 'user' and 'add' matches
+ # 'address'. Steps built by _build_lite_objective_steps (murphy/core/generation.py) start with
+ # 'Complete this objective', which contains the keyword 'complete', so scenarios produced by
+ # make_lite_plan always return True here — confirm that is intended.
+ return any(keyword in scenario_text for keyword in _INTERACTIVE_SCENARIO_KEYWORDS)
+
+
+def _has_meaningful_lite_interaction(actions: list[dict[str, Any]]) -> bool:
+ """Return True when actions include an in-app interaction beyond navigation/inspection."""
+ # Each action is a dict; any key listed in _MEANINGFUL_LITE_ACTIONS counts as an interaction.
+ for action in actions:
+ for key in action:
+ # 'interacted_element' is skipped explicitly (presumably metadata rather than an action name — TODO confirm).
+ if key == 'interacted_element':
+ continue
+ if key in _MEANINGFUL_LITE_ACTIONS:
+ return True
+ # Empty list, or only keys outside the set.
+ return False
+
+
+def _build_lite_retry_prompt(task_prompt: str) -> str:
+ """Append a one-shot continuation instruction for premature lite completions."""
+ # The original prompt is kept in full; the retry instruction is appended after a blank line.
+ return (
+ task_prompt
+ + '\n\nRETRY REQUIRED:\n'
+ + 'Your previous lite attempt stopped before meaningful in-app interaction. '
+ + 'Continue the objective now. You must use the most plausible in-app controls before returning LiteResult, '
+ + 'unless blocked by login, captcha, missing permissions, destructive/payment action, '
+ + 'support/contact/feedback form, external-domain route, or no plausible route after two in-app paths.'
+ )
+
+
async def _collect_session_urls(browser_session: BrowserSession) -> list[str]:
"""Collect current + historical tab URLs from browser session."""
urls: list[str] = []
@@ -103,6 +189,31 @@ async def _collect_session_urls(browser_session: BrowserSession) -> list[str]:
return urls
+def _save_agent_history(
+ history: AgentHistoryList,
+ scenario: TestScenario,
+ index: int,
+ output_dir: Path | None,
+) -> None:
+ """Persist full browser-use history for UI trace and graph views."""
+ # No output directory configured: nothing is written.
+ if output_dir is None:
+ return
+
+ # File name: agent_history/test_<two-digit index>_<slugified scenario name>.json under output_dir.
+ slug = _slugify(scenario.name)
+ history_path = output_dir / 'agent_history' / f'test_{index:02d}_{slug}.json'
+ # Best-effort: any failure to create the directory or write the file is logged as a warning and not re-raised.
+ try:
+ history_path.parent.mkdir(parents=True, exist_ok=True)
+ history.save_to_file(history_path)
+ logger.debug(' Agent history saved: %s', history_path)
+ except Exception as e:
+ logger.warning(' Failed to save agent history: %s', e)
+
+
+def _disable_unused_murphy_actions(agent: Agent) -> None:
+ """Remove browser-use tools that Murphy does not consume."""
+ # Currently only 'write_file' is excluded; _execute_single_test calls this right after constructing each Agent.
+ agent.tools.exclude_action('write_file')
+
+
# ─── Single-test execution helper ──────────────────────────────────────────────
@@ -119,6 +230,8 @@ async def _execute_single_test(
judge_llm: BaseChatModel | None = None,
discovered_personas: tuple['PersonaResult', 'TraitSchema'] | None = None,
output_dir: Path | None = None,
+ use_lite: bool = False,
+ analysis: WebsiteAnalysis | None = None,
) -> TestResult:
"""Execute one test scenario and return its TestResult.
@@ -134,6 +247,77 @@ async def _execute_single_test(
await prepare_session_for_task(browser_session, url, force_navigate=True)
file_paths_str = [str(p) for p in fixture_paths] if fixture_paths else []
+
+ if use_lite:
+ task_prompt = build_lite_prompt(
+ scenario,
+ url,
+ analysis=analysis,
+ discovered_personas=discovered_personas,
+ )
+
+ async def _run_lite_agent(prompt: str) -> AgentHistoryList:
+ agent_kwargs: dict[str, Any] = {
+ 'task': prompt,
+ 'llm': llm,
+ 'browser_session': browser_session,
+ 'use_judge': False,
+ 'max_actions_per_step': 3,
+ 'output_model_schema': LiteResult,
+ }
+ agent = Agent(**agent_kwargs)
+ _disable_unused_murphy_actions(agent)
+ register_domain_access_action(agent.tools, browser_session)
+ register_refresh_dom_action(agent.tools, browser_session)
+ return await agent.run(max_steps=max_steps)
+
+ history = await _run_lite_agent(task_prompt)
+ all_actions = history.model_actions()
+ if _lite_scenario_requires_interaction(scenario) and not _has_meaningful_lite_interaction(all_actions):
+ logger.info(' Lite run stopped before meaningful interaction; retrying once with stricter objective guidance.')
+ history = await _run_lite_agent(_build_lite_retry_prompt(task_prompt))
+
+ _save_agent_history(history, scenario, index, output_dir)
+ lite_result = _parse_structured_output(history, LiteResult)
+ if lite_result is None:
+ lite_result = LiteResult(
+ grade=5,
+ flaws=['The agent did not return structured lite output.'],
+ improvements=['Retry the lite run or use normal Murphy for a judged report.'],
+ fixes=[],
+ other_feedback=[],
+ )
+
+ all_actions = history.model_actions()
+ errors = history.errors()
+ history_urls = [u for u in history.urls() if u]
+ session_urls = await _collect_session_urls(browser_session)
+ error_urls = _extract_urls_from_texts([e for e in errors if e])
+ seen_urls: set[str] = set()
+ unique_pages: list[str] = []
+ for page_url in history_urls + session_urls + error_urls:
+ if page_url not in seen_urls:
+ seen_urls.add(page_url)
+ unique_pages.append(page_url)
+
+ success = lite_result.grade >= 5
+ logger.info(' Lite result: grade=%d (%.1fs)', lite_result.grade, history.total_duration_seconds())
+ test_result = TestResult(
+ scenario=scenario,
+ success=success,
+ judgement=None,
+ actions=all_actions,
+ errors=errors,
+ duration=history.total_duration_seconds(),
+ pages_visited=unique_pages,
+ screenshot_paths=[p for p in history.screenshot_paths() if p],
+ form_fills=_extract_form_fills(all_actions),
+ reason=f'Lite mode grade: {lite_result.grade}',
+ lite_result=lite_result,
+ )
+ test_result.failure_category = classify_failure(test_result)
+ return test_result
+
task_prompt = build_execution_prompt(
goal or f'Evaluate {url}',
scenario,
@@ -156,6 +340,7 @@ async def _execute_single_test(
agent_kwargs['output_model_schema'] = ScenarioExecutionVerdict
agent = Agent(**agent_kwargs)
+ _disable_unused_murphy_actions(agent)
# Register custom actions
register_domain_access_action(agent.tools, browser_session)
register_refresh_dom_action(agent.tools, browser_session)
@@ -210,15 +395,7 @@ async def _execute_single_test(
seen_urls.add(p)
unique_pages.append(p)
- # Save full browser-use history to output/agent_history/ when output_dir is set
- if output_dir is not None:
- slug = _slugify(scenario.name)
- history_path = output_dir / 'agent_history' / f'test_{index:02d}_{slug}.json'
- try:
- history.save_to_file(history_path)
- logger.debug(' Agent history saved: %s', history_path)
- except Exception as e:
- logger.warning(' Failed to save agent history: %s', e)
+ _save_agent_history(history, scenario, index, output_dir)
test_result = TestResult(
scenario=scenario,
@@ -379,6 +556,8 @@ async def execute_tests(
judge_llm: BaseChatModel | None = None,
output_dir: Path | None = None,
discovered_personas: tuple['PersonaResult', 'TraitSchema'] | None = None,
+ use_lite: bool = False,
+ analysis: WebsiteAnalysis | None = None,
) -> list[TestResult]:
"""Execute tests without a pre-existing session (creates its own)."""
from browser_use.browser.profile import BrowserProfile
@@ -398,6 +577,8 @@ async def execute_tests(
judge_llm=judge_llm,
output_dir=output_dir,
discovered_personas=discovered_personas,
+ use_lite=use_lite,
+ analysis=analysis,
)
finally:
await browser_session.kill()
@@ -417,6 +598,8 @@ async def execute_tests_with_session(
judge_llm: BaseChatModel | None = None,
output_dir: Path | None = None,
discovered_personas: tuple['PersonaResult', 'TraitSchema'] | None = None,
+ use_lite: bool = False,
+ analysis: WebsiteAnalysis | None = None,
) -> list[TestResult]:
"""Phase 3 execution reusing an existing browser session.
@@ -458,6 +641,8 @@ async def execute_tests_with_session(
judge_llm=judge_llm,
output_dir=output_dir,
discovered_personas=discovered_personas,
+ use_lite=use_lite,
+ analysis=analysis,
)
results.append(test_result)
@@ -507,6 +692,8 @@ async def _run_one(index_0: int, scenario: TestScenario) -> None:
judge_llm=judge_llm,
output_dir=output_dir,
discovered_personas=discovered_personas,
+ use_lite=use_lite,
+ analysis=analysis,
)
results_slots[index_0] = result
diff --git a/murphy/core/generation.py b/murphy/core/generation.py
index 4ecdfb20..69e22b46 100644
--- a/murphy/core/generation.py
+++ b/murphy/core/generation.py
@@ -10,7 +10,7 @@
from murphy.browser.session_utils import prepare_session_for_task
from murphy.config import EXPLORE_MAX_STEPS, QUALITY_MAX_RETRIES
from murphy.core.quality import plan_quality_issues
-from murphy.models import PERSONA_REGISTRY, TestPlan
+from murphy.models import PERSONA_REGISTRY, TestPlan, TestScenario, WebsiteAnalysis
from murphy.personas.bridge import get_discovered_persona_names
from murphy.personas.pipeline_models import PersonaResult, TraitSchema
from murphy.prompts import (
@@ -23,6 +23,196 @@
logger = logging.getLogger(__name__)
+# Keywords/phrases that make a lite goal count as interactive (see `_lite_goal_requires_interaction`).
+# Matched as lowercase substrings by `_matches_any_keyword`.
+# NOTE(review): the entries duplicate `_INTERACTIVE_SCENARIO_KEYWORDS` in murphy/core/execution.py —
+# keep the two tuples in sync or share one definition.
+_INTERACTIVE_GOAL_KEYWORDS = (
+ 'add',
+ 'book',
+ 'buy',
+ 'change',
+ 'checkout',
+ 'complete',
+ 'configure',
+ 'create',
+ 'delete',
+ 'disable',
+ 'download',
+ 'edit',
+ 'enable',
+ 'filter',
+ 'login',
+ 'order',
+ 'purchase',
+ 'save',
+ 'search',
+ 'select',
+ 'send',
+ 'set up',
+ 'setup',
+ 'sign in',
+ 'sign up',
+ 'submit',
+ 'switch',
+ 'test',
+ 'toggle',
+ 'try',
+ 'update',
+ 'upload',
+ 'use',
+)
+
+# Keywords that route a goal to the state-change steps in `_build_lite_objective_steps`.
+_STATE_CHANGE_GOAL_KEYWORDS = (
+ 'change',
+ 'disable',
+ 'enable',
+ 'mode',
+ 'preference',
+ 'setting',
+ 'switch',
+ 'toggle',
+ 'turn off',
+ 'turn on',
+)
+
+# Keywords that route a goal to the creation/submission steps; `_build_lite_objective_steps`
+# checks this group before the state-change group.
+_CREATION_OR_SUBMISSION_GOAL_KEYWORDS = (
+ 'add',
+ 'book',
+ 'buy',
+ 'checkout',
+ 'complete',
+ 'configure',
+ 'create',
+ 'creation',
+ 'new',
+ 'order',
+ 'purchase',
+ 'save',
+ 'send',
+ 'set up',
+ 'setup',
+ 'sign up',
+ 'submit',
+ 'upload',
+)
+
+
+def _matches_any_keyword(text: str, keywords: tuple[str, ...]) -> bool:
+ """Return True if any objective keyword appears in text."""
+ # Matching is case-insensitive on the text side only; keywords are expected to be lowercase already.
+ # NOTE(review): `keyword in normalized` is a substring test, so 'use' matches 'user', 'new' matches
+ # 'news' and 'mode' matches 'model' — confirm this looseness is acceptable for goal classification.
+ normalized = text.lower()
+ return any(keyword in normalized for keyword in keywords)
+
+
+def _lite_goal_requires_interaction(task: str) -> bool:
+ """Whether a lite objective should require at least one in-app interaction."""
+ # Substring check of the task text against _INTERACTIVE_GOAL_KEYWORDS.
+ return _matches_any_keyword(task, _INTERACTIVE_GOAL_KEYWORDS)
+
+
+def _lite_goal_is_state_change(task: str) -> bool:
+ """Whether a lite objective is primarily about changing UI or app state."""
+ # Substring check of the task text against _STATE_CHANGE_GOAL_KEYWORDS.
+ return _matches_any_keyword(task, _STATE_CHANGE_GOAL_KEYWORDS)
+
+
+def _lite_goal_is_creation_or_submission(task: str) -> bool:
+ """Whether a lite objective likely needs safe test input and advancement."""
+ # Substring check of the task text against _CREATION_OR_SUBMISSION_GOAL_KEYWORDS.
+ return _matches_any_keyword(task, _CREATION_OR_SUBMISSION_GOAL_KEYWORDS)
+
+
+def _build_lite_objective_steps(url: str, task: str) -> str:
+ """Build generalized objective-driven steps for lite mode."""
+ # Step 1 and the header lines are the same for every objective type.
+ steps = [
+ f'Complete this objective on {url}: {task}.',
+ 'Minimum required path:',
+ '1. Locate the most plausible in-app route for the objective. If a control is ambiguous but plausibly relevant, try it and report the ambiguity afterward.',
+ ]
+
+ # Steps 2-3 depend on the objective type. Only the first matching branch applies, in this priority:
+ # creation/submission, then state change, then generic interaction, else observation-only.
+ if _lite_goal_is_creation_or_submission(task):
+ steps.extend(
+ [
+ '2. Attempt the objective by initiating the route, providing harmless test input only for fields required to continue, and advancing one step at a time.',
+ '3. Advance or submit only when safe; stop before destructive, payment, external-domain, or support/contact/feedback actions unless the objective explicitly requires reporting that blocker.',
+ ]
+ )
+ elif _lite_goal_is_state_change(task):
+ steps.extend(
+ [
+ '2. Change the requested state using the most plausible in-app control or setting.',
+ '3. Check whether the requested state is reflected in the visible UI or remains in effect after a simple in-app navigation or refresh when safe.',
+ ]
+ )
+ elif _lite_goal_requires_interaction(task):
+ steps.extend(
+ [
+ '2. Attempt the objective through the most plausible in-app interaction rather than stopping at observation.',
+ '3. Continue until the objective is completed, blocked, or objectively unavailable.',
+ ]
+ )
+ else:
+ steps.extend(
+ [
+ '2. Inspect the experience through the persona lens and interact with any clearly relevant in-app controls if they are needed to evaluate the objective.',
+ '3. Stop when you have concrete observed evidence for the objective.',
+ ]
+ )
+
+ # Steps 4-5 (verification and output) are shared by every objective type.
+ steps.extend(
+ [
+ '4. Verify the resulting UI state using visible evidence such as confirmation, validation, changed state, blocked state, persisted state, or clear absence of a plausible route.',
+ '5. Return concise lite output with flaws, improvements, fixes, and other observations grounded in what you attempted and observed.',
+ ]
+ )
+ # One step per line.
+ return '\n'.join(steps)
+
+
+def make_lite_plan(
+ url: str,
+ goal: str | None = None,
+ analysis: WebsiteAnalysis | None = None,
+ max_tests: int | None = None,
+ discovered_personas: tuple[PersonaResult, TraitSchema] | None = None,
+) -> TestPlan:
+ """Create a compact lite-mode plan without an LLM generation call."""
+ # One scenario is built per persona: discovered persona names when provided, otherwise every PERSONA_REGISTRY key.
+ if discovered_personas:
+ personas = get_discovered_persona_names(discovered_personas[0])
+ else:
+ personas = list(PERSONA_REGISTRY.keys())
+ # max_tests=None keeps every persona; otherwise the list is sliced to the first max_tests names
+ # (so max_tests=0 yields an empty plan).
+ if max_tests is not None:
+ personas = personas[:max_tests]
+
+ # Primary feature: the first core feature, falling back to the first testable/partially testable one.
+ core_features = [f for f in (analysis.features if analysis else []) if f.importance == 'core']
+ testable_features = [f for f in (analysis.features if analysis else []) if f.testability in ('testable', 'partial')]
+ primary_feature = (core_features or testable_features)[0] if (core_features or testable_features) else None
+ # Without any usable feature, the goal (or a generic label) stands in as the target feature.
+ target_feature = primary_feature.name if primary_feature else (goal or 'overall site experience')
+ feature_category = primary_feature.category if primary_feature else 'other'
+ site_name = analysis.site_name if analysis else url
+ task = goal or f'Evaluate {site_name}'
+
+ # Steps text: objective steps first, then optional user-flow and core-feature lists, separated by blank lines.
+ steps_parts = [_build_lite_objective_steps(url, task)]
+ if analysis and analysis.identified_user_flows:
+ steps_parts.append('Relevant user flows:\n' + '\n'.join(f'- {flow}' for flow in analysis.identified_user_flows))
+ if core_features:
+ steps_parts.append('Core features:\n' + '\n'.join(f'- {feature.name}' for feature in core_features))
+ steps_description = '\n\n'.join(steps_parts)
+
+ # Every scenario shares the same steps, target feature and success criteria; only the persona,
+ # name, description and priority differ.
+ scenarios: list[TestScenario] = []
+ for index, persona in enumerate(personas):
+ # The first scenario is 'critical'; all later ones are 'high'.
+ priority = 'critical' if index == 0 else 'high'
+ scenarios.append(
+ TestScenario(
+ # Name is capped at 100 characters.
+ name=f'Lite {persona.replace("_", " ")} review'[:100],
+ description=f'{task} as {persona} on {site_name}.',
+ priority=priority, # type: ignore[arg-type]
+ feature_category=feature_category,
+ target_feature=target_feature,
+ test_persona=persona,
+ steps_description=steps_description,
+ success_criteria='Return structured flaws, improvements, fixes, and other observations for this goal.',
+ )
+ )
+
+ logger.info('\n%s', '=' * 60)
+ logger.info('Built %d lite scenarios without LLM test generation', len(scenarios))
+ logger.info('%s\n', '=' * 60)
+ return TestPlan(scenarios=scenarios)
+
+
async def generate_tests(
url: str,
analysis: 'Any',
diff --git a/murphy/core/judge.py b/murphy/core/judge.py
index 48e06a82..5127b782 100644
--- a/murphy/core/judge.py
+++ b/murphy/core/judge.py
@@ -235,8 +235,9 @@ def _select_key_screenshots(history: AgentHistoryList, max_screenshots: int = 3)
Returns at most max_screenshots base64 strings.
"""
steps = history.history
- if not steps:
- return []
+ if not isinstance(steps, list) or not steps:
+ screenshots = history.screenshots()
+ return [s for s in screenshots if s][-max_screenshots:]
# Score each step
scored: list[tuple[int, int, str]] = [] # (score, index, screenshot_b64)
diff --git a/murphy/core/pipeline.py b/murphy/core/pipeline.py
index e080465f..532e6cb9 100644
--- a/murphy/core/pipeline.py
+++ b/murphy/core/pipeline.py
@@ -22,6 +22,7 @@
execute_tests_with_session,
explore_and_generate_plan,
generate_tests,
+ make_lite_plan,
)
from murphy.io.fixtures import ensure_dummy_fixture_files
from murphy.llm import create_llm
@@ -61,9 +62,12 @@ async def run_generate_plan(
provider: str = 'openai',
max_tests: int | None = None,
goal: str | None = None,
+ lite: bool = False,
) -> TestPlan:
"""Generate test plan from analysis."""
apply_patches()
+ if lite:
+ return make_lite_plan(url, goal=goal, analysis=analysis, max_tests=max_tests)
llm = create_llm(model, provider=provider)
return await generate_tests(url, analysis, llm, max_tests, goal=goal)
@@ -83,6 +87,7 @@ async def run_execute(
save_callback: Any = None,
progress_state: Any = None,
output_dir: Path | None = None,
+ lite: bool = False,
) -> tuple[list[TestResult], ReportSummary]:
"""Execute tests and return results + summary."""
apply_patches()
@@ -112,6 +117,7 @@ async def run_execute(
max_concurrent=max_concurrent,
judge_llm=judge_llm,
output_dir=output_dir,
+ use_lite=lite,
)
summary = build_summary(results)
return results, summary
@@ -128,9 +134,12 @@ async def run_evaluate(
max_tests: int | None = None,
goal: str | None = None,
browser_session: BrowserSession | None = None,
+ lite: bool = False,
) -> TestPlan:
"""Exploration-first: explore site then generate test plan."""
apply_patches()
+ if lite:
+ return make_lite_plan(url, goal=goal, max_tests=max_tests)
kill_stale_browser()
task = goal or f'Evaluate the website at {url}'
llm = create_llm(model, provider=provider)
diff --git a/murphy/core/summary.py b/murphy/core/summary.py
index 6714d903..255fdf8d 100644
--- a/murphy/core/summary.py
+++ b/murphy/core/summary.py
@@ -29,6 +29,8 @@ def classify_failure(result: TestResult) -> Literal['website_issue', 'test_limit
"""
if result.success is True:
return None
+ if result.lite_result is not None:
+ return None
# Crashed tests: success=None with no judgement → test infrastructure failure
if result.success is None:
return 'test_limitation'
diff --git a/murphy/evaluate.py b/murphy/evaluate.py
index bcecc708..157a5df0 100644
--- a/murphy/evaluate.py
+++ b/murphy/evaluate.py
@@ -9,7 +9,7 @@
from murphy.core.analysis import analyze_website
from murphy.core.execution import execute_tests, execute_tests_with_session
-from murphy.core.generation import explore_and_generate_plan, generate_tests
+from murphy.core.generation import explore_and_generate_plan, generate_tests, make_lite_plan
from murphy.core.summary import build_summary, classify_failure, generate_executive_summary, write_reports_and_print
__all__ = [
@@ -21,5 +21,6 @@
'explore_and_generate_plan',
'generate_executive_summary',
'generate_tests',
+ 'make_lite_plan',
'write_reports_and_print',
]
diff --git a/murphy/io/report_markdown.py b/murphy/io/report_markdown.py
index e5c2e4f6..bffc85b0 100644
--- a/murphy/io/report_markdown.py
+++ b/murphy/io/report_markdown.py
@@ -13,6 +13,33 @@
from murphy.models import EvaluationReport, TestResult
+def _append_bullets(title: str, items: list[str], lines: list[str]) -> None:
+    """Append a bold ``title`` heading and one ``- `` bullet per item to ``lines``.
+
+    ``lines`` is mutated in place. Nothing is appended when ``items`` is
+    empty, so an empty section never leaves a dangling heading.
+    """
+    if items:
+        lines.append(f'**{title}:**')
+        lines.extend(f'- {entry}' for entry in items)
+        # Trailing blank line closes the Markdown list.
+        lines.append('')
+
+
+def _render_lite_result(r: TestResult, lines: list[str]) -> None:
+    """Append the lite-mode evaluation of ``r`` to ``lines`` (mutated in place).
+
+    Emits a "Lite Evaluation" heading, the ``grade`` out of 10, and then one
+    bulleted section each for flaws, improvements, fixes, and other feedback.
+    Does nothing when ``r.lite_result`` is falsy.
+    """
+    lite = r.lite_result
+    if not lite:
+        return
+
+    lines.extend(['**Lite Evaluation:**', '', f'**Grade:** {lite.grade}/10', ''])
+
+    # Section order is part of the rendered report; keep it stable.
+    sections = (
+        ('Flaws', lite.flaws),
+        ('Improvements', lite.improvements),
+        ('Fixes', lite.fixes),
+        ('Other feedback', lite.other_feedback),
+    )
+    for heading, entries in sections:
+        _append_bullets(heading, entries, lines)
+
+
def _render_test_detail(r: TestResult, index: int, lines: list[str]) -> None:
"""Append detailed info for a single test result (pass or fail)."""
m = _compute_metrics(r)
@@ -20,6 +47,7 @@ def _render_test_detail(r: TestResult, index: int, lines: list[str]) -> None:
lines.append(f'**Result:** {"Passed" if passed else "Failed"} in {r.duration:.0f}s')
lines.append('')
+ _render_lite_result(r, lines)
lines.append(f'**Metrics:** {_format_metrics_line(m)}')
lines.append('')
# lines.append(f'{format_path(r)}')
@@ -133,6 +161,7 @@ def write_markdown_report(report: EvaluationReport, output_dir: Path) -> Path:
# Partition results
website_issues = [r for r in report.results if r.failure_category == 'website_issue']
test_limitations = [r for r in report.results if r.failure_category == 'test_limitation']
+ failed_tests = [r for r in report.results if r.success is not True and r.failure_category is None]
passed_tests = [r for r in report.results if r.success]
# Scorecard
@@ -142,24 +171,47 @@ def write_markdown_report(report: EvaluationReport, output_dir: Path) -> Path:
f'- Website Issues: {s.website_issues}',
f'- Test Limitations: {s.test_limitations}',
'',
- '| Test | Persona | Result | Category | Duration |',
- '|------|---------|--------|----------|----------|',
]
- for r in report.results:
- persona_label = r.scenario.test_persona.replace('_', ' ').title()
- if r.success:
- emoji = '\u2705'
- result_str = 'Passed'
- category_str = ''
- elif r.failure_category == 'website_issue':
- emoji = '\U0001f534'
- result_str = 'Failed'
- category_str = 'Website Issue'
- else:
- emoji = '\u26a0\ufe0f'
- result_str = 'Failed'
- category_str = 'Test Limitation'
- lines.append(f'| {emoji} {r.scenario.name} | {persona_label} | {result_str} | {category_str} | {r.duration:.0f}s |')
+
+ has_lite_results = any(r.lite_result for r in report.results)
+ if has_lite_results:
+ lines += [
+ '| Test | Persona | Grade | Flaws | Improvements | Fixes | Duration |',
+ '|------|---------|-------|-------|--------------|-------|----------|',
+ ]
+ for r in report.results:
+ persona_label = r.scenario.test_persona.replace('_', ' ').title()
+ lite = r.lite_result
+ if lite:
+ grade = f'{lite.grade}/10'
+ flaws = str(len(lite.flaws))
+ improvements = str(len(lite.improvements))
+ fixes = str(len(lite.fixes))
+ else:
+ grade = flaws = improvements = fixes = 'n/a'
+ lines.append(
+ f'| {r.scenario.name} | {persona_label} | {grade} | {flaws} | {improvements} | {fixes} | {r.duration:.0f}s |'
+ )
+ else:
+ lines += [
+ '| Test | Persona | Result | Category | Duration |',
+ '|------|---------|--------|----------|----------|',
+ ]
+ for r in report.results:
+ persona_label = r.scenario.test_persona.replace('_', ' ').title()
+ if r.success:
+ emoji = '\u2705'
+ result_str = 'Passed'
+ category_str = ''
+ elif r.failure_category == 'website_issue':
+ emoji = '\U0001f534'
+ result_str = 'Failed'
+ category_str = 'Website Issue'
+ else:
+ emoji = '\u26a0\ufe0f'
+ result_str = 'Failed'
+ category_str = 'Test Limitation'
+ lines.append(f'| {emoji} {r.scenario.name} | {persona_label} | {result_str} | {category_str} | {r.duration:.0f}s |')
# ── Feedback Quality Index ────────────────────────────────────────────────
fq_results = [r for r in report.results if r.feedback_quality]
@@ -267,6 +319,21 @@ def yes_no(b):
_render_test_detail(r, i, detail_lines)
lines += ['', f'{summary_text}
', ''] + detail_lines + [' ', '']
+    # ── Generic failed tests section ──────────────────────────────────────────
+    # Covers results that are not passed and carry no failure category
+    # (see the ``failed_tests`` partition above), e.g. low-grade lite runs.
+    if failed_tests:
+        lines += ['## Failed Tests', '']
+        for i, r in enumerate(failed_tests, 1):
+            persona_label = r.scenario.test_persona.replace('_', ' ').title()
+            summary_text = f'\u274c {i}. {r.scenario.name} — {persona_label}'
+            detail_lines = [
+                f'**Persona:** {persona_label}',
+                '',
+                f'**What was tested:** {r.scenario.description}',
+                '',
+            ]
+            _render_test_detail(r, i, detail_lines)
+            # NOTE(review): the original statement was split mid-string (unterminated
+            # f-string), apparently because collapsible-section tags were stripped.
+            # Reconstructed as a <details>/<summary> wrapper; confirm against the repo.
+            lines += ['<details>', f'<summary>{summary_text}</summary>', ''] + detail_lines + ['</details>', '']
+
# ── Passed Tests section ──────────────────────────────────────────────────
if passed_tests:
lines += ['## Passed Tests', '']
diff --git a/murphy/models.py b/murphy/models.py
index 3ed79170..74186d62 100644
--- a/murphy/models.py
+++ b/murphy/models.py
@@ -432,6 +432,21 @@ class ScenarioExecutionVerdict(BaseModel):
)
+# ─── Lite mode feedback ───────────────────────────────────────────────────────
+
+
+class LiteResult(BaseModel):
+ """Fast, structured output returned by Murphy lite mode.
+
+ Holds an integer ``grade`` constrained to 1-10 plus four lists of free-text
+ feedback. Every list defaults to empty, so ``grade`` is the only required field.
+ """
+
+ # ge/le bound the accepted grade to the inclusive range 1..10.
+ grade: int = Field(ge=1, le=10, description='Overall experience score from 1 (poor) to 10 (excellent).')
+ # default_factory=list gives each instance its own empty list.
+ flaws: list[str] = Field(default_factory=list, description='Observed problems, friction, or broken behavior.')
+ improvements: list[str] = Field(default_factory=list, description='Product or UX improvements that would help users.')
+ fixes: list[str] = Field(default_factory=list, description='Concrete fixes that address the observed flaws.')
+ other_feedback: list[str] = Field(
+ default_factory=list, description='Additional observations that do not fit the other fields.'
+ )
+
+
# ─── Judge verdict ─────────────────────────────────────────────────────────────
@@ -501,6 +516,7 @@ class TestResult(BaseModel):
trait_evaluations: dict[str, Literal['pass', 'fail']] | None = None
missing_signals: list[str] = Field(default_factory=list)
feature_suggestions: list[str] = Field(default_factory=list)
+ lite_result: LiteResult | None = None
class ReportSummary(BaseModel):
diff --git a/murphy/prompts.py b/murphy/prompts.py
index e494eab9..53ddfac3 100644
--- a/murphy/prompts.py
+++ b/murphy/prompts.py
@@ -538,6 +538,59 @@ def _build_suggestion_instruction(
)
+def build_lite_prompt(
+    scenario: TestScenario,
+    start_url: str,
+    analysis: WebsiteAnalysis | None = None,
+    discovered_personas: tuple[PersonaResult, TraitSchema] | None = None,
+) -> str:
+    """Assemble the compact agent task prompt used for a Murphy lite run.
+
+    The prompt consists of, in order: an intro line, the persona block, an
+    optional site-context block (only when ``analysis`` is given), the scenario
+    task and steps, the start URL, the behavioural rules, and the list of
+    ``LiteResult`` fields the agent must return.
+
+    A discovered persona is rendered only when ``discovered_personas`` is
+    provided and ``scenario.test_persona`` is not in ``PERSONA_REGISTRY``;
+    otherwise the built-in persona renderer is used.
+    """
+    if discovered_personas and scenario.test_persona not in PERSONA_REGISTRY:
+        from murphy.personas.bridge import render_discovered_persona_for_execution
+
+        persona_result, trait_schema = discovered_personas
+        persona_block = render_discovered_persona_for_execution(scenario.test_persona, persona_result, trait_schema)
+    else:
+        persona_block = _render_persona_for_execution(scenario.test_persona)
+
+    site_context = ''
+    if analysis:
+        core_names = [feat.name for feat in analysis.features if feat.importance == 'core']
+        features_text = ', '.join(core_names) if core_names else 'not identified'
+        flows = analysis.identified_user_flows
+        flows_text = ', '.join(flows) if flows else 'not identified'
+        context_lines = [
+            'SITE CONTEXT:',
+            f'- Site: {analysis.site_name}',
+            f'- Category: {analysis.category}',
+            f'- Description: {analysis.description}',
+            f'- Core features: {features_text}',
+            f'- User flows: {flows_text}',
+        ]
+        # The block ends with a blank line so the task section starts cleanly.
+        site_context = '\n'.join(context_lines) + '\n\n'
+
+    rules = [
+        f'- Stay on the same domain as {start_url}.',
+        '- You must attempt the stated objective before returning the LiteResult.',
+        '- Do not stop at first-impression UX feedback if there is any plausible in-app path to continue.',
+        '- If a control is ambiguous but plausibly relevant to the objective, use it and report the ambiguity afterward.',
+        '- Terminal states: objective completed and verified, objective attempted but blocked, or no plausible route found after trying at least two in-app paths.',
+        '- You may submit ordinary app forms needed for the objective using harmless test data.',
+        '- Do not submit support/contact/feedback forms, payment actions, destructive confirmations, or external-domain flows.',
+        '- If the app blocks you with login, captcha, or missing permissions, report that as a flaw and stop.',
+    ]
+    result_fields = [
+        '- grade: integer from 1 to 10 for the overall experience.',
+        '- flaws: concrete problems, friction, broken behavior, or blockers you observed.',
+        '- improvements: product or UX improvements that would make the flow better.',
+        '- fixes: concrete implementation fixes that address the flaws.',
+        '- other_feedback: useful observations that do not fit the other fields.',
+    ]
+
+    sections = [
+        'You are running Murphy lite mode: a fast objective-driven website test.\n\n',
+        f'{persona_block}\n\n',
+        site_context,
+        f'Task: {scenario.description}\n\n',
+        f'Steps:\n{scenario.steps_description}\n\n',
+        f'Start URL: {start_url}\n\n',
+        'Rules:\n' + '\n'.join(rules) + '\n\n',
+        'Return exactly one LiteResult object with these fields:\n' + '\n'.join(result_fields) + '\n',
+    ]
+    return ''.join(sections)
+
+
def build_execution_prompt(
global_task: str,
scenario: TestScenario,
diff --git a/tests/murphy/api/test_cli.py b/tests/murphy/api/test_cli.py
new file mode 100644
index 00000000..1ad46e55
--- /dev/null
+++ b/tests/murphy/api/test_cli.py
@@ -0,0 +1,70 @@
+"""Tests for Murphy CLI report-writing helpers."""
+
+from pathlib import Path
+from unittest.mock import patch
+
+from murphy.api.cli import _write_reports_and_log_results
+from murphy.models import TestResult, TestScenario, TokenUsage, WebsiteAnalysis
+
+
+def _make_analysis() -> WebsiteAnalysis:
+ """Build a minimal WebsiteAnalysis fixture with no pages, features, or user flows."""
+ return WebsiteAnalysis(
+ site_name='Example',
+ category='saas',
+ description='An example site',
+ key_pages=[],
+ features=[],
+ identified_user_flows=[],
+ )
+
+
+def _make_result() -> TestResult:
+ """Build a passing TestResult for a lite-style scenario with no judgement, actions, or errors."""
+ return TestResult(
+ scenario=TestScenario(
+ name='Lite agent creation',
+ description='Assess the agent creation flow',
+ priority='high',
+ feature_category='forms',
+ target_feature='Agent creation',
+ test_persona='confused_novice',
+ steps_description='Try to create an agent',
+ success_criteria='Return lightweight UX feedback',
+ ),
+ success=True,
+ judgement=None,
+ actions=[],
+ errors=[],
+ duration=2.0,
+ reason='Lite mode grade: 6',
+ )
+
+
+def test_write_reports_and_log_results_writes_lite_report_and_logs_summary():
+ """With use_lite=True the helper writes reports once and logs the lite summary for the same results."""
+ analysis = _make_analysis()
+ results = [_make_result()]
+ output_dir = Path('/tmp/murphy-test-output')
+ tokens = TokenUsage(input_tokens=10, output_tokens=5)
+
+ # Both collaborators are patched, so the test only checks how they are called.
+ with (
+ patch('murphy.core.summary.write_reports_and_print') as write_reports,
+ patch('murphy.api.cli._log_lite_summary') as log_lite_summary,
+ ):
+ _write_reports_and_log_results(
+ url='https://example.com',
+ analysis=analysis,
+ results=results,
+ output_dir=output_dir,
+ use_lite=True,
+ persona_discovery_tokens=None,
+ murphy_tokens=tokens,
+ )
+
+ write_reports.assert_called_once_with(
+ 'https://example.com',
+ analysis,
+ results,
+ output_dir,
+ persona_discovery_tokens=None,
+ murphy_tokens=tokens,
+ )
+ log_lite_summary.assert_called_once_with(results)
diff --git a/tests/murphy/api/test_request_models.py b/tests/murphy/api/test_request_models.py
index 37fca89e..853eb45e 100644
--- a/tests/murphy/api/test_request_models.py
+++ b/tests/murphy/api/test_request_models.py
@@ -90,6 +90,12 @@ def test_evaluate_request_defaults():
r = EvaluateRequest(url='https://example.com') # type: ignore[call-arg]
assert r.max_tests == 8
assert r.async_mode is False
+ assert r.lite is False
+
+
+def test_evaluate_request_accepts_lite():
+ """EvaluateRequest keeps an explicit lite=True."""
+ r = EvaluateRequest(url='https://example.com', lite=True) # type: ignore[call-arg]
+ assert r.lite is True
# ─── ExecuteRequest ──────────────────────────────────────────────────────────
@@ -101,6 +107,12 @@ def test_execute_request_defaults():
assert r.evaluate_job_id is None
assert r.max_steps == 15
assert r.max_concurrent == 3
+ assert r.lite is False
+
+
+def test_execute_request_accepts_lite():
+ """ExecuteRequest keeps an explicit lite=True."""
+ r = ExecuteRequest(url='https://example.com', lite=True) # type: ignore[call-arg]
+ assert r.lite is True
def test_execute_request_with_json_string_test_plan():
@@ -167,6 +179,11 @@ def test_generate_plan_request_with_json_string_analysis():
assert r.analysis.site_name == 'Example'
+def test_generate_plan_request_accepts_lite():
+ """GeneratePlanRequest keeps an explicit lite=True when given an analysis dict."""
+ r = GeneratePlanRequest(url='https://example.com', analysis=_make_analysis_dict(), lite=True) # type: ignore[arg-type]
+ assert r.lite is True
+
+
# ─── JobResponse ──────────────────────────────────────────────────────────────
diff --git a/tests/murphy/api/test_rest_api.py b/tests/murphy/api/test_rest_api.py
index 54919271..f6df314a 100644
--- a/tests/murphy/api/test_rest_api.py
+++ b/tests/murphy/api/test_rest_api.py
@@ -1,10 +1,14 @@
"""Tests for REST API endpoints using FastAPI TestClient — no real LLM/browser calls."""
+from unittest.mock import AsyncMock, patch
+
import pytest
from fastapi.testclient import TestClient
from murphy.api.jobs import Job, _jobs
-from murphy.api.rest import app
+from murphy.api.request_models import EvaluateRequest, ExecuteRequest, GeneratePlanRequest
+from murphy.api.rest import _core_evaluate, _core_execute, _core_generate_plan, app
+from murphy.models import ReportSummary, TestPlan
@pytest.fixture(autouse=True)
@@ -98,3 +102,50 @@ def test_get_job_strips_whitespace(client, monkeypatch):
resp = client.get('/jobs/ my-job ')
assert resp.status_code == 200
+
+
+def _analysis_dict() -> dict:
+ """Return a minimal analysis payload as a plain dict, with empty pages, features, and flows."""
+ return {
+ 'site_name': 'Example',
+ 'category': 'saas',
+ 'description': 'An example site',
+ 'key_pages': [],
+ 'features': [],
+ 'identified_user_flows': [],
+ }
+
+
+@pytest.mark.asyncio
+async def test_core_generate_plan_propagates_lite():
+ """_core_generate_plan forwards lite=True as a keyword argument to run_generate_plan."""
+ with patch('murphy.core.pipeline.run_generate_plan', new_callable=AsyncMock) as run_generate_plan:
+ run_generate_plan.return_value = TestPlan(scenarios=[])
+ req = GeneratePlanRequest(url='https://example.com', analysis=_analysis_dict(), lite=True) # type: ignore[arg-type]
+
+ await _core_generate_plan(req)
+
+ run_generate_plan.assert_awaited_once()
+ assert run_generate_plan.call_args.kwargs['lite'] is True
+
+
+@pytest.mark.asyncio
+async def test_core_execute_propagates_lite():
+ """_core_execute forwards lite=True as a keyword argument to run_execute."""
+ with patch('murphy.core.pipeline.run_execute', new_callable=AsyncMock) as run_execute:
+ # run_execute is stubbed to return an empty result list and an all-zero summary.
+ run_execute.return_value = ([], ReportSummary(total=0, passed=0, failed=0, pass_rate=0.0, by_priority={}))
+ req = ExecuteRequest(url='https://example.com', test_plan=TestPlan(scenarios=[]), lite=True) # type: ignore[call-arg]
+
+ await _core_execute(req)
+
+ run_execute.assert_awaited_once()
+ assert run_execute.call_args.kwargs['lite'] is True
+
+
+@pytest.mark.asyncio
+async def test_core_evaluate_propagates_lite():
+ """_core_evaluate forwards lite=True as a keyword argument to run_evaluate."""
+ with patch('murphy.core.pipeline.run_evaluate', new_callable=AsyncMock) as run_evaluate:
+ run_evaluate.return_value = TestPlan(scenarios=[])
+ req = EvaluateRequest(url='https://example.com', goal='Test agent creation flow', lite=True) # type: ignore[call-arg]
+
+ await _core_evaluate(req)
+
+ run_evaluate.assert_awaited_once()
+ assert run_evaluate.call_args.kwargs['lite'] is True
diff --git a/tests/murphy/api/test_templates.py b/tests/murphy/api/test_templates.py
index 4748105c..eb198c83 100644
--- a/tests/murphy/api/test_templates.py
+++ b/tests/murphy/api/test_templates.py
@@ -11,6 +11,7 @@
from murphy.models import (
Feature,
JudgeVerdict,
+ LiteResult,
PageInfo,
TestPlan,
TestResult,
@@ -190,6 +191,30 @@ def test_render_results_html_with_failure():
assert 'Website Issue' in html
+def test_render_results_html_failed_lite_result_uses_plain_failed_badge():
+ """A failed lite result with no failure category renders as FAILED, not as a test limitation."""
+ analysis = _make_analysis()
+ lite_result = LiteResult(
+ grade=4,
+ flaws=['The create flow is hard to find'],
+ improvements=['Expose a clearer create action'],
+ fixes=['Add a primary Create Agent button'],
+ other_feedback=[],
+ )
+ results = [
+ _make_result(
+ success=False,
+ judgement=None,
+ lite_result=lite_result,
+ failure_category=None,
+ reason='Lite mode grade: 4',
+ )
+ ]
+ html = render_results_html('https://example.com', analysis, results, None)
+ assert 'Failed (1)' in html
+ assert 'FAILED' in html
+ assert 'TEST LIMITATION' not in html
+
+
def test_render_results_html_escapes_xss():
analysis = _make_analysis(site_name='')
results = [_make_result()]
diff --git a/tests/murphy/core/test_execution.py b/tests/murphy/core/test_execution.py
index 396f5b34..72f11e88 100644
--- a/tests/murphy/core/test_execution.py
+++ b/tests/murphy/core/test_execution.py
@@ -1,9 +1,16 @@
"""Tests for execution helper functions (no browser/LLM calls)."""
+import json
+from unittest.mock import AsyncMock, MagicMock, patch
+
+import pytest
+
from murphy.core.execution import (
+ _execute_single_test,
_extract_form_fills,
_extract_urls_from_texts,
)
+from murphy.models import JudgeVerdict, LiteResult, ScenarioExecutionVerdict, TestScenario
# ─── _extract_form_fills ─────────────────────────────────────────────────────
@@ -119,3 +126,381 @@ def test_extract_urls_from_texts_multiple():
def test_extract_urls_from_texts_skips_none():
result = _extract_urls_from_texts(['', 'https://ok.com'])
assert result == ['https://ok.com']
+
+
+# ─── Lite execution ──────────────────────────────────────────────────────────
+
+
+@pytest.mark.asyncio
+async def test_execute_single_test_lite_mode_skips_judge_and_returns_lite_result():
+ """Lite run: Agent gets LiteResult as output schema, write_file is excluded, the judge is not awaited.
+
+ The grade-7 LiteResult returned by the mocked history is attached to a
+ passing TestResult whose reason is 'Lite mode grade: 7'.
+ """
+ scenario = TestScenario(
+ name='Lite agent creation',
+ description='Assess the agent creation flow',
+ priority='critical',
+ feature_category='forms',
+ target_feature='Agent creation',
+ test_persona='happy_path',
+ steps_description='Try to create an agent',
+ success_criteria='Return structured flaws, improvements, fixes, and other observations.',
+ )
+ lite_result = LiteResult(
+ grade=7,
+ flaws=['Creation has unclear required fields'],
+ improvements=['Show progress while creating the agent'],
+ fixes=['Label the create button clearly'],
+ other_feedback=['The main navigation is understandable'],
+ )
+ # The mocked history reports the LiteResult serialised as JSON as its final result.
+ history = MagicMock()
+ history.final_result.return_value = json.dumps(lite_result.model_dump())
+ history.model_actions.return_value = [{'click': {'index': 1}}]
+ history.errors.return_value = []
+ history.total_duration_seconds.return_value = 3.5
+ history.urls.return_value = ['https://example.com/agents']
+ history.screenshot_paths.return_value = []
+
+ agent = MagicMock()
+ agent.tools = MagicMock()
+ agent.run = AsyncMock(return_value=history)
+
+ with (
+ patch('murphy.core.execution.Agent', return_value=agent) as agent_cls,
+ patch('murphy.core.execution.murphy_judge', new_callable=AsyncMock) as judge,
+ patch('murphy.browser.session_utils.prepare_session_for_task', new_callable=AsyncMock),
+ patch('murphy.browser.actions.register_domain_access_action'),
+ patch('murphy.browser.actions.register_refresh_dom_action'),
+ ):
+ result = await _execute_single_test(
+ url='https://example.com',
+ scenario=scenario,
+ llm=MagicMock(),
+ browser_session=MagicMock(),
+ goal='Test agent creation flow',
+ fixture_paths=None,
+ max_steps=5,
+ index=1,
+ total=1,
+ use_lite=True,
+ )
+
+ agent_cls.assert_called_once()
+ assert agent_cls.call_args.kwargs['output_model_schema'] is LiteResult
+ agent.tools.exclude_action.assert_any_call('write_file')
+ judge.assert_not_awaited()
+ assert result.success is True
+ assert result.judgement is None
+ assert result.lite_result == lite_result
+ assert result.reason == 'Lite mode grade: 7'
+
+
+@pytest.mark.asyncio
+async def test_execute_single_test_lite_mode_low_grade_is_plain_failed_test():
+ """A grade-4 LiteResult yields success=False with no judgement and no failure category."""
+ scenario = TestScenario(
+ name='Lite confused user',
+ description='Assess the agent creation flow for confusion',
+ priority='high',
+ feature_category='forms',
+ target_feature='Agent creation',
+ test_persona='confused_novice',
+ steps_description='Try to create an agent',
+ success_criteria='Return structured flaws, improvements, fixes, and other observations.',
+ )
+ lite_result = LiteResult(
+ grade=4,
+ flaws=['The create flow is hard to find'],
+ improvements=['Expose a clearer create action'],
+ fixes=['Add a primary Create Agent button'],
+ other_feedback=[],
+ )
+ # The mocked history reports the LiteResult serialised as JSON as its final result.
+ history = MagicMock()
+ history.final_result.return_value = json.dumps(lite_result.model_dump())
+ history.model_actions.return_value = [{'click': {'index': 1}}]
+ history.errors.return_value = []
+ history.total_duration_seconds.return_value = 2.0
+ history.urls.return_value = ['https://example.com']
+ history.screenshot_paths.return_value = []
+
+ agent = MagicMock()
+ agent.tools = MagicMock()
+ agent.run = AsyncMock(return_value=history)
+
+ with (
+ patch('murphy.core.execution.Agent', return_value=agent),
+ patch('murphy.core.execution.murphy_judge', new_callable=AsyncMock) as judge,
+ patch('murphy.browser.session_utils.prepare_session_for_task', new_callable=AsyncMock),
+ patch('murphy.browser.actions.register_domain_access_action'),
+ patch('murphy.browser.actions.register_refresh_dom_action'),
+ ):
+ result = await _execute_single_test(
+ url='https://example.com',
+ scenario=scenario,
+ llm=MagicMock(),
+ browser_session=MagicMock(),
+ goal='Test agent creation flow',
+ fixture_paths=None,
+ max_steps=5,
+ index=1,
+ total=1,
+ use_lite=True,
+ )
+
+ judge.assert_not_awaited()
+ assert result.success is False
+ assert result.judgement is None
+ assert result.failure_category is None
+ assert result.lite_result == lite_result
+ assert result.reason == 'Lite mode grade: 4'
+
+
+@pytest.mark.asyncio
+async def test_execute_single_test_lite_mode_saves_agent_history_when_output_dir_set(tmp_path):
+ """With output_dir set, the history is saved once under agent_history/ with an index-and-name filename."""
+ scenario = TestScenario(
+ name='Lite agent creation',
+ description='Assess the agent creation flow',
+ priority='critical',
+ feature_category='forms',
+ target_feature='Agent creation',
+ test_persona='happy_path',
+ steps_description='Try to create an agent',
+ success_criteria='Return structured flaws, improvements, fixes, and other observations.',
+ )
+ lite_result = LiteResult(
+ grade=7,
+ flaws=['Creation has unclear required fields'],
+ improvements=['Show progress while creating the agent'],
+ fixes=['Label the create button clearly'],
+ other_feedback=[],
+ )
+ history = MagicMock()
+ history.final_result.return_value = json.dumps(lite_result.model_dump())
+ history.model_actions.return_value = [{'click': {'index': 1}}]
+ history.errors.return_value = []
+ history.total_duration_seconds.return_value = 3.5
+ history.urls.return_value = ['https://example.com/agents']
+ history.screenshot_paths.return_value = []
+
+ agent = MagicMock()
+ agent.tools = MagicMock()
+ agent.run = AsyncMock(return_value=history)
+
+ with (
+ patch('murphy.core.execution.Agent', return_value=agent),
+ patch('murphy.core.execution.murphy_judge', new_callable=AsyncMock),
+ patch('murphy.browser.session_utils.prepare_session_for_task', new_callable=AsyncMock),
+ patch('murphy.browser.actions.register_domain_access_action'),
+ patch('murphy.browser.actions.register_refresh_dom_action'),
+ ):
+ await _execute_single_test(
+ url='https://example.com',
+ scenario=scenario,
+ llm=MagicMock(),
+ browser_session=MagicMock(),
+ goal='Test agent creation flow',
+ fixture_paths=None,
+ max_steps=5,
+ index=1,
+ total=1,
+ output_dir=tmp_path,
+ use_lite=True,
+ )
+
+ # Expected filename combines the test index (01) with the scenario name in lower snake case.
+ history.save_to_file.assert_called_once_with(tmp_path / 'agent_history' / 'test_01_lite_agent_creation.json')
+
+
+@pytest.mark.asyncio
+async def test_execute_single_test_lite_mode_retries_premature_done_without_interaction():
+ """A first run that only navigates and calls done triggers a second Agent run.
+
+ The second Agent's task must mention the premature stop, and the returned
+ TestResult carries the second run's LiteResult and actions.
+ """
+ scenario = TestScenario(
+ name='Lite objective smoke test',
+ description='Test the objective flow',
+ priority='critical',
+ feature_category='forms',
+ target_feature='Objective flow',
+ test_persona='happy_path',
+ steps_description='Attempt the objective and verify the outcome',
+ success_criteria='Return structured flaws, improvements, fixes, and other observations.',
+ )
+ first_result = LiteResult(
+ grade=5,
+ flaws=['The flow was unclear from the landing page'],
+ improvements=[],
+ fixes=[],
+ other_feedback=[],
+ )
+ second_result = LiteResult(
+ grade=7,
+ flaws=['The flow required extra guidance'],
+ improvements=['Clarify the first step'],
+ fixes=['Add inline guidance'],
+ other_feedback=[],
+ )
+
+ # First run: navigate then done, with no click or input action.
+ first_history = MagicMock()
+ first_history.final_result.return_value = json.dumps(first_result.model_dump())
+ first_history.model_actions.return_value = [
+ {'navigate': {'url': 'https://example.com'}},
+ {'done': {'success': True}},
+ ]
+ first_history.errors.return_value = []
+ first_history.total_duration_seconds.return_value = 1.0
+ first_history.urls.return_value = ['https://example.com']
+ first_history.screenshot_paths.return_value = []
+
+ # Second run: includes a click before done.
+ second_history = MagicMock()
+ second_history.final_result.return_value = json.dumps(second_result.model_dump())
+ second_history.model_actions.return_value = [
+ {'click': {'index': 1}},
+ {'done': {'success': True}},
+ ]
+ second_history.errors.return_value = []
+ second_history.total_duration_seconds.return_value = 2.0
+ second_history.urls.return_value = ['https://example.com/result']
+ second_history.screenshot_paths.return_value = []
+
+ first_agent = MagicMock()
+ first_agent.tools = MagicMock()
+ first_agent.run = AsyncMock(return_value=first_history)
+ second_agent = MagicMock()
+ second_agent.tools = MagicMock()
+ second_agent.run = AsyncMock(return_value=second_history)
+
+ # side_effect hands out the two agents in order, one per Agent(...) construction.
+ with (
+ patch('murphy.core.execution.Agent', side_effect=[first_agent, second_agent]) as agent_cls,
+ patch('murphy.core.execution.murphy_judge', new_callable=AsyncMock) as judge,
+ patch('murphy.browser.session_utils.prepare_session_for_task', new_callable=AsyncMock),
+ patch('murphy.browser.actions.register_domain_access_action'),
+ patch('murphy.browser.actions.register_refresh_dom_action'),
+ ):
+ result = await _execute_single_test(
+ url='https://example.com',
+ scenario=scenario,
+ llm=MagicMock(),
+ browser_session=MagicMock(),
+ goal='Test objective flow',
+ fixture_paths=None,
+ max_steps=5,
+ index=1,
+ total=1,
+ use_lite=True,
+ )
+
+ assert agent_cls.call_count == 2
+ assert 'stopped before meaningful in-app interaction' in agent_cls.call_args_list[1].kwargs['task']
+ judge.assert_not_awaited()
+ assert result.lite_result == second_result
+ assert result.actions == second_history.model_actions.return_value
+
+
+@pytest.mark.asyncio
+async def test_execute_single_test_lite_mode_does_not_retry_after_meaningful_interaction():
+ """A run with an input_text action before done is accepted as-is: one Agent, no judge call."""
+ scenario = TestScenario(
+ name='Lite objective smoke test',
+ description='Test the objective flow',
+ priority='critical',
+ feature_category='forms',
+ target_feature='Objective flow',
+ test_persona='happy_path',
+ steps_description='Attempt the objective and verify the outcome',
+ success_criteria='Return structured flaws, improvements, fixes, and other observations.',
+ )
+ lite_result = LiteResult(
+ grade=7,
+ flaws=['The flow required extra guidance'],
+ improvements=[],
+ fixes=[],
+ other_feedback=[],
+ )
+ history = MagicMock()
+ history.final_result.return_value = json.dumps(lite_result.model_dump())
+ history.model_actions.return_value = [
+ {'input_text': {'index': 3, 'text': 'test value'}},
+ {'done': {'success': True}},
+ ]
+ history.errors.return_value = []
+ history.total_duration_seconds.return_value = 2.0
+ history.urls.return_value = ['https://example.com/result']
+ history.screenshot_paths.return_value = []
+
+ agent = MagicMock()
+ agent.tools = MagicMock()
+ agent.run = AsyncMock(return_value=history)
+
+ with (
+ patch('murphy.core.execution.Agent', return_value=agent) as agent_cls,
+ patch('murphy.core.execution.murphy_judge', new_callable=AsyncMock) as judge,
+ patch('murphy.browser.session_utils.prepare_session_for_task', new_callable=AsyncMock),
+ patch('murphy.browser.actions.register_domain_access_action'),
+ patch('murphy.browser.actions.register_refresh_dom_action'),
+ ):
+ result = await _execute_single_test(
+ url='https://example.com',
+ scenario=scenario,
+ llm=MagicMock(),
+ browser_session=MagicMock(),
+ goal='Test objective flow',
+ fixture_paths=None,
+ max_steps=5,
+ index=1,
+ total=1,
+ use_lite=True,
+ )
+
+ agent_cls.assert_called_once()
+ judge.assert_not_awaited()
+ assert result.lite_result == lite_result
+
+
+@pytest.mark.asyncio
+async def test_execute_single_test_normal_mode_excludes_write_file_tool():
+ """Non-lite run: write_file is still excluded, the judge is awaited once, and its verdict is stored."""
+ scenario = TestScenario(
+ name='Agent creation',
+ description='Create an agent from the homepage',
+ priority='critical',
+ feature_category='forms',
+ target_feature='Agent creation',
+ test_persona='happy_path',
+ steps_description='Create an agent and verify it appears',
+ success_criteria='The agent exists after creation.',
+ )
+ # In this mode the mocked history's final result is a ScenarioExecutionVerdict serialised as JSON.
+ verdict = ScenarioExecutionVerdict(success=True, reason='Agent was created')
+ history = MagicMock()
+ history.final_result.return_value = json.dumps(verdict.model_dump())
+ history.model_actions.return_value = [{'click': {'index': 1}}, {'done': {'success': True}}]
+ history.errors.return_value = []
+ history.total_duration_seconds.return_value = 4.0
+ history.urls.return_value = ['https://example.com/agents/1']
+ history.screenshot_paths.return_value = []
+
+ agent = MagicMock()
+ agent.tools = MagicMock()
+ agent.run = AsyncMock(return_value=history)
+ judgement = JudgeVerdict(
+ reasoning='Trace shows the agent was created.',
+ verdict=True,
+ failure_reason='',
+ impossible_task=False,
+ reached_captcha=False,
+ failure_category=None,
+ )
+
+ with (
+ patch('murphy.core.execution.Agent', return_value=agent),
+ patch('murphy.core.execution.murphy_judge', new_callable=AsyncMock, return_value=judgement) as judge,
+ patch('murphy.browser.session_utils.prepare_session_for_task', new_callable=AsyncMock),
+ patch('murphy.browser.actions.register_domain_access_action'),
+ patch('murphy.browser.actions.register_refresh_dom_action'),
+ ):
+ # use_lite is not passed here, so the function's default mode is exercised.
+ result = await _execute_single_test(
+ url='https://example.com',
+ scenario=scenario,
+ llm=MagicMock(),
+ browser_session=MagicMock(),
+ goal='Test agent creation flow',
+ fixture_paths=None,
+ max_steps=5,
+ index=1,
+ total=1,
+ )
+
+ agent.tools.exclude_action.assert_any_call('write_file')
+ judge.assert_awaited_once()
+ assert result.success is True
+ assert result.judgement == judgement
diff --git a/tests/murphy/core/test_generation.py b/tests/murphy/core/test_generation.py
index 57fd461b..5f5614ff 100644
--- a/tests/murphy/core/test_generation.py
+++ b/tests/murphy/core/test_generation.py
@@ -7,6 +7,7 @@
from murphy.core.generation import (
_log_plan_summary,
generate_tests,
+ make_lite_plan,
summarize_exploration_from_actions,
)
from murphy.models import Feature, PageInfo, TestPersona, TestPlan, TestScenario, WebsiteAnalysis
@@ -85,7 +86,7 @@ async def test_generate_tests_returns_plan():
assert isinstance(result, TestPlan)
assert len(result.scenarios) == 6
- llm.ainvoke.assert_called_once()
+ assert llm.ainvoke.call_count >= 1
@pytest.mark.asyncio
@@ -149,6 +150,49 @@ async def test_generate_tests_retries_on_empty_plan():
assert llm.ainvoke.call_count == 2
+# ─── make_lite_plan ──────────────────────────────────────────────────────────
+
+
+def test_make_lite_plan_creates_persona_scenarios_without_llm():
+    """A two-test lite plan holds happy_path then confused_novice scenarios that each echo the goal."""
+    lite_plan = make_lite_plan('https://example.com', goal='Test agent creation flow', analysis=_make_analysis(), max_tests=2)
+    assert isinstance(lite_plan, TestPlan)
+    assert len(lite_plan.scenarios) == 2
+    assert [scenario.test_persona for scenario in lite_plan.scenarios] == ['happy_path', 'confused_novice']
+    assert all('Test agent creation flow' in scenario.description for scenario in lite_plan.scenarios)
+    assert all('flaws, improvements, fixes' in scenario.success_criteria for scenario in lite_plan.scenarios)
+
+
+def test_make_lite_plan_interactive_goal_requires_objective_attempt_and_verification():
+    """For an interactive goal the first scenario's steps cover reaching, attempting and verifying the objective."""
+    lite_plan = make_lite_plan('https://example.com', goal='Test agent creation flow', analysis=_make_analysis(), max_tests=1)
+    step_text = lite_plan.scenarios[0].steps_description
+
+    # Phrases are grouped by stage: reach and attempt, interact safely, verify.
+    assert all(phrase in step_text for phrase in ('most plausible in-app route', 'Attempt the objective'))
+    assert all(phrase in step_text for phrase in ('harmless test input', 'Advance or submit only when safe'))
+    assert 'Verify the resulting UI state' in step_text
+
+
+def test_make_lite_plan_state_change_goal_uses_generalized_steps():
+    """A state-change goal yields generic change-and-verify steps with no appearance or theme wording."""
+    lite_plan = make_lite_plan('https://example.com', goal='Test dark mode switching', analysis=_make_analysis(), max_tests=1)
+    step_text = lite_plan.scenarios[0].steps_description
+    assert 'Change the requested state' in step_text
+    assert 'Verify the resulting UI state' in step_text
+    lowered_steps = step_text.lower()
+    assert 'appearance' not in lowered_steps and 'theme control' not in lowered_steps
+
+
+def test_make_lite_plan_uses_analysis_context_when_available():
+    """With goal=None the first scenario targets Search and walks the Browse -> Search flow."""
+    lite_plan = make_lite_plan('https://example.com', goal=None, analysis=_make_analysis(), max_tests=1)
+    first_scenario = lite_plan.scenarios[0]
+
+    assert (first_scenario.target_feature, first_scenario.feature_category) == ('Search', 'search')
+    assert 'Browse -> Search' in first_scenario.steps_description
+
+
# ─── summarize_exploration_from_actions ──────────────────────────────────────
diff --git a/tests/murphy/core/test_quality.py b/tests/murphy/core/test_quality.py
index ce654e78..bbee35f3 100644
--- a/tests/murphy/core/test_quality.py
+++ b/tests/murphy/core/test_quality.py
@@ -13,7 +13,7 @@ def _make_scenario(**overrides) -> TestScenario:
target_feature='Login form',
test_persona='happy_path',
steps_description='1. Navigate to login page\n2. Enter valid email\n3. Enter password\n4. Click submit',
- success_criteria='User is redirected to dashboard and sees confirmation message',
+ success_criteria='Login succeeds: user is redirected to dashboard and sees confirmation message',
)
defaults.update(overrides)
return TestScenario.model_validate(defaults)
diff --git a/tests/murphy/core/test_summary.py b/tests/murphy/core/test_summary.py
index bfa24322..b2ade615 100644
--- a/tests/murphy/core/test_summary.py
+++ b/tests/murphy/core/test_summary.py
@@ -1,7 +1,7 @@
"""Tests for summary building and failure classification."""
from murphy.core.summary import build_summary, classify_failure
-from murphy.models import JudgeVerdict, TestResult, TestScenario
+from murphy.models import JudgeVerdict, LiteResult, TestResult, TestScenario
def _make_scenario(**overrides) -> TestScenario:
@@ -75,6 +75,18 @@ def test_classify_failure_failed_no_judgement():
assert classify_failure(r) == 'test_limitation'
+def test_classify_failure_failed_lite_result_is_plain_failure():
+    """A failed result carrying a LiteResult and no judgement is left unclassified (None)."""
+    lite_feedback = LiteResult(
+        grade=4,
+        flaws=['The create flow is hard to find'],
+        improvements=['Expose a clearer create action'],
+        fixes=['Add a primary Create Agent button'],
+        other_feedback=[],
+    )
+    assert classify_failure(_make_result(success=False, judgement=None, lite_result=lite_feedback)) is None
+
+
# ─── build_summary ────────────────────────────────────────────────────────────
diff --git a/tests/murphy/core/test_summary_extended.py b/tests/murphy/core/test_summary_extended.py
index e54c1e78..c5441f2d 100644
--- a/tests/murphy/core/test_summary_extended.py
+++ b/tests/murphy/core/test_summary_extended.py
@@ -1,5 +1,6 @@
"""Extended tests for summary — generate_executive_summary and write_reports_and_print with mocks."""
+import json
import tempfile
from pathlib import Path
from unittest.mock import AsyncMock, MagicMock
@@ -11,6 +12,7 @@
ExecutiveSummary,
Feature,
JudgeVerdict,
+ LiteResult,
PageInfo,
ReportSummary,
TestResult,
@@ -200,6 +202,41 @@ def test_write_reports_and_print():
assert md_path.exists()
+def test_write_reports_and_print_preserves_lite_result_details():
+    """Lite feedback survives report writing: JSON keeps the grade, Markdown lists grade, flaw, improvement and fix."""
+    site_analysis = _make_analysis()
+    lite_feedback = LiteResult(
+        grade=6,
+        flaws=['Unclear first step'],
+        improvements=['Clarify the first step'],
+        fixes=['Add helper text beside the first field'],
+        other_feedback=['The rest of the flow is understandable'],
+    )
+    lite_run = _make_result(judgement=None, lite_result=lite_feedback, reason='Lite mode grade: 6')
+    expected_markdown = (
+        '**Grade:** 6/10',
+        '- Unclear first step',
+        '- Clarify the first step',
+        '- Add helper text beside the first field',
+    )
+
+    with tempfile.TemporaryDirectory() as tmpdir:
+        report_dir = Path(tmpdir)
+        write_reports_and_print('https://example.com', site_analysis, [lite_run], report_dir)
+
+        json_report = report_dir / 'evaluation_report.json'
+        markdown_report = report_dir / 'evaluation_report.md'
+        assert json_report.exists()
+        assert markdown_report.exists()
+
+        # Both reports are read inside the `with` block, before the temporary directory is removed.
+        report_data = json.loads(json_report.read_text())
+        assert report_data['results'][0]['lite_result']['grade'] == 6
+        markdown = markdown_report.read_text()
+        for expected in expected_markdown:
+            assert expected in markdown
+
+
def test_write_reports_and_print_with_executive_summary():
analysis = _make_analysis()
results = [_make_result()]
diff --git a/tests/murphy/io/test_report.py b/tests/murphy/io/test_report.py
index 04b9d116..6a7c8e3d 100644
--- a/tests/murphy/io/test_report.py
+++ b/tests/murphy/io/test_report.py
@@ -20,6 +20,7 @@
Feature,
FeedbackQualityScore,
JudgeVerdict,
+ LiteResult,
PageInfo,
ReportSummary,
TestResult,
@@ -306,6 +307,55 @@ def test_write_markdown_report_basic():
assert 'Test search' in content
+def test_write_markdown_report_includes_lite_result_details():
+    """The Markdown report shows a lite summary row, every lite section heading, and each feedback item as a bullet."""
+    flaws = [
+        'The primary action is visually disconnected from the form.',
+        'The sidebar creates an oversized left gutter.',
+        'The textarea is wider than the content hierarchy supports.',
+        'Spacing between the label and field is inconsistent.',
+        'The disabled button state lacks a clear explanation.',
+        'The form feels unanchored in the available whitespace.',
+    ]
+    improvements = [
+        'Constrain the form to a readable max width.',
+        'Align the label, field, and button to one column.',
+        'Use consistent vertical spacing tokens.',
+        'Reduce sidebar visual density.',
+        'Explain why the disabled action is unavailable.',
+    ]
+    fixes = [
+        'Set a max-width on the main form container.',
+        'Align the button with the textarea edge.',
+        'Apply shared spacing variables to the form stack.',
+        'Reduce sidebar item padding variance.',
+        'Add helper text for the disabled Next button.',
+    ]
+    other_feedback = ['The visual language is otherwise modern and consistent.']
+    lite_feedback = LiteResult(grade=6, flaws=flaws, improvements=improvements, fixes=fixes, other_feedback=other_feedback)
+    lite_run = _make_result(judgement=None, lite_result=lite_feedback, reason='Lite mode grade: 6')
+    report = _make_report(results=[lite_run])
+    section_markers = (
+        '**Lite Evaluation:**',
+        '**Grade:** 6/10',
+        '**Flaws:**',
+        '**Improvements:**',
+        '**Fixes:**',
+        '**Other feedback:**',
+    )
+
+    with tempfile.TemporaryDirectory() as tmpdir:
+        markdown = write_markdown_report(report, Path(tmpdir)).read_text()
+
+    # The 6 | 5 | 5 cells equal the lengths of the flaws, improvements and fixes lists above.
+    assert '| Test | Persona | Grade | Flaws | Improvements | Fixes | Duration |' in markdown
+    assert '| Test search | Happy Path | 6/10 | 6 | 5 | 5 | 5s |' in markdown
+    for marker in section_markers:
+        assert marker in markdown
+    for item in lite_feedback.flaws + lite_feedback.improvements + lite_feedback.fixes + lite_feedback.other_feedback:
+        assert f'- {item}' in markdown
+
+
def test_write_markdown_report_includes_passed_section():
report = _make_report()
with tempfile.TemporaryDirectory() as tmpdir:
@@ -335,6 +385,43 @@ def test_write_markdown_report_includes_failure_sections():
assert 'Website Issues' in content
+def test_write_markdown_report_failed_lite_result_is_not_test_limitation():
+    """A failed lite run is reported under Failed Tests with its grade, not under Test Limitations."""
+    lite_feedback = LiteResult(
+        grade=4,
+        flaws=['The create flow is hard to find'],
+        improvements=['Expose a clearer create action'],
+        fixes=['Add a primary Create Agent button'],
+        other_feedback=[],
+    )
+    failed_run = _make_result(
+        success=False,
+        judgement=None,
+        lite_result=lite_feedback,
+        failure_category=None,
+        reason='Lite mode grade: 4',
+    )
+    # website_issues and test_limitations are both zero: the single failure sits in neither bucket.
+    summary = ReportSummary(
+        total=1,
+        passed=0,
+        failed=1,
+        pass_rate=0.0,
+        website_issues=0,
+        test_limitations=0,
+        by_priority={'high': {'passed': 0, 'failed': 1}},
+    )
+    report = _make_report(results=[failed_run], summary=summary)
+
+    with tempfile.TemporaryDirectory() as tmpdir:
+        markdown = write_markdown_report(report, Path(tmpdir)).read_text()
+
+    assert '## Failed Tests' in markdown
+    assert '## Test Limitations' not in markdown
+    for expected in ('Lite mode grade: 4', '**Grade:** 4/10'):
+        assert expected in markdown
+
+
def test_write_markdown_report_includes_features_discovered():
report = _make_report()
with tempfile.TemporaryDirectory() as tmpdir:
diff --git a/tests/murphy/test_models.py b/tests/murphy/test_models.py
index c853684c..ce1a89d3 100644
--- a/tests/murphy/test_models.py
+++ b/tests/murphy/test_models.py
@@ -11,6 +11,7 @@
FeedbackQualityScore,
InteractiveElement,
JudgeVerdict,
+ LiteResult,
PageInfo,
ReportSummary,
ScenarioExecutionVerdict,
@@ -122,14 +123,25 @@ def test_trait_vector_extra_forbidden():
def test_persona_registry_completeness():
- expected_personas = {'happy_path', 'confused_novice', 'adversarial', 'edge_case', 'explorer', 'impatient_user', 'angry_user'}
+ expected_personas = {
+ 'happy_path',
+ 'confused_novice',
+ 'adversarial',
+ 'edge_case',
+ 'explorer',
+ 'impatient_user',
+ 'angry_user',
+ 'classic_ui',  # first of three *_ui personas added by this change; the registry must match this set exactly
+ 'modern_ui',
+ 'layout_auditor_ui',
+ }
assert set(PERSONA_REGISTRY.keys()) == expected_personas
def test_persona_registry_values_are_trait_vector_and_test_type():
for persona, (traits, test_type) in PERSONA_REGISTRY.items():
assert isinstance(traits, TraitVector), f'{persona} traits not TraitVector'
- assert test_type in ('ux', 'security', 'boundary'), f'{persona} test_type invalid: {test_type}'
+ assert test_type in ('ux', 'security', 'boundary', 'design'), f'{persona} test_type invalid: {test_type}'  # 'design' newly allowed
# ─── ScenarioExecutionVerdict ─────────────────────────────────────────────────
@@ -157,6 +169,25 @@ def test_scenario_execution_verdict_with_fields():
assert 'Button' in v.reason
+# ─── LiteResult ───────────────────────────────────────────────────────────────
+
+
+def test_lite_result_requires_grade_between_one_and_ten():
+    """`grade` is accepted inside 1-10 and rejected just outside either end of that range."""
+    result = LiteResult(
+        grade=8,
+        flaws=['Agent creation has unclear validation errors'],
+        improvements=['Add clearer progress feedback'],
+        fixes=['Show inline errors next to required fields'],
+        other_feedback=['The flow is discoverable from the dashboard'],
+    )
+    assert result.grade == 8
+    assert result.flaws == ['Agent creation has unclear validation errors']
+    for out_of_range in (0, 11):  # 0 guards the lower bound, 11 the upper bound
+        with pytest.raises(ValidationError):
+            LiteResult(grade=out_of_range, flaws=[], improvements=[], fixes=[], other_feedback=[])
+
+
# ─── TestScenario ─────────────────────────────────────────────────────────────
@@ -186,9 +217,10 @@ def test_test_scenario_rejects_invalid_priority():
_make_scenario(priority='urgent')
-def test_test_scenario_rejects_invalid_persona():
- with pytest.raises(ValidationError):
- _make_scenario(test_persona='robot')
+def test_test_scenario_accepts_custom_persona_names():
+    """A persona name outside the built-in registry is accepted and stored unchanged."""
+    custom_persona = 'enterprise_admin'
+    assert _make_scenario(test_persona=custom_persona).test_persona == custom_persona
def test_test_scenario_rejects_invalid_category():
@@ -279,6 +311,22 @@ def test_test_result_nullable_success():
assert r.success is None
+def test_test_result_serializes_lite_result():
+    """Dumping a result that carries a LiteResult nests its fields under the 'lite_result' key."""
+    lite_feedback = LiteResult(
+        grade=6,
+        flaws=['The create button is hard to find'],
+        improvements=['Add onboarding copy'],
+        fixes=['Make the create button primary'],
+        other_feedback=['Navigation is otherwise clear'],
+    )
+    lite_run = _make_result(judgement=None, lite_result=lite_feedback, reason='Lite mode grade: 6')
+
+    serialized_lite = lite_run.model_dump()['lite_result']
+    assert serialized_lite['grade'] == 6
+    assert serialized_lite['flaws'] == ['The create button is hard to find']
+
+
# ─── WebsiteAnalysis ─────────────────────────────────────────────────────────
diff --git a/tests/murphy/test_prompts.py b/tests/murphy/test_prompts.py
index 6edaf14d..90935dba 100644
--- a/tests/murphy/test_prompts.py
+++ b/tests/murphy/test_prompts.py
@@ -7,6 +7,7 @@
build_analysis_prompt,
build_execution_prompt,
build_exploration_prompt,
+ build_lite_prompt,
build_plan_synthesis_prompt,
build_test_generation_prompt,
build_test_generation_system_message,
@@ -176,6 +177,51 @@ def test_execution_prompt_confused_novice_persona():
assert 'confused_novice' in prompt
+# ─── build_lite_prompt ───────────────────────────────────────────────────────
+
+
+def test_lite_prompt_requests_structured_lite_fields():
+    """The lite prompt names LiteResult and each of its feedback lists, and never mentions SKIP."""
+    persona_scenario = _make_scenario(test_persona='happy_path')
+    lite_prompt = build_lite_prompt(persona_scenario, 'https://example.com', analysis=_make_analysis())
+
+    assert 'LiteResult' in lite_prompt
+    for field_name in ('flaws', 'improvements', 'fixes', 'other_feedback'):
+        assert field_name in lite_prompt
+
+    assert 'SKIP' not in lite_prompt
+
+
+def test_lite_prompt_reuses_persona_context():
+    """A confused_novice scenario puts the persona name, its first-time-user framing and the domain rule in the prompt."""
+    novice_scenario = _make_scenario(test_persona='confused_novice')
+    lite_prompt = build_lite_prompt(novice_scenario, 'https://example.com')
+
+    for expected in ('confused_novice', 'first-time user', 'Stay on the same domain'):
+        assert expected in lite_prompt
+
+
+def test_lite_prompt_requires_objective_attempt_before_result():
+    """The lite prompt demands an attempt at the objective and omits the early-stop wording in any letter case."""
+    scenario = _make_scenario(test_persona='happy_path')
+    prompt = build_lite_prompt(scenario, 'https://example.com')
+    assert 'fast objective-driven website test' in prompt
+    assert 'attempt the stated objective' in prompt
+    assert 'Terminal states' in prompt
+    assert 'stop as soon as you have enough evidence' not in prompt.lower()  # lowercased so a capitalised "Stop ..." is caught
+
+
+def test_lite_prompt_keeps_objective_rules_generalized():
+    """The lite prompt keeps its form guidance generic and names no scenario-specific feature, in any letter case."""
+    persona_scenario = _make_scenario(test_persona='happy_path')
+    lite_prompt = build_lite_prompt(persona_scenario, 'https://example.com')
+    lowered_prompt = lite_prompt.lower()
+
+    for scenario_specific in ('create agent', 'dark mode', 'theme control'):
+        assert scenario_specific not in lowered_prompt
+    assert 'ordinary app forms needed for the objective' in lite_prompt
+
+
# ─── _build_persona_distribution_text ─────────────────────────────────────────