diff --git a/.gitignore b/.gitignore index 5af6f509..d69e7943 100644 --- a/.gitignore +++ b/.gitignore @@ -16,6 +16,7 @@ venv/ # IDEs .vscode/ .idea/ +.claude/ # Build files dist/ diff --git a/CHANGELOG.md b/CHANGELOG.md index 20f09e66..fcdc904a 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -4,6 +4,18 @@ All notable changes to Murphy will be documented in this file. The format is based on [Keep a Changelog](https://keepachangelog.com/en/1.1.0/). +## [1.2.0] - 2026-06-02 + +### Added +- Lite mode (`--lite` CLI flag / `lite: true` in the REST API) for a faster, simpler run aimed at quick product feedback: Murphy builds a compact persona plan directly from the goal or available analysis, then runs a lighter browser-agent prompt +- Lite runs return a structured `LiteResult` per scenario with a 1–10 `grade` plus `flaws`, `improvements`, `fixes`, and `other_feedback`, summarised in a dedicated terminal output +- `lite` field on the `/generate-plan`, `/evaluate`, and `/execute` REST API request models +- `LITE_MODE.md` documentation describing the mode, what it skips, and how to run it + +### Changed +- Lite mode skips LLM test generation, the Murphy judge, full JSON/Markdown report generation, and interactive review pauses +- Disabled the unused `write_file` tool in Murphy runs + ## [1.1.0] - 2026-04-07 ### Added diff --git a/LITE_MODE.md b/LITE_MODE.md new file mode 100644 index 00000000..1c4eda01 --- /dev/null +++ b/LITE_MODE.md @@ -0,0 +1,54 @@ +# Murphy Lite Mode + +Lite mode is a faster, simpler Murphy run for quick product feedback. It is enabled with `--lite` in the CLI or `lite: true` in the REST API. + +## What It Skips + +- LLM test generation +- Interactive feature and test-plan review pauses +- Murphy judge calls +- Full JSON and Markdown report generation + +## What It Returns + +Each scenario returns a structured `LiteResult`: + +- `grade`: 1-10 overall experience score +- `flaws`: concrete problems or blockers +- `improvements`: product or UX improvements +- `fixes`: implementation-level fixes +- `other_feedback`: additional useful observations + +## CLI + +```bash +uv run murphy --url https://example.com --goal "Test agent creation flow" --lite +``` + +You can still use `--max-tests`, `--parallel`, `--provider`, `--model`, `--auth`, `--no-auth`, `--features`, and `--plan`. + +## REST + +Set `lite: true` on `/generate-plan`, `/evaluate`, or `/execute`. + +```json +{ + "url": "https://example.com", + "goal": "Test agent creation flow", + "max_tests": 1, + "lite": true +} +``` + +## Speed Experiment + +Use the manual experiment runner: + +```bash +uv run python exp_2/lite_speed/run_compare.py \ + --url https://work.toqan.ai \ + --goal "Test agent creation flow" \ + --max-tests 1 \ + --parallel 1 \ + --repetitions 1 +``` diff --git a/README.md b/README.md index f75c8d57..e27e3a83 100644 --- a/README.md +++ b/README.md @@ -78,6 +78,9 @@ uv run murphy --url https://example.com # Goal-directed: explores with focus, skips feature discovery, generates plan directly uv run murphy --url https://example.com --goal "test the checkout flow" +# Lite mode: faster, simpler run that skips test generation, judge, and reports +uv run murphy --url https://example.com --goal "test the checkout flow" --lite + # Site requires login — opens browser for manual auth first (local only, not Docker) uv run murphy --url https://example.com --auth @@ -115,6 +118,8 @@ An AI agent explores the site with the given goal in mind, then synthesizes a te **Execution (both strategies):** An AI agent runs each test scenario in a real browser, and a separate judge LLM evaluates pass/fail. Saves `evaluation_report.json` and `evaluation_report.md`. +**Lite mode (`--lite`):** Murphy builds a compact persona plan directly from the goal or available analysis, then runs a lighter browser-agent prompt that returns structured `flaws`, `improvements`, `fixes`, and `other_feedback`. It skips LLM test generation, the judge, report generation, and interactive review pauses. + You can resume from any point by passing `--features` or `--plan` with a previously generated (and optionally edited) file. ## Output diff --git a/murphy/api/cli.py b/murphy/api/cli.py index 2d5db9a0..52c7da67 100644 --- a/murphy/api/cli.py +++ b/murphy/api/cli.py @@ -7,6 +7,7 @@ murphy --url https://example.com --features features.md # skip analysis, load features from file murphy --url https://example.com --plan plan.yaml # skip analysis + generation, load test plan murphy --url https://example.com --goal "test the checkout flow" + murphy --url https://example.com --goal "test the checkout flow" --lite """ from __future__ import annotations @@ -26,7 +27,7 @@ if TYPE_CHECKING: from murphy.api.server import ServerState - from murphy.models import TestPlan, TestResult + from murphy.models import TestPlan, TestResult, TokenUsage, WebsiteAnalysis load_dotenv() @@ -40,6 +41,35 @@ ) +def _write_reports_and_log_results( + url: str, + analysis: WebsiteAnalysis | None, + results: list[TestResult], + output_dir: Path, + *, + use_lite: bool, + persona_discovery_tokens: TokenUsage | None = None, + murphy_tokens: TokenUsage | None = None, +) -> None: + """Write report artifacts when analysis context exists, then log the mode-specific terminal summary.""" + if analysis: + from murphy.core.summary import write_reports_and_print + + write_reports_and_print( + url, + analysis, + results, + output_dir, + persona_discovery_tokens=persona_discovery_tokens, + murphy_tokens=murphy_tokens, + ) + elif not use_lite: + _log_results_summary(results) + + if use_lite: + _log_lite_summary(results) + + def main() -> int: parser = argparse.ArgumentParser( prog='murphy', @@ -51,6 +81,7 @@ def main() -> int: parser.add_argument('--no-auth', action='store_true', help='Skip auth detection entirely, treat site as public') parser.add_argument('--features', help='Path to existing features markdown (skips analysis, goes to test generation)') parser.add_argument('--plan', help='Path to existing YAML test plan (skips analysis + test generation)') + parser.add_argument('--lite', action='store_true', help='Run faster lite mode: skip test generation and judge') parser.add_argument('--max-tests', type=int, default=None, help='Max test scenarios (default: number of personas)') parser.add_argument( '--provider', default='openai', help='LLM provider (default: openai). e.g. google, anthropic, azure, mistral' @@ -119,7 +150,7 @@ async def _async_main(args: argparse.Namespace) -> None: from murphy.browser.patches import apply as apply_patches from murphy.core.analysis import analyze_website from murphy.core.execution import execute_tests_with_session - from murphy.core.generation import explore_and_generate_plan, generate_tests + from murphy.core.generation import explore_and_generate_plan, generate_tests, make_lite_plan from murphy.core.summary import build_summary, write_reports_and_print from murphy.io.features_io import read_features_markdown, write_features_markdown from murphy.io.fixtures import ensure_dummy_fixture_files @@ -222,6 +253,7 @@ async def _async_main(args: argparse.Namespace) -> None: # ── Phase 1–2: Discover features & generate plan ── use_exploration_first = bool(args.goal and not args.features and not args.plan) + use_lite = bool(args.lite) if args.plan: # Skip both analysis and test generation @@ -231,6 +263,27 @@ async def _async_main(args: argparse.Namespace) -> None: if url != args.url: logger.warning('Plan URL (%s) differs from --url (%s). Using --url.', url, args.url) logger.info('Loaded %d scenarios from %s', len(test_plan.scenarios), plan_path) + elif use_lite: + if args.features: + features_path = Path(args.features) + assert features_path.exists(), f'Features file not found: {features_path}' + analysis = read_features_markdown(features_path) + logger.info('Loaded %d features from %s', len(analysis.features), features_path) + elif not args.goal: + analysis = await analyze_website(args.url, llm, goal=args.goal, browser_session=browser_session) + features_path = write_features_markdown(analysis, output_dir) + logger.info('\n Features saved: %s', features_path) + + test_plan = make_lite_plan( + args.url, + goal=args.goal, + analysis=analysis, + max_tests=args.max_tests, + discovered_personas=discovered_personas, + ) + plan_path = save_test_plan(args.url, test_plan, output_dir) + logger.info('\n Lite plan saved: %s', plan_path) + logger.info(' Using %d lite scenarios.\n', len(test_plan.scenarios)) elif use_exploration_first: # Exploration-first path: explore → summarize → synthesize plan test_plan = await explore_and_generate_plan( @@ -350,18 +403,18 @@ def _on_test_complete(results: list[TestResult]) -> None: judge_llm=judge_llm, output_dir=output_dir, discovered_personas=discovered_personas, + use_lite=use_lite, + analysis=analysis, + ) + _write_reports_and_log_results( + args.url, + analysis, + results, + output_dir, + use_lite=use_lite, + persona_discovery_tokens=persona_discovery_tokens, + murphy_tokens=_get_murphy_tokens(), ) - if analysis: - write_reports_and_print( - args.url, - analysis, - results, - output_dir, - persona_discovery_tokens=persona_discovery_tokens, - murphy_tokens=_get_murphy_tokens(), - ) - else: - _log_results_summary(results) return # ── Server UI mode (--ui) ── @@ -384,6 +437,8 @@ async def _execute_fn(plan: TestPlan, state: ServerState) -> list[TestResult]: judge_llm=judge_llm, output_dir=output_dir, discovered_personas=discovered_personas, + use_lite=use_lite, + analysis=analysis, ) state = ServerState( @@ -402,17 +457,15 @@ async def _execute_fn(plan: TestPlan, state: ServerState) -> list[TestResult]: while True: await asyncio.sleep(1) if state.done and state.results and not getattr(state, '_reports_written', False): - if analysis: - write_reports_and_print( - args.url, - analysis, - state.results, - output_dir, - persona_discovery_tokens=persona_discovery_tokens, - murphy_tokens=_get_murphy_tokens(), - ) - else: - _log_results_summary(state.results) + _write_reports_and_log_results( + args.url, + analysis, + state.results, + output_dir, + use_lite=use_lite, + persona_discovery_tokens=persona_discovery_tokens, + murphy_tokens=_get_murphy_tokens(), + ) state._reports_written = True # type: ignore[attr-defined] except KeyboardInterrupt: pass @@ -467,5 +520,24 @@ def _log_results_summary(results: list[TestResult]) -> None: logger.info('\n Pass rate: %s%% (%d/%d)', summary.pass_rate, summary.passed, summary.total) +def _log_lite_summary(results: list[TestResult]) -> None: + logger.info('\n%s', '=' * 60) + logger.info('Lite Mode Complete — %d scenario(s)', len(results)) + logger.info('%s', '=' * 60) + for result in results: + lite_result = result.lite_result + if lite_result is None: + logger.info(' [%s] no lite result — %s', result.scenario.test_persona, result.reason) + continue + logger.info( + ' [%s] grade=%d flaws=%d improvements=%d fixes=%d', + result.scenario.test_persona, + lite_result.grade, + len(lite_result.flaws), + len(lite_result.improvements), + len(lite_result.fixes), + ) + + if __name__ == '__main__': sys.exit(main()) diff --git a/murphy/api/request_models.py b/murphy/api/request_models.py index 497511a9..c9abe31e 100644 --- a/murphy/api/request_models.py +++ b/murphy/api/request_models.py @@ -36,6 +36,7 @@ class GeneratePlanRequest(BaseModel): analysis: Annotated[WebsiteAnalysis, BeforeValidator(_parse_json_string)] max_tests: int = 8 goal: str | None = None + lite: bool = False provider: str = 'openai' model: str = 'gpt-5-mini' webhook_url: str | None = None @@ -55,6 +56,7 @@ class ExecuteRequest(BaseModel): judge_model: str | None = None max_steps: int = 15 max_concurrent: int = 3 + lite: bool = False webhook_url: str | None = None async_mode: bool = Field(False, alias='async') @@ -65,6 +67,7 @@ class EvaluateRequest(BaseModel): url: str goal: str | None = None max_tests: int = 8 + lite: bool = False provider: str = 'openai' model: str = 'gpt-5-mini' judge_provider: str | None = None diff --git a/murphy/api/rest.py b/murphy/api/rest.py index 3d2c6d8a..c111f255 100644 --- a/murphy/api/rest.py +++ b/murphy/api/rest.py @@ -70,7 +70,7 @@ async def _core_generate_plan(req: GeneratePlanRequest) -> dict[str, Any]: from murphy.core.pipeline import run_generate_plan test_plan = await run_generate_plan( - req.url, req.analysis, req.model, provider=req.provider, max_tests=req.max_tests, goal=req.goal + req.url, req.analysis, req.model, provider=req.provider, max_tests=req.max_tests, goal=req.goal, lite=req.lite ) return test_plan.model_dump() @@ -101,6 +101,7 @@ async def _core_execute(req: ExecuteRequest) -> dict[str, Any]: goal=req.goal, max_steps=req.max_steps, max_concurrent=req.max_concurrent, + lite=req.lite, ) return ExecuteResult(results=results, summary=summary).model_dump() @@ -109,7 +110,9 @@ async def _core_evaluate(req: EvaluateRequest) -> dict[str, Any]: """Run exploration-first evaluation: explore site → generate test plan.""" from murphy.core.pipeline import run_evaluate - test_plan = await run_evaluate(req.url, req.model, provider=req.provider, max_tests=req.max_tests, goal=req.goal) + test_plan = await run_evaluate( + req.url, req.model, provider=req.provider, max_tests=req.max_tests, goal=req.goal, lite=req.lite + ) return test_plan.model_dump() diff --git a/murphy/api/templates.py b/murphy/api/templates.py index 8d1baf18..3f516313 100644 --- a/murphy/api/templates.py +++ b/murphy/api/templates.py @@ -45,6 +45,7 @@ .badge-medium { background: var(--text); color: #fff; } .badge-low { background: var(--gray); color: #fff; } .badge-pass { background: var(--green); color: #fff; } +.badge-fail { background: var(--red); color: #fff; } .badge-fail-website { background: var(--red); color: #fff; } .badge-fail-test { background: var(--orange); color: #fff; } .test-name { font-weight: 600; flex: 1; font-size: .95rem; } @@ -381,6 +382,7 @@ def render_results_html( sections = [ ('Passed', [r for r in results if r.success]), + ('Failed', [r for r in results if r.success is not True and r.failure_category is None]), ('Failed — Website Issue', [r for r in results if r.failure_category == 'website_issue']), ('Failed — Test Limitation', [r for r in results if r.failure_category == 'test_limitation']), ] @@ -395,6 +397,9 @@ def render_results_html( if r.success: badge_cls = 'badge-pass' badge_text = 'PASS' + elif r.failure_category is None: + badge_cls = 'badge-fail' + badge_text = 'FAILED' elif r.failure_category == 'website_issue': badge_cls = 'badge-fail-website' badge_text = 'WEBSITE ISSUE' diff --git a/murphy/browser/cleanup.py b/murphy/browser/cleanup.py index e92209a4..07370891 100644 --- a/murphy/browser/cleanup.py +++ b/murphy/browser/cleanup.py @@ -140,6 +140,8 @@ def _find_stale_browser_pids() -> list[int]: if not cmdline: continue if any(marker in cmdline for marker in profile_markers) and any(marker in cmdline for marker in browser_markers): - pids.add(proc.pid) + pid = proc.info.get('pid') + if isinstance(pid, int): + pids.add(pid) return sorted(pids) diff --git a/murphy/core/execution.py b/murphy/core/execution.py index f891701a..64b569fb 100644 --- a/murphy/core/execution.py +++ b/murphy/core/execution.py @@ -17,13 +17,15 @@ from murphy.core.summary import classify_failure from murphy.io.report_helpers import _slugify from murphy.models import ( + LiteResult, ScenarioExecutionVerdict, TestPlan, TestResult, TestScenario, + WebsiteAnalysis, ) from murphy.personas.pipeline_models import PersonaResult, TraitSchema -from murphy.prompts import build_execution_prompt +from murphy.prompts import build_execution_prompt, build_lite_prompt logger = logging.getLogger(__name__) @@ -32,9 +34,7 @@ # ─── Structured output parsing ──────────────────────────────────────────────── -def _parse_structured_output( - history: AgentHistoryList, model_cls: type[ScenarioExecutionVerdict] -) -> ScenarioExecutionVerdict | None: +def _parse_structured_output(history: AgentHistoryList, model_cls: type[Any]) -> Any | None: """Safely parse structured output from agent history.""" result = history.final_result() if not result: @@ -89,6 +89,92 @@ def _extract_urls_from_texts(texts: list[str]) -> list[str]: return urls +_INTERACTIVE_SCENARIO_KEYWORDS = ( + 'add', + 'book', + 'buy', + 'change', + 'checkout', + 'complete', + 'configure', + 'create', + 'delete', + 'disable', + 'download', + 'edit', + 'enable', + 'filter', + 'login', + 'order', + 'purchase', + 'save', + 'search', + 'select', + 'send', + 'set up', + 'setup', + 'sign in', + 'sign up', + 'submit', + 'switch', + 'test', + 'toggle', + 'try', + 'update', + 'upload', + 'use', +) + +_MEANINGFUL_LITE_ACTIONS = { + 'click', + 'click_element', + 'drag_drop', + 'input', + 'input_text', + 'press_key', + 'select_dropdown_option', + 'send_keys', + 'upload_file', +} + + +def _lite_scenario_requires_interaction(scenario: TestScenario) -> bool: + """Return True when a lite scenario describes an interactive objective.""" + scenario_text = ' '.join( + [ + scenario.name, + scenario.description, + scenario.target_feature, + scenario.steps_description, + scenario.success_criteria, + ] + ).lower() + return any(keyword in scenario_text for keyword in _INTERACTIVE_SCENARIO_KEYWORDS) + + +def _has_meaningful_lite_interaction(actions: list[dict[str, Any]]) -> bool: + """Return True when actions include an in-app interaction beyond navigation/inspection.""" + for action in actions: + for key in action: + if key == 'interacted_element': + continue + if key in _MEANINGFUL_LITE_ACTIONS: + return True + return False + + +def _build_lite_retry_prompt(task_prompt: str) -> str: + """Append a one-shot continuation instruction for premature lite completions.""" + return ( + task_prompt + + '\n\nRETRY REQUIRED:\n' + + 'Your previous lite attempt stopped before meaningful in-app interaction. ' + + 'Continue the objective now. You must use the most plausible in-app controls before returning LiteResult, ' + + 'unless blocked by login, captcha, missing permissions, destructive/payment action, ' + + 'support/contact/feedback form, external-domain route, or no plausible route after two in-app paths.' + ) + + async def _collect_session_urls(browser_session: BrowserSession) -> list[str]: """Collect current + historical tab URLs from browser session.""" urls: list[str] = [] @@ -103,6 +189,31 @@ async def _collect_session_urls(browser_session: BrowserSession) -> list[str]: return urls +def _save_agent_history( + history: AgentHistoryList, + scenario: TestScenario, + index: int, + output_dir: Path | None, +) -> None: + """Persist full browser-use history for UI trace and graph views.""" + if output_dir is None: + return + + slug = _slugify(scenario.name) + history_path = output_dir / 'agent_history' / f'test_{index:02d}_{slug}.json' + try: + history_path.parent.mkdir(parents=True, exist_ok=True) + history.save_to_file(history_path) + logger.debug(' Agent history saved: %s', history_path) + except Exception as e: + logger.warning(' Failed to save agent history: %s', e) + + +def _disable_unused_murphy_actions(agent: Agent) -> None: + """Remove browser-use tools that Murphy does not consume.""" + agent.tools.exclude_action('write_file') + + # ─── Single-test execution helper ────────────────────────────────────────────── @@ -119,6 +230,8 @@ async def _execute_single_test( judge_llm: BaseChatModel | None = None, discovered_personas: tuple['PersonaResult', 'TraitSchema'] | None = None, output_dir: Path | None = None, + use_lite: bool = False, + analysis: WebsiteAnalysis | None = None, ) -> TestResult: """Execute one test scenario and return its TestResult. @@ -134,6 +247,77 @@ async def _execute_single_test( await prepare_session_for_task(browser_session, url, force_navigate=True) file_paths_str = [str(p) for p in fixture_paths] if fixture_paths else [] + + if use_lite: + task_prompt = build_lite_prompt( + scenario, + url, + analysis=analysis, + discovered_personas=discovered_personas, + ) + + async def _run_lite_agent(prompt: str) -> AgentHistoryList: + agent_kwargs: dict[str, Any] = { + 'task': prompt, + 'llm': llm, + 'browser_session': browser_session, + 'use_judge': False, + 'max_actions_per_step': 3, + 'output_model_schema': LiteResult, + } + agent = Agent(**agent_kwargs) + _disable_unused_murphy_actions(agent) + register_domain_access_action(agent.tools, browser_session) + register_refresh_dom_action(agent.tools, browser_session) + return await agent.run(max_steps=max_steps) + + history = await _run_lite_agent(task_prompt) + all_actions = history.model_actions() + if _lite_scenario_requires_interaction(scenario) and not _has_meaningful_lite_interaction(all_actions): + logger.info(' Lite run stopped before meaningful interaction; retrying once with stricter objective guidance.') + history = await _run_lite_agent(_build_lite_retry_prompt(task_prompt)) + + _save_agent_history(history, scenario, index, output_dir) + lite_result = _parse_structured_output(history, LiteResult) + if lite_result is None: + lite_result = LiteResult( + grade=5, + flaws=['The agent did not return structured lite output.'], + improvements=['Retry the lite run or use normal Murphy for a judged report.'], + fixes=[], + other_feedback=[], + ) + + all_actions = history.model_actions() + errors = history.errors() + history_urls = [u for u in history.urls() if u] + session_urls = await _collect_session_urls(browser_session) + error_urls = _extract_urls_from_texts([e for e in errors if e]) + seen_urls: set[str] = set() + unique_pages: list[str] = [] + for page_url in history_urls + session_urls + error_urls: + if page_url not in seen_urls: + seen_urls.add(page_url) + unique_pages.append(page_url) + + success = lite_result.grade >= 5 + logger.info(' Lite result: grade=%d (%.1fs)', lite_result.grade, history.total_duration_seconds()) + test_result = TestResult( + scenario=scenario, + success=success, + judgement=None, + actions=all_actions, + errors=errors, + duration=history.total_duration_seconds(), + pages_visited=unique_pages, + screenshot_paths=[p for p in history.screenshot_paths() if p], + form_fills=_extract_form_fills(all_actions), + reason=f'Lite mode grade: {lite_result.grade}', + lite_result=lite_result, + ) + test_result.failure_category = classify_failure(test_result) + return test_result + task_prompt = build_execution_prompt( goal or f'Evaluate {url}', scenario, @@ -156,6 +340,7 @@ async def _execute_single_test( agent_kwargs['output_model_schema'] = ScenarioExecutionVerdict agent = Agent(**agent_kwargs) + _disable_unused_murphy_actions(agent) # Register custom actions register_domain_access_action(agent.tools, browser_session) register_refresh_dom_action(agent.tools, browser_session) @@ -210,15 +395,7 @@ async def _execute_single_test( seen_urls.add(p) unique_pages.append(p) - # Save full browser-use history to output/agent_history/ when output_dir is set - if output_dir is not None: - slug = _slugify(scenario.name) - history_path = output_dir / 'agent_history' / f'test_{index:02d}_{slug}.json' - try: - history.save_to_file(history_path) - logger.debug(' Agent history saved: %s', history_path) - except Exception as e: - logger.warning(' Failed to save agent history: %s', e) + _save_agent_history(history, scenario, index, output_dir) test_result = TestResult( scenario=scenario, @@ -379,6 +556,8 @@ async def execute_tests( judge_llm: BaseChatModel | None = None, output_dir: Path | None = None, discovered_personas: tuple['PersonaResult', 'TraitSchema'] | None = None, + use_lite: bool = False, + analysis: WebsiteAnalysis | None = None, ) -> list[TestResult]: """Execute tests without a pre-existing session (creates its own).""" from browser_use.browser.profile import BrowserProfile @@ -398,6 +577,8 @@ async def execute_tests( judge_llm=judge_llm, output_dir=output_dir, discovered_personas=discovered_personas, + use_lite=use_lite, + analysis=analysis, ) finally: await browser_session.kill() @@ -417,6 +598,8 @@ async def execute_tests_with_session( judge_llm: BaseChatModel | None = None, output_dir: Path | None = None, discovered_personas: tuple['PersonaResult', 'TraitSchema'] | None = None, + use_lite: bool = False, + analysis: WebsiteAnalysis | None = None, ) -> list[TestResult]: """Phase 3 execution reusing an existing browser session. @@ -458,6 +641,8 @@ async def execute_tests_with_session( judge_llm=judge_llm, output_dir=output_dir, discovered_personas=discovered_personas, + use_lite=use_lite, + analysis=analysis, ) results.append(test_result) @@ -507,6 +692,8 @@ async def _run_one(index_0: int, scenario: TestScenario) -> None: judge_llm=judge_llm, output_dir=output_dir, discovered_personas=discovered_personas, + use_lite=use_lite, + analysis=analysis, ) results_slots[index_0] = result diff --git a/murphy/core/generation.py b/murphy/core/generation.py index 4ecdfb20..69e22b46 100644 --- a/murphy/core/generation.py +++ b/murphy/core/generation.py @@ -10,7 +10,7 @@ from murphy.browser.session_utils import prepare_session_for_task from murphy.config import EXPLORE_MAX_STEPS, QUALITY_MAX_RETRIES from murphy.core.quality import plan_quality_issues -from murphy.models import PERSONA_REGISTRY, TestPlan +from murphy.models import PERSONA_REGISTRY, TestPlan, TestScenario, WebsiteAnalysis from murphy.personas.bridge import get_discovered_persona_names from murphy.personas.pipeline_models import PersonaResult, TraitSchema from murphy.prompts import ( @@ -23,6 +23,196 @@ logger = logging.getLogger(__name__) +_INTERACTIVE_GOAL_KEYWORDS = ( + 'add', + 'book', + 'buy', + 'change', + 'checkout', + 'complete', + 'configure', + 'create', + 'delete', + 'disable', + 'download', + 'edit', + 'enable', + 'filter', + 'login', + 'order', + 'purchase', + 'save', + 'search', + 'select', + 'send', + 'set up', + 'setup', + 'sign in', + 'sign up', + 'submit', + 'switch', + 'test', + 'toggle', + 'try', + 'update', + 'upload', + 'use', +) + +_STATE_CHANGE_GOAL_KEYWORDS = ( + 'change', + 'disable', + 'enable', + 'mode', + 'preference', + 'setting', + 'switch', + 'toggle', + 'turn off', + 'turn on', +) + +_CREATION_OR_SUBMISSION_GOAL_KEYWORDS = ( + 'add', + 'book', + 'buy', + 'checkout', + 'complete', + 'configure', + 'create', + 'creation', + 'new', + 'order', + 'purchase', + 'save', + 'send', + 'set up', + 'setup', + 'sign up', + 'submit', + 'upload', +) + + +def _matches_any_keyword(text: str, keywords: tuple[str, ...]) -> bool: + """Return True if any objective keyword appears in text.""" + normalized = text.lower() + return any(keyword in normalized for keyword in keywords) + + +def _lite_goal_requires_interaction(task: str) -> bool: + """Whether a lite objective should require at least one in-app interaction.""" + return _matches_any_keyword(task, _INTERACTIVE_GOAL_KEYWORDS) + + +def _lite_goal_is_state_change(task: str) -> bool: + """Whether a lite objective is primarily about changing UI or app state.""" + return _matches_any_keyword(task, _STATE_CHANGE_GOAL_KEYWORDS) + + +def _lite_goal_is_creation_or_submission(task: str) -> bool: + """Whether a lite objective likely needs safe test input and advancement.""" + return _matches_any_keyword(task, _CREATION_OR_SUBMISSION_GOAL_KEYWORDS) + + +def _build_lite_objective_steps(url: str, task: str) -> str: + """Build generalized objective-driven steps for lite mode.""" + steps = [ + f'Complete this objective on {url}: {task}.', + 'Minimum required path:', + '1. Locate the most plausible in-app route for the objective. If a control is ambiguous but plausibly relevant, try it and report the ambiguity afterward.', + ] + + if _lite_goal_is_creation_or_submission(task): + steps.extend( + [ + '2. Attempt the objective by initiating the route, providing harmless test input only for fields required to continue, and advancing one step at a time.', + '3. Advance or submit only when safe; stop before destructive, payment, external-domain, or support/contact/feedback actions unless the objective explicitly requires reporting that blocker.', + ] + ) + elif _lite_goal_is_state_change(task): + steps.extend( + [ + '2. Change the requested state using the most plausible in-app control or setting.', + '3. Check whether the requested state is reflected in the visible UI or remains in effect after a simple in-app navigation or refresh when safe.', + ] + ) + elif _lite_goal_requires_interaction(task): + steps.extend( + [ + '2. Attempt the objective through the most plausible in-app interaction rather than stopping at observation.', + '3. Continue until the objective is completed, blocked, or objectively unavailable.', + ] + ) + else: + steps.extend( + [ + '2. Inspect the experience through the persona lens and interact with any clearly relevant in-app controls if they are needed to evaluate the objective.', + '3. Stop when you have concrete observed evidence for the objective.', + ] + ) + + steps.extend( + [ + '4. Verify the resulting UI state using visible evidence such as confirmation, validation, changed state, blocked state, persisted state, or clear absence of a plausible route.', + '5. Return concise lite output with flaws, improvements, fixes, and other observations grounded in what you attempted and observed.', + ] + ) + return '\n'.join(steps) + + +def make_lite_plan( + url: str, + goal: str | None = None, + analysis: WebsiteAnalysis | None = None, + max_tests: int | None = None, + discovered_personas: tuple[PersonaResult, TraitSchema] | None = None, +) -> TestPlan: + """Create a compact lite-mode plan without an LLM generation call.""" + if discovered_personas: + personas = get_discovered_persona_names(discovered_personas[0]) + else: + personas = list(PERSONA_REGISTRY.keys()) + if max_tests is not None: + personas = personas[:max_tests] + + core_features = [f for f in (analysis.features if analysis else []) if f.importance == 'core'] + testable_features = [f for f in (analysis.features if analysis else []) if f.testability in ('testable', 'partial')] + primary_feature = (core_features or testable_features)[0] if (core_features or testable_features) else None + target_feature = primary_feature.name if primary_feature else (goal or 'overall site experience') + feature_category = primary_feature.category if primary_feature else 'other' + site_name = analysis.site_name if analysis else url + task = goal or f'Evaluate {site_name}' + + steps_parts = [_build_lite_objective_steps(url, task)] + if analysis and analysis.identified_user_flows: + steps_parts.append('Relevant user flows:\n' + '\n'.join(f'- {flow}' for flow in analysis.identified_user_flows)) + if core_features: + steps_parts.append('Core features:\n' + '\n'.join(f'- {feature.name}' for feature in core_features)) + steps_description = '\n\n'.join(steps_parts) + + scenarios: list[TestScenario] = [] + for index, persona in enumerate(personas): + priority = 'critical' if index == 0 else 'high' + scenarios.append( + TestScenario( + name=f'Lite {persona.replace("_", " ")} review'[:100], + description=f'{task} as {persona} on {site_name}.', + priority=priority, # type: ignore[arg-type] + feature_category=feature_category, + target_feature=target_feature, + test_persona=persona, + steps_description=steps_description, + success_criteria='Return structured flaws, improvements, fixes, and other observations for this goal.', + ) + ) + + logger.info('\n%s', '=' * 60) + logger.info('Built %d lite scenarios without LLM test generation', len(scenarios)) + logger.info('%s\n', '=' * 60) + return TestPlan(scenarios=scenarios) + + async def generate_tests( url: str, analysis: 'Any', diff --git a/murphy/core/judge.py b/murphy/core/judge.py index 48e06a82..5127b782 100644 --- a/murphy/core/judge.py +++ b/murphy/core/judge.py @@ -235,8 +235,9 @@ def _select_key_screenshots(history: AgentHistoryList, max_screenshots: int = 3) Returns at most max_screenshots base64 strings. """ steps = history.history - if not steps: - return [] + if not isinstance(steps, list) or not steps: + screenshots = history.screenshots() + return [s for s in screenshots if s][-max_screenshots:] # Score each step scored: list[tuple[int, int, str]] = [] # (score, index, screenshot_b64) diff --git a/murphy/core/pipeline.py b/murphy/core/pipeline.py index e080465f..532e6cb9 100644 --- a/murphy/core/pipeline.py +++ b/murphy/core/pipeline.py @@ -22,6 +22,7 @@ execute_tests_with_session, explore_and_generate_plan, generate_tests, + make_lite_plan, ) from murphy.io.fixtures import ensure_dummy_fixture_files from murphy.llm import create_llm @@ -61,9 +62,12 @@ async def run_generate_plan( provider: str = 'openai', max_tests: int | None = None, goal: str | None = None, + lite: bool = False, ) -> TestPlan: """Generate test plan from analysis.""" apply_patches() + if lite: + return make_lite_plan(url, goal=goal, analysis=analysis, max_tests=max_tests) llm = create_llm(model, provider=provider) return await generate_tests(url, analysis, llm, max_tests, goal=goal) @@ -83,6 +87,7 @@ async def run_execute( save_callback: Any = None, progress_state: Any = None, output_dir: Path | None = None, + lite: bool = False, ) -> tuple[list[TestResult], ReportSummary]: """Execute tests and return results + summary.""" apply_patches() @@ -112,6 +117,7 @@ async def run_execute( max_concurrent=max_concurrent, judge_llm=judge_llm, output_dir=output_dir, + use_lite=lite, ) summary = build_summary(results) return results, summary @@ -128,9 +134,12 @@ async def run_evaluate( max_tests: int | None = None, goal: str | None = None, browser_session: BrowserSession | None = None, + lite: bool = False, ) -> TestPlan: """Exploration-first: explore site then generate test plan.""" apply_patches() + if lite: + return make_lite_plan(url, goal=goal, max_tests=max_tests) kill_stale_browser() task = goal or f'Evaluate the website at {url}' llm = create_llm(model, provider=provider) diff --git a/murphy/core/summary.py b/murphy/core/summary.py index 6714d903..255fdf8d 100644 --- a/murphy/core/summary.py +++ b/murphy/core/summary.py @@ -29,6 +29,8 @@ def classify_failure(result: TestResult) -> Literal['website_issue', 'test_limit """ if result.success is True: return None + if result.lite_result is not None: + return None # Crashed tests: success=None with no judgement → test infrastructure failure if result.success is None: return 'test_limitation' diff --git a/murphy/evaluate.py b/murphy/evaluate.py index bcecc708..157a5df0 100644 --- a/murphy/evaluate.py +++ b/murphy/evaluate.py @@ -9,7 +9,7 @@ from murphy.core.analysis import analyze_website from murphy.core.execution import execute_tests, execute_tests_with_session -from murphy.core.generation import explore_and_generate_plan, generate_tests +from murphy.core.generation import explore_and_generate_plan, generate_tests, make_lite_plan from murphy.core.summary import build_summary, classify_failure, generate_executive_summary, write_reports_and_print __all__ = [ @@ -21,5 +21,6 @@ 'explore_and_generate_plan', 'generate_executive_summary', 'generate_tests', + 'make_lite_plan', 'write_reports_and_print', ] diff --git a/murphy/io/report_markdown.py b/murphy/io/report_markdown.py index e5c2e4f6..bffc85b0 100644 --- a/murphy/io/report_markdown.py +++ b/murphy/io/report_markdown.py @@ -13,6 +13,33 @@ from murphy.models import EvaluationReport, TestResult +def _append_bullets(title: str, items: list[str], lines: list[str]) -> None: + if not items: + return + lines.append(f'**{title}:**') + for item in items: + lines.append(f'- {item}') + lines.append('') + + +def _render_lite_result(r: TestResult, lines: list[str]) -> None: + """Append Murphy lite structured evaluation details for one result.""" + if not r.lite_result: + return + + lite = r.lite_result + lines += [ + '**Lite Evaluation:**', + '', + f'**Grade:** {lite.grade}/10', + '', + ] + _append_bullets('Flaws', lite.flaws, lines) + _append_bullets('Improvements', lite.improvements, lines) + _append_bullets('Fixes', lite.fixes, lines) + _append_bullets('Other feedback', lite.other_feedback, lines) + + def _render_test_detail(r: TestResult, index: int, lines: list[str]) -> None: """Append detailed info for a single test result (pass or fail).""" m = _compute_metrics(r) @@ -20,6 +47,7 @@ def _render_test_detail(r: TestResult, index: int, lines: list[str]) -> None: lines.append(f'**Result:** {"Passed" if passed else "Failed"} in {r.duration:.0f}s') lines.append('') + _render_lite_result(r, lines) lines.append(f'**Metrics:** {_format_metrics_line(m)}') lines.append('') # lines.append(f'{format_path(r)}') @@ -133,6 +161,7 @@ def write_markdown_report(report: EvaluationReport, output_dir: Path) -> Path: # Partition results website_issues = [r for r in report.results if r.failure_category == 'website_issue'] test_limitations = [r for r in report.results if r.failure_category == 'test_limitation'] + failed_tests = [r for r in report.results if r.success is not True and r.failure_category is None] passed_tests = [r for r in report.results if r.success] # Scorecard @@ -142,24 +171,47 @@ def write_markdown_report(report: EvaluationReport, output_dir: Path) -> Path: f'- Website Issues: {s.website_issues}', f'- Test Limitations: {s.test_limitations}', '', - '| Test | Persona | Result | Category | Duration |', - '|------|---------|--------|----------|----------|', ] - for r in report.results: - persona_label = r.scenario.test_persona.replace('_', ' ').title() - if r.success: - emoji = '\u2705' - result_str = 'Passed' - category_str = '' - elif r.failure_category == 'website_issue': - emoji = '\U0001f534' - result_str = 'Failed' - category_str = 'Website Issue' - else: - emoji = '\u26a0\ufe0f' - result_str = 'Failed' - category_str = 'Test Limitation' - lines.append(f'| {emoji} {r.scenario.name} | {persona_label} | {result_str} | {category_str} | {r.duration:.0f}s |') + + has_lite_results = any(r.lite_result for r in report.results) + if has_lite_results: + lines += [ + '| Test | Persona | Grade | Flaws | Improvements | Fixes | Duration |', + '|------|---------|-------|-------|--------------|-------|----------|', + ] + for r in report.results: + persona_label = r.scenario.test_persona.replace('_', ' ').title() + lite = r.lite_result + if lite: + grade = f'{lite.grade}/10' + flaws = str(len(lite.flaws)) + improvements = str(len(lite.improvements)) + fixes = str(len(lite.fixes)) + else: + grade = flaws = improvements = fixes = 'n/a' + lines.append( + f'| {r.scenario.name} | {persona_label} | {grade} | {flaws} | {improvements} | {fixes} | {r.duration:.0f}s |' + ) + else: + lines += [ + '| Test | Persona | Result | Category | Duration |', + '|------|---------|--------|----------|----------|', + ] + for r in report.results: + persona_label = r.scenario.test_persona.replace('_', ' ').title() + if r.success: + emoji = '\u2705' + result_str = 'Passed' + category_str = '' + elif r.failure_category == 'website_issue': + emoji = '\U0001f534' + result_str = 'Failed' + category_str = 'Website Issue' + else: + emoji = '\u26a0\ufe0f' + result_str = 'Failed' + category_str = 'Test Limitation' + lines.append(f'| {emoji} {r.scenario.name} | {persona_label} | {result_str} | {category_str} | {r.duration:.0f}s |') # ── Feedback Quality Index ──────────────────────────────────────────────── fq_results = [r for r in report.results if r.feedback_quality] @@ -267,6 +319,21 @@ def yes_no(b): _render_test_detail(r, i, detail_lines) lines += ['
', f'{summary_text}', ''] + detail_lines + ['
', ''] + # ── Generic failed tests section ────────────────────────────────────────── + if failed_tests: + lines += ['## Failed Tests', ''] + for i, r in enumerate(failed_tests, 1): + persona_label = r.scenario.test_persona.replace('_', ' ').title() + summary_text = f'\u274c {i}. {r.scenario.name} — {persona_label}' + detail_lines = [ + f'**Persona:** {persona_label}', + '', + f'**What was tested:** {r.scenario.description}', + '', + ] + _render_test_detail(r, i, detail_lines) + lines += ['
', f'{summary_text}', ''] + detail_lines + ['
', ''] + # ── Passed Tests section ────────────────────────────────────────────────── if passed_tests: lines += ['## Passed Tests', ''] diff --git a/murphy/models.py b/murphy/models.py index 3ed79170..74186d62 100644 --- a/murphy/models.py +++ b/murphy/models.py @@ -432,6 +432,21 @@ class ScenarioExecutionVerdict(BaseModel): ) +# ─── Lite mode feedback ─────────────────────────────────────────────────────── + + +class LiteResult(BaseModel): + """Fast, structured output returned by Murphy lite mode.""" + + grade: int = Field(ge=1, le=10, description='Overall experience score from 1 (poor) to 10 (excellent).') + flaws: list[str] = Field(default_factory=list, description='Observed problems, friction, or broken behavior.') + improvements: list[str] = Field(default_factory=list, description='Product or UX improvements that would help users.') + fixes: list[str] = Field(default_factory=list, description='Concrete fixes that address the observed flaws.') + other_feedback: list[str] = Field( + default_factory=list, description='Additional observations that do not fit the other fields.' + ) + + # ─── Judge verdict ───────────────────────────────────────────────────────────── @@ -501,6 +516,7 @@ class TestResult(BaseModel): trait_evaluations: dict[str, Literal['pass', 'fail']] | None = None missing_signals: list[str] = Field(default_factory=list) feature_suggestions: list[str] = Field(default_factory=list) + lite_result: LiteResult | None = None class ReportSummary(BaseModel): diff --git a/murphy/prompts.py b/murphy/prompts.py index e494eab9..53ddfac3 100644 --- a/murphy/prompts.py +++ b/murphy/prompts.py @@ -538,6 +538,59 @@ def _build_suggestion_instruction( ) +def build_lite_prompt( + scenario: TestScenario, + start_url: str, + analysis: WebsiteAnalysis | None = None, + discovered_personas: tuple[PersonaResult, TraitSchema] | None = None, +) -> str: + """Build the lean execution prompt for Murphy lite mode.""" + if discovered_personas and scenario.test_persona not in PERSONA_REGISTRY: + from murphy.personas.bridge import render_discovered_persona_for_execution + + persona_result, trait_schema = discovered_personas + persona_block = render_discovered_persona_for_execution(scenario.test_persona, persona_result, trait_schema) + else: + persona_block = _render_persona_for_execution(scenario.test_persona) + + if analysis: + core_features = [feature.name for feature in analysis.features if feature.importance == 'core'] + site_context = ( + 'SITE CONTEXT:\n' + f'- Site: {analysis.site_name}\n' + f'- Category: {analysis.category}\n' + f'- Description: {analysis.description}\n' + f'- Core features: {", ".join(core_features) if core_features else "not identified"}\n' + f'- User flows: {", ".join(analysis.identified_user_flows) if analysis.identified_user_flows else "not identified"}\n\n' + ) + else: + site_context = '' + + return ( + f'You are running Murphy lite mode: a fast objective-driven website test.\n\n' + f'{persona_block}\n\n' + f'{site_context}' + f'Task: {scenario.description}\n\n' + f'Steps:\n{scenario.steps_description}\n\n' + f'Start URL: {start_url}\n\n' + f'Rules:\n' + f'- Stay on the same domain as {start_url}.\n' + f'- You must attempt the stated objective before returning the LiteResult.\n' + f'- Do not stop at first-impression UX feedback if there is any plausible in-app path to continue.\n' + f'- If a control is ambiguous but plausibly relevant to the objective, use it and report the ambiguity afterward.\n' + f'- Terminal states: objective completed and verified, objective attempted but blocked, or no plausible route found after trying at least two in-app paths.\n' + f'- You may submit ordinary app forms needed for the objective using harmless test data.\n' + f'- Do not submit support/contact/feedback forms, payment actions, destructive confirmations, or external-domain flows.\n' + f'- If the app blocks you with login, captcha, or missing permissions, report that as a flaw and stop.\n\n' + f'Return exactly one LiteResult object with these fields:\n' + f'- grade: integer from 1 to 10 for the overall experience.\n' + f'- flaws: concrete problems, friction, broken behavior, or blockers you observed.\n' + f'- improvements: product or UX improvements that would make the flow better.\n' + f'- fixes: concrete implementation fixes that address the flaws.\n' + f'- other_feedback: useful observations that do not fit the other fields.\n' + ) + + def build_execution_prompt( global_task: str, scenario: TestScenario, diff --git a/tests/murphy/api/test_cli.py b/tests/murphy/api/test_cli.py new file mode 100644 index 00000000..1ad46e55 --- /dev/null +++ b/tests/murphy/api/test_cli.py @@ -0,0 +1,70 @@ +"""Tests for Murphy CLI report-writing helpers.""" + +from pathlib import Path +from unittest.mock import patch + +from murphy.api.cli import _write_reports_and_log_results +from murphy.models import TestResult, TestScenario, TokenUsage, WebsiteAnalysis + + +def _make_analysis() -> WebsiteAnalysis: + return WebsiteAnalysis( + site_name='Example', + category='saas', + description='An example site', + key_pages=[], + features=[], + identified_user_flows=[], + ) + + +def _make_result() -> TestResult: + return TestResult( + scenario=TestScenario( + name='Lite agent creation', + description='Assess the agent creation flow', + priority='high', + feature_category='forms', + target_feature='Agent creation', + test_persona='confused_novice', + steps_description='Try to create an agent', + success_criteria='Return lightweight UX feedback', + ), + success=True, + judgement=None, + actions=[], + errors=[], + duration=2.0, + reason='Lite mode grade: 6', + ) + + +def test_write_reports_and_log_results_writes_lite_report_and_logs_summary(): + analysis = _make_analysis() + results = [_make_result()] + output_dir = Path('/tmp/murphy-test-output') + tokens = TokenUsage(input_tokens=10, output_tokens=5) + + with ( + patch('murphy.core.summary.write_reports_and_print') as write_reports, + patch('murphy.api.cli._log_lite_summary') as log_lite_summary, + ): + _write_reports_and_log_results( + url='https://example.com', + analysis=analysis, + results=results, + output_dir=output_dir, + use_lite=True, + persona_discovery_tokens=None, + murphy_tokens=tokens, + ) + + write_reports.assert_called_once_with( + 'https://example.com', + analysis, + results, + output_dir, + persona_discovery_tokens=None, + murphy_tokens=tokens, + ) + log_lite_summary.assert_called_once_with(results) diff --git a/tests/murphy/api/test_request_models.py b/tests/murphy/api/test_request_models.py index 37fca89e..853eb45e 100644 --- a/tests/murphy/api/test_request_models.py +++ b/tests/murphy/api/test_request_models.py @@ -90,6 +90,12 @@ def test_evaluate_request_defaults(): r = EvaluateRequest(url='https://example.com') # type: ignore[call-arg] assert r.max_tests == 8 assert r.async_mode is False + assert r.lite is False + + +def test_evaluate_request_accepts_lite(): + r = EvaluateRequest(url='https://example.com', lite=True) # type: ignore[call-arg] + assert r.lite is True # ─── ExecuteRequest ────────────────────────────────────────────────────────── @@ -101,6 +107,12 @@ def test_execute_request_defaults(): assert r.evaluate_job_id is None assert r.max_steps == 15 assert r.max_concurrent == 3 + assert r.lite is False + + +def test_execute_request_accepts_lite(): + r = ExecuteRequest(url='https://example.com', lite=True) # type: ignore[call-arg] + assert r.lite is True def test_execute_request_with_json_string_test_plan(): @@ -167,6 +179,11 @@ def test_generate_plan_request_with_json_string_analysis(): assert r.analysis.site_name == 'Example' +def test_generate_plan_request_accepts_lite(): + r = GeneratePlanRequest(url='https://example.com', analysis=_make_analysis_dict(), lite=True) # type: ignore[arg-type] + assert r.lite is True + + # ─── JobResponse ────────────────────────────────────────────────────────────── diff --git a/tests/murphy/api/test_rest_api.py b/tests/murphy/api/test_rest_api.py index 54919271..f6df314a 100644 --- a/tests/murphy/api/test_rest_api.py +++ b/tests/murphy/api/test_rest_api.py @@ -1,10 +1,14 @@ """Tests for REST API endpoints using FastAPI TestClient — no real LLM/browser calls.""" +from unittest.mock import AsyncMock, patch + import pytest from fastapi.testclient import TestClient from murphy.api.jobs import Job, _jobs -from murphy.api.rest import app +from murphy.api.request_models import EvaluateRequest, ExecuteRequest, GeneratePlanRequest +from murphy.api.rest import _core_evaluate, _core_execute, _core_generate_plan, app +from murphy.models import ReportSummary, TestPlan @pytest.fixture(autouse=True) @@ -98,3 +102,50 @@ def test_get_job_strips_whitespace(client, monkeypatch): resp = client.get('/jobs/ my-job ') assert resp.status_code == 200 + + +def _analysis_dict() -> dict: + return { + 'site_name': 'Example', + 'category': 'saas', + 'description': 'An example site', + 'key_pages': [], + 'features': [], + 'identified_user_flows': [], + } + + +@pytest.mark.asyncio +async def test_core_generate_plan_propagates_lite(): + with patch('murphy.core.pipeline.run_generate_plan', new_callable=AsyncMock) as run_generate_plan: + run_generate_plan.return_value = TestPlan(scenarios=[]) + req = GeneratePlanRequest(url='https://example.com', analysis=_analysis_dict(), lite=True) # type: ignore[arg-type] + + await _core_generate_plan(req) + + run_generate_plan.assert_awaited_once() + assert run_generate_plan.call_args.kwargs['lite'] is True + + +@pytest.mark.asyncio +async def test_core_execute_propagates_lite(): + with patch('murphy.core.pipeline.run_execute', new_callable=AsyncMock) as run_execute: + run_execute.return_value = ([], ReportSummary(total=0, passed=0, failed=0, pass_rate=0.0, by_priority={})) + req = ExecuteRequest(url='https://example.com', test_plan=TestPlan(scenarios=[]), lite=True) # type: ignore[call-arg] + + await _core_execute(req) + + run_execute.assert_awaited_once() + assert run_execute.call_args.kwargs['lite'] is True + + +@pytest.mark.asyncio +async def test_core_evaluate_propagates_lite(): + with patch('murphy.core.pipeline.run_evaluate', new_callable=AsyncMock) as run_evaluate: + run_evaluate.return_value = TestPlan(scenarios=[]) + req = EvaluateRequest(url='https://example.com', goal='Test agent creation flow', lite=True) # type: ignore[call-arg] + + await _core_evaluate(req) + + run_evaluate.assert_awaited_once() + assert run_evaluate.call_args.kwargs['lite'] is True diff --git a/tests/murphy/api/test_templates.py b/tests/murphy/api/test_templates.py index 4748105c..eb198c83 100644 --- a/tests/murphy/api/test_templates.py +++ b/tests/murphy/api/test_templates.py @@ -11,6 +11,7 @@ from murphy.models import ( Feature, JudgeVerdict, + LiteResult, PageInfo, TestPlan, TestResult, @@ -190,6 +191,30 @@ def test_render_results_html_with_failure(): assert 'Website Issue' in html +def test_render_results_html_failed_lite_result_uses_plain_failed_badge(): + analysis = _make_analysis() + lite_result = LiteResult( + grade=4, + flaws=['The create flow is hard to find'], + improvements=['Expose a clearer create action'], + fixes=['Add a primary Create Agent button'], + other_feedback=[], + ) + results = [ + _make_result( + success=False, + judgement=None, + lite_result=lite_result, + failure_category=None, + reason='Lite mode grade: 4', + ) + ] + html = render_results_html('https://example.com', analysis, results, None) + assert 'Failed (1)' in html + assert 'FAILED' in html + assert 'TEST LIMITATION' not in html + + def test_render_results_html_escapes_xss(): analysis = _make_analysis(site_name='') results = [_make_result()] diff --git a/tests/murphy/core/test_execution.py b/tests/murphy/core/test_execution.py index 396f5b34..72f11e88 100644 --- a/tests/murphy/core/test_execution.py +++ b/tests/murphy/core/test_execution.py @@ -1,9 +1,16 @@ """Tests for execution helper functions (no browser/LLM calls).""" +import json +from unittest.mock import AsyncMock, MagicMock, patch + +import pytest + from murphy.core.execution import ( + _execute_single_test, _extract_form_fills, _extract_urls_from_texts, ) +from murphy.models import JudgeVerdict, LiteResult, ScenarioExecutionVerdict, TestScenario # ─── _extract_form_fills ───────────────────────────────────────────────────── @@ -119,3 +126,381 @@ def test_extract_urls_from_texts_multiple(): def test_extract_urls_from_texts_skips_none(): result = _extract_urls_from_texts(['', 'https://ok.com']) assert result == ['https://ok.com'] + + +# ─── Lite execution ────────────────────────────────────────────────────────── + + +@pytest.mark.asyncio +async def test_execute_single_test_lite_mode_skips_judge_and_returns_lite_result(): + scenario = TestScenario( + name='Lite agent creation', + description='Assess the agent creation flow', + priority='critical', + feature_category='forms', + target_feature='Agent creation', + test_persona='happy_path', + steps_description='Try to create an agent', + success_criteria='Return structured flaws, improvements, fixes, and other observations.', + ) + lite_result = LiteResult( + grade=7, + flaws=['Creation has unclear required fields'], + improvements=['Show progress while creating the agent'], + fixes=['Label the create button clearly'], + other_feedback=['The main navigation is understandable'], + ) + history = MagicMock() + history.final_result.return_value = json.dumps(lite_result.model_dump()) + history.model_actions.return_value = [{'click': {'index': 1}}] + history.errors.return_value = [] + history.total_duration_seconds.return_value = 3.5 + history.urls.return_value = ['https://example.com/agents'] + history.screenshot_paths.return_value = [] + + agent = MagicMock() + agent.tools = MagicMock() + agent.run = AsyncMock(return_value=history) + + with ( + patch('murphy.core.execution.Agent', return_value=agent) as agent_cls, + patch('murphy.core.execution.murphy_judge', new_callable=AsyncMock) as judge, + patch('murphy.browser.session_utils.prepare_session_for_task', new_callable=AsyncMock), + patch('murphy.browser.actions.register_domain_access_action'), + patch('murphy.browser.actions.register_refresh_dom_action'), + ): + result = await _execute_single_test( + url='https://example.com', + scenario=scenario, + llm=MagicMock(), + browser_session=MagicMock(), + goal='Test agent creation flow', + fixture_paths=None, + max_steps=5, + index=1, + total=1, + use_lite=True, + ) + + agent_cls.assert_called_once() + assert agent_cls.call_args.kwargs['output_model_schema'] is LiteResult + agent.tools.exclude_action.assert_any_call('write_file') + judge.assert_not_awaited() + assert result.success is True + assert result.judgement is None + assert result.lite_result == lite_result + assert result.reason == 'Lite mode grade: 7' + + +@pytest.mark.asyncio +async def test_execute_single_test_lite_mode_low_grade_is_plain_failed_test(): + scenario = TestScenario( + name='Lite confused user', + description='Assess the agent creation flow for confusion', + priority='high', + feature_category='forms', + target_feature='Agent creation', + test_persona='confused_novice', + steps_description='Try to create an agent', + success_criteria='Return structured flaws, improvements, fixes, and other observations.', + ) + lite_result = LiteResult( + grade=4, + flaws=['The create flow is hard to find'], + improvements=['Expose a clearer create action'], + fixes=['Add a primary Create Agent button'], + other_feedback=[], + ) + history = MagicMock() + history.final_result.return_value = json.dumps(lite_result.model_dump()) + history.model_actions.return_value = [{'click': {'index': 1}}] + history.errors.return_value = [] + history.total_duration_seconds.return_value = 2.0 + history.urls.return_value = ['https://example.com'] + history.screenshot_paths.return_value = [] + + agent = MagicMock() + agent.tools = MagicMock() + agent.run = AsyncMock(return_value=history) + + with ( + patch('murphy.core.execution.Agent', return_value=agent), + patch('murphy.core.execution.murphy_judge', new_callable=AsyncMock) as judge, + patch('murphy.browser.session_utils.prepare_session_for_task', new_callable=AsyncMock), + patch('murphy.browser.actions.register_domain_access_action'), + patch('murphy.browser.actions.register_refresh_dom_action'), + ): + result = await _execute_single_test( + url='https://example.com', + scenario=scenario, + llm=MagicMock(), + browser_session=MagicMock(), + goal='Test agent creation flow', + fixture_paths=None, + max_steps=5, + index=1, + total=1, + use_lite=True, + ) + + judge.assert_not_awaited() + assert result.success is False + assert result.judgement is None + assert result.failure_category is None + assert result.lite_result == lite_result + assert result.reason == 'Lite mode grade: 4' + + +@pytest.mark.asyncio +async def test_execute_single_test_lite_mode_saves_agent_history_when_output_dir_set(tmp_path): + scenario = TestScenario( + name='Lite agent creation', + description='Assess the agent creation flow', + priority='critical', + feature_category='forms', + target_feature='Agent creation', + test_persona='happy_path', + steps_description='Try to create an agent', + success_criteria='Return structured flaws, improvements, fixes, and other observations.', + ) + lite_result = LiteResult( + grade=7, + flaws=['Creation has unclear required fields'], + improvements=['Show progress while creating the agent'], + fixes=['Label the create button clearly'], + other_feedback=[], + ) + history = MagicMock() + history.final_result.return_value = json.dumps(lite_result.model_dump()) + history.model_actions.return_value = [{'click': {'index': 1}}] + history.errors.return_value = [] + history.total_duration_seconds.return_value = 3.5 + history.urls.return_value = ['https://example.com/agents'] + history.screenshot_paths.return_value = [] + + agent = MagicMock() + agent.tools = MagicMock() + agent.run = AsyncMock(return_value=history) + + with ( + patch('murphy.core.execution.Agent', return_value=agent), + patch('murphy.core.execution.murphy_judge', new_callable=AsyncMock), + patch('murphy.browser.session_utils.prepare_session_for_task', new_callable=AsyncMock), + patch('murphy.browser.actions.register_domain_access_action'), + patch('murphy.browser.actions.register_refresh_dom_action'), + ): + await _execute_single_test( + url='https://example.com', + scenario=scenario, + llm=MagicMock(), + browser_session=MagicMock(), + goal='Test agent creation flow', + fixture_paths=None, + max_steps=5, + index=1, + total=1, + output_dir=tmp_path, + use_lite=True, + ) + + history.save_to_file.assert_called_once_with(tmp_path / 'agent_history' / 'test_01_lite_agent_creation.json') + + +@pytest.mark.asyncio +async def test_execute_single_test_lite_mode_retries_premature_done_without_interaction(): + scenario = TestScenario( + name='Lite objective smoke test', + description='Test the objective flow', + priority='critical', + feature_category='forms', + target_feature='Objective flow', + test_persona='happy_path', + steps_description='Attempt the objective and verify the outcome', + success_criteria='Return structured flaws, improvements, fixes, and other observations.', + ) + first_result = LiteResult( + grade=5, + flaws=['The flow was unclear from the landing page'], + improvements=[], + fixes=[], + other_feedback=[], + ) + second_result = LiteResult( + grade=7, + flaws=['The flow required extra guidance'], + improvements=['Clarify the first step'], + fixes=['Add inline guidance'], + other_feedback=[], + ) + + first_history = MagicMock() + first_history.final_result.return_value = json.dumps(first_result.model_dump()) + first_history.model_actions.return_value = [ + {'navigate': {'url': 'https://example.com'}}, + {'done': {'success': True}}, + ] + first_history.errors.return_value = [] + first_history.total_duration_seconds.return_value = 1.0 + first_history.urls.return_value = ['https://example.com'] + first_history.screenshot_paths.return_value = [] + + second_history = MagicMock() + second_history.final_result.return_value = json.dumps(second_result.model_dump()) + second_history.model_actions.return_value = [ + {'click': {'index': 1}}, + {'done': {'success': True}}, + ] + second_history.errors.return_value = [] + second_history.total_duration_seconds.return_value = 2.0 + second_history.urls.return_value = ['https://example.com/result'] + second_history.screenshot_paths.return_value = [] + + first_agent = MagicMock() + first_agent.tools = MagicMock() + first_agent.run = AsyncMock(return_value=first_history) + second_agent = MagicMock() + second_agent.tools = MagicMock() + second_agent.run = AsyncMock(return_value=second_history) + + with ( + patch('murphy.core.execution.Agent', side_effect=[first_agent, second_agent]) as agent_cls, + patch('murphy.core.execution.murphy_judge', new_callable=AsyncMock) as judge, + patch('murphy.browser.session_utils.prepare_session_for_task', new_callable=AsyncMock), + patch('murphy.browser.actions.register_domain_access_action'), + patch('murphy.browser.actions.register_refresh_dom_action'), + ): + result = await _execute_single_test( + url='https://example.com', + scenario=scenario, + llm=MagicMock(), + browser_session=MagicMock(), + goal='Test objective flow', + fixture_paths=None, + max_steps=5, + index=1, + total=1, + use_lite=True, + ) + + assert agent_cls.call_count == 2 + assert 'stopped before meaningful in-app interaction' in agent_cls.call_args_list[1].kwargs['task'] + judge.assert_not_awaited() + assert result.lite_result == second_result + assert result.actions == second_history.model_actions.return_value + + +@pytest.mark.asyncio +async def test_execute_single_test_lite_mode_does_not_retry_after_meaningful_interaction(): + scenario = TestScenario( + name='Lite objective smoke test', + description='Test the objective flow', + priority='critical', + feature_category='forms', + target_feature='Objective flow', + test_persona='happy_path', + steps_description='Attempt the objective and verify the outcome', + success_criteria='Return structured flaws, improvements, fixes, and other observations.', + ) + lite_result = LiteResult( + grade=7, + flaws=['The flow required extra guidance'], + improvements=[], + fixes=[], + other_feedback=[], + ) + history = MagicMock() + history.final_result.return_value = json.dumps(lite_result.model_dump()) + history.model_actions.return_value = [ + {'input_text': {'index': 3, 'text': 'test value'}}, + {'done': {'success': True}}, + ] + history.errors.return_value = [] + history.total_duration_seconds.return_value = 2.0 + history.urls.return_value = ['https://example.com/result'] + history.screenshot_paths.return_value = [] + + agent = MagicMock() + agent.tools = MagicMock() + agent.run = AsyncMock(return_value=history) + + with ( + patch('murphy.core.execution.Agent', return_value=agent) as agent_cls, + patch('murphy.core.execution.murphy_judge', new_callable=AsyncMock) as judge, + patch('murphy.browser.session_utils.prepare_session_for_task', new_callable=AsyncMock), + patch('murphy.browser.actions.register_domain_access_action'), + patch('murphy.browser.actions.register_refresh_dom_action'), + ): + result = await _execute_single_test( + url='https://example.com', + scenario=scenario, + llm=MagicMock(), + browser_session=MagicMock(), + goal='Test objective flow', + fixture_paths=None, + max_steps=5, + index=1, + total=1, + use_lite=True, + ) + + agent_cls.assert_called_once() + judge.assert_not_awaited() + assert result.lite_result == lite_result + + +@pytest.mark.asyncio +async def test_execute_single_test_normal_mode_excludes_write_file_tool(): + scenario = TestScenario( + name='Agent creation', + description='Create an agent from the homepage', + priority='critical', + feature_category='forms', + target_feature='Agent creation', + test_persona='happy_path', + steps_description='Create an agent and verify it appears', + success_criteria='The agent exists after creation.', + ) + verdict = ScenarioExecutionVerdict(success=True, reason='Agent was created') + history = MagicMock() + history.final_result.return_value = json.dumps(verdict.model_dump()) + history.model_actions.return_value = [{'click': {'index': 1}}, {'done': {'success': True}}] + history.errors.return_value = [] + history.total_duration_seconds.return_value = 4.0 + history.urls.return_value = ['https://example.com/agents/1'] + history.screenshot_paths.return_value = [] + + agent = MagicMock() + agent.tools = MagicMock() + agent.run = AsyncMock(return_value=history) + judgement = JudgeVerdict( + reasoning='Trace shows the agent was created.', + verdict=True, + failure_reason='', + impossible_task=False, + reached_captcha=False, + failure_category=None, + ) + + with ( + patch('murphy.core.execution.Agent', return_value=agent), + patch('murphy.core.execution.murphy_judge', new_callable=AsyncMock, return_value=judgement) as judge, + patch('murphy.browser.session_utils.prepare_session_for_task', new_callable=AsyncMock), + patch('murphy.browser.actions.register_domain_access_action'), + patch('murphy.browser.actions.register_refresh_dom_action'), + ): + result = await _execute_single_test( + url='https://example.com', + scenario=scenario, + llm=MagicMock(), + browser_session=MagicMock(), + goal='Test agent creation flow', + fixture_paths=None, + max_steps=5, + index=1, + total=1, + ) + + agent.tools.exclude_action.assert_any_call('write_file') + judge.assert_awaited_once() + assert result.success is True + assert result.judgement == judgement diff --git a/tests/murphy/core/test_generation.py b/tests/murphy/core/test_generation.py index 57fd461b..5f5614ff 100644 --- a/tests/murphy/core/test_generation.py +++ b/tests/murphy/core/test_generation.py @@ -7,6 +7,7 @@ from murphy.core.generation import ( _log_plan_summary, generate_tests, + make_lite_plan, summarize_exploration_from_actions, ) from murphy.models import Feature, PageInfo, TestPersona, TestPlan, TestScenario, WebsiteAnalysis @@ -85,7 +86,7 @@ async def test_generate_tests_returns_plan(): assert isinstance(result, TestPlan) assert len(result.scenarios) == 6 - llm.ainvoke.assert_called_once() + assert llm.ainvoke.call_count >= 1 @pytest.mark.asyncio @@ -149,6 +150,49 @@ async def test_generate_tests_retries_on_empty_plan(): assert llm.ainvoke.call_count == 2 +# ─── make_lite_plan ────────────────────────────────────────────────────────── + + +def test_make_lite_plan_creates_persona_scenarios_without_llm(): + plan = make_lite_plan('https://example.com', goal='Test agent creation flow', analysis=_make_analysis(), max_tests=2) + + assert isinstance(plan, TestPlan) + assert len(plan.scenarios) == 2 + assert [s.test_persona for s in plan.scenarios] == ['happy_path', 'confused_novice'] + assert all('Test agent creation flow' in s.description for s in plan.scenarios) + assert all('flaws, improvements, fixes' in s.success_criteria for s in plan.scenarios) + + +def test_make_lite_plan_interactive_goal_requires_objective_attempt_and_verification(): + plan = make_lite_plan('https://example.com', goal='Test agent creation flow', analysis=_make_analysis(), max_tests=1) + steps = plan.scenarios[0].steps_description + + assert 'most plausible in-app route' in steps + assert 'Attempt the objective' in steps + assert 'harmless test input' in steps + assert 'Advance or submit only when safe' in steps + assert 'Verify the resulting UI state' in steps + + +def test_make_lite_plan_state_change_goal_uses_generalized_steps(): + plan = make_lite_plan('https://example.com', goal='Test dark mode switching', analysis=_make_analysis(), max_tests=1) + steps = plan.scenarios[0].steps_description + + assert 'Change the requested state' in steps + assert 'Verify the resulting UI state' in steps + assert 'appearance' not in steps.lower() + assert 'theme control' not in steps.lower() + + +def test_make_lite_plan_uses_analysis_context_when_available(): + plan = make_lite_plan('https://example.com', goal=None, analysis=_make_analysis(), max_tests=1) + scenario = plan.scenarios[0] + + assert scenario.target_feature == 'Search' + assert scenario.feature_category == 'search' + assert 'Browse -> Search' in scenario.steps_description + + # ─── summarize_exploration_from_actions ────────────────────────────────────── diff --git a/tests/murphy/core/test_quality.py b/tests/murphy/core/test_quality.py index ce654e78..bbee35f3 100644 --- a/tests/murphy/core/test_quality.py +++ b/tests/murphy/core/test_quality.py @@ -13,7 +13,7 @@ def _make_scenario(**overrides) -> TestScenario: target_feature='Login form', test_persona='happy_path', steps_description='1. Navigate to login page\n2. Enter valid email\n3. Enter password\n4. Click submit', - success_criteria='User is redirected to dashboard and sees confirmation message', + success_criteria='Login succeeds: user is redirected to dashboard and sees confirmation message', ) defaults.update(overrides) return TestScenario.model_validate(defaults) diff --git a/tests/murphy/core/test_summary.py b/tests/murphy/core/test_summary.py index bfa24322..b2ade615 100644 --- a/tests/murphy/core/test_summary.py +++ b/tests/murphy/core/test_summary.py @@ -1,7 +1,7 @@ """Tests for summary building and failure classification.""" from murphy.core.summary import build_summary, classify_failure -from murphy.models import JudgeVerdict, TestResult, TestScenario +from murphy.models import JudgeVerdict, LiteResult, TestResult, TestScenario def _make_scenario(**overrides) -> TestScenario: @@ -75,6 +75,18 @@ def test_classify_failure_failed_no_judgement(): assert classify_failure(r) == 'test_limitation' +def test_classify_failure_failed_lite_result_is_plain_failure(): + lite_result = LiteResult( + grade=4, + flaws=['The create flow is hard to find'], + improvements=['Expose a clearer create action'], + fixes=['Add a primary Create Agent button'], + other_feedback=[], + ) + r = _make_result(success=False, judgement=None, lite_result=lite_result) + assert classify_failure(r) is None + + # ─── build_summary ──────────────────────────────────────────────────────────── diff --git a/tests/murphy/core/test_summary_extended.py b/tests/murphy/core/test_summary_extended.py index e54c1e78..c5441f2d 100644 --- a/tests/murphy/core/test_summary_extended.py +++ b/tests/murphy/core/test_summary_extended.py @@ -1,5 +1,6 @@ """Extended tests for summary — generate_executive_summary and write_reports_and_print with mocks.""" +import json import tempfile from pathlib import Path from unittest.mock import AsyncMock, MagicMock @@ -11,6 +12,7 @@ ExecutiveSummary, Feature, JudgeVerdict, + LiteResult, PageInfo, ReportSummary, TestResult, @@ -200,6 +202,41 @@ def test_write_reports_and_print(): assert md_path.exists() +def test_write_reports_and_print_preserves_lite_result_details(): + analysis = _make_analysis() + lite_result = LiteResult( + grade=6, + flaws=['Unclear first step'], + improvements=['Clarify the first step'], + fixes=['Add helper text beside the first field'], + other_feedback=['The rest of the flow is understandable'], + ) + results = [ + _make_result( + judgement=None, + lite_result=lite_result, + reason='Lite mode grade: 6', + ) + ] + + with tempfile.TemporaryDirectory() as tmpdir: + output_dir = Path(tmpdir) + write_reports_and_print('https://example.com', analysis, results, output_dir) + + json_path = output_dir / 'evaluation_report.json' + md_path = output_dir / 'evaluation_report.md' + assert json_path.exists() + assert md_path.exists() + + json_content = json.loads(json_path.read_text()) + assert json_content['results'][0]['lite_result']['grade'] == 6 + md_content = md_path.read_text() + assert '**Grade:** 6/10' in md_content + assert '- Unclear first step' in md_content + assert '- Clarify the first step' in md_content + assert '- Add helper text beside the first field' in md_content + + def test_write_reports_and_print_with_executive_summary(): analysis = _make_analysis() results = [_make_result()] diff --git a/tests/murphy/io/test_report.py b/tests/murphy/io/test_report.py index 04b9d116..6a7c8e3d 100644 --- a/tests/murphy/io/test_report.py +++ b/tests/murphy/io/test_report.py @@ -20,6 +20,7 @@ Feature, FeedbackQualityScore, JudgeVerdict, + LiteResult, PageInfo, ReportSummary, TestResult, @@ -306,6 +307,55 @@ def test_write_markdown_report_basic(): assert 'Test search' in content +def test_write_markdown_report_includes_lite_result_details(): + lite_result = LiteResult( + grade=6, + flaws=[ + 'The primary action is visually disconnected from the form.', + 'The sidebar creates an oversized left gutter.', + 'The textarea is wider than the content hierarchy supports.', + 'Spacing between the label and field is inconsistent.', + 'The disabled button state lacks a clear explanation.', + 'The form feels unanchored in the available whitespace.', + ], + improvements=[ + 'Constrain the form to a readable max width.', + 'Align the label, field, and button to one column.', + 'Use consistent vertical spacing tokens.', + 'Reduce sidebar visual density.', + 'Explain why the disabled action is unavailable.', + ], + fixes=[ + 'Set a max-width on the main form container.', + 'Align the button with the textarea edge.', + 'Apply shared spacing variables to the form stack.', + 'Reduce sidebar item padding variance.', + 'Add helper text for the disabled Next button.', + ], + other_feedback=['The visual language is otherwise modern and consistent.'], + ) + result = _make_result( + judgement=None, + lite_result=lite_result, + reason='Lite mode grade: 6', + ) + report = _make_report(results=[result]) + + with tempfile.TemporaryDirectory() as tmpdir: + content = write_markdown_report(report, Path(tmpdir)).read_text() + + assert '| Test | Persona | Grade | Flaws | Improvements | Fixes | Duration |' in content + assert '| Test search | Happy Path | 6/10 | 6 | 5 | 5 | 5s |' in content + assert '**Lite Evaluation:**' in content + assert '**Grade:** 6/10' in content + assert '**Flaws:**' in content + assert '**Improvements:**' in content + assert '**Fixes:**' in content + assert '**Other feedback:**' in content + for item in lite_result.flaws + lite_result.improvements + lite_result.fixes + lite_result.other_feedback: + assert f'- {item}' in content + + def test_write_markdown_report_includes_passed_section(): report = _make_report() with tempfile.TemporaryDirectory() as tmpdir: @@ -335,6 +385,43 @@ def test_write_markdown_report_includes_failure_sections(): assert 'Website Issues' in content +def test_write_markdown_report_failed_lite_result_is_not_test_limitation(): + lite_result = LiteResult( + grade=4, + flaws=['The create flow is hard to find'], + improvements=['Expose a clearer create action'], + fixes=['Add a primary Create Agent button'], + other_feedback=[], + ) + failed_lite_result = _make_result( + success=False, + judgement=None, + lite_result=lite_result, + failure_category=None, + reason='Lite mode grade: 4', + ) + report = _make_report( + results=[failed_lite_result], + summary=ReportSummary( + total=1, + passed=0, + failed=1, + pass_rate=0.0, + website_issues=0, + test_limitations=0, + by_priority={'high': {'passed': 0, 'failed': 1}}, + ), + ) + + with tempfile.TemporaryDirectory() as tmpdir: + content = write_markdown_report(report, Path(tmpdir)).read_text() + + assert '## Failed Tests' in content + assert '## Test Limitations' not in content + assert 'Lite mode grade: 4' in content + assert '**Grade:** 4/10' in content + + def test_write_markdown_report_includes_features_discovered(): report = _make_report() with tempfile.TemporaryDirectory() as tmpdir: diff --git a/tests/murphy/test_models.py b/tests/murphy/test_models.py index c853684c..ce1a89d3 100644 --- a/tests/murphy/test_models.py +++ b/tests/murphy/test_models.py @@ -11,6 +11,7 @@ FeedbackQualityScore, InteractiveElement, JudgeVerdict, + LiteResult, PageInfo, ReportSummary, ScenarioExecutionVerdict, @@ -122,14 +123,25 @@ def test_trait_vector_extra_forbidden(): def test_persona_registry_completeness(): - expected_personas = {'happy_path', 'confused_novice', 'adversarial', 'edge_case', 'explorer', 'impatient_user', 'angry_user'} + expected_personas = { + 'happy_path', + 'confused_novice', + 'adversarial', + 'edge_case', + 'explorer', + 'impatient_user', + 'angry_user', + 'classic_ui', + 'modern_ui', + 'layout_auditor_ui', + } assert set(PERSONA_REGISTRY.keys()) == expected_personas def test_persona_registry_values_are_trait_vector_and_test_type(): for persona, (traits, test_type) in PERSONA_REGISTRY.items(): assert isinstance(traits, TraitVector), f'{persona} traits not TraitVector' - assert test_type in ('ux', 'security', 'boundary'), f'{persona} test_type invalid: {test_type}' + assert test_type in ('ux', 'security', 'boundary', 'design'), f'{persona} test_type invalid: {test_type}' # ─── ScenarioExecutionVerdict ───────────────────────────────────────────────── @@ -157,6 +169,25 @@ def test_scenario_execution_verdict_with_fields(): assert 'Button' in v.reason +# ─── LiteResult ─────────────────────────────────────────────────────────────── + + +def test_lite_result_requires_grade_between_one_and_ten(): + result = LiteResult( + grade=8, + flaws=['Agent creation has unclear validation errors'], + improvements=['Add clearer progress feedback'], + fixes=['Show inline errors next to required fields'], + other_feedback=['The flow is discoverable from the dashboard'], + ) + + assert result.grade == 8 + assert result.flaws == ['Agent creation has unclear validation errors'] + + with pytest.raises(ValidationError): + LiteResult(grade=11, flaws=[], improvements=[], fixes=[], other_feedback=[]) + + # ─── TestScenario ───────────────────────────────────────────────────────────── @@ -186,9 +217,10 @@ def test_test_scenario_rejects_invalid_priority(): _make_scenario(priority='urgent') -def test_test_scenario_rejects_invalid_persona(): - with pytest.raises(ValidationError): - _make_scenario(test_persona='robot') +def test_test_scenario_accepts_custom_persona_names(): + """Discovered personas are runtime strings, not limited to the built-in registry.""" + scenario = _make_scenario(test_persona='enterprise_admin') + assert scenario.test_persona == 'enterprise_admin' def test_test_scenario_rejects_invalid_category(): @@ -279,6 +311,22 @@ def test_test_result_nullable_success(): assert r.success is None +def test_test_result_serializes_lite_result(): + lite_result = LiteResult( + grade=6, + flaws=['The create button is hard to find'], + improvements=['Add onboarding copy'], + fixes=['Make the create button primary'], + other_feedback=['Navigation is otherwise clear'], + ) + r = _make_result(judgement=None, lite_result=lite_result, reason='Lite mode grade: 6') + + dumped = r.model_dump() + + assert dumped['lite_result']['grade'] == 6 + assert dumped['lite_result']['flaws'] == ['The create button is hard to find'] + + # ─── WebsiteAnalysis ───────────────────────────────────────────────────────── diff --git a/tests/murphy/test_prompts.py b/tests/murphy/test_prompts.py index 6edaf14d..90935dba 100644 --- a/tests/murphy/test_prompts.py +++ b/tests/murphy/test_prompts.py @@ -7,6 +7,7 @@ build_analysis_prompt, build_execution_prompt, build_exploration_prompt, + build_lite_prompt, build_plan_synthesis_prompt, build_test_generation_prompt, build_test_generation_system_message, @@ -176,6 +177,51 @@ def test_execution_prompt_confused_novice_persona(): assert 'confused_novice' in prompt +# ─── build_lite_prompt ─────────────────────────────────────────────────────── + + +def test_lite_prompt_requests_structured_lite_fields(): + scenario = _make_scenario(test_persona='happy_path') + prompt = build_lite_prompt(scenario, 'https://example.com', analysis=_make_analysis()) + + assert 'LiteResult' in prompt + assert 'flaws' in prompt + assert 'improvements' in prompt + assert 'fixes' in prompt + assert 'other_feedback' in prompt + assert 'SKIP' not in prompt + + +def test_lite_prompt_reuses_persona_context(): + scenario = _make_scenario(test_persona='confused_novice') + prompt = build_lite_prompt(scenario, 'https://example.com') + + assert 'confused_novice' in prompt + assert 'first-time user' in prompt + assert 'Stay on the same domain' in prompt + + +def test_lite_prompt_requires_objective_attempt_before_result(): + scenario = _make_scenario(test_persona='happy_path') + prompt = build_lite_prompt(scenario, 'https://example.com') + + assert 'fast objective-driven website test' in prompt + assert 'attempt the stated objective' in prompt + assert 'Terminal states' in prompt + assert 'stop as soon as you have enough evidence' not in prompt + + +def test_lite_prompt_keeps_objective_rules_generalized(): + scenario = _make_scenario(test_persona='happy_path') + prompt = build_lite_prompt(scenario, 'https://example.com') + lower_prompt = prompt.lower() + + assert 'create agent' not in lower_prompt + assert 'dark mode' not in lower_prompt + assert 'theme control' not in lower_prompt + assert 'ordinary app forms needed for the objective' in prompt + + # ─── _build_persona_distribution_text ─────────────────────────────────────────