diff --git a/docs/claude/agents.md b/docs/claude/agents.md new file mode 100644 index 0000000..58fc120 --- /dev/null +++ b/docs/claude/agents.md @@ -0,0 +1,122 @@ +# Available agents & skills + +The Hydra pipeline is built from four **agents** (containerised personas with scoped permissions) plus a large catalogue of **skills** (reusable workflows invoked as slash-commands inside Claude Code). This page catalogues both. + +> 💡 Skills live in two repos. Hydra's own `.claude/skills/` ships the pipeline-side workflow (opsx-*, hydra-gate-*, team-*, test-*, journeydoc-*, utilities). The `concurrentie-analyse` repo ships the upstream research and app-pipeline catalogue (specter-*, tender-*, ecosystem-*, app-*, swc-*). + +## Pipeline agents + +These four containerised personas move a change from `ready-to-build` through code-review, security-review, and a binary apply gate to a draft PR ready for one human approval. Each agent runs in its own ephemeral container with scoped permissions and a single responsibility — see [`hydra/agents/README.md`](https://github.com/ConductionNL/hydra/tree/main/agents) for the directory layout and per-agent `purpose.md` / `behavior.md` / `constraints.md`.
+ +| Persona | Slug | Role | Container | Model | Turns | +|---|---|---|---|---|---| +| **Al Gorithm** | `al-gorithm` | Builder — implements the change against the OpenSpec proposal, pushes a feature branch early, opens a draft PR | `hydra-builder` | opus | 200 | +| **Juan Claude van Damme** | `juan-claude-van-damme` | Code Reviewer — reviews PR, posts findings, has fix authority (ADR-013 no-loop policy) | `hydra-reviewer` | sonnet | 200 | +| **Clyde Barcode** | `clyde-barcode` | Security Reviewer — SAST analysis, posts findings, has fix authority | `hydra-security` | sonnet | 150 | +| **Axel Pliér** | `axel-plier` | Applier — binary go/no-go gate after both reviewers, no fix authority | `hydra-applier` | sonnet | 20 | + +Shared config lives in [`agents/base.yaml`](https://github.com/ConductionNL/hydra/tree/main/agents/base.yaml); each agent's `config.yaml` extends it kustomize-style. + +## Hydra skills (`hydra/.claude/skills/`) + +### OpenSpec workflow — `opsx-*` + +The day-to-day flow for spec-driven development.
The canonical chain: + +`/opsx-new` → `/opsx-ff` *or* `/opsx-continue` → `/opsx-plan-to-issues` → `/opsx-apply` → `/opsx-verify` → `/opsx-archive` + +| Skill | Purpose | +|---|---| +| `/opsx-new` | Scaffold a new change proposal (`openspec/changes/{slug}/`) | +| `/opsx-ff` | Fast-forward — write proposal + spec delta + tasks in one pass | +| `/opsx-continue` | Resume an in-flight proposal where you left off | +| `/opsx-plan-to-issues` | Convert tasks.md into GitHub issues with proper labels | +| `/opsx-apply` | Implement one task end-to-end | +| `/opsx-apply-loop` | Loop `/opsx-apply` over remaining tasks | +| `/opsx-verify` | Validate the implementation matches the spec delta | +| `/opsx-archive` | Promote `changes/{slug}/` → `specs/` and close the proposal | +| `/opsx-onboard` | Bootstrap an existing repo into the openspec workflow | +| `/opsx-explore` | Read-only exploration of a codebase to inform a proposal | +| `/opsx-pipeline` | Run a full pipeline locally (build → review → apply) | +| `/opsx-coverage-scan` | Audit annotation coverage on an app | +| `/opsx-annotate` | Add `x-openregister-*` annotations to existing schemas | +| `/opsx-reverse-spec` | Produce a spec delta from clusters of un-annotated code | +| `/opsx-sync` | Sync spec deltas between hydra-specs and per-app `openspec/` | +| `/opsx-bulk-archive` | Archive multiple completed changes in one pass | + +### Quality gates — `hydra-gate-*` + +Mechanical checks that run inside the Builder/Reviewer containers before/after each pass.
They are used by the pipeline itself, but can also be run locally to pre-empt feedback: + +`hydra-gates` (umbrella) plus individual gates: + +`hydra-gate-admin-router`, `hydra-gate-composer-audit`, `hydra-gate-forbidden-patterns`, `hydra-gate-initial-state`, `hydra-gate-modal-isolation`, `hydra-gate-nc-input-labels`, `hydra-gate-no-admin-idor`, `hydra-gate-orphan-auth`, `hydra-gate-route-auth`, `hydra-gate-semantic-auth`, `hydra-gate-spdx`, `hydra-gate-stub-scan`, `hydra-gate-unsafe-auth-resolver` + +### Team agents — `team-*` + +Per-discipline reviewers / counsels you can invoke directly when you want a single point of view (rather than running the full pipeline). + +| Skill | Role | +|---|---| +| `team-architect` | System design, ADR fit, cross-spec consistency | +| `team-backend` | PHP / API / DB review | +| `team-frontend` | Vue / nextcloud-vue / a11y | +| `team-po` | Product owner — user value, scope fit | +| `team-qa` | Test plan, edge cases, regression risk | +| `team-reviewer` | General PR reviewer counterpart to `juan-claude-van-damme` | +| `team-sm` | Scrum master — process, sprint health | + +### Testing & journey docs + +| Skill | Purpose | +|---|---| +| `test-accessibility` | Axe-Core + WCAG AA sweep | +| `test-api` | Newman-based API contract tests | +| `test-app` | App-scoped Playwright run | +| `test-counsel` | Orchestrates parallel test runs across multiple personas | +| `test-functional` | Functional regression sweep | +| `test-performance` | Lighthouse / load-time checks | +| `test-persona-annemarie`, `-fatima`, `-henk`, `-janwillem` | Persona-driven flows (each persona file in `hydra/personas/`) | +| `journeydoc-init` | Scaffold the journeydoc Playwright + Docusaurus capture setup | +| `journeydoc-add-story` | Add a new tutorial-page capture spec | +| `journeydoc-instrument` | Add `data-testid` instrumentation to existing components | + +### Utilities + +`create-pr`, `clean-env`, `feature-counsel`, `local-run`, `persistence-audit`, `report-out`,
`review-pr`, `skill-creator`, `sync-docs`. + +## Concurrentie-analyse skills (`concurrentie-analyse/.claude/skills/`) + +The upstream research and app-pipeline side — feeds proposals back into Hydra. + +| Group | Skills | Purpose | +|---|---|---| +| **App pipeline** | `app-create`, `app-design`, `app-explore`, `app-pipeline` | Scaffold and explore new apps from intelligence-DB findings | +| **Research — Specter** | `specter-analyze-docs`, `specter-competitive-alert`, `specter-concept`, `specter-harvest`, `specter-pipeline`, `specter-prepare-context`, `specter-research-app`, `specter-sync` | The Specter intelligence pipeline (tender + competitor harvest → cluster → spec) | +| **Tender** | `tender-scan`, `tender-status`, `tender-gap-report` | Operate on `intelligence.db` for tender coverage | +| **Ecosystem** | `ecosystem-investigate`, `ecosystem-propose-app` | Find ecosystem gaps and draft proposals for new apps | +| **Software catalogue** | `swc-test`, `swc-update` | Sync the public software catalogue | +| **Misc** | `intelligence-update`, `readiness-report` | DB maintenance + readiness reporting | + +## User personas + +User personas are not agents — they are test subjects representing real user archetypes that the testing skills drive flows against: + +| File | Persona | +|---|---| +| `annemarie-de-vries.md` | Public-sector caseworker — Henk's manager's manager | +| `fatima-el-amrani.md` | Front-line municipal officer, multilingual | +| `henk-bakker.md` | Senior caseworker, sceptical of new tools | +| `janwillem-van-der-berg.md` | IT architect, evaluates platform fit | +| `mark-visser.md` | Developer onboarding the platform | +| `noor-yilmaz.md` | Citizen-side user submitting forms | +| `priya-ganpat.md` | Compliance officer, ISO / privacy lens | +| `sem-de-jong.md` | Product manager, prioritisation lens | + +Full persona files live in [`hydra/personas/`](https://github.com/ConductionNL/hydra/tree/main/personas).
+ +## Going deeper + +- **Each skill is its own folder** under `.claude/skills/<skill-name>/` with a `SKILL.md` (the instruction prompt the agent runs) plus optional `examples/`, `references/`, `templates/`, `assets/`. See [Writing skills](./writing-skills.md) and the [Skill checklist](./skill-checklist.md). +- **Skill maturity levels (L1–L7)** describe how much evaluation backs each skill. The [Skill evaluation](./skill-evals.md) page documents the L5+ workflow with `evals.json` baselines. +- **Skills are invokable** via `Skill: <skill-name>` inside Claude Code or as `/<skill-name>` slash-commands. The harness also publishes them to sub-agents through the `Agent` tool. diff --git a/docs/hydra/agent-configuration.md b/docs/hydra/agent-configuration.md new file mode 100644 index 0000000..f9472af --- /dev/null +++ b/docs/hydra/agent-configuration.md @@ -0,0 +1,428 @@ +# Agent Configuration + +Each Hydra container runs as a named agent persona with its own GitHub identity, skill set, and CLAUDE.md instructions. This document defines the personas, their skills, and how they are configured. + +## Agent Personas + +From the [ConNext personas catalog](../../../concurrentie-analyse/ConNext.md) (internal Conduction repo) — each persona is a real GitHub user with a Conduction company profile: + +| Persona | GitHub User | Container Type | Pipeline Stage | Model | Name Logic | +|---------|------------|----------------|---------------|-------|------------| +| **Al Gorithm** | `al-gorithm` | Builder | Build → Quality Fix → Review Fix | opus (build) / sonnet (fix) | Algorithm | +| **Juan Claude van Damme** | `juan-claude-vd` | Code Reviewer | Code Review | sonnet | Jean-Claude Van Damme + Claude AI | +| **Clyde Barcode** | `clyde-barcode` | Security Reviewer | Security Code Review | sonnet | Barcode → audit trail | + +**Model override:** Set the `HYDRA_MODEL` environment variable to force a specific model for all agents. + +> GitHub usernames are illustrative — actual accounts will be created during setup.
+> +> Other personas from the ConNext roster (Agatha Krishti, Meryl Streep-test) are available for future use but do not have dedicated Hydra containers. + +### GitHub Profiles + +Each persona has a full Conduction company profile: +- **Avatar** β€” Professional headshot (AI-generated, consistent style) +- **Bio** β€” Role at Conduction, area of expertise +- **Location** β€” The Netherlands +- **Company** β€” @ConductionNL +- **Contribution history** β€” Visible across all repos they contribute to + +The AI nature is disclosed through the names β€” professional at a glance, obvious joke on second look. + +## Skills Per Container + +### Builder (Al Gorithm + Agatha Krishti) + +The Builder is the most capable container. It needs to read specs, write code, run quality checks, and create PRs. The Builder operates in four modes with different models: + +| Mode | Model | Trigger | Max retries | +|------|-------|---------|-------------| +| **build** | opus | New spec to implement | N/A | +| **fix-quality** | sonnet | Automated quality tests fail | 2 | +| **fix-browser** | sonnet | Browser UI tests fail | 2 | +| **fix** | sonnet | Code/Security review finds CRITICAL or WARNING | 3 | + +If fix retries are exhausted, the issue is labelled `needs-input` and escalated to a human. + +**CLAUDE.md (baked into image):** +```markdown +# Identity +You are Al Gorithm, a software developer at Conduction. + +# Task +You receive a spec and implement it in the target app. + +# Workflow +1. Read the spec at $SPEC_PATH (auto-detected from issue body, or via --spec-repo flag) +2. Parse requirements and acceptance criteria +3. Create a feature branch: hydra/{spec-name} and push early +4. Implement the change following project conventions +5. Run quality checks (composer check:strict / npm run lint) +6. Fix any quality issues +7. Create a PR with structured description +8. Post the PR URL as a comment on the issue + +# Fix-Quality Mode +When invoked with HYDRA_MODE=fix-quality: +1. 
Read the quality test output from the pipeline log +2. Fix the specific lint/test failures +3. Commit and push fixes + +# Fix-Browser Mode +When invoked with HYDRA_MODE=fix-browser: +1. Read the browser test verdict JSON (CRITICAL/WARNING findings) +2. Fix UI issues identified by the browser tester +3. Commit and push fixes + +# Fix Mode (Review Findings) +When invoked with HYDRA_MODE=fix: +1. Read CRITICAL and WARNING findings from the review round marker comment +2. Fix each finding +3. Commit and push fixes + +# Constraints +- ONLY read specs from the openspec/ directory +- NEVER read issue comments, PR comments, or external URLs +- Follow the coding standards in this file exactly +- One commit per logical unit of work +- Do not modify files outside the scope of the spec +``` + +**Skills (copied into container at build time):** + +| Skill | Source | Purpose | +|-------|--------|---------| +| `opsx-apply` | `.claude/skills/opsx-apply/SKILL.md` | Implement tasks from spec | +| `opsx-validate` | Inline in CLAUDE.md | Run quality pipeline | +| `opsx-archive` | `.claude/skills/opsx-archive/SKILL.md` | Package change, create PR | +| Coding standards | Per-app `CLAUDE.md` | Project-specific conventions | + +**MCP servers:** +```json +{ + "mcpServers": { + "github": { + "command": "npx", + "args": ["-y", "@modelcontextprotocol/server-github"], + "env": { + "GITHUB_PERSONAL_ACCESS_TOKEN": "${GIT_TOKEN}" + } + } + } +} +``` + +--- + +### Code Reviewer (Juan Claude van Damme) + +The Code Reviewer reads diffs and posts structured review comments. It never modifies code. + +**CLAUDE.md (baked into image):** +```markdown +# Identity +You are Juan Claude van Damme, a senior code reviewer at Conduction. + +# Task +You review a Pull Request for correctness, style, architecture, and edge cases. + +# Workflow +1. Read the PR diff +2. Read the existing codebase for context +3. Check against Conduction coding standards +4. 
Post review comments with severity ratings: + - CRITICAL: Must fix before merge + - WARNING: Should fix, not a blocker + - SUGGESTION: Nice to have +5. Post a summary comment with overall assessment + +# Review Criteria +## Correctness +- Does the code do what the PR description says? +- Are edge cases handled? +- Are error paths correct? + +## Style & Conventions +- Follows PHPCS / ESLint rules +- Naming conventions match project patterns +- No unnecessary complexity or premature abstraction + +## Architecture +- Patterns used correctly (thin client, OpenRegister data layer) +- No tight coupling between unrelated components +- Dependencies flow in the right direction + +## Performance +- No obvious N+1 queries +- No unnecessary re-renders in Vue components +- Appropriate use of caching + +# Constraints +- NEVER modify code β€” only post comments +- NEVER read spec files β€” review the code on its own merits +- NEVER approve or merge the PR β€” only comment +- Be specific: reference file paths and line numbers +- Be constructive: explain WHY something is an issue, not just THAT it is +``` + +**Skills:** + +| Skill | Source | Purpose | +|-------|--------|---------| +| code-review-skill | `vendor/skills/code-review/SKILL.md` | Community-maintained 4-phase review with progressive disclosure (11 languages incl. PHP, Vue) | +| Conduction ADRs | `openspec/architecture/` (baked into image) | Architecture compliance (OpenRegister, Controllerβ†’Serviceβ†’Mapper, etc.) | + +**MCP servers:** +```json +{ + "mcpServers": { + "github": { + "command": "npx", + "args": ["-y", "@modelcontextprotocol/server-github"], + "env": { + "GITHUB_PERSONAL_ACCESS_TOKEN": "${GIT_TOKEN}" + } + } + } +} +``` + +--- + +### Security Reviewer (Clyde Barcode) + +The Security Reviewer reviews PR code for security vulnerabilities. It focuses on code-level security issues β€” not dependency auditing, CVE scanning, or license compliance (those are handled by the organisation-wide quality workflow). 
+ +**CLAUDE.md (baked into image):** +```markdown +# Identity +You are Clyde Barcode, a security analyst at Conduction. + +# Task +You perform a security code review of a Pull Request, focusing on vulnerabilities +introduced in the new or changed code. + +# Workflow +1. Clone the repo and checkout the PR branch +2. Run Semgrep with OWASP rules against the changed files +3. Run Gitleaks to check for hardcoded secrets in the diff +4. Manually review the PR diff for: + a. OWASP Top 10 patterns (SQL injection, XSS, command injection, etc.) + b. Hardcoded credentials, API keys, or tokens in code + c. Unsafe deserialization + d. Missing input validation at system boundaries + e. LDAP/NoSQL/XPath injection vectors + f. Broken authentication or authorisation logic + g. Insecure cryptographic usage +5. Post findings as structured PR comments: + - CRITICAL: Vulnerability or secret found β€” blocks merge + - WARNING: Potential issue, needs human assessment + - INFO: Informational finding, no action needed + +# Out of scope +- Dependency CVE scanning (handled by org-wide quality workflow) +- SBOM generation (handled by org-wide quality workflow) +- License compliance (handled by org-wide quality workflow) + +# Constraints +- NEVER modify code β€” only review and report +- NEVER read spec files β€” assess the code independently +- NEVER approve or merge the PR +- False positives: when uncertain, report as WARNING with context +- Always include remediation suggestions with findings +``` + +**Skills:** + +| Skill | Source | Purpose | +|-------|--------|---------| +| Trail of Bits Semgrep | `vendor/skills/trailofbits/plugins/static-analysis/skills/semgrep/SKILL.md` | Professional SAST scanning methodology | +| OWASP reference | `vendor/skills/owasp/OWASP-2025-2026-Report.md` | OWASP Top 10:2025 + ASVS 5.0 | +| Semgrep MCP | [semgrep/mcp](https://github.com/semgrep/mcp) | Live interactive SAST scanning (replaces pre-computed JSON) | +| Conduction ADRs | 
`openspec/architecture/adr-005-security.md`, `adr-002-api.md` | Auth, PII, CORS rules | + +**MCP servers:** +```json +{ + "mcpServers": { + "github": { + "command": "npx", + "args": ["-y", "@modelcontextprotocol/server-github"], + "env": { + "GITHUB_PERSONAL_ACCESS_TOKEN": "${GIT_TOKEN}" + } + }, + "semgrep": { + "command": "uvx", + "args": ["semgrep-mcp"] + } + } +} +``` + +**Pre-installed tools (in Dockerfile):** +```dockerfile +# Security tooling (code-level analysis only) +RUN pip install semgrep==1.70.0 +RUN curl -sSfL https://raw.githubusercontent.com/gitleaks/gitleaks/main/scripts/install.sh | sh # v8.18.4 +RUN curl -sfL https://raw.githubusercontent.com/aquasecurity/trivy/main/contrib/install.sh | sh # image/config/K8s scanning +``` + +--- + +### Browser UI Tester + +The Browser UI Tester runs on the host (not in a container) using Claude CLI with Playwright MCP. It validates that the implemented feature works correctly in an actual browser against a live Nextcloud instance. + +**Script:** `scripts/run-browser-tests.sh` +**Skill:** `images/builder/skills/hydra-ui-test/SKILL.md` +**Model:** sonnet +**Runtime:** Host machine with Playwright MCP (headless Chromium) + +The script pre-extracts acceptance criteria (GIVEN/WHEN/THEN scenarios) from the spec into the prompt, so the browser agent does not waste tokens reading files. + +**What it tests:** +- Logs into Nextcloud and navigates to the target app +- Acceptance criteria from the spec +- CRUD flows (create, read, update, delete) +- Navigation between views +- Form validation and error states +- Console errors and JavaScript exceptions +- Network request failures + +**Output:** Structured verdict JSON: +```json +{ + "pass": false, + "findings": [ + { "severity": "CRITICAL", "description": "...", "steps": "..." }, + { "severity": "WARNING", "description": "...", "steps": "..." 
} + ] +} +``` + +If CRITICAL or WARNING findings are present, the Builder is re-launched in **fix-browser mode** (sonnet, max 2 retries). + +## Skill Architecture + +### Builder: OPSX skills + CLAUDE.md + +The Builder container loads the full OPSX skill suite from this repo's `.claude/skills/` +(copied at build time). This gives it access to `opsx-apply`, `opsx-verify`, `opsx-archive`, +`opsx-sync`, `opsx-continue`, and `opsx-apply-loop` β€” the standard Conduction workflow for +implementing changes. + +The Builder's `CLAUDE.md` provides its agent identity, constraints, and headless +adaptations on top of these skills. + +### Code Reviewer & Security Reviewer: Community Skills + Thin Wrapper + +The Code Reviewer and Security Reviewer delegate their review methodology to +community-maintained skills, with a thin `CLAUDE.md` wrapper that enforces: + +1. **Conduction architecture compliance** β€” ADRs specific to our codebase +2. **Output format contracts** β€” finding format and verdict JSON that the orchestrator parses +3. **Hard constraints** β€” no code modification, turn limits, tool restrictions + +This approach minimises custom review logic to maintain while benefiting from community +improvements to review methodology, language support, and security rule coverage. + +**Code Reviewer** uses: +- [awesome-skills/code-review-skill](https://github.com/awesome-skills/code-review-skill) β€” 4-phase review, progressive disclosure, 11 languages including PHP and Vue + +**Security Reviewer** uses: +- [trailofbits/skills](https://github.com/trailofbits/skills) β€” Professional Semgrep scanning methodology from Trail of Bits +- [agamm/claude-code-owasp](https://github.com/agamm/claude-code-owasp) β€” OWASP Top 10:2025 + ASVS 5.0 reference +- **Semgrep MCP server** β€” live interactive SAST scanning (replaces pre-computed JSON) + +Skills are vendored via `git subtree` in `vendor/skills/` and pinned to specific SHAs. 
+See `vendor/skills/VERSIONS.md` for versions and update instructions. + +### Skill minimisation + +Each container still loads only what it needs: + +| Container | What's loaded | Model | What's excluded | +|-----------|--------------|-------|----------------| +| Builder (build) | OPSX skills, ADRs, schemas, personas, security hook | opus | No review skills or security tools | +| Builder (fix-quality) | Same as build | sonnet | Same exclusions | +| Builder (fix-browser) | Same as build | sonnet | Same exclusions | +| Builder (fix) | Same as build | sonnet | Same exclusions | +| Browser UI Tester | Claude CLI + Playwright MCP + hydra-ui-test skill | sonnet | No OPSX skills, no security tools, no code modification | +| Code Reviewer | Community code-review skill + thin CLAUDE.md wrapper + ADRs | sonnet | No OPSX skills, no security tools | +| Security Reviewer | Community security skills (Trail of Bits, OWASP) + Semgrep MCP + Gitleaks/Trivy + thin CLAUDE.md wrapper | sonnet | No OPSX skills, no code review | + +**Why this matters:** +- Reduces token usage (smaller context = cheaper and faster) +- Reduces attack surface (a compromised Builder cannot run security scans to find its own vulnerabilities to hide) +- Makes each container's behaviour more predictable and auditable + +## Configuration Architecture + +Agent configuration is split across two layers. See [Container Architecture β€” Configuration Layers](container-architecture.md#configuration-layers) for the full rationale. 
+ +### Layer 1: Agent definitions (`agents/`) + +Portable, YAML-based definitions that are the source of truth for agent capabilities: + +``` +agents/ +├── base.yaml # Shared: egress, MCP, env, permission_mode +├── al-gorithm/ +│ ├── config.yaml # Runtime: max_turns=80, allowed_tools, extra egress +│ ├── purpose.md # Identity & role description +│ ├── behavior.md # Detailed task workflow +│ ├── constraints.md # Hard rules & boundaries +│ ├── runtime.md # Operational notes & links +│ └── skills.yaml # Skill references +├── juan-claude-van-damme/ +│ └── (same structure) # max_turns=30, read-only tools +└── clyde-barcode/ + └── (same structure) # max_turns=20, Read+Bash only, extra: semgrep.dev +``` + +Key shared settings (`base.yaml`): +- `claude.permission_mode: acceptEdits` — headless container operation without prompts +- `claude.output_format: stream-json` — JSONL log output for parsing +- `egress.hosts` — allowlisted domains; per-agent `extra_hosts` for package registries +- `mcp.github` — GitHub MCP server, shared by all agents + +### Layer 2: Docker image config (`images/`) + +Container-specific files baked into each image at build time: + +``` +images/{builder,reviewer,security}/ +├── CLAUDE.md # Agent identity, workflow, constraints (read-only) +├── settings.json # Claude CLI: allowedTools, MCP permissions +├── .mcp.json # MCP server definitions (runtime JSON format) +└── entrypoint.sh # Bootstrap: git auth, egress firewall, config loading +``` + +Plus in the workspace root (cloned at runtime): +``` +/workspace/ +├── CLAUDE.md # Project-specific coding standards (from target app) +└── REVIEW.md # Review criteria (Code Reviewer only) +``` + +### Entrypoint bootstrap + +All three entrypoints share common logic via `scripts/lib/entrypoint-common.sh` and +`scripts/lib/load-config.py`. The bootstrap sequence: + +1.
Load config from `agents/base.yaml` + agent-specific `config.yaml` +2. Validate required environment variables +3. Set up egress firewall (iptables allowlist from config) +4. Configure git authentication +5. Execute Claude Code with the appropriate prompt + +## Resolved Decisions + +- **Agent GitHub profiles:** Full Conduction company profiles with avatar, bio, and visible contribution history. The humorous names disclose AI nature transparently. +- **Orchestrator persona:** Handled by service account / CI runner — no dedicated agent persona needed. +- **Semgrep MCP:** Integrated into the Security Reviewer container. The MCP server runs alongside the CLI tools (Semgrep, Gitleaks, Trivy). + +## Open Questions + +- Should we add a dedicated test runner persona for future test execution stages? diff --git a/docs/hydra/agentic-workflow.md b/docs/hydra/agentic-workflow.md new file mode 100644 index 0000000..d8d2485 --- /dev/null +++ b/docs/hydra/agentic-workflow.md @@ -0,0 +1,301 @@ +# Agentic Workflow + +This document describes the automated steps that agents execute within the Hydra pipeline. Each step maps to a stage on the GitHub project board. + +## Pipeline Stages + +### 1. Specs (Input) + +**Triggered by:** Card appearing in the Specs/Todo column +**Agent role:** Builder (Al Gorithm) + +The Builder reads the OpenSpec change directory (`openspec/changes/<change-name>/`) and prepares the implementation context: +- Reads `tasks.md` — the canonical task list authored during the spec phase +- Parses requirements (MUST, SHOULD, MAY per RFC 2119) from `design.md` and `specs/` +- Extracts acceptance criteria (GIVEN/WHEN/THEN scenarios) from task definitions +- Identifies files likely affected based on task scope +- Checks for conflicts with existing specs or in-progress changes + +> **Note on tasks.md vs plan.json:** OpenSpec uses `tasks.md` as the primary authoring +> artifact.
A `plan.json` can optionally be generated from `tasks.md` by running +> `/opsx-plan-to-issues`, which adds GitHub Issue links and structured metadata for +> implementation tracking. The Builder reads `tasks.md` directly β€” `plan.json` is not +> required for the pipeline to function. + +**Output:** Implementation context ready for the Apply stage + +### 2. Apply (Build) + +**Triggered by:** Plan ready +**Agent role:** Builder (opus model) + +The agent implements the change: +- Creates the feature branch (`hydra/{spec-name}`) and pushes early (enables quality tests) +- Scaffolds new files if needed (controllers, views, stores, tests) +- Implements the feature following project conventions +- Writes tests matching the acceptance criteria +- Commits incrementally (one commit per logical unit of work) +- Spec path is auto-detected from the issue body (or overridden via `--spec-repo` flag) + +**Constraints:** +- Must follow the target app's coding standards (PHPCS, ESLint, etc.) +- Must not introduce new linting errors +- Must not break existing tests +- Each commit must have a meaningful message + +**Output:** Feature branch with implementation commits + +### 3. Automated Quality Tests + +**Triggered by:** Apply completes (branch pushed) +**Script:** `scripts/run-quality.sh` +**Environment:** Docker `php:X.Y-cli` container (PHP version read from `app-config.json`) +**Fix agent model:** sonnet (targeted fixes) + +All quality checks run inside a Docker container -- nothing runs on the host PHP. 
The automated quality gate covers: + +- **PHP linting** β€” `php -l` (syntax check) +- **PHP code style** β€” PHPCS (PHP_CodeSniffer) +- **PHP mess detection** β€” PHPMD +- **PHP static analysis** β€” Psalm, PHPStan +- **PHP metrics** β€” phpmetrics (complexity, coupling, maintainability) +- **PHP dependency audit** β€” `composer audit` +- **Frontend linting** β€” ESLint, Stylelint +- **Frontend dependency audit** β€” `npm audit` +- **Unit tests** β€” PHPUnit running inside a containerized Nextcloud + SQLite environment +- **API tests** β€” Newman collections via PHP built-in server (when enabled) +- **Build verification** β€” `npm run build` / `composer install` must succeed + +The `--keep-server` flag keeps the containerized Nextcloud instance running so the subsequent browser test stage can use it. + +> SBOM generation, CVE scanning, and license compliance are handled by the organisation-wide quality workflow, not by this stage. + +If quality tests fail: +- Builder is re-launched in **fix-quality mode** (sonnet model, max 2 retries) +- If still failing after retries, the issue is labelled `needs-input` and escalated to a human + +**Output:** Clean build with passing checks + +### 3a. Browser UI Tests + +**Triggered by:** Quality tests pass +**Script:** `scripts/run-browser-tests.sh` (runs on HOST, not in Docker) +**Skill:** `images/builder/skills/hydra-ui-test/SKILL.md` +**Model:** sonnet +**Fix agent model:** sonnet (targeted fixes) + +Browser UI tests run on the host machine using Claude CLI with Playwright MCP (headless Chromium). The script pre-extracts acceptance criteria from the spec into the prompt to avoid wasting tokens on file reading. 
+ +The browser tester: +- Logs into Nextcloud and navigates to the target app +- Tests acceptance criteria (GIVEN/WHEN/THEN scenarios from the spec) +- Tests CRUD flows, navigation, forms, and error states +- Checks for console errors and network failures +- Returns a structured verdict JSON with CRITICAL/WARNING findings + +If browser tests fail: +- Builder is re-launched in **fix-browser mode** (sonnet model, max 2 retries) +- If still failing after retries, the issue is labelled `needs-input` and escalated to a human + +**Output:** Structured verdict JSON confirming UI works as specified + +### 4. Archive + +**Triggered by:** Validation passes +**Agent role:** Publisher + +The agent packages the change for review: +- Creates the Pull Request with structured description +- Links the PR back to the GitHub Issue / board card +- Moves the card to In Progress +- Updates the OpenSpec change status to `pr-created` + +**PR description format:** +```markdown +## Summary + + +## Spec Reference + + +## Changes + + +## Test Coverage + +``` + +**Output:** Open PR ready for review + +### 5. Code Review + +**Triggered by:** Quality tests pass +**Agent role:** Code Reviewer (sonnet model) +**Runs in parallel with:** Security Review (stage 6) + +The agent reviews the PR for: +- **Correctness** β€” Does the implementation match the spec? +- **Style** β€” Does it follow project conventions? +- **Architecture** β€” Are patterns used correctly? Any unnecessary complexity? +- **Edge cases** β€” Are error paths handled? Are inputs validated at boundaries? +- **Performance** β€” Any obvious N+1 queries, unnecessary re-renders, or heavy operations? + +Review results are posted as PR comments. Issues are categorised: +- **CRITICAL** β€” Must fix before merge +- **WARNING** β€” Should fix, but not a blocker +- **SUGGESTION** β€” Nice to have, at reviewer's discretion + +If CRITICAL or WARNING issues are found, the Builder is re-launched in fix mode (sonnet model, max 3 retries). 
If the fix budget is exhausted, the issue is labelled `needs-input` and escalated to a human. Round-based verdict pairing uses marker comments to track which review round each fix addresses. + +**Output:** Review comments on the PR + +### 6. Security + +**Triggered by:** Quality tests pass (runs in parallel with Code Review) +**Agent role:** Security Reviewer (Clyde Barcode, sonnet model) + +The agent reviews the new/changed code for security vulnerabilities: +- **OWASP Top 10** β€” SQL injection, XSS, command injection, LDAP injection, etc. +- **Secret detection** β€” No API keys, passwords, or tokens hardcoded in code +- **Unsafe patterns** β€” Insecure deserialization, broken auth, missing input validation +- **SAST scanning** β€” Semgrep with OWASP rules against the changed files + +> Dependency CVE scanning, SBOM generation, and license compliance are handled by the organisation-wide quality workflow, not by this agent. + +Security findings are posted as PR comments with severity ratings (CRITICAL / WARNING / INFO). WARNING-level findings also generate separate GitHub Issues labelled `finding` for tracking. + +**Output:** Security code review on the PR, finding issues for WARNINGs + +### 7. PR Assigned (Human Gate) + +**Triggered by:** Both Code Review and Security pass +**Board column:** Review + +The PR is assigned to a human reviewer. This is the single mandatory human approval point in the pipeline. The human reviews: +- Does the change make sense for the product? +- Are there concerns the agents missed? +- Is this the right time to ship this change? + +The human approves, requests changes, or rejects. + +### 8. Merge (Human) + +**Triggered by:** Human approves the PR +**Agent role:** None β€” auto-merge intentionally disabled, humans must review and merge + +The human merges the PR (squash or merge commit per project convention). + +### 9. 
Archive (post-merge) + +**Triggered by:** Human has merged the PR +**Script:** `scripts/run-archive.sh` + +After the PR is merged, run the archive step to formalize the change: + +```bash +scripts/run-archive.sh --app-path --change-name [--repo-path owner/repo] [--issue-number N] +``` + +The archive step: +1. **Sync delta specs** β€” copies specs from the change's `specs/` directory to the + app's main `openspec/specs/` directory +2. **Generate test scenarios** β€” converts acceptance criteria from `tasks.md` into + reusable `TS-NNN-*.md` files in `{app}/test-scenarios/`. These are auto-picked + up by `/test-functional`, `/test-app`, and `/test-scenario-run` in future runs. +3. **Update CHANGELOG.md** β€” adds completed tasks under the current version + (Keep a Changelog format) +4. **Move to archive** β€” moves the change to `openspec/changes/archive/YYYY-MM-DD-{name}/` +5. **Close GitHub issue** β€” if `--issue-number` and `--repo-path` are provided + +Test scenarios accumulate over time β€” each archived change adds scenarios, building +a regression test library that future browser tests can execute. + +**Output:** Archived change, test scenarios, updated changelog + +## Traceability + +Every code change is traceable to its spec through two independent paths: + +**Path 1: Git history** +``` +code line β†’ git blame β†’ commit "feat: add columns (#82)" β†’ PR "Closes #82" β†’ Issue #82 β†’ spec +``` + +**Path 2: PHPDoc `@spec` tag** +``` +code line β†’ docblock @spec openspec/changes/kanban-mvp/tasks.md#task-1 β†’ spec +``` + +The Builder adds `@spec` tags at three levels: +- **File docblock**: links the file to the change that created it +- **Class docblock**: links the class to the spec requirement +- **Method docblock**: links each method to the specific task + +Multiple `@spec` tags are supported (code touched by multiple changes over time). +The Code Reviewer checks for missing `@spec` tags as part of architecture compliance. 
+ +Branch naming follows: `feature/{issue-number}/{change-name}` (e.g., `feature/82/kanban-mvp`). +All commits include `(#issue-number)`. PR body starts with `Closes #N`. + +## Agent Personas + +Each pipeline stage is handled by a named agent persona. These personas have Conduction company profiles and contribute visibly to the repository. See [Agent Configuration](agent-configuration.md) for persona details. + +## Error Handling + +| Scenario | Response | +|----------|----------| +| Build fails | Agent retries up to 3 times with different approaches | +| Quality tests fail | Builder fix-quality mode (sonnet, max 2 retries); escalates with `needs-input` label | +| Browser UI tests fail | Builder fix-browser mode (sonnet, max 2 retries); escalates with `needs-input` label | +| Review finds CRITICAL/WARNING | Builder fix mode (sonnet, max 3 retries); escalates with `needs-input` label if budget exhausted | +| Security vulnerability found in code | PR blocked; Builder attempts fix based on Security Reviewer's findings | +| Spec ambiguity | Agent flags the ambiguity as a PR comment; human resolves | +| Rate limit hit | Agent backs off and retries after cooldown | +| Container crash | Pipeline retries once, then logs the failure; card stays in current column | +| OAuth token expired | Token is refreshed before each container launch | + +## Concurrency + +Multiple changes can be in-progress simultaneously. Each change operates on its own branch and its own board card. Concurrent pipeline isolation is achieved through issue-based temp directories (`/tmp/hydra-{issue-number}/`). Agents must: +- Use git worktrees for parallel same-repo work +- Not modify shared configuration without coordination +- Detect and flag merge conflicts early + +## Implementation Status + +The following stages are fully implemented in the current pipeline: + +| Stage | Status | Notes | +|-------|--------|-------| +| 1. 
Specs (Input) | Implemented | Builder reads `tasks.md`; spec path auto-detected from issue body | +| 2. Apply (Build) | Implemented | Feature branch creation, code implementation (opus), early branch push | +| 3. Automated Quality Tests | Implemented | `scripts/run-quality.sh` — all checks inside Docker php:X.Y-cli; `--keep-server` flag | +| 3a. Fix Quality | Implemented | Builder fix-quality mode (sonnet, max 2 retries) | +| 3b. Browser UI Tests | Implemented | `scripts/run-browser-tests.sh` — Playwright MCP on host; structured verdict JSON | +| 3c. Fix Browser | Implemented | Builder fix-browser mode (sonnet, max 2 retries) | +| 4. Archive | Implemented | Draft PR creation with structured description | +| 5. Code Review | Implemented | Parallel with Security (sonnet); posts CRITICAL/WARNING/SUGGESTION | +| 6. Security | Implemented | Parallel with Code Review (sonnet); Semgrep + Gitleaks + Trivy | +| 6a. Fix Review Findings | Implemented | Builder fix mode (sonnet, max 3 retries); escalates with `needs-input` label | +| 7. PR Assigned | Implemented | Human gate — manual review required | +| 8. 
Merge | Manual | Auto-merge intentionally disabled — humans must review and merge | + +Additional implemented features: +- **Parallel reviewer execution** — Code Review and Security Review run simultaneously (halves wall time) +- **Browser UI testing** — Playwright MCP on host with pre-extracted acceptance criteria; structured CRITICAL/WARNING verdicts +- **Board card auto-movement** — via GitHub Projects v2 API (`docs/templates/hydra-board-sync.yml`) +- **Findings as separate issues** — WARNING-level findings create standalone `finding`-labelled issues +- **Pipeline status comment** — `scripts/lib/pipeline-comment.sh` posts progress updates on the issue +- **Retry on container crash** — 1 automatic retry before escalation +- **OAuth token refresh** — token refreshed before each container launch +- **Round-based verdict pairing** — marker comments track which review round each fix addresses +- **Structured log aggregation** — logs written to `logs/pipeline-{timestamp}/` +- **Poll mode** — `--poll` flag for continuous board monitoring +- **Multi-repo spec support** — `--spec-repo` flag for specs in a different repository +- **Concurrent pipeline isolation** — issue-based temp directories for parallel runs + +## Open Questions + +- Should agents be able to split a spec into multiple PRs if it's too large? +- How do we handle cross-app changes that span multiple repositories? diff --git a/docs/hydra/container-architecture.md b/docs/hydra/container-architecture.md new file mode 100644 index 0000000..ea499b9 --- /dev/null +++ b/docs/hydra/container-architecture.md @@ -0,0 +1,340 @@ +# Container Architecture + +This document describes the three-container pipeline, the security constraints applied to +each container, and the audit output each container produces. 
+ +--- + +## Pipeline Overview + +Every OpenSpec change flows through ephemeral containers in sequence: + +``` +Todo (, default: ready-to-build) + β”‚ + β–Ό +β”Œβ”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β” +β”‚ Builder (hydra-builder) β”‚ +β”‚ Persona: Al Gorithm β”‚ +β”‚ Model: opus (complex implementation) β”‚ +β”‚ Tools: Read Write Edit Bash Glob Grep β”‚ +β”‚ Max turns: 80 β”‚ +β”‚ Output: feature branch (pushed early), β”‚ +β”‚ draft PR opened β”‚ +β””β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”˜ + β”‚ + β–Ό +β”Œβ”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β” +β”‚ Automated Quality Tests (Docker) β”‚ +β”‚ Script: scripts/run-quality.sh β”‚ +β”‚ Runs inside: Docker php:X.Y-cli β”‚ +β”‚ (PHP version from app-config.json) β”‚ +β”‚ β”‚ +β”‚ Static analysis: β”‚ +β”‚ lint, phpcs, phpmd, psalm, phpstan, β”‚ +β”‚ phpmetrics, composer audit β”‚ +β”‚ Frontend: β”‚ +β”‚ eslint, stylelint, npm audit β”‚ +β”‚ Tests: β”‚ +β”‚ PHPUnit (containerized NC + SQLite) β”‚ +β”‚ Newman (PHP built-in server) β”‚ +β”‚ β”‚ +β”‚ Flag: --keep-server (keep NC running β”‚ +β”‚ for browser tests) β”‚ +β”‚ Output: pass / fail with log β”‚ +β””β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”˜ + β”‚ fail β†’ Builder fix-quality (sonnet, max 2 retries) + β”‚ pass ↓ + β–Ό +β”Œβ”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β” +β”‚ Browser UI Tests (on HOST) β”‚ +β”‚ Script: scripts/run-browser-tests.sh β”‚ +β”‚ Runtime: Claude CLI + Playwright MCP β”‚ +β”‚ (headless Chromium) β”‚ +β”‚ Skill: hydra-ui-test β”‚ +β”‚ Model: sonnet β”‚ +β”‚ β”‚ +β”‚ Tests: β”‚ +β”‚ Logs into Nextcloud, navigates to appβ”‚ +β”‚ 
Acceptance criteria (GIVEN/WHEN/THEN)β”‚ +β”‚ CRUD flows, navigation, forms β”‚ +β”‚ Error states, console errors β”‚ +β”‚ Network failure detection β”‚ +β”‚ Output: structured verdict JSON β”‚ +β”‚ (CRITICAL/WARNING findings) β”‚ +β””β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”˜ + β”‚ fail β†’ Builder fix-browser (sonnet, max 2 retries) + β”‚ pass ↓ + β–Ό +β”Œβ”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β” β”Œβ”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β” +β”‚ Code Reviewer (hydra-reviewer) β”‚ β”‚ Security Reviewer (hydra-security) β”‚ +β”‚ Persona: Juan Claude van Damme β”‚ β”‚ Persona: Clyde Barcode β”‚ +β”‚ Model: sonnet β”‚ β”‚ Model: sonnet β”‚ +β”‚ Tools: Read Bash Grep Glob β”‚ β”‚ Tools: Read Bash mcp__semgrep__* β”‚ +β”‚ Max turns: 30 β”‚ β”‚ Max turns: 20 β”‚ +β”‚ Resources: 2 CPUs / 4 GB β”‚ β”‚ Resources: 2 CPUs / 4 GB β”‚ +β”‚ Output: PR review comments + verdict β”‚ β”‚ Output: Security findings + verdict β”‚ +β””β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”˜ β””β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”˜ + (parallel execution β€” halves wall time) + β”‚ + β”‚ fail β†’ Builder fix CRITICAL+WARNING (sonnet, max 3 retries) + β”‚ If fix budget exhausted β†’ needs-input label, escalate to human + β”‚ pass ↓ + β–Ό +Draft PR (ready-for-review) β€” human reviews and merges (auto-merge intentionally disabled) +``` + +Each container is destroyed after completion. No state persists between runs except what +is written to the target repository or GitHub. 
+ +--- + +## Model Selection + +Each pipeline stage uses the most cost-effective model for its task complexity: + +| Stage | Model | Rationale | +|-------|-------|-----------| +| Builder (build) | opus | Complex implementation requires deep reasoning | +| Builder (fix-quality) | sonnet | Targeted lint/test fixes are straightforward | +| Builder (fix-browser) | sonnet | Targeted UI fixes based on structured findings | +| Builder (fix review findings) | sonnet | Targeted fixes for CRITICAL+WARNING findings | +| Browser UI Tester | sonnet | Structured acceptance testing via Playwright MCP | +| Code Reviewer | sonnet | Review is analytical, not generative | +| Security Reviewer | sonnet | Pattern matching against known vulnerability classes | + +Override: set `HYDRA_MODEL` environment variable to force a specific model for all stages. + +--- + +## Security Constraints + +These constraints are non-negotiable and apply to all containers in all deployment models. + +| Constraint | Value | Rationale | +|---|---|---| +| `--read-only` | always | Prevents the agent from modifying the container filesystem | +| `--tmpfs /tmp:size=512M` | always | Writable scratch space with size cap | +| `--tmpfs /workspace:size=2G` | always | Clone workspace with size cap | +| `--security-opt no-new-privileges` | always | Prevents privilege escalation via setuid | +| `--cap-drop ALL` | always | Removes all Linux capabilities | +| `--cap-add NET_ADMIN` | local only | Required for iptables egress rules in entrypoint.sh | +| `--cpus 4 --memory 8g` | Builder | Resource caps to prevent runaway builds | +| `--cpus 2 --memory 4g` | Reviewers | Lower resource allocation for read-only review tasks | +| `--network hydra-net` | local | Isolated Docker network | +| `--output-format stream-json` | always | JSONL output for log parsing | +| `--max-turns` | 80 / 30 / 20 | Hard turn limit prevents infinite loops (builder / reviewer / security) | +| No `--dangerously-skip-permissions` | always | Explicit 
allowedTools required | + +**Origin of these constraints:** a prior Claude Code session inherited organisation-admin +Git rights through a developer's WSL session and bypassed peer review. These constraints +are the direct response to that threat model. + +--- + +## SBOM & Audit Trails + +### What each container produces + +Every pipeline run leaves a durable, verifiable audit trail on GitHub β€” no separate audit +store is required. + +#### Builder + +| Output | Location | Contains | +|---|---|---| +| Feature branch | `hydra/` on target repo | All code changes | +| Draft PR | Target repo PR | Structured description, spec reference, change list, test coverage | +| PR description | PR body | Summary, Spec Reference, Changes, Test Coverage sections | +| Commit messages | Git history | Task reference, what changed and why | +| RFI comment | GitHub issue (if blocked) | Blocker reason, attempted fixes, what is needed | +| Status update | `openspec/changes//design.md` | `status: pr-created` | + +#### Code Reviewer + +| Output | Location | Contains | +|---|---|---| +| Review comments | PR comments | Findings with CRITICAL/WARNING/SUGGESTION severity | +| Verdict comment | PR comment | `{ "pass": bool, "blocking": [...] }` | + +#### Security Reviewer + +| Output | Location | Contains | +|---|---|---| +| SAST findings | PR comments | Semgrep + Gitleaks + Trivy results with severity | +| Security verdict | PR comment | `{ "pass": bool, "blocking": [...] }` | + +### Traceability chain + +``` +GitHub Issue (requirement) + β†’ OpenSpec change (design.md, tasks.md, specs/) + β†’ Builder commit history (what was implemented) + β†’ Draft PR (summary + spec reference) + β†’ Code Review verdict (correctness, style) + β†’ Security verdict (SAST, secrets, hardening) + β†’ Human approval (4-eyes rule) + β†’ Merge to main +``` + +Every step is recorded on GitHub with a timestamp and the identity of the agent or human +that took the action. 
This chain satisfies: + +- **EU AI Act Article 12** (transparency and traceability for high-risk AI systems): + all AI-generated code is explicitly labelled, attributed to a named agent, and reviewed + by a human before reaching production. +- **ISO 27001 A.12.1.2** (change management): every change is traceable from requirement + to deployment. +- **"Public money = public code"**: the full audit trail is available to any reviewer of + the public repository. + +### SBOM generation (phase 2) + +In a future phase, each Builder run will additionally produce: + +- A CycloneDX or SPDX SBOM for the target application's dependencies +- An attestation file (`hydra-attestation.json`) recording which model version, which spec, + and which turn count produced the change + +These will be attached to the GitHub release as assets. + +--- + +## Configuration Layers + +Agent configuration is split across two complementary layers β€” portable agent definitions +and Docker-specific runtime config: + +### Layer 1: Agent definitions (`agents/`) + +Portable, deployment-agnostic configuration managed as YAML: + +``` +agents/ +β”œβ”€β”€ base.yaml # Shared config (egress, MCP, env vars) +β”œβ”€β”€ al-gorithm/config.yaml # Builder overrides (tools, turns, extra env) +β”œβ”€β”€ juan-claude-van-damme/config.yaml # Reviewer overrides +└── clyde-barcode/config.yaml # Security overrides +``` + +`base.yaml` defines shared settings (egress allowlist, MCP servers, required env vars, +`permission_mode`). Per-agent `config.yaml` files extend it using kustomize-style merging: +scalars override, lists replace, maps deep-merge. + +Key settings in `base.yaml`: +- `claude.permission_mode: acceptEdits` β€” allows containers to run headless without + interactive permission prompts +- `claude.output_format: stream-json` β€” JSONL output for log parsing +- `egress.hosts` β€” allowlisted domains (api.anthropic.com, github.com, etc.) 
+- `mcp.github` β€” GitHub MCP server shared by all agents + +### Layer 2: Docker image config (`images/`) + +Container-specific files baked into each Docker image at build time: + +``` +images/{builder,reviewer,security}/ +β”œβ”€β”€ CLAUDE.md # Agent identity, workflow, and constraints +β”œβ”€β”€ settings.json # Claude CLI settings (allowedTools, MCP permissions) +β”œβ”€β”€ .mcp.json # MCP server definitions (runtime format) +└── entrypoint.sh # Container bootstrap script +``` + +### Shared entrypoint library (`scripts/lib/`) + +Common bootstrap logic is extracted into shared scripts to reduce duplication: + +- `scripts/lib/entrypoint-common.sh` β€” shared setup functions (git auth, egress firewall, + directory prep) called by all three container entrypoints +- `scripts/lib/load-config.py` β€” YAML config parser that merges `base.yaml` with per-agent + `config.yaml` and exposes values as environment variables +- `scripts/lib/pipeline-comment.sh` β€” pipeline status tracking via comments on the GitHub issue +- `scripts/run-quality.sh` β€” automated quality runner (all checks inside Docker php:X.Y-cli + container: lint, phpcs, phpmd, psalm, phpstan, phpmetrics, composer audit, eslint, + stylelint, npm audit, PHPUnit with containerized Nextcloud + SQLite, Newman API tests; + `--keep-server` flag keeps Nextcloud running for subsequent browser tests) +- `scripts/run-browser-tests.sh` β€” browser UI testing on HOST via Claude CLI + Playwright + MCP (headless Chromium); pre-extracts acceptance criteria into prompt; returns structured + verdict JSON with CRITICAL/WARNING findings + +### Why the split? + +The agent definitions in `agents/` are the **source of truth for what each agent can do** +(tools, turn limits, egress, MCP servers). The Docker image config in `images/` is the +**runtime materialisation** of those definitions plus container-specific concerns +(entrypoint scripts, Claude CLI format). 
This separation allows the same agent definitions +to work across all three deployment models (local Docker, GitHub Actions, Kubernetes). + +--- + +## ADR Layering + +Architectural Decision Records (ADRs) guide how agents write code. They come from two +sources, applied in priority order: + +### Layer 1: Conduction company-wide ADRs (baked into image) + +At build time, the Builder image copies the 12 compact ADRs from this repo's +`openspec/architecture/` into `/home/claude/.claude/openspec/architecture/`. These define +baseline conventions for all Conduction projects: + +| ADR | Topic | +|-----|-------| +| adr-001 | OpenRegister data layer (incl. register templates + seed data) | +| adr-002 | API conventions (NL API strategie) | +| adr-003 | Backend layering (Controllerβ†’Serviceβ†’Mapper, DI, routing) | +| adr-004 | Frontend patterns (Vue 2, Pinia, @conduction/nextcloud-vue) | +| adr-005 | Security and auth | +| adr-006 | Prometheus metrics and health checks | +| adr-007 | i18n requirement (nl/en minimum, register i18n) | +| adr-008 | Mandatory test coverage | +| adr-009 | Documentation with screenshots | +| adr-010 | NL Design System (CSS vars, WCAG AA) | +| adr-011 | Schema standards (schema.org, vCard) | +| adr-012 | Deduplication check against OpenRegister core | + +### Layer 2: Project-specific ADRs (from target repo at runtime) + +When the Builder clones the target repository, it checks for +`openspec/architecture/` in the repo root. If present, these ADRs extend or override +the company-wide ones. Project ADRs take precedence when they conflict with Conduction ADRs. + +### Resolution order + +1. Read all Conduction ADRs (always present in the image) +2. Read project ADRs (if the target repo has them) +3. On conflict: project ADR wins +4. On silence: Conduction ADR applies + +--- + +## OPSX Integration + +The Builder container includes the full OPSX skill suite from this repo's +`.claude/skills/`, copied at build time. 
This gives the Builder access to: + +- **OPSX skills** (`skills/opsx-apply`, `opsx-verify`, `opsx-archive`, etc.) β€” the standard + Conduction workflow for implementing, verifying, and archiving OpenSpec changes +- **OPSX commands** (`commands/opsx/apply.md`, etc.) β€” slash command definitions +- **OpenSpec CLI** (`@fission-ai/openspec`) β€” for checking change status and getting instructions +- **Conduction schemas** β€” artifact templates for proposals, designs, specs, tasks, etc. + +The Builder follows the OPSX apply workflow (read context β†’ implement tasks β†’ quality checks +β†’ mark complete) but runs headless without interactive prompts. + +--- + +## Security Hooks + +The Builder container inherits the PreToolUse security hook +(originally from `ConductionNL/.github/global-settings/block-write-commands.sh`). This provides defence-in-depth alongside +the container-level isolation: + +- **Hard-blocks** writes to `~/.claude/` config files +- **Hard-blocks** WSL boundary escapes (not applicable in containers, but harmless) +- **Prompts for approval** on destructive operations (rm, git push, etc.) +- **Requires explicit authorization** for git push via transcript phrase matching + +In headless container mode, the hook's "ask" decisions are handled by the +`permission_mode: acceptEdits` setting β€” the agent auto-approves non-blocked operations. +Hard-blocks still apply regardless of permission mode. diff --git a/docs/hydra/context.md b/docs/hydra/context.md new file mode 100644 index 0000000..0d8c484 --- /dev/null +++ b/docs/hydra/context.md @@ -0,0 +1,122 @@ +# Context β€” Hydra Platform + +This document captures the operational context of the Hydra platform: how it is positioned +within Conduction B.V., how it receives work, and how credentials flow through the pipeline. + +--- + +## Purpose + +Hydra takes structured OpenSpec change proposals and turns them into validated, security- +scanned code on a feature branch β€” ready for a single human approval. 
+ +It is the factory, not the product. The applications it builds live under `ConductionNL`. + +--- + +## Secrets & Tokens + +### Overview + +Each container receives exactly the credentials it needs — no more. No shared org-admin +tokens exist in the pipeline. + +| Container | Token variable | PAT scope | What it can do | +|---|---|---|---| +| Builder | `GIT_TOKEN` = `HYDRA_BUILDER_TOKEN` | `contents:write`, `pull-requests:write` | Clone, push branch, create draft PR, post issue comments | +| Code Reviewer | `GIT_TOKEN` = `HYDRA_REVIEWER_TOKEN` | `pull-requests:write` | Post PR review comments only | +| Security Reviewer | `GIT_TOKEN` = `HYDRA_SECURITY_TOKEN` | `pull-requests:write` | Post PR review comments only | + +All containers also receive `ANTHROPIC_API_KEY` (same key, all containers). + +### Injection method per deployment model + +#### Local (Docker) + +Secrets are loaded from `secrets/.env` (gitignored). The `docker run` command injects them +via `--env-file`: + +```bash +docker run \ + --env-file secrets/.env \ + -e GIT_TOKEN="${HYDRA_BUILDER_TOKEN}" \ + ... +``` + +The `orchestrate.sh` script reads `secrets/.env` and maps the correct token to `GIT_TOKEN` +based on which stage is running: + +```bash +# Builder stage: +GIT_TOKEN="${HYDRA_BUILDER_TOKEN}" + +# Review stage: +GIT_TOKEN="${HYDRA_REVIEWER_TOKEN}" + +# Security stage: +GIT_TOKEN="${HYDRA_SECURITY_TOKEN}" +``` + +See `secrets/.env.example` for the complete variable reference. + +#### GitHub Actions + +Secrets are stored in the GitHub organisation under **Settings → Secrets and variables → +Actions**. Each workflow step injects the token for the specific container via `env:`: + +```yaml +- name: Run Builder + env: + ANTHROPIC_API_KEY: ${{ secrets.HYDRA_ANTHROPIC_KEY }} + GIT_TOKEN: ${{ secrets.HYDRA_BUILDER_TOKEN }} +``` + +GitHub Actions secrets are never echoed in logs. Each job only receives its own token. 
+ +#### Kubernetes (phase 3) + +Secrets are stored as Kubernetes `Secret` objects in the `hydra` namespace, managed by +ArgoCD via sealed secrets or external secrets operator. Each Job manifest mounts only the +secret it needs via `env.valueFrom.secretKeyRef`: + +```yaml +env: + - name: GIT_TOKEN + valueFrom: + secretKeyRef: + name: hydra-builder-token + key: token + - name: ANTHROPIC_API_KEY + valueFrom: + secretKeyRef: + name: hydra-anthropic-key + key: key +``` + +Secrets are never mounted as files β€” always injected as environment variables. + +### What is NOT a secret + +The following are not secrets and may appear in logs: + +- `REPO_URL` β€” the GitHub URL of the target repository +- `ISSUE_URL` β€” the GitHub issue URL triggering the build +- `PR_URL` β€” the PR URL (Reviewers only) +- `SPEC_PATH` β€” path to the OpenSpec change inside the container +- `GITHUB_ORG`, `HYDRA_PROJECT_NUMBER` β€” organisation metadata + +--- + +## Input Contract + +Every container receives its inputs as environment variables (never as command-line args +or mounted config files, to avoid leakage via `/proc`): + +| Variable | Required by | Description | +|---|---|---| +| `ANTHROPIC_API_KEY` | all | Claude API key | +| `GIT_TOKEN` | all | Scoped PAT for this persona | +| `REPO_URL` | all | `https://github.com/ConductionNL/` | +| `ISSUE_URL` | Builder | GitHub issue that triggered the build | +| `PR_URL` | Reviewers | GitHub PR to review | +| `SPEC_PATH` | Builder | Path to `openspec/changes//` inside the container | diff --git a/docs/hydra/deployment-models.md b/docs/hydra/deployment-models.md new file mode 100644 index 0000000..393587c --- /dev/null +++ b/docs/hydra/deployment-models.md @@ -0,0 +1,225 @@ +# Deployment Models + +Hydra has three deployment layers. The container images are identical across all three β€” +only the orchestrator changes. 
+ +--- + +## Model 1: Local (Developer Machine) + +**Trigger:** cron job running `poll-board.sh` every 15 minutes, or manual `scripts/dev-run.sh` + +**Runner:** Docker on developer workstation + +**Use case:** development of the pipeline itself; testing new container builds locally + +### secrets/ structure + +Create both files in `secrets/` from their `.example` templates (gitignored): + +```bash +cp secrets/.env.example secrets/.env +cp secrets/credentials.example.json secrets/credentials.json +``` + +`secrets/credentials.json` holds the Claude OAuth tokens (multi-account, with +auto-fallback on rate limits) and the per-role Git PAT map. `secrets/.env` +holds the GitHub PATs again (mirrored for `docker --env-file`), the org/board +config, and optional fallback Claude auth. See [`docs/operations/secrets.md`](operations/secrets.md) +for the full layout. + +```bash +# secrets/.env (required values shown β€” see secrets/.env.example for all options) + +# GitHub PATs β€” one per agent persona +HYDRA_BUILDER_TOKEN=ghp_... # contents:write, pull-requests:write +HYDRA_REVIEWER_TOKEN=ghp_... # pull-requests:write (comments only) +HYDRA_SECURITY_TOKEN=ghp_... 
# pull-requests:write (comments only) + +# GitHub organisation hosting the target app repos +GITHUB_ORG=ConductionNL + +# GitHub Projects v2 board number — find it in the URL: +# github.com/orgs/<org>/projects/<number> +HYDRA_PROJECT_NUMBER=1 +``` + +### docker run command (Builder example) + +```bash +docker run --rm \ + --read-only \ + --tmpfs /tmp:size=512M \ + --tmpfs /workspace:size=2G \ + --security-opt no-new-privileges \ + --cap-drop ALL \ + --cap-add NET_ADMIN \ + --cpus 4 \ + --memory 8g \ + --network hydra-net \ + --env-file secrets/.env \ + -e REPO_URL="${REPO_URL}" \ + -e ISSUE_URL="${ISSUE_URL}" \ + -e SPEC_PATH="/spec" \ + -v "$(pwd)/openspec/changes/${CHANGE_NAME}:/spec:ro" \ + -e GIT_TOKEN="${HYDRA_BUILDER_TOKEN}" \ + ghcr.io/conductionnl/hydra-builder:latest +``` + +> `--cap-add NET_ADMIN` is needed locally so entrypoint.sh can configure iptables egress +> rules. In Kubernetes, NetworkPolicy handles egress instead and this flag is omitted. + +Create the Docker network once: +```bash +docker network create hydra-net +``` + +### Cron setup + +Add to crontab (`crontab -e`): + +``` +*/15 * * * * /path/to/hydra/scripts/poll-board.sh >> /var/log/hydra-poll.log 2>&1 +``` + +--- + +## Model 2: GitHub Actions + +**Trigger:** `issues: [labeled]` event with the trigger label (default: `ready-to-build`, configurable via `HYDRA_TRIGGER_LABEL` repository variable) + +**Runner:** GitHub-hosted runner (`ubuntu-latest`) + +**Use case:** team workflow — create a GitHub issue, add label, pipeline runs automatically + +### GitHub Secrets to configure (at organisation level) + +Set these under **Settings → Secrets and variables → Actions** in the `ConductionNL` +organisation: + +| Secret name | Value | PAT scopes required | +|---|---|---| +| `HYDRA_ANTHROPIC_KEY` | Anthropic API key | — | +| `HYDRA_BUILDER_TOKEN` | PAT for Al Gorithm | `contents:write`, `pull-requests:write` on target repo | +| `HYDRA_REVIEWER_TOKEN` | PAT for Juan Claude van Damme | 
`pull-requests:write` on target repo | +| `HYDRA_SECURITY_TOKEN` | PAT for Clyde Barcode | `pull-requests:write` on target repo | +| `GITHUB_ORG` | `ConductionNL` | β€” | +| `HYDRA_PROJECT_NUMBER` | Project board number | β€” | + +### PAT scopes per persona + +**Al Gorithm (Builder):** +- `contents:write` β€” clone, push feature branch +- `pull-requests:write` β€” create draft PR, post RFI comment +- Scoped to: `ConductionNL/` only (fine-grained PAT preferred) + +**Juan Claude van Damme (Code Reviewer):** +- `pull-requests:write` β€” post review comments, post verdict +- No `contents` scope β€” cannot push code +- Scoped to: `ConductionNL/` only + +**Clyde Barcode (Security Reviewer):** +- `pull-requests:write` β€” post security findings, post verdict +- No `contents` scope β€” cannot push code +- Scoped to: `ConductionNL/` only + +### Workflow chain + +``` +issues: labeled () + └── hydra-build.yml + β”œβ”€β”€ runs Builder container (35 min timeout) + └── on success: opens draft PR on target repo + +pull_request: opened/synchronize (branch: hydra/*) + └── hydra-review.yml + β”œβ”€β”€ Code Reviewer job (20 min timeout) ─┐ parallel + └── Security Reviewer job (20 min timeout) β”€β”˜ + └── each posts verdict comment on PR +``` + +--- + +## Model 3: Self-Hosted Server (Kubernetes + ArgoCD) + +**Trigger:** GitHub board card event or webhook + +**Runner:** Self-hosted K8s job via `actions-runner-controller` + +**Use case:** production β€” air-gapped, auditable, no public GitHub runner access to code + +### actions-runner-controller installation + +```bash +helm repo add actions-runner-controller \ + https://actions-runner-controller.github.io/actions-runner-controller + +helm install actions-runner-controller \ + actions-runner-controller/actions-runner-controller \ + --namespace actions-runner-system \ + --create-namespace \ + --set authSecret.create=true \ + --set authSecret.github_token="${GITHUB_PAT}" +``` + +Create a `RunnerDeployment` in the `hydra` namespace: + 
+```yaml +apiVersion: actions.summerwind.dev/v1alpha1 +kind: RunnerDeployment +metadata: + name: hydra-runner + namespace: hydra +spec: + replicas: 2 + template: + spec: + organization: ConductionNL + labels: + - self-hosted + - linux + - hydra +``` + +### Migration from hosted to self-hosted runner + +In each workflow file, change one line: + +```yaml +# Before (GitHub-hosted): +runs-on: ubuntu-latest + +# After (self-hosted): +runs-on: [self-hosted, linux, hydra] +``` + +All other workflow configuration remains identical. The container images are pulled from GHCR +— no code is on the runner. + +### ArgoCD Application definition + +See `manifests/argocd-app.yaml`. The Application syncs `manifests/` from the `main` branch. + +```yaml +# Key fields +spec: + source: + repoURL: https://github.com/ConductionNL/hydra + targetRevision: main + path: manifests + destination: + server: https://kubernetes.default.svc + namespace: hydra + syncPolicy: + automated: + prune: true + selfHeal: true +``` + +### NetworkPolicy per Job namespace + +Each container Job runs in its own namespace with a NetworkPolicy that mirrors the iptables +allowlist in `entrypoint.sh`. See `manifests/network-policy.yaml` for the full definitions. + +Key principle: egress is denied by default; only HTTPS (443) to the specific allow-listed +hosts is permitted per container type. diff --git a/docs/hydra/github-workflow.md b/docs/hydra/github-workflow.md new file mode 100644 index 0000000..a27fdd3 --- /dev/null +++ b/docs/hydra/github-workflow.md @@ -0,0 +1,143 @@ +# GitHub Workflow + +Hydra uses GitHub labels as its state machine. All pipeline state lives on GitHub — labels, PRs, comments. No in-memory state. If any step is interrupted, the next cron cycle resumes from the current label state. 
+ +## Label State Machine + +``` + β”Œβ”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β” + β”‚ β”‚ + β”‚ (default: ready-to-build) β”‚ + β”‚ (configurable via HYDRA_TRIGGER_LABEL)β”‚ + β””β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”¬β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”˜ + β”‚ + β”Œβ”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β–Όβ”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β” + β”‚ pipeline-active + building β”‚ + β”‚ (builder container running) β”‚ + β”‚ Creates PR, runs quality checks β”‚ + β””β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”¬β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”˜ + β”‚ + β”Œβ”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β–Όβ”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β” + β”‚ pipeline-active β”‚ + β”‚ + ready-for-code-review (on PR) β”‚ + β”‚ + ready-for-security-review (on PR) β”‚ + β”‚ (review agents running) β”‚ + β””β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”¬β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”˜ + β”‚ + β”Œβ”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”΄β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β” + β”‚ β”‚ + β”Œβ”€β”€β”€β”€β”€β”€β–Όβ”€β”€β”€β”€β”€β”€β” β”Œβ”€β”€β”€β”€β”€β”€β”€β”€β–Όβ”€β”€β”€β”€β”€β”€β”€β”€β” + β”‚ verdicts β”‚ β”‚ verdicts β”‚ + β”‚ all pass β”‚ β”‚ any fail β”‚ + β””β”€β”€β”€β”€β”€β”€β”¬β”€β”€β”€β”€β”€β”€β”˜ β””β”€β”€β”€β”€β”€β”€β”€β”€β”¬β”€β”€β”€β”€β”€β”€β”€β”€β”˜ + β”‚ β”‚ + β”Œβ”€β”€β”€β”€β”€β”€β–Όβ”€β”€β”€β”€β”€β”€β” β”Œβ”€β”€β”€β”€β”€β”€β”€β”€β–Όβ”€β”€β”€β”€β”€β”€β”€β”€β” + β”‚ ready-for- β”‚ β”‚ pipeline-active β”‚ + β”‚ review β”‚ β”‚ + building β”‚ + β”‚ (or auto- β”‚ β”‚ (fix iteration) β”‚ + β”‚ merge if β”‚ β”‚ then re-trigger β”‚ + β”‚ yolo) β”‚ β”‚ reviews β”‚ + β””β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”˜ β””β”€β”€β”€β”€β”€β”€β”€β”€β”¬β”€β”€β”€β”€β”€β”€β”€β”€β”˜ + β”‚ + (loops back to review) + max 3 fix iterations + then β†’ needs-input +``` 
+ +## Label Reference + +| Label | Applied to | Meaning | Set by | Removed by | +|-------|-----------|---------|--------|------------| +| `ready-to-build` (configurable) | Issue | New spec, ready for build. Label name configurable via `HYDRA_TRIGGER_LABEL` env var (default: `ready-to-build`). | Specter push script | Orchestrate (build start) | +| `pipeline-active` | Issue | In-progress β€” cron re-dispatches each cycle | Orchestrate (build start) | Orchestrate (merge or escalation) | +| `building` | Issue | Container actively running β€” blocks concurrent dispatch | Orchestrate (before build/fix) | Orchestrate (after build/fix) | +| `ready-for-code-review` | PR | Code review agent should run | Orchestrate (after build/fix) | Supervisor (successful completion) | +| `ready-for-security-review` | PR | Security review agent should run | Orchestrate (after build/fix) | Supervisor (successful completion) | +| `ready-for-review` | Issue | All AI reviews passed β€” human review or auto-merge | Orchestrate (verdicts pass) | Human (after merge) | +| `needs-input` | Issue | Escalated to human β€” fix budget exhausted or build failure | Orchestrate | Human | +| `yolo` | Issue | Skip human review β€” auto-merge when AI reviews pass | Specter push script | Orchestrate (after merge) | +| `openspec` | Issue | Change is spec-driven | Specter push script | β€” | +| `oversized` | Issue | Spec generation exceeded turn limit β€” needs splitting | Specter push script | Human | + +## How Cron Reads Labels + +**cron-hydra.sh** (every 5 minutes): +1. Searches for issues with the trigger label (default: `ready-to-build`, configurable via `HYDRA_TRIGGER_LABEL`) OR `pipeline-active` +2. Skips issues with `building` label (another instance working) +3. Checks `hydra.json` β€” skips if dependency issues aren't closed +4. Dispatches to orchestrate.sh + +**hydra-supervisor.sh** (continuous daemon): +1. Searches for PRs with `ready-for-code-review` or `ready-for-security-review` +2. 
Skips if already locked (same PR being reviewed) +3. Caps at 3 attempts per PR β€” then removes labels and adds `needs-input` +4. Uses shared slot pool (max 5 concurrent with builders) + +## Concurrency Control + +A shared slot pool (`/tmp/hydra-slots/`) governs all container launches: +- Max **5 slots** across builds, fixes, and reviews +- `cron-hydra.sh` and the supervisor's review dispatcher share the same pool +- `building` label on GitHub provides distributed locking across multiple Hydra instances +- If no slots available, work is deferred to the next cron cycle + +## Board Structure + +The project board has four columns: + +| Column | What lives here | How cards arrive | +|--------|----------------|-----------------| +| **Todo** | Issues with trigger label (default: `ready-to-build`) | Specter creates issue | +| **In Progress** | Issues with `pipeline-active` or `building` | Orchestrate moves card | +| **Review** | Issues with `ready-for-review` | Orchestrate moves card (all checks pass) | +| **Archived** | Merged changes | Human or yolo merge | + +## Standalone Reviews + +Review agents work independently from the build pipeline. Add a label to **any PR** in the org: + +| Label on PR | What happens | +|-------------|-------------| +| `ready-for-code-review` | Code Reviewer runs, posts findings + verdict | +| `ready-for-security-review` | Security Reviewer runs, posts findings + verdict | +| Both labels | Both run sequentially in one slot | + +Labels are only removed on successful completion. Failed reviews keep labels for retry on next cron cycle (max 3 attempts). 
+ +## Branch Strategy + +- Spec branches: `spec/{slug}` (merged to development by Specter, no review needed) +- Feature branches: `feature/{issue-number}/{change-name}` (created by builder) +- PRs target `development` +- `development` β†’ `beta` β†’ `main` via release process + +## Dependency Enforcement + +Each spec includes `openspec/changes/{name}/hydra.json`: + +```json +{ "depends_on": ["core", "access-control-authorisation"] } +``` + +Before dispatching a build, the cron verifies all dependencies have closed implementation issues. This enforces build order without burning tokens: + +``` +Layer 0: core (no deps) β†’ builds immediately +Layer 1: access-control (core) β†’ builds after core merges +Layer 2: accounts-payable β†’ builds after layer 1 merges + (core + access-control) +``` + +## Findings + +Review agents may discover issues unrelated to the current spec: +- **CRITICAL** findings block the PR β€” must be fixed by builder +- **WARNING** findings are posted as comments and may generate separate `finding`-labelled issues +- **SUGGESTION** findings are informational β€” no action required + +## Cron Schedule + +| Script | Interval | Purpose | +|--------|----------|---------| +| `cron-hydra.sh` | `*/5 * * * *` | Discover + dispatch builds, resume active pipelines | +| `hydra-supervisor.sh` | daemon (1-min watchdog) | Run code + security reviews on labelled PRs | diff --git a/docs/hydra/operations/README.md b/docs/hydra/operations/README.md new file mode 100644 index 0000000..ad55660 --- /dev/null +++ b/docs/hydra/operations/README.md @@ -0,0 +1,13 @@ +# Operations + +How to run, debug, and understand the Hydra pipeline containers. 
+ +| Guide | What it covers | +|---|---| +| [auth.md](auth.md) | Claude auth (OAuth, API key, auto-refresh), GitHub PATs, git auth inside containers | +| [entrypoints.md](entrypoints.md) | Step-by-step flow per container (Builder, Reviewer, Security) | +| [dev-run.md](dev-run.md) | Running containers locally, flags, persistent workspace | +| [logging.md](logging.md) | Log files, JSONL format, parsing | +| [secrets.md](secrets.md) | File layout, .gitignore rules, per-deployment injection | +| [networking.md](networking.md) | Container network, K8s NetworkPolicy, egress control | +| [troubleshooting.md](troubleshooting.md) | Common errors and fixes | diff --git a/docs/hydra/operations/auth.md b/docs/hydra/operations/auth.md new file mode 100644 index 0000000..b9e5a4c --- /dev/null +++ b/docs/hydra/operations/auth.md @@ -0,0 +1,64 @@ +# Authentication + +Hydra containers need two types of auth: **Claude** (for AI) and **GitHub** (for git/PR operations). + +## Claude auth — resolution order + +The pipeline reads Claude OAuth tokens from `secrets/credentials.json` (the +orchestrator config) and falls back to env vars only when that file is absent: + +| Priority | Source | Auto-refresh | Use case | +|---|---|---|---| +| 1 | `secrets/credentials.json` `claude_accounts[]` (multiple, sorted by priority field) | Per token | Standard local setup with multi-account fallback on rate limits | +| 2 | `CLAUDE_CODE_OAUTH_TOKEN` in `secrets/.env` | No — expires ~24h | CI, static injection (K8s) | +| 3 | `ANTHROPIC_API_KEY` in `secrets/.env` | N/A — no expiry | Pay-per-token billing | + +`scripts/lib/credentials.sh` is the single source of truth for token loading. +If `secrets/credentials.json` doesn't exist, the script errors out and the +pipeline won't start. (The host-side browser test runner has a separate path +through `secrets/claude-credentials.json` → `~/.claude/.credentials.json` — +see [secrets.md](secrets.md).) 
+ +**Generating tokens:** Run `claude setup-token` while logged in to the relevant +Max account, then copy the printed token into `credentials.json`. To regenerate +expired tokens, repeat the process. + +**Generating a token manually:** + +```bash +claude setup-token +# Opens browser β†’ log in to your Max account β†’ token is stored in ~/.claude/.credentials.json +``` + +**For Kubernetes:** create a secret with a static token. It will expire; rotation is a +future improvement (see Changelog). + +```bash +kubectl create secret generic hydra-claude-oauth \ + --from-literal=token=sk-ant-oat01-... -n hydra +``` + +## GitHub PATs β€” per-agent scoping + +Each agent persona gets its own GitHub Personal Access Token with minimal scopes: + +| Agent | Token variable | Required scopes | +|---|---|---| +| Al Gorithm (Builder) | `HYDRA_BUILDER_TOKEN` | `contents:write`, `pull-requests:write` | +| Juan Claude van Damme (Reviewer) | `HYDRA_REVIEWER_TOKEN` | `pull-requests:write` (no contents write) | +| Clyde Barcode (Security) | `HYDRA_SECURITY_TOKEN` | `pull-requests:write` (no contents write) | + +Prefer fine-grained PATs scoped to specific repositories. Rotate at least every 90 days. + +## How git auth works inside containers + +Each entrypoint configures git to embed the token transparently: + +```bash +git config --global url."https://x-access-token:${GIT_TOKEN}@github.com/".insteadOf "https://github.com/" +export GH_TOKEN="${GIT_TOKEN}" +export GITHUB_PERSONAL_ACCESS_TOKEN="${GIT_TOKEN}" +``` + +This means `git clone`, `git push`, and `gh` CLI all work without the agent needing to +handle credentials. The agent prompt never sees the token. diff --git a/docs/hydra/operations/cron.md b/docs/hydra/operations/cron.md new file mode 100644 index 0000000..17538f1 --- /dev/null +++ b/docs/hydra/operations/cron.md @@ -0,0 +1,121 @@ +# Cron Jobs + +Hydra uses a long-running supervisor daemon for build + review dispatch. 
A small number of cron jobs keep the daemon alive and handle periodic housekeeping. + +## Setup + +```bash +# Install the crontab +(crontab -l 2>/dev/null; cat <<'CRON' +* * * * * /path/to/hydra/scripts/watchdog-supervisor.sh >> /path/to/hydra/logs/watchdog.log 2>&1 +*/10 * * * * /path/to/hydra/scripts/reconcile.sh >> /path/to/hydra/logs/reconcile.log 2>&1 +*/30 * * * * /path/to/hydra/scripts/cron-audit.sh >> /path/to/hydra/logs/audit-cron.log 2>&1 +*/10 * * * * /path/to/hydra/scripts/cron-spec-from-issue.sh >> /path/to/hydra/logs/spec-from-issue.log 2>&1 +15 * * * * /path/to/hydra/scripts/cron-update-prs.sh >> /path/to/hydra/logs/update-prs-cron.log 2>&1 +CRON +) | crontab - +``` + +Replace `/path/to/hydra` with the actual path (e.g., `/home/wilco/hydra`). + +## Cron Jobs + +| Script | Interval | What it does | +|--------|----------|-------------| +| `watchdog-supervisor.sh` | Every minute | Starts `hydra-supervisor.sh` if the daemon is not running | +| `reconcile.sh` | Every 10 min | Label validation + auto-recovery sweep | +| `cron-audit.sh` | Every 30 min | Full codebase audits on `ready-for-audit` issues | +| `cron-spec-from-issue.sh` | Every 10 min | Converts `needs-spec` issues into OpenSpec changes | +| `cron-update-prs.sh` | Every hour | Merges `development` into open `feature/{issue-number}/*` PRs that are `BEHIND` | + +`scripts/cron-hydra.sh` is retained for manual dispatch runs but is **not** scheduled — the supervisor replaces it. + +## Logs + +| File | Content | +|------|---------| +| `logs/cron.log` | Build pipeline dispatch log | +| `logs/review-cron.log` | Review dispatch log | +| `logs/pipeline-{issue}-{timestamp}/` | Per-pipeline stage logs (builder, quality, reviewer, etc.) | +| `logs/reviews/{repo}-{pr}-{type}-{timestamp}.jsonl` | Per-review JSONL output | + +## Trigger Label Configuration + +The label that triggers the build pipeline is configurable via the `HYDRA_TRIGGER_LABEL` environment variable. Default: `ready-to-build`. 
+ +Set it in `secrets/.env`: +```bash +HYDRA_TRIGGER_LABEL=wilco-testing +``` + +Or pass it as an environment variable when running scripts directly: +```bash +HYDRA_TRIGGER_LABEL=my-custom-label ./scripts/orchestrate.sh --poll --repo-url https://github.com/ConductionNL/myapp +``` + +Or use the `--trigger-label` flag with `orchestrate.sh`: +```bash +./scripts/orchestrate.sh --issue-url ... --repo-url ... --trigger-label my-custom-label +``` + +This is useful for: +- **Testing**: Use a separate label (e.g. `wilco-testing`) to avoid interfering with production pipelines +- **Multi-environment isolation**: Run separate Hydra instances that each watch for different labels +- **Gradual rollout**: Only process issues you explicitly opt in + +## Build cron (`cron-hydra.sh`) + +- Searches all open issues with the trigger label (default: `ready-to-build`) across the org +- Dispatches pipelines in background (detached), up to 5 parallel slots +- Each slot gets an isolated NC port (8086-8090) and container namespace +- Exits immediately β€” pipelines continue in background +- Lock files in `/tmp/hydra-slots/` prevent double-dispatch + +## Review dispatch (supervisor) + +Review dispatch is handled continuously by `hydra-supervisor.sh`, not a cron entry. + +- Supervisor scans open PRs for review labels across the org each cycle +- Uses the shared slot pool (max 5 concurrent with builders) +- Labels are on **PRs** (not issues) β€” this is how reviews work standalone +- After a reviewer posts its verdict, the label is removed +- Works independently from the build pipeline β€” any repo in the org can use it + +### Standalone review usage + +For a one-off manual review on a specific PR, use `scripts/manual-review.sh`: + +```bash +./scripts/manual-review.sh \ + --pr-url https://github.com/ConductionNL/myapp/pull/42 \ + --review-type code +``` + +Results appear as PR comments. 
+ +## Verify cron is running + +```bash +# Check crontab +crontab -l + +# Check supervisor is alive +pgrep -f hydra-supervisor.sh + +# Check recent logs +tail -20 /path/to/hydra/logs/supervisor.log +tail -20 /path/to/hydra/logs/watchdog.log +tail -20 /path/to/hydra/logs/reconcile.log + +# Check active slots +ls /tmp/hydra-slots/slot-*.lock 2>/dev/null +``` + +## Troubleshooting + +| Symptom | Fix | +|---------|-----| +| Cron not running | Check `crontab -l`, verify PATH in script header | +| `No PRs with review labels found` | Expected when no reviews are pending | +| Review label not removed | Check `logs/reviews/` for container errors | +| Stale slot lock | `rm /tmp/hydra-slots/slot-N.lock` (cron auto-detects dead PIDs) | diff --git a/docs/hydra/operations/dev-run.md b/docs/hydra/operations/dev-run.md new file mode 100644 index 0000000..41a8240 --- /dev/null +++ b/docs/hydra/operations/dev-run.md @@ -0,0 +1,75 @@ +# Running Locally + +## Full pipeline β€” `/local-run` + +From Claude Code, run the entire pipeline (builder β†’ reviewer + security) in one command: + +``` +/local-run --spec-dir /tmp/todo-mvp-spec --repo-url https://github.com/algorithm-conduction/todo-app +``` + +This creates a fresh GitHub repo, uploads the openspec, builds all images, runs all three +stages, and prints a summary with costs. Previous repos are renamed with a timestamp +(museum pattern). + +A standalone shell script (`scripts/smoke-test.sh`) is also available for use outside +Claude Code or in CI. 
+ +## Single stage β€” `dev-run.sh` + +Run one container directly, bypassing orchestration: + +```bash +# Builder +./scripts/dev-run.sh builder --repo-url [--issue-url ] + +# Reviewer +./scripts/dev-run.sh - reviewer --repo-url --pr-url + +# Security +./scripts/dev-run.sh - security --repo-url --pr-url +``` + +## Container flags + +| Flag | Purpose | +|---|---| +| `--rm` | Remove container after exit | +| `--tmpfs /tmp:size=512M` | Ephemeral temp storage | +| `--tmpfs /workspace:size=2G` | Ephemeral workspace (default) | +| `--cap-drop ALL` | Drop all Linux capabilities | +| `--cap-add SETUID,SETGID,DAC_OVERRIDE` | Required for `gosu` user switching | +| `--cpus 4 --memory 8g` | Resource limits | +| `--network hydra-net` | Isolated bridge network | + +**Note:** `NET_ADMIN` is intentionally omitted. Internal iptables doesn't work reliably +in rootless containers (conntrack in user namespaces). Egress is controlled by the +container network in local dev and by NetworkPolicies in Kubernetes. + +## Trigger label + +The pipeline trigger label defaults to `ready-to-build` but can be overridden. Set `HYDRA_TRIGGER_LABEL` in your environment or `secrets/.env`: + +```bash +# Via env var +HYDRA_TRIGGER_LABEL=wilco-testing ./scripts/dev-run.sh builder --repo-url + +# Via secrets/.env (loaded automatically by orchestrate.sh) +echo 'HYDRA_TRIGGER_LABEL=wilco-testing' >> secrets/.env +``` + +When using `orchestrate.sh` directly, you can also pass `--trigger-label`: +```bash +./scripts/orchestrate.sh --issue-url ... --repo-url ... --trigger-label wilco-testing +``` + +## Persistent workspace (debugging) + +By default, `/workspace` is a tmpfs β€” everything is lost when the container exits. To keep +the workspace for inspection: + +```bash +export HYDRA_WORKSPACE_DIR=/tmp/hydra-workspace +./scripts/dev-run.sh ... 
+# After exit: ls /tmp/hydra-workspace/repo/ +``` diff --git a/docs/hydra/operations/entrypoints.md b/docs/hydra/operations/entrypoints.md new file mode 100644 index 0000000..57319e6 --- /dev/null +++ b/docs/hydra/operations/entrypoints.md @@ -0,0 +1,48 @@ +# Container Entrypoint Flow + +Each container runs an `entrypoint.sh` that sets up the environment before handing off to Claude Code. + +## Builder (`images/builder/entrypoint.sh`) + +1. Validate env vars: `GIT_TOKEN`, `REPO_URL`, `ISSUE_URL`, `SPEC_PATH` +2. Attempt iptables egress allowlist (skipped in rootless mode β€” see [networking.md](networking.md)) +3. Fix `~/.claude` ownership for session data writes +4. Pre-flight check: verify `GIT_TOKEN` has push access to `REPO_URL` via GitHub API +5. Configure git auth (url.insteadOf + token exports) +6. Build prompt with spec path, repo URL, issue URL +7. Exec Claude Code as `claude` user (via `gosu` or direct exec in rootless) + +**Max turns:** 80. **Tools:** Read, Write, Edit, Bash, Glob, Grep. + +## Reviewer (`images/reviewer/entrypoint.sh`) + +1. Validate env vars: `GIT_TOKEN`, `REPO_URL`, `PR_URL` +2. Attempt iptables egress allowlist +3. Fix `~/.claude` ownership +4. Configure git auth +5. Build prompt with PR URL and repo URL +6. Exec Claude Code β€” read-only (no Write/Edit tools) + +**Max turns:** 25. **Tools:** Read, Bash, Grep, Glob. + +## Security Reviewer (`images/security/entrypoint.sh`) + +1. Validate env vars: `GIT_TOKEN`, `REPO_URL`, `PR_URL` +2. Attempt iptables egress allowlist +3. Fix `~/.claude` ownership +4. **Clone the PR branch** into `/workspace/repo` +5. **Run SAST tools before Claude starts:** + - Semgrep (`p/security-audit`, `p/secrets`, `p/owasp-top-ten`) β†’ `/tmp/semgrep-results.json` + - Gitleaks β†’ `/tmp/gitleaks-results.json` + - Trivy (conditional: Dockerfile β†’ `fs` scan, Helm/K8s β†’ `config` scan) β†’ `/tmp/trivy-results.json` +6. Configure git auth +7. Build prompt referencing the pre-computed SAST results +8. 
Exec Claude Code — read-only + +**Max turns:** 20. **Tools:** Read, Bash. + +## `gosu` fallback + +In real-root environments (K8s, rootful Docker/Podman), `gosu` drops privileges to the +`claude` user. In rootless mode, container "root" is already the host user — `gosu` setuid +fails, so the entrypoint falls back to `exec` directly. diff --git a/docs/hydra/operations/logging.md b/docs/hydra/operations/logging.md new file mode 100644 index 0000000..80f964e --- /dev/null +++ b/docs/hydra/operations/logging.md @@ -0,0 +1,49 @@ +# Logging + +## Log files + +Every `dev-run.sh` run writes a log to `logs/{stage}-{timestamp}.jsonl`: + +``` +logs/builder-20260402T133500Z.jsonl +logs/reviewer-20260402T140000Z.jsonl +logs/security-20260402T140000Z.jsonl +``` + +Timestamps are UTC. The `logs/` directory is gitignored. + +## Log format + +Output is JSONL (one JSON object per line, flushed immediately). This is produced by +Claude Code's `--output-format stream-json` flag. Key event types: + +| `type` | Description | +|---|---| +| `system` | Init event — session ID, available tools, model | +| `assistant` | Agent response — text or tool use | +| `user` | Tool results returned to the agent | +| `result` | Final summary — `is_error`, `result`, `total_cost_usd`, `num_turns` | + +## Parsing logs + +Extract the final result from a log file: + +```bash +python3 -c " +import json +for line in open('logs/builder-20260402T133500Z.jsonl'): + try: + d = json.loads(line) + if d.get('type') == 'result': + print('error:', d.get('is_error')) + print('turns:', d.get('num_turns')) + print('cost: $', d.get('total_cost_usd')) + print(d.get('result', '')[:500]) + except: pass +" +``` + +## GitHub Actions / Kubernetes + +In GitHub Actions, logs go to the workflow run output (stdout). In Kubernetes, logs are +captured by the container runtime and accessible via `kubectl logs`. 
diff --git a/docs/hydra/operations/networking.md b/docs/hydra/operations/networking.md new file mode 100644 index 0000000..b95ceac --- /dev/null +++ b/docs/hydra/operations/networking.md @@ -0,0 +1,24 @@ +# Networking + +## Local + +> Both Podman and Docker are supported. Examples below use `podman`; substitute `docker` +> if that's your runtime. + +Containers run on `hydra-net`, a bridge network with DNS enabled. Outbound internet +works via the host's NAT. No egress filtering in local dev (iptables skipped). + +```bash +# Create once +podman network create hydra-net +# or: docker network create hydra-net +``` + +## Kubernetes + +Egress is controlled by `manifests/network-policy.yaml` — allows only: +- DNS (UDP/TCP 53) +- HTTPS (443) to `api.anthropic.com`, `github.com`, `api.github.com` + +For FQDN-level enforcement (not just IP), use Cilium `CiliumNetworkPolicy` or Calico +`GlobalNetworkPolicy`. diff --git a/docs/hydra/operations/secrets.md b/docs/hydra/operations/secrets.md new file mode 100644 index 0000000..e69eaba --- /dev/null +++ b/docs/hydra/operations/secrets.md @@ -0,0 +1,127 @@ +# Secret Management + +## File Layout + +``` +secrets/ +├── .env ← Git PATs + legacy vars + non-secret config (gitignored) +├── .env.example ← Template with descriptions (committed) +├── credentials.json ← Claude OAuth tokens + Git token mapping (gitignored) +├── credentials.example.json ← Template with descriptions (committed) +├── claude-credentials.json ← OPTIONAL: standalone OAuth file used only by browser tests (gitignored) +└── ... +``` + +All non-template files in `secrets/` are gitignored. Copy `secrets/credentials.example.json` to `secrets/credentials.json` and `secrets/.env.example` to `secrets/.env`, then fill in values. + +## Credentials Configuration + +### `secrets/credentials.json` + +Central credential config loaded by `scripts/lib/credentials.sh`. 
Supports: + +- **Multiple Claude OAuth accounts** in priority order (work → personal fallback) +- **Per-container Git PATs** (builder, reviewer, security can use different GitHub accounts) + +```json +{ + "claude_accounts": [ + { + "name": "work", + "token": "sk-ant-oat01-...", + "priority": 1 + }, + { + "name": "personal", + "token": "sk-ant-oat01-...", + "priority": 2 + } + ], + "git_tokens": { + "builder": "ghp_... — push access, create PRs", + "reviewer": "ghp_... — comment on PRs", + "security": "ghp_... — comment on PRs" + } +} +``` + +**Adding a new Claude account:** Add an entry to `claude_accounts`; entries with a lower `priority` number are tried first. Tokens are embedded inline (generate with `claude setup-token` while logged in to the relevant account). + +**Token fallback:** When a container hits a rate limit ("You've hit your limit"), `run_container_with_fallback()` automatically retries with the next token in priority order. + +### `secrets/claude-credentials.json` (optional, browser tests only) + +This optional file uses the standard Claude CLI credentials format: + +```json +{ + "claudeAiOauth": { + "accessToken": "sk-ant-oat01-...", + "expiresAt": "2030-01-01T00:00:00.000Z" + } +} +``` + +It is read **only by `scripts/run-browser-tests.sh`** (the host-side Playwright runner). If absent, the browser tester falls back to `~/.claude/.credentials.json`. The pipeline scripts (orchestrate, supervisor, dev-run) do **not** use this file — they read OAuth tokens from `credentials.json` directly. + +### `secrets/.env` + +GitHub PATs, non-secret config, and optional Claude auth fallbacks loaded as `--env-file` by `docker run`: + +```bash +# GitHub PATs — per container role +HYDRA_BUILDER_TOKEN=ghp_... # Push access to repos, create PRs +HYDRA_REVIEWER_TOKEN=ghp_... # Comment on PRs (can be same as builder) +HYDRA_SECURITY_TOKEN=ghp_... 
# Comment on PRs (can be same as builder) + +# Non-secret config (used by poll-board.sh, cron-hydra.sh) +GITHUB_ORG=ConductionNL +HYDRA_PROJECT_NUMBER=1 +``` + +When `secrets/credentials.json` is present its `git_tokens` block takes precedence for in-container auth. If `credentials.json` is absent, `scripts/lib/credentials.sh` returns an error β€” `.env` alone is not sufficient for the pipeline (only the browser test runner can work without it). + +## `.gitignore` Rules + +``` +.env / .env.* / secrets/ β€” environment files and all secrets +*.pem / *.key / *.crt β€” certificates and keys +logs/ β€” run logs (may contain token prefixes in errors) +``` + +## How Credentials Are Used + +``` +scripts/lib/credentials.sh + β”œβ”€β”€ load_credentials() β€” called once at script startup + β”‚ β”œβ”€β”€ reads secrets/credentials.json (or falls back to known paths) + β”‚ β”œβ”€β”€ populates CLAUDE_TOKENS[] array + β”‚ └── populates GIT_TOKEN_BUILDER/REVIEWER/SECURITY + β”‚ + β”œβ”€β”€ get_claude_auth_env(index) β€” returns -e flag for container + β”œβ”€β”€ get_git_token(role) β€” returns PAT for builder/reviewer/security + β”‚ + └── run_container_with_fallback(log_file, cmd...) 
+ β”œβ”€β”€ tries token[0] (work account) + β”œβ”€β”€ if rate limited β†’ tries token[1] (personal) + β”œβ”€β”€ if rate limited β†’ tries token[2] (if configured) + └── all exhausted β†’ returns error +``` + +Sourced by: +- `scripts/orchestrate.sh` β€” builder, fix, fix-quality, fix-browser containers +- `scripts/hydra-supervisor.sh` β€” code review, security review, applier containers + +## Per-Deployment Injection + +| Variable | Local (`cron`) | GitHub Actions | Kubernetes | +|---|---|---|---| +| Claude OAuth | `secrets/credentials.json` | `secrets.CLAUDE_CODE_OAUTH_TOKEN` | K8s secret `hydra-claude-oauth` | +| GitHub PATs | `secrets/.env` | `secrets.HYDRA_*_TOKEN` | K8s secrets per agent | + +## Security Notes + +- OAuth tokens expire and need periodic refresh (`claude auth login`) +- Git PATs should use fine-grained tokens scoped to the org +- Never pass `ANTHROPIC_API_KEY` β€” always use Claude CLI with OAuth +- Container logs (JSONL) may contain token prefixes in error messages β€” `logs/` is gitignored diff --git a/docs/hydra/operations/troubleshooting.md b/docs/hydra/operations/troubleshooting.md new file mode 100644 index 0000000..12f7852 --- /dev/null +++ b/docs/hydra/operations/troubleshooting.md @@ -0,0 +1,64 @@ +# Troubleshooting + +| Symptom | Cause | Fix | +|---|---|---| +| `Not logged in Β· Please run /login` | `--bare` flag or missing credentials | Ensure entrypoint uses `--output-format stream-json`, not `--bare` | +| `OAuth token has expired` | Static `CLAUDE_CODE_OAUTH_TOKEN` in `.env` | Use credentials.json instead (auto-refresh), or run `claude setup-token` | +| `GIT_TOKEN does not have push access` | PAT lacks `contents:write` or no repo access | Regenerate PAT with correct scopes, verify repo collaborator access | +| `Failed to connect to github.com port 443` | Egress blocked (iptables in rootless) | Remove `--cap-add NET_ADMIN` from run flags (done in current dev-run.sh) | +| `EROFS: read-only file system` on `/spec/` | Spec mount is `:ro` 
by design | Expected — spec updates must happen outside the container | +| Container exits immediately | Missing required env var | Check entrypoint output for `is required` error messages | +| `Bind for 0.0.0.0:808x failed: port is already allocated` | Orphaned NC container from a crashed pipeline | `docker ps -aq --filter "name=hydra-nc" \| xargs -r docker rm -f`. The orchestrator now auto-cleans on startup (since v0.1.1). | +| Browser tests fail with `Nextcloud not reachable` | NC server didn't start or port conflict | Check `browser-nc-setup.log` for errors. Verify port with `ss -tlnp \| grep 808`. | + +--- + +## Cleanup + +### Temp directories + +Pipeline runs create temp dirs in `/tmp/hydra-*`. These are **not** automatically cleaned +on crash. To clean up: + +```bash +# Safe cleanup — keeps NC cache (saves ~1 min per quality run) +for d in /tmp/hydra-*; do + case "$d" in + /tmp/hydra-nc-cache|/tmp/hydra-slots) echo "KEEP: $d" ;; + *) rm -rf "$d" && echo "REMOVED: $d" ;; + esac +done + +# Some quality dirs are Docker-owned (root) — use Docker to remove +docker run --rm -v /tmp:/tmp alpine sh -c "rm -rf /tmp/hydra-quality-*" +``` + +### NC cache (`/tmp/hydra-nc-cache`) + +The NC cache stores a pre-downloaded Nextcloud server (~964 MB). `run-quality.sh` checks +for it before downloading. **Keep this directory** — without it, every quality run downloads +Nextcloud from scratch, adding ~60 seconds per pipeline. + +The cache is safe to delete if disk space is needed: +```bash +rm -rf /tmp/hydra-nc-cache +``` + +### Stale slot locks + +If a pipeline crashes, its slot lock (`/tmp/hydra-slots/slot-N.lock`) may persist. 
+`cron-hydra.sh` auto-detects stale locks (checks if the PID is alive), but you can +manually clear them: + +```bash +rm -f /tmp/hydra-slots/slot-*.lock +``` + +### Orphaned containers + +List and remove orphaned Hydra containers: + +```bash +docker ps -a --filter "name=hydra-nc" --format "{{.ID}} {{.Names}} {{.Status}}" +docker rm -f $(docker ps -aq --filter "name=hydra-nc") +``` diff --git a/docs/hydra/pipeline-overview.md b/docs/hydra/pipeline-overview.md new file mode 100644 index 0000000..2923f7b --- /dev/null +++ b/docs/hydra/pipeline-overview.md @@ -0,0 +1,289 @@ +# Pipeline Overview + +Hydra is a stateless, label-driven pipeline that builds code from OpenSpec specifications. All state lives on GitHub (labels, PRs, comments) β€” nothing in memory. If any step is interrupted (rate limit, crash, timeout), the next cron cycle picks up where it left off. + +Since `openspec/changes/no-loop-review-pipeline`, the pipeline is **single-shot**: no review re-runs, no fix-iteration loop. Every outcome is terminal β€” merge or `needs-input`. + +## State Machine + +The pipeline is driven entirely by GitHub issue labels. 
The supervisor polls every cycle and decides what to do: + +``` +β”Œβ”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β” +β”‚ β”‚ +β”‚ ready-to-build / build:queued β”‚ +β”‚ β”‚ β”‚ +β”‚ β–Ό β”‚ +β”‚ BUILD (Al Gorithm, Haiku) β”‚ +β”‚ implements spec β†’ creates PR β†’ runs quality + browser β”‚ +β”‚ fix-quality / fix-browser loops (pre-review) if checks red β”‚ +β”‚ β”‚ β”‚ +β”‚ β–Ό β”‚ +β”‚ build:pass β†’ code-review:queued (security NOT queued yet) β”‚ +β”‚ β”‚ +β”‚ CODE REVIEW (Juan Claude van Damme, Sonnet) β”‚ +β”‚ reads diff + ADRs β†’ applies bounded fixes in-container β†’ β”‚ +β”‚ posts inline [fixed:…]/[unfixed:…] comments + summary β†’ β”‚ +β”‚ commits + pushes β†’ sets code-review:pass/fail on issue β”‚ +β”‚ emits JSON verdict with fixes_applied[] + unfixed[] β”‚ +β”‚ β”‚ β”‚ +β”‚ β–Ό supervisor sees code-review verdict, queues security β”‚ +β”‚ β”‚ +β”‚ SECURITY REVIEW (Clyde Barcode, Sonnet) β”‚ +β”‚ same shape, plus Semgrep β€” reads POST-code-fix state β”‚ +β”‚ sets security-review:pass/fail on issue β”‚ +β”‚ β”‚ β”‚ +β”‚ β–Ό β”‚ +β”‚ supervisor sees both verdicts β†’ applier:queued β”‚ +β”‚ β”‚ +β”‚ APPLIER DECISION (orchestrate.sh) β”‚ +β”‚ if both reviews passed AND zero fixes applied: β”‚ +β”‚ β†’ done (skip Axel β€” saves tokens) β”‚ +β”‚ else: β”‚ +β”‚ re-run deterministic checks (PHPCS/ESLint/PHPUnit/...) 
β”‚ +β”‚ ↳ red β†’ needs-input (a fix broke something) β”‚ +β”‚ ↳ green β†’ dispatch Axel PliΓ©r β”‚ +β”‚ β”‚ +β”‚ APPLIER (Axel PliΓ©r, Sonnet, NO fix tools) β”‚ +β”‚ reads final diff + hydra.json fixes_applied/unfixed + β”‚ +β”‚ reviewer PR comments β”‚ +β”‚ emits {pass, blocking[]} verdict β”‚ +β”‚ ↳ applier:pass β†’ done (or yolo merge) β”‚ +β”‚ ↳ applier:fail β†’ needs-input β”‚ +β”‚ β”‚ +β”‚ MERGE (done label set) β”‚ +β”‚ PR undrafted, summary comment posted β”‚ +β”‚ if yolo: approve + merge + close issue β”‚ +β”‚ β”‚ +β””β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”˜ +``` + +Every failure path terminates in `needs-input`; there is no retry and no fix-iteration loop. + +## Labels + +| Label | On | Meaning | Set by | Removed by | +|-------|-----|---------|--------|------------| +| `ready-to-build` (configurable) | Issue | New spec, ready for initial build. Label name is configurable via `HYDRA_TRIGGER_LABEL` env var (default: `ready-to-build`). | Specter | Supervisor (start of build) | +| `build:queued` / `:running` / `:done` / `:failed` | Issue | Build stage states | Supervisor / Orchestrate | Supervisor | +| `code-review:queued` / `:running` / `:pass` / `:fail` | Issue | Code review states. Juan Claude may edit files. `:pass`/`:fail` triggers supervisor to queue security review. | Supervisor / Juan Claude | Supervisor (after applier queued) | +| `security-review:queued` / `:running` / `:pass` / `:fail` | Issue | Security review states. Queued only after code review completes. Clyde may edit files in PR mode. | Supervisor / Clyde Barcode | Supervisor (after applier queued) | +| `applier:queued` / `:running` / `:pass` / `:fail` | Issue | Final go/no-go stage. Orchestrate.sh decides whether to run Axel PliΓ©r or skip straight to `done`. `:pass` β†’ merge; `:fail` β†’ `needs-input`. 
| Supervisor / Orchestrate / Axel PliΓ©r | β€” (terminal) | +| `done` | Issue | All phases passed, ready for merge (or auto-merged on yolo) | Supervisor | Human (after merge) | +| `needs-input` | Issue | Escalated β€” needs human intervention (applier fail, reviewer max-out, post-review check failure) | Orchestrate / Supervisor | Human | +| `yolo` | Issue | Auto-approve and merge on `done` | Specter | Orchestrate (after merge) | +| `openspec` | Issue | Change driven by OpenSpec | Specter | β€” | +| `oversized` | Issue | Spec generation timed out β€” needs splitting | Specter | Human | +| `agent-maxed-out` | Issue/PR | Agent hit turn limit β€” output may be incomplete | Supervisor | Human | + +**Deprecated labels** (removed by no-loop-review-pipeline; `migrate-labels.sh` clears these off in-flight PRs): `pipeline-active`, `building`, `ready-for-code-review`, `ready-for-security-review`, `ready-for-review`, `fix:queued`, `fix:running`, `fix:done`, `fix-iteration:1`, `fix-iteration:2`, `build-retry:1`. + +## hydra.json β€” Pipeline Metadata Per Change + +Each spec change has a `hydra.json` in `openspec/changes/{name}/`. This is Hydra-specific metadata β€” not part of the OpenSpec standard. 
+ +```json +{ + "spec_slug": "accounts-payable-receivable", + "title": "Accounts Payable & Receivable", + "app": "shillinq", + "repo": "https://github.com/ConductionNL/shillinq", + "depends_on": ["core", "access-control-authorisation"], + "issue": "https://github.com/ConductionNL/shillinq/issues/49", + "pipeline": { + "code_review": { + "pass": true, + "fixes_applied": [ + { "file": "lib/Service/LedgerService.php", "line": 42, "rule": "PHPCS/PSR12", "note": "Added missing blank line after namespace" } + ], + "unfixed": [] + }, + "security_review": { + "pass": true, + "fixes_applied": [ + { "file": "lib/Db/InvoiceMapper.php", "line": 77, "rule": "CWE-89", "note": "Replaced string concatenation with prepared statement" } + ], + "unfixed": [] + }, + "applier": { + "ran": true, + "pass": true, + "blocking": [] + }, + "build_count": 1, + "findings_fixed": 2, + "findings_open": 0, + "suggestions_open": 1, + "total_cost_usd": 0.42, + "total_cost_eur": 0.39, + "total_turns": 87, + "last_code_review_pass": true, + "last_security_review_pass": true + } +} +``` + +The `pipeline` section is auto-maintained by `update_hydra_summary()` in `scripts/lib/review-results.sh`, which delegates to `scripts/lib/aggregate-hydra-summary.py`. It aggregates the latest `reviews/*.json` + `applier.json` + `builds/*.json`. Legacy fields `review_rounds` and `fix_iterations` are no longer written (no-loop pipeline has exactly one review round and no fix iterations). 
+ +| Field | Set by | Purpose | +|-------|--------|---------| +| `spec_slug` | Specter | Change identifier | +| `title` | Specter | Human-readable name | +| `app` | Specter | Target app | +| `repo` | Specter | GitHub repo URL | +| `depends_on` | Specter | Build order enforcement — list of spec slugs that must merge first | +| `issue` | Specter | GitHub issue URL — updated after issue creation | +| `pipeline` | Orchestrate | Running summary — updated after each build/review cycle | + +As noted above, the `pipeline` section is auto-maintained by `update_hydra_summary()` in `scripts/lib/review-results.sh`. It aggregates data from the latest `reviews/*.json`, `applier.json`, and `builds/*.json` files. + +### Related files per change + +``` +openspec/changes/{name}/ +├── proposal.md ← OpenSpec standard +├── design.md ← OpenSpec standard +├── tasks.md ← OpenSpec standard +├── specs/ ← OpenSpec standard (optional) +├── hydra.json ← Hydra pipeline metadata (pipeline.* aggregated summary) +├── reviews/ ← Hydra review results (single round in the no-loop pipeline) +│ └── 1.json ← code_review + security_review blocks with fixes_applied[] and unfixed[] +├── applier.json ← Axel Pliér verdict: {ran, pass, blocking[]} +├── builds/ ← Hydra build results per phase +│ └── build.json +└── pipeline-logs/ ← Gzipped raw JSONL transcripts for every phase (self-learning) + ├── build.jsonl.gz + ├── code-review-1.jsonl.gz + ├── security-review-1.jsonl.gz + └── applier.jsonl.gz +``` + +## Dependency Enforcement + +Before dispatching a build, the cron checks if each dependency's implementation issue is closed (meaning its code was merged to development). If any dependency is unmerged, the build is skipped — no tokens burned. 
+ +This enforces build order automatically: +- Layer 0: `core` (no deps) β€” builds first +- Layer 1: specs depending on core β€” build after core merges +- Layer 2: specs depending on layer 1 β€” and so on + +## Concurrency + +A shared slot pool (`/tmp/hydra-slots/`) limits all containers to **5 concurrent** across every type: + +| Container | Image | Purpose | +|-----------|-------|---------| +| Builder (build) | `hydra-builder` | Generate code from spec | +| Builder (fix-quality) | `hydra-builder` | Fix linting/test failures during the build phase (pre-review) | +| Builder (fix-browser) | `hydra-builder` | Fix browser test failures during the build phase (pre-review) | +| Code Reviewer (Juan Claude van Damme) | `hydra-reviewer` | Review PR for code quality + apply bounded in-container fixes | +| Security Reviewer (Clyde Barcode) | `hydra-security` | Review PR for security issues + apply bounded in-container fixes (PR mode) | +| Applier (Axel PliΓ©r) | `hydra-applier` | Binary go/no-go gate β€” NO fix authority | + +The post-review `Builder (fix)` container was removed by `openspec/changes/no-loop-review-pipeline` β€” reviewers now own fix authority. + +Builds use Opus (better quality, separate rate limit pool on Max). Reviews and fixes use Sonnet. Quality checks (PHPCS, Psalm, PHPStan, ESLint) and browser tests (Playwright) run as host processes β€” no Claude tokens, no slots consumed. + +`cron-hydra.sh` (legacy, manual) and the supervisor (continuous daemon) share the same pool. If all slots are busy, new work is deferred to the next cycle. + +## Self-Healing + +The pipeline is designed to recover from interruptions automatically: + +- **Rate limits**: `run_container_with_fallback()` tries the work token first, falls back to personal. Only checks the final result line β€” mid-session rate limit warnings from Claude CLI don't trigger false fallbacks. +- **Crash recovery**: All state is on GitHub labels. 
If a container crashes, the next cycle reads the issue's stage labels (`build:*`, `code-review:*`, `security-review:*`, `applier:*` — the old `pipeline-active` label is deprecated), finds the PR, checks verdicts, and picks up from the right stage. +- **Auth failures**: Token failures are treated like rate limits — fall back to the next token. +- **Review dedup**: Round tracking prevents re-reviewing the same round. Attempt cap (3 max) prevents infinite retries. +- **Build dedup**: The build-stage label (`build:*`, which replaced the deprecated `building` label) prevents concurrent dispatch. Set BEFORE dispatch (not after). + +**What can still require manual intervention:** +- `needs-input` label — set on applier fail, reviewer max-out, post-review check failure, or when all tokens fail +- `agent-maxed-out` label — set when an agent hits its turn limit (incomplete output) +- `oversized` label — set when spec generation times out + +## Authentication + +All credentials in `secrets/credentials.json` (single source of truth): + +```json +{ + "claude_accounts": [ + { "name": "work", "token": "sk-ant-...", "priority": 1 }, + { "name": "personal", "token": "sk-ant-...", "priority": 2 } + ], + "git_tokens": { + "builder": "gho_...", + "reviewer": "gho_...", + "security": "gho_..." + } +} +``` + +- Up to 3 Claude Max accounts in priority order +- Per-container Git PATs (builder, reviewer, security can use different GitHub accounts) +- `run_container_with_fallback()` handles token rotation on failure + +## Review Reliability + +- Reviews are capped at **3 attempts** per PR. After 3 failures, labels are removed and `needs-input` is added. +- Labels are only removed on **successful** completion. Failed reviews keep their labels for retry. +- Review verdicts are posted as JSON code blocks: `{ "pass": true/false, "blocking": [...] }` + +## Relationship to Specter + +Specter produces specs and pushes them to target repositories: + +1. Specter generates spec artifacts (proposal.md, design.md, tasks.md, hydra.json) +2. Pushes to a `spec/{slug}` branch, merges to `development` +3. Creates a GitHub issue with the trigger label + `openspec` + `yolo` labels +4. 
Hydra cron discovers the issue and starts the pipeline + +## Codebase Audit + +Full codebase security and quality scanning, independent of PRs. Creates GitHub issues with checkbox findings. + +**Trigger:** Create an issue with label `ready-for-audit` on any repo in the org. + +**Flow:** +1. `cron-audit.sh` discovers issues with `ready-for-audit` label +2. Runs code reviewer + security reviewer in audit mode (`AUDIT_MODE=full`) +3. Agents scan the entire codebase (not just PR diffs) +4. Agents output structured findings JSON +5. `audit-results.sh` creates/updates GitHub issues with checkboxes per finding +6. Trigger issue closed, labelled `audit-complete` + +**Output:** Two issues per audit (code quality + security), each with: +- CRITICAL findings: must fix (checkbox) +- WARNING findings: should fix (checkbox) +- SUGGESTION findings: may fix (checkbox) + +Subsequent audits update existing issues β€” no duplicates. + +## Issue β†’ Spec Flow (Reverse Pipeline) + +Generate specs from existing issues, enabling: Issue β†’ Spec β†’ Build β†’ Review β†’ Merge. + +**Trigger:** Label any issue with `needs-spec`. + +**Flow:** +1. `cron-spec-from-issue.sh` discovers issues with `needs-spec` label +2. Clones the repo, runs Claude CLI with `/opsx-new` + `/opsx-ff` +3. Generates proposal.md, design.md, tasks.md from the issue description +4. Writes `hydra.json` linking back to the original issue (`"source": "issue"`) +5. Commits to development, relabels issue with the trigger label + `openspec` + `yolo` +6. 
Normal build pipeline takes over — the original issue tracks completion + +**Use cases:** +- Bug reports that need implementation specs +- Feature requests from users +- Audit findings that need code changes +- Manual observations that should become specs + +## Cron Schedule + +| Script | Pipeline | Interval | Purpose | +|--------|----------|----------|---------| +| `cron-hydra.sh` | Build pipeline | Every 5 min | Discover + dispatch builds, resume active pipelines | +| `hydra-supervisor.sh` | Review pipeline | Continuous daemon | Run code + security reviews on labelled PRs | +| `cron-audit.sh` | Codebase audit | Every 30 min | Full codebase scan on repos with `ready-for-audit` | +| `cron-spec-from-issue.sh` | Issue→Spec | Every 10 min | Generate specs from issues with `needs-spec` | diff --git a/docs/hydra/retrospectives/decidesk-44-45-phase-g.md b/docs/hydra/retrospectives/decidesk-44-45-phase-g.md new file mode 100644 index 0000000..4fa0440 --- /dev/null +++ b/docs/hydra/retrospectives/decidesk-44-45-phase-g.md @@ -0,0 +1,238 @@ +# Retrospective — decidesk#44/#45 and the Phase G rollout + +**Date:** 2026-04-23 +**Subjects:** decidesk#44 (Minutes and Decisions — Core T3), decidesk#45 (Minutes and Decisions — Other T1) +**Outcome:** Phase G (scope-to-diff) landed + four related pipeline bugs fixed as validation surfaced them. + +--- + +## Why this retrospective exists + +Two decidesk issues — #44 and #45 — were stuck for multiple days. Each cycle they went through the pipeline, each cycle they escalated to `needs-input`. The surface cause was always "reviewer/applier fail". The underlying causes turned out to be a stack of five distinct bugs, each hidden behind the previous one. Every fix revealed the next. + +This doc captures the whole stack, in the order we found it, because the failure modes are likely to recur in other forms: + +1. Supervisor comment-spam on terminal-state issues (infrastructure) +2. 
Gates scan the whole repo instead of the PR diff β€” Phase G (design) +3. Applier `/workspace` root-owned, claude user can't write (container build) +4. `secrets/.env` documented but never actually sourced (doc-code drift) +5. Bounded-fix scope defined by line count fails for common auth patterns (policy) + +The larger lesson: **the no-loop pipeline is very unforgiving of infrastructure gaps**. There's no retry budget to absorb a broken prefetch or a silently-failing chown. One missed `chown` in a Dockerfile looks identical from the outside to a sophisticated applier judgment call ("fail, 0 findings"). + +--- + +## How the pipeline is supposed to work + +For context, the intended flow for a yolo-labelled spec issue: + +``` +trigger label (ready-to-build) + β†’ Supervisor picks up, assigns slot 1-5 + β†’ Builder (Al Gorithm, Haiku) + Β· reads spec, writes code, opens draft PR + Β· Rule 0b wrapper runs composer check:strict + phpunit + 8 hydra-gates + Β· commits + pushes + β†’ Code Review (Juan Claude van Damme, Sonnet) + Β· re-runs hydra-gates + Β· applies bounded fixes, emits verdict JSON + β†’ Security Review (Clyde Barcode, Sonnet) + Β· runs Semgrep + composer/npm audit + gitleaks + manual OWASP + Β· applies bounded security fixes, emits verdict JSON + β†’ Either reviewer fail β†’ needs-input (applier skipped) + β†’ Both reviewers pass + β‰₯1 fix applied β†’ Applier (Axel PliΓ©r) + Β· reads final state, emits binary pass/fail + Β· pass β†’ yolo merge + Β· fail β†’ needs-input + β†’ Both reviewers pass + zero fixes β†’ ready to merge (skip applier) +``` + +No iteration at any stage. Every terminal outcome is either merge or `needs-input`. The only levers are human-applied `retry:queued` (single-shot fixer pass) or `rebuild:queued` (hard reset). + +--- + +## Timeline + +### Before this session + +decidesk#44 and #45 had been through **8+ pipeline cycles** each over 2026-04-19 to 2026-04-22. Both kept landing on `needs-input`. 
The surface verdicts varied β€” sometimes code-review:fail, sometimes security-review:fail, sometimes applier:fail β€” but the underlying findings were the same small set repeating. + +Earlier sessions had landed Phases A through F of the "no-Claude-git" rollout β€” moving every git/gh operation out of Claude's turn into the entrypoint so the agents never need write credentials. Phase F specifically pre-installed composer + npm + merged development in the builder entrypoint to remove ~10 turns of build-env setup. + +Phase G was the proposed next step: scope mechanical checks to the PR diff so inherited debt in unchanged files doesn't block new PRs. + +### Session start + +First decidesk status check showed: + +``` +[supervisor] handle_completions: review fail on decidesk#45 β€” escalating to needs-input +[supervisor] handle_completions: review fail on decidesk#44 β€” escalating to needs-input +... (same two lines every ~40s) +``` + +Two supervisor processes running in parallel β€” both spamming the issues with fresh `**Pipeline escalation**` comments every cycle. Since yesterday: **134 comments on #44, 99 on #45**. Label-set was idempotent but comment-post wasn't, and the completion-handler had no terminal-state guard. + +This was bug #1: **supervisor spam** ([ADR in CLAUDE.md "Terminal-state guards"](../../CLAUDE.md#terminal-state-guards)). Both supervisors killed, fix bundled into Phase G's PR. + +### Phase G implementation + +The idea: the 8 mechanical hydra-gates (`scripts/run-hydra-gates.sh`) iterate `lib/**/*.php` repo-wide. When a PR edits some files and leaves others untouched, pre-existing debt in the untouched files still shows up as FAIL. decidesk#44 and #45 were both being blocked by 2 findings in `lib/Controller/SettingsController.php` β€” a file neither PR touched. The reviewer could see the failures but couldn't bound-fix them (out of diff scope). Result: every cycle re-bounced on the same debt. 
+ +Phase G added `--scope-to-diff [--base BRANCH]` to `run-hydra-gates.sh`: +- Derive `CHANGED_FILES = git diff --name-only --diff-filter=ACMR BASE...HEAD` once +- Every gate's file loop filters through `_in_scope "$f" || continue` +- Gate 4 (composer-audit) skips entirely unless `composer.json`/`composer.lock` is in diff +- Gate 6 (orphan-auth) scopes the *defining* file but keeps caller grep repo-wide + +All four pipeline positions that invoke gates use the new flag (builder Rule 0b + reviewer pre/post-flight + security pre/post-flight). The applier doesn't invoke gates — it consumes verdicts. + +Smoke test on PR#131 (feature/47 branch): +- Full-repo scan: 2 FAIL (`SettingsController.php`, unchanged) +- `--scope-to-diff --base origin/development`: **ALL 8 GATES GREEN** on the 19 changed files + +Shipped as [PR#133](https://github.com/ConductionNL/hydra/pull/133) + [ADR-020](../../openspec/architecture/adr-020-gate-scope-to-pr-diff.md). + +### First post-Phase-G validation + +Applied `retry:queued` to #44 and #45 (stripped `needs-input` + `code-review:fail` + `security-review:fail`, set `code-review:queued`). Rebuilt all 5 images locally (nextcloud-test + builder + reviewer + security + applier), set `HYDRA_IMAGE_PREFIX=localhost/hydra` + `HYDRA_IMAGE_TAG=test` in `secrets/.env`, restarted supervisor. + +First observation: supervisor dispatched reviewers on **`ghcr.io/conductionnl/hydra-reviewer:latest`** — not the local images. My `.env` edit had no effect. + +**Bug #4: `secrets/.env` was documented but not sourced.** `hydra-supervisor.sh` and `orchestrate.sh` both have `# Reads: secrets/.env` in their header comments, but neither actually sourced the file. HYDRA_IMAGE_PREFIX / HYDRA_IMAGE_TAG overrides were silently ignored unless exported in the parent shell first. + +Fixed by adding `set -a; . secrets/.env; set +a` at the top of both scripts. Shipped as [PR#134](https://github.com/ConductionNL/hydra/pull/134). 
+ +### Phase G validation β€” first pass + +After restart on local images: + +``` +[INFO] Pre-flight hydra-gates: 0 failing gate(s) # on decidesk#44 +[INFO] Pre-flight hydra-gates: 0 failing gate(s) # on decidesk#45 +``` + +Both issues cleared code review. The `SettingsController` debt that had been blocking them for days was now correctly out of scope. Phase G was working. + +Then: +- #44 β†’ `code-review:pass` β†’ `security-review:pass` β†’ `applier:running` β†’ **`applier:fail`** +- #45 β†’ `code-review:pass` β†’ **`security-review:fail`** + +Two different failure modes. The applier was supposed to be the backstop; what happened? + +### The applier 0-turn bug + +Applier verdict JSON for #44: + +```json +{"ran": true, "pass": null, "blocking": [], "turns": 0, "cost_usd": 0.0} +``` + +Zero turns. Claude never completed a message. Checking the transcript: + +``` +[INFO] Config loaded: container=hydra-applier max_turns=20 tools=Read,Bash +[prefetch] Fetching PR #127 context from ConductionNL/decidesk... +/usr/local/lib/hydra/entrypoint-common.sh: line 426: + /workspace/pr-context/diff.patch: No such file or directory +``` + +`mkdir -p /workspace/pr-context 2>/dev/null || true` and all the subsequent `>` redirects silently failed. No PR diff, no inline comments, no summary comments, no `/workspace/claude-output.jsonl` sink for Claude's output. + +Reason: `/workspace` was `root:root 0755` in the applier image. Applier runs as `claude:claude` (uid 1000) with a minimum cap set β€” no `DAC_OVERRIDE`, no `SETUID`. The other personas (builder / reviewer / security) run with those caps and chown `/workspace` in the entrypoint; the applier can't chown at runtime and has no privilege to bypass the read-only ownership. 
+ +**Bug #3: applier `/workspace` not pre-chowned at image build time.** + +Smoke test confirmed: +``` +$ docker run --rm --user claude:claude --entrypoint sh localhost/hydra-applier:test \ + -c 'touch /workspace/.test && mkdir -p /workspace/pr-context' +touch: cannot touch '/workspace/.test': Permission denied +``` + +Fix (in the Dockerfile, not the entrypoint, because the applier CANNOT chown at runtime): + +```dockerfile +RUN mkdir -p /workspace && chown claude:claude /workspace && chmod 0775 /workspace +``` + +This is the [container capability profiles section of ADR-013](../../openspec/architecture/adr-013-container-pool.md#container-capability-profiles) β€” **minimum-cap personas must be pre-chowned at image build**. Shipped as [PR#135](https://github.com/ConductionNL/hydra/pull/135). + +### The security-review:fail on #45 + +Different story. Clyde's verdict on #45: + +- βœ… composer audit / npm audit / gitleaks / semgrep / all 8 hydra gates β€” clean +- ❌ Manual OWASP review: 2 WARNING findings + 1 SUGGESTION +- Verdict: FAIL (WARNINGs unfixed) + +The WARNINGs: `authorizeAssignment()` and `authorizeReminder()` in `DecisionApprovalService` accept `$uid` but never use it. Any authenticated user could trigger them on any decision. + +The fix was obvious from reading the file: `transitionLifecycle()` five methods above had the exact pattern β€” `checkUserRole($uid, ['chair','secretary'])` in a try/catch, converting to `OCSForbiddenException`. Mirror that. Maybe 5–7 lines. + +Clyde had declined across **eight review cycles** between 2026-04-21 and 2026-04-23, each time citing "exceeds 3-line bounded fix scope" or "architectural decision needed". Clyde was following the rule honestly β€” the rule said fixes must be "1–3 lines in one file" and this was 5–10. 
+ +**Bug #5: bounded-fix scope was defined by line count, which is wrong-shaped for common auth patterns.** + +Meanwhile β€” a sub-observation about the **builder**: the builder in earlier retry cycles had been playing whack-a-mole with gate-7 (no-admin-IDOR). Gate-7 checks that `@NoAdminRequired` methods have an in-body auth guard (regex for `->authorize*` / `->require*` / `->ensure*` calls). When the gate fired on `assignReviewer`, the builder added `$this->approvalService->authorizeAssignment()` in the controller body AND created the empty stub method on the service to make that compile. Gate-7 passed (call exists). Clyde's semantic review caught that the method was a stub. Builder didn't have to implement real auth to pass the gate β€” only to call a method with the right name. + +Fix (this retrospective's ADR-021): **scope by change shape, not line count.** The new rule: + +> If a sibling method in the same class demonstrates the fix, the "architectural decision" escape hatch does NOT apply. + +`transitionLifecycle` exists β†’ `authorizeAssignment` must mirror it β†’ mechanical β†’ Clyde fixes. + +Shipped as [PR#136](https://github.com/ConductionNL/hydra/pull/136) + [ADR-021](../../openspec/architecture/adr-021-bounded-fix-scope-by-shape.md). + +--- + +## The builder's "minimum to clear the gate" anti-pattern + +The decidesk#45 story exposed a subtler failure mode I want to call out separately because it's likely to recur. + +When the builder is in `HYDRA_MODE=fix` (retry cycle), it reads `feedback.md` β€” a digest of the previous cycle's findings. For findings of the form "gate-X failed on method Y", the builder's cheapest path is: + +1. Parse the gate's failure message: "missing auth check in `assignReviewer`" +2. Add the minimum code that clears the gate: `$this->approvalService->authorizeAssignment()` on line Y +3. Create an empty stub method if the call target doesn't exist + +Gate-7's regex just looks for the call syntax. 
It doesn't verify the target method actually enforces authorization. So the empty stub clears the gate. Builder commits. Code-review:queued. Juan passes. Security:queued. Clyde reads the stub and correctly flags it as empty. + +**The anti-pattern**: deterministic gates that check form, not substance, let the builder minimize effort in a way that's visible only to semantic review. This is harmless if the reviewer catches it AND can fix it β€” but with the old bounded-fix rule, the reviewer couldn't fix it either ("architectural"). + +**The counter-measure** (not yet implemented, noted for future work): harden gate-7 to detect stub-shape auth methods β€” method body that accepts a `$uid` parameter but never references it. A rough grep: `function authorize(...$uid,...).*\{ *[^}]*\}` that doesn't contain `$uid`. This closes the minimum-to-clear path at the gate level. + +--- + +## What we shipped + +- [PR#133](https://github.com/ConductionNL/hydra/pull/133) β€” Phase G (`--scope-to-diff` + supervisor spam guard) +- [PR#134](https://github.com/ConductionNL/hydra/pull/134) β€” `secrets/.env` sourcing + banner glitch cosmetic +- [PR#135](https://github.com/ConductionNL/hydra/pull/135) β€” Applier `/workspace` Dockerfile chown +- [PR#136](https://github.com/ConductionNL/hydra/pull/136) β€” Bounded-fix scope by change shape + +Four PRs, all admin-merged to `development`. Three new rules formalised ([ADR-020](../../openspec/architecture/adr-020-gate-scope-to-pr-diff.md), [ADR-021](../../openspec/architecture/adr-021-bounded-fix-scope-by-shape.md), and the [Container capability profiles section of ADR-013](../../openspec/architecture/adr-013-container-pool.md#container-capability-profiles)). Two CLAUDE.md rules documented (config override contract, terminal-state guards). + +--- + +## Lessons + +1. **Silent error suppression in entrypoints masks ownership bugs.** `mkdir -p ... 
2>/dev/null || true` and `> file 2>/dev/null` patterns are meant to tolerate transient failures, but they also hide fundamental ownership issues that make the whole flow non-functional. A non-fatal error that happens on EVERY run is not transient β€” it's a bug. Prefer `mkdir -p ... 2>&1 | tee /tmp/mkdir.log; test -d ...` or explicit assertions for things that must exist. + +2. **Documented behaviour β‰  implemented behaviour.** Both `hydra-supervisor.sh` and `orchestrate.sh` claimed to read `secrets/.env` in their header comments, but neither did. Headers rot. If the behaviour matters, test for it β€” a `.env.example` with a recognisable sentinel that fails the build if the entrypoint doesn't surface it would have caught this instantly. + +3. **Deterministic gates that check form, not substance, can be mechanically satisfied.** Builder exploited gate-7's regex-based check to create stub methods. Gate design matters: gate what the system semantically needs (an auth method that uses the UID), not what it syntactically expects (a call at a certain line). + +4. **The no-loop policy has zero tolerance for infrastructure gaps.** In a loop-based pipeline you can retry through a transient mkdir failure. In a no-loop pipeline, one `root:root` directory turns into `applier:fail` and a human escalation. This is the right trade-off β€” it forces us to build rock-solid infrastructure β€” but it means every infrastructure edge case that would be papered over in other systems shows up as a hard verdict here. + +5. **"Architectural" is a category, not an escape hatch.** When a reviewer says a fix is "architectural", the real question is: *does this require a NEW decision, or does it apply an existing one?* If the pattern exists in the same file, there's no new decision. The old bounded-fix rule treated "5 lines" as architectural; the new rule treats "new concept" as architectural. Much better. + +6. **Retrospectives are cheap. 
Write them.** This session found and fixed five distinct pipeline bugs, each worth a paragraph of future-me context. Without writing it down, the next pipeline contributor relearns the whole stack the hard way. + +--- + +## Open follow-ups + +- Harden gate-7 against stub-shape auth methods (the builder's "minimum-to-clear" path) +- Phase G.1: scope `composer check:strict` / `phpunit` / `npm run lint` to the PR diff too. Currently still full-repo because `composer`/`phpunit` don't accept per-file args cleanly. Deferred; the reviewer's manual scope filter still works as safety net. +- Consider a "gate coverage test" skill that asserts each gate catches the patterns it claims to. Currently we find gate gaps only when a PR exploits one. diff --git a/website/i18n/nl/docusaurus-plugin-content-docs/current.json b/website/i18n/nl/docusaurus-plugin-content-docs/current.json index 2566422..4cc9edb 100644 --- a/website/i18n/nl/docusaurus-plugin-content-docs/current.json +++ b/website/i18n/nl/docusaurus-plugin-content-docs/current.json @@ -11,8 +11,44 @@ "message": "Ondersteuning & Veiligheid", "description": "The label for category Support & Safety in sidebar tutorialSidebar" }, + "sidebar.tutorialSidebar.category.Hydra": { + "message": "Hydra", + "description": "The label for category Hydra in sidebar tutorialSidebar" + }, + "sidebar.tutorialSidebar.category.Pipeline": { + "message": "Pipeline", + "description": "The label for category Pipeline in sidebar tutorialSidebar" + }, + "sidebar.tutorialSidebar.category.Operations": { + "message": "Operaties", + "description": "The label for category Operations in sidebar tutorialSidebar" + }, + "sidebar.tutorialSidebar.category.Retrospectives": { + "message": "Retrospectives", + "description": "The label for category Retrospectives in sidebar tutorialSidebar" + }, + "sidebar.tutorialSidebar.category.Using Claude": { + "message": "Claude gebruiken", + "description": "The label for category Using Claude in sidebar tutorialSidebar" + 
}, + "sidebar.tutorialSidebar.category.Setup": { + "message": "Installatie", + "description": "The label for category Setup in sidebar tutorialSidebar" + }, + "sidebar.tutorialSidebar.category.Workflow": { + "message": "Workflow", + "description": "The label for category Workflow in sidebar tutorialSidebar" + }, + "sidebar.tutorialSidebar.category.Writing": { + "message": "Schrijven", + "description": "The label for category Writing in sidebar tutorialSidebar" + }, + "sidebar.tutorialSidebar.category.Quality & testing": { + "message": "Kwaliteit & testen", + "description": "The label for category Quality & testing in sidebar tutorialSidebar" + }, "sidebar.tutorialSidebar.category.Claude workflow": { "message": "Claude-workflow", - "description": "The label for category Claude workflow in sidebar tutorialSidebar" + "description": "The label for category Claude workflow in sidebar tutorialSidebar (legacy, kept for safety)" } } diff --git a/website/sidebars.js b/website/sidebars.js index a713c19..619108c 100644 --- a/website/sidebars.js +++ b/website/sidebars.js @@ -41,12 +41,106 @@ const sidebars = { 'iso/privacy-policy', ], }, + + { + type: 'category', + label: 'Hydra', + collapsed: false, + link: { type: 'doc', id: 'hydra/README' }, + items: [ + { + type: 'category', + label: 'Pipeline', + collapsed: true, + items: [ + 'hydra/pipeline-overview', + 'hydra/agentic-workflow', + 'hydra/agent-configuration', + 'hydra/container-architecture', + 'hydra/deployment-models', + 'hydra/github-workflow', + 'hydra/context', + ], + }, + { + type: 'category', + label: 'Operations', + collapsed: true, + items: [{ type: 'autogenerated', dirName: 'hydra/operations' }], + }, + { + type: 'category', + label: 'Retrospectives', + collapsed: true, + items: [{ type: 'autogenerated', dirName: 'hydra/retrospectives' }], + }, + ], + }, + { type: 'category', - label: 'Claude workflow', - items: [{ type: 'autogenerated', dirName: 'claude' }], + label: 'Using Claude', + collapsed: false, + link: 
{ type: 'doc', id: 'claude/README' }, + items: [ + 'claude/getting-started', + 'claude/walkthrough', + 'claude/agents', + 'claude/rad-platform', + + { + type: 'category', + label: 'Setup', + collapsed: true, + items: [ + 'claude/workstation-setup', + 'claude/global-claude-settings', + 'claude/docker', + 'claude/playwright-setup', + 'claude/local-llm', + ], + }, + { + type: 'category', + label: 'Workflow', + collapsed: true, + items: [ + 'claude/workflow', + 'claude/app-lifecycle', + 'claude/commands', + 'claude/commands-openspec', + 'claude/commands-tender', + ], + }, + { + type: 'category', + label: 'Writing', + collapsed: true, + items: [ + 'claude/writing-specs', + 'claude/writing-adrs', + 'claude/writing-skills', + 'claude/writing-docs', + 'claude/skill-checklist', + 'claude/skill-patterns', + 'claude/skill-evals', + ], + }, + { + type: 'category', + label: 'Quality & testing', + collapsed: true, + items: [ + 'claude/testing', + 'claude/parallel-agents', + 'claude/frontend-standards', + ], + }, + + 'claude/retrofit', + ], }, - 'hydra/README', + 'WayOfWork/about-this-manual', ], };