diff --git a/README.md b/README.md index fceba6e..8ba5279 100644 --- a/README.md +++ b/README.md @@ -647,12 +647,15 @@ governance manifest refresh falls back to `scripts/verify_upstream_slices.py --m to regenerate the receipts instead of failing on missing files alone. Dashboard dependency installs now also carry an ENOSPC recovery branch that retries with a workspace-local pnpm store and the registered dashboard install -env knobs when copy-heavy CI or local maintenance installs run out of disk, and +env knobs when copy-heavy CI or local maintenance installs run out of disk; +that recovery path now also fails closed behind a registered minimum-headroom +threshold so low-disk hosts do not keep churning partial retry stores, and dashboard/desktop clean-room installs now retry bounded transient npm registry socket timeouts before they fail closed. Desktop dependency installs now mirror the same ENOSPC recovery strategy, including the registered desktop install env knobs that scope hardlink imports -to the recovery attempt and move retry stores onto workspace-local temp roots. +to the recovery attempt, gate workspace-local recovery on registered minimum +headroom, and move retry stores onto workspace-local temp roots. Docker-backed GitHub-hosted maintenance lanes now retry daemon prechecks with bounded backoff and registered retry knobs before failing closed on a transient socket refusal. diff --git a/apps/orchestrator/tests/test_runtime_log_path_guardrails.py b/apps/orchestrator/tests/test_runtime_log_path_guardrails.py index 4bba15c..0de6426 100644 --- a/apps/orchestrator/tests/test_runtime_log_path_guardrails.py +++ b/apps/orchestrator/tests/test_runtime_log_path_guardrails.py @@ -37,6 +37,26 @@ def test_install_logs_use_dedicated_runtime_subdir() -> None: assert 'INSTALL_LOG="$ROOT_DIR/.runtime-cache/logs/runtime/deps_install/install_desktop_deps.log"' in desktop +def test_install_scripts_fail_closed_on_low_headroom_workspace_recovery() -> None: + dash = _read("scripts/install_dashboard_deps.sh") + desktop = _read("scripts/install_desktop_deps.sh") + assert 'MIN_ENOSPC_RECOVERY_HEADROOM_GIB="${CORTEXPILOT_DASHBOARD_ENOSPC_MIN_HEADROOM_GIB:-3}"' in dash + assert 'MIN_ENOSPC_RECOVERY_HEADROOM_GIB="${CORTEXPILOT_DESKTOP_ENOSPC_MIN_HEADROOM_GIB:-3}"' in desktop + assert "workspace-local ENOSPC recovery requires at least" in dash + assert "workspace-local ENOSPC recovery requires at least" in desktop + + +def test_install_scripts_cleanup_failed_workspace_retry_store_on_exit() -> None: + dash = _read("scripts/install_dashboard_deps.sh") + desktop = _read("scripts/install_desktop_deps.sh") + assert "cleanup_active_workspace_retry_store" in dash + assert "cleanup_active_workspace_retry_store" in desktop + assert "WORKSPACE_RETRY_STORE_ACTIVE=1" in dash + assert "WORKSPACE_RETRY_STORE_ACTIVE=1" in desktop + assert "trap 'handle_exit' EXIT" in dash + assert "trap 'handle_exit' EXIT" in desktop + + def test_ci_perf_log_uses_dedicated_runtime_subdir() -> None: text = _read("scripts/lib/ci_step9_helpers.sh") assert 'PERF_API_LOG=".runtime-cache/logs/runtime/ci_perf/ci_perf_api.log"' in text diff --git a/configs/env.registry.json b/configs/env.registry.json index 13f17d7..492bd40 100644 --- a/configs/env.registry.json +++ b/configs/env.registry.json @@ -3318,6 +3318,18 @@ "scripts/install_dashboard_deps.sh" ] }, + { + "name": "CORTEXPILOT_DASHBOARD_ENOSPC_MIN_HEADROOM_GIB", + "scope": "frontend-build", + "secret": false, + "required": false, + "default": "3", + "owner": "platform", + "description": "Minimum free disk headroom, in GiB, required before `install_dashboard_deps.sh` is allowed to enter its workspace-local ENOSPC recovery branch.", + "consumers": [ + "scripts/install_dashboard_deps.sh" + ] + }, { "name": "CORTEXPILOT_DASHBOARD_PNPM_IMPORT_METHOD", "scope": "frontend-build", @@ -3379,6 +3391,18 @@ "scripts/install_desktop_deps.sh" ] }, + { + "name": "CORTEXPILOT_DESKTOP_ENOSPC_MIN_HEADROOM_GIB", + "scope": "frontend-build", + "secret": false, + "required": false, + "default": "3", + "owner": "platform", + "description": "Minimum free disk headroom, in GiB, required before `install_desktop_deps.sh` is allowed to enter its workspace-local ENOSPC recovery branch.", + "consumers": [ + "scripts/install_desktop_deps.sh" + ] + }, { "name": "CORTEXPILOT_DESKTOP_PNPM_IMPORT_METHOD", "scope": "frontend-build", diff --git a/docs/README.md b/docs/README.md index 066aa67..965b282 100644 --- a/docs/README.md +++ b/docs/README.md @@ -239,9 +239,9 @@ navigation set. dependency audit contract, sync this summary and the root entrypoints in the same patch; the current examples are `.runtime-cache/test_output/ci/` and `configs/pip_audit_ignored_advisories.json`, plus the dashboard and desktop - ENOSPC recovery knobs plus the Docker daemon precheck retry knobs registered - in `configs/env.registry.json`, together with the bounded transient npm - registry socket-timeout retries inside + ENOSPC recovery knobs, the minimum-headroom fail-fast thresholds, and the + Docker daemon precheck retry knobs registered in `configs/env.registry.json`, + together with the bounded transient npm registry socket-timeout retries inside `scripts/install_dashboard_deps.sh` / `scripts/install_desktop_deps.sh`; current CI contract changes also include the upstream receipt refresh fallback to `scripts/verify_upstream_slices.py --mode smoke` diff --git a/scripts/install_dashboard_deps.sh b/scripts/install_dashboard_deps.sh index cb6e7ef..35985d9 100644 --- a/scripts/install_dashboard_deps.sh +++ b/scripts/install_dashboard_deps.sh @@ -23,6 +23,8 @@ INSTALL_LOG="$ROOT_DIR/.runtime-cache/logs/runtime/deps_install/install_dashboar LOCK_DIR="${STATE_ROOT}/install-dashboard-deps.lock" LOCK_OWNER_FILE="$LOCK_DIR/owner" LOCK_HELD=0 +MIN_ENOSPC_RECOVERY_HEADROOM_GIB="${CORTEXPILOT_DASHBOARD_ENOSPC_MIN_HEADROOM_GIB:-3}" +WORKSPACE_RETRY_STORE_ACTIVE=0 cortexpilot_maybe_auto_prune_machine_cache "$ROOT_DIR" "install_dashboard_deps" @@ -83,6 +85,23 @@ release_install_lock() { LOCK_HELD=0 } +cleanup_active_workspace_retry_store() { + local exit_code="${1:-0}" + if [[ "$WORKSPACE_RETRY_STORE_ACTIVE" != "1" ]]; then + return 0 + fi + if [[ "$exit_code" -eq 0 ]]; then + return 0 + fi + retire_store_dir "${STORE_DIR:-}" +} + +handle_exit() { + local exit_code=$? + cleanup_active_workspace_retry_store "$exit_code" + release_install_lock +} + acquire_install_lock() { mkdir -p "$(dirname "$LOCK_DIR")" local started_epoch @@ -148,6 +167,26 @@ cleanup_stale_retry_stores() { shopt -u nullglob } +print_disk_headroom() { + echo "ℹ️ [install-dashboard-deps] filesystem headroom:" >&2 + df -h "$ROOT_DIR" >&2 || true +} + +workspace_recovery_headroom_ready() { + local available_kib="" + available_kib="$(df -Pk "$ROOT_DIR" 2>/dev/null | awk 'NR==2 {print $4}')" + if [[ ! "$available_kib" =~ ^[0-9]+$ ]]; then + available_kib=0 + fi + local required_kib=$((MIN_ENOSPC_RECOVERY_HEADROOM_GIB * 1024 * 1024)) + if (( available_kib >= required_kib )); then + return 0 + fi + echo "❌ [install-dashboard-deps] workspace-local ENOSPC recovery requires at least ${MIN_ENOSPC_RECOVERY_HEADROOM_GIB}GiB free; skipping recovery to avoid leaving another partial retry store behind" >&2 + print_disk_headroom + return 1 +} + verify_dashboard_build_toolchain() { ( cd "$APP_DIR" @@ -198,7 +237,9 @@ if [[ -d "$ROOT_DIR/node_modules" ]]; then fi acquire_install_lock -trap 'release_install_lock' EXIT INT TERM +trap 'handle_exit' EXIT +trap 'exit 130' INT +trap 'exit 143' TERM STORE_DIR="$(resolve_writable_store_dir "$STORE_DIR")" cleanup_stale_retry_stores @@ -275,6 +316,15 @@ reset_app_node_modules() { return 1 } +cleanup_failed_workspace_recovery() { + local failed_store_dir="$1" + retire_store_dir "$failed_store_dir" + WORKSPACE_RETRY_STORE_ACTIVE=0 + if ! reset_app_node_modules; then + echo "⚠️ [install-dashboard-deps] unable to remove partial dashboard node_modules after failed workspace-local recovery" >&2 + fi +} + recover_with_fresh_store() { local reason="$1" local max_attempts="${2:-3}" @@ -319,8 +369,12 @@ recover_with_fresh_store() { recover_with_workspace_store() { local reason="$1" echo "⚠️ [install-dashboard-deps] ${reason}; switching to workspace-local pnpm store + hardlink import mode and resetting dashboard node_modules" >&2 + if ! workspace_recovery_headroom_ready; then + exit 1 + fi retire_store_dir "$STORE_DIR" STORE_DIR="$(workspace_retry_store_dir)" + WORKSPACE_RETRY_STORE_ACTIVE=1 cleanup_stale_workspace_retry_stores local previous_import_method="$INSTALL_PACKAGE_IMPORT_METHOD" local previous_node_linker="$INSTALL_NODE_LINKER" @@ -339,11 +393,15 @@ recover_with_workspace_store() { INSTALL_PACKAGE_IMPORT_METHOD="$previous_import_method" INSTALL_NODE_LINKER="$previous_node_linker" INSTALL_SHAMEFULLY_HOIST="$previous_shamefully_hoist" + cleanup_failed_workspace_recovery "$STORE_DIR" if install_log_has_socket_timeout; then echo "❌ [install-dashboard-deps] workspace-local recovery exhausted transient npm registry retries; tail follows" >&2 print_install_log_tail 80 exit 1 fi + if log_contains "ERR_PNPM_ENOSPC" || log_contains "no space left on device"; then + print_disk_headroom + fi echo "❌ [install-dashboard-deps] pnpm install failed after workspace-local recovery; tail follows" >&2 print_install_log_tail 80 exit 1 @@ -351,6 +409,7 @@ recover_with_workspace_store() { INSTALL_PACKAGE_IMPORT_METHOD="$previous_import_method" INSTALL_NODE_LINKER="$previous_node_linker" INSTALL_SHAMEFULLY_HOIST="$previous_shamefully_hoist" + WORKSPACE_RETRY_STORE_ACTIVE=0 } if ! run_install_with_network_retry "initial install"; then diff --git a/scripts/install_desktop_deps.sh b/scripts/install_desktop_deps.sh index 99a9b60..3fc920f 100644 --- a/scripts/install_desktop_deps.sh +++ b/scripts/install_desktop_deps.sh @@ -18,6 +18,8 @@ INSTALL_LOG="$ROOT_DIR/.runtime-cache/logs/runtime/deps_install/install_desktop_ LOCK_DIR="$ROOT_DIR/.runtime-cache/cortexpilot/locks/install-desktop-deps.lock" LOCK_OWNER_FILE="$LOCK_DIR/owner" LOCK_HELD=0 +MIN_ENOSPC_RECOVERY_HEADROOM_GIB="${CORTEXPILOT_DESKTOP_ENOSPC_MIN_HEADROOM_GIB:-3}" +WORKSPACE_RETRY_STORE_ACTIVE=0 cortexpilot_maybe_auto_prune_machine_cache "$ROOT_DIR" "install_desktop_deps" @@ -40,6 +42,23 @@ release_install_lock() { LOCK_HELD=0 } +cleanup_active_workspace_retry_store() { + local exit_code="${1:-0}" + if [[ "$WORKSPACE_RETRY_STORE_ACTIVE" != "1" ]]; then + return 0 + fi + if [[ "$exit_code" -eq 0 ]]; then + return 0 + fi + retire_store_dir "${STORE_DIR:-}" +} + +handle_exit() { + local exit_code=$? + cleanup_active_workspace_retry_store "$exit_code" + release_install_lock +} + acquire_install_lock() { mkdir -p "$(dirname "$LOCK_DIR")" local started_epoch @@ -105,6 +124,26 @@ cleanup_stale_workspace_retry_stores() { shopt -u nullglob } +print_disk_headroom() { + echo "ℹ️ [install-desktop-deps] filesystem headroom:" >&2 + df -h "$ROOT_DIR" >&2 || true +} + +workspace_recovery_headroom_ready() { + local available_kib="" + available_kib="$(df -Pk "$ROOT_DIR" 2>/dev/null | awk 'NR==2 {print $4}')" + if [[ ! "$available_kib" =~ ^[0-9]+$ ]]; then + available_kib=0 + fi + local required_kib=$((MIN_ENOSPC_RECOVERY_HEADROOM_GIB * 1024 * 1024)) + if (( available_kib >= required_kib )); then + return 0 + fi + echo "❌ [install-desktop-deps] workspace-local ENOSPC recovery requires at least ${MIN_ENOSPC_RECOVERY_HEADROOM_GIB}GiB free; skipping recovery to avoid leaving another partial retry store behind" >&2 + print_disk_headroom + return 1 +} + retire_store_dir() { local target="$1" [[ -e "$target" ]] || return 0 @@ -130,7 +169,9 @@ if [[ -d "$ROOT_DIR/node_modules" ]]; then fi acquire_install_lock -trap 'release_install_lock' EXIT INT TERM +trap 'handle_exit' EXIT +trap 'exit 130' INT +trap 'exit 143' TERM STORE_DIR="$(resolve_writable_store_dir "$STORE_DIR")" cleanup_stale_retry_stores @@ -227,8 +268,12 @@ recover_with_fresh_store() { recover_with_workspace_store() { local reason="$1" echo "⚠️ [install-desktop-deps] ${reason}; switching to workspace-local pnpm store + hardlink import mode and resetting desktop node_modules" >&2 + if ! workspace_recovery_headroom_ready; then + exit 1 + fi retire_store_dir "$STORE_DIR" STORE_DIR="$(workspace_retry_store_dir)" + WORKSPACE_RETRY_STORE_ACTIVE=1 cleanup_stale_workspace_retry_stores local previous_import_method="$INSTALL_PACKAGE_IMPORT_METHOD" local previous_node_linker="$INSTALL_NODE_LINKER" @@ -247,11 +292,15 @@ recover_with_workspace_store() { INSTALL_PACKAGE_IMPORT_METHOD="$previous_import_method" INSTALL_NODE_LINKER="$previous_node_linker" INSTALL_SHAMEFULLY_HOIST="$previous_shamefully_hoist" + cleanup_failed_workspace_recovery "$STORE_DIR" if install_log_has_socket_timeout; then echo "❌ [install-desktop-deps] workspace-local recovery exhausted transient npm registry retries; tail follows" >&2 tail -n 80 "$INSTALL_LOG" >&2 || true exit 1 fi + if grep -q "ERR_PNPM_ENOSPC" "$INSTALL_LOG" || grep -qi "no space left on device" "$INSTALL_LOG"; then + print_disk_headroom + fi echo "❌ [install-desktop-deps] pnpm install failed after workspace-local recovery; tail follows" >&2 tail -n 80 "$INSTALL_LOG" >&2 || true exit 1 @@ -259,6 +308,7 @@ recover_with_workspace_store() { INSTALL_PACKAGE_IMPORT_METHOD="$previous_import_method" INSTALL_NODE_LINKER="$previous_node_linker" INSTALL_SHAMEFULLY_HOIST="$previous_shamefully_hoist" + WORKSPACE_RETRY_STORE_ACTIVE=0 } reset_app_node_modules() { @@ -280,6 +330,15 @@ reset_app_node_modules() { return 1 } +cleanup_failed_workspace_recovery() { + local failed_store_dir="$1" + retire_store_dir "$failed_store_dir" + WORKSPACE_RETRY_STORE_ACTIVE=0 + if ! reset_app_node_modules; then + echo "⚠️ [install-desktop-deps] unable to remove partial desktop node_modules after failed workspace-local recovery" >&2 + fi +} + if ! run_install_with_network_retry "initial install"; then if grep -q "ERR_PNPM_ENOENT" "$INSTALL_LOG"; then recover_with_fresh_store "detected pnpm store ENOENT"