Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
7 changes: 5 additions & 2 deletions README.md
Original file line number Diff line number Diff line change
Expand Up @@ -647,12 +647,15 @@ governance manifest refresh falls back to `scripts/verify_upstream_slices.py --m
to regenerate the receipts instead of failing on missing files alone.
Dashboard dependency installs now also carry an ENOSPC recovery branch that
retries with a workspace-local pnpm store and the registered dashboard install
env knobs when copy-heavy CI or local maintenance installs run out of disk, and
env knobs when copy-heavy CI or local maintenance installs run out of disk;
that recovery path now also fails closed behind a registered minimum-headroom
threshold so low-disk hosts do not keep churning partial retry stores, and
dashboard/desktop clean-room installs now retry bounded transient npm registry
socket timeouts before they fail closed.
Desktop dependency installs now mirror the same ENOSPC recovery strategy,
including the registered desktop install env knobs that scope hardlink imports
to the recovery attempt and move retry stores onto workspace-local temp roots.
to the recovery attempt, gate workspace-local recovery on registered minimum
headroom, and move retry stores onto workspace-local temp roots.
Docker-backed GitHub-hosted maintenance lanes now retry daemon prechecks with
bounded backoff and registered retry knobs before failing closed on a transient
socket refusal.
Expand Down
20 changes: 20 additions & 0 deletions apps/orchestrator/tests/test_runtime_log_path_guardrails.py
Original file line number Diff line number Diff line change
Expand Up @@ -37,6 +37,26 @@ def test_install_logs_use_dedicated_runtime_subdir() -> None:
assert 'INSTALL_LOG="$ROOT_DIR/.runtime-cache/logs/runtime/deps_install/install_desktop_deps.log"' in desktop


def test_install_scripts_fail_closed_on_low_headroom_workspace_recovery() -> None:
    """Both install scripts must declare the minimum-headroom knob and its refusal message.

    The check is purely textual: each script has to contain the
    `MIN_ENOSPC_RECOVERY_HEADROOM_GIB` assignment wired to its own registered
    environment variable (default 3) and the shared "requires at least" message
    emitted when workspace-local ENOSPC recovery is refused.
    """
    dashboard_script = _read("scripts/install_dashboard_deps.sh")
    desktop_script = _read("scripts/install_desktop_deps.sh")
    refusal_message = "workspace-local ENOSPC recovery requires at least"

    # Threshold knob: one registered env var per script.
    assert (
        'MIN_ENOSPC_RECOVERY_HEADROOM_GIB="${CORTEXPILOT_DASHBOARD_ENOSPC_MIN_HEADROOM_GIB:-3}"'
        in dashboard_script
    )
    assert (
        'MIN_ENOSPC_RECOVERY_HEADROOM_GIB="${CORTEXPILOT_DESKTOP_ENOSPC_MIN_HEADROOM_GIB:-3}"'
        in desktop_script
    )

    # Refusal diagnostic: same wording in both scripts.
    assert refusal_message in dashboard_script
    assert refusal_message in desktop_script


def test_install_scripts_cleanup_failed_workspace_retry_store_on_exit() -> None:
    """Both install scripts must wire retry-store cleanup into their EXIT trap.

    The check is purely textual: each script has to mention the cleanup
    function, the statement that marks the workspace retry store as active,
    and the `handle_exit` EXIT trap registration.
    """
    dashboard_script = _read("scripts/install_dashboard_deps.sh")
    desktop_script = _read("scripts/install_desktop_deps.sh")

    required_snippets = (
        "cleanup_active_workspace_retry_store",
        "WORKSPACE_RETRY_STORE_ACTIVE=1",
        "trap 'handle_exit' EXIT",
    )
    # Dashboard is checked before desktop for every snippet.
    for snippet in required_snippets:
        assert snippet in dashboard_script
        assert snippet in desktop_script


def test_ci_perf_log_uses_dedicated_runtime_subdir() -> None:
text = _read("scripts/lib/ci_step9_helpers.sh")
assert 'PERF_API_LOG=".runtime-cache/logs/runtime/ci_perf/ci_perf_api.log"' in text
24 changes: 24 additions & 0 deletions configs/env.registry.json
Original file line number Diff line number Diff line change
Expand Up @@ -3318,6 +3318,18 @@
"scripts/install_dashboard_deps.sh"
]
},
{
"name": "CORTEXPILOT_DASHBOARD_ENOSPC_MIN_HEADROOM_GIB",
"scope": "frontend-build",
"secret": false,
"required": false,
"default": "3",
"owner": "platform",
"description": "Minimum free disk headroom, in GiB, required before `install_dashboard_deps.sh` is allowed to enter its workspace-local ENOSPC recovery branch.",
"consumers": [
"scripts/install_dashboard_deps.sh"
]
},
{
"name": "CORTEXPILOT_DASHBOARD_PNPM_IMPORT_METHOD",
"scope": "frontend-build",
Expand Down Expand Up @@ -3379,6 +3391,18 @@
"scripts/install_desktop_deps.sh"
]
},
{
"name": "CORTEXPILOT_DESKTOP_ENOSPC_MIN_HEADROOM_GIB",
"scope": "frontend-build",
"secret": false,
"required": false,
"default": "3",
"owner": "platform",
"description": "Minimum free disk headroom, in GiB, required before `install_desktop_deps.sh` is allowed to enter its workspace-local ENOSPC recovery branch.",
"consumers": [
"scripts/install_desktop_deps.sh"
]
},
{
"name": "CORTEXPILOT_DESKTOP_PNPM_IMPORT_METHOD",
"scope": "frontend-build",
Expand Down
6 changes: 3 additions & 3 deletions docs/README.md
Original file line number Diff line number Diff line change
Expand Up @@ -239,9 +239,9 @@ navigation set.
dependency audit contract, sync this summary and the root entrypoints in the
same patch; the current examples are `.runtime-cache/test_output/ci/` and
`configs/pip_audit_ignored_advisories.json`, plus the dashboard and desktop
ENOSPC recovery knobs plus the Docker daemon precheck retry knobs registered
in `configs/env.registry.json`, together with the bounded transient npm
registry socket-timeout retries inside
ENOSPC recovery knobs, the minimum-headroom fail-fast thresholds, and the
Docker daemon precheck retry knobs registered in `configs/env.registry.json`,
together with the bounded transient npm registry socket-timeout retries inside
`scripts/install_dashboard_deps.sh` / `scripts/install_desktop_deps.sh`;
current CI contract changes also include the
upstream receipt refresh fallback to `scripts/verify_upstream_slices.py --mode smoke`
Expand Down
61 changes: 60 additions & 1 deletion scripts/install_dashboard_deps.sh
Original file line number Diff line number Diff line change
Expand Up @@ -23,6 +23,8 @@ INSTALL_LOG="$ROOT_DIR/.runtime-cache/logs/runtime/deps_install/install_dashboar
LOCK_DIR="${STATE_ROOT}/install-dashboard-deps.lock"
LOCK_OWNER_FILE="$LOCK_DIR/owner"
LOCK_HELD=0
MIN_ENOSPC_RECOVERY_HEADROOM_GIB="${CORTEXPILOT_DASHBOARD_ENOSPC_MIN_HEADROOM_GIB:-3}"
WORKSPACE_RETRY_STORE_ACTIVE=0

cortexpilot_maybe_auto_prune_machine_cache "$ROOT_DIR" "install_dashboard_deps"

Expand Down Expand Up @@ -83,6 +85,23 @@ release_install_lock() {
LOCK_HELD=0
}

#######################################
# Retire the workspace-local retry store when the script is exiting with a
# failure while that store is still flagged as active.
# Globals:
#   WORKSPACE_RETRY_STORE_ACTIVE (read) - "1" while a workspace retry store is in use
#   STORE_DIR (read)                    - store path handed to retire_store_dir
# Arguments:
#   $1 - exit status of the script (defaults to 0)
# Returns:
#   0 when nothing had to be cleaned up, otherwise the status of retire_store_dir
#######################################
cleanup_active_workspace_retry_store() {
  local status="${1:-0}"
  # Only a failing exit with an active retry store warrants cleanup.
  if [[ "$WORKSPACE_RETRY_STORE_ACTIVE" == "1" && "$status" -ne 0 ]]; then
    retire_store_dir "${STORE_DIR:-}"
  fi
}

#######################################
# EXIT trap handler: clean up an active workspace retry store on failure,
# then release the install lock.
# Globals:
#   none directly; delegates to cleanup_active_workspace_retry_store and
#   release_install_lock.
# Arguments:
#   none (reads the exit status in $? at entry)
#######################################
handle_exit() {
  # Must be captured first: any later command overwrites $?.
  local exit_code=$?
  # Best-effort cleanup. If errexit is enabled, an unguarded failure here
  # would abort the trap before the lock is released and leave it held.
  cleanup_active_workspace_retry_store "$exit_code" || true
  release_install_lock
}

acquire_install_lock() {
mkdir -p "$(dirname "$LOCK_DIR")"
local started_epoch
Expand Down Expand Up @@ -148,6 +167,26 @@ cleanup_stale_retry_stores() {
shopt -u nullglob
}

#######################################
# Print a human-readable free-space report for the filesystem holding ROOT_DIR.
# Globals:
#   ROOT_DIR (read) - path passed to `df -h`
# Outputs:
#   A header line plus the `df -h` report, all on stderr.
# Returns:
#   Always 0; a failing df is tolerated because this is diagnostics only.
#######################################
print_disk_headroom() {
  {
    echo "ℹ️ [install-dashboard-deps] filesystem headroom:"
    df -h "$ROOT_DIR" || true
  } >&2
}

#######################################
# Decide whether the filesystem holding ROOT_DIR has enough free space to
# attempt the workspace-local ENOSPC recovery branch.
# Globals:
#   ROOT_DIR (read)                         - path probed with `df -Pk`
#   MIN_ENOSPC_RECOVERY_HEADROOM_GIB (read) - required free space, whole GiB
# Outputs:
#   Diagnostics on stderr when the threshold is malformed or not met.
# Returns:
#   0 when available space >= threshold; 1 otherwise. Unparsable or failing
#   df output counts as 0 KiB available, so the gate fails closed.
#######################################
workspace_recovery_headroom_ready() {
  local min_gib="${MIN_ENOSPC_RECOVERY_HEADROOM_GIB:-}"
  # Only plain decimal digits may reach the arithmetic below. Anything else
  # would be evaluated as a variable name or expression there, which can
  # silently turn the threshold into 0 and disable the gate.
  if [[ ! "$min_gib" =~ ^[0-9]+$ ]]; then
    echo "⚠️ [install-dashboard-deps] ignoring invalid minimum ENOSPC recovery headroom '${min_gib}'; using 3GiB" >&2
    min_gib=3
  fi

  local available_kib=""
  # A df/awk failure must not abort the caller under errexit/pipefail;
  # it is handled by treating the value as unparsable.
  available_kib="$(df -Pk "$ROOT_DIR" 2>/dev/null | awk 'NR==2 {print $4}')" || available_kib=""
  if [[ ! "$available_kib" =~ ^[0-9]+$ ]]; then
    available_kib=0
  fi

  # 10# forces base 10 so a value such as "08" is not rejected as octal.
  local required_kib=$((10#${min_gib} * 1024 * 1024))
  if (( 10#${available_kib} >= required_kib )); then
    return 0
  fi
  echo "❌ [install-dashboard-deps] workspace-local ENOSPC recovery requires at least ${min_gib}GiB free; skipping recovery to avoid leaving another partial retry store behind" >&2
  print_disk_headroom
  return 1
}

verify_dashboard_build_toolchain() {
(
cd "$APP_DIR"
Expand Down Expand Up @@ -198,7 +237,9 @@ if [[ -d "$ROOT_DIR/node_modules" ]]; then
fi

acquire_install_lock
trap 'release_install_lock' EXIT INT TERM
trap 'handle_exit' EXIT
trap 'exit 130' INT
trap 'exit 143' TERM

STORE_DIR="$(resolve_writable_store_dir "$STORE_DIR")"
cleanup_stale_retry_stores
Expand Down Expand Up @@ -275,6 +316,15 @@ reset_app_node_modules() {
return 1
}

#######################################
# Tidy up after a failed workspace-local recovery attempt: retire the retry
# store, clear the "active" flag, and try to reset the dashboard node_modules.
# Globals:
#   WORKSPACE_RETRY_STORE_ACTIVE (written) - set to 0 once the store is retired
# Arguments:
#   $1 - path of the retry store used by the failed attempt
# Outputs:
#   A warning on stderr when node_modules could not be reset.
#######################################
cleanup_failed_workspace_recovery() {
  local stale_store="$1"
  retire_store_dir "$stale_store"
  WORKSPACE_RETRY_STORE_ACTIVE=0
  # A failed reset is reported but does not fail this function.
  reset_app_node_modules && return 0
  echo "⚠️ [install-dashboard-deps] unable to remove partial dashboard node_modules after failed workspace-local recovery" >&2
}

recover_with_fresh_store() {
local reason="$1"
local max_attempts="${2:-3}"
Expand Down Expand Up @@ -319,8 +369,12 @@ recover_with_fresh_store() {
recover_with_workspace_store() {
local reason="$1"
echo "⚠️ [install-dashboard-deps] ${reason}; switching to workspace-local pnpm store + hardlink import mode and resetting dashboard node_modules" >&2
if ! workspace_recovery_headroom_ready; then
exit 1
fi
retire_store_dir "$STORE_DIR"
STORE_DIR="$(workspace_retry_store_dir)"
WORKSPACE_RETRY_STORE_ACTIVE=1
cleanup_stale_workspace_retry_stores
local previous_import_method="$INSTALL_PACKAGE_IMPORT_METHOD"
local previous_node_linker="$INSTALL_NODE_LINKER"
Expand All @@ -339,18 +393,23 @@ recover_with_workspace_store() {
INSTALL_PACKAGE_IMPORT_METHOD="$previous_import_method"
INSTALL_NODE_LINKER="$previous_node_linker"
INSTALL_SHAMEFULLY_HOIST="$previous_shamefully_hoist"
cleanup_failed_workspace_recovery "$STORE_DIR"
if install_log_has_socket_timeout; then
echo "❌ [install-dashboard-deps] workspace-local recovery exhausted transient npm registry retries; tail follows" >&2
print_install_log_tail 80
exit 1
fi
if log_contains "ERR_PNPM_ENOSPC" || log_contains "no space left on device"; then
print_disk_headroom
fi
echo "❌ [install-dashboard-deps] pnpm install failed after workspace-local recovery; tail follows" >&2
print_install_log_tail 80
exit 1
fi
INSTALL_PACKAGE_IMPORT_METHOD="$previous_import_method"
INSTALL_NODE_LINKER="$previous_node_linker"
INSTALL_SHAMEFULLY_HOIST="$previous_shamefully_hoist"
WORKSPACE_RETRY_STORE_ACTIVE=0
}

if ! run_install_with_network_retry "initial install"; then
Expand Down
61 changes: 60 additions & 1 deletion scripts/install_desktop_deps.sh
Original file line number Diff line number Diff line change
Expand Up @@ -18,6 +18,8 @@ INSTALL_LOG="$ROOT_DIR/.runtime-cache/logs/runtime/deps_install/install_desktop_
LOCK_DIR="$ROOT_DIR/.runtime-cache/cortexpilot/locks/install-desktop-deps.lock"
LOCK_OWNER_FILE="$LOCK_DIR/owner"
LOCK_HELD=0
MIN_ENOSPC_RECOVERY_HEADROOM_GIB="${CORTEXPILOT_DESKTOP_ENOSPC_MIN_HEADROOM_GIB:-3}"
WORKSPACE_RETRY_STORE_ACTIVE=0

cortexpilot_maybe_auto_prune_machine_cache "$ROOT_DIR" "install_desktop_deps"

Expand All @@ -40,6 +42,23 @@ release_install_lock() {
LOCK_HELD=0
}

#######################################
# Retire the workspace-local retry store when the script is exiting with a
# failure while that store is still flagged as active.
# Globals:
#   WORKSPACE_RETRY_STORE_ACTIVE (read) - "1" while a workspace retry store is in use
#   STORE_DIR (read)                    - store path handed to retire_store_dir
# Arguments:
#   $1 - exit status of the script (defaults to 0)
# Returns:
#   0 when nothing had to be cleaned up, otherwise the status of retire_store_dir
#######################################
cleanup_active_workspace_retry_store() {
  local status="${1:-0}"
  # Only a failing exit with an active retry store warrants cleanup.
  if [[ "$WORKSPACE_RETRY_STORE_ACTIVE" == "1" && "$status" -ne 0 ]]; then
    retire_store_dir "${STORE_DIR:-}"
  fi
}

#######################################
# EXIT trap handler: clean up an active workspace retry store on failure,
# then release the install lock.
# Globals:
#   none directly; delegates to cleanup_active_workspace_retry_store and
#   release_install_lock.
# Arguments:
#   none (reads the exit status in $? at entry)
#######################################
handle_exit() {
  # Must be captured first: any later command overwrites $?.
  local exit_code=$?
  # Best-effort cleanup. If errexit is enabled, an unguarded failure here
  # would abort the trap before the lock is released and leave it held.
  cleanup_active_workspace_retry_store "$exit_code" || true
  release_install_lock
}

acquire_install_lock() {
mkdir -p "$(dirname "$LOCK_DIR")"
local started_epoch
Expand Down Expand Up @@ -105,6 +124,26 @@ cleanup_stale_workspace_retry_stores() {
shopt -u nullglob
}

#######################################
# Print a human-readable free-space report for the filesystem holding ROOT_DIR.
# Globals:
#   ROOT_DIR (read) - path passed to `df -h`
# Outputs:
#   A header line plus the `df -h` report, all on stderr.
# Returns:
#   Always 0; a failing df is tolerated because this is diagnostics only.
#######################################
print_disk_headroom() {
  {
    echo "ℹ️ [install-desktop-deps] filesystem headroom:"
    df -h "$ROOT_DIR" || true
  } >&2
}

#######################################
# Decide whether the filesystem holding ROOT_DIR has enough free space to
# attempt the workspace-local ENOSPC recovery branch.
# Globals:
#   ROOT_DIR (read)                         - path probed with `df -Pk`
#   MIN_ENOSPC_RECOVERY_HEADROOM_GIB (read) - required free space, whole GiB
# Outputs:
#   Diagnostics on stderr when the threshold is malformed or not met.
# Returns:
#   0 when available space >= threshold; 1 otherwise. Unparsable or failing
#   df output counts as 0 KiB available, so the gate fails closed.
#######################################
workspace_recovery_headroom_ready() {
  local min_gib="${MIN_ENOSPC_RECOVERY_HEADROOM_GIB:-}"
  # Only plain decimal digits may reach the arithmetic below. Anything else
  # would be evaluated as a variable name or expression there, which can
  # silently turn the threshold into 0 and disable the gate.
  if [[ ! "$min_gib" =~ ^[0-9]+$ ]]; then
    echo "⚠️ [install-desktop-deps] ignoring invalid minimum ENOSPC recovery headroom '${min_gib}'; using 3GiB" >&2
    min_gib=3
  fi

  local available_kib=""
  # A df/awk failure must not abort the caller under errexit/pipefail;
  # it is handled by treating the value as unparsable.
  available_kib="$(df -Pk "$ROOT_DIR" 2>/dev/null | awk 'NR==2 {print $4}')" || available_kib=""
  if [[ ! "$available_kib" =~ ^[0-9]+$ ]]; then
    available_kib=0
  fi

  # 10# forces base 10 so a value such as "08" is not rejected as octal.
  local required_kib=$((10#${min_gib} * 1024 * 1024))
  if (( 10#${available_kib} >= required_kib )); then
    return 0
  fi
  echo "❌ [install-desktop-deps] workspace-local ENOSPC recovery requires at least ${min_gib}GiB free; skipping recovery to avoid leaving another partial retry store behind" >&2
  print_disk_headroom
  return 1
}

retire_store_dir() {
local target="$1"
[[ -e "$target" ]] || return 0
Expand All @@ -130,7 +169,9 @@ if [[ -d "$ROOT_DIR/node_modules" ]]; then
fi

acquire_install_lock
trap 'release_install_lock' EXIT INT TERM
trap 'handle_exit' EXIT
trap 'exit 130' INT
trap 'exit 143' TERM

STORE_DIR="$(resolve_writable_store_dir "$STORE_DIR")"
cleanup_stale_retry_stores
Expand Down Expand Up @@ -227,8 +268,12 @@ recover_with_fresh_store() {
recover_with_workspace_store() {
local reason="$1"
echo "⚠️ [install-desktop-deps] ${reason}; switching to workspace-local pnpm store + hardlink import mode and resetting desktop node_modules" >&2
if ! workspace_recovery_headroom_ready; then
exit 1
fi
retire_store_dir "$STORE_DIR"
STORE_DIR="$(workspace_retry_store_dir)"
WORKSPACE_RETRY_STORE_ACTIVE=1
cleanup_stale_workspace_retry_stores
local previous_import_method="$INSTALL_PACKAGE_IMPORT_METHOD"
local previous_node_linker="$INSTALL_NODE_LINKER"
Expand All @@ -247,18 +292,23 @@ recover_with_workspace_store() {
INSTALL_PACKAGE_IMPORT_METHOD="$previous_import_method"
INSTALL_NODE_LINKER="$previous_node_linker"
INSTALL_SHAMEFULLY_HOIST="$previous_shamefully_hoist"
cleanup_failed_workspace_recovery "$STORE_DIR"
if install_log_has_socket_timeout; then
echo "❌ [install-desktop-deps] workspace-local recovery exhausted transient npm registry retries; tail follows" >&2
tail -n 80 "$INSTALL_LOG" >&2 || true
exit 1
fi
if grep -q "ERR_PNPM_ENOSPC" "$INSTALL_LOG" || grep -qi "no space left on device" "$INSTALL_LOG"; then
print_disk_headroom
fi
echo "❌ [install-desktop-deps] pnpm install failed after workspace-local recovery; tail follows" >&2
tail -n 80 "$INSTALL_LOG" >&2 || true
exit 1
fi
INSTALL_PACKAGE_IMPORT_METHOD="$previous_import_method"
INSTALL_NODE_LINKER="$previous_node_linker"
INSTALL_SHAMEFULLY_HOIST="$previous_shamefully_hoist"
WORKSPACE_RETRY_STORE_ACTIVE=0
}

reset_app_node_modules() {
Expand All @@ -280,6 +330,15 @@ reset_app_node_modules() {
return 1
}

#######################################
# Tidy up after a failed workspace-local recovery attempt: retire the retry
# store, clear the "active" flag, and try to reset the desktop node_modules.
# Globals:
#   WORKSPACE_RETRY_STORE_ACTIVE (written) - set to 0 once the store is retired
# Arguments:
#   $1 - path of the retry store used by the failed attempt
# Outputs:
#   A warning on stderr when node_modules could not be reset.
#######################################
cleanup_failed_workspace_recovery() {
  local stale_store="$1"
  retire_store_dir "$stale_store"
  WORKSPACE_RETRY_STORE_ACTIVE=0
  # A failed reset is reported but does not fail this function.
  reset_app_node_modules && return 0
  echo "⚠️ [install-desktop-deps] unable to remove partial desktop node_modules after failed workspace-local recovery" >&2
}

if ! run_install_with_network_retry "initial install"; then
if grep -q "ERR_PNPM_ENOENT" "$INSTALL_LOG"; then
recover_with_fresh_store "detected pnpm store ENOENT"
Expand Down
Loading