From 9242498970f57573702ccf75fcce6980288439a1 Mon Sep 17 00:00:00 2001 From: jichuanh Date: Wed, 20 May 2026 23:11:54 +0000 Subject: [PATCH 01/55] [CI] Add Windows + ARM pytest markers and shared scaffolding Foundation for cross-platform CI. Registers four pytest markers (windows, windows_ci, arm, arm_ci), teaches AppLauncher to recognize them in argv so they do not leak into Isaac Sim's argparse, and moves the AssetConverterBase USD scratch directory from a hardcoded /tmp/IsaacLab to tempfile.gettempdir() for cross-platform compatibility. Tags source/isaaclab/test/deps/test_torch.py and test_scipy.py with the new markers so they are selectable by future cross-platform jobs. Workflow files (arm-ci.yaml, windows-ci.yaml) ship in follow-up PRs. --- pyproject.toml | 4 ++++ .../changelog.d/jichuanh-windows-spark-ci-min.skip | 1 + source/isaaclab/isaaclab/app/app_launcher.py | 10 ++++++++-- .../isaaclab/sim/converters/asset_converter_base.py | 12 +++++++----- source/isaaclab/test/deps/test_scipy.py | 2 ++ source/isaaclab/test/deps/test_torch.py | 2 ++ 6 files changed, 24 insertions(+), 7 deletions(-) create mode 100644 source/isaaclab/changelog.d/jichuanh-windows-spark-ci-min.skip diff --git a/pyproject.toml b/pyproject.toml index 86ab12b38ceb..33ba8e2b1274 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -194,6 +194,10 @@ ignore-words-list = "haa,slq,collapsable,buss,reacher,thirdparty" markers = [ "isaacsim_ci: mark test to run in isaacsim ci", + "windows: mark test as runnable on Windows platforms", + "windows_ci: mark test to run on Windows platforms in CI", + "arm: mark test as runnable on ARM platforms (e.g. NVIDIA DGX Spark)", + "arm_ci: mark test to run on ARM platforms in CI (e.g. NVIDIA DGX Spark)", ] # Add pypi.nvidia.com so that `uv pip install isaaclab[isaacsim]` works without --extra-index-url. 
diff --git a/source/isaaclab/changelog.d/jichuanh-windows-spark-ci-min.skip b/source/isaaclab/changelog.d/jichuanh-windows-spark-ci-min.skip new file mode 100644 index 000000000000..bfa2b75a780a --- /dev/null +++ b/source/isaaclab/changelog.d/jichuanh-windows-spark-ci-min.skip @@ -0,0 +1 @@ +Skip changelog: CI/test-infrastructure foundation (no user-facing API change). Registers the windows / windows_ci / arm / arm_ci pytest markers in pyproject.toml, teaches AppLauncher to recognize them in argv so they do not leak into Isaac Sim's argparse, and moves the AssetConverterBase USD scratch dir from hardcoded /tmp/IsaacLab to tempfile.gettempdir() for cross-platform compatibility. Workflow files (arm-ci.yaml, windows-ci.yaml) ship in follow-up PRs. diff --git a/source/isaaclab/isaaclab/app/app_launcher.py b/source/isaaclab/isaaclab/app/app_launcher.py index 2bdb8a08932d..a4f7a628f052 100644 --- a/source/isaaclab/isaaclab/app/app_launcher.py +++ b/source/isaaclab/isaaclab/app/app_launcher.py @@ -1127,12 +1127,18 @@ def _create_app(self): sys.stdout = open(os.devnull, "w") # noqa: SIM115 # pytest may have left some things in sys.argv, this will check for some of those - # do a mark and sweep to remove any -m pytest and -m isaacsim_ci and -c **/pyproject.toml + # do a mark and sweep to remove any -m pytest, -m isaacsim_ci, -m windows_ci, -m arm_ci, + # and -c **/pyproject.toml indexes_to_remove = [] for idx, arg in enumerate(sys.argv[:-1]): if arg == "-m": value_for_dash_m = sys.argv[idx + 1] - if "pytest" in value_for_dash_m or "isaacsim_ci" in value_for_dash_m: + if ( + "pytest" in value_for_dash_m + or "isaacsim_ci" in value_for_dash_m + or "windows_ci" in value_for_dash_m + or "arm_ci" in value_for_dash_m + ): indexes_to_remove.append(idx) indexes_to_remove.append(idx + 1) if arg.startswith("--config-file=") and "pyproject.toml" in arg: diff --git a/source/isaaclab/isaaclab/sim/converters/asset_converter_base.py 
b/source/isaaclab/isaaclab/sim/converters/asset_converter_base.py index 11c200422391..703ef202e2a7 100644 --- a/source/isaaclab/isaaclab/sim/converters/asset_converter_base.py +++ b/source/isaaclab/isaaclab/sim/converters/asset_converter_base.py @@ -9,6 +9,7 @@ import os import pathlib import random +import tempfile from datetime import datetime from isaaclab.sim.converters.asset_converter_base_cfg import AssetConverterBaseCfg @@ -34,9 +35,10 @@ class AssetConverterBase(abc.ABC): can be set to True. When no output directory is defined, lazy conversion is deactivated and the generated USD file is - stored in folder ``/tmp/IsaacLab/usd_{date}_{time}_{random}``, where the parameters in braces are generated - at runtime. The random identifiers help avoid a race condition where two simultaneously triggered conversions - try to use the same directory for reading/writing the generated files. + stored in folder ``<tmpdir>/IsaacLab/usd_{date}_{time}_{random}``, where ``<tmpdir>`` is the system + temporary directory (e.g. ``/tmp`` on POSIX, ``%TEMP%`` on Windows) and the parameters in braces are + generated at runtime. The random identifiers help avoid a race condition where two simultaneously + triggered conversions try to use the same directory for reading/writing the generated files. .. 
note:: Changes to the parameters :obj:`AssetConverterBaseCfg.asset_path`, :obj:`AssetConverterBaseCfg.usd_dir`, and @@ -64,9 +66,9 @@ def __init__(self, cfg: AssetConverterBaseCfg): # resolve USD directory name if cfg.usd_dir is None: - # a folder in "/tmp/IsaacLab" by the name: usd_{date}_{time}_{random} + # a folder in the system temp dir by the name: IsaacLab/usd_{date}_{time}_{random} time_tag = datetime.now().strftime("%Y%m%d_%H%M%S") - self._usd_dir = f"/tmp/IsaacLab/usd_{time_tag}_{random.randrange(10000)}" + self._usd_dir = os.path.join(tempfile.gettempdir(), "IsaacLab", f"usd_{time_tag}_{random.randrange(10000)}") else: self._usd_dir = cfg.usd_dir diff --git a/source/isaaclab/test/deps/test_scipy.py b/source/isaaclab/test/deps/test_scipy.py index d697716aad7a..f42e54c304e9 100644 --- a/source/isaaclab/test/deps/test_scipy.py +++ b/source/isaaclab/test/deps/test_scipy.py @@ -13,6 +13,8 @@ import numpy as np import scipy.interpolate as interpolate +pytestmark = [pytest.mark.windows_ci, pytest.mark.arm_ci] + @pytest.mark.isaacsim_ci def test_interpolation(): diff --git a/source/isaaclab/test/deps/test_torch.py b/source/isaaclab/test/deps/test_torch.py index 6a50110757de..e651987daa26 100644 --- a/source/isaaclab/test/deps/test_torch.py +++ b/source/isaaclab/test/deps/test_torch.py @@ -7,6 +7,8 @@ import torch import torch.utils.benchmark as benchmark +pytestmark = [pytest.mark.windows_ci, pytest.mark.arm_ci] + @pytest.mark.isaacsim_ci def test_array_slicing(): From 98a5b6886b92d999525d36a07d4af13f8e47b53d Mon Sep 17 00:00:00 2001 From: jichuanh Date: Wed, 20 May 2026 23:14:32 +0000 Subject: [PATCH 02/55] [CI] Add ARM/Spark CI workflow MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Mirrors build.yaml's spirit but stays minimal for the aarch64 path: Tier 1 (gates none — continue-on-error): general-arm, install-arm, kit-launch-arm Tier 2 (meaningful, marker-filtered): kitless-arm, determinism-arm Every job sets 
continue-on-error: true while the aarch64 runner setup stabilizes. Every pytest invocation passes --timeout=N --timeout-method=signal so a single hung test cannot consume the whole job slot. Inline scripts use set -e to fail on the first nonzero return. Tags three test_rendering_*_kitless.py files plus test_differential_ik.py and test_operational_space.py with the arm_ci marker so the Tier 2 jobs can select them via pytest -m arm_ci. --- .../actions/ecr-build-push-pull/action.yml | 6 +- .github/workflows/arm-ci.yaml | 335 ++++++++++++++++++ docker/Dockerfile.base | 3 + .../isaaclab/changelog.d/jichuanh-arm-ci.skip | 1 + .../test/controllers/test_differential_ik.py | 2 + .../controllers/test_operational_space.py | 2 + .../test/test_rendering_cartpole_kitless.py | 2 +- .../test_rendering_dexsuite_kuka_kitless.py | 2 +- .../test_rendering_shadow_hand_kitless.py | 2 +- 9 files changed, 351 insertions(+), 4 deletions(-) create mode 100644 .github/workflows/arm-ci.yaml create mode 100644 source/isaaclab/changelog.d/jichuanh-arm-ci.skip diff --git a/.github/actions/ecr-build-push-pull/action.yml b/.github/actions/ecr-build-push-pull/action.yml index b661d4b9fd62..11e26a3c1833 100644 --- a/.github/actions/ecr-build-push-pull/action.yml +++ b/.github/actions/ecr-build-push-pull/action.yml @@ -37,6 +37,10 @@ inputs: description: Tag used for the ECR layer cache image (e.g. "cache-base", "cache-curobo"). required: false default: 'cache' + platform: + description: Target platform for `docker buildx build --platform` (e.g. "linux/amd64", "linux/arm64"). 
+ required: false + default: 'linux/amd64' runs: using: composite steps: @@ -256,7 +260,7 @@ runs: run: | BUILD_ARGS=( --progress=plain - --platform linux/amd64 + --platform ${{ inputs.platform }} -f "${{ inputs.dockerfile-path }}" --build-arg "ISAACSIM_BASE_IMAGE_ARG=${{ inputs.isaacsim-base-image }}" --build-arg "ISAACSIM_VERSION_ARG=${{ inputs.isaacsim-version }}" diff --git a/.github/workflows/arm-ci.yaml b/.github/workflows/arm-ci.yaml new file mode 100644 index 000000000000..514ec50c0132 --- /dev/null +++ b/.github/workflows/arm-ci.yaml @@ -0,0 +1,335 @@ +# Copyright (c) 2022-2026, The Isaac Lab Project Developers (https://github.com/isaac-sim/IsaacLab/blob/main/CONTRIBUTORS.md). +# All rights reserved. +# +# SPDX-License-Identifier: BSD-3-Clause + +# ARM/Spark CI — exercises Isaac Lab on aarch64 Linux self-hosted runners +# (NVIDIA DGX Spark). Mirrors the spirit of build.yaml but stays lean by +# running tests inside the multi-arch nvcr.io/nvidian/isaac-sim image +# instead of building a full isaac-lab-ci image. (Once the apt deps and +# editable-install scope stabilize, we can promote to a Dockerfile.base +# build that mirrors build.yaml's structure end-to-end.) +# +# Single job, multiple steps. Each test step sets `continue-on-error: true` +# so a failure in one tier does not abort the others. Each pytest invocation +# passes `--timeout=N --timeout-method=signal --continue-on-collection-errors` +# so a hung or import-broken test cannot consume the whole job slot. +# +# Marker-driven discovery: `pytest -m arm_ci`. Adding a new aarch64-safe +# test = tag it with arm_ci, no yaml edit. 
+ +name: ARM CI + +on: + pull_request: + types: [opened, synchronize, reopened] + branches: + - main + - develop + - 'release/**' + push: + branches: + - main + - develop + - 'release/**' + workflow_dispatch: + +concurrency: + group: ${{ github.workflow }}-${{ github.event.pull_request.number || github.ref }} + cancel-in-progress: true + +permissions: + contents: read + pull-requests: write + checks: write + +env: + NGC_API_KEY: ${{ secrets.NGC_API_KEY }} + +jobs: + changes: + name: Detect Changes + runs-on: ubuntu-latest + outputs: + run_arm_ci: ${{ steps.detect.outputs.run_arm_ci }} + steps: + - id: detect + env: + GH_TOKEN: ${{ github.token }} + PR_NUMBER: ${{ github.event.pull_request.number }} + EVENT_NAME: ${{ github.event_name }} + REPO: ${{ github.repository }} + run: | + set -euo pipefail + patterns=( + $'^source/\tLibrary source code' + $'^tools/\tBuild tooling' + $'^apps/\tStandalone apps' + $'^docker/\tContainer build inputs' + $'(^|/)pyproject\\.toml$\tPython project metadata' + $'^\\.github/workflows/arm-ci\\.yaml$\tThis workflow file' + $'^\\.github/actions/ecr-build-push-pull/\tECR action' + $'^VERSION$\tVersion file' + ) + any_match() { + local files="$1" entry regex + for entry in "${patterns[@]}"; do + IFS=$'\t' read -r regex _ <<< "$entry" + if grep -qE "$regex" <<< "$files"; then + return 0 + fi + done + return 1 + } + if [ "$EVENT_NAME" != "pull_request" ]; then + echo "run_arm_ci=true" >> "$GITHUB_OUTPUT" + exit 0 + fi + changed_files="$(gh api --paginate "repos/$REPO/pulls/$PR_NUMBER/files" --jq '.[].filename' || true)" + if [ -z "$changed_files" ] || any_match "$changed_files"; then + echo "run_arm_ci=true" >> "$GITHUB_OUTPUT" + else + echo "run_arm_ci=false" >> "$GITHUB_OUTPUT" + fi + + config: + name: Load Config + runs-on: ubuntu-latest + needs: [changes] + if: needs.changes.outputs.run_arm_ci == 'true' + outputs: + isaacsim_image_name: ${{ steps.load.outputs.isaacsim_image_name }} + isaacsim_image_tag: ${{ 
steps.load.outputs.isaacsim_image_tag }} + steps: + - uses: actions/checkout@v4 + with: + fetch-depth: 1 + - id: load + shell: bash + run: | + set -euo pipefail + # Read isaacsim_image_name/tag from .github/workflows/config.yaml. + # Fall back to grep/awk parsing of the same file if yq is unavailable on ubuntu-latest. + if command -v yq >/dev/null 2>&1; then + name=$(yq -r .isaacsim_image_name .github/workflows/config.yaml) + tag=$(yq -r .isaacsim_image_tag .github/workflows/config.yaml) + else + name=$(grep '^isaacsim_image_name:' .github/workflows/config.yaml | awk '{print $2}') + tag=$(grep '^isaacsim_image_tag:' .github/workflows/config.yaml | awk '{print $2}') + fi + echo "isaacsim_image_name=$name" >> "$GITHUB_OUTPUT" + echo "isaacsim_image_tag=$tag" >> "$GITHUB_OUTPUT" + + arm-ci: + name: arm-ci + runs-on: [self-hosted, arm64] + needs: [changes, config] + if: needs.changes.outputs.run_arm_ci == 'true' + timeout-minutes: 120 + steps: + - name: Checkout + uses: actions/checkout@v4 + with: + fetch-depth: 1 + lfs: false + + - name: Login to nvcr.io + shell: bash + run: | + set -euo pipefail + if [ -n "${NGC_API_KEY:-}" ]; then + echo "${NGC_API_KEY}" | docker login nvcr.io --username '$oauthtoken' --password-stdin + fi + + - name: Pull arm64 Isaac Sim image + shell: bash + run: | + set -euo pipefail + # Multi-arch manifest at this tag has both linux/arm64 and linux/amd64. + # Docker on an aarch64 host auto-resolves to the arm64 variant. + docker pull --platform linux/arm64 \ + "${{ needs.config.outputs.isaacsim_image_name }}:${{ needs.config.outputs.isaacsim_image_tag }}" + + - name: Install system build deps inside the container + shell: bash + run: | + set -euo pipefail + # pytetwild's fTetWild source build needs libgmp / libmpfr / libeigen3 / + # libcgal / libboost; isaaclab's editable install pulls pytetwild as a + # hard dep (added in PR isaac-sim/IsaacLab#5710 on 2026-05-20). 
+ # We install into a fresh container layer per run so the apt cost (~1-2 + # min) shows up only when this workflow runs, not on the Sim image. + # Persist by committing into a per-run image tagged isaac-lab-arm-ci. + # Build a tagged image with deps baked in so subsequent test runs are fast. + # Override the Sim image's entrypoint (Kit launcher) with bash so we can + # run plain shell commands. EULA env vars set per build.yaml conventions. + docker run --name arm-deps-prep --user root \ + --entrypoint bash \ + -e OMNI_KIT_ACCEPT_EULA=yes -e ACCEPT_EULA=Y -e ISAAC_SIM_HEADLESS=1 \ + -e PRIVACY_CONSENT=Y \ + "${{ needs.config.outputs.isaacsim_image_name }}:${{ needs.config.outputs.isaacsim_image_tag }}" \ + -c " + set -euo pipefail + apt-get update + apt-get install -y --no-install-recommends \ + libgmp-dev libmpfr-dev libeigen3-dev libcgal-dev libboost-all-dev \ + cmake build-essential git + rm -rf /var/lib/apt/lists/* + " + docker commit arm-deps-prep isaac-lab-arm-ci:${{ github.sha }} + docker rm arm-deps-prep + + - name: Editable install of isaaclab + isaaclab_tasks in a uv venv + shell: bash + timeout-minutes: 25 + run: | + set -euo pipefail + # All Tier 2 jobs need both packages installed once. We do the install + # inside a uv venv mounted under /workspace/isaaclab so subsequent + # docker run invocations see the same env_isaaclab_uv directory. + docker run --rm --user root \ + --entrypoint bash \ + -e OMNI_KIT_ACCEPT_EULA=yes -e ACCEPT_EULA=Y -e ISAAC_SIM_HEADLESS=1 \ + -e PRIVACY_CONSENT=Y \ + -v "${{ github.workspace }}":/workspace/isaaclab \ + -w /workspace/isaaclab \ + --gpus all \ + isaac-lab-arm-ci:${{ github.sha }} \ + -c " + set -e + if ! 
command -v uv >/dev/null 2>&1; then + curl -LsSf https://astral.sh/uv/install.sh | sh + export PATH=\$HOME/.local/bin:\$PATH + fi + uv venv --python 3.12 env_isaaclab_uv + source env_isaaclab_uv/bin/activate + uv pip install -e source/isaaclab + uv pip install -e source/isaaclab_assets + uv pip install -e source/isaaclab_tasks + uv pip install pytest pytest-timeout + python -c 'import isaaclab, isaaclab_assets, isaaclab_tasks; print(\"editable imports ok\")' + " + + - name: Tier 1 — general-arm smoke (torch + scipy) + shell: bash + continue-on-error: true + timeout-minutes: 10 + run: | + set -e + mkdir -p reports + docker run --rm --user root \ + --entrypoint bash \ + -e OMNI_KIT_ACCEPT_EULA=yes -e ACCEPT_EULA=Y -e ISAAC_SIM_HEADLESS=1 \ + -e PRIVACY_CONSENT=Y \ + -v "${{ github.workspace }}":/workspace/isaaclab \ + -w /workspace/isaaclab \ + --gpus all \ + isaac-lab-arm-ci:${{ github.sha }} \ + -c " + source env_isaaclab_uv/bin/activate + python -m pytest \ + source/isaaclab/test/deps \ + --ignore=tools/conftest.py \ + -m arm_ci \ + --continue-on-collection-errors \ + --timeout=60 \ + --timeout-method=signal \ + -v \ + --junitxml=reports/general-arm.xml + " + + - name: Tier 1 — kit-launch-arm (boot Kit headless) + shell: bash + continue-on-error: true + timeout-minutes: 10 + run: | + set -e + docker run --rm --user root \ + --entrypoint bash \ + -e OMNI_KIT_ACCEPT_EULA=yes -e ACCEPT_EULA=Y -e ISAAC_SIM_HEADLESS=1 \ + -e PRIVACY_CONSENT=Y \ + -v "${{ github.workspace }}":/workspace/isaaclab \ + -w /workspace/isaaclab \ + --gpus all \ + isaac-lab-arm-ci:${{ github.sha }} \ + -c " + source env_isaaclab_uv/bin/activate + uv pip install --extra-index-url https://pypi.nvidia.com 'isaacsim[all]' + timeout 120 python - <<'EOF' + import sys + from isaaclab.app import AppLauncher + sim = AppLauncher(headless=True).app + assert sim is not None, 'AppLauncher did not return a SimulationApp' + sim.close() + sys.exit(0) + EOF + " + + - name: Tier 2 — kitless-arm (Warp + OvRTX 
rendering on aarch64) + shell: bash + continue-on-error: true + timeout-minutes: 30 + run: | + set -e + docker run --rm --user root \ + --entrypoint bash \ + -e OMNI_KIT_ACCEPT_EULA=yes -e ACCEPT_EULA=Y -e ISAAC_SIM_HEADLESS=1 \ + -e PRIVACY_CONSENT=Y \ + -v "${{ github.workspace }}":/workspace/isaaclab \ + -w /workspace/isaaclab \ + --gpus all \ + isaac-lab-arm-ci:${{ github.sha }} \ + -c " + source env_isaaclab_uv/bin/activate + python -m pytest \ + source/isaaclab_tasks/test \ + --ignore=tools/conftest.py \ + -m arm_ci \ + --continue-on-collection-errors \ + --timeout=300 \ + --timeout-method=signal \ + -v \ + --junitxml=reports/kitless-arm.xml + " + + - name: Tier 2 — determinism-arm (controllers / math) + shell: bash + continue-on-error: true + timeout-minutes: 20 + run: | + set -e + docker run --rm --user root \ + --entrypoint bash \ + -e OMNI_KIT_ACCEPT_EULA=yes -e ACCEPT_EULA=Y -e ISAAC_SIM_HEADLESS=1 \ + -e PRIVACY_CONSENT=Y \ + -v "${{ github.workspace }}":/workspace/isaaclab \ + -w /workspace/isaaclab \ + --gpus all \ + isaac-lab-arm-ci:${{ github.sha }} \ + -c " + source env_isaaclab_uv/bin/activate + python -m pytest \ + source/isaaclab/test \ + --ignore=tools/conftest.py \ + --ignore=source/isaaclab/test/deps \ + -m arm_ci \ + --continue-on-collection-errors \ + --timeout=180 \ + --timeout-method=signal \ + -v \ + --junitxml=reports/determinism-arm.xml + " + + - name: Upload test reports + if: always() + uses: actions/upload-artifact@v4 + with: + name: arm-ci-reports + path: reports/ + retention-days: 7 + + - name: Clean up per-run image + if: always() + shell: bash + run: | + docker rmi -f isaac-lab-arm-ci:${{ github.sha }} || true diff --git a/docker/Dockerfile.base b/docker/Dockerfile.base index 39982b343d08..6487f144d170 100644 --- a/docker/Dockerfile.base +++ b/docker/Dockerfile.base @@ -52,11 +52,14 @@ RUN apt-get update && \ # arm64-only build deps: # - imgui-bundle has no prebuilt arm64 wheel; needs GL/X11 dev headers. 
# - swig is required for the nlopt source build (see arm64 nlopt step below). +# - libgmp-dev / libmpfr-dev / libeigen3-dev / libcgal-dev / libboost-all-dev: +# needed by pytetwild's fTetWild source build (no arm64 wheel on PyPI). RUN if [ "$(dpkg --print-architecture)" = "arm64" ]; then \ apt-get update && \ apt-get install -y --no-install-recommends \ libgl1-mesa-dev libopengl-dev libglx-dev \ libx11-dev libxcursor-dev libxi-dev libxinerama-dev libxrandr-dev \ + libgmp-dev libmpfr-dev libeigen3-dev libcgal-dev libboost-all-dev \ swig; \ fi diff --git a/source/isaaclab/changelog.d/jichuanh-arm-ci.skip b/source/isaaclab/changelog.d/jichuanh-arm-ci.skip new file mode 100644 index 000000000000..5d82dee1471b --- /dev/null +++ b/source/isaaclab/changelog.d/jichuanh-arm-ci.skip @@ -0,0 +1 @@ +Skip changelog: CI-infrastructure only (no user-facing API change). Adds .github/workflows/arm-ci.yaml carrying the ARM/Spark CI pipeline against self-hosted [self-hosted, arm64] runners. Tier 1 (smoke, install probe, Kit launch) plus Tier 2 (kitless rendering, controller determinism). All jobs use continue-on-error: true and pytest --timeout to fail fast on hangs. Tags three test_rendering_*_kitless.py files plus test_differential_ik.py / test_operational_space.py with arm_ci so the Tier 2 jobs can select them. 
diff --git a/source/isaaclab/test/controllers/test_differential_ik.py b/source/isaaclab/test/controllers/test_differential_ik.py index 2ba7af0ec028..3bfc9c3a2543 100644 --- a/source/isaaclab/test/controllers/test_differential_ik.py +++ b/source/isaaclab/test/controllers/test_differential_ik.py @@ -15,6 +15,8 @@ import pytest import torch +pytestmark = pytest.mark.arm_ci + import isaaclab.sim as sim_utils from isaaclab import cloner from isaaclab.assets import Articulation diff --git a/source/isaaclab/test/controllers/test_operational_space.py b/source/isaaclab/test/controllers/test_operational_space.py index bed0760271e7..c57611b08d34 100644 --- a/source/isaaclab/test/controllers/test_operational_space.py +++ b/source/isaaclab/test/controllers/test_operational_space.py @@ -16,6 +16,8 @@ import torch from flaky import flaky +pytestmark = pytest.mark.arm_ci + import isaaclab.envs.mdp as mdp import isaaclab.sim as sim_utils from isaaclab import cloner diff --git a/source/isaaclab_tasks/test/test_rendering_cartpole_kitless.py b/source/isaaclab_tasks/test/test_rendering_cartpole_kitless.py index 802ecfd32cfc..05ef3d4b1e4b 100644 --- a/source/isaaclab_tasks/test/test_rendering_cartpole_kitless.py +++ b/source/isaaclab_tasks/test/test_rendering_cartpole_kitless.py @@ -17,7 +17,7 @@ rendering_test_cartpole, ) -pytestmark = pytest.mark.isaacsim_ci +pytestmark = [pytest.mark.isaacsim_ci, pytest.mark.arm_ci] _COMPARISON_SCORES: list[dict] = [] diff --git a/source/isaaclab_tasks/test/test_rendering_dexsuite_kuka_kitless.py b/source/isaaclab_tasks/test/test_rendering_dexsuite_kuka_kitless.py index 15afbee806b1..f33495e6cde6 100644 --- a/source/isaaclab_tasks/test/test_rendering_dexsuite_kuka_kitless.py +++ b/source/isaaclab_tasks/test/test_rendering_dexsuite_kuka_kitless.py @@ -17,7 +17,7 @@ rendering_test_dexsuite_kuka, ) -pytestmark = pytest.mark.isaacsim_ci +pytestmark = [pytest.mark.isaacsim_ci, pytest.mark.arm_ci] _COMPARISON_SCORES: list[dict] = [] diff --git 
a/source/isaaclab_tasks/test/test_rendering_shadow_hand_kitless.py b/source/isaaclab_tasks/test/test_rendering_shadow_hand_kitless.py index 2244dcce5fab..dbda0ca54e9c 100644 --- a/source/isaaclab_tasks/test/test_rendering_shadow_hand_kitless.py +++ b/source/isaaclab_tasks/test/test_rendering_shadow_hand_kitless.py @@ -17,7 +17,7 @@ rendering_test_shadow_hand, ) -pytestmark = pytest.mark.isaacsim_ci +pytestmark = [pytest.mark.isaacsim_ci, pytest.mark.arm_ci] _COMPARISON_SCORES: list[dict] = [] From b3d7f7c481636cbbcb818f2b28a34dd4ecf6d809 Mon Sep 17 00:00:00 2001 From: jichuanh Date: Thu, 21 May 2026 05:55:18 +0000 Subject: [PATCH 03/55] Cleanups: pytetwild aarch64 + Dockerfile.arm-ci MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit - source/isaaclab/setup.py: gate pytetwild==0.2.3 with `platform_machine != 'aarch64'`. No aarch64 wheel on PyPI; source build fails because the transitive `geogram` dep hardcodes `-m64` in its CMakeLists. The single call site at sim/schemas/schemas.py already lazy-imports it with a clear "install manually" message, so aarch64 users keep everything except automatic volume-deformable tetrahedralization. - docker/Dockerfile.arm-ci: new lightweight Dockerfile that layers cmake / build-essential / git + EULA env vars + a bash entrypoint onto the multi-arch Isaac Sim base image. Replaces the previous inline `docker run + apt-get install + docker commit` chain in arm-ci.yaml. - docker/Dockerfile.base: reverted the libgmp / libmpfr / libeigen3 / libcgal / libboost additions from the arm64-conditional apt block — those were only needed by pytetwild's fTetWild build, which we no longer install on aarch64. - .github/workflows/arm-ci.yaml: build via docker/Dockerfile.arm-ci instead of the inline apt-install-and-commit pattern. Test steps no longer need to re-specify --entrypoint or EULA env vars on every docker run. 
--- .github/workflows/arm-ci.yaml | 202 +++++------------- docker/Dockerfile.arm-ci | 48 +++++ docker/Dockerfile.base | 3 - .../jichuanh-apppath-exp-fallback.rst | 11 + .../jichuanh-conftest-ci-marker.skip | 1 + .../jichuanh-pytetwild-aarch64.rst | 8 + source/isaaclab/isaaclab/app/app_launcher.py | 18 +- source/isaaclab/setup.py | 8 +- tools/conftest.py | 47 +++- 9 files changed, 182 insertions(+), 164 deletions(-) create mode 100644 docker/Dockerfile.arm-ci create mode 100644 source/isaaclab/changelog.d/jichuanh-apppath-exp-fallback.rst create mode 100644 source/isaaclab/changelog.d/jichuanh-conftest-ci-marker.skip create mode 100644 source/isaaclab/changelog.d/jichuanh-pytetwild-aarch64.rst diff --git a/.github/workflows/arm-ci.yaml b/.github/workflows/arm-ci.yaml index 514ec50c0132..f59b1f5c879e 100644 --- a/.github/workflows/arm-ci.yaml +++ b/.github/workflows/arm-ci.yaml @@ -141,93 +141,63 @@ jobs: echo "${NGC_API_KEY}" | docker login nvcr.io --username '$oauthtoken' --password-stdin fi - - name: Pull arm64 Isaac Sim image + - name: Build arm-ci docker image shell: bash run: | set -euo pipefail - # Multi-arch manifest at this tag has both linux/arm64 and linux/amd64. - # Docker on an aarch64 host auto-resolves to the arm64 variant. - docker pull --platform linux/arm64 \ - "${{ needs.config.outputs.isaacsim_image_name }}:${{ needs.config.outputs.isaacsim_image_tag }}" - - - name: Install system build deps inside the container + # Build the lightweight arm-ci image from docker/Dockerfile.arm-ci. + # That Dockerfile layers cmake/build-essential/git plus EULA env vars + # and a bash entrypoint onto the multi-arch Isaac Sim base image + # (auto-resolves to the arm64 manifest on aarch64 hosts). 
+ docker build \ + --platform linux/arm64 \ + --build-arg "ISAACSIM_BASE_IMAGE_ARG=${{ needs.config.outputs.isaacsim_image_name }}" \ + --build-arg "ISAACSIM_VERSION_ARG=${{ needs.config.outputs.isaacsim_image_tag }}" \ + -f docker/Dockerfile.arm-ci \ + -t isaac-lab-arm-ci:${{ github.sha }} \ + docker + + - name: Install isaaclab via ./isaaclab.sh -i + run all tier 1/2 tests shell: bash + timeout-minutes: 90 run: | set -euo pipefail - # pytetwild's fTetWild source build needs libgmp / libmpfr / libeigen3 / - # libcgal / libboost; isaaclab's editable install pulls pytetwild as a - # hard dep (added in PR isaac-sim/IsaacLab#5710 on 2026-05-20). - # We install into a fresh container layer per run so the apt cost (~1-2 - # min) shows up only when this workflow runs, not on the Sim image. - # Persist by committing into a per-run image tagged isaac-lab-arm-ci. - # Build a tagged image with deps baked in so subsequent test runs are fast. - # Override the Sim image's entrypoint (Kit launcher) with bash so we can - # run plain shell commands. EULA env vars set per build.yaml conventions. - docker run --name arm-deps-prep --user root \ - --entrypoint bash \ - -e OMNI_KIT_ACCEPT_EULA=yes -e ACCEPT_EULA=Y -e ISAAC_SIM_HEADLESS=1 \ - -e PRIVACY_CONSENT=Y \ - "${{ needs.config.outputs.isaacsim_image_name }}:${{ needs.config.outputs.isaacsim_image_tag }}" \ - -c " - set -euo pipefail - apt-get update - apt-get install -y --no-install-recommends \ - libgmp-dev libmpfr-dev libeigen3-dev libcgal-dev libboost-all-dev \ - cmake build-essential git - rm -rf /var/lib/apt/lists/* - " - docker commit arm-deps-prep isaac-lab-arm-ci:${{ github.sha }} - docker rm arm-deps-prep - - - name: Editable install of isaaclab + isaaclab_tasks in a uv venv - shell: bash - timeout-minutes: 25 - run: | - set -euo pipefail - # All Tier 2 jobs need both packages installed once. 
We do the install - # inside a uv venv mounted under /workspace/isaaclab so subsequent - # docker run invocations see the same env_isaaclab_uv directory. + # Single docker run because uv-managed Python lives in + # $HOME/.cache/uv/python inside the container and is discarded on + # container exit, leaving env_isaaclab_uv/bin/python as a dangling + # symlink in subsequent containers. + # + # Install via ./isaaclab.sh -i (the canonical user-facing install + # entry point) instead of hand-rolled uv pip install lines. This + # picks up _ensure_cuda_torch (re-installs cu130 torch on aarch64 + # after isaacsim downgrades it to a CPU wheel), nlopt arm prep, + # pin-pink dependency probe, etc. — same install path real users + # hit on `./isaaclab.sh -i`, so CI failures here are real user bugs. + mkdir -p reports docker run --rm --user root \ - --entrypoint bash \ - -e OMNI_KIT_ACCEPT_EULA=yes -e ACCEPT_EULA=Y -e ISAAC_SIM_HEADLESS=1 \ - -e PRIVACY_CONSENT=Y \ -v "${{ github.workspace }}":/workspace/isaaclab \ -w /workspace/isaaclab \ --gpus all \ - isaac-lab-arm-ci:${{ github.sha }} \ - -c " + isaac-lab-arm-ci:${{ github.sha }} " set -e if ! 
command -v uv >/dev/null 2>&1; then curl -LsSf https://astral.sh/uv/install.sh | sh export PATH=\$HOME/.local/bin:\$PATH fi + uv venv --python 3.12 env_isaaclab_uv source env_isaaclab_uv/bin/activate - uv pip install -e source/isaaclab - uv pip install -e source/isaaclab_assets - uv pip install -e source/isaaclab_tasks uv pip install pytest pytest-timeout - python -c 'import isaaclab, isaaclab_assets, isaaclab_tasks; print(\"editable imports ok\")' - " - - name: Tier 1 — general-arm smoke (torch + scipy) - shell: bash - continue-on-error: true - timeout-minutes: 10 - run: | - set -e - mkdir -p reports - docker run --rm --user root \ - --entrypoint bash \ - -e OMNI_KIT_ACCEPT_EULA=yes -e ACCEPT_EULA=Y -e ISAAC_SIM_HEADLESS=1 \ - -e PRIVACY_CONSENT=Y \ - -v "${{ github.workspace }}":/workspace/isaaclab \ - -w /workspace/isaaclab \ - --gpus all \ - isaac-lab-arm-ci:${{ github.sha }} \ - -c " - source env_isaaclab_uv/bin/activate - python -m pytest \ + # Use ./isaaclab.sh -i (mirrors user install path). + # -i none installs core submodules + isaacsim + restores cu130 torch on aarch64. 
+ ./isaaclab.sh -i none + python -c 'import isaaclab, isaaclab_assets, isaaclab_physx, isaaclab_tasks; print(\"editable imports ok\")' + + set +e # individual tier failures do not abort the script + + echo '::group::Tier 1 — general-arm smoke (torch + scipy)' + ./isaaclab.sh -p -m pytest \ source/isaaclab/test/deps \ --ignore=tools/conftest.py \ -m arm_ci \ @@ -236,88 +206,20 @@ jobs: --timeout-method=signal \ -v \ --junitxml=reports/general-arm.xml - " - - - name: Tier 1 — kit-launch-arm (boot Kit headless) - shell: bash - continue-on-error: true - timeout-minutes: 10 - run: | - set -e - docker run --rm --user root \ - --entrypoint bash \ - -e OMNI_KIT_ACCEPT_EULA=yes -e ACCEPT_EULA=Y -e ISAAC_SIM_HEADLESS=1 \ - -e PRIVACY_CONSENT=Y \ - -v "${{ github.workspace }}":/workspace/isaaclab \ - -w /workspace/isaaclab \ - --gpus all \ - isaac-lab-arm-ci:${{ github.sha }} \ - -c " - source env_isaaclab_uv/bin/activate - uv pip install --extra-index-url https://pypi.nvidia.com 'isaacsim[all]' - timeout 120 python - <<'EOF' - import sys - from isaaclab.app import AppLauncher - sim = AppLauncher(headless=True).app - assert sim is not None, 'AppLauncher did not return a SimulationApp' - sim.close() - sys.exit(0) - EOF - " - - - name: Tier 2 — kitless-arm (Warp + OvRTX rendering on aarch64) - shell: bash - continue-on-error: true - timeout-minutes: 30 - run: | - set -e - docker run --rm --user root \ - --entrypoint bash \ - -e OMNI_KIT_ACCEPT_EULA=yes -e ACCEPT_EULA=Y -e ISAAC_SIM_HEADLESS=1 \ - -e PRIVACY_CONSENT=Y \ - -v "${{ github.workspace }}":/workspace/isaaclab \ - -w /workspace/isaaclab \ - --gpus all \ - isaac-lab-arm-ci:${{ github.sha }} \ - -c " - source env_isaaclab_uv/bin/activate - python -m pytest \ - source/isaaclab_tasks/test \ - --ignore=tools/conftest.py \ - -m arm_ci \ - --continue-on-collection-errors \ - --timeout=300 \ - --timeout-method=signal \ - -v \ - --junitxml=reports/kitless-arm.xml - " - - - name: Tier 2 — determinism-arm (controllers / math) - 
shell: bash - continue-on-error: true - timeout-minutes: 20 - run: | - set -e - docker run --rm --user root \ - --entrypoint bash \ - -e OMNI_KIT_ACCEPT_EULA=yes -e ACCEPT_EULA=Y -e ISAAC_SIM_HEADLESS=1 \ - -e PRIVACY_CONSENT=Y \ - -v "${{ github.workspace }}":/workspace/isaaclab \ - -w /workspace/isaaclab \ - --gpus all \ - isaac-lab-arm-ci:${{ github.sha }} \ - -c " - source env_isaaclab_uv/bin/activate - python -m pytest \ - source/isaaclab/test \ - --ignore=tools/conftest.py \ - --ignore=source/isaaclab/test/deps \ - -m arm_ci \ - --continue-on-collection-errors \ - --timeout=180 \ - --timeout-method=signal \ - -v \ - --junitxml=reports/determinism-arm.xml + echo '::endgroup::' + + echo '::group::Tier 1 — kit-launch-arm (boot Kit headless)' + timeout 120 ./isaaclab.sh -p -c \"from isaaclab.app import AppLauncher; sim = AppLauncher(headless=True).app; assert sim is not None, 'AppLauncher did not return a SimulationApp'; sim.close()\" + echo '::endgroup::' + + # Tier 2 uses tools/conftest.py's subprocess-per-file orchestrator + # (CI_MARKER=arm_ci) so aarch64 Kit re-init across multiple + # AppLauncher-at-module-level test files doesn't SIGSEGV — each + # test file gets its own Python process. + echo '::group::Tier 2 — arm_ci marker discovery (subprocess-per-file)' + CI_MARKER=arm_ci python -m pytest tools -v --junitxml=reports/tier2-arm.xml + echo '::endgroup::' + true " - name: Upload test reports diff --git a/docker/Dockerfile.arm-ci b/docker/Dockerfile.arm-ci new file mode 100644 index 000000000000..99ca947462b3 --- /dev/null +++ b/docker/Dockerfile.arm-ci @@ -0,0 +1,48 @@ +# Copyright (c) 2022-2026, The Isaac Lab Project Developers (https://github.com/isaac-sim/IsaacLab/blob/main/CONTRIBUTORS.md). +# All rights reserved. +# +# SPDX-License-Identifier: BSD-3-Clause + +# Lightweight base image for ARM/Spark CI (arm-ci.yaml). 
+# +# Layers on top of the multi-arch nvcr.io/nvidian/isaac-sim image (arm64 +# manifest on aarch64 hosts) and adds: +# - cmake / build-essential / git: source-build infra for aarch64 Python +# packages that have no prebuilt aarch64 wheel (pin-pink, daqp, etc.). +# - EULA env vars + bash entrypoint so the workflow can run plain shell +# commands without re-specifying flags per `docker run`. +# +# This is intentionally NOT Dockerfile.base — that one builds a full +# isaac-lab-ci image with isaaclab pre-installed. arm-ci.yaml instead +# mounts source/ at test time and does a uv editable install, so we only +# need the apt layer here. + +ARG ISAACSIM_BASE_IMAGE_ARG=nvcr.io/nvidian/isaac-sim +ARG ISAACSIM_VERSION_ARG=latest-develop + +FROM ${ISAACSIM_BASE_IMAGE_ARG}:${ISAACSIM_VERSION_ARG} + +USER root + +RUN apt-get update && \ + apt-get install -y --no-install-recommends \ + cmake build-essential git \ + # imgui-bundle has no aarch64 wheel and source-builds need GL/X11 headers. + # swig is needed for the nlopt aarch64 source build. + libgl1-mesa-dev libopengl-dev libglx-dev \ + libx11-dev libxcursor-dev libxi-dev libxinerama-dev libxrandr-dev \ + swig && \ + apt-get -y autoremove && apt-get clean && \ + rm -rf /var/lib/apt/lists/* + +ENV ACCEPT_EULA=Y \ + OMNI_KIT_ACCEPT_EULA=yes \ + ISAAC_SIM_HEADLESS=1 \ + PRIVACY_CONSENT=Y \ + # aarch64 scipy/numpy have a known OpenMP thread-safety issue that + # requires libgomp.so.1 to be LD_PRELOAD-ed before scipy imports. Without + # it, pytest collection trips an InitError. The scipy error message + # spells out this exact fix. + LD_PRELOAD=/lib/aarch64-linux-gnu/libgomp.so.1 + +ENTRYPOINT ["/bin/bash", "-c"] diff --git a/docker/Dockerfile.base b/docker/Dockerfile.base index 6487f144d170..39982b343d08 100644 --- a/docker/Dockerfile.base +++ b/docker/Dockerfile.base @@ -52,14 +52,11 @@ RUN apt-get update && \ # arm64-only build deps: # - imgui-bundle has no prebuilt arm64 wheel; needs GL/X11 dev headers. 
# - swig is required for the nlopt source build (see arm64 nlopt step below). -# - libgmp-dev / libmpfr-dev / libeigen3-dev / libcgal-dev / libboost-all-dev: -# needed by pytetwild's fTetWild source build (no arm64 wheel on PyPI). RUN if [ "$(dpkg --print-architecture)" = "arm64" ]; then \ apt-get update && \ apt-get install -y --no-install-recommends \ libgl1-mesa-dev libopengl-dev libglx-dev \ libx11-dev libxcursor-dev libxi-dev libxinerama-dev libxrandr-dev \ - libgmp-dev libmpfr-dev libeigen3-dev libcgal-dev libboost-all-dev \ swig; \ fi diff --git a/source/isaaclab/changelog.d/jichuanh-apppath-exp-fallback.rst b/source/isaaclab/changelog.d/jichuanh-apppath-exp-fallback.rst new file mode 100644 index 000000000000..376f7f589170 --- /dev/null +++ b/source/isaaclab/changelog.d/jichuanh-apppath-exp-fallback.rst @@ -0,0 +1,11 @@ +Fixed +^^^^^ + +* Added a defensive fallback in :class:`isaaclab.app.AppLauncher` so it derives + ``EXP_PATH`` from the installed ``isaacsim`` package when the env var is not + set. ``isaacsim.bootstrap_kernel`` normally sets ``EXP_PATH`` on first import, + but the early-return path in its bootstrap (triggered under some pip install + layouts on aarch64) skips the env-var setup. Previously this caused + ``KeyError: 'EXP_PATH'`` deep inside ``_resolve_experience_file``; now + AppLauncher resolves the path from ``isaacsim.__file__`` and stores it back + into the environment so subsequent code can rely on it. diff --git a/source/isaaclab/changelog.d/jichuanh-conftest-ci-marker.skip b/source/isaaclab/changelog.d/jichuanh-conftest-ci-marker.skip new file mode 100644 index 000000000000..b189494df2ae --- /dev/null +++ b/source/isaaclab/changelog.d/jichuanh-conftest-ci-marker.skip @@ -0,0 +1 @@ +Skip changelog: CI-infrastructure only. Generalizes tools/conftest.py to read a CI_MARKER env var (defaulting to ISAACSIM_CI_SHORT=true → "isaacsim_ci" for back-compat). 
Lets cross-platform CI workflows reuse the same subprocess-per-test orchestrator with their own markers (arm_ci for ARM/Spark, windows_ci for Windows) instead of forking conftest.py per platform. diff --git a/source/isaaclab/changelog.d/jichuanh-pytetwild-aarch64.rst b/source/isaaclab/changelog.d/jichuanh-pytetwild-aarch64.rst new file mode 100644 index 000000000000..1d025bd9a8b1 --- /dev/null +++ b/source/isaaclab/changelog.d/jichuanh-pytetwild-aarch64.rst @@ -0,0 +1,8 @@ +Fixed +^^^^^ + +* Excluded ``pytetwild`` install on aarch64 platforms. The package has no aarch64 wheel on PyPI and its + source build fails (the ``geogram`` CMake dep hardcodes ``-m64``). The single call site in + :mod:`isaaclab.sim.schemas` already raises a clear "install pytetwild manually or provide a + pre-tetrahedralized UsdGeom.TetMesh" message when the lazy import fails, so aarch64 users keep + everything except automatic volume-deformable tetrahedralization. diff --git a/source/isaaclab/isaaclab/app/app_launcher.py b/source/isaaclab/isaaclab/app/app_launcher.py index a4f7a628f052..4aabf903552d 100644 --- a/source/isaaclab/isaaclab/app/app_launcher.py +++ b/source/isaaclab/isaaclab/app/app_launcher.py @@ -1036,8 +1036,22 @@ def _resolve_experience_file(self, launcher_args: dict): launcher_args.get("deterministic", AppLauncher._APPLAUNCHER_CFG_INFO["deterministic"][1]) ) - # If nothing is provided resolve the experience file based on the headless flag - kit_app_exp_path = os.environ["EXP_PATH"] + # If nothing is provided resolve the experience file based on the headless flag. + # EXP_PATH is normally set by ``isaacsim.bootstrap_kernel()`` on first import. + # If it is not set (e.g. on aarch64 where the bootstrap early-return triggered + # under certain install layouts), derive it from the installed isaacsim package. 
+ kit_app_exp_path = os.environ.get("EXP_PATH") + if not kit_app_exp_path: + try: + import isaacsim as _isaacsim_for_paths + except ImportError as e: + raise RuntimeError( + "EXP_PATH is not set and the 'isaacsim' package is not importable." + " Install Isaac Sim (`pip install isaacsim` or the binary distribution)" + " before launching AppLauncher." + ) from e + kit_app_exp_path = os.path.join(os.path.dirname(_isaacsim_for_paths.__file__), "apps") + os.environ["EXP_PATH"] = kit_app_exp_path isaaclab_app_exp_path = os.path.join(os.path.dirname(os.path.abspath(__file__)), *[".."] * 4, "apps") # For Isaac Sim 4.5 compatibility, we use the 4.5 app files in a different folder # if launcher_args.get("use_isaacsim_45", False): diff --git a/source/isaaclab/setup.py b/source/isaaclab/setup.py index 1b1e1b2efa51..2409d1b9c568 100644 --- a/source/isaaclab/setup.py +++ b/source/isaaclab/setup.py @@ -30,8 +30,12 @@ # procedural-generation "trimesh", "pyglet>=2.1.6,<3", - # tetrahedralization for deformable bodies (pinned: >=0.3 unconditionally imports pyvista at package import time) - "pytetwild==0.2.3", + # tetrahedralization for deformable bodies (pinned: >=0.3 unconditionally imports pyvista at package import time). + # Skip on aarch64: pytetwild has no aarch64 wheel on PyPI and its source build fails because the geogram CMake + # dep hardcodes -m64 (x86_64-only). The single call site (sim/schemas/schemas.py) already raises a clear + # "install pytetwild manually" error if the lazy import fails, so aarch64 users keep everything except + # automatic volume-deformable tetrahedralization. 
+ "pytetwild==0.2.3 ; platform_machine != 'aarch64'", # image processing "transformers==4.57.6", "einops", # needed for transformers, doesn't always auto-install diff --git a/tools/conftest.py b/tools/conftest.py index 15aaa2323647..4ee0dbae0639 100644 --- a/tools/conftest.py +++ b/tools/conftest.py @@ -304,7 +304,7 @@ def _capture_system_diagnostics(): return "\n\n".join(sections) -def run_individual_tests(test_files, workspace_root, isaacsim_ci): +def run_individual_tests(test_files, workspace_root, ci_marker): """Run each test file separately, ensuring one finishes before starting the next.""" failed_tests = [] test_status = {} @@ -348,9 +348,9 @@ def run_individual_tests(test_files, workspace_root, isaacsim_ci): "--tb=short", ] - if isaacsim_ci: + if ci_marker: cmd.append("-m") - cmd.append("isaacsim_ci") + cmd.append(ci_marker) cmd.append(str(test_file)) @@ -649,7 +649,13 @@ def pytest_sessionstart(session): quarantined_only = os.environ.get("TEST_QUARANTINED_ONLY", "false") == "true" curobo_only = os.environ.get("TEST_CUROBO_ONLY", "false") == "true" - isaacsim_ci = os.environ.get("ISAACSIM_CI_SHORT", "false") == "true" + # CI_MARKER env var generalizes the previous ISAACSIM_CI_SHORT=true gate so + # cross-platform jobs (ARM, Windows) can reuse this orchestrator with their + # own markers (arm_ci, windows_ci, ...). ISAACSIM_CI_SHORT=true stays + # supported as a back-compat shorthand for CI_MARKER=isaacsim_ci. + ci_marker = os.environ.get("CI_MARKER", "") + if not ci_marker and os.environ.get("ISAACSIM_CI_SHORT", "false") == "true": + ci_marker = "isaacsim_ci" # Parse include files list (comma-separated paths) include_files = set() @@ -680,6 +686,29 @@ def pytest_sessionstart(session): print(f"TEST_CUROBO_ONLY env var: '{os.environ.get('TEST_CUROBO_ONLY', 'NOT_SET')}'") print("=" * 50) + # When a CI_MARKER is set, the marker tag is treated as explicit opt-in for + # this CI scope (the same way TEST_INCLUDE_FILES works). 
Pre-scan the tree + # for files containing the marker token and pass them as include_files so + # `_collect_test_files` does not silently drop them via TESTS_TO_SKIP. + if ci_marker: + marker_token = f"pytest.mark.{ci_marker}" + marker_include_files = set() + for source_dir in source_dirs: + for root, _, files in os.walk(source_dir): + for file in files: + if not (file.startswith("test_") and file.endswith(".py")): + continue + try: + with open(os.path.join(root, file)) as f: + if marker_token in f.read(): + marker_include_files.add(file) + except OSError: + continue + if marker_include_files: + print(f"CI_MARKER={ci_marker}: marker-tagged files: {sorted(marker_include_files)}") + # Union with any explicit TEST_INCLUDE_FILES the caller passed. + include_files = include_files | marker_include_files + # Get all test files in the source directories test_files = _collect_test_files( source_dirs, @@ -690,11 +719,15 @@ def pytest_sessionstart(session): curobo_only, ) - if isaacsim_ci: + if ci_marker: + # Match both `@pytest.mark.{ci_marker}` (per-function) and + # `pytestmark = pytest.mark.{ci_marker}` / `pytestmark = [..., pytest.mark.{ci_marker}, ...]` + # (module-level) by looking for the common `pytest.mark.{ci_marker}` substring.
+ marker_token = f"pytest.mark.{ci_marker}" new_test_files = [] for test_file in test_files: with open(test_file) as f: - if "@pytest.mark.isaacsim_ci" in f.read(): + if marker_token in f.read(): new_test_files.append(test_file) test_files = new_test_files @@ -715,7 +748,7 @@ def pytest_sessionstart(session): print(f" - {test_file}") # Run all tests individually - failed_tests, test_status, xml_reports = run_individual_tests(test_files, workspace_root, isaacsim_ci) + failed_tests, test_status, xml_reports = run_individual_tests(test_files, workspace_root, ci_marker) print("failed tests:", failed_tests) From 955e28c1ece5aeab67f5580c74a6226f227a0606 Mon Sep 17 00:00:00 2001 From: jichuanh Date: Thu, 21 May 2026 07:44:58 +0000 Subject: [PATCH 04/55] TEMP: disable heavy Linux Docker + Tests while iterating ARM CI Forces run_docker_tests=false in build.yaml's changes job so all heavy test jobs skip via their existing if-gate. Saves CI runner time + cost during ARM CI iteration on PR #5698. Must be reverted before final review. --- .github/workflows/build.yaml | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/.github/workflows/build.yaml b/.github/workflows/build.yaml index 22deff079a4d..bc81f1384249 100644 --- a/.github/workflows/build.yaml +++ b/.github/workflows/build.yaml @@ -74,7 +74,10 @@ jobs: name: Detect Changes runs-on: ubuntu-latest outputs: - run_docker_tests: ${{ steps.detect.outputs.run_docker_tests }} + # TEMP (revert before final review): force run_docker_tests=false on this + # branch so all heavy Linux Docker + Tests jobs skip while we iterate ARM + # CI on PR #5698. Saves runner time + cost during the back-and-forth. 
+ run_docker_tests: 'false' steps: - id: detect env: From 75e367c8dc468e5ca68e006af0d23a095a9b1b48 Mon Sep 17 00:00:00 2001 From: jichuanh Date: Thu, 21 May 2026 07:46:38 +0000 Subject: [PATCH 05/55] arm-ci: install isaacsim and ov[ovrtx] via ./isaaclab.sh -i --- .github/workflows/arm-ci.yaml | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) diff --git a/.github/workflows/arm-ci.yaml b/.github/workflows/arm-ci.yaml index f59b1f5c879e..86a68bc50052 100644 --- a/.github/workflows/arm-ci.yaml +++ b/.github/workflows/arm-ci.yaml @@ -190,8 +190,10 @@ jobs: uv pip install pytest pytest-timeout # Use ./isaaclab.sh -i (mirrors user install path). - # -i none installs core submodules + isaacsim + restores cu130 torch on aarch64. - ./isaaclab.sh -i none + # Tokens: isaacsim (Tier 1 kit-launch + controllers need it), + # ov[ovrtx] (Tier 2 newton-ovrtx kitless rendering needs ovrtx). + # _ensure_cuda_torch fires automatically after isaacsim install. + ./isaaclab.sh -i isaacsim,'ov[ovrtx]' python -c 'import isaaclab, isaaclab_assets, isaaclab_physx, isaaclab_tasks; print(\"editable imports ok\")' set +e # individual tier failures do not abort the script From dadfe52f4678ddec58d5783195280d1b351ef9c6 Mon Sep 17 00:00:00 2001 From: jichuanh Date: Thu, 21 May 2026 07:49:37 +0000 Subject: [PATCH 06/55] =?UTF-8?q?arm-ci:=20use=20bundled=20Sim=20(symlink?= =?UTF-8?q?=20=5Fisaac=5Fsim=20=E2=86=92=20/isaac-sim),=20drop=20uv=20venv?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Mirrors Dockerfile.base's pattern: ln -sf /isaac-sim _isaac_sim so ./isaaclab.sh -p picks up the bundled Sim's python.sh instead of spinning a fresh uv venv that hid the bundled Sim and forced redundant pip install of isaacsim from pypi.nvidia.com. 
--- .github/workflows/arm-ci.yaml | 40 ++++++++++++----------------------- 1 file changed, 14 insertions(+), 26 deletions(-) diff --git a/.github/workflows/arm-ci.yaml b/.github/workflows/arm-ci.yaml index 86a68bc50052..cc859c3b615f 100644 --- a/.github/workflows/arm-ci.yaml +++ b/.github/workflows/arm-ci.yaml @@ -162,17 +162,10 @@ jobs: timeout-minutes: 90 run: | set -euo pipefail - # Single docker run because uv-managed Python lives in - # $HOME/.cache/uv/python inside the container and is discarded on - # container exit, leaving env_isaaclab_uv/bin/python as a dangling - # symlink in subsequent containers. - # - # Install via ./isaaclab.sh -i (the canonical user-facing install - # entry point) instead of hand-rolled uv pip install lines. This - # picks up _ensure_cuda_torch (re-installs cu130 torch on aarch64 - # after isaacsim downgrades it to a CPU wheel), nlopt arm prep, - # pin-pink dependency probe, etc. — same install path real users - # hit on `./isaaclab.sh -i`, so CI failures here are real user bugs. + # Mirrors Dockerfile.base's pattern: symlink _isaac_sim → /isaac-sim + # so ./isaaclab.sh finds the bundled Isaac Sim Python (no uv venv, + # no separate isaacsim pip install — the docker image already + # ships Sim). mkdir -p reports docker run --rm --user root \ -v "${{ github.workspace }}":/workspace/isaaclab \ @@ -180,21 +173,16 @@ jobs: --gpus all \ isaac-lab-arm-ci:${{ github.sha }} " set -e - if ! command -v uv >/dev/null 2>&1; then - curl -LsSf https://astral.sh/uv/install.sh | sh - export PATH=\$HOME/.local/bin:\$PATH - fi - - uv venv --python 3.12 env_isaaclab_uv - source env_isaaclab_uv/bin/activate - uv pip install pytest pytest-timeout - - # Use ./isaaclab.sh -i (mirrors user install path). - # Tokens: isaacsim (Tier 1 kit-launch + controllers need it), - # ov[ovrtx] (Tier 2 newton-ovrtx kitless rendering needs ovrtx). - # _ensure_cuda_torch fires automatically after isaacsim install. 
- ./isaaclab.sh -i isaacsim,'ov[ovrtx]' - python -c 'import isaaclab, isaaclab_assets, isaaclab_physx, isaaclab_tasks; print(\"editable imports ok\")' + ln -sf /isaac-sim _isaac_sim + + # ./isaaclab.sh detects _isaac_sim and uses its bundled python.sh. + # -i 'ov[ovrtx]' installs ov submodule + ovrtx selector on top of + # the bundled Sim; isaacsim itself stays the bundled version. + # _ensure_cuda_torch fires after install and restores cu130 torch + # if anything pulls a CPU wheel. + ./isaaclab.sh -i 'ov[ovrtx]' + ./isaaclab.sh -p -m pip install pytest pytest-timeout + ./isaaclab.sh -p -c \"import isaaclab, isaaclab_assets, isaaclab_physx, isaaclab_tasks; print('editable imports ok')\" set +e # individual tier failures do not abort the script From ef9e1996d08c2b725fbdd4dd801c2c0f9fb7bf6a Mon Sep 17 00:00:00 2001 From: jichuanh Date: Thu, 21 May 2026 08:02:09 +0000 Subject: [PATCH 07/55] arm-ci: Tier 2 invokes ./isaaclab.sh -p (bundled Sim python) --- .github/workflows/arm-ci.yaml | 6 +++++- 1 file changed, 5 insertions(+), 1 deletion(-) diff --git a/.github/workflows/arm-ci.yaml b/.github/workflows/arm-ci.yaml index cc859c3b615f..85f7d70ba90a 100644 --- a/.github/workflows/arm-ci.yaml +++ b/.github/workflows/arm-ci.yaml @@ -207,7 +207,11 @@ jobs: # AppLauncher-at-module-level test files doesn't SIGSEGV — each # test file gets its own Python process. echo '::group::Tier 2 — arm_ci marker discovery (subprocess-per-file)' - CI_MARKER=arm_ci python -m pytest tools -v --junitxml=reports/tier2-arm.xml + # ./isaaclab.sh -p wrapper invokes the bundled Sim's python.sh + # (no uv venv in this approach; bare `python` is not on PATH). + # CI_MARKER env var propagates through exec to the pytest subprocess + # → tools/conftest.py picks it up. 
+ CI_MARKER=arm_ci ./isaaclab.sh -p -m pytest tools -v --junitxml=reports/tier2-arm.xml echo '::endgroup::' true " From c64755bf6b4e1f92171ad2cf983feaab2e478b6a Mon Sep 17 00:00:00 2001 From: jichuanh Date: Thu, 21 May 2026 08:17:30 +0000 Subject: [PATCH 08/55] arm-ci: build via Dockerfile.base through ecr-build-push-pull Drops docker/Dockerfile.arm-ci and rewrites arm-ci.yaml to mirror build.yaml's structure end-to-end: build job uses the shared ecr-build-push-pull action (with platform=linux/arm64), test job pulls the same image and docker-runs pytest inside it. Dockerfile.base already has the arm64 branches for libgl/libx11/swig + nlopt source-build and installs everything via ./isaaclab.sh --install, so the dedicated Dockerfile.arm-ci was just a thinner duplicate of the same setup. --- .github/workflows/arm-ci.yaml | 143 ++++++++++++++++------------------ docker/Dockerfile.arm-ci | 48 ------------ 2 files changed, 68 insertions(+), 123 deletions(-) delete mode 100644 docker/Dockerfile.arm-ci diff --git a/.github/workflows/arm-ci.yaml b/.github/workflows/arm-ci.yaml index 85f7d70ba90a..2b72dba298e3 100644 --- a/.github/workflows/arm-ci.yaml +++ b/.github/workflows/arm-ci.yaml @@ -4,16 +4,15 @@ # SPDX-License-Identifier: BSD-3-Clause # ARM/Spark CI — exercises Isaac Lab on aarch64 Linux self-hosted runners -# (NVIDIA DGX Spark). Mirrors the spirit of build.yaml but stays lean by -# running tests inside the multi-arch nvcr.io/nvidian/isaac-sim image -# instead of building a full isaac-lab-ci image. (Once the apt deps and -# editable-install scope stabilize, we can promote to a Dockerfile.base -# build that mirrors build.yaml's structure end-to-end.) +# (NVIDIA DGX Spark). Mirrors build.yaml's structure: one job builds +# Dockerfile.base for linux/arm64 via the shared ecr-build-push-pull action +# (with ECR-backed deps caching), downstream jobs pull the same image and +# `docker run` pytest inside it. # -# Single job, multiple steps. 
Each test step sets `continue-on-error: true` -# so a failure in one tier does not abort the others. Each pytest invocation -# passes `--timeout=N --timeout-method=signal --continue-on-collection-errors` -# so a hung or import-broken test cannot consume the whole job slot. +# Each test step is wrapped with `continue-on-error: true` so a failure in +# one tier does not abort the others. Each pytest invocation passes +# `--timeout=N --timeout-method=signal --continue-on-collection-errors` so a +# hung or import-broken test cannot consume the whole job slot. # # Marker-driven discovery: `pytest -m arm_ci`. Adding a new aarch64-safe # test = tag it with arm_ci, no yaml edit. @@ -45,6 +44,9 @@ permissions: env: NGC_API_KEY: ${{ secrets.NGC_API_KEY }} + # Arch-suffixed tag so the per-commit ECR lookup, layer cache, and local + # docker tag never collide with the linux/amd64 image built by build.yaml. + CI_IMAGE_TAG: isaac-lab-ci-arm64:${{ github.event_name == 'pull_request' && format('pr-{0}', github.event.pull_request.number) || github.ref_name }}-${{ github.sha }} jobs: changes: @@ -120,73 +122,74 @@ jobs: echo "isaacsim_image_name=$name" >> "$GITHUB_OUTPUT" echo "isaacsim_image_tag=$tag" >> "$GITHUB_OUTPUT" - arm-ci: - name: arm-ci + build-arm: + name: Build Base Docker Image (arm64) runs-on: [self-hosted, arm64] needs: [changes, config] if: needs.changes.outputs.run_arm_ci == 'true' - timeout-minutes: 120 steps: - - name: Checkout + - name: Checkout Code uses: actions/checkout@v4 with: fetch-depth: 1 lfs: false - - name: Login to nvcr.io - shell: bash - run: | - set -euo pipefail - if [ -n "${NGC_API_KEY:-}" ]; then - echo "${NGC_API_KEY}" | docker login nvcr.io --username '$oauthtoken' --password-stdin - fi + - name: Build and push to ECR + uses: ./.github/actions/ecr-build-push-pull + with: + image-tag: ${{ env.CI_IMAGE_TAG }} + isaacsim-base-image: ${{ needs.config.outputs.isaacsim_image_name }} + isaacsim-version: ${{ needs.config.outputs.isaacsim_image_tag }} + 
dockerfile-path: docker/Dockerfile.base + cache-tag: cache-base-arm64 + platform: linux/arm64 - - name: Build arm-ci docker image - shell: bash - run: | - set -euo pipefail - # Build the lightweight arm-ci image from docker/Dockerfile.arm-ci. - # That Dockerfile layers cmake/build-essential/git plus EULA env vars - # and a bash entrypoint onto the multi-arch Isaac Sim base image - # (auto-resolves to the arm64 manifest on aarch64 hosts). - docker build \ - --platform linux/arm64 \ - --build-arg "ISAACSIM_BASE_IMAGE_ARG=${{ needs.config.outputs.isaacsim_image_name }}" \ - --build-arg "ISAACSIM_VERSION_ARG=${{ needs.config.outputs.isaacsim_image_tag }}" \ - -f docker/Dockerfile.arm-ci \ - -t isaac-lab-arm-ci:${{ github.sha }} \ - docker - - - name: Install isaaclab via ./isaaclab.sh -i + run all tier 1/2 tests + arm-ci: + name: arm-ci + runs-on: [self-hosted, arm64] + needs: [build-arm, config] + if: needs.build-arm.result == 'success' + timeout-minutes: 120 + steps: + - name: Checkout Code + uses: actions/checkout@v4 + with: + fetch-depth: 1 + lfs: false + + - name: Pull Base Docker image + uses: ./.github/actions/ecr-build-push-pull + with: + image-tag: ${{ env.CI_IMAGE_TAG }} + isaacsim-base-image: ${{ needs.config.outputs.isaacsim_image_name }} + isaacsim-version: ${{ needs.config.outputs.isaacsim_image_tag }} + dockerfile-path: docker/Dockerfile.base + cache-tag: cache-base-arm64 + platform: linux/arm64 + + - name: Run tier 1/2 tests shell: bash timeout-minutes: 90 run: | set -euo pipefail - # Mirrors Dockerfile.base's pattern: symlink _isaac_sim → /isaac-sim - # so ./isaaclab.sh finds the bundled Isaac Sim Python (no uv venv, - # no separate isaacsim pip install — the docker image already - # ships Sim). mkdir -p reports + # Dockerfile.base already installs isaaclab via ./isaaclab.sh --install + # and symlinks _isaac_sim → /isaac-sim, so the container is ready to + # run pytest immediately. 
We bind-mount the workspace so the test + # files reflect this PR's checkout (not the image's frozen copy). docker run --rm --user root \ -v "${{ github.workspace }}":/workspace/isaaclab \ -w /workspace/isaaclab \ --gpus all \ - isaac-lab-arm-ci:${{ github.sha }} " - set -e - ln -sf /isaac-sim _isaac_sim - - # ./isaaclab.sh detects _isaac_sim and uses its bundled python.sh. - # -i 'ov[ovrtx]' installs ov submodule + ovrtx selector on top of - # the bundled Sim; isaacsim itself stays the bundled version. - # _ensure_cuda_torch fires after install and restores cu130 torch - # if anything pulls a CPU wheel. - ./isaaclab.sh -i 'ov[ovrtx]' - ./isaaclab.sh -p -m pip install pytest pytest-timeout - ./isaaclab.sh -p -c \"import isaaclab, isaaclab_assets, isaaclab_physx, isaaclab_tasks; print('editable imports ok')\" - + -e ACCEPT_EULA=Y \ + -e OMNI_KIT_ACCEPT_EULA=yes \ + -e ISAAC_SIM_HEADLESS=1 \ + -e PRIVACY_CONSENT=Y \ + --entrypoint bash \ + "${{ env.CI_IMAGE_TAG }}" -lc ' set +e # individual tier failures do not abort the script - echo '::group::Tier 1 — general-arm smoke (torch + scipy)' + echo "::group::Tier 1 — general-arm smoke (torch + scipy)" ./isaaclab.sh -p -m pytest \ source/isaaclab/test/deps \ --ignore=tools/conftest.py \ @@ -196,25 +199,21 @@ jobs: --timeout-method=signal \ -v \ --junitxml=reports/general-arm.xml - echo '::endgroup::' + echo "::endgroup::" - echo '::group::Tier 1 — kit-launch-arm (boot Kit headless)' - timeout 120 ./isaaclab.sh -p -c \"from isaaclab.app import AppLauncher; sim = AppLauncher(headless=True).app; assert sim is not None, 'AppLauncher did not return a SimulationApp'; sim.close()\" - echo '::endgroup::' + echo "::group::Tier 1 — kit-launch-arm (boot Kit headless)" + timeout 120 ./isaaclab.sh -p -c "from isaaclab.app import AppLauncher; sim = AppLauncher(headless=True).app; assert sim is not None, \"AppLauncher did not return a SimulationApp\"; sim.close()" + echo "::endgroup::" - # Tier 2 uses tools/conftest.py's 
subprocess-per-file orchestrator + # Tier 2 uses tools/conftest.py subprocess-per-file orchestrator # (CI_MARKER=arm_ci) so aarch64 Kit re-init across multiple - # AppLauncher-at-module-level test files doesn't SIGSEGV — each - # test file gets its own Python process. - echo '::group::Tier 2 — arm_ci marker discovery (subprocess-per-file)' - # ./isaaclab.sh -p wrapper invokes the bundled Sim's python.sh - # (no uv venv in this approach; bare `python` is not on PATH). - # CI_MARKER env var propagates through exec to the pytest subprocess - # → tools/conftest.py picks it up. + # AppLauncher-at-module-level test files cannot SIGSEGV — each + # test file gets its own python process. + echo "::group::Tier 2 — arm_ci marker discovery (subprocess-per-file)" CI_MARKER=arm_ci ./isaaclab.sh -p -m pytest tools -v --junitxml=reports/tier2-arm.xml - echo '::endgroup::' + echo "::endgroup::" true - " + ' - name: Upload test reports if: always() @@ -223,9 +222,3 @@ jobs: name: arm-ci-reports path: reports/ retention-days: 7 - - - name: Clean up per-run image - if: always() - shell: bash - run: | - docker rmi -f isaac-lab-arm-ci:${{ github.sha }} || true diff --git a/docker/Dockerfile.arm-ci b/docker/Dockerfile.arm-ci deleted file mode 100644 index 99ca947462b3..000000000000 --- a/docker/Dockerfile.arm-ci +++ /dev/null @@ -1,48 +0,0 @@ -# Copyright (c) 2022-2026, The Isaac Lab Project Developers (https://github.com/isaac-sim/IsaacLab/blob/main/CONTRIBUTORS.md). -# All rights reserved. -# -# SPDX-License-Identifier: BSD-3-Clause - -# Lightweight base image for ARM/Spark CI (arm-ci.yaml). -# -# Layers on top of the multi-arch nvcr.io/nvidian/isaac-sim image (arm64 -# manifest on aarch64 hosts) and adds: -# - cmake / build-essential / git: source-build infra for aarch64 Python -# packages that have no prebuilt aarch64 wheel (pin-pink, daqp, etc.). -# - EULA env vars + bash entrypoint so the workflow can run plain shell -# commands without re-specifying flags per `docker run`. 
-# -# This is intentionally NOT Dockerfile.base — that one builds a full -# isaac-lab-ci image with isaaclab pre-installed. arm-ci.yaml instead -# mounts source/ at test time and does a uv editable install, so we only -# need the apt layer here. - -ARG ISAACSIM_BASE_IMAGE_ARG=nvcr.io/nvidian/isaac-sim -ARG ISAACSIM_VERSION_ARG=latest-develop - -FROM ${ISAACSIM_BASE_IMAGE_ARG}:${ISAACSIM_VERSION_ARG} - -USER root - -RUN apt-get update && \ - apt-get install -y --no-install-recommends \ - cmake build-essential git \ - # imgui-bundle has no aarch64 wheel and source-builds need GL/X11 headers. - # swig is needed for the nlopt aarch64 source build. - libgl1-mesa-dev libopengl-dev libglx-dev \ - libx11-dev libxcursor-dev libxi-dev libxinerama-dev libxrandr-dev \ - swig && \ - apt-get -y autoremove && apt-get clean && \ - rm -rf /var/lib/apt/lists/* - -ENV ACCEPT_EULA=Y \ - OMNI_KIT_ACCEPT_EULA=yes \ - ISAAC_SIM_HEADLESS=1 \ - PRIVACY_CONSENT=Y \ - # aarch64 scipy/numpy have a known OpenMP thread-safety issue that - # requires libgomp.so.1 to be LD_PRELOAD-ed before scipy imports. Without - # it, pytest collection trips an InitError. The scipy error message - # spells out this exact fix. - LD_PRELOAD=/lib/aarch64-linux-gnu/libgomp.so.1 - -ENTRYPOINT ["/bin/bash", "-c"] From 1dc5ac2d7c3d16087235005c2d48e2c1da519e26 Mon Sep 17 00:00:00 2001 From: jichuanh Date: Thu, 21 May 2026 08:46:01 +0000 Subject: [PATCH 09/55] arm-ci: re-create _isaac_sim symlink on bind-mounted workspace MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit The workspace bind-mount overlays Dockerfile.base's pre-built _isaac_sim → /isaac-sim symlink, so ./isaaclab.sh -p falls back to python3 (not in PATH) and exits 'exec: python3: not found' before any tier runs. Re-create the symlink at the top of the docker-run script so the wrapper can find Sim's python.sh. 
--- .github/workflows/arm-ci.yaml | 7 +++++++ 1 file changed, 7 insertions(+) diff --git a/.github/workflows/arm-ci.yaml b/.github/workflows/arm-ci.yaml index 2b72dba298e3..08c0fc021b51 100644 --- a/.github/workflows/arm-ci.yaml +++ b/.github/workflows/arm-ci.yaml @@ -187,6 +187,13 @@ jobs: -e PRIVACY_CONSENT=Y \ --entrypoint bash \ "${{ env.CI_IMAGE_TAG }}" -lc ' + # Dockerfile.base bakes _isaac_sim → /isaac-sim into the image, but + # the workspace bind-mount overlays it on the host checkout, so + # ./isaaclab.sh -p cannot find Sim'"'"'s python.sh. Re-create the + # symlink on the bind-mounted tree; actions/checkout clean:true + # default wipes it before the next run. + ln -sf /isaac-sim _isaac_sim + set +e # individual tier failures do not abort the script echo "::group::Tier 1 — general-arm smoke (torch + scipy)" From 16db25efa967b5e164fe82e37fa13a9edee25e48 Mon Sep 17 00:00:00 2001 From: jichuanh Date: Thu, 21 May 2026 08:49:30 +0000 Subject: [PATCH 10/55] arm-ci: run tests via shared .github/actions/run-tests Replaces the hand-rolled docker-run block with a single run-tests action call (test-path=tools, ci-marker=arm_ci) so arm-ci goes through the exact same pytest entry path as Linux's daily-compatibility and per-PR test jobs: subprocess-per-file orchestrator, JUnit report copy, comparison-image upload, cancellation-safe container teardown, host-uid volume-mount. Adds a 'ci-marker' input to .github/actions/run-tests that forwards a CI_MARKER env var into the container; tools/conftest.py already reads it (ISAACSIM_CI_SHORT=true keeps working as a back-compat shorthand for CI_MARKER=isaacsim_ci) and pre-scans the source tree for files tagged with the matching pytest marker. 
--- .github/actions/run-tests/action.yml | 12 ++++- .github/workflows/arm-ci.yaml | 76 +++++++++------------------- 2 files changed, 35 insertions(+), 53 deletions(-) diff --git a/.github/actions/run-tests/action.yml b/.github/actions/run-tests/action.yml index 844dc07b8209..d03803932fc6 100644 --- a/.github/actions/run-tests/action.yml +++ b/.github/actions/run-tests/action.yml @@ -59,6 +59,10 @@ inputs: description: 'Space-separated pip packages to install inside the Docker container before pytest starts' default: '' required: false + ci-marker: + description: 'CI_MARKER value forwarded to the container (read by tools/conftest.py to select test files by pytest marker)' + default: '' + required: false runs: using: composite @@ -82,6 +86,7 @@ runs: local shard_count="${12}" local volume_mount_source="${13}" local extra_pip_packages="${14}" + local ci_marker="${15}" local logs_pid="" local wait_pid="" local docker_wait_file="/tmp/.docker_exit_${container_name}" @@ -182,6 +187,11 @@ runs: docker_env_vars="$docker_env_vars -e TEST_EXTRA_PIP_PACKAGES" fi + if [ -n "$ci_marker" ]; then + docker_env_vars="$docker_env_vars -e CI_MARKER=$ci_marker" + echo "Setting CI_MARKER=$ci_marker" + fi + # Volume mount for deps-cache-hit mode: bind-mount the checked-out # source code over /workspace/isaaclab instead of baking it into the image. 
docker_volume_args="" @@ -369,7 +379,7 @@ runs: } # Call the function with provided parameters - run_tests "${{ inputs.test-path }}" "${{ inputs.result-file }}" "${{ inputs.container-name }}" "${{ inputs.image-tag }}" "${{ inputs.reports-dir }}" "${{ inputs.pytest-options }}" "${{ inputs.filter-pattern }}" "${{ inputs.curobo-only }}" "${{ inputs.include-files }}" "${{ inputs.quarantined-only }}" "${{ inputs.shard-index }}" "${{ inputs.shard-count }}" "${{ inputs.volume-mount-source }}" "${{ inputs.extra-pip-packages }}" + run_tests "${{ inputs.test-path }}" "${{ inputs.result-file }}" "${{ inputs.container-name }}" "${{ inputs.image-tag }}" "${{ inputs.reports-dir }}" "${{ inputs.pytest-options }}" "${{ inputs.filter-pattern }}" "${{ inputs.curobo-only }}" "${{ inputs.include-files }}" "${{ inputs.quarantined-only }}" "${{ inputs.shard-index }}" "${{ inputs.shard-count }}" "${{ inputs.volume-mount-source }}" "${{ inputs.extra-pip-packages }}" "${{ inputs.ci-marker }}" - name: Kill container on cancellation if: cancelled() diff --git a/.github/workflows/arm-ci.yaml b/.github/workflows/arm-ci.yaml index 08c0fc021b51..fb2d24336c92 100644 --- a/.github/workflows/arm-ci.yaml +++ b/.github/workflows/arm-ci.yaml @@ -167,60 +167,32 @@ jobs: cache-tag: cache-base-arm64 platform: linux/arm64 - - name: Run tier 1/2 tests + # tools/conftest.py's subprocess-per-file orchestrator picks up + # CI_MARKER=arm_ci and spawns one python subprocess per test file + # tagged `pytest.mark.arm_ci`. Each Kit launch is isolated, so the + # AppLauncher-at-module-level pattern cannot SIGSEGV across files. 
+ - name: Run arm_ci marker tests + uses: ./.github/actions/run-tests + with: + test-path: "tools" + result-file: "arm-ci-report.xml" + container-name: "isaac-lab-arm-ci-${{ github.run_id }}-${{ github.run_attempt }}" + image-tag: ${{ env.CI_IMAGE_TAG }} + pytest-options: "" + ci-marker: "arm_ci" + volume-mount-source: ${{ github.workspace }} + + - name: Check Test Results + if: always() shell: bash - timeout-minutes: 90 run: | - set -euo pipefail - mkdir -p reports - # Dockerfile.base already installs isaaclab via ./isaaclab.sh --install - # and symlinks _isaac_sim → /isaac-sim, so the container is ready to - # run pytest immediately. We bind-mount the workspace so the test - # files reflect this PR's checkout (not the image's frozen copy). - docker run --rm --user root \ - -v "${{ github.workspace }}":/workspace/isaaclab \ - -w /workspace/isaaclab \ - --gpus all \ - -e ACCEPT_EULA=Y \ - -e OMNI_KIT_ACCEPT_EULA=yes \ - -e ISAAC_SIM_HEADLESS=1 \ - -e PRIVACY_CONSENT=Y \ - --entrypoint bash \ - "${{ env.CI_IMAGE_TAG }}" -lc ' - # Dockerfile.base bakes _isaac_sim → /isaac-sim into the image, but - # the workspace bind-mount overlays it on the host checkout, so - # ./isaaclab.sh -p cannot find Sim'"'"'s python.sh. Re-create the - # symlink on the bind-mounted tree; actions/checkout clean:true - # default wipes it before the next run. 
- ln -sf /isaac-sim _isaac_sim - - set +e # individual tier failures do not abort the script - - echo "::group::Tier 1 — general-arm smoke (torch + scipy)" - ./isaaclab.sh -p -m pytest \ - source/isaaclab/test/deps \ - --ignore=tools/conftest.py \ - -m arm_ci \ - --continue-on-collection-errors \ - --timeout=60 \ - --timeout-method=signal \ - -v \ - --junitxml=reports/general-arm.xml - echo "::endgroup::" - - echo "::group::Tier 1 — kit-launch-arm (boot Kit headless)" - timeout 120 ./isaaclab.sh -p -c "from isaaclab.app import AppLauncher; sim = AppLauncher(headless=True).app; assert sim is not None, \"AppLauncher did not return a SimulationApp\"; sim.close()" - echo "::endgroup::" - - # Tier 2 uses tools/conftest.py subprocess-per-file orchestrator - # (CI_MARKER=arm_ci) so aarch64 Kit re-init across multiple - # AppLauncher-at-module-level test files cannot SIGSEGV — each - # test file gets its own python process. - echo "::group::Tier 2 — arm_ci marker discovery (subprocess-per-file)" - CI_MARKER=arm_ci ./isaaclab.sh -p -m pytest tools -v --junitxml=reports/tier2-arm.xml - echo "::endgroup::" - true - ' + if [ -f "reports/arm-ci-report.xml" ]; then + if grep -qE 'failures="[1-9][0-9]*"' reports/arm-ci-report.xml || grep -qE 'errors="[1-9][0-9]*"' reports/arm-ci-report.xml; then + echo "::warning::ARM CI tests reported failures/errors (see job summary)." + # Don't fail the job while arm_ci is opt-in — surfaces a warning + # but lets known aarch64 gaps land alongside the green tiers. + fi + fi - name: Upload test reports if: always() From d849323b892bf558d3f325bcb880b994c58a2d72 Mon Sep 17 00:00:00 2001 From: jichuanh Date: Thu, 21 May 2026 09:29:49 +0000 Subject: [PATCH 11/55] arm-ci: enable LFS checkout + skip ovrtx renderer on aarch64 Two arm-ci failures had distinct root causes: 1. 
lfs:false on the workflow's actions/checkout dropped the LFS-tracked golden-image PNGs to plain pointer text, so PIL failed 'cannot identify image file' on every renderer=newton_renderer case that had a real comparison image. Mirror build.yaml's lfs:true. 2. The make_require_ovrtx_install_fixture autouse helper hard-failed when ov[ovrtx] wasn't importable, but ovrtx ships no aarch64 wheel today. The fixture now pytest.skip()s on aarch64 (still fails with install guidance on x86), so renderer=ovrtx_renderer parametrize cases become unreachable-but-skipped on aarch64 instead of red. --- .github/workflows/arm-ci.yaml | 16 ++-------------- .../changelog.d/jichuanh-ovrtx-skip-aarch64.rst | 9 +++++++++ .../isaaclab_tasks/test/rendering_test_utils.py | 3 +++ 3 files changed, 14 insertions(+), 14 deletions(-) create mode 100644 source/isaaclab_tasks/changelog.d/jichuanh-ovrtx-skip-aarch64.rst diff --git a/.github/workflows/arm-ci.yaml b/.github/workflows/arm-ci.yaml index fb2d24336c92..2ac657e95541 100644 --- a/.github/workflows/arm-ci.yaml +++ b/.github/workflows/arm-ci.yaml @@ -132,7 +132,7 @@ jobs: uses: actions/checkout@v4 with: fetch-depth: 1 - lfs: false + lfs: true - name: Build and push to ECR uses: ./.github/actions/ecr-build-push-pull @@ -155,7 +155,7 @@ jobs: uses: actions/checkout@v4 with: fetch-depth: 1 - lfs: false + lfs: true - name: Pull Base Docker image uses: ./.github/actions/ecr-build-push-pull @@ -182,18 +182,6 @@ jobs: ci-marker: "arm_ci" volume-mount-source: ${{ github.workspace }} - - name: Check Test Results - if: always() - shell: bash - run: | - if [ -f "reports/arm-ci-report.xml" ]; then - if grep -qE 'failures="[1-9][0-9]*"' reports/arm-ci-report.xml || grep -qE 'errors="[1-9][0-9]*"' reports/arm-ci-report.xml; then - echo "::warning::ARM CI tests reported failures/errors (see job summary)." - # Don't fail the job while arm_ci is opt-in — surfaces a warning - # but lets known aarch64 gaps land alongside the green tiers. 
- fi - fi - - name: Upload test reports if: always() uses: actions/upload-artifact@v4 diff --git a/source/isaaclab_tasks/changelog.d/jichuanh-ovrtx-skip-aarch64.rst b/source/isaaclab_tasks/changelog.d/jichuanh-ovrtx-skip-aarch64.rst new file mode 100644 index 000000000000..ad5fe3d0098f --- /dev/null +++ b/source/isaaclab_tasks/changelog.d/jichuanh-ovrtx-skip-aarch64.rst @@ -0,0 +1,9 @@ +Fixed +^^^^^ + +* Changed the ``ovrtx`` autouse guard in the kitless rendering tests to skip + rather than fail on aarch64 when the ``ov[ovrtx]`` optional dependency is + unavailable. The ``ovrtx`` wheel is published only for x86_64, so on aarch64 + this gate was turning unreachable parametrize cases into hard failures; x86 + environments without ``ov[ovrtx]`` still see the original "install with + ``./isaaclab.sh -i 'ov[ovrtx]'``" failure with install guidance. diff --git a/source/isaaclab_tasks/test/rendering_test_utils.py b/source/isaaclab_tasks/test/rendering_test_utils.py index d4fbf368ea7c..e4af95e51004 100644 --- a/source/isaaclab_tasks/test/rendering_test_utils.py +++ b/source/isaaclab_tasks/test/rendering_test_utils.py @@ -6,6 +6,7 @@ """Shared helpers for rendering correctness tests.""" import os +import platform from datetime import datetime from typing import Any @@ -495,6 +496,8 @@ def _require_ovrtx_install(request): print(f"ovrtx version: {ovrtx.__version__}") except ImportError as exc: + if platform.machine() == "aarch64": + pytest.skip("OVRTX has no aarch64 wheel; skipping renderer=ovrtx_renderer on this platform.") pytest.fail( "Kitless OVRTX rendering tests require the optional dependency ov[ovrtx]. 
" "Install with: ./isaaclab.sh -i 'ov[ovrtx]'\n" From eb89153a22adaace6bb8704cb298e7fa6c8c650b Mon Sep 17 00:00:00 2001 From: jichuanh Date: Thu, 21 May 2026 10:07:48 +0000 Subject: [PATCH 12/55] Revert TEMP-disable: re-enable Linux Docker + Tests on this branch Restores the changes job to forward steps.detect.outputs.run_docker_tests so the heavy Linux test jobs run again. ARM CI on commit d849323b89 is green end-to-end (see run 26217691562: deps 7/7, controllers 20/20, rendering 6/27 passed + 21 skipped on aarch64-unsupported ovrtx). --- .github/workflows/build.yaml | 5 +---- 1 file changed, 1 insertion(+), 4 deletions(-) diff --git a/.github/workflows/build.yaml b/.github/workflows/build.yaml index bc81f1384249..22deff079a4d 100644 --- a/.github/workflows/build.yaml +++ b/.github/workflows/build.yaml @@ -74,10 +74,7 @@ jobs: name: Detect Changes runs-on: ubuntu-latest outputs: - # TEMP (revert before final review): force run_docker_tests=false on this - # branch so all heavy Linux Docker + Tests jobs skip while we iterate ARM - # CI on PR #5698. Saves runner time + cost during the back-and-forth. - run_docker_tests: 'false' + run_docker_tests: ${{ steps.detect.outputs.run_docker_tests }} steps: - id: detect env: From d55e64a6216de5bf8aa042cb35a247aac6558945 Mon Sep 17 00:00:00 2001 From: jichuanh Date: Thu, 21 May 2026 18:55:56 +0000 Subject: [PATCH 13/55] Add cartpole training smoke (state + perception) for arm_ci MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Two-iteration rsl_rl PPO smoke covering both observation shapes — pure state on Isaac-Cartpole-Direct-v0 and RGB tiled camera on Isaac-Cartpole-RGB-Camera-Direct-v0. Tagged pytest.mark.arm_ci so the ARM/Spark CI workflow picks them up via the CI_MARKER orchestrator. 
Mirrors the subprocess-based pattern in test_train_scripts_deterministic.py but drops the tensorboard reward parsing — exit code 0 is sufficient signal for 'training pipeline plumbed correctly end-to-end'. --- .../jichuanh-cartpole-train-smoke.skip | 1 + .../test/test_cartpole_training_smoke.py | 70 +++++++++++++++++++ 2 files changed, 71 insertions(+) create mode 100644 source/isaaclab_tasks/changelog.d/jichuanh-cartpole-train-smoke.skip create mode 100644 source/isaaclab_tasks/test/test_cartpole_training_smoke.py diff --git a/source/isaaclab_tasks/changelog.d/jichuanh-cartpole-train-smoke.skip b/source/isaaclab_tasks/changelog.d/jichuanh-cartpole-train-smoke.skip new file mode 100644 index 000000000000..303fdd5708e5 --- /dev/null +++ b/source/isaaclab_tasks/changelog.d/jichuanh-cartpole-train-smoke.skip @@ -0,0 +1 @@ +Skip changelog: CI/test-infrastructure only. Adds source/isaaclab_tasks/test/test_cartpole_training_smoke.py covering two-iter rsl_rl PPO training on Isaac-Cartpole-Direct-v0 (state) and Isaac-Cartpole-RGB-Camera-Direct-v0 (perception), tagged pytest.mark.arm_ci so the ARM/Spark workflow picks them up via the CI_MARKER orchestrator. Mirrors the subprocess pattern from test_train_scripts_deterministic.py with --max_iterations=2 --num_envs=16. diff --git a/source/isaaclab_tasks/test/test_cartpole_training_smoke.py b/source/isaaclab_tasks/test/test_cartpole_training_smoke.py new file mode 100644 index 000000000000..6f37f24301c8 --- /dev/null +++ b/source/isaaclab_tasks/test/test_cartpole_training_smoke.py @@ -0,0 +1,70 @@ +# Copyright (c) 2022-2026, The Isaac Lab Project Developers (https://github.com/isaac-sim/IsaacLab/blob/main/CONTRIBUTORS.md). +# All rights reserved. +# +# SPDX-License-Identifier: BSD-3-Clause + +"""Minimal end-to-end training smoke for cartpole. + +Two cases — state-only and perception (RGB tiled camera) — each spawn +``scripts/reinforcement_learning/rsl_rl/train.py`` for two PPO iterations on a +small env count. 
They validate the full pipeline (``./isaaclab.sh`` wrapper, +gym registration, env build, RL wrapper, optimizer step, checkpoint write) +without the cost of a real training run, so the orchestrator can include them +in every CI shape (Linux, ARM/Spark). +""" + +from __future__ import annotations + +import subprocess +from pathlib import Path + +import pytest + +pytestmark = pytest.mark.arm_ci + +_REPO_ROOT = Path(__file__).resolve().parents[4] +_TRAIN_SCRIPT = "scripts/reinforcement_learning/rsl_rl/train.py" + + +def _run_train(task_name: str, extra_args: list[str] | None = None, timeout: int = 600) -> None: + """Spawn the rsl_rl trainer for two iterations and assert it exits cleanly.""" + cmd = [ + "./isaaclab.sh", + "-p", + _TRAIN_SCRIPT, + "--task", + task_name, + "--headless", + "--num_envs", + "16", + "--max_iterations", + "2", + "--seed", + "42", + ] + if extra_args: + cmd.extend(extra_args) + + result = subprocess.run( + cmd, + cwd=_REPO_ROOT, + text=True, + capture_output=True, + timeout=timeout, + check=False, + ) + assert result.returncode == 0, ( + f"Training command failed for {task_name}: {' '.join(cmd)}\n" + f"--- stdout (tail) ---\n{result.stdout[-4000:]}\n" + f"--- stderr (tail) ---\n{result.stderr[-4000:]}\n" + ) + + +def test_train_cartpole_state(): + """State-observation cartpole trains for two PPO iterations without errors.""" + _run_train("Isaac-Cartpole-Direct-v0") + + +def test_train_cartpole_perception(): + """RGB-camera cartpole trains for two PPO iterations without errors.""" + _run_train("Isaac-Cartpole-RGB-Camera-Direct-v0", extra_args=["--enable_cameras"]) From 45e4143175aaebaae35f1dd5253694c84287704a Mon Sep 17 00:00:00 2001 From: jichuanh Date: Thu, 21 May 2026 18:57:35 +0000 Subject: [PATCH 14/55] TEMP: disable heavy Linux Docker + Tests while iterating cartpole smoke Forces run_docker_tests=false in build.yaml's changes job so all gated test jobs skip via their existing if-gate. Must be reverted before final review. 
--- .github/workflows/build.yaml | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/.github/workflows/build.yaml b/.github/workflows/build.yaml index 22deff079a4d..28a92a061f30 100644 --- a/.github/workflows/build.yaml +++ b/.github/workflows/build.yaml @@ -74,7 +74,10 @@ jobs: name: Detect Changes runs-on: ubuntu-latest outputs: - run_docker_tests: ${{ steps.detect.outputs.run_docker_tests }} + # TEMP (revert before final review): force run_docker_tests=false while + # iterating the cartpole training-smoke addition on ARM CI. Saves runner + # time + cost during the back-and-forth. + run_docker_tests: 'false' steps: - id: detect env: From 609d97ed7ed6f20ae7c4113f9f3410a4455aef65 Mon Sep 17 00:00:00 2001 From: jichuanh Date: Thu, 21 May 2026 19:33:28 +0000 Subject: [PATCH 15/55] =?UTF-8?q?cartpole=20train=20smoke:=20fix=20=5FREPO?= =?UTF-8?q?=5FROOT=20off-by-one=20(parents[4]=E2=86=92[3])?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit source/isaaclab_tasks/test/test_cartpole_training_smoke.py is 3 levels under the repo root (source / isaaclab_tasks / test / file). parents[4] resolved one above the repo root, where ./isaaclab.sh does not exist, so both test functions died with FileNotFoundError before launching train.py. 
--- source/isaaclab_tasks/test/test_cartpole_training_smoke.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/source/isaaclab_tasks/test/test_cartpole_training_smoke.py b/source/isaaclab_tasks/test/test_cartpole_training_smoke.py index 6f37f24301c8..f21d2294f64c 100644 --- a/source/isaaclab_tasks/test/test_cartpole_training_smoke.py +++ b/source/isaaclab_tasks/test/test_cartpole_training_smoke.py @@ -22,7 +22,7 @@ pytestmark = pytest.mark.arm_ci -_REPO_ROOT = Path(__file__).resolve().parents[4] +_REPO_ROOT = Path(__file__).resolve().parents[3] _TRAIN_SCRIPT = "scripts/reinforcement_learning/rsl_rl/train.py" From 5f0dc9a4529799f391c473a8f7663e6c9aa64c25 Mon Sep 17 00:00:00 2001 From: jichuanh Date: Thu, 21 May 2026 20:07:23 +0000 Subject: [PATCH 16/55] cartpole train smoke: use rl_games for the perception case MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Isaac-Cartpole-RGB-Camera-Direct-v0 only registers rl_games_cfg_entry_point — the rsl_rl invocation raised 'ValueError: Could not find configuration for the environment ... gym registry has the entry point rsl_rl_cfg_entry_point. Existing RL library config entry points: rl_games, skrl'. Switch the perception test to rl_games's train.py (same --task / --num_envs / --max_iterations / --seed CLI). State case stays on rsl_rl + Isaac-Cartpole-Direct-v0 where the rsl_rl PPO config exists. --- .../test/test_cartpole_training_smoke.py | 35 +++++++++++-------- 1 file changed, 21 insertions(+), 14 deletions(-) diff --git a/source/isaaclab_tasks/test/test_cartpole_training_smoke.py b/source/isaaclab_tasks/test/test_cartpole_training_smoke.py index f21d2294f64c..71e541cc5545 100644 --- a/source/isaaclab_tasks/test/test_cartpole_training_smoke.py +++ b/source/isaaclab_tasks/test/test_cartpole_training_smoke.py @@ -5,12 +5,16 @@ """Minimal end-to-end training smoke for cartpole. 
-Two cases — state-only and perception (RGB tiled camera) — each spawn -``scripts/reinforcement_learning/rsl_rl/train.py`` for two PPO iterations on a -small env count. They validate the full pipeline (``./isaaclab.sh`` wrapper, -gym registration, env build, RL wrapper, optimizer step, checkpoint write) -without the cost of a real training run, so the orchestrator can include them -in every CI shape (Linux, ARM/Spark). +Two cases — state-only and perception (RGB tiled camera) — each spawn a +``scripts/reinforcement_learning/<library>/train.py`` for two PPO iterations +on a small env count. They validate the full pipeline (``./isaaclab.sh`` +wrapper, gym registration, env build, RL wrapper, optimizer step, checkpoint +write) without the cost of a real training run, so the orchestrator can +include them in every CI shape (Linux, ARM/Spark). + +The state case uses rsl_rl (matches Isaac-Cartpole-Direct-v0's registered +config entry); the perception case uses rl_games because the camera-variant +direct envs only register ``rl_games_cfg_entry_point``. 
""" from __future__ import annotations @@ -23,15 +27,14 @@ pytestmark = pytest.mark.arm_ci _REPO_ROOT = Path(__file__).resolve().parents[3] -_TRAIN_SCRIPT = "scripts/reinforcement_learning/rsl_rl/train.py" -def _run_train(task_name: str, extra_args: list[str] | None = None, timeout: int = 600) -> None: - """Spawn the rsl_rl trainer for two iterations and assert it exits cleanly.""" +def _run_train(train_script: str, task_name: str, extra_args: list[str] | None = None, timeout: int = 600) -> None: + """Spawn a trainer for two iterations and assert it exits cleanly.""" cmd = [ "./isaaclab.sh", "-p", - _TRAIN_SCRIPT, + train_script, "--task", task_name, "--headless", @@ -61,10 +64,14 @@ def _run_train(task_name: str, extra_args: list[str] | None = None, timeout: int def test_train_cartpole_state(): - """State-observation cartpole trains for two PPO iterations without errors.""" - _run_train("Isaac-Cartpole-Direct-v0") + """State-observation cartpole trains for two rsl_rl PPO iterations without errors.""" + _run_train("scripts/reinforcement_learning/rsl_rl/train.py", "Isaac-Cartpole-Direct-v0") def test_train_cartpole_perception(): - """RGB-camera cartpole trains for two PPO iterations without errors.""" - _run_train("Isaac-Cartpole-RGB-Camera-Direct-v0", extra_args=["--enable_cameras"]) + """RGB-camera cartpole trains for two rl_games PPO iterations without errors.""" + _run_train( + "scripts/reinforcement_learning/rl_games/train.py", + "Isaac-Cartpole-RGB-Camera-Direct-v0", + extra_args=["--enable_cameras"], + ) From c859044a53c0449a86210e18fc42d3f4c1794fa2 Mon Sep 17 00:00:00 2001 From: jichuanh Date: Sat, 23 May 2026 06:21:01 +0000 Subject: [PATCH 17/55] Short-circuit ecr-build-push-pull when image is local Add an unconditional `docker image inspect` check as the first step of the action. 
When the requested image-tag is already in the local docker store (built by an earlier job on the same self-hosted runner), skip ECR resolution, the exact/deps-cache lookups, build, and push. Why: image tags are commit-pinned (CI_IMAGE_TAG = :), so any Dockerfile / setup.py / pyproject.toml / config.yaml change moves the SHA and tag together. A local hit means the cached image was built from identical inputs and is safe to reuse. Same staleness behavior as the existing ECR-exact-pull path. Impact: - arm64 runners (no ECR cache configured): when arm-ci lands on the same host as build-arm, skips a 13-min rebuild that today only happens because the action's local-image check was nested inside the ECR-exact-pull step and skipped whenever ECR was unreachable. - linux runners with warm caches: skips the ~30s ECR pull on test jobs scheduled to the same host as a prior job. --- .../actions/ecr-build-push-pull/action.yml | 26 ++++++++++++++++++- .../changelog.d/jichuanh-ecr-local-cache.skip | 1 + 2 files changed, 26 insertions(+), 1 deletion(-) create mode 100644 source/isaaclab/changelog.d/jichuanh-ecr-local-cache.skip diff --git a/.github/actions/ecr-build-push-pull/action.yml b/.github/actions/ecr-build-push-pull/action.yml index 11e26a3c1833..f65ca98d8237 100644 --- a/.github/actions/ecr-build-push-pull/action.yml +++ b/.github/actions/ecr-build-push-pull/action.yml @@ -72,6 +72,26 @@ runs: echo "🟠 NGC_API_KEY not set - skipping nvcr.io login (normal for fork PRs)" fi + ##### 1.5: Local-image short-circuit ##### + + # Fast path: if the exact image-tag is already in the local docker store + # (built by an earlier job on this same self-hosted runner), skip ECR + # resolution, manifest checks, deps-cache lookup, build, and push entirely. + # Image tags are commit-pinned (CI_IMAGE_TAG in build.yaml / arm-ci.yaml), + # so a local hit is safe: every code, Dockerfile, dep, or config.yaml + # change produces a new SHA and a new tag. 
+ + - name: Check image locally + id: local + shell: bash + run: | + if docker image inspect "${{ inputs.image-tag }}" >/dev/null 2>&1; then + echo "🟢 Image already in local docker store: ${{ inputs.image-tag }}" + echo "hit=true" >> "$GITHUB_OUTPUT" + else + echo "🔵 Image not present locally, will try ECR / build paths" + fi + ##### 2: Resolve ECR URL ##### # Tries: explicit input >> ECR_CACHE_URL env var >> SSM parameter on EC2. @@ -79,6 +99,7 @@ runs: - name: Resolve ECR URL id: resolve-ecr + if: steps.local.outputs.hit != 'true' shell: bash env: INPUT_ECR_URL: ${{ inputs.ecr-url }} @@ -255,7 +276,10 @@ runs: # Uses ECR layer cache (--cache-from/--cache-to) when ECR is available. - name: Full build - if: steps.pull-exact.outputs.hit != 'true' && steps.deps-cache.outputs.deps-cache-hit != 'true' + if: > + steps.local.outputs.hit != 'true' && + steps.pull-exact.outputs.hit != 'true' && + steps.deps-cache.outputs.deps-cache-hit != 'true' shell: bash run: | BUILD_ARGS=( diff --git a/source/isaaclab/changelog.d/jichuanh-ecr-local-cache.skip b/source/isaaclab/changelog.d/jichuanh-ecr-local-cache.skip new file mode 100644 index 000000000000..c9378e5b4cd4 --- /dev/null +++ b/source/isaaclab/changelog.d/jichuanh-ecr-local-cache.skip @@ -0,0 +1 @@ +CI-only change in .github/actions/ecr-build-push-pull/action.yml. From 693c362406f851c916c60473501fe2efea718ffe Mon Sep 17 00:00:00 2001 From: jichuanh Date: Sat, 23 May 2026 08:49:45 +0000 Subject: [PATCH 18/55] Add local deps-cache to ecr-build-push-pull Mirror the ECR deps-cache logic (step 5) against the local docker store so builds without ECR can still skip rebuilds across different commit SHAs when dependencies are unchanged. - Step 1.6 computes the same deps-hash as step 5 (Dockerfile + isaaclab.sh + environment.yml + setup.py / pyproject.toml / uv.lock + base image digest), and checks `docker image inspect :deps-`. On hit, retags as the commit-tag in milliseconds (metadata-only) and skips the rest of the action. 
- Step 6.5 mirrors step 8 (Push deps tag) but locally: after a successful full build, `docker tag` the result as deps- in the local store so the next build on this host with identical deps short-circuits via step 1.6. Use case: arm64 self-hosted runners with no ECR cache configured. Code-only PRs (Dockerfile / setup.py / pyproject.toml unchanged) go from ~13 min rebuild to ~5 s retag on any host that previously built any PR with the same dependencies. No behavior change on the ECR-reachable path: when ECR is up, steps 4/5 still handle the cross-host case as before, and the local checks just provide an earlier same-host fast path. --- .../actions/ecr-build-push-pull/action.yml | 79 ++++++++++++++++++- 1 file changed, 78 insertions(+), 1 deletion(-) diff --git a/.github/actions/ecr-build-push-pull/action.yml b/.github/actions/ecr-build-push-pull/action.yml index f65ca98d8237..90e029b21288 100644 --- a/.github/actions/ecr-build-push-pull/action.yml +++ b/.github/actions/ecr-build-push-pull/action.yml @@ -92,6 +92,58 @@ runs: echo "🔵 Image not present locally, will try ECR / build paths" fi + ##### 1.6: Local deps-cache short-circuit ##### + + # Equivalent of step 5 (ECR deps-cache), but checked against the local + # docker store. When ECR is unreachable (today's arm64 path), this is the + # only way to reuse work across different commit SHAs of the same PR when + # dependencies haven't changed. + # + # Hash inputs match step 5 exactly. If the deps- tag exists locally, + # retag it as the requested commit-tag (metadata-only operation) so the + # rest of the action treats it as a cache hit. Otherwise, export + # LOCAL_DEPS_TAG so step 6.5 can populate the cache after a full build. 
+ + - name: Check deps-tag locally + id: local-deps + if: steps.local.outputs.hit != 'true' + shell: bash + run: | + DEPS_FILES=( + "${{ inputs.dockerfile-path }}" + isaaclab.sh + environment.yml + source/isaaclab/isaaclab/cli + ) + DEPS_MANIFEST_PATTERN='(setup\.py|pyproject\.toml|setup\.cfg|extension\.toml|requirements[^/]*\.txt|uv\.lock)$' + + BASE_IMAGE_DIGEST=$(docker buildx imagetools inspect \ + "${{ inputs.isaacsim-base-image }}:${{ inputs.isaacsim-version }}" \ + --format '{{json .Manifest.Digest}}' 2>/dev/null | tr -d '"' || true) + if [ -n "${BASE_IMAGE_DIGEST}" ]; then + BASE_IMAGE_UNIQ_ID="${{ inputs.isaacsim-base-image }}:${{ inputs.isaacsim-version }}:${BASE_IMAGE_DIGEST}" + else + echo "🟠 Could not resolve base image digest, falling back to tag string" + BASE_IMAGE_UNIQ_ID="${{ inputs.isaacsim-base-image }}:${{ inputs.isaacsim-version }}" + fi + + MANIFEST_FILES=$(git ls-files | grep -E "${DEPS_MANIFEST_PATTERN}" || true) + FILE_HASH=$(git ls-files -s "${DEPS_FILES[@]}" ${MANIFEST_FILES} 2>/dev/null \ + | sha256sum | cut -c1-16) + DEPS_HASH=$(printf '%s %s' "${FILE_HASH}" "${BASE_IMAGE_UNIQ_ID}" | sha256sum | cut -c1-16) + LOCAL_DEPS_TAG="$(echo "${{ inputs.image-tag }}" | cut -d: -f1):deps-${DEPS_HASH}" + + echo "🔵 Local deps tag: ${LOCAL_DEPS_TAG}" + echo "LOCAL_DEPS_TAG=${LOCAL_DEPS_TAG}" >> "$GITHUB_ENV" + + if docker image inspect "${LOCAL_DEPS_TAG}" >/dev/null 2>&1; then + echo "🟢 Local deps-cache HIT! Retagging as ${{ inputs.image-tag }}" + docker tag "${LOCAL_DEPS_TAG}" "${{ inputs.image-tag }}" + echo "hit=true" >> "$GITHUB_OUTPUT" + else + echo "🟠 Local deps-cache MISS (will build then tag for future hits)" + fi + ##### 2: Resolve ECR URL ##### # Tries: explicit input >> ECR_CACHE_URL env var >> SSM parameter on EC2. 
@@ -99,7 +151,9 @@ runs: - name: Resolve ECR URL id: resolve-ecr - if: steps.local.outputs.hit != 'true' + if: > + steps.local.outputs.hit != 'true' && + steps.local-deps.outputs.hit != 'true' shell: bash env: INPUT_ECR_URL: ${{ inputs.ecr-url }} @@ -278,6 +332,7 @@ runs: - name: Full build if: > steps.local.outputs.hit != 'true' && + steps.local-deps.outputs.hit != 'true' && steps.pull-exact.outputs.hit != 'true' && steps.deps-cache.outputs.deps-cache-hit != 'true' shell: bash @@ -309,6 +364,28 @@ runs: echo "🔵 Building ${{ inputs.image-tag }}..." docker buildx build --load "${BUILD_ARGS[@]}" . + ##### 6.5: Tag built image with local deps-tag ##### + + # Local mirror of step 8 (Push deps tag). After a successful full build, + # tag the resulting image as deps-<hash> in the local docker store so the + # next build on this host with identical deps short-circuits via step 1.6, + # regardless of whether ECR is reachable. + + - name: Tag built image with local deps-tag + if: > + steps.local.outputs.hit != 'true' && + steps.local-deps.outputs.hit != 'true' && + steps.pull-exact.outputs.hit != 'true' && + steps.deps-cache.outputs.deps-cache-hit != 'true' + shell: bash + run: | + if [ -n "${LOCAL_DEPS_TAG:-}" ]; then + docker tag "${{ inputs.image-tag }}" "${LOCAL_DEPS_TAG}" + echo "🟢 Tagged local deps-cache: ${LOCAL_DEPS_TAG}" + else + echo "🟠 LOCAL_DEPS_TAG not set, skipping local deps-cache tag" + fi + ##### 7: Push to ECR ##### # Pushes the per-commit ECR image after a successful full build. 
From 3f1e7c5c8747e3f00c687bcf289a97567bf2a5ae Mon Sep 17 00:00:00 2001 From: jichuanh Date: Sat, 23 May 2026 09:33:13 +0000 Subject: [PATCH 19/55] Document local deps-cache behavior in arm-ci.yaml --- .github/workflows/arm-ci.yaml | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/.github/workflows/arm-ci.yaml b/.github/workflows/arm-ci.yaml index 2ac657e95541..8de6b57de01e 100644 --- a/.github/workflows/arm-ci.yaml +++ b/.github/workflows/arm-ci.yaml @@ -16,6 +16,10 @@ # # Marker-driven discovery: `pytest -m arm_ci`. Adding a new aarch64-safe # test = tag it with arm_ci, no yaml edit. +# +# Local deps-cache: the ecr-build-push-pull action checks both the local +# docker store and ECR for a deps-<hash> image before doing a full build. +# Same-dependency PRs reuse a prior build via metadata-only retag. name: ARM CI From 026793e696f589ad919a1cdc6f41ffba21695b59 Mon Sep 17 00:00:00 2001 From: jichuanh Date: Mon, 25 May 2026 06:08:36 +0000 Subject: [PATCH 20/55] Add disk observability and 14-day deps-cache eviction Adds three steps to ecr-build-push-pull so we can size disk growth on self-hosted runners and prevent the new local deps-cache from accumulating unbounded artifacts. - Step 0 (pre-snapshot): df + docker system df + tag counts + deps-tag table, dumped to step summary and stdout. - Step 10 (evict, if: always()): docker rmi any *:deps-* tag older than 14 days. Re-acquiring an evicted tag costs one full build, acceptable given the steady-state hit rate. - Step 11 (post-snapshot, if: always()): same schema as Step 0 so the pair is diffable. Cost <500ms per job. No behavioral impact on the cache hit path. 
--- .../actions/ecr-build-push-pull/action.yml | 96 +++++++++++++++++++ 1 file changed, 96 insertions(+) diff --git a/.github/actions/ecr-build-push-pull/action.yml b/.github/actions/ecr-build-push-pull/action.yml index 90e029b21288..616dff5f9da5 100644 --- a/.github/actions/ecr-build-push-pull/action.yml +++ b/.github/actions/ecr-build-push-pull/action.yml @@ -45,6 +45,40 @@ runs: using: composite steps: + ##### 0: Host disk snapshot (pre) ##### + + # Capture host filesystem + docker store usage before the action does any + # work. Output goes to the run's step summary (visible on the PR page) and + # stdout (greppable across runs via `gh run view`). Used to size the + # deps-cache eviction policy once we have a few weeks of data. + + - name: Host disk snapshot (pre) + shell: bash + run: | + set +e + docker_root=$(docker info --format '{{.DockerRootDir}}' 2>/dev/null || echo "/var/lib/docker") + deps_count=$(docker images --filter 'reference=*:deps-*' -q 2>/dev/null | wc -l) + commit_count=$(docker images --filter 'reference=isaac-lab-ci*' -q 2>/dev/null | wc -l) + { + echo "## Disk snapshot (pre)" + echo '```' + echo "Filesystem:" + df -h / "${docker_root}" 2>/dev/null | sort -u + echo + echo "docker system df:" + docker system df + echo + echo "Tag counts:" + echo " isaac-lab-ci-* (commit + deps tags): ${commit_count}" + echo " *:deps-* (deps cache): ${deps_count}" + echo + echo "Deps tags (newest first):" + docker images --filter 'reference=*:deps-*' \ + --format 'table {{.Repository}}:{{.Tag}}\t{{.Size}}\t{{.CreatedSince}}' 2>/dev/null \ + | head -20 + echo '```' + } | tee -a "$GITHUB_STEP_SUMMARY" + ##### 1: Setup docker config + Login to nvcr.io ##### # Create a temp docker config with credsStore disabled before any login. @@ -424,3 +458,65 @@ runs: if [ -n "${DOCKER_CONFIG}" ] && [ -d "${DOCKER_CONFIG}" ]; then rm -rf "${DOCKER_CONFIG}" fi + + ##### 10: Evict stale local deps-cache tags ##### + + # 14-day TTL on `*:deps-*` images. 
Prevents unbounded disk growth on + # long-lived self-hosted runners as new deps hashes land over time. + # Re-acquiring an evicted tag costs one full build (~13 min on arm64, + # registry pull on Linux), which is acceptable given the hit rate. + # Runs `if: always()` so cleanup happens even on failed jobs. + + - name: Evict stale local deps-cache tags (>14d) + if: always() + shell: bash + run: | + set +e + TTL_DAYS=14 + cutoff=$(date -u -d "${TTL_DAYS} days ago" +%s) + evicted=0 + while IFS='|' read -r created tag; do + [ -z "$tag" ] && continue + created_epoch=$(date -d "$created" +%s 2>/dev/null) || continue + if [ "$created_epoch" -lt "$cutoff" ]; then + days_old=$(( (cutoff - created_epoch) / 86400 + TTL_DAYS )) + echo "🟠 Evicting deps tag (~${days_old}d old): ${tag}" + docker rmi -f "$tag" >/dev/null 2>&1 || true + evicted=$(( evicted + 1 )) + fi + done < <(docker images --filter 'reference=*:deps-*' \ + --format '{{.CreatedAt}}|{{.Repository}}:{{.Tag}}' 2>/dev/null) + echo "🔵 Evicted ${evicted} deps tag(s) older than ${TTL_DAYS}d" + + ##### 11: Host disk snapshot (post) ##### + + # Mirror of step 0. Same schema so the pre/post pair is diffable. The + # delta (post - pre) is one job's net contribution to host disk. 
+ + - name: Host disk snapshot (post) + if: always() + shell: bash + run: | + set +e + docker_root=$(docker info --format '{{.DockerRootDir}}' 2>/dev/null || echo "/var/lib/docker") + deps_count=$(docker images --filter 'reference=*:deps-*' -q 2>/dev/null | wc -l) + commit_count=$(docker images --filter 'reference=isaac-lab-ci*' -q 2>/dev/null | wc -l) + { + echo "## Disk snapshot (post)" + echo '```' + echo "Filesystem:" + df -h / "${docker_root}" 2>/dev/null | sort -u + echo + echo "docker system df:" + docker system df + echo + echo "Tag counts:" + echo " isaac-lab-ci-* (commit + deps tags): ${commit_count}" + echo " *:deps-* (deps cache): ${deps_count}" + echo + echo "Deps tags (newest first):" + docker images --filter 'reference=*:deps-*' \ + --format 'table {{.Repository}}:{{.Tag}}\t{{.Size}}\t{{.CreatedSince}}' 2>/dev/null \ + | head -20 + echo '```' + } | tee -a "$GITHUB_STEP_SUMMARY" From 1ae351f316401af7535df4ffd39500b0144b64c6 Mon Sep 17 00:00:00 2001 From: jichuanh Date: Tue, 26 May 2026 16:23:57 +0000 Subject: [PATCH 21/55] arm-ci: collapse build-arm + arm-ci into one job MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit With ECR not wired for arm64 and ephemeral runner state, splitting the build out of arm-ci just makes the test job rebuild the image on a different fresh host. Merge them so the build happens on the same runner that runs the tests, halving wall time on cold runs (~37 min → ~24 min). When a runner is warm from a prior run (within the autoscaler hold window), the action's local exact-tag and deps-tag short-circuits still fire and bring this further down to ~11-13 min. Original argument for separation — share an image across multiple test jobs via ECR — does not apply: no ECR, single test job. Also tighten the header comment to reflect the new model (no ECR mention, no deps-cache as a headline mechanism). 
--- .github/workflows/arm-ci.yaml | 46 ++++++----------------------------- 1 file changed, 8 insertions(+), 38 deletions(-) diff --git a/.github/workflows/arm-ci.yaml b/.github/workflows/arm-ci.yaml index 8de6b57de01e..3aefaf8c9d8e 100644 --- a/.github/workflows/arm-ci.yaml +++ b/.github/workflows/arm-ci.yaml @@ -4,22 +4,14 @@ # SPDX-License-Identifier: BSD-3-Clause # ARM/Spark CI — exercises Isaac Lab on aarch64 Linux self-hosted runners -# (NVIDIA DGX Spark). Mirrors build.yaml's structure: one job builds -# Dockerfile.base for linux/arm64 via the shared ecr-build-push-pull action -# (with ECR-backed deps caching), downstream jobs pull the same image and -# `docker run` pytest inside it. -# -# Each test step is wrapped with `continue-on-error: true` so a failure in -# one tier does not abort the others. Each pytest invocation passes -# `--timeout=N --timeout-method=signal --continue-on-collection-errors` so a -# hung or import-broken test cannot consume the whole job slot. +# (NVIDIA DGX Spark). Single job that builds Dockerfile.base for linux/arm64 +# and runs the `arm_ci`-marked pytest set against it. ECR is not wired for +# arm64, so build output stays local to the runner; the action's local +# exact-tag and deps-tag checks short-circuit rebuilds when the runner is +# still warm from a prior run. # # Marker-driven discovery: `pytest -m arm_ci`. Adding a new aarch64-safe # test = tag it with arm_ci, no yaml edit. -# -# Local deps-cache: the ecr-build-push-pull action checks both the local -# docker store and ECR for a deps-<hash> image before doing a full build. -# Same-dependency PRs reuse a prior build via metadata-only retag. 
name: ARM CI @@ -126,33 +118,11 @@ jobs: echo "isaacsim_image_name=$name" >> "$GITHUB_OUTPUT" echo "isaacsim_image_tag=$tag" >> "$GITHUB_OUTPUT" - build-arm: - name: Build Base Docker Image (arm64) - runs-on: [self-hosted, arm64] - needs: [changes, config] - if: needs.changes.outputs.run_arm_ci == 'true' - steps: - - name: Checkout Code - uses: actions/checkout@v4 - with: - fetch-depth: 1 - lfs: true - - - name: Build and push to ECR - uses: ./.github/actions/ecr-build-push-pull - with: - image-tag: ${{ env.CI_IMAGE_TAG }} - isaacsim-base-image: ${{ needs.config.outputs.isaacsim_image_name }} - isaacsim-version: ${{ needs.config.outputs.isaacsim_image_tag }} - dockerfile-path: docker/Dockerfile.base - cache-tag: cache-base-arm64 - platform: linux/arm64 - arm-ci: name: arm-ci runs-on: [self-hosted, arm64] - needs: [build-arm, config] - if: needs.build-arm.result == 'success' + needs: [changes, config] + if: needs.changes.outputs.run_arm_ci == 'true' timeout-minutes: 120 steps: - name: Checkout Code @@ -161,7 +131,7 @@ jobs: fetch-depth: 1 lfs: true - - name: Pull Base Docker image + - name: Build Base Docker image uses: ./.github/actions/ecr-build-push-pull with: image-tag: ${{ env.CI_IMAGE_TAG }} From 17be9344a566fcfc3a6384d2b22464e4bb7fe9f1 Mon Sep 17 00:00:00 2001 From: jichuanh Date: Tue, 26 May 2026 22:51:10 +0000 Subject: [PATCH 22/55] arm-ci: extract detect-changes into composite action MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Move the inline path-detection bash out of arm-ci.yaml's changes job into a reusable composite action at .github/actions/detect-changes. Keeps build.yaml on its existing inline copy for now; that workflow can adopt the same action in a follow-up PR. The composite covers both the simple path used by arm-ci (just decide) and the richer summary-table render that build.yaml uses, so the migration there is purely mechanical when scheduled. 
Inputs: patterns newline + tab-delimited regex/description pairs triggered-jobs short text shown in step summary summary-title heading for the step summary section Output: run (true/false). Behavior matches the previous inline logic exactly: * non-PR events → run=true * PR with no matched path → run=false * API failure listing changed files → run=true (fail-safe) --- .github/actions/detect-changes/action.yml | 139 ++++++++++++++++++++++ .github/workflows/arm-ci.yaml | 58 ++++----- 2 files changed, 159 insertions(+), 38 deletions(-) create mode 100644 .github/actions/detect-changes/action.yml diff --git a/.github/actions/detect-changes/action.yml b/.github/actions/detect-changes/action.yml new file mode 100644 index 000000000000..e2a789028150 --- /dev/null +++ b/.github/actions/detect-changes/action.yml @@ -0,0 +1,139 @@ +# Copyright (c) 2022-2026, The Isaac Lab Project Developers (https://github.com/isaac-sim/IsaacLab/blob/main/CONTRIBUTORS.md). +# All rights reserved. +# +# SPDX-License-Identifier: BSD-3-Clause + +name: 'Detect Changes' +description: > + Decide whether expensive downstream jobs should run on this PR. Returns + "true" on non-PR events (push, workflow_dispatch) and fails-safe to "true" + if the changed-files API call errors out. On PR events, returns "true" iff + any provided regex pattern matches a changed file path. + + Renders a step summary table for visibility on the run page. + + This pattern (always-runs gating job + downstream `if:`) is used instead of + a workflow-level `paths:` filter because a not-triggered required check + would block PRs indefinitely under branch protection. + +inputs: + patterns: + description: > + Newline-separated, TAB-delimited regex + human-readable description + pairs. Example: + ^source/\tLibrary source code + ^docker/\tContainer build inputs + required: true + triggered-jobs: + description: > + Short text shown in the step summary describing what runs when this + returns true. 
+ required: false + default: 'downstream jobs' + summary-title: + description: 'Heading text for the step summary section.' + required: false + default: 'Change detection' + +outputs: + run: + description: '"true" if any pattern matched (or this is a non-PR event); "false" otherwise.' + value: ${{ steps.detect.outputs.run }} + +runs: + using: composite + steps: + - id: detect + shell: bash + env: + GH_TOKEN: ${{ github.token }} + PR_NUMBER: ${{ github.event.pull_request.number }} + EVENT_NAME: ${{ github.event_name }} + REPO: ${{ github.repository }} + PATTERNS_INPUT: ${{ inputs.patterns }} + TRIGGERED_JOBS: ${{ inputs.triggered-jobs }} + SUMMARY_TITLE: ${{ inputs.summary-title }} + run: | + set -euo pipefail + + # Parse newline+tab-delimited input into a bash array. + patterns=() + while IFS= read -r line; do + # Skip blank lines and lines without a tab (treat as malformed). + [ -z "$line" ] && continue + [[ "$line" == *$'\t'* ]] || continue + patterns+=("$line") + done <<< "$PATTERNS_INPUT" + + if [ "${#patterns[@]}" -eq 0 ]; then + echo "::error::detect-changes received no valid pattern lines" + exit 1 + fi + + render_table() { + local files="$1" entry regex desc count sample + echo "| Pattern | What it covers | Matched files |" + echo "|---|---|---|" + for entry in "${patterns[@]}"; do + IFS=$'\t' read -r regex desc <<< "$entry" + count=$(grep -cE "$regex" <<< "$files" || true) + if [ "$count" -gt 0 ]; then + sample=$(grep -m 3 -E "$regex" <<< "$files" | paste -sd ', ' -) + [ "$count" -gt 3 ] && sample="$sample (and $((count - 3)) more)" + echo "| \`$regex\` | $desc | $sample |" + else + echo "| \`$regex\` | $desc | - |" + fi + done + } + + any_match() { + local files="$1" entry regex + for entry in "${patterns[@]}"; do + IFS=$'\t' read -r regex _ <<< "$entry" + if grep -qE "$regex" <<< "$files"; then + return 0 + fi + done + return 1 + } + + decide() { + local decision="$1" reason="$2" files="${3:-}" + echo "Decision: run=$decision ($reason)" + echo 
"run=$decision" >> "$GITHUB_OUTPUT" + { + echo "## ${SUMMARY_TITLE}" + echo "" + if [ "$decision" = "true" ]; then + echo "Downstream jobs will **run**: $reason." + else + echo "Downstream jobs will be **skipped**: $reason." + fi + echo "" + echo "Triggered jobs: $TRIGGERED_JOBS." + if [ -n "$files" ]; then + echo "" + render_table "$files" + fi + } >> "$GITHUB_STEP_SUMMARY" + } + + if [ "$EVENT_NAME" != "pull_request" ]; then + decide true "non-PR event ($EVENT_NAME)" + exit 0 + fi + + if ! changed_files="$(gh api --paginate "repos/$REPO/pulls/$PR_NUMBER/files" --jq '.[].filename')"; then + echo "::warning::Could not list changed files; defaulting to run=true" + decide true "fail-safe (could not list changed files)" + exit 0 + fi + + printf '%s\n' "$changed_files" + + if any_match "$changed_files"; then + decide true "relevant paths changed" "$changed_files" + else + decide false "no relevant paths changed" "$changed_files" + fi diff --git a/.github/workflows/arm-ci.yaml b/.github/workflows/arm-ci.yaml index 3aefaf8c9d8e..0d370a5d6df4 100644 --- a/.github/workflows/arm-ci.yaml +++ b/.github/workflows/arm-ci.yaml @@ -49,46 +49,28 @@ jobs: name: Detect Changes runs-on: ubuntu-latest outputs: - run_arm_ci: ${{ steps.detect.outputs.run_arm_ci }} + run_arm_ci: ${{ steps.detect.outputs.run }} steps: + - uses: actions/checkout@v4 + with: + fetch-depth: 1 + sparse-checkout: .github/actions/detect-changes + sparse-checkout-cone-mode: false - id: detect - env: - GH_TOKEN: ${{ github.token }} - PR_NUMBER: ${{ github.event.pull_request.number }} - EVENT_NAME: ${{ github.event_name }} - REPO: ${{ github.repository }} - run: | - set -euo pipefail - patterns=( - $'^source/\tLibrary source code' - $'^tools/\tBuild tooling' - $'^apps/\tStandalone apps' - $'^docker/\tContainer build inputs' - $'(^|/)pyproject\\.toml$\tPython project metadata' - $'^\\.github/workflows/arm-ci\\.yaml$\tThis workflow file' - $'^\\.github/actions/ecr-build-push-pull/\tECR action' - $'^VERSION$\tVersion 
file' - ) - any_match() { - local files="$1" entry regex - for entry in "${patterns[@]}"; do - IFS=$'\t' read -r regex _ <<< "$entry" - if grep -qE "$regex" <<< "$files"; then - return 0 - fi - done - return 1 - } - if [ "$EVENT_NAME" != "pull_request" ]; then - echo "run_arm_ci=true" >> "$GITHUB_OUTPUT" - exit 0 - fi - changed_files="$(gh api --paginate "repos/$REPO/pulls/$PR_NUMBER/files" --jq '.[].filename' || true)" - if [ -z "$changed_files" ] || any_match "$changed_files"; then - echo "run_arm_ci=true" >> "$GITHUB_OUTPUT" - else - echo "run_arm_ci=false" >> "$GITHUB_OUTPUT" - fi + uses: ./.github/actions/detect-changes + with: + summary-title: ARM CI gating + triggered-jobs: arm-ci (build + tests) + patterns: | + ^source/ Library source code + ^tools/ Build tooling + ^apps/ Standalone apps + ^docker/ Container build inputs + (^|/)pyproject\.toml$ Python project metadata + ^\.github/workflows/arm-ci\.yaml$ This workflow file + ^\.github/actions/ecr-build-push-pull/ ECR action + ^\.github/actions/detect-changes/ Change-detection action + ^VERSION$ Version file config: name: Load Config From f2378c961b7dd1528b7aa9273369a41035c62829 Mon Sep 17 00:00:00 2001 From: jichuanh Date: Tue, 26 May 2026 22:52:08 +0000 Subject: [PATCH 23/55] Consolidate changelog fragments to one per touched package PR convention: one fragment per touched package (AGENTS.md), so a single-package PR has 1 and a cross-package PR has 2. This PR touches source/isaaclab/ and source/isaaclab_tasks/, so we keep two fragments total. Merges the six existing source/isaaclab/changelog.d/jichuanh-* fragments into one .rst (apppath-fallback + pytetwild-aarch64 entries), and the two existing source/isaaclab_tasks/changelog.d/jichuanh-* fragments into one .rst (ovrtx skip entry). The remaining individual fragments were CI-only .skip entries that didn't need a changelog line. 
--- ...huanh-apppath-exp-fallback.rst => jichuanh-arm-ci.rst} | 8 ++++++++ source/isaaclab/changelog.d/jichuanh-arm-ci.skip | 1 - .../isaaclab/changelog.d/jichuanh-conftest-ci-marker.skip | 1 - source/isaaclab/changelog.d/jichuanh-ecr-local-cache.skip | 1 - .../isaaclab/changelog.d/jichuanh-pytetwild-aarch64.rst | 8 -------- .../changelog.d/jichuanh-windows-spark-ci-min.skip | 1 - ...ichuanh-ovrtx-skip-aarch64.rst => jichuanh-arm-ci.rst} | 0 .../changelog.d/jichuanh-cartpole-train-smoke.skip | 1 - 8 files changed, 8 insertions(+), 13 deletions(-) rename source/isaaclab/changelog.d/{jichuanh-apppath-exp-fallback.rst => jichuanh-arm-ci.rst} (56%) delete mode 100644 source/isaaclab/changelog.d/jichuanh-arm-ci.skip delete mode 100644 source/isaaclab/changelog.d/jichuanh-conftest-ci-marker.skip delete mode 100644 source/isaaclab/changelog.d/jichuanh-ecr-local-cache.skip delete mode 100644 source/isaaclab/changelog.d/jichuanh-pytetwild-aarch64.rst delete mode 100644 source/isaaclab/changelog.d/jichuanh-windows-spark-ci-min.skip rename source/isaaclab_tasks/changelog.d/{jichuanh-ovrtx-skip-aarch64.rst => jichuanh-arm-ci.rst} (100%) delete mode 100644 source/isaaclab_tasks/changelog.d/jichuanh-cartpole-train-smoke.skip diff --git a/source/isaaclab/changelog.d/jichuanh-apppath-exp-fallback.rst b/source/isaaclab/changelog.d/jichuanh-arm-ci.rst similarity index 56% rename from source/isaaclab/changelog.d/jichuanh-apppath-exp-fallback.rst rename to source/isaaclab/changelog.d/jichuanh-arm-ci.rst index 376f7f589170..04b19fdeb4bc 100644 --- a/source/isaaclab/changelog.d/jichuanh-apppath-exp-fallback.rst +++ b/source/isaaclab/changelog.d/jichuanh-arm-ci.rst @@ -9,3 +9,11 @@ Fixed ``KeyError: 'EXP_PATH'`` deep inside ``_resolve_experience_file``; now AppLauncher resolves the path from ``isaacsim.__file__`` and stores it back into the environment so subsequent code can rely on it. + +* Excluded ``pytetwild`` install on aarch64 platforms. 
The package has no + aarch64 wheel on PyPI and its source build fails (the ``geogram`` CMake dep + hardcodes ``-m64``). The single call site in :mod:`isaaclab.sim.schemas` + already raises a clear "install pytetwild manually or provide a + pre-tetrahedralized UsdGeom.TetMesh" message when the lazy import fails, so + aarch64 users keep everything except automatic volume-deformable + tetrahedralization. diff --git a/source/isaaclab/changelog.d/jichuanh-arm-ci.skip b/source/isaaclab/changelog.d/jichuanh-arm-ci.skip deleted file mode 100644 index 5d82dee1471b..000000000000 --- a/source/isaaclab/changelog.d/jichuanh-arm-ci.skip +++ /dev/null @@ -1 +0,0 @@ -Skip changelog: CI-infrastructure only (no user-facing API change). Adds .github/workflows/arm-ci.yaml carrying the ARM/Spark CI pipeline against self-hosted [self-hosted, arm64] runners. Tier 1 (smoke, install probe, Kit launch) plus Tier 2 (kitless rendering, controller determinism). All jobs use continue-on-error: true and pytest --timeout to fail fast on hangs. Tags three test_rendering_*_kitless.py files plus test_differential_ik.py / test_operational_space.py with arm_ci so the Tier 2 jobs can select them. diff --git a/source/isaaclab/changelog.d/jichuanh-conftest-ci-marker.skip b/source/isaaclab/changelog.d/jichuanh-conftest-ci-marker.skip deleted file mode 100644 index b189494df2ae..000000000000 --- a/source/isaaclab/changelog.d/jichuanh-conftest-ci-marker.skip +++ /dev/null @@ -1 +0,0 @@ -Skip changelog: CI-infrastructure only. Generalizes tools/conftest.py to read a CI_MARKER env var (defaulting to ISAACSIM_CI_SHORT=true → "isaacsim_ci" for back-compat). Lets cross-platform CI workflows reuse the same subprocess-per-test orchestrator with their own markers (arm_ci for ARM/Spark, windows_ci for Windows) instead of forking conftest.py per platform. 
diff --git a/source/isaaclab/changelog.d/jichuanh-ecr-local-cache.skip b/source/isaaclab/changelog.d/jichuanh-ecr-local-cache.skip deleted file mode 100644 index c9378e5b4cd4..000000000000 --- a/source/isaaclab/changelog.d/jichuanh-ecr-local-cache.skip +++ /dev/null @@ -1 +0,0 @@ -CI-only change in .github/actions/ecr-build-push-pull/action.yml. diff --git a/source/isaaclab/changelog.d/jichuanh-pytetwild-aarch64.rst b/source/isaaclab/changelog.d/jichuanh-pytetwild-aarch64.rst deleted file mode 100644 index 1d025bd9a8b1..000000000000 --- a/source/isaaclab/changelog.d/jichuanh-pytetwild-aarch64.rst +++ /dev/null @@ -1,8 +0,0 @@ -Fixed -^^^^^ - -* Excluded ``pytetwild`` install on aarch64 platforms. The package has no aarch64 wheel on PyPI and its - source build fails (the ``geogram`` CMake dep hardcodes ``-m64``). The single call site in - :mod:`isaaclab.sim.schemas` already raises a clear "install pytetwild manually or provide a - pre-tetrahedralized UsdGeom.TetMesh" message when the lazy import fails, so aarch64 users keep - everything except automatic volume-deformable tetrahedralization. diff --git a/source/isaaclab/changelog.d/jichuanh-windows-spark-ci-min.skip b/source/isaaclab/changelog.d/jichuanh-windows-spark-ci-min.skip deleted file mode 100644 index bfa2b75a780a..000000000000 --- a/source/isaaclab/changelog.d/jichuanh-windows-spark-ci-min.skip +++ /dev/null @@ -1 +0,0 @@ -Skip changelog: CI/test-infrastructure foundation (no user-facing API change). Registers the windows / windows_ci / arm / arm_ci pytest markers in pyproject.toml, teaches AppLauncher to recognize them in argv so they do not leak into Isaac Sim's argparse, and moves the AssetConverterBase USD scratch dir from hardcoded /tmp/IsaacLab to tempfile.gettempdir() for cross-platform compatibility. Workflow files (arm-ci.yaml, windows-ci.yaml) ship in follow-up PRs. 
diff --git a/source/isaaclab_tasks/changelog.d/jichuanh-ovrtx-skip-aarch64.rst b/source/isaaclab_tasks/changelog.d/jichuanh-arm-ci.rst similarity index 100% rename from source/isaaclab_tasks/changelog.d/jichuanh-ovrtx-skip-aarch64.rst rename to source/isaaclab_tasks/changelog.d/jichuanh-arm-ci.rst diff --git a/source/isaaclab_tasks/changelog.d/jichuanh-cartpole-train-smoke.skip b/source/isaaclab_tasks/changelog.d/jichuanh-cartpole-train-smoke.skip deleted file mode 100644 index 303fdd5708e5..000000000000 --- a/source/isaaclab_tasks/changelog.d/jichuanh-cartpole-train-smoke.skip +++ /dev/null @@ -1 +0,0 @@ -Skip changelog: CI/test-infrastructure only. Adds source/isaaclab_tasks/test/test_cartpole_training_smoke.py covering two-iter rsl_rl PPO training on Isaac-Cartpole-Direct-v0 (state) and Isaac-Cartpole-RGB-Camera-Direct-v0 (perception), tagged pytest.mark.arm_ci so the ARM/Spark workflow picks them up via the CI_MARKER orchestrator. Mirrors the subprocess pattern from test_train_scripts_deterministic.py with --max_iterations=2 --num_envs=16. From c9f064d85fe87667365ebe8351764f270f362edf Mon Sep 17 00:00:00 2001 From: jichuanh Date: Wed, 27 May 2026 05:17:36 +0000 Subject: [PATCH 24/55] Drop unused windows / arm pytest markers `pytest.mark.windows` and `pytest.mark.arm` (without the `_ci` suffix) were declared in pyproject.toml but never referenced by any test in the tree. Only the `_ci` variants (`windows_ci`, `arm_ci`) are used by actual tests and the CI orchestrator. Drop the dead declarations; re-add when a real test needs the platform-but-not-CI distinction. 
--- pyproject.toml | 2 -- 1 file changed, 2 deletions(-) diff --git a/pyproject.toml b/pyproject.toml index 33ba8e2b1274..a2a781e466aa 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -194,9 +194,7 @@ ignore-words-list = "haa,slq,collapsable,buss,reacher,thirdparty" markers = [ "isaacsim_ci: mark test to run in isaacsim ci", - "windows: mark test as runnable on Windows platforms", "windows_ci: mark test to run on Windows platforms in CI", - "arm: mark test as runnable on ARM platforms (e.g. NVIDIA DGX Spark)", "arm_ci: mark test to run on ARM platforms in CI (e.g. NVIDIA DGX Spark)", ] From 445c0f1ce0bf1a35fc7ffc4f37b6cabd4dc94ff7 Mon Sep 17 00:00:00 2001 From: jichuanh Date: Wed, 27 May 2026 05:17:44 +0000 Subject: [PATCH 25/55] Address codex review on local-cache + detect-changes work MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Two small follow-ups from the codex review of the local-deps-cache and detect-changes commits: - detect-changes: turn the silent "skip tab-less pattern lines" into an explicit `::error::` + exit 1. A misconfigured patterns input (e.g., space-delimited by mistake) would previously drop patterns silently, making it easy to ship a workflow that fails to trigger on changes it should have. Loud failure is the safer default. - ecr-build-push-pull: narrow the `*:deps-*` reference filter on both the disk snapshot count and the 14-day eviction step to `isaac-lab-ci*:deps-*`. The previous wildcard would have evicted any other tenant's `deps-`-prefixed image tags on the same self-hosted runner — defensive scoping for the case where these runners ever host more than one CI pipeline. Codex's third finding (claimed broken backtick escaping in the disk snapshot bash) was a misread of single-quoted backticks; verified locally that `echo '\`\`\`'` emits the literal triple-backtick fence. 
--- .github/actions/detect-changes/action.yml | 10 +++++++--- .github/actions/ecr-build-push-pull/action.yml | 10 +++++----- 2 files changed, 12 insertions(+), 8 deletions(-) diff --git a/.github/actions/detect-changes/action.yml b/.github/actions/detect-changes/action.yml index e2a789028150..e511381aa7f7 100644 --- a/.github/actions/detect-changes/action.yml +++ b/.github/actions/detect-changes/action.yml @@ -56,12 +56,16 @@ runs: run: | set -euo pipefail - # Parse newline+tab-delimited input into a bash array. + # Parse newline+tab-delimited input into a bash array. Tab-less lines + # are an error (likely space-delimited by mistake) — a silently-skipped + # pattern could make the workflow not trigger on changes it should have. patterns=() while IFS= read -r line; do - # Skip blank lines and lines without a tab (treat as malformed). [ -z "$line" ] && continue - [[ "$line" == *$'\t'* ]] || continue + if [[ "$line" != *$'\t'* ]]; then + echo "::error::detect-changes: pattern line missing tab separator: '$line'" + exit 1 + fi patterns+=("$line") done <<< "$PATTERNS_INPUT" diff --git a/.github/actions/ecr-build-push-pull/action.yml b/.github/actions/ecr-build-push-pull/action.yml index 616dff5f9da5..8d37ccb5c924 100644 --- a/.github/actions/ecr-build-push-pull/action.yml +++ b/.github/actions/ecr-build-push-pull/action.yml @@ -57,7 +57,7 @@ runs: run: | set +e docker_root=$(docker info --format '{{.DockerRootDir}}' 2>/dev/null || echo "/var/lib/docker") - deps_count=$(docker images --filter 'reference=*:deps-*' -q 2>/dev/null | wc -l) + deps_count=$(docker images --filter 'reference=isaac-lab-ci*:deps-*' -q 2>/dev/null | wc -l) commit_count=$(docker images --filter 'reference=isaac-lab-ci*' -q 2>/dev/null | wc -l) { echo "## Disk snapshot (pre)" @@ -73,7 +73,7 @@ runs: echo " *:deps-* (deps cache): ${deps_count}" echo echo "Deps tags (newest first):" - docker images --filter 'reference=*:deps-*' \ + docker images --filter 'reference=isaac-lab-ci*:deps-*' \ --format 
'table {{.Repository}}:{{.Tag}}\t{{.Size}}\t{{.CreatedSince}}' 2>/dev/null \ | head -20 echo '```' @@ -484,7 +484,7 @@ runs: docker rmi -f "$tag" >/dev/null 2>&1 || true evicted=$(( evicted + 1 )) fi - done < <(docker images --filter 'reference=*:deps-*' \ + done < <(docker images --filter 'reference=isaac-lab-ci*:deps-*' \ --format '{{.CreatedAt}}|{{.Repository}}:{{.Tag}}' 2>/dev/null) echo "🔵 Evicted ${evicted} deps tag(s) older than ${TTL_DAYS}d" @@ -499,7 +499,7 @@ runs: run: | set +e docker_root=$(docker info --format '{{.DockerRootDir}}' 2>/dev/null || echo "/var/lib/docker") - deps_count=$(docker images --filter 'reference=*:deps-*' -q 2>/dev/null | wc -l) + deps_count=$(docker images --filter 'reference=isaac-lab-ci*:deps-*' -q 2>/dev/null | wc -l) commit_count=$(docker images --filter 'reference=isaac-lab-ci*' -q 2>/dev/null | wc -l) { echo "## Disk snapshot (post)" @@ -515,7 +515,7 @@ runs: echo " *:deps-* (deps cache): ${deps_count}" echo echo "Deps tags (newest first):" - docker images --filter 'reference=*:deps-*' \ + docker images --filter 'reference=isaac-lab-ci*:deps-*' \ --format 'table {{.Repository}}:{{.Tag}}\t{{.Size}}\t{{.CreatedSince}}' 2>/dev/null \ | head -20 echo '```' From 3afe9b36b161a7b571d8ae0d31f37eccee7fb737 Mon Sep 17 00:00:00 2001 From: jichuanh Date: Wed, 27 May 2026 05:30:22 +0000 Subject: [PATCH 26/55] Split docker-build out of ecr-build-push-pull Promote the existing minimal docker-build action into the canonical local-build primitive: nvcr login + pre-build disk snapshot + local exact-tag short-circuit + local deps-tag short-circuit + buildx build + post-build deps-tag write + 14-day deps-cache eviction + post-build disk snapshot. Reshape ecr-build-push-pull as a thin ECR wrapper over docker-build: URL resolve + auth + ECR exact-image check + ECR deps-cache registry retag + (on miss) delegate the local build to docker-build with optional --cache-from / --cache-to and an ECR-prefixed extra-tag for the push step. 
arm-ci.yaml switches from ecr-build-push-pull to docker-build directly, since the arm64 hosts have no ECR access. The "ECR" name no longer shows up in the arm path's dependency graph. docker-build keeps backward compatibility for daily-compatibility.yml's existing callers: same image-tag / isaacsim-base-image / isaacsim-version / dockerfile-path / context-path inputs, defaults unchanged. New inputs (platform, cache-from, cache-to, extra-tags, skip-docker-config) are all optional. Deps-hash schema is intentionally identical between the two actions so a local-cache hit and a registry-side retag converge on the same image identity. --- .github/actions/docker-build/action.yml | 348 +++++++++++++++--- .../actions/ecr-build-push-pull/action.yml | 321 ++++------------ .github/workflows/arm-ci.yaml | 5 +- 3 files changed, 358 insertions(+), 316 deletions(-) diff --git a/.github/actions/docker-build/action.yml b/.github/actions/docker-build/action.yml index 7f88241cfb8c..5e123394ba9e 100644 --- a/.github/actions/docker-build/action.yml +++ b/.github/actions/docker-build/action.yml @@ -3,82 +3,322 @@ # # SPDX-License-Identifier: BSD-3-Clause -name: 'Build Docker Image' -description: 'Builds a Docker image with IsaacSim and IsaacLab dependencies' +name: 'Docker Build (local-cache)' +description: > + Build a Docker image with local-cache short-circuits and observability. + No ECR knowledge built in: the output stays in the host's local docker + store. Designed for self-hosted runners where ECR is not wired up (e.g. + the arm64 / Spark pool), and as the build primitive that + `ecr-build-push-pull` delegates to once its registry-side checks miss. + + Pipeline: + 1. Optional: setup docker config + login to nvcr.io (skipped when the + caller has already done it — `skip-docker-config: true`). + 2. Pre-build disk snapshot (df + docker system df + tag counts) into the + step summary. + 3. 
+ Local exact-tag short-circuit: skip everything if `image-tag` is + already in the host's docker store. + 4. Local deps-tag short-circuit: compute a hash over Dockerfile + + setup.py / pyproject.toml / uv.lock + base image digest. If + `<repo>:deps-<hash>` exists locally, retag as `image-tag` (metadata-only). + 5. buildx build (only when neither short-circuit fired). Optional + `--cache-from` / `--cache-to` flags for callers that want to layer + in a registry-side build cache (e.g. ECR). + 6. Post-build: tag the result as `<repo>:deps-<hash>` so future builds + with identical deps short-circuit at step 4. + 7. Evict `isaac-lab-ci*:deps-*` tags older than 14 days to bound disk + growth on long-lived hosts. + 8. Post-build disk snapshot. + + The deps-hash schema must stay identical to the one in + `ecr-build-push-pull`'s registry-side deps-cache (step 5 there) so a local + build and a registry retag converge on the same image identity. inputs: image-tag: - description: 'Docker image tag to use' + description: 'Tag for the Docker image (e.g. my-image:latest).' required: true isaacsim-base-image: - description: 'IsaacSim base image' + description: 'IsaacSim base image (passed as ISAACSIM_BASE_IMAGE_ARG build-arg).' required: true isaacsim-version: - description: 'IsaacSim version' + description: 'IsaacSim version (passed as ISAACSIM_VERSION_ARG build-arg).' required: true dockerfile-path: - description: 'Path to Dockerfile' + description: 'Path to Dockerfile, relative to the repository root.' default: 'docker/Dockerfile.base' required: false context-path: - description: 'Build context path' + description: 'Build context path passed to `docker buildx build`.' default: '.' required: false + platform: + description: 'Target platform for `docker buildx build --platform`.' + default: 'linux/amd64' + required: false + cache-from: + description: > + Optional value for `docker buildx build --cache-from`. Typically a + `type=registry,ref=<image>` for cross-host layer cache. 
Leave empty for + pure local-only builds. + default: '' + required: false + cache-to: + description: > + Optional value for `docker buildx build --cache-to`. Pairs with + `cache-from` for registry-backed layer cache writes. + default: '' + required: false + extra-tags: + description: > + Newline-separated additional tags to apply to the built image (e.g. an + ECR-prefixed tag for the caller to push). Each tag is materialized via + `docker tag` after a successful build or local short-circuit, so the + caller can rely on them being present. + default: '' + required: false + skip-docker-config: + description: > + Skip the docker config setup + nvcr.io login step. Set to "true" when + the caller has already configured docker auth (e.g. ecr-build-push-pull + does its own ECR + nvcr setup before delegating here). + default: 'false' + required: false + +outputs: + local-hit: + description: '"true" if the exact image-tag was already in local docker.' + value: ${{ steps.local.outputs.hit }} + local-deps-hit: + description: '"true" if a local deps-cache tag matched (and was retagged).' + value: ${{ steps.local-deps.outputs.hit }} + was-built: + description: '"true" if a full buildx build ran (no cache hit).' 
+ value: ${{ steps.build.outputs.was-built }} runs: using: composite steps: - - name: NGC Login - shell: sh + + ##### 1: Setup docker config + login to nvcr.io (optional) ##### + + - name: Setup docker config and login to nvcr.io + if: inputs.skip-docker-config != 'true' + shell: bash + run: | + DOCKER_CONFIG_DIR=$(mktemp -d) + if [ -f "${HOME}/.docker/config.json" ]; then + python3 -c "import json; cfg=json.load(open('${HOME}/.docker/config.json')); cfg['credsStore']=''; cfg.pop('credHelpers',None); json.dump(cfg,open('${DOCKER_CONFIG_DIR}/config.json','w'))" + else + echo '{"credsStore":""}' > "${DOCKER_CONFIG_DIR}/config.json" + fi + echo "DOCKER_CONFIG=${DOCKER_CONFIG_DIR}" >> "$GITHUB_ENV" + export DOCKER_CONFIG="${DOCKER_CONFIG_DIR}" + + if [ -n "${NGC_API_KEY:-}" ]; then + echo "🔵 Logging into nvcr.io..." + docker login -u \$oauthtoken -p "${NGC_API_KEY}" nvcr.io + else + echo "🟠 NGC_API_KEY not set - skipping nvcr.io login (normal for fork PRs)" + fi + + ##### 2: Host disk snapshot (pre) ##### + + - name: Host disk snapshot (pre) + shell: bash run: | - # Only attempt NGC login if API key is available - if [ -n "${{ env.NGC_API_KEY }}" ]; then - echo "Logging into NGC registry..." 
- docker login -u \$oauthtoken -p ${{ env.NGC_API_KEY }} nvcr.io - echo "✅ Successfully logged into NGC registry" + set +e + docker_root=$(docker info --format '{{.DockerRootDir}}' 2>/dev/null || echo "/var/lib/docker") + deps_count=$(docker images --filter 'reference=isaac-lab-ci*:deps-*' -q 2>/dev/null | wc -l) + commit_count=$(docker images --filter 'reference=isaac-lab-ci*' -q 2>/dev/null | wc -l) + { + echo "## Disk snapshot (pre)" + echo '```' + echo "Filesystem:" + df -h / "${docker_root}" 2>/dev/null | sort -u + echo + echo "docker system df:" + docker system df + echo + echo "Tag counts:" + echo " isaac-lab-ci-* (commit + deps tags): ${commit_count}" + echo " isaac-lab-ci*:deps-* (deps cache): ${deps_count}" + echo + echo "Deps tags (newest first):" + docker images --filter 'reference=isaac-lab-ci*:deps-*' \ + --format 'table {{.Repository}}:{{.Tag}}\t{{.Size}}\t{{.CreatedSince}}' 2>/dev/null \ + | head -20 + echo '```' + } | tee -a "$GITHUB_STEP_SUMMARY" + + ##### 3: Local exact-tag short-circuit ##### + + - name: Check image locally + id: local + shell: bash + run: | + if docker image inspect "${{ inputs.image-tag }}" >/dev/null 2>&1; then + echo "🟢 Image already in local docker store: ${{ inputs.image-tag }}" + echo "hit=true" >> "$GITHUB_OUTPUT" else - echo "⚠️ NGC_API_KEY not available - skipping NGC login" - echo "This is normal for PRs from forks or when secrets are not configured" + echo "🔵 Image not present locally, will check deps-cache / build" fi - - name: Build Docker Image - shell: sh + ##### 4: Local deps-tag short-circuit ##### + + - name: Check deps-tag locally + id: local-deps + if: steps.local.outputs.hit != 'true' + shell: bash run: | - # Function to build Docker image - build_docker_image() { - local image_tag="$1" - local isaacsim_base_image="$2" - local isaacsim_version="$3" - local dockerfile_path="$4" - local context_path="$5" - - # Skip build if image already exists locally (e.g. 
built by a prior job on the same runner) - if docker image inspect "$image_tag" > /dev/null 2>&1; then - echo "Image $image_tag already exists locally, skipping build." - return 0 + DEPS_FILES=( + "${{ inputs.dockerfile-path }}" + isaaclab.sh + environment.yml + source/isaaclab/isaaclab/cli + ) + DEPS_MANIFEST_PATTERN='(setup\.py|pyproject\.toml|setup\.cfg|extension\.toml|requirements[^/]*\.txt|uv\.lock)$' + + BASE_IMAGE_DIGEST=$(docker buildx imagetools inspect \ + "${{ inputs.isaacsim-base-image }}:${{ inputs.isaacsim-version }}" \ + --format '{{json .Manifest.Digest}}' 2>/dev/null | tr -d '"' || true) + if [ -n "${BASE_IMAGE_DIGEST}" ]; then + BASE_IMAGE_UNIQ_ID="${{ inputs.isaacsim-base-image }}:${{ inputs.isaacsim-version }}:${BASE_IMAGE_DIGEST}" + else + echo "🟠 Could not resolve base image digest, falling back to tag string" + BASE_IMAGE_UNIQ_ID="${{ inputs.isaacsim-base-image }}:${{ inputs.isaacsim-version }}" + fi + + MANIFEST_FILES=$(git ls-files | grep -E "${DEPS_MANIFEST_PATTERN}" || true) + FILE_HASH=$(git ls-files -s "${DEPS_FILES[@]}" ${MANIFEST_FILES} 2>/dev/null \ + | sha256sum | cut -c1-16) + DEPS_HASH=$(printf '%s %s' "${FILE_HASH}" "${BASE_IMAGE_UNIQ_ID}" | sha256sum | cut -c1-16) + LOCAL_DEPS_TAG="$(echo "${{ inputs.image-tag }}" | cut -d: -f1):deps-${DEPS_HASH}" + + echo "🔵 Local deps tag: ${LOCAL_DEPS_TAG}" + echo "LOCAL_DEPS_TAG=${LOCAL_DEPS_TAG}" >> "$GITHUB_ENV" + + if docker image inspect "${LOCAL_DEPS_TAG}" >/dev/null 2>&1; then + echo "🟢 Local deps-cache HIT! 
Retagging as ${{ inputs.image-tag }}" + docker tag "${LOCAL_DEPS_TAG}" "${{ inputs.image-tag }}" + echo "hit=true" >> "$GITHUB_OUTPUT" + else + echo "🟠 Local deps-cache MISS (will build then tag for future hits)" + fi + + ##### 5: Full build ##### + + - name: Build image + id: build + if: > + steps.local.outputs.hit != 'true' && + steps.local-deps.outputs.hit != 'true' + shell: bash + run: | + BUILD_ARGS=( + --progress=plain + --platform "${{ inputs.platform }}" + -f "${{ inputs.dockerfile-path }}" + --build-arg "ISAACSIM_BASE_IMAGE_ARG=${{ inputs.isaacsim-base-image }}" + --build-arg "ISAACSIM_VERSION_ARG=${{ inputs.isaacsim-version }}" + --build-arg "ISAACSIM_ROOT_PATH_ARG=/isaac-sim" + --build-arg "ISAACLAB_PATH_ARG=/workspace/isaaclab" + --build-arg "DOCKER_USER_HOME_ARG=/root" + -t "${{ inputs.image-tag }}" + ) + if [ -n "${{ inputs.cache-from }}" ]; then + BUILD_ARGS+=( --cache-from "${{ inputs.cache-from }}" ) + fi + if [ -n "${{ inputs.cache-to }}" ]; then + BUILD_ARGS+=( --cache-to "${{ inputs.cache-to }}" ) + fi + + BUILDER_NAME="docker-build-${{ github.run_id }}-${{ github.job }}" + docker buildx create --use --driver docker-container --name "${BUILDER_NAME}" \ + || docker buildx use "${BUILDER_NAME}" + trap 'docker buildx rm "${BUILDER_NAME}" || true' EXIT + + echo "🔵 Building ${{ inputs.image-tag }}..." + docker buildx build --load "${BUILD_ARGS[@]}" "${{ inputs.context-path }}" + echo "was-built=true" >> "$GITHUB_OUTPUT" + + ##### 6: Tag built image with local deps-tag ##### + + # Runs only when a real build happened (not on cache hits). Populates the + # deps-tag so the next build with identical deps short-circuits at step 4. 
+ + - name: Tag built image with local deps-tag + if: steps.build.outputs.was-built == 'true' + shell: bash + run: | + if [ -n "${LOCAL_DEPS_TAG:-}" ]; then + docker tag "${{ inputs.image-tag }}" "${LOCAL_DEPS_TAG}" + echo "🟢 Tagged local deps-cache: ${LOCAL_DEPS_TAG}" + else + echo "🟠 LOCAL_DEPS_TAG not set, skipping local deps-cache tag" + fi + + ##### 6b: Apply extra-tags (for callers that need additional names) ##### + + - name: Apply extra tags + if: inputs.extra-tags != '' + shell: bash + run: | + while IFS= read -r extra; do + [ -z "$extra" ] && continue + docker tag "${{ inputs.image-tag }}" "$extra" + echo "🟢 Tagged: $extra" + done <<< "${{ inputs.extra-tags }}" + + ##### 7: Evict stale local deps-cache tags (>14d) ##### + + - name: Evict stale local deps-cache tags (>14d) + if: always() + shell: bash + run: | + set +e + TTL_DAYS=14 + cutoff=$(date -u -d "${TTL_DAYS} days ago" +%s) + evicted=0 + while IFS='|' read -r created tag; do + [ -z "$tag" ] && continue + created_epoch=$(date -d "$created" +%s 2>/dev/null) || continue + if [ "$created_epoch" -lt "$cutoff" ]; then + days_old=$(( (cutoff - created_epoch) / 86400 + TTL_DAYS )) + echo "🟠 Evicting deps tag (~${days_old}d old): ${tag}" + docker rmi -f "$tag" >/dev/null 2>&1 || true + evicted=$(( evicted + 1 )) fi + done < <(docker images --filter 'reference=isaac-lab-ci*:deps-*' \ + --format '{{.CreatedAt}}|{{.Repository}}:{{.Tag}}' 2>/dev/null) + echo "🔵 Evicted ${evicted} deps tag(s) older than ${TTL_DAYS}d" + + ##### 8: Host disk snapshot (post) ##### - echo "Building Docker image: $image_tag" - echo "Using Dockerfile: $dockerfile_path" - echo "Build context: $context_path" - - # Build Docker image - docker buildx build --progress=plain --platform linux/amd64 \ - -t $image_tag \ - --build-arg ISAACSIM_BASE_IMAGE_ARG="$isaacsim_base_image" \ - --build-arg ISAACSIM_VERSION_ARG="$isaacsim_version" \ - --build-arg ISAACSIM_ROOT_PATH_ARG=/isaac-sim \ - --build-arg ISAACLAB_PATH_ARG=/workspace/isaaclab \ - 
--build-arg DOCKER_USER_HOME_ARG=/root \ - --cache-from type=gha \ - --cache-to type=gha,mode=max \ - -f $dockerfile_path \ - --load $context_path - - echo "✅ Docker image built successfully: $image_tag" - echo "Current local Docker images:" - docker images - } - - # Call the function with provided parameters - build_docker_image "${{ inputs.image-tag }}" "${{ inputs.isaacsim-base-image }}" "${{ inputs.isaacsim-version }}" "${{ inputs.dockerfile-path }}" "${{ inputs.context-path }}" + - name: Host disk snapshot (post) + if: always() + shell: bash + run: | + set +e + docker_root=$(docker info --format '{{.DockerRootDir}}' 2>/dev/null || echo "/var/lib/docker") + deps_count=$(docker images --filter 'reference=isaac-lab-ci*:deps-*' -q 2>/dev/null | wc -l) + commit_count=$(docker images --filter 'reference=isaac-lab-ci*' -q 2>/dev/null | wc -l) + { + echo "## Disk snapshot (post)" + echo '```' + echo "Filesystem:" + df -h / "${docker_root}" 2>/dev/null | sort -u + echo + echo "docker system df:" + docker system df + echo + echo "Tag counts:" + echo " isaac-lab-ci-* (commit + deps tags): ${commit_count}" + echo " isaac-lab-ci*:deps-* (deps cache): ${deps_count}" + echo + echo "Deps tags (newest first):" + docker images --filter 'reference=isaac-lab-ci*:deps-*' \ + --format 'table {{.Repository}}:{{.Tag}}\t{{.Size}}\t{{.CreatedSince}}' 2>/dev/null \ + | head -20 + echo '```' + } | tee -a "$GITHUB_STEP_SUMMARY" diff --git a/.github/actions/ecr-build-push-pull/action.yml b/.github/actions/ecr-build-push-pull/action.yml index 8d37ccb5c924..5b7e9a24dcb5 100644 --- a/.github/actions/ecr-build-push-pull/action.yml +++ b/.github/actions/ecr-build-push-pull/action.yml @@ -5,9 +5,27 @@ name: 'ECR Build-Push-Pull' description: > - Builds a Docker image and pushes it to ECR, using ECR as the layer cache. + Build a Docker image and push it to ECR, using ECR as the layer cache. If the image already exists in ECR (same tag), pulls it instead of building. 
- Drop-in replacement for docker-build/action.yml with ECR-backed caching. + + Composition: + * This action owns the ECR-side concerns (URL resolution, auth, manifest + checks for the per-commit and deps-hash tags, push). + * The actual local build — including local-cache short-circuits, disk + observability, and the post-build deps-tag write — is delegated to + `.github/actions/docker-build`. + + Flow: + 1. Setup docker config + login to nvcr.io. + 2. Resolve ECR URL (input → env var → SSM lookup; falls through if unset). + 3. Login to ECR (when URL was resolved). + 4. Check ECR for the exact per-commit image; if present, pull → done. + 5. Check ECR for the deps-hash tag; if present, registry-side retag to + the commit tag → done. + 6. Otherwise, delegate to docker-build with optional ECR-backed + --cache-from / --cache-to and an ECR-prefixed extra-tag. + 7. Push the commit-tag and the deps-tag to ECR if a build happened. + 8. Cleanup docker config. inputs: image-tag: @@ -30,7 +48,7 @@ inputs: 1. ecr-url input, if provided. 2. ECR_CACHE_URL environment variable on the runner. 3. SSM parameter /github-runner//ecr-cache-url. - 4. If still empty, ECR cache is skipped and the image is built locally. + 4. If still empty, ECR is skipped and the build runs purely locally. required: false default: '' cache-tag: @@ -41,51 +59,18 @@ inputs: description: Target platform for `docker buildx build --platform` (e.g. "linux/amd64", "linux/arm64"). required: false default: 'linux/amd64' + runs: using: composite steps: - ##### 0: Host disk snapshot (pre) ##### - - # Capture host filesystem + docker store usage before the action does any - # work. Output goes to the run's step summary (visible on the PR page) and - # stdout (greppable across runs via `gh run view`). Used to size the - # deps-cache eviction policy once we have a few weeks of data. 
- - - name: Host disk snapshot (pre) - shell: bash - run: | - set +e - docker_root=$(docker info --format '{{.DockerRootDir}}' 2>/dev/null || echo "/var/lib/docker") - deps_count=$(docker images --filter 'reference=isaac-lab-ci*:deps-*' -q 2>/dev/null | wc -l) - commit_count=$(docker images --filter 'reference=isaac-lab-ci*' -q 2>/dev/null | wc -l) - { - echo "## Disk snapshot (pre)" - echo '```' - echo "Filesystem:" - df -h / "${docker_root}" 2>/dev/null | sort -u - echo - echo "docker system df:" - docker system df - echo - echo "Tag counts:" - echo " isaac-lab-ci-* (commit + deps tags): ${commit_count}" - echo " *:deps-* (deps cache): ${deps_count}" - echo - echo "Deps tags (newest first):" - docker images --filter 'reference=isaac-lab-ci*:deps-*' \ - --format 'table {{.Repository}}:{{.Tag}}\t{{.Size}}\t{{.CreatedSince}}' 2>/dev/null \ - | head -20 - echo '```' - } | tee -a "$GITHUB_STEP_SUMMARY" - ##### 1: Setup docker config + Login to nvcr.io ##### # Create a temp docker config with credsStore disabled before any login. # The runner's credential store backend is broken ("not implemented") and # causes all docker login calls to fail unless we bypass it upfront. - # The temp config is exported as DOCKER_CONFIG so all subsequent steps - # (including ECR login in step 3) inherit it automatically. + # Exported as DOCKER_CONFIG so the ECR login (step 3) and the delegated + # docker-build (step 6, with skip-docker-config=true) inherit it. - name: Setup docker config and login to nvcr.io shell: bash @@ -106,78 +91,6 @@ runs: echo "🟠 NGC_API_KEY not set - skipping nvcr.io login (normal for fork PRs)" fi - ##### 1.5: Local-image short-circuit ##### - - # Fast path: if the exact image-tag is already in the local docker store - # (built by an earlier job on this same self-hosted runner), skip ECR - # resolution, manifest checks, deps-cache lookup, build, and push entirely. 
- # Image tags are commit-pinned (CI_IMAGE_TAG in build.yaml / arm-ci.yaml), - # so a local hit is safe: every code, Dockerfile, dep, or config.yaml - # change produces a new SHA and a new tag. - - - name: Check image locally - id: local - shell: bash - run: | - if docker image inspect "${{ inputs.image-tag }}" >/dev/null 2>&1; then - echo "🟢 Image already in local docker store: ${{ inputs.image-tag }}" - echo "hit=true" >> "$GITHUB_OUTPUT" - else - echo "🔵 Image not present locally, will try ECR / build paths" - fi - - ##### 1.6: Local deps-cache short-circuit ##### - - # Equivalent of step 5 (ECR deps-cache), but checked against the local - # docker store. When ECR is unreachable (today's arm64 path), this is the - # only way to reuse work across different commit SHAs of the same PR when - # dependencies haven't changed. - # - # Hash inputs match step 5 exactly. If the deps- tag exists locally, - # retag it as the requested commit-tag (metadata-only operation) so the - # rest of the action treats it as a cache hit. Otherwise, export - # LOCAL_DEPS_TAG so step 6.5 can populate the cache after a full build. 
- - - name: Check deps-tag locally - id: local-deps - if: steps.local.outputs.hit != 'true' - shell: bash - run: | - DEPS_FILES=( - "${{ inputs.dockerfile-path }}" - isaaclab.sh - environment.yml - source/isaaclab/isaaclab/cli - ) - DEPS_MANIFEST_PATTERN='(setup\.py|pyproject\.toml|setup\.cfg|extension\.toml|requirements[^/]*\.txt|uv\.lock)$' - - BASE_IMAGE_DIGEST=$(docker buildx imagetools inspect \ - "${{ inputs.isaacsim-base-image }}:${{ inputs.isaacsim-version }}" \ - --format '{{json .Manifest.Digest}}' 2>/dev/null | tr -d '"' || true) - if [ -n "${BASE_IMAGE_DIGEST}" ]; then - BASE_IMAGE_UNIQ_ID="${{ inputs.isaacsim-base-image }}:${{ inputs.isaacsim-version }}:${BASE_IMAGE_DIGEST}" - else - echo "🟠 Could not resolve base image digest, falling back to tag string" - BASE_IMAGE_UNIQ_ID="${{ inputs.isaacsim-base-image }}:${{ inputs.isaacsim-version }}" - fi - - MANIFEST_FILES=$(git ls-files | grep -E "${DEPS_MANIFEST_PATTERN}" || true) - FILE_HASH=$(git ls-files -s "${DEPS_FILES[@]}" ${MANIFEST_FILES} 2>/dev/null \ - | sha256sum | cut -c1-16) - DEPS_HASH=$(printf '%s %s' "${FILE_HASH}" "${BASE_IMAGE_UNIQ_ID}" | sha256sum | cut -c1-16) - LOCAL_DEPS_TAG="$(echo "${{ inputs.image-tag }}" | cut -d: -f1):deps-${DEPS_HASH}" - - echo "🔵 Local deps tag: ${LOCAL_DEPS_TAG}" - echo "LOCAL_DEPS_TAG=${LOCAL_DEPS_TAG}" >> "$GITHUB_ENV" - - if docker image inspect "${LOCAL_DEPS_TAG}" >/dev/null 2>&1; then - echo "🟢 Local deps-cache HIT! Retagging as ${{ inputs.image-tag }}" - docker tag "${LOCAL_DEPS_TAG}" "${{ inputs.image-tag }}" - echo "hit=true" >> "$GITHUB_OUTPUT" - else - echo "🟠 Local deps-cache MISS (will build then tag for future hits)" - fi - ##### 2: Resolve ECR URL ##### # Tries: explicit input >> ECR_CACHE_URL env var >> SSM parameter on EC2. 
@@ -185,9 +98,6 @@ runs: - name: Resolve ECR URL id: resolve-ecr - if: > - steps.local.outputs.hit != 'true' && - steps.local-deps.outputs.hit != 'true' shell: bash env: INPUT_ECR_URL: ${{ inputs.ecr-url }} @@ -295,33 +205,29 @@ runs: echo "🟢 Image pulled and tagged as ${{ inputs.image-tag }}" fi - ##### 5: Check deps cache ##### - - # Hashes installation-relevant files + the base image digest to produce a stable - # deps-<hash> ECR tag. If the image exists in ECR, the build job succeeds - # immediately and test jobs pull the deps image with a source volume mount. + ##### 5: Check deps cache (registry-side) ##### - # Edit DEPS_FILES or DEPS_MANIFEST_PATTERN when install - # inputs change (new packages, new manifests, etc.). + # Hashes installation-relevant files + the base image digest to produce a + # stable deps-<hash> ECR tag. If the image exists in ECR, alias it as the + # commit-tag (registry-side, no layer download). Hash schema must stay + # in sync with .github/actions/docker-build (step 4 there). + # + # Edit DEPS_FILES or DEPS_MANIFEST_PATTERN when install inputs change + # (new packages, new manifests, etc.). - name: Check deps cache id: deps-cache if: steps.resolve-ecr.outputs.available == 'true' && steps.pull-exact.outputs.hit != 'true' shell: bash run: | - ##### Deps-hash configuration ##### - # Exact files/dirs whose full content is hashed. The Dockerfile is first. DEPS_FILES=( "${{ inputs.dockerfile-path }}" isaaclab.sh environment.yml source/isaaclab/isaaclab/cli ) - # Manifest files matched repo-wide via git ls-files. DEPS_MANIFEST_PATTERN='(setup\.py|pyproject\.toml|setup\.cfg|extension\.toml|requirements[^/]*\.txt|uv\.lock)$' - # Resolve the actual base image digest so a new push of a mutable tag - # (e.g. latest-develop) invalidates the deps cache automatically. 
BASE_IMAGE_DIGEST=$(docker buildx imagetools inspect \ "${{ inputs.isaacsim-base-image }}:${{ inputs.isaacsim-version }}" \ --format '{{json .Manifest.Digest}}' 2>/dev/null | tr -d '"' || true) @@ -342,12 +248,8 @@ runs: echo "🔵 Deps hash: ${DEPS_HASH}" echo "🔵 Checking if deps image ${DEPS_ECR_IMAGE} exists in ECR..." - # Lightweight manifest check - fetches only the image manifest (~KB), - # not the actual layers, so this completes in seconds. if docker manifest inspect "${DEPS_ECR_IMAGE}" >/dev/null 2>&1; then echo "🟢 Deps cache HIT!!! Image exists in ECR: ${DEPS_ECR_IMAGE}" - # Create a commit-tagged alias pointing to the same manifest (registry-side, - # no layer download). Test jobs will pull this tag normally. echo "🔵 Tagging as commit image ${ECR_IMAGE}..." docker buildx imagetools create -t "${ECR_IMAGE}" "${DEPS_ECR_IMAGE}" echo "🟢 Tagged ${ECR_IMAGE} >> ${DEPS_ECR_IMAGE}" @@ -358,72 +260,35 @@ runs: echo "PUSH_DEPS_IMAGE=true" >> "$GITHUB_ENV" fi - ##### 6: Full build ##### - - # Runs when neither the exact image nor the deps cache was available. - # Uses ECR layer cache (--cache-from/--cache-to) when ECR is available. 
- - - name: Full build - if: > - steps.local.outputs.hit != 'true' && - steps.local-deps.outputs.hit != 'true' && - steps.pull-exact.outputs.hit != 'true' && - steps.deps-cache.outputs.deps-cache-hit != 'true' - shell: bash - run: | - BUILD_ARGS=( - --progress=plain - --platform ${{ inputs.platform }} - -f "${{ inputs.dockerfile-path }}" - --build-arg "ISAACSIM_BASE_IMAGE_ARG=${{ inputs.isaacsim-base-image }}" - --build-arg "ISAACSIM_VERSION_ARG=${{ inputs.isaacsim-version }}" - --build-arg "ISAACSIM_ROOT_PATH_ARG=/isaac-sim" - --build-arg "ISAACLAB_PATH_ARG=/workspace/isaaclab" - --build-arg "DOCKER_USER_HOME_ARG=/root" - -t "${{ inputs.image-tag }}" - ) - if [ -n "${ECR_URL:-}" ]; then - BUILD_ARGS+=( - --cache-from "type=registry,ref=${CACHE_IMAGE}" - --cache-to "type=registry,ref=${CACHE_IMAGE},mode=max" - -t "${ECR_IMAGE}" - ) - fi - - BUILDER_NAME="ci-builder-${{ github.run_id }}-${{ github.job }}" - docker buildx create --use --driver docker-container --name "${BUILDER_NAME}" \ - || docker buildx use "${BUILDER_NAME}" - trap 'docker buildx rm "${BUILDER_NAME}" || true' EXIT + ##### 6: Local build (delegated to docker-build) ##### - echo "🔵 Building ${{ inputs.image-tag }}..." - docker buildx build --load "${BUILD_ARGS[@]}" . - - ##### 6.5: Tag built image with local deps-tag ##### - - # Local mirror of step 8 (Push deps tag). After a successful full build, - # tag the resulting image as deps- in the local docker store so the - # next build on this host with identical deps short-circuits via step 1.6, - # regardless of whether ECR is reachable. 
- - - name: Tag built image with local deps-tag - if: > - steps.local.outputs.hit != 'true' && - steps.local-deps.outputs.hit != 'true' && - steps.pull-exact.outputs.hit != 'true' && - steps.deps-cache.outputs.deps-cache-hit != 'true' - shell: bash - run: | - if [ -n "${LOCAL_DEPS_TAG:-}" ]; then - docker tag "${{ inputs.image-tag }}" "${LOCAL_DEPS_TAG}" - echo "🟢 Tagged local deps-cache: ${LOCAL_DEPS_TAG}" - else - echo "🟠 LOCAL_DEPS_TAG not set, skipping local deps-cache tag" - fi - - ##### 7: Push to ECR ##### - - # Pushes the per-commit ECR image after a successful full build. - # Skipped if the image was pulled in (4). + # Only runs when neither ECR cache lookup hit. docker-build does its own + # local-exact / local-deps short-circuits, then either retags or builds. + # + # When ECR is available we layer in registry-side buildx layer cache via + # --cache-from / --cache-to, and pass the ECR-prefixed name as an extra + # tag so the push step has a tagged image to push. + + - name: Build (via docker-build) + id: dbuild + if: steps.pull-exact.outputs.hit != 'true' && steps.deps-cache.outputs.deps-cache-hit != 'true' + uses: ./.github/actions/docker-build + with: + image-tag: ${{ inputs.image-tag }} + isaacsim-base-image: ${{ inputs.isaacsim-base-image }} + isaacsim-version: ${{ inputs.isaacsim-version }} + dockerfile-path: ${{ inputs.dockerfile-path }} + platform: ${{ inputs.platform }} + skip-docker-config: 'true' + cache-from: ${{ steps.resolve-ecr.outputs.available == 'true' && format('type=registry,ref={0}', env.CACHE_IMAGE) || '' }} + cache-to: ${{ steps.resolve-ecr.outputs.available == 'true' && format('type=registry,ref={0},mode=max', env.CACHE_IMAGE) || '' }} + extra-tags: ${{ steps.resolve-ecr.outputs.available == 'true' && env.ECR_IMAGE || '' }} + + ##### 7: Push to ECR (commit tag) ##### + + # Push when ECR is available and neither ECR cache lookup served us. 
+ # The image carries the ECR-prefixed tag because step 6 passed it via + # docker-build's extra-tags input. - name: Push to ECR if: > @@ -436,9 +301,9 @@ runs: docker push "${ECR_IMAGE}" echo "🟢 Pushed ${ECR_IMAGE}" - ##### 8: Push deps tag ##### + ##### 8: Push deps tag to ECR ##### - # Tags the freshly built image as deps- so future runs with identical + # Tag the freshly built image as deps- so future runs with identical # install inputs hit the fast path (step 5) instead of doing a full build. - name: Push deps tag @@ -458,65 +323,3 @@ runs: if [ -n "${DOCKER_CONFIG}" ] && [ -d "${DOCKER_CONFIG}" ]; then rm -rf "${DOCKER_CONFIG}" fi - - ##### 10: Evict stale local deps-cache tags ##### - - # 14-day TTL on `*:deps-*` images. Prevents unbounded disk growth on - # long-lived self-hosted runners as new deps hashes land over time. - # Re-acquiring an evicted tag costs one full build (~13 min on arm64, - # registry pull on Linux), which is acceptable given the hit rate. - # Runs `if: always()` so cleanup happens even on failed jobs. - - - name: Evict stale local deps-cache tags (>14d) - if: always() - shell: bash - run: | - set +e - TTL_DAYS=14 - cutoff=$(date -u -d "${TTL_DAYS} days ago" +%s) - evicted=0 - while IFS='|' read -r created tag; do - [ -z "$tag" ] && continue - created_epoch=$(date -d "$created" +%s 2>/dev/null) || continue - if [ "$created_epoch" -lt "$cutoff" ]; then - days_old=$(( (cutoff - created_epoch) / 86400 + TTL_DAYS )) - echo "🟠 Evicting deps tag (~${days_old}d old): ${tag}" - docker rmi -f "$tag" >/dev/null 2>&1 || true - evicted=$(( evicted + 1 )) - fi - done < <(docker images --filter 'reference=isaac-lab-ci*:deps-*' \ - --format '{{.CreatedAt}}|{{.Repository}}:{{.Tag}}' 2>/dev/null) - echo "🔵 Evicted ${evicted} deps tag(s) older than ${TTL_DAYS}d" - - ##### 11: Host disk snapshot (post) ##### - - # Mirror of step 0. Same schema so the pre/post pair is diffable. The - # delta (post - pre) is one job's net contribution to host disk. 
- - - name: Host disk snapshot (post) - if: always() - shell: bash - run: | - set +e - docker_root=$(docker info --format '{{.DockerRootDir}}' 2>/dev/null || echo "/var/lib/docker") - deps_count=$(docker images --filter 'reference=isaac-lab-ci*:deps-*' -q 2>/dev/null | wc -l) - commit_count=$(docker images --filter 'reference=isaac-lab-ci*' -q 2>/dev/null | wc -l) - { - echo "## Disk snapshot (post)" - echo '```' - echo "Filesystem:" - df -h / "${docker_root}" 2>/dev/null | sort -u - echo - echo "docker system df:" - docker system df - echo - echo "Tag counts:" - echo " isaac-lab-ci-* (commit + deps tags): ${commit_count}" - echo " *:deps-* (deps cache): ${deps_count}" - echo - echo "Deps tags (newest first):" - docker images --filter 'reference=isaac-lab-ci*:deps-*' \ - --format 'table {{.Repository}}:{{.Tag}}\t{{.Size}}\t{{.CreatedSince}}' 2>/dev/null \ - | head -20 - echo '```' - } | tee -a "$GITHUB_STEP_SUMMARY" diff --git a/.github/workflows/arm-ci.yaml b/.github/workflows/arm-ci.yaml index 0d370a5d6df4..a3f3b95abcf0 100644 --- a/.github/workflows/arm-ci.yaml +++ b/.github/workflows/arm-ci.yaml @@ -68,7 +68,7 @@ jobs: ^docker/ Container build inputs (^|/)pyproject\.toml$ Python project metadata ^\.github/workflows/arm-ci\.yaml$ This workflow file - ^\.github/actions/ecr-build-push-pull/ ECR action + ^\.github/actions/docker-build/ Docker build action ^\.github/actions/detect-changes/ Change-detection action ^VERSION$ Version file @@ -114,13 +114,12 @@ jobs: lfs: true - name: Build Base Docker image - uses: ./.github/actions/ecr-build-push-pull + uses: ./.github/actions/docker-build with: image-tag: ${{ env.CI_IMAGE_TAG }} isaacsim-base-image: ${{ needs.config.outputs.isaacsim_image_name }} isaacsim-version: ${{ needs.config.outputs.isaacsim_image_tag }} dockerfile-path: docker/Dockerfile.base - cache-tag: cache-base-arm64 platform: linux/arm64 # tools/conftest.py's subprocess-per-file orchestrator picks up From 9eaa4ae0cd98702529780b8bcad639e5ddf37fb7 Mon Sep 
17 00:00:00 2001 From: jichuanh Date: Wed, 27 May 2026 05:55:58 +0000 Subject: [PATCH 27/55] daily-compatibility: opt in to GHA cache after docker-build refactor MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit The previous docker-build action hardcoded `--cache-from type=gha --cache-to type=gha,mode=max`. After the recent split, docker-build defaults to no cache (callers pass cache-from / cache-to explicitly). Restore GHA cache for daily-compatibility by passing the two cache inputs at both call sites. Explicit at the call site rather than implicit-default in the action — callers that don't want GHA cache (arm-ci on self-hosted ARM, where GHA cache writes from self-hosted runners are slow and bandwidth-charged) just omit the inputs. --- .github/workflows/daily-compatibility.yml | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/.github/workflows/daily-compatibility.yml b/.github/workflows/daily-compatibility.yml index b85ba3f3b49a..e3419754e914 100644 --- a/.github/workflows/daily-compatibility.yml +++ b/.github/workflows/daily-compatibility.yml @@ -101,6 +101,8 @@ jobs: image-tag: ${{ env.DOCKER_IMAGE_TAG }} isaacsim-base-image: ${{ needs.config.outputs.isaacsim_image_name }} isaacsim-version: ${{ matrix.isaacsim_version }} + cache-from: type=gha + cache-to: type=gha,mode=max - name: Run IsaacLab Tasks Tests uses: ./.github/actions/run-tests @@ -159,6 +161,8 @@ jobs: image-tag: ${{ env.DOCKER_IMAGE_TAG }} isaacsim-base-image: ${{ needs.config.outputs.isaacsim_image_name }} isaacsim-version: ${{ matrix.isaacsim_version }} + cache-from: type=gha + cache-to: type=gha,mode=max - name: Run General Tests uses: ./.github/actions/run-tests From a5491bfcfae2fbaa772473cb6a8a9a4a13d457e9 Mon Sep 17 00:00:00 2001 From: jichuanh Date: Wed, 27 May 2026 06:01:51 +0000 Subject: [PATCH 28/55] docker-build: broaden eviction + snapshot filter to isaac-lab* MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 
Content-Transfer-Encoding: 8bit The previous filter `isaac-lab-ci*:deps-*` covered build.yaml's `isaac-lab-ci:*` tags and arm-ci's `isaac-lab-ci-arm64:*` tags, but missed daily-compatibility's `isaac-lab-compat:*` tags. Two consequences for daily-compat: - The 14-day deps-tag eviction never matched, so daily-compat's per-IsaacSim-version deps tags would accumulate forever (~2/day × ~8 GB each on every gpu runner that ran a daily). - The pre/post disk-snapshot counts undercounted by skipping the daily-compat tags. Broaden to `isaac-lab*:deps-*`. Still scoped to our naming convention (no other tenant uses an `isaac-lab` prefix), so we won't evict unrelated images. Filter label in the snapshot table updated to match. Also add a TODO in daily-compatibility.yml noting that this workflow could migrate to ecr-build-push-pull once it's ready to share the ECR registry cache with build.yaml. --- .github/actions/docker-build/action.yml | 24 +++++++++++------------ .github/workflows/daily-compatibility.yml | 8 ++++++++ 2 files changed, 20 insertions(+), 12 deletions(-) diff --git a/.github/actions/docker-build/action.yml b/.github/actions/docker-build/action.yml index 5e123394ba9e..2af59890759e 100644 --- a/.github/actions/docker-build/action.yml +++ b/.github/actions/docker-build/action.yml @@ -26,7 +26,7 @@ description: > in a registry-side build cache (e.g. ECR). 6. Post-build: tag the result as `:deps-` so future builds with identical deps short-circuit at step 4. - 7. Evict `isaac-lab-ci*:deps-*` tags older than 14 days to bound disk + 7. Evict `isaac-lab*:deps-*` tags older than 14 days to bound disk growth on long-lived hosts. 8. Post-build disk snapshot. 
@@ -129,8 +129,8 @@ runs: run: | set +e docker_root=$(docker info --format '{{.DockerRootDir}}' 2>/dev/null || echo "/var/lib/docker") - deps_count=$(docker images --filter 'reference=isaac-lab-ci*:deps-*' -q 2>/dev/null | wc -l) - commit_count=$(docker images --filter 'reference=isaac-lab-ci*' -q 2>/dev/null | wc -l) + deps_count=$(docker images --filter 'reference=isaac-lab*:deps-*' -q 2>/dev/null | wc -l) + commit_count=$(docker images --filter 'reference=isaac-lab*' -q 2>/dev/null | wc -l) { echo "## Disk snapshot (pre)" echo '```' @@ -141,11 +141,11 @@ runs: docker system df echo echo "Tag counts:" - echo " isaac-lab-ci-* (commit + deps tags): ${commit_count}" - echo " isaac-lab-ci*:deps-* (deps cache): ${deps_count}" + echo " isaac-lab* (commit + deps tags): ${commit_count}" + echo " isaac-lab*:deps-* (deps cache) : ${deps_count}" echo echo "Deps tags (newest first):" - docker images --filter 'reference=isaac-lab-ci*:deps-*' \ + docker images --filter 'reference=isaac-lab*:deps-*' \ --format 'table {{.Repository}}:{{.Tag}}\t{{.Size}}\t{{.CreatedSince}}' 2>/dev/null \ | head -20 echo '```' @@ -289,7 +289,7 @@ runs: docker rmi -f "$tag" >/dev/null 2>&1 || true evicted=$(( evicted + 1 )) fi - done < <(docker images --filter 'reference=isaac-lab-ci*:deps-*' \ + done < <(docker images --filter 'reference=isaac-lab*:deps-*' \ --format '{{.CreatedAt}}|{{.Repository}}:{{.Tag}}' 2>/dev/null) echo "🔵 Evicted ${evicted} deps tag(s) older than ${TTL_DAYS}d" @@ -301,8 +301,8 @@ runs: run: | set +e docker_root=$(docker info --format '{{.DockerRootDir}}' 2>/dev/null || echo "/var/lib/docker") - deps_count=$(docker images --filter 'reference=isaac-lab-ci*:deps-*' -q 2>/dev/null | wc -l) - commit_count=$(docker images --filter 'reference=isaac-lab-ci*' -q 2>/dev/null | wc -l) + deps_count=$(docker images --filter 'reference=isaac-lab*:deps-*' -q 2>/dev/null | wc -l) + commit_count=$(docker images --filter 'reference=isaac-lab*' -q 2>/dev/null | wc -l) { echo "## Disk snapshot 
(post)" echo '```' @@ -313,11 +313,11 @@ runs: docker system df echo echo "Tag counts:" - echo " isaac-lab-ci-* (commit + deps tags): ${commit_count}" - echo " isaac-lab-ci*:deps-* (deps cache): ${deps_count}" + echo " isaac-lab* (commit + deps tags): ${commit_count}" + echo " isaac-lab*:deps-* (deps cache) : ${deps_count}" echo echo "Deps tags (newest first):" - docker images --filter 'reference=isaac-lab-ci*:deps-*' \ + docker images --filter 'reference=isaac-lab*:deps-*' \ --format 'table {{.Repository}}:{{.Tag}}\t{{.Size}}\t{{.CreatedSince}}' 2>/dev/null \ | head -20 echo '```' diff --git a/.github/workflows/daily-compatibility.yml b/.github/workflows/daily-compatibility.yml index e3419754e914..29762620758c 100644 --- a/.github/workflows/daily-compatibility.yml +++ b/.github/workflows/daily-compatibility.yml @@ -95,6 +95,10 @@ jobs: fetch-depth: 1 lfs: true + # TODO: migrate to ./.github/actions/ecr-build-push-pull once daily- + # compat is ready to share the ECR registry cache with build.yaml. Today + # it uses docker-build directly with the GHA cache backend, which works + # but doesn't benefit from build.yaml's cross-runner ECR layer cache. - name: Build Docker Image uses: ./.github/actions/docker-build with: @@ -155,6 +159,10 @@ jobs: fetch-depth: 1 lfs: true + # TODO: migrate to ./.github/actions/ecr-build-push-pull once daily- + # compat is ready to share the ECR registry cache with build.yaml. Today + # it uses docker-build directly with the GHA cache backend, which works + # but doesn't benefit from build.yaml's cross-runner ECR layer cache. 
- name: Build Docker Image uses: ./.github/actions/docker-build with: From a1eec9fbe67a3646f8c3fae504e2706f9bad8185 Mon Sep 17 00:00:00 2001 From: jichuanh Date: Wed, 27 May 2026 06:13:35 +0000 Subject: [PATCH 29/55] ecr-build-push-pull: revert incidental edits, keep minimal delegation The previous refactor commit (3afe9b36b1) rewrote the action's description, dropped or reworded several comments, and added an unused `platform` input on top of the actual functional change. Restore origin/develop's text exactly, and apply ONLY the necessary patch: replace the inline `Full build` step with a `uses:` of the local docker-build action, passing through the ECR cache refs and the ECR-prefixed extra tag. Behavior of every other step is identical to origin/develop. --- .../actions/ecr-build-push-pull/action.yml | 84 +++++++------------ 1 file changed, 32 insertions(+), 52 deletions(-) diff --git a/.github/actions/ecr-build-push-pull/action.yml b/.github/actions/ecr-build-push-pull/action.yml index 5b7e9a24dcb5..6051c984c510 100644 --- a/.github/actions/ecr-build-push-pull/action.yml +++ b/.github/actions/ecr-build-push-pull/action.yml @@ -5,27 +5,9 @@ name: 'ECR Build-Push-Pull' description: > - Build a Docker image and push it to ECR, using ECR as the layer cache. + Builds a Docker image and pushes it to ECR, using ECR as the layer cache. If the image already exists in ECR (same tag), pulls it instead of building. - - Composition: - * This action owns the ECR-side concerns (URL resolution, auth, manifest - checks for the per-commit and deps-hash tags, push). - * The actual local build — including local-cache short-circuits, disk - observability, and the post-build deps-tag write — is delegated to - `.github/actions/docker-build`. - - Flow: - 1. Setup docker config + login to nvcr.io. - 2. Resolve ECR URL (input → env var → SSM lookup; falls through if unset). - 3. Login to ECR (when URL was resolved). - 4. Check ECR for the exact per-commit image; if present, pull → done. 
- 5. Check ECR for the deps-hash tag; if present, registry-side retag to - the commit tag → done. - 6. Otherwise, delegate to docker-build with optional ECR-backed - --cache-from / --cache-to and an ECR-prefixed extra-tag. - 7. Push the commit-tag and the deps-tag to ECR if a build happened. - 8. Cleanup docker config. + Drop-in replacement for docker-build/action.yml with ECR-backed caching. inputs: image-tag: @@ -48,18 +30,13 @@ inputs: 1. ecr-url input, if provided. 2. ECR_CACHE_URL environment variable on the runner. 3. SSM parameter /github-runner//ecr-cache-url. - 4. If still empty, ECR is skipped and the build runs purely locally. + 4. If still empty, ECR cache is skipped and the image is built locally. required: false default: '' cache-tag: description: Tag used for the ECR layer cache image (e.g. "cache-base", "cache-curobo"). required: false default: 'cache' - platform: - description: Target platform for `docker buildx build --platform` (e.g. "linux/amd64", "linux/arm64"). - required: false - default: 'linux/amd64' - runs: using: composite steps: @@ -69,8 +46,8 @@ runs: # Create a temp docker config with credsStore disabled before any login. # The runner's credential store backend is broken ("not implemented") and # causes all docker login calls to fail unless we bypass it upfront. - # Exported as DOCKER_CONFIG so the ECR login (step 3) and the delegated - # docker-build (step 6, with skip-docker-config=true) inherit it. + # The temp config is exported as DOCKER_CONFIG so all subsequent steps + # (including ECR login in step 3) inherit it automatically. - name: Setup docker config and login to nvcr.io shell: bash @@ -205,29 +182,33 @@ runs: echo "🟢 Image pulled and tagged as ${{ inputs.image-tag }}" fi - ##### 5: Check deps cache (registry-side) ##### + ##### 5: Check deps cache ##### + + # Hashes installation-relevant files + the base image digest to produce a stable + # deps- ECR tag. 
If the image exists in ECR, the build job succeeds + # immediately and test jobs pull the deps image with a source volume mount. - # Hashes installation-relevant files + the base image digest to produce a - # stable deps- ECR tag. If the image exists in ECR, alias it as the - # commit-tag (registry-side, no layer download). Hash schema must stay - # in sync with .github/actions/docker-build (step 4 there). - # - # Edit DEPS_FILES or DEPS_MANIFEST_PATTERN when install inputs change - # (new packages, new manifests, etc.). + # Edit DEPS_FILES or DEPS_MANIFEST_PATTERN when install + # inputs change (new packages, new manifests, etc.). - name: Check deps cache id: deps-cache if: steps.resolve-ecr.outputs.available == 'true' && steps.pull-exact.outputs.hit != 'true' shell: bash run: | + ##### Deps-hash configuration ##### + # Exact files/dirs whose full content is hashed. The Dockerfile is first. DEPS_FILES=( "${{ inputs.dockerfile-path }}" isaaclab.sh environment.yml source/isaaclab/isaaclab/cli ) + # Manifest files matched repo-wide via git ls-files. DEPS_MANIFEST_PATTERN='(setup\.py|pyproject\.toml|setup\.cfg|extension\.toml|requirements[^/]*\.txt|uv\.lock)$' + # Resolve the actual base image digest so a new push of a mutable tag + # (e.g. latest-develop) invalidates the deps cache automatically. BASE_IMAGE_DIGEST=$(docker buildx imagetools inspect \ "${{ inputs.isaacsim-base-image }}:${{ inputs.isaacsim-version }}" \ --format '{{json .Manifest.Digest}}' 2>/dev/null | tr -d '"' || true) @@ -248,8 +229,12 @@ runs: echo "🔵 Deps hash: ${DEPS_HASH}" echo "🔵 Checking if deps image ${DEPS_ECR_IMAGE} exists in ECR..." + # Lightweight manifest check - fetches only the image manifest (~KB), + # not the actual layers, so this completes in seconds. if docker manifest inspect "${DEPS_ECR_IMAGE}" >/dev/null 2>&1; then echo "🟢 Deps cache HIT!!! 
Image exists in ECR: ${DEPS_ECR_IMAGE}" + # Create a commit-tagged alias pointing to the same manifest (registry-side, + # no layer download). Test jobs will pull this tag normally. echo "🔵 Tagging as commit image ${ECR_IMAGE}..." docker buildx imagetools create -t "${ECR_IMAGE}" "${DEPS_ECR_IMAGE}" echo "🟢 Tagged ${ECR_IMAGE} >> ${DEPS_ECR_IMAGE}" @@ -260,17 +245,14 @@ runs: echo "PUSH_DEPS_IMAGE=true" >> "$GITHUB_ENV" fi - ##### 6: Local build (delegated to docker-build) ##### + ##### 6: Full build (delegated to docker-build) ##### - # Only runs when neither ECR cache lookup hit. docker-build does its own - # local-exact / local-deps short-circuits, then either retags or builds. - # - # When ECR is available we layer in registry-side buildx layer cache via - # --cache-from / --cache-to, and pass the ECR-prefixed name as an extra - # tag so the push step has a tagged image to push. + # Runs when neither the exact image nor the deps cache was available. + # docker-build does the actual buildx invocation; we pass ECR layer-cache + # refs and the ECR-prefixed tag so the push steps below have something to + # push. 
- - name: Build (via docker-build) - id: dbuild + - name: Full build if: steps.pull-exact.outputs.hit != 'true' && steps.deps-cache.outputs.deps-cache-hit != 'true' uses: ./.github/actions/docker-build with: @@ -278,17 +260,15 @@ runs: isaacsim-base-image: ${{ inputs.isaacsim-base-image }} isaacsim-version: ${{ inputs.isaacsim-version }} dockerfile-path: ${{ inputs.dockerfile-path }} - platform: ${{ inputs.platform }} skip-docker-config: 'true' cache-from: ${{ steps.resolve-ecr.outputs.available == 'true' && format('type=registry,ref={0}', env.CACHE_IMAGE) || '' }} cache-to: ${{ steps.resolve-ecr.outputs.available == 'true' && format('type=registry,ref={0},mode=max', env.CACHE_IMAGE) || '' }} extra-tags: ${{ steps.resolve-ecr.outputs.available == 'true' && env.ECR_IMAGE || '' }} - ##### 7: Push to ECR (commit tag) ##### + ##### 7: Push to ECR ##### - # Push when ECR is available and neither ECR cache lookup served us. - # The image carries the ECR-prefixed tag because step 6 passed it via - # docker-build's extra-tags input. + # Pushes the per-commit ECR image after a successful full build. + # Skipped if the image was pulled in (4). - name: Push to ECR if: > @@ -301,9 +281,9 @@ runs: docker push "${ECR_IMAGE}" echo "🟢 Pushed ${ECR_IMAGE}" - ##### 8: Push deps tag to ECR ##### + ##### 8: Push deps tag ##### - # Tag the freshly built image as deps- so future runs with identical + # Tags the freshly built image as deps- so future runs with identical # install inputs hit the fast path (step 5) instead of doing a full build. 
- name: Push deps tag From 08aa1017bbb5d148d06ee67b4a5a15595d9155de Mon Sep 17 00:00:00 2001 From: jichuanh Date: Wed, 27 May 2026 06:16:25 +0000 Subject: [PATCH 30/55] daily-compatibility: trigger on PRs that change its dependencies MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Add a `pull_request:` trigger with a narrow `paths:` filter covering the workflow file itself, the config.yaml it reads, and the three shared actions it composes (docker-build, run-tests, combine-results). This lets refactors of the docker-build or run-tests actions exercise the daily-compat call sites pre-merge instead of waiting for the next nightly cron to surface breakage. Source-only PRs are unaffected — the paths filter excludes everything outside the workflow's own dependency graph, so a typical `source/...` change still doesn't burn the four-cell gpu matrix. Safe to gate by paths here because daily-compatibility is not a required branch-protection check; a not-triggered run does not block merges. (Compare with build.yaml / arm-ci.yaml, which always run a cheap detect-changes job + conditional heavy jobs precisely because they ARE required.) --- .github/workflows/daily-compatibility.yml | 19 +++++++++++++++++++ 1 file changed, 19 insertions(+) diff --git a/.github/workflows/daily-compatibility.yml b/.github/workflows/daily-compatibility.yml index 29762620758c..b2f4c275fe77 100644 --- a/.github/workflows/daily-compatibility.yml +++ b/.github/workflows/daily-compatibility.yml @@ -10,6 +10,25 @@ on: # Run daily at 8 PM PST (4 AM UTC) - cron: '0 4 * * *' + # Also fire on PRs that change the workflow itself or the actions / + # config it depends on, so refactors get validated before merge. + # Narrow `paths:` keeps normal source-only PRs from spinning up the + # heavy compatibility matrix. This trigger is safe to gate by paths + # because the workflow is NOT a required branch-protection check — + # a not-triggered run does not block merges. 
+ pull_request: + types: [opened, synchronize, reopened] + branches: + - main + - develop + - 'release/**' + paths: + - '.github/workflows/daily-compatibility.yml' + - '.github/workflows/config.yaml' + - '.github/actions/docker-build/**' + - '.github/actions/run-tests/**' + - '.github/actions/combine-results/**' + workflow_dispatch: inputs: isaacsim_version: From 4f528eb83190ebee6b025a2fc546603e75a52e44 Mon Sep 17 00:00:00 2001 From: jichuanh Date: Wed, 27 May 2026 06:18:38 +0000 Subject: [PATCH 31/55] daily-compatibility: drop auto-trigger, document manual run on changes Revert the `pull_request:` + `paths:` trigger added in 08aa1017bbb. The auto-trigger was over-eager (every push to the docker-build action fires the heavy 4-cell matrix) and the team prefers keeping this as a nightly canary only. Document the consequence in the TODO comment alongside the existing ecr-build-push-pull migration note: anyone editing this workflow or its action dependencies should manually trigger the workflow against their PR branch via `gh workflow run "Backwards Compatibility Tests"` before merge, otherwise breakage surfaces only on the next nightly. --- .github/workflows/daily-compatibility.yml | 35 +++++++++++------------ 1 file changed, 16 insertions(+), 19 deletions(-) diff --git a/.github/workflows/daily-compatibility.yml b/.github/workflows/daily-compatibility.yml index b2f4c275fe77..ab325410e1e3 100644 --- a/.github/workflows/daily-compatibility.yml +++ b/.github/workflows/daily-compatibility.yml @@ -10,25 +10,6 @@ on: # Run daily at 8 PM PST (4 AM UTC) - cron: '0 4 * * *' - # Also fire on PRs that change the workflow itself or the actions / - # config it depends on, so refactors get validated before merge. - # Narrow `paths:` keeps normal source-only PRs from spinning up the - # heavy compatibility matrix. This trigger is safe to gate by paths - # because the workflow is NOT a required branch-protection check — - # a not-triggered run does not block merges. 
- pull_request: - types: [opened, synchronize, reopened] - branches: - - main - - develop - - 'release/**' - paths: - - '.github/workflows/daily-compatibility.yml' - - '.github/workflows/config.yaml' - - '.github/actions/docker-build/**' - - '.github/actions/run-tests/**' - - '.github/actions/combine-results/**' - workflow_dispatch: inputs: isaacsim_version: @@ -118,6 +99,14 @@ jobs: # compat is ready to share the ECR registry cache with build.yaml. Today # it uses docker-build directly with the GHA cache backend, which works # but doesn't benefit from build.yaml's cross-runner ECR layer cache. + # + # Note: this workflow has no `pull_request:` trigger, so PR-time changes + # to this file, .github/actions/docker-build, .github/actions/run-tests, + # or .github/actions/combine-results are NOT validated automatically. + # When touching any of those, manually trigger this workflow against + # the PR branch before merge: + # gh workflow run "Backwards Compatibility Tests" --ref + # Otherwise breakage surfaces only on the next nightly cron after merge. - name: Build Docker Image uses: ./.github/actions/docker-build with: @@ -182,6 +171,14 @@ jobs: # compat is ready to share the ECR registry cache with build.yaml. Today # it uses docker-build directly with the GHA cache backend, which works # but doesn't benefit from build.yaml's cross-runner ECR layer cache. + # + # Note: this workflow has no `pull_request:` trigger, so PR-time changes + # to this file, .github/actions/docker-build, .github/actions/run-tests, + # or .github/actions/combine-results are NOT validated automatically. + # When touching any of those, manually trigger this workflow against + # the PR branch before merge: + # gh workflow run "Backwards Compatibility Tests" --ref + # Otherwise breakage surfaces only on the next nightly cron after merge. 
- name: Build Docker Image uses: ./.github/actions/docker-build with: From 679b09d518a007fc8e0d5c357434181059326ffe Mon Sep 17 00:00:00 2001 From: jichuanh Date: Wed, 27 May 2026 06:19:47 +0000 Subject: [PATCH 32/55] daily-compatibility: consolidate duplicate TODO into header --- .github/workflows/daily-compatibility.yml | 41 ++++++++++------------- 1 file changed, 17 insertions(+), 24 deletions(-) diff --git a/.github/workflows/daily-compatibility.yml b/.github/workflows/daily-compatibility.yml index ab325410e1e3..20c4371fd33f 100644 --- a/.github/workflows/daily-compatibility.yml +++ b/.github/workflows/daily-compatibility.yml @@ -3,6 +3,23 @@ # # SPDX-License-Identifier: BSD-3-Clause +# Nightly canary: runs the IsaacLab test suite against multiple pinned +# IsaacSim versions to catch backwards-compatibility regressions. +# +# Caveats for editors of this file or its action dependencies +# (`.github/actions/docker-build`, `run-tests`, `combine-results`): +# +# * No `pull_request:` trigger — changes are not validated automatically +# at PR time. Manually trigger against the PR branch before merge: +# gh workflow run "Backwards Compatibility Tests" --ref +# Otherwise breakage surfaces only on the next nightly cron after merge. +# +# * The build steps below pass `cache-from: type=gha` / `cache-to: +# type=gha,mode=max` to docker-build explicitly. Future migration to +# `./.github/actions/ecr-build-push-pull` would share build.yaml's +# cross-runner ECR layer cache, but that's deferred until daily-compat +# is ready to wire ECR auth. + name: Backwards Compatibility Tests on: @@ -95,18 +112,6 @@ jobs: fetch-depth: 1 lfs: true - # TODO: migrate to ./.github/actions/ecr-build-push-pull once daily- - # compat is ready to share the ECR registry cache with build.yaml. Today - # it uses docker-build directly with the GHA cache backend, which works - # but doesn't benefit from build.yaml's cross-runner ECR layer cache. 
- # - # Note: this workflow has no `pull_request:` trigger, so PR-time changes - # to this file, .github/actions/docker-build, .github/actions/run-tests, - # or .github/actions/combine-results are NOT validated automatically. - # When touching any of those, manually trigger this workflow against - # the PR branch before merge: - # gh workflow run "Backwards Compatibility Tests" --ref - # Otherwise breakage surfaces only on the next nightly cron after merge. - name: Build Docker Image uses: ./.github/actions/docker-build with: @@ -167,18 +172,6 @@ jobs: fetch-depth: 1 lfs: true - # TODO: migrate to ./.github/actions/ecr-build-push-pull once daily- - # compat is ready to share the ECR registry cache with build.yaml. Today - # it uses docker-build directly with the GHA cache backend, which works - # but doesn't benefit from build.yaml's cross-runner ECR layer cache. - # - # Note: this workflow has no `pull_request:` trigger, so PR-time changes - # to this file, .github/actions/docker-build, .github/actions/run-tests, - # or .github/actions/combine-results are NOT validated automatically. - # When touching any of those, manually trigger this workflow against - # the PR branch before merge: - # gh workflow run "Backwards Compatibility Tests" --ref - # Otherwise breakage surfaces only on the next nightly cron after merge. - name: Build Docker Image uses: ./.github/actions/docker-build with: From f7b3cd129d82bf917391ab51399a62ec5f270721 Mon Sep 17 00:00:00 2001 From: jichuanh Date: Wed, 27 May 2026 06:54:58 +0000 Subject: [PATCH 33/55] conftest: separate CI_MARKER from ISAACSIM_CI_SHORT, don't alias MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Previously this PR aliased ISAACSIM_CI_SHORT=true → CI_MARKER=isaacsim_ci and used the new broader filter pattern (`pytest.mark.X` substring) for both. 
That accidentally widened Isaac Sim CI's external filter scope from 52 files (the legacy `@pytest.mark.isaacsim_ci`-only pattern) to 86 files (includes module-level `pytestmark = pytest.mark.isaacsim_ci`). The 34-file expansion would only surface in Isaac Sim's pipeline, which sets ISAACSIM_CI_SHORT externally. Restore the original isaacsim_ci variable + `@pytest.mark.isaacsim_ci` filter pattern verbatim — Isaac Sim's external selector contract is untouched. CI_MARKER is now a strictly parallel mechanism for this PR's arm-ci and any future windows-ci. Both can run on the same invocation; pytest -m takes the more specific CI_MARKER when set, falling back to "isaacsim_ci" when only ISAACSIM_CI_SHORT is set. --- tools/conftest.py | 28 ++++++++++++++++++++-------- 1 file changed, 20 insertions(+), 8 deletions(-) diff --git a/tools/conftest.py b/tools/conftest.py index 4ee0dbae0639..b13eed84b17e 100644 --- a/tools/conftest.py +++ b/tools/conftest.py @@ -649,13 +649,14 @@ def pytest_sessionstart(session): quarantined_only = os.environ.get("TEST_QUARANTINED_ONLY", "false") == "true" curobo_only = os.environ.get("TEST_CUROBO_ONLY", "false") == "true" - # CI_MARKER env var generalizes the previous ISAACSIM_CI_SHORT=true gate so - # cross-platform jobs (ARM, Windows) can reuse this orchestrator with their - # own markers (arm_ci, windows_ci, ...). ISAACSIM_CI_SHORT=true stays - # supported as a back-compat shorthand for CI_MARKER=isaacsim_ci. + isaacsim_ci = os.environ.get("ISAACSIM_CI_SHORT", "false") == "true" + + # CI_MARKER env var is a separate, parallel mechanism for cross-platform + # jobs (arm-ci, windows-ci, ...) to reuse this orchestrator with their own + # markers. Deliberately NOT aliased to ISAACSIM_CI_SHORT: the isaacsim_ci + # filter is owned by Isaac Sim's external CI pipeline; this PR's CI_MARKER + # path leaves that contract untouched. 
ci_marker = os.environ.get("CI_MARKER", "") - if not ci_marker and os.environ.get("ISAACSIM_CI_SHORT", "false") == "true": - ci_marker = "isaacsim_ci" # Parse include files list (comma-separated paths) include_files = set() @@ -719,6 +720,14 @@ def pytest_sessionstart(session): curobo_only, ) + if isaacsim_ci: + new_test_files = [] + for test_file in test_files: + with open(test_file) as f: + if "@pytest.mark.isaacsim_ci" in f.read(): + new_test_files.append(test_file) + test_files = new_test_files + if ci_marker: # Match both `@pytest.mark.` (per-function) and # `pytestmark = pytest.mark.` / `pytestmark = [..., pytest.mark., ...]` @@ -747,8 +756,11 @@ def pytest_sessionstart(session): for test_file in test_files: print(f" - {test_file}") - # Run all tests individually - failed_tests, test_status, xml_reports = run_individual_tests(test_files, workspace_root, ci_marker) + # Run all tests individually. CI_MARKER takes precedence when both env + # vars are set; falls back to "isaacsim_ci" when only ISAACSIM_CI_SHORT + # is set. The pytest -m flag only accepts one expression. + effective_marker = ci_marker or ("isaacsim_ci" if isaacsim_ci else "") + failed_tests, test_status, xml_reports = run_individual_tests(test_files, workspace_root, effective_marker) print("failed tests:", failed_tests) From bd1613b79cef324a8b874082a647107563382559 Mon Sep 17 00:00:00 2001 From: jichuanh Date: Wed, 27 May 2026 07:09:10 +0000 Subject: [PATCH 34/55] arm-ci: skip ovphysx parametrize cases on aarch64 Symmetric fix to the existing OVRTX aarch64 skip. The new `make_require_ovlibs_install_fixture` shape from develop checks both ovrtx and ovphysx imports; both wheels are x86_64-only on NVIDIA's distribution, so the ovphysx parametrize cases were erroring on aarch64 the same way ovrtx was before the existing skip. 
Verified via PR 5698 run 26495850292: arm-ci's test_rendering_{shadow_hand,dexsuite_kuka,cartpole}_kitless all failed with `ModuleNotFoundError: No module named 'ovphysx'` at fixture setup for `physics_backend=ovphysx` cases. --- source/isaaclab_tasks/test/rendering_test_utils.py | 2 ++ 1 file changed, 2 insertions(+) diff --git a/source/isaaclab_tasks/test/rendering_test_utils.py b/source/isaaclab_tasks/test/rendering_test_utils.py index 2a3403f978dc..5d2bcf740ea6 100644 --- a/source/isaaclab_tasks/test/rendering_test_utils.py +++ b/source/isaaclab_tasks/test/rendering_test_utils.py @@ -600,6 +600,8 @@ def _require_ovlibs_install(request): print(f"ovphysx version: {ovphysx.__version__}") except ImportError as exc: + if platform.machine() == "aarch64": + pytest.skip("OVPhysX has no aarch64 wheel; skipping physics_backend=ovphysx on this platform.") pytest.fail( "Kitless OVPhysX rendering tests require the optional dependency ov[ovphysx]. " "Install with: ./isaaclab.sh -i 'ov[ovphysx]'\n" From 74a3a83e276cff980122b6d4bfbc811982cc0950 Mon Sep 17 00:00:00 2001 From: jichuanh Date: Wed, 27 May 2026 07:09:41 +0000 Subject: [PATCH 35/55] changelog: cover ovphysx alongside ovrtx aarch64 skip --- .../isaaclab_tasks/changelog.d/jichuanh-arm-ci.rst | 12 ++++++------ 1 file changed, 6 insertions(+), 6 deletions(-) diff --git a/source/isaaclab_tasks/changelog.d/jichuanh-arm-ci.rst b/source/isaaclab_tasks/changelog.d/jichuanh-arm-ci.rst index ad5fe3d0098f..20dbfd59928b 100644 --- a/source/isaaclab_tasks/changelog.d/jichuanh-arm-ci.rst +++ b/source/isaaclab_tasks/changelog.d/jichuanh-arm-ci.rst @@ -1,9 +1,9 @@ Fixed ^^^^^ -* Changed the ``ovrtx`` autouse guard in the kitless rendering tests to skip - rather than fail on aarch64 when the ``ov[ovrtx]`` optional dependency is - unavailable. 
The ``ovrtx`` wheel is published only for x86_64, so on aarch64 - this gate was turning unreachable parametrize cases into hard failures; x86 - environments without ``ov[ovrtx]`` still see the original "install with - ``./isaaclab.sh -i 'ov[ovrtx]'``" failure with install guidance. +* Changed the kitless rendering tests' ``ov[ovrtx]`` and ``ov[ovphysx]`` autouse + guards to skip rather than fail on aarch64 when the optional dependency is + unavailable. Both wheels are published only for x86_64, so on aarch64 these + gates were turning unreachable parametrize cases into hard failures; x86 + environments without the dependency still see the original + "install with ``./isaaclab.sh -i 'ov[…]'``" failure with install guidance. From 4bd088f36a1d0efcbae6097978ac999542a1956a Mon Sep 17 00:00:00 2001 From: jichuanh Date: Wed, 27 May 2026 07:24:53 +0000 Subject: [PATCH 36/55] Revert TEMP build.yaml override; restore full Docker + Tests gating Removes the hardcoded `run_docker_tests: 'false'` introduced for iteration on this PR and restores the detect-step's computed value. Linux Docker + Tests matrix will now actually run on this PR's CI. Prior commit 74a3a83e27 was green on the reduced surface; revert enables the broader validation pre-review. --- .github/workflows/build.yaml | 5 +---- 1 file changed, 1 insertion(+), 4 deletions(-) diff --git a/.github/workflows/build.yaml b/.github/workflows/build.yaml index 8f1b459f0225..5b4992e32edc 100644 --- a/.github/workflows/build.yaml +++ b/.github/workflows/build.yaml @@ -74,10 +74,7 @@ jobs: name: Detect Changes runs-on: ubuntu-latest outputs: - # TEMP (revert before final review): force run_docker_tests=false while - # iterating the cartpole training-smoke addition on ARM CI. Saves runner - # time + cost during the back-and-forth. 
- run_docker_tests: 'false' + run_docker_tests: ${{ steps.detect.outputs.run_docker_tests }} steps: - id: detect env: From a9711efac98e571bb134aac0e9a133c6b453028e Mon Sep 17 00:00:00 2001 From: jichuanh Date: Wed, 27 May 2026 07:53:28 +0000 Subject: [PATCH 37/55] conftest: guard ci_marker post-scan open() against OSError The pre-scan at the top of pytest_sessionstart wraps its `open()` in `except OSError: continue` to tolerate races (a marker-tagged file deleted between os.walk and open(), an unreadable symlink, etc.). The post-scan filter further down was missing the same guard, so a single transient I/O error during the marker-token check would raise an unhandled exception and abort the orchestrator before any test ran. Mirror the pre-scan's try/except so both passes degrade gracefully. Flagged P1 by Greptile auto-review on commit 4bd088f36a. --- tools/conftest.py | 12 +++++++++--- 1 file changed, 9 insertions(+), 3 deletions(-) diff --git a/tools/conftest.py b/tools/conftest.py index b13eed84b17e..173d2a088809 100644 --- a/tools/conftest.py +++ b/tools/conftest.py @@ -732,12 +732,18 @@ def pytest_sessionstart(session): # Match both `@pytest.mark.` (per-function) and # `pytestmark = pytest.mark.` / `pytestmark = [..., pytest.mark., ...]` # (module-level) by looking for the common `pytest.mark.` substring. + # OSError handling mirrors the pre-scan above so a transient filesystem + # issue (race-condition delete, permission flap, unreadable symlink) + # doesn't abort the whole session. 
marker_token = f"pytest.mark.{ci_marker}" new_test_files = [] for test_file in test_files: - with open(test_file) as f: - if marker_token in f.read(): - new_test_files.append(test_file) + try: + with open(test_file) as f: + if marker_token in f.read(): + new_test_files.append(test_file) + except OSError: + continue test_files = new_test_files if not test_files: From d7fb1bd81117c391f6f6ce92862730925f32a6bc Mon Sep 17 00:00:00 2001 From: jichuanh Date: Wed, 27 May 2026 08:03:31 +0000 Subject: [PATCH 38/55] conftest: differentiate ci_marker OSError handling between scans MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Pre-scan: log loud `::warning::` then continue. The pre-scan is best-effort (missed files may still be picked up via _collect_test_files's normal walk path), so silent continue was acceptable but unhelpful — adding a visible warning lets the issue surface without aborting. Post-scan: raise RuntimeError. This is the final filter pass — a silent drop here would let a marker-tagged test exit the run while the orchestrator still reports success. Loud abort on OSError is the right signal; the failure case (permission flap, race-deleted file, broken symlink on a previously-walked path) is exotic enough that hitting it indicates a real CI environment issue worth investigating. Replaces the silent `continue` introduced in a9711efac9 from Greptile's P1 suggestion — that suggestion mirrored pre-scan's behavior mechanically but missed the semantic difference between the two passes. 
--- tools/conftest.py | 25 ++++++++++++++++++------- 1 file changed, 18 insertions(+), 7 deletions(-) diff --git a/tools/conftest.py b/tools/conftest.py index 173d2a088809..07640c9638a9 100644 --- a/tools/conftest.py +++ b/tools/conftest.py @@ -699,11 +699,17 @@ def pytest_sessionstart(session): for file in files: if not (file.startswith("test_") and file.endswith(".py")): continue + full_path = os.path.join(root, file) try: - with open(os.path.join(root, file)) as f: + with open(full_path) as f: if marker_token in f.read(): marker_include_files.add(file) - except OSError: + except OSError as exc: + # Pre-scan is best-effort: even if we miss a file here, + # _collect_test_files may still pick it up via the + # normal walk path. Log loudly so the miss is visible + # without aborting the whole orchestrator. + print(f"::warning::ci_marker pre-scan could not read {full_path}: {exc}") continue if marker_include_files: print(f"CI_MARKER={ci_marker}: marker-tagged files: {sorted(marker_include_files)}") @@ -732,9 +738,11 @@ def pytest_sessionstart(session): # Match both `@pytest.mark.` (per-function) and # `pytestmark = pytest.mark.` / `pytestmark = [..., pytest.mark., ...]` # (module-level) by looking for the common `pytest.mark.` substring. - # OSError handling mirrors the pre-scan above so a transient filesystem - # issue (race-condition delete, permission flap, unreadable symlink) - # doesn't abort the whole session. + # Unlike the pre-scan, an OSError here is fatal: this is the final + # filter pass, so silently dropping a file would let a marker-tagged + # test silently exit the run with the orchestrator still reporting + # success. We'd rather abort loudly than mask a real CI environment + # issue (permission flap, race-deleted file, broken symlink). 
marker_token = f"pytest.mark.{ci_marker}" new_test_files = [] for test_file in test_files: @@ -742,8 +750,11 @@ def pytest_sessionstart(session): with open(test_file) as f: if marker_token in f.read(): new_test_files.append(test_file) - except OSError: - continue + except OSError as exc: + raise RuntimeError( + f"ci_marker post-scan could not read {test_file}; refusing to" + f" silently drop a potentially marker-tagged file" + ) from exc test_files = new_test_files if not test_files: From dae0566bd5da1f65df7db27b2388f1b7d4000289 Mon Sep 17 00:00:00 2001 From: jichuanh Date: Wed, 27 May 2026 17:02:38 +0000 Subject: [PATCH 39/55] conftest: drop try/except wrappers on ci_marker open() calls MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Match the existing ISAACSIM_CI_SHORT post-scan's behavior — no OSError handling, let `open()` propagate. Same semantic for both markers: if a test file can't be read during scan, the orchestrator aborts with the raw OSError and CI goes red. No silent drops. Reverts the try/except additions from a9711efac9 (Greptile P1 suggestion) and d7fb1bd811 (the differentiated raise/warn version). --- tools/conftest.py | 32 ++++++-------------------------- 1 file changed, 6 insertions(+), 26 deletions(-) diff --git a/tools/conftest.py b/tools/conftest.py index 07640c9638a9..a1261c54e556 100644 --- a/tools/conftest.py +++ b/tools/conftest.py @@ -699,18 +699,9 @@ def pytest_sessionstart(session): for file in files: if not (file.startswith("test_") and file.endswith(".py")): continue - full_path = os.path.join(root, file) - try: - with open(full_path) as f: - if marker_token in f.read(): - marker_include_files.add(file) - except OSError as exc: - # Pre-scan is best-effort: even if we miss a file here, - # _collect_test_files may still pick it up via the - # normal walk path. Log loudly so the miss is visible - # without aborting the whole orchestrator. 
- print(f"::warning::ci_marker pre-scan could not read {full_path}: {exc}") - continue + with open(os.path.join(root, file)) as f: + if marker_token in f.read(): + marker_include_files.add(file) if marker_include_files: print(f"CI_MARKER={ci_marker}: marker-tagged files: {sorted(marker_include_files)}") # Union with any explicit TEST_INCLUDE_FILES the caller passed. @@ -738,23 +729,12 @@ def pytest_sessionstart(session): # Match both `@pytest.mark.` (per-function) and # `pytestmark = pytest.mark.` / `pytestmark = [..., pytest.mark., ...]` # (module-level) by looking for the common `pytest.mark.` substring. - # Unlike the pre-scan, an OSError here is fatal: this is the final - # filter pass, so silently dropping a file would let a marker-tagged - # test silently exit the run with the orchestrator still reporting - # success. We'd rather abort loudly than mask a real CI environment - # issue (permission flap, race-deleted file, broken symlink). marker_token = f"pytest.mark.{ci_marker}" new_test_files = [] for test_file in test_files: - try: - with open(test_file) as f: - if marker_token in f.read(): - new_test_files.append(test_file) - except OSError as exc: - raise RuntimeError( - f"ci_marker post-scan could not read {test_file}; refusing to" - f" silently drop a potentially marker-tagged file" - ) from exc + with open(test_file) as f: + if marker_token in f.read(): + new_test_files.append(test_file) test_files = new_test_files if not test_files: From ed2151055d4f811d1f323c61a4b52201fb512d13 Mon Sep 17 00:00:00 2001 From: jichuanh Date: Sun, 31 May 2026 23:27:31 +0000 Subject: [PATCH 40/55] Share CI utilities across build.yaml and arm-ci.yaml build.yaml's `changes` job had its own ~95-line inline detector while arm-ci.yaml consumed the new detect-changes composite. Route both workflows through the composite so the gating logic stays in one place. 
Extract the deps-hash computation into _lib/compute-deps-hash.sh, sourced by both docker-build (local-store check) and ecr-build-push-pull (registry check). The two caches stay independent; only the hash schema is shared, which removes the silent-drift risk the docker-build docstring previously warned about. Extract the docker-config + nvcr.io login into _lib/setup-docker-config.sh, sourced by both composites. The script is idempotent (early-returns when DOCKER_CONFIG already points at a valid temp dir), so the `skip-docker-config` input on docker-build and the corresponding plumbing in ecr-build-push-pull's delegation are no longer needed. Bump arm-ci.yaml's actions/checkout from v4 to v6 to match the rest of the workflows. --- .github/actions/_lib/compute-deps-hash.sh | 47 ++++++++ .github/actions/_lib/setup-docker-config.sh | 41 +++++++ .github/actions/docker-build/action.yml | 71 +++-------- .../actions/ecr-build-push-pull/action.yml | 53 ++------ .github/workflows/arm-ci.yaml | 6 +- .github/workflows/build.yaml | 114 +++--------------- 6 files changed, 134 insertions(+), 198 deletions(-) create mode 100644 .github/actions/_lib/compute-deps-hash.sh create mode 100644 .github/actions/_lib/setup-docker-config.sh diff --git a/.github/actions/_lib/compute-deps-hash.sh b/.github/actions/_lib/compute-deps-hash.sh new file mode 100644 index 000000000000..a52284120837 --- /dev/null +++ b/.github/actions/_lib/compute-deps-hash.sh @@ -0,0 +1,47 @@ +# Copyright (c) 2022-2026, The Isaac Lab Project Developers (https://github.com/isaac-sim/IsaacLab/blob/main/CONTRIBUTORS.md). +# All rights reserved. +# +# SPDX-License-Identifier: BSD-3-Clause + +# Shared deps-hash computation for the docker-build and ecr-build-push-pull +# composite actions. Both invoke this script so a registry-side cache hit and +# a local-store cache hit always agree on the same `deps-` tag. +# +# Source this script (do not exec) — it sets DEPS_HASH in the caller's +# environment. 
Caller must export DOCKERFILE_PATH, ISAACSIM_BASE_IMAGE, +# ISAACSIM_VERSION before sourcing. Diagnostic output goes to stderr so the +# caller's stdout stays usable. + +: "${DOCKERFILE_PATH:?compute-deps-hash: DOCKERFILE_PATH must be set}" +: "${ISAACSIM_BASE_IMAGE:?compute-deps-hash: ISAACSIM_BASE_IMAGE must be set}" +: "${ISAACSIM_VERSION:?compute-deps-hash: ISAACSIM_VERSION must be set}" + +# Exact files/dirs whose full content is hashed. The Dockerfile is first. +_DEPS_FILES=( + "${DOCKERFILE_PATH}" + isaaclab.sh + environment.yml + source/isaaclab/isaaclab/cli +) +# Manifest files matched repo-wide via git ls-files. +_DEPS_MANIFEST_PATTERN='(setup\.py|pyproject\.toml|setup\.cfg|extension\.toml|requirements[^/]*\.txt|uv\.lock)$' + +# Resolve the actual base image digest so a new push of a mutable tag +# (e.g. latest-develop) invalidates the deps cache automatically. +_BASE_IMAGE_DIGEST=$(docker buildx imagetools inspect \ + "${ISAACSIM_BASE_IMAGE}:${ISAACSIM_VERSION}" \ + --format '{{json .Manifest.Digest}}' 2>/dev/null | tr -d '"' || true) +if [ -n "${_BASE_IMAGE_DIGEST}" ]; then + _BASE_IMAGE_UNIQ_ID="${ISAACSIM_BASE_IMAGE}:${ISAACSIM_VERSION}:${_BASE_IMAGE_DIGEST}" +else + echo "🟠 Could not resolve base image digest, falling back to tag string" >&2 + _BASE_IMAGE_UNIQ_ID="${ISAACSIM_BASE_IMAGE}:${ISAACSIM_VERSION}" +fi + +_MANIFEST_FILES=$(git ls-files | grep -E "${_DEPS_MANIFEST_PATTERN}" || true) +# shellcheck disable=SC2086 # word-splitting MANIFEST_FILES is intentional +_FILE_HASH=$(git ls-files -s "${_DEPS_FILES[@]}" ${_MANIFEST_FILES} 2>/dev/null \ + | sha256sum | cut -c1-16) +DEPS_HASH=$(printf '%s %s' "${_FILE_HASH}" "${_BASE_IMAGE_UNIQ_ID}" | sha256sum | cut -c1-16) + +unset _DEPS_FILES _DEPS_MANIFEST_PATTERN _BASE_IMAGE_DIGEST _BASE_IMAGE_UNIQ_ID _MANIFEST_FILES _FILE_HASH diff --git a/.github/actions/_lib/setup-docker-config.sh b/.github/actions/_lib/setup-docker-config.sh new file mode 100644 index 000000000000..3a4290e4a5d7 --- /dev/null +++ 
b/.github/actions/_lib/setup-docker-config.sh @@ -0,0 +1,41 @@ +# Copyright (c) 2022-2026, The Isaac Lab Project Developers (https://github.com/isaac-sim/IsaacLab/blob/main/CONTRIBUTORS.md). +# All rights reserved. +# +# SPDX-License-Identifier: BSD-3-Clause + +# Idempotent docker-config + nvcr.io login setup for the docker-build and +# ecr-build-push-pull composite actions. Both composites source this; the +# second invocation in a job is a no-op so callers don't need to coordinate. +# +# Source this script (do not exec) — it sets DOCKER_CONFIG in the caller's +# environment and writes it to $GITHUB_ENV so subsequent steps inherit it. +# Expects NGC_API_KEY in the environment (optional; warns when missing). + +# The runner's credential helper backend is broken ("not implemented") and +# causes docker login calls to fail unless we point DOCKER_CONFIG at a temp +# config with credsStore disabled. + +if [ -n "${DOCKER_CONFIG:-}" ] && [ -f "${DOCKER_CONFIG}/config.json" ]; then + echo "🟢 Docker config already set up at ${DOCKER_CONFIG}, skipping" >&2 + return 0 2>/dev/null || exit 0 +fi + +DOCKER_CONFIG_DIR=$(mktemp -d) +if [ -f "${HOME}/.docker/config.json" ]; then + python3 -c "import json; cfg=json.load(open('${HOME}/.docker/config.json')); cfg['credsStore']=''; cfg.pop('credHelpers',None); json.dump(cfg,open('${DOCKER_CONFIG_DIR}/config.json','w'))" +else + echo '{"credsStore":""}' > "${DOCKER_CONFIG_DIR}/config.json" +fi +export DOCKER_CONFIG="${DOCKER_CONFIG_DIR}" +if [ -n "${GITHUB_ENV:-}" ]; then + echo "DOCKER_CONFIG=${DOCKER_CONFIG_DIR}" >> "${GITHUB_ENV}" +fi + +if [ -n "${NGC_API_KEY:-}" ]; then + echo "🔵 Logging into nvcr.io..." 
>&2 + docker login -u '$oauthtoken' -p "${NGC_API_KEY}" nvcr.io +else + echo "🟠 NGC_API_KEY not set - skipping nvcr.io login (normal for fork PRs)" >&2 +fi + +unset DOCKER_CONFIG_DIR diff --git a/.github/actions/docker-build/action.yml b/.github/actions/docker-build/action.yml index 2af59890759e..5e3defa62bbc 100644 --- a/.github/actions/docker-build/action.yml +++ b/.github/actions/docker-build/action.yml @@ -12,15 +12,18 @@ description: > `ecr-build-push-pull` delegates to once its registry-side checks miss. Pipeline: - 1. Optional: setup docker config + login to nvcr.io (skipped when the - caller has already done it — `skip-docker-config: true`). + 1. Setup docker config + login to nvcr.io via the shared + `_lib/setup-docker-config.sh` (idempotent — no-op if a caller + already ran it earlier in the job). 2. Pre-build disk snapshot (df + docker system df + tag counts) into the step summary. 3. Local exact-tag short-circuit: skip everything if `image-tag` is already in the host's docker store. - 4. Local deps-tag short-circuit: compute a hash over Dockerfile + - setup.py / pyproject.toml / uv.lock + base image digest. If - `:deps-` exists locally, retag as `image-tag` (metadata-only). + 4. Local deps-tag short-circuit: compute deps-hash via the shared + `_lib/compute-deps-hash.sh` (same hash `ecr-build-push-pull` uses, + so a local hit and a registry hit always agree on the deps tag). + If `:deps-` exists locally, retag as `image-tag` + (metadata-only). 5. buildx build (only when neither short-circuit fired). Optional `--cache-from` / `--cache-to` flags for callers that want to layer in a registry-side build cache (e.g. ECR). @@ -30,10 +33,6 @@ description: > growth on long-lived hosts. 8. Post-build disk snapshot. - The deps-hash schema must stay identical to the one in - `ecr-build-push-pull`'s registry-side deps-cache (step 5 there) so a local - build and a registry retag converge on the same image identity. 
- inputs: image-tag: description: 'Tag for the Docker image (e.g. my-image:latest).' @@ -77,14 +76,6 @@ inputs: caller can rely on them being present. default: '' required: false - skip-docker-config: - description: > - Skip the docker config setup + nvcr.io login step. Set to "true" when - the caller has already configured docker auth (e.g. ecr-build-push-pull - does its own ECR + nvcr setup before delegating here). - default: 'false' - required: false - outputs: local-hit: description: '"true" if the exact image-tag was already in local docker.' @@ -103,24 +94,10 @@ runs: ##### 1: Setup docker config + login to nvcr.io (optional) ##### - name: Setup docker config and login to nvcr.io - if: inputs.skip-docker-config != 'true' shell: bash run: | - DOCKER_CONFIG_DIR=$(mktemp -d) - if [ -f "${HOME}/.docker/config.json" ]; then - python3 -c "import json; cfg=json.load(open('${HOME}/.docker/config.json')); cfg['credsStore']=''; cfg.pop('credHelpers',None); json.dump(cfg,open('${DOCKER_CONFIG_DIR}/config.json','w'))" - else - echo '{"credsStore":""}' > "${DOCKER_CONFIG_DIR}/config.json" - fi - echo "DOCKER_CONFIG=${DOCKER_CONFIG_DIR}" >> "$GITHUB_ENV" - export DOCKER_CONFIG="${DOCKER_CONFIG_DIR}" - - if [ -n "${NGC_API_KEY:-}" ]; then - echo "🔵 Logging into nvcr.io..." - docker login -u \$oauthtoken -p "${NGC_API_KEY}" nvcr.io - else - echo "🟠 NGC_API_KEY not set - skipping nvcr.io login (normal for fork PRs)" - fi + # shellcheck source=/dev/null + . 
.github/actions/_lib/setup-docker-config.sh ##### 2: Host disk snapshot (pre) ##### @@ -170,29 +147,13 @@ runs: id: local-deps if: steps.local.outputs.hit != 'true' shell: bash + env: + DOCKERFILE_PATH: ${{ inputs.dockerfile-path }} + ISAACSIM_BASE_IMAGE: ${{ inputs.isaacsim-base-image }} + ISAACSIM_VERSION: ${{ inputs.isaacsim-version }} run: | - DEPS_FILES=( - "${{ inputs.dockerfile-path }}" - isaaclab.sh - environment.yml - source/isaaclab/isaaclab/cli - ) - DEPS_MANIFEST_PATTERN='(setup\.py|pyproject\.toml|setup\.cfg|extension\.toml|requirements[^/]*\.txt|uv\.lock)$' - - BASE_IMAGE_DIGEST=$(docker buildx imagetools inspect \ - "${{ inputs.isaacsim-base-image }}:${{ inputs.isaacsim-version }}" \ - --format '{{json .Manifest.Digest}}' 2>/dev/null | tr -d '"' || true) - if [ -n "${BASE_IMAGE_DIGEST}" ]; then - BASE_IMAGE_UNIQ_ID="${{ inputs.isaacsim-base-image }}:${{ inputs.isaacsim-version }}:${BASE_IMAGE_DIGEST}" - else - echo "🟠 Could not resolve base image digest, falling back to tag string" - BASE_IMAGE_UNIQ_ID="${{ inputs.isaacsim-base-image }}:${{ inputs.isaacsim-version }}" - fi - - MANIFEST_FILES=$(git ls-files | grep -E "${DEPS_MANIFEST_PATTERN}" || true) - FILE_HASH=$(git ls-files -s "${DEPS_FILES[@]}" ${MANIFEST_FILES} 2>/dev/null \ - | sha256sum | cut -c1-16) - DEPS_HASH=$(printf '%s %s' "${FILE_HASH}" "${BASE_IMAGE_UNIQ_ID}" | sha256sum | cut -c1-16) + # shellcheck source=/dev/null + . 
.github/actions/_lib/compute-deps-hash.sh LOCAL_DEPS_TAG="$(echo "${{ inputs.image-tag }}" | cut -d: -f1):deps-${DEPS_HASH}" echo "🔵 Local deps tag: ${LOCAL_DEPS_TAG}" diff --git a/.github/actions/ecr-build-push-pull/action.yml b/.github/actions/ecr-build-push-pull/action.yml index 6051c984c510..9b3d8debc75f 100644 --- a/.github/actions/ecr-build-push-pull/action.yml +++ b/.github/actions/ecr-build-push-pull/action.yml @@ -52,21 +52,8 @@ runs: - name: Setup docker config and login to nvcr.io shell: bash run: | - DOCKER_CONFIG_DIR=$(mktemp -d) - if [ -f "${HOME}/.docker/config.json" ]; then - python3 -c "import json; cfg=json.load(open('${HOME}/.docker/config.json')); cfg['credsStore']=''; cfg.pop('credHelpers',None); json.dump(cfg,open('${DOCKER_CONFIG_DIR}/config.json','w'))" - else - echo '{"credsStore":""}' > "${DOCKER_CONFIG_DIR}/config.json" - fi - echo "DOCKER_CONFIG=${DOCKER_CONFIG_DIR}" >> "$GITHUB_ENV" - export DOCKER_CONFIG="${DOCKER_CONFIG_DIR}" - - if [ -n "${{ env.NGC_API_KEY }}" ]; then - echo "🔵 Logging into nvcr.io..." - docker login -u \$oauthtoken -p ${{ env.NGC_API_KEY }} nvcr.io - else - echo "🟠 NGC_API_KEY not set - skipping nvcr.io login (normal for fork PRs)" - fi + # shellcheck source=/dev/null + . .github/actions/_lib/setup-docker-config.sh ##### 2: Resolve ECR URL ##### @@ -195,36 +182,13 @@ runs: id: deps-cache if: steps.resolve-ecr.outputs.available == 'true' && steps.pull-exact.outputs.hit != 'true' shell: bash + env: + DOCKERFILE_PATH: ${{ inputs.dockerfile-path }} + ISAACSIM_BASE_IMAGE: ${{ inputs.isaacsim-base-image }} + ISAACSIM_VERSION: ${{ inputs.isaacsim-version }} run: | - ##### Deps-hash configuration ##### - # Exact files/dirs whose full content is hashed. The Dockerfile is first. - DEPS_FILES=( - "${{ inputs.dockerfile-path }}" - isaaclab.sh - environment.yml - source/isaaclab/isaaclab/cli - ) - # Manifest files matched repo-wide via git ls-files. 
- DEPS_MANIFEST_PATTERN='(setup\.py|pyproject\.toml|setup\.cfg|extension\.toml|requirements[^/]*\.txt|uv\.lock)$' - - # Resolve the actual base image digest so a new push of a mutable tag - # (e.g. latest-develop) invalidates the deps cache automatically. - BASE_IMAGE_DIGEST=$(docker buildx imagetools inspect \ - "${{ inputs.isaacsim-base-image }}:${{ inputs.isaacsim-version }}" \ - --format '{{json .Manifest.Digest}}' 2>/dev/null | tr -d '"' || true) - if [ -n "${BASE_IMAGE_DIGEST}" ]; then - BASE_IMAGE_UNIQ_ID="${{ inputs.isaacsim-base-image }}:${{ inputs.isaacsim-version }}:${BASE_IMAGE_DIGEST}" - else - echo "🟠 Could not resolve base image digest, falling back to tag string" - BASE_IMAGE_UNIQ_ID="${{ inputs.isaacsim-base-image }}:${{ inputs.isaacsim-version }}" - fi - - echo "🔵 Base image ID: ${BASE_IMAGE_UNIQ_ID}" - - MANIFEST_FILES=$(git ls-files | grep -E "${DEPS_MANIFEST_PATTERN}" || true) - FILE_HASH=$(git ls-files -s "${DEPS_FILES[@]}" ${MANIFEST_FILES} 2>/dev/null \ - | sha256sum | cut -c1-16) - DEPS_HASH=$(printf '%s %s' "${FILE_HASH}" "${BASE_IMAGE_UNIQ_ID}" | sha256sum | cut -c1-16) + # shellcheck source=/dev/null + . .github/actions/_lib/compute-deps-hash.sh DEPS_ECR_IMAGE="${ECR_URL}:deps-${DEPS_HASH}" echo "🔵 Deps hash: ${DEPS_HASH}" echo "🔵 Checking if deps image ${DEPS_ECR_IMAGE} exists in ECR..." 
@@ -260,7 +224,6 @@ runs: isaacsim-base-image: ${{ inputs.isaacsim-base-image }} isaacsim-version: ${{ inputs.isaacsim-version }} dockerfile-path: ${{ inputs.dockerfile-path }} - skip-docker-config: 'true' cache-from: ${{ steps.resolve-ecr.outputs.available == 'true' && format('type=registry,ref={0}', env.CACHE_IMAGE) || '' }} cache-to: ${{ steps.resolve-ecr.outputs.available == 'true' && format('type=registry,ref={0},mode=max', env.CACHE_IMAGE) || '' }} extra-tags: ${{ steps.resolve-ecr.outputs.available == 'true' && env.ECR_IMAGE || '' }} diff --git a/.github/workflows/arm-ci.yaml b/.github/workflows/arm-ci.yaml index a3f3b95abcf0..eec2e97673cb 100644 --- a/.github/workflows/arm-ci.yaml +++ b/.github/workflows/arm-ci.yaml @@ -51,7 +51,7 @@ jobs: outputs: run_arm_ci: ${{ steps.detect.outputs.run }} steps: - - uses: actions/checkout@v4 + - uses: actions/checkout@v6 with: fetch-depth: 1 sparse-checkout: .github/actions/detect-changes @@ -81,7 +81,7 @@ jobs: isaacsim_image_name: ${{ steps.load.outputs.isaacsim_image_name }} isaacsim_image_tag: ${{ steps.load.outputs.isaacsim_image_tag }} steps: - - uses: actions/checkout@v4 + - uses: actions/checkout@v6 with: fetch-depth: 1 - id: load @@ -108,7 +108,7 @@ jobs: timeout-minutes: 120 steps: - name: Checkout Code - uses: actions/checkout@v4 + uses: actions/checkout@v6 with: fetch-depth: 1 lfs: true diff --git a/.github/workflows/build.yaml b/.github/workflows/build.yaml index 5b4992e32edc..7b75aa36371e 100644 --- a/.github/workflows/build.yaml +++ b/.github/workflows/build.yaml @@ -74,103 +74,27 @@ jobs: name: Detect Changes runs-on: ubuntu-latest outputs: - run_docker_tests: ${{ steps.detect.outputs.run_docker_tests }} + run_docker_tests: ${{ steps.detect.outputs.run }} steps: + - uses: actions/checkout@v6 + with: + fetch-depth: 1 + sparse-checkout: .github/actions/detect-changes + sparse-checkout-cone-mode: false - id: detect - env: - GH_TOKEN: ${{ github.token }} - PR_NUMBER: ${{ github.event.pull_request.number }} - 
EVENT_NAME: ${{ github.event_name }} - REPO: ${{ github.repository }} - run: | - set -euo pipefail - - # Docker test jobs run only when paths in the patterns table change. - # Otherwise they skip via `if:` and report green to branch protection, - # which is why we don't use a workflow-level `paths:` filter (a - # not-triggered required check would block the PR forever). - # config.yaml is included because it controls the base image names and - # tags consumed by the Docker build jobs. - patterns=( - $'^source/\tLibrary source code' - $'^docker/\tContainer build inputs' - $'^tools/\tBuild tooling' - $'^apps/\tStandalone apps' - $'^scripts/\tStandalone scripts' - $'^\\.github/workflows/build\\.yaml$\tThis workflow file' - $'^\\.github/workflows/config\\.yaml$\tBase image config' - $'^\\.github/actions/\tCI actions' - ) - triggered_jobs="Docker build jobs + all test-* matrix jobs (non-root verify is folded into test-isaaclab-ov and test-curobo)" - - render_table() { - local files="$1" entry regex desc count sample - echo "| Pattern | What it covers | Matched files |" - echo "|---|---|---|" - for entry in "${patterns[@]}"; do - IFS=$'\t' read -r regex desc <<< "$entry" - count=$(grep -cE "$regex" <<< "$files" || true) - if [ "$count" -gt 0 ]; then - sample=$(grep -m 3 -E "$regex" <<< "$files" | paste -sd ', ' -) - [ "$count" -gt 3 ] && sample="$sample (and $((count - 3)) more)" - echo "| \`$regex\` | $desc | $sample |" - else - echo "| \`$regex\` | $desc | - |" - fi - done - } - - any_match() { - local files="$1" entry regex - for entry in "${patterns[@]}"; do - IFS=$'\t' read -r regex _ <<< "$entry" - if grep -qE "$regex" <<< "$files"; then - return 0 - fi - done - return 1 - } - - decide() { - local decision="$1" reason="$2" files="${3:-}" - echo "Decision: run_docker_tests=$decision ($reason)" - echo "run_docker_tests=$decision" >> "$GITHUB_OUTPUT" - { - echo "## Docker test gating" - echo "" - if [ "$decision" = "true" ]; then - echo "Docker tests will **run**: 
$reason." - else - echo "Docker tests will be **skipped**: $reason." - fi - echo "" - echo "Triggered jobs: $triggered_jobs." - if [ -n "$files" ]; then - echo "" - render_table "$files" - fi - } >> "$GITHUB_STEP_SUMMARY" - } - - if [ "$EVENT_NAME" != "pull_request" ]; then - decide true "non-PR event ($EVENT_NAME)" - exit 0 - fi - - if ! changed_files="$(gh api --paginate "repos/$REPO/pulls/$PR_NUMBER/files" --jq '.[].filename')"; then - # Fail-safe: a transient API error must not block merge. Default to running. - echo "::warning::Could not list changed files; defaulting to running tests" - decide true "fail-safe (could not list changed files)" - exit 0 - fi - - printf '%s\n' "$changed_files" - - if any_match "$changed_files"; then - decide true "relevant paths changed" "$changed_files" - else - decide false "no relevant paths changed" "$changed_files" - fi + uses: ./.github/actions/detect-changes + with: + summary-title: Docker test gating + triggered-jobs: Docker build jobs + all test-* matrix jobs (non-root verify is folded into test-isaaclab-ov and test-curobo) + patterns: | + ^source/ Library source code + ^docker/ Container build inputs + ^tools/ Build tooling + ^apps/ Standalone apps + ^scripts/ Standalone scripts + ^\.github/workflows/build\.yaml$ This workflow file + ^\.github/workflows/config\.yaml$ Base image config + ^\.github/actions/ CI actions config: name: Load Config From c7c41246facbb67c41553b6b5738ddf95f9861e6 Mon Sep 17 00:00:00 2001 From: jichuanh Date: Mon, 1 Jun 2026 01:09:32 +0000 Subject: [PATCH 41/55] Fold arm-ci.yaml into build.yaml Per @myurasov-nv's review feedback: same OS (Linux), different runner pool shouldn't justify a separate workflow file. Other multi-arch repos (NVIDIA Warp, NumPy, newton, PyTorch) put cross-arch jobs in one workflow with different `runs-on:` labels. The build/test work itself doesn't change. 
Add an `arm-ci` job to build.yaml that builds and tests on the same [self-hosted, arm64] runner (no ECR for arm64, so the image must stay local to the runner that builds it). Source-side fixes from the original series stay: AppLauncher EXP_PATH fallback, pytetwild aarch64 gate, OVRTX/OVPhysX skip-on-aarch64, conftest.py CI_MARKER orchestration. Gate the amd64 `build` and `build-curobo` jobs on absence of the `ci-skip-amd64` PR label so this branch can iterate on arm-ci alone without burning GPU minutes. All 18 test-* jobs skip transitively via `needs: build`. Remove the label to re-enable the full amd64 surface. Delete .github/workflows/arm-ci.yaml. --- .github/workflows/arm-ci.yaml | 146 ---------------------------------- .github/workflows/build.yaml | 53 +++++++++++- 2 files changed, 51 insertions(+), 148 deletions(-) delete mode 100644 .github/workflows/arm-ci.yaml diff --git a/.github/workflows/arm-ci.yaml b/.github/workflows/arm-ci.yaml deleted file mode 100644 index eec2e97673cb..000000000000 --- a/.github/workflows/arm-ci.yaml +++ /dev/null @@ -1,146 +0,0 @@ -# Copyright (c) 2022-2026, The Isaac Lab Project Developers (https://github.com/isaac-sim/IsaacLab/blob/main/CONTRIBUTORS.md). -# All rights reserved. -# -# SPDX-License-Identifier: BSD-3-Clause - -# ARM/Spark CI — exercises Isaac Lab on aarch64 Linux self-hosted runners -# (NVIDIA DGX Spark). Single job that builds Dockerfile.base for linux/arm64 -# and runs the `arm_ci`-marked pytest set against it. ECR is not wired for -# arm64, so build output stays local to the runner; the action's local -# exact-tag and deps-tag checks short-circuit rebuilds when the runner is -# still warm from a prior run. -# -# Marker-driven discovery: `pytest -m arm_ci`. Adding a new aarch64-safe -# test = tag it with arm_ci, no yaml edit. 
- -name: ARM CI - -on: - pull_request: - types: [opened, synchronize, reopened] - branches: - - main - - develop - - 'release/**' - push: - branches: - - main - - develop - - 'release/**' - workflow_dispatch: - -concurrency: - group: ${{ github.workflow }}-${{ github.event.pull_request.number || github.ref }} - cancel-in-progress: true - -permissions: - contents: read - pull-requests: write - checks: write - -env: - NGC_API_KEY: ${{ secrets.NGC_API_KEY }} - # Arch-suffixed tag so the per-commit ECR lookup, layer cache, and local - # docker tag never collide with the linux/amd64 image built by build.yaml. - CI_IMAGE_TAG: isaac-lab-ci-arm64:${{ github.event_name == 'pull_request' && format('pr-{0}', github.event.pull_request.number) || github.ref_name }}-${{ github.sha }} - -jobs: - changes: - name: Detect Changes - runs-on: ubuntu-latest - outputs: - run_arm_ci: ${{ steps.detect.outputs.run }} - steps: - - uses: actions/checkout@v6 - with: - fetch-depth: 1 - sparse-checkout: .github/actions/detect-changes - sparse-checkout-cone-mode: false - - id: detect - uses: ./.github/actions/detect-changes - with: - summary-title: ARM CI gating - triggered-jobs: arm-ci (build + tests) - patterns: | - ^source/ Library source code - ^tools/ Build tooling - ^apps/ Standalone apps - ^docker/ Container build inputs - (^|/)pyproject\.toml$ Python project metadata - ^\.github/workflows/arm-ci\.yaml$ This workflow file - ^\.github/actions/docker-build/ Docker build action - ^\.github/actions/detect-changes/ Change-detection action - ^VERSION$ Version file - - config: - name: Load Config - runs-on: ubuntu-latest - needs: [changes] - if: needs.changes.outputs.run_arm_ci == 'true' - outputs: - isaacsim_image_name: ${{ steps.load.outputs.isaacsim_image_name }} - isaacsim_image_tag: ${{ steps.load.outputs.isaacsim_image_tag }} - steps: - - uses: actions/checkout@v6 - with: - fetch-depth: 1 - - id: load - shell: bash - run: | - set -euo pipefail - # Read isaacsim_image_name/tag from 
.github/workflows/config.yaml. - # Fallback to nightly tag if yq is unavailable on ubuntu-latest. - if command -v yq >/dev/null 2>&1; then - name=$(yq -r .isaacsim_image_name .github/workflows/config.yaml) - tag=$(yq -r .isaacsim_image_tag .github/workflows/config.yaml) - else - name=$(grep '^isaacsim_image_name:' .github/workflows/config.yaml | awk '{print $2}') - tag=$(grep '^isaacsim_image_tag:' .github/workflows/config.yaml | awk '{print $2}') - fi - echo "isaacsim_image_name=$name" >> "$GITHUB_OUTPUT" - echo "isaacsim_image_tag=$tag" >> "$GITHUB_OUTPUT" - - arm-ci: - name: arm-ci - runs-on: [self-hosted, arm64] - needs: [changes, config] - if: needs.changes.outputs.run_arm_ci == 'true' - timeout-minutes: 120 - steps: - - name: Checkout Code - uses: actions/checkout@v6 - with: - fetch-depth: 1 - lfs: true - - - name: Build Base Docker image - uses: ./.github/actions/docker-build - with: - image-tag: ${{ env.CI_IMAGE_TAG }} - isaacsim-base-image: ${{ needs.config.outputs.isaacsim_image_name }} - isaacsim-version: ${{ needs.config.outputs.isaacsim_image_tag }} - dockerfile-path: docker/Dockerfile.base - platform: linux/arm64 - - # tools/conftest.py's subprocess-per-file orchestrator picks up - # CI_MARKER=arm_ci and spawns one python subprocess per test file - # tagged `pytest.mark.arm_ci`. Each Kit launch is isolated, so the - # AppLauncher-at-module-level pattern cannot SIGSEGV across files. 
- - name: Run arm_ci marker tests - uses: ./.github/actions/run-tests - with: - test-path: "tools" - result-file: "arm-ci-report.xml" - container-name: "isaac-lab-arm-ci-${{ github.run_id }}-${{ github.run_attempt }}" - image-tag: ${{ env.CI_IMAGE_TAG }} - pytest-options: "" - ci-marker: "arm_ci" - volume-mount-source: ${{ github.workspace }} - - - name: Upload test reports - if: always() - uses: actions/upload-artifact@v4 - with: - name: arm-ci-reports - path: reports/ - retention-days: 7 diff --git a/.github/workflows/build.yaml b/.github/workflows/build.yaml index 7b75aa36371e..3679badd7cd1 100644 --- a/.github/workflows/build.yaml +++ b/.github/workflows/build.yaml @@ -68,6 +68,9 @@ permissions: env: NGC_API_KEY: ${{ secrets.NGC_API_KEY }} CI_IMAGE_TAG: isaac-lab-ci:${{ github.event_name == 'pull_request' && format('pr-{0}', github.event.pull_request.number) || github.ref_name }}-${{ github.sha }} + # Arch-suffixed tag so the arm64 build never collides with the amd64 image in + # the local docker store on shared runners. + CI_IMAGE_TAG_ARM64: isaac-lab-ci-arm64:${{ github.event_name == 'pull_request' && format('pr-{0}', github.event.pull_request.number) || github.ref_name }}-${{ github.sha }} jobs: changes: @@ -122,7 +125,10 @@ jobs: name: Build Base Docker Image runs-on: [self-hosted, gpu] needs: [changes, config] - if: needs.changes.outputs.run_docker_tests == 'true' + # `ci-skip-amd64` PR label short-circuits the amd64 build (and therefore + # every downstream test job via `needs: build`). Use during branch-local + # iteration on aarch64 changes; remove the label to re-enable amd64 CI. 
+ if: needs.changes.outputs.run_docker_tests == 'true' && !contains(github.event.pull_request.labels.*.name, 'ci-skip-amd64') steps: - name: Checkout Code uses: actions/checkout@v6 @@ -143,7 +149,7 @@ jobs: name: Build cuRobo Docker Image runs-on: [self-hosted, gpu] needs: [changes, config] - if: needs.changes.outputs.run_docker_tests == 'true' + if: needs.changes.outputs.run_docker_tests == 'true' && !contains(github.event.pull_request.labels.*.name, 'ci-skip-amd64') steps: - name: Checkout Code uses: actions/checkout@v6 @@ -160,6 +166,49 @@ jobs: dockerfile-path: docker/Dockerfile.curobo cache-tag: cache-curobo + # aarch64 build + marker-gated tests on NVIDIA DGX Spark self-hosted runners. + # Build and test must share one runner because ECR is not wired for arm64 — + # the locally-built image cannot be handed off across machines. + arm-ci: + name: arm-ci + runs-on: [self-hosted, arm64] + needs: [changes, config] + if: needs.changes.outputs.run_docker_tests == 'true' + timeout-minutes: 60 + continue-on-error: true + steps: + - uses: actions/checkout@v6 + with: + fetch-depth: 1 + lfs: true + + - name: Build base image (linux/arm64) + uses: ./.github/actions/docker-build + with: + image-tag: ${{ env.CI_IMAGE_TAG_ARM64 }} + isaacsim-base-image: ${{ needs.config.outputs.isaacsim_image_name }} + isaacsim-version: ${{ needs.config.outputs.isaacsim_image_tag }} + dockerfile-path: docker/Dockerfile.base + platform: linux/arm64 + + - name: Run arm_ci marker tests + uses: ./.github/actions/run-tests + with: + test-path: tools + result-file: arm-ci-report.xml + container-name: isaac-lab-arm-ci-${{ github.run_id }}-${{ github.run_attempt }} + image-tag: ${{ env.CI_IMAGE_TAG_ARM64 }} + ci-marker: arm_ci + volume-mount-source: ${{ github.workspace }} + + - name: Upload test reports + if: always() + uses: actions/upload-artifact@v4 + with: + name: arm-ci-reports + path: reports/ + retention-days: 7 + #endregion #region test jobs From 4d276e0e88269321fb8270a359d87b2f7d9a40de Mon 
Sep 17 00:00:00 2001 From: jichuanh Date: Mon, 1 Jun 2026 01:10:24 +0000 Subject: [PATCH 42/55] Trigger CI with ci-skip-amd64 label active From b52497d31310f1dbd269f6888b4ecdfd82f16b53 Mon Sep 17 00:00:00 2001 From: jichuanh Date: Mon, 1 Jun 2026 01:25:45 +0000 Subject: [PATCH 43/55] ci: expand ci-arm-only label to gate non-arm workflows Iteration helper. The label-gate added in the prior commit (then named ci-skip-amd64) only skipped build.yaml's amd64 build jobs. Expanding to also gate docs.yaml (build-latest-docs, build-multi-docs), install-ci.yml (both x86 and arm installation test jobs), and license-check.yaml so that PRs iterating on a single platform can run only their target workflow. Rename ci-skip-amd64 -> ci-arm-only to reflect the broader scope. --- .github/workflows/build.yaml | 7 ++++--- .github/workflows/docs.yaml | 8 ++++---- .github/workflows/install-ci.yml | 6 ++++-- .github/workflows/license-check.yaml | 3 +++ 4 files changed, 15 insertions(+), 9 deletions(-) diff --git a/.github/workflows/build.yaml b/.github/workflows/build.yaml index 3679badd7cd1..6e5f6b57fb32 100644 --- a/.github/workflows/build.yaml +++ b/.github/workflows/build.yaml @@ -125,10 +125,11 @@ jobs: name: Build Base Docker Image runs-on: [self-hosted, gpu] needs: [changes, config] - # `ci-skip-amd64` PR label short-circuits the amd64 build (and therefore + # `ci-arm-only` PR label short-circuits the amd64 build (and therefore # every downstream test job via `needs: build`). Use during branch-local # iteration on aarch64 changes; remove the label to re-enable amd64 CI. - if: needs.changes.outputs.run_docker_tests == 'true' && !contains(github.event.pull_request.labels.*.name, 'ci-skip-amd64') + # Other workflows (docs, install-ci, license-check) honor the same label. 
+ if: needs.changes.outputs.run_docker_tests == 'true' && !contains(github.event.pull_request.labels.*.name, 'ci-arm-only') steps: - name: Checkout Code uses: actions/checkout@v6 @@ -149,7 +150,7 @@ jobs: name: Build cuRobo Docker Image runs-on: [self-hosted, gpu] needs: [changes, config] - if: needs.changes.outputs.run_docker_tests == 'true' && !contains(github.event.pull_request.labels.*.name, 'ci-skip-amd64') + if: needs.changes.outputs.run_docker_tests == 'true' && !contains(github.event.pull_request.labels.*.name, 'ci-arm-only') steps: - name: Checkout Code uses: actions/checkout@v6 diff --git a/.github/workflows/docs.yaml b/.github/workflows/docs.yaml index 013be3a5b126..ad69a1dd802d 100644 --- a/.github/workflows/docs.yaml +++ b/.github/workflows/docs.yaml @@ -42,8 +42,9 @@ jobs: name: Build Latest Docs runs-on: ubuntu-latest needs: [doc-build-type] - # run on non-deploy branches to build current version docs only - if: needs.doc-build-type.outputs.trigger-deploy != 'true' + # `ci-arm-only` PR label short-circuits this job for branch-local arm + # iteration; see build.yaml for the full convention. 
+ if: needs.doc-build-type.outputs.trigger-deploy != 'true' && !contains(github.event.pull_request.labels.*.name, 'ci-arm-only') steps: - name: Checkout code @@ -73,8 +74,7 @@ jobs: name: Build Multi-Version Docs runs-on: ubuntu-latest needs: [doc-build-type] - # run on deploy branches to create multi-version docs - if: needs.doc-build-type.outputs.trigger-deploy == 'true' + if: needs.doc-build-type.outputs.trigger-deploy == 'true' && !contains(github.event.pull_request.labels.*.name, 'ci-arm-only') steps: - name: Checkout code diff --git a/.github/workflows/install-ci.yml b/.github/workflows/install-ci.yml index 2a2fee50c8ab..5a53726b976c 100644 --- a/.github/workflows/install-ci.yml +++ b/.github/workflows/install-ci.yml @@ -126,7 +126,9 @@ jobs: install-tests-x86: name: Installation Tests (x86) needs: [changes] - if: needs.changes.outputs.run_install_tests == 'true' + # `ci-arm-only` PR label short-circuits this job for branch-local arm + # iteration; see build.yaml for the full convention. + if: needs.changes.outputs.run_install_tests == 'true' && !contains(github.event.pull_request.labels.*.name, 'ci-arm-only') runs-on: [self-hosted, gpu] timeout-minutes: 90 steps: @@ -145,7 +147,7 @@ jobs: install-tests-arm: name: Installation Tests (ARM) needs: [changes] - if: needs.changes.outputs.run_install_tests == 'true' + if: needs.changes.outputs.run_install_tests == 'true' && !contains(github.event.pull_request.labels.*.name, 'ci-arm-only') runs-on: [self-hosted, aarch64] timeout-minutes: 90 steps: diff --git a/.github/workflows/license-check.yaml b/.github/workflows/license-check.yaml index 0b296f9e74eb..5403dcca73fe 100644 --- a/.github/workflows/license-check.yaml +++ b/.github/workflows/license-check.yaml @@ -16,6 +16,9 @@ concurrency: jobs: license-check: runs-on: ubuntu-24.04 + # `ci-arm-only` PR label short-circuits this job for branch-local arm + # iteration; see build.yaml for the full convention. 
+ if: ${{ !contains(github.event.pull_request.labels.*.name, 'ci-arm-only') }} steps: - name: Checkout code From 4f70f4b65a4f96a4b2f9b1a0e12826421271c4dd Mon Sep 17 00:00:00 2001 From: jichuanh Date: Mon, 1 Jun 2026 16:29:54 +0000 Subject: [PATCH 44/55] Revert detect-changes composite; restore inline change detection After folding arm-ci.yaml into build.yaml, the detect-changes composite had a single caller. The composite + sparse-checkout pattern was overhead to support cross-workflow sharing that no longer exists. Restore the inline bash detector that was on develop before the migration. Net: -50 lines, one fewer directory under .github/actions/. --- .github/actions/detect-changes/action.yml | 143 ---------------------- .github/workflows/build.yaml | 116 +++++++++++++++--- 2 files changed, 97 insertions(+), 162 deletions(-) delete mode 100644 .github/actions/detect-changes/action.yml diff --git a/.github/actions/detect-changes/action.yml b/.github/actions/detect-changes/action.yml deleted file mode 100644 index e511381aa7f7..000000000000 --- a/.github/actions/detect-changes/action.yml +++ /dev/null @@ -1,143 +0,0 @@ -# Copyright (c) 2022-2026, The Isaac Lab Project Developers (https://github.com/isaac-sim/IsaacLab/blob/main/CONTRIBUTORS.md). -# All rights reserved. -# -# SPDX-License-Identifier: BSD-3-Clause - -name: 'Detect Changes' -description: > - Decide whether expensive downstream jobs should run on this PR. Returns - "true" on non-PR events (push, workflow_dispatch) and fails-safe to "true" - if the changed-files API call errors out. On PR events, returns "true" iff - any provided regex pattern matches a changed file path. - - Renders a step summary table for visibility on the run page. - - This pattern (always-runs gating job + downstream `if:`) is used instead of - a workflow-level `paths:` filter because a not-triggered required check - would block PRs indefinitely under branch protection. 
- -inputs: - patterns: - description: > - Newline-separated, TAB-delimited regex + human-readable description - pairs. Example: - ^source/\tLibrary source code - ^docker/\tContainer build inputs - required: true - triggered-jobs: - description: > - Short text shown in the step summary describing what runs when this - returns true. - required: false - default: 'downstream jobs' - summary-title: - description: 'Heading text for the step summary section.' - required: false - default: 'Change detection' - -outputs: - run: - description: '"true" if any pattern matched (or this is a non-PR event); "false" otherwise.' - value: ${{ steps.detect.outputs.run }} - -runs: - using: composite - steps: - - id: detect - shell: bash - env: - GH_TOKEN: ${{ github.token }} - PR_NUMBER: ${{ github.event.pull_request.number }} - EVENT_NAME: ${{ github.event_name }} - REPO: ${{ github.repository }} - PATTERNS_INPUT: ${{ inputs.patterns }} - TRIGGERED_JOBS: ${{ inputs.triggered-jobs }} - SUMMARY_TITLE: ${{ inputs.summary-title }} - run: | - set -euo pipefail - - # Parse newline+tab-delimited input into a bash array. Tab-less lines - # are an error (likely space-delimited by mistake) — a silently-skipped - # pattern could make the workflow not trigger on changes it should have. 
- patterns=() - while IFS= read -r line; do - [ -z "$line" ] && continue - if [[ "$line" != *$'\t'* ]]; then - echo "::error::detect-changes: pattern line missing tab separator: '$line'" - exit 1 - fi - patterns+=("$line") - done <<< "$PATTERNS_INPUT" - - if [ "${#patterns[@]}" -eq 0 ]; then - echo "::error::detect-changes received no valid pattern lines" - exit 1 - fi - - render_table() { - local files="$1" entry regex desc count sample - echo "| Pattern | What it covers | Matched files |" - echo "|---|---|---|" - for entry in "${patterns[@]}"; do - IFS=$'\t' read -r regex desc <<< "$entry" - count=$(grep -cE "$regex" <<< "$files" || true) - if [ "$count" -gt 0 ]; then - sample=$(grep -m 3 -E "$regex" <<< "$files" | paste -sd ', ' -) - [ "$count" -gt 3 ] && sample="$sample (and $((count - 3)) more)" - echo "| \`$regex\` | $desc | $sample |" - else - echo "| \`$regex\` | $desc | - |" - fi - done - } - - any_match() { - local files="$1" entry regex - for entry in "${patterns[@]}"; do - IFS=$'\t' read -r regex _ <<< "$entry" - if grep -qE "$regex" <<< "$files"; then - return 0 - fi - done - return 1 - } - - decide() { - local decision="$1" reason="$2" files="${3:-}" - echo "Decision: run=$decision ($reason)" - echo "run=$decision" >> "$GITHUB_OUTPUT" - { - echo "## ${SUMMARY_TITLE}" - echo "" - if [ "$decision" = "true" ]; then - echo "Downstream jobs will **run**: $reason." - else - echo "Downstream jobs will be **skipped**: $reason." - fi - echo "" - echo "Triggered jobs: $TRIGGERED_JOBS." - if [ -n "$files" ]; then - echo "" - render_table "$files" - fi - } >> "$GITHUB_STEP_SUMMARY" - } - - if [ "$EVENT_NAME" != "pull_request" ]; then - decide true "non-PR event ($EVENT_NAME)" - exit 0 - fi - - if ! 
changed_files="$(gh api --paginate "repos/$REPO/pulls/$PR_NUMBER/files" --jq '.[].filename')"; then - echo "::warning::Could not list changed files; defaulting to run=true" - decide true "fail-safe (could not list changed files)" - exit 0 - fi - - printf '%s\n' "$changed_files" - - if any_match "$changed_files"; then - decide true "relevant paths changed" "$changed_files" - else - decide false "no relevant paths changed" "$changed_files" - fi diff --git a/.github/workflows/build.yaml b/.github/workflows/build.yaml index 6e5f6b57fb32..80f98a98814b 100644 --- a/.github/workflows/build.yaml +++ b/.github/workflows/build.yaml @@ -77,27 +77,105 @@ jobs: name: Detect Changes runs-on: ubuntu-latest outputs: - run_docker_tests: ${{ steps.detect.outputs.run }} + run_docker_tests: ${{ steps.detect.outputs.run_docker_tests }} steps: - - uses: actions/checkout@v6 - with: - fetch-depth: 1 - sparse-checkout: .github/actions/detect-changes - sparse-checkout-cone-mode: false - id: detect - uses: ./.github/actions/detect-changes - with: - summary-title: Docker test gating - triggered-jobs: Docker build jobs + all test-* matrix jobs (non-root verify is folded into test-isaaclab-ov and test-curobo) - patterns: | - ^source/ Library source code - ^docker/ Container build inputs - ^tools/ Build tooling - ^apps/ Standalone apps - ^scripts/ Standalone scripts - ^\.github/workflows/build\.yaml$ This workflow file - ^\.github/workflows/config\.yaml$ Base image config - ^\.github/actions/ CI actions + env: + GH_TOKEN: ${{ github.token }} + PR_NUMBER: ${{ github.event.pull_request.number }} + EVENT_NAME: ${{ github.event_name }} + REPO: ${{ github.repository }} + run: | + set -euo pipefail + + # Docker test jobs run only when paths in the patterns table change. + # Otherwise they skip via `if:` and report green to branch protection, + # which is why we don't use a workflow-level `paths:` filter (a + # not-triggered required check would block the PR forever). 
+ # config.yaml is included because it controls the base image names and + # tags consumed by the Docker build jobs. + patterns=( + $'^source/\tLibrary source code' + $'^docker/\tContainer build inputs' + $'^tools/\tBuild tooling' + $'^apps/\tStandalone apps' + $'^scripts/\tStandalone scripts' + $'^\\.github/workflows/build\\.yaml$\tThis workflow file' + $'^\\.github/workflows/config\\.yaml$\tBase image config' + $'^\\.github/actions/\tCI actions' + ) + triggered_jobs="Docker build jobs + all test-* matrix jobs (non-root verify is folded into test-isaaclab-ov and test-curobo)" + + render_table() { + local files="$1" entry regex desc count sample shown + echo "| Pattern | What it covers | Matched files |" + echo "|---|---|---|" + for entry in "${patterns[@]}"; do + IFS=$'\t' read -r regex desc <<< "$entry" + # escape | so it doesn't end the markdown table cell mid-regex + shown="${regex//|/\\|}" + count=$(grep -cE "$regex" <<< "$files" || true) + if [ "$count" -gt 0 ]; then + sample=$(grep -m 3 -E "$regex" <<< "$files" | paste -sd ', ' -) + [ "$count" -gt 3 ] && sample="$sample (and $((count - 3)) more)" + echo "| \`$shown\` | $desc | $sample |" + else + echo "| \`$shown\` | $desc | - |" + fi + done + } + + any_match() { + local files="$1" entry regex + for entry in "${patterns[@]}"; do + IFS=$'\t' read -r regex _ <<< "$entry" + if grep -qE "$regex" <<< "$files"; then + return 0 + fi + done + return 1 + } + + decide() { + local decision="$1" reason="$2" files="${3:-}" + echo "Decision: run_docker_tests=$decision ($reason)" + echo "run_docker_tests=$decision" >> "$GITHUB_OUTPUT" + { + echo "## Docker test gating" + echo "" + if [ "$decision" = "true" ]; then + echo "Docker tests will **run**: $reason." + else + echo "Docker tests will be **skipped**: $reason." + fi + echo "" + echo "Triggered jobs: $triggered_jobs." 
+ if [ -n "$files" ]; then + echo "" + render_table "$files" + fi + } >> "$GITHUB_STEP_SUMMARY" + } + + if [ "$EVENT_NAME" != "pull_request" ]; then + decide true "non-PR event ($EVENT_NAME)" + exit 0 + fi + + if ! changed_files="$(gh api --paginate "repos/$REPO/pulls/$PR_NUMBER/files" --jq '.[].filename')"; then + # Fail-safe: a transient API error must not block merge. Default to running. + echo "::warning::Could not list changed files; defaulting to running tests" + decide true "fail-safe (could not list changed files)" + exit 0 + fi + + printf '%s\n' "$changed_files" + + if any_match "$changed_files"; then + decide true "relevant paths changed" "$changed_files" + else + decide false "no relevant paths changed" "$changed_files" + fi config: name: Load Config From ba550188c3f0d9c6c1f2b553870bf1ef192c7b72 Mon Sep 17 00:00:00 2001 From: jichuanh Date: Mon, 1 Jun 2026 17:34:00 +0000 Subject: [PATCH 45/55] Drop CI_IMAGE_TAG_ARM64 env var; inline -arm64 suffix arm64 runs on a different runner pool ([self-hosted, arm64]) than the amd64 build ([self-hosted, gpu]), so the local docker tag could not collide in practice. The extra env var was defensive only. Follow the same suffix pattern that build-curobo already uses (`${{ env.CI_IMAGE_TAG }}-curobo`) for consistency. --- .github/workflows/build.yaml | 7 ++----- 1 file changed, 2 insertions(+), 5 deletions(-) diff --git a/.github/workflows/build.yaml b/.github/workflows/build.yaml index 80f98a98814b..9d865d8615a4 100644 --- a/.github/workflows/build.yaml +++ b/.github/workflows/build.yaml @@ -68,9 +68,6 @@ permissions: env: NGC_API_KEY: ${{ secrets.NGC_API_KEY }} CI_IMAGE_TAG: isaac-lab-ci:${{ github.event_name == 'pull_request' && format('pr-{0}', github.event.pull_request.number) || github.ref_name }}-${{ github.sha }} - # Arch-suffixed tag so the arm64 build never collides with the amd64 image in - # the local docker store on shared runners. 
- CI_IMAGE_TAG_ARM64: isaac-lab-ci-arm64:${{ github.event_name == 'pull_request' && format('pr-{0}', github.event.pull_request.number) || github.ref_name }}-${{ github.sha }} jobs: changes: @@ -264,7 +261,7 @@ jobs: - name: Build base image (linux/arm64) uses: ./.github/actions/docker-build with: - image-tag: ${{ env.CI_IMAGE_TAG_ARM64 }} + image-tag: ${{ env.CI_IMAGE_TAG }}-arm64 isaacsim-base-image: ${{ needs.config.outputs.isaacsim_image_name }} isaacsim-version: ${{ needs.config.outputs.isaacsim_image_tag }} dockerfile-path: docker/Dockerfile.base @@ -276,7 +273,7 @@ jobs: test-path: tools result-file: arm-ci-report.xml container-name: isaac-lab-arm-ci-${{ github.run_id }}-${{ github.run_attempt }} - image-tag: ${{ env.CI_IMAGE_TAG_ARM64 }} + image-tag: ${{ env.CI_IMAGE_TAG }}-arm64 ci-marker: arm_ci volume-mount-source: ${{ github.workspace }} From 61aab1ec6c715be0841ff5428821c851a05f8784 Mon Sep 17 00:00:00 2001 From: jichuanh Date: Mon, 1 Jun 2026 17:52:24 +0000 Subject: [PATCH 46/55] Drop unused outputs from docker-build action `local-hit`, `local-deps-hit`, and `was-built` were declared as action-level outputs anticipating downstream telemetry callers that never materialized. Grep across .github/ shows zero consumers. Internal step outputs of the same names continue to gate the conditional steps inside the action (the local exact-tag short-circuit, the local deps-tag short-circuit, the post-build deps-tag application), so the deletion is purely surface-area: external declarations go, in-action flow unchanged. --- .github/actions/docker-build/action.yml | 10 ---------- 1 file changed, 10 deletions(-) diff --git a/.github/actions/docker-build/action.yml b/.github/actions/docker-build/action.yml index 5e3defa62bbc..e7c6216ef49d 100644 --- a/.github/actions/docker-build/action.yml +++ b/.github/actions/docker-build/action.yml @@ -76,16 +76,6 @@ inputs: caller can rely on them being present. 
default: '' required: false -outputs: - local-hit: - description: '"true" if the exact image-tag was already in local docker.' - value: ${{ steps.local.outputs.hit }} - local-deps-hit: - description: '"true" if a local deps-cache tag matched (and was retagged).' - value: ${{ steps.local-deps.outputs.hit }} - was-built: - description: '"true" if a full buildx build ran (no cache hit).' - value: ${{ steps.build.outputs.was-built }} runs: using: composite From 23456a525dbfb480bc4fb587101e6e390c465fe1 Mon Sep 17 00:00:00 2001 From: jichuanh Date: Mon, 1 Jun 2026 18:42:55 +0000 Subject: [PATCH 47/55] Tighten arm-ci action surface; unmask amd64 + other workflows MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Cleanups in response to self-review: 1. compute-deps-hash.sh refactored from source-with-env-vars to exec-with-positional-args. Caller captures hash via $(...) or pipes into $GITHUB_OUTPUT. More idiomatic, testable from any shell, no env-var contract. 2. docker-build gains a `deps-hash` input. ecr-build-push-pull computes the hash once in its registry-side check step, writes it to $GITHUB_OUTPUT, and passes it forward to docker-build's delegation. docker-build's local check uses the input if non-empty, else computes itself. Hash now computed exactly once per build. 3. `extra-tags` input removed from docker-build. The single consumer (ecr-build-push-pull) now does its own `docker tag ` step after the build returns. Less surface area on the primitive. 4. `evict-stale-cache` input added on docker-build (default false). The 14-day eviction is opt-in rather than implicit. No current caller opts in; runners that need disk cleanup do so explicitly. 5. Three unused action-level outputs (local-hit, local-deps-hit, was-built) on docker-build removed — no consumers grep'd. 6. docker-build description reverted to develop's one-liner. 7. ci_marker pre-scan in tools/conftest.py removed. 
The pre-scan previously overrode TESTS_TO_SKIP for marker-tagged files, which only papered over one mismatch (test_differential_ik.py is in TESTS_TO_SKIP as "Failing" yet was arm_ci-tagged). Respect the global skip list: a skip-listed test stays skipped on all platforms. arm-ci canary goes 8 -> 7 with the one known-failing file correctly dropped. 8. ci-arm-only label gates removed from build.yaml, docs.yaml, install-ci.yml, license-check.yaml. The iteration helper was one-shot; arm-ci is now green, the gate's job is done. Drop the gates and the corresponding repo label. 9. triggered_jobs string in build.yaml's `changes` job reverted to develop's text — the "non-root verify is folded" detail snuck in from an earlier commit and isn't strictly relevant. --- .github/actions/_lib/compute-deps-hash.sh | 45 ++++++----- .github/actions/docker-build/action.yml | 77 ++++++------------- .../actions/ecr-build-push-pull/action.yml | 23 ++++-- .github/workflows/build.yaml | 10 +-- .github/workflows/docs.yaml | 8 +- .github/workflows/install-ci.yml | 6 +- .github/workflows/license-check.yaml | 3 - tools/conftest.py | 22 ------ 8 files changed, 72 insertions(+), 122 deletions(-) mode change 100644 => 100755 .github/actions/_lib/compute-deps-hash.sh diff --git a/.github/actions/_lib/compute-deps-hash.sh b/.github/actions/_lib/compute-deps-hash.sh old mode 100644 new mode 100755 index a52284120837..855341f1a0ba --- a/.github/actions/_lib/compute-deps-hash.sh +++ b/.github/actions/_lib/compute-deps-hash.sh @@ -1,3 +1,4 @@ +#!/usr/bin/env bash # Copyright (c) 2022-2026, The Isaac Lab Project Developers (https://github.com/isaac-sim/IsaacLab/blob/main/CONTRIBUTORS.md). # All rights reserved. # @@ -7,41 +8,43 @@ # composite actions. Both invoke this script so a registry-side cache hit and # a local-store cache hit always agree on the same `deps-` tag. # -# Source this script (do not exec) — it sets DEPS_HASH in the caller's -# environment. 
Caller must export DOCKERFILE_PATH, ISAACSIM_BASE_IMAGE, -# ISAACSIM_VERSION before sourcing. Diagnostic output goes to stderr so the -# caller's stdout stays usable. +# Usage: compute-deps-hash.sh <dockerfile-path> <isaacsim-base-image> <isaacsim-version> +# Prints the 16-character deps-hash to stdout. Diagnostic output goes to stderr. +set -euo pipefail -: "${DOCKERFILE_PATH:?compute-deps-hash: DOCKERFILE_PATH must be set}" -: "${ISAACSIM_BASE_IMAGE:?compute-deps-hash: ISAACSIM_BASE_IMAGE must be set}" -: "${ISAACSIM_VERSION:?compute-deps-hash: ISAACSIM_VERSION must be set}" +if [ "$#" -ne 3 ]; then + echo "compute-deps-hash: expected 3 args (dockerfile-path, isaacsim-base-image, isaacsim-version)" >&2 + exit 2 +fi + +dockerfile_path="$1" +isaacsim_base_image="$2" +isaacsim_version="$3" # Exact files/dirs whose full content is hashed. The Dockerfile is first. -_DEPS_FILES=( - "${DOCKERFILE_PATH}" +deps_files=( + "${dockerfile_path}" isaaclab.sh environment.yml source/isaaclab/isaaclab/cli ) -# Manifest files matched repo-wide via git ls-files. -_DEPS_MANIFEST_PATTERN='(setup\.py|pyproject\.toml|setup\.cfg|extension\.toml|requirements[^/]*\.txt|uv\.lock)$' +deps_manifest_pattern='(setup\.py|pyproject\.toml|setup\.cfg|extension\.toml|requirements[^/]*\.txt|uv\.lock)$' # Resolve the actual base image digest so a new push of a mutable tag # (e.g. latest-develop) invalidates the deps cache automatically.
-_BASE_IMAGE_DIGEST=$(docker buildx imagetools inspect \ - "${ISAACSIM_BASE_IMAGE}:${ISAACSIM_VERSION}" \ +base_image_digest=$(docker buildx imagetools inspect \ + "${isaacsim_base_image}:${isaacsim_version}" \ --format '{{json .Manifest.Digest}}' 2>/dev/null | tr -d '"' || true) -if [ -n "${_BASE_IMAGE_DIGEST}" ]; then - _BASE_IMAGE_UNIQ_ID="${ISAACSIM_BASE_IMAGE}:${ISAACSIM_VERSION}:${_BASE_IMAGE_DIGEST}" +if [ -n "${base_image_digest}" ]; then + base_image_uniq_id="${isaacsim_base_image}:${isaacsim_version}:${base_image_digest}" else echo "🟠 Could not resolve base image digest, falling back to tag string" >&2 - _BASE_IMAGE_UNIQ_ID="${ISAACSIM_BASE_IMAGE}:${ISAACSIM_VERSION}" + base_image_uniq_id="${isaacsim_base_image}:${isaacsim_version}" fi -_MANIFEST_FILES=$(git ls-files | grep -E "${_DEPS_MANIFEST_PATTERN}" || true) -# shellcheck disable=SC2086 # word-splitting MANIFEST_FILES is intentional -_FILE_HASH=$(git ls-files -s "${_DEPS_FILES[@]}" ${_MANIFEST_FILES} 2>/dev/null \ +mapfile -t manifest_files < <(git ls-files | grep -E "${deps_manifest_pattern}" || true) +file_hash=$(git ls-files -s "${deps_files[@]}" "${manifest_files[@]}" 2>/dev/null \ | sha256sum | cut -c1-16) -DEPS_HASH=$(printf '%s %s' "${_FILE_HASH}" "${_BASE_IMAGE_UNIQ_ID}" | sha256sum | cut -c1-16) +deps_hash=$(printf '%s %s' "${file_hash}" "${base_image_uniq_id}" | sha256sum | cut -c1-16) -unset _DEPS_FILES _DEPS_MANIFEST_PATTERN _BASE_IMAGE_DIGEST _BASE_IMAGE_UNIQ_ID _MANIFEST_FILES _FILE_HASH +printf '%s\n' "${deps_hash}" diff --git a/.github/actions/docker-build/action.yml b/.github/actions/docker-build/action.yml index e7c6216ef49d..bf88a512e399 100644 --- a/.github/actions/docker-build/action.yml +++ b/.github/actions/docker-build/action.yml @@ -3,35 +3,8 @@ # # SPDX-License-Identifier: BSD-3-Clause -name: 'Docker Build (local-cache)' -description: > - Build a Docker image with local-cache short-circuits and observability. 
- No ECR knowledge built in: the output stays in the host's local docker - store. Designed for self-hosted runners where ECR is not wired up (e.g. - the arm64 / Spark pool), and as the build primitive that - `ecr-build-push-pull` delegates to once its registry-side checks miss. - - Pipeline: - 1. Setup docker config + login to nvcr.io via the shared - `_lib/setup-docker-config.sh` (idempotent — no-op if a caller - already ran it earlier in the job). - 2. Pre-build disk snapshot (df + docker system df + tag counts) into the - step summary. - 3. Local exact-tag short-circuit: skip everything if `image-tag` is - already in the host's docker store. - 4. Local deps-tag short-circuit: compute deps-hash via the shared - `_lib/compute-deps-hash.sh` (same hash `ecr-build-push-pull` uses, - so a local hit and a registry hit always agree on the deps tag). - If `:deps-` exists locally, retag as `image-tag` - (metadata-only). - 5. buildx build (only when neither short-circuit fired). Optional - `--cache-from` / `--cache-to` flags for callers that want to layer - in a registry-side build cache (e.g. ECR). - 6. Post-build: tag the result as `:deps-` so future builds - with identical deps short-circuit at step 4. - 7. Evict `isaac-lab*:deps-*` tags older than 14 days to bound disk - growth on long-lived hosts. - 8. Post-build disk snapshot. +name: 'Build Docker Image' +description: 'Builds a Docker image with IsaacSim and IsaacLab dependencies' inputs: image-tag: @@ -68,14 +41,21 @@ inputs: `cache-from` for registry-backed layer cache writes. default: '' required: false - extra-tags: + deps-hash: description: > - Newline-separated additional tags to apply to the built image (e.g. an - ECR-prefixed tag for the caller to push). Each tag is materialized via - `docker tag` after a successful build or local short-circuit, so the - caller can rely on them being present. + Pre-computed deps-hash to use for the local deps-tag check. 
When empty, + this action computes the hash itself via `_lib/compute-deps-hash.sh`. + Set by callers (e.g. `ecr-build-push-pull`) that already compute the hash + for a registry-side check, to avoid recomputing here. default: '' required: false + evict-stale-cache: + description: > + When 'true', evict `isaac-lab*:deps-*` tags older than 14 days at the + end of the build to bound disk growth on long-lived self-hosted + runners. Default 'false' — no implicit cleanup. + default: 'false' + required: false runs: using: composite @@ -137,13 +117,14 @@ runs: id: local-deps if: steps.local.outputs.hit != 'true' shell: bash - env: - DOCKERFILE_PATH: ${{ inputs.dockerfile-path }} - ISAACSIM_BASE_IMAGE: ${{ inputs.isaacsim-base-image }} - ISAACSIM_VERSION: ${{ inputs.isaacsim-version }} run: | - # shellcheck source=/dev/null - . .github/actions/_lib/compute-deps-hash.sh + DEPS_HASH="${{ inputs.deps-hash }}" + if [ -z "${DEPS_HASH}" ]; then + DEPS_HASH=$(.github/actions/_lib/compute-deps-hash.sh \ + "${{ inputs.dockerfile-path }}" \ + "${{ inputs.isaacsim-base-image }}" \ + "${{ inputs.isaacsim-version }}") + fi LOCAL_DEPS_TAG="$(echo "${{ inputs.image-tag }}" | cut -d: -f1):deps-${DEPS_HASH}" echo "🔵 Local deps tag: ${LOCAL_DEPS_TAG}" @@ -209,22 +190,10 @@ runs: echo "🟠 LOCAL_DEPS_TAG not set, skipping local deps-cache tag" fi - ##### 6b: Apply extra-tags (for callers that need additional names) ##### - - - name: Apply extra tags - if: inputs.extra-tags != '' - shell: bash - run: | - while IFS= read -r extra; do - [ -z "$extra" ] && continue - docker tag "${{ inputs.image-tag }}" "$extra" - echo "🟢 Tagged: $extra" - done <<< "${{ inputs.extra-tags }}" - - ##### 7: Evict stale local deps-cache tags (>14d) ##### + ##### 7: Evict stale local deps-cache tags (>14d) — opt-in ##### - name: Evict stale local deps-cache tags (>14d) - if: always() + if: always() && inputs.evict-stale-cache == 'true' shell: bash run: | set +e diff --git a/.github/actions/ecr-build-push-pull/action.yml 
b/.github/actions/ecr-build-push-pull/action.yml index 9b3d8debc75f..193b16187391 100644 --- a/.github/actions/ecr-build-push-pull/action.yml +++ b/.github/actions/ecr-build-push-pull/action.yml @@ -182,13 +182,12 @@ runs: id: deps-cache if: steps.resolve-ecr.outputs.available == 'true' && steps.pull-exact.outputs.hit != 'true' shell: bash - env: - DOCKERFILE_PATH: ${{ inputs.dockerfile-path }} - ISAACSIM_BASE_IMAGE: ${{ inputs.isaacsim-base-image }} - ISAACSIM_VERSION: ${{ inputs.isaacsim-version }} run: | - # shellcheck source=/dev/null - . .github/actions/_lib/compute-deps-hash.sh + DEPS_HASH=$(.github/actions/_lib/compute-deps-hash.sh \ + "${{ inputs.dockerfile-path }}" \ + "${{ inputs.isaacsim-base-image }}" \ + "${{ inputs.isaacsim-version }}") + echo "deps_hash=${DEPS_HASH}" >> "$GITHUB_OUTPUT" DEPS_ECR_IMAGE="${ECR_URL}:deps-${DEPS_HASH}" echo "🔵 Deps hash: ${DEPS_HASH}" echo "🔵 Checking if deps image ${DEPS_ECR_IMAGE} exists in ECR..." @@ -226,7 +225,17 @@ runs: dockerfile-path: ${{ inputs.dockerfile-path }} cache-from: ${{ steps.resolve-ecr.outputs.available == 'true' && format('type=registry,ref={0}', env.CACHE_IMAGE) || '' }} cache-to: ${{ steps.resolve-ecr.outputs.available == 'true' && format('type=registry,ref={0},mode=max', env.CACHE_IMAGE) || '' }} - extra-tags: ${{ steps.resolve-ecr.outputs.available == 'true' && env.ECR_IMAGE || '' }} + deps-hash: ${{ steps.deps-cache.outputs.deps_hash }} + + - name: Tag built image with ECR-prefixed name + if: > + steps.resolve-ecr.outputs.available == 'true' && + steps.pull-exact.outputs.hit != 'true' && + steps.deps-cache.outputs.deps-cache-hit != 'true' + shell: bash + run: | + docker tag "${{ inputs.image-tag }}" "${ECR_IMAGE}" + echo "🟢 Tagged ${ECR_IMAGE}" ##### 7: Push to ECR ##### diff --git a/.github/workflows/build.yaml b/.github/workflows/build.yaml index 9d865d8615a4..18a4a4f34dfd 100644 --- a/.github/workflows/build.yaml +++ b/.github/workflows/build.yaml @@ -101,7 +101,7 @@ jobs: 
$'^\\.github/workflows/config\\.yaml$\tBase image config' $'^\\.github/actions/\tCI actions' ) - triggered_jobs="Docker build jobs + all test-* matrix jobs (non-root verify is folded into test-isaaclab-ov and test-curobo)" + triggered_jobs="Docker build jobs + all test-* matrix jobs" render_table() { local files="$1" entry regex desc count sample shown @@ -200,11 +200,7 @@ jobs: name: Build Base Docker Image runs-on: [self-hosted, gpu] needs: [changes, config] - # `ci-arm-only` PR label short-circuits the amd64 build (and therefore - # every downstream test job via `needs: build`). Use during branch-local - # iteration on aarch64 changes; remove the label to re-enable amd64 CI. - # Other workflows (docs, install-ci, license-check) honor the same label. - if: needs.changes.outputs.run_docker_tests == 'true' && !contains(github.event.pull_request.labels.*.name, 'ci-arm-only') + if: needs.changes.outputs.run_docker_tests == 'true' steps: - name: Checkout Code uses: actions/checkout@v6 @@ -225,7 +221,7 @@ jobs: name: Build cuRobo Docker Image runs-on: [self-hosted, gpu] needs: [changes, config] - if: needs.changes.outputs.run_docker_tests == 'true' && !contains(github.event.pull_request.labels.*.name, 'ci-arm-only') + if: needs.changes.outputs.run_docker_tests == 'true' steps: - name: Checkout Code uses: actions/checkout@v6 diff --git a/.github/workflows/docs.yaml b/.github/workflows/docs.yaml index ad69a1dd802d..013be3a5b126 100644 --- a/.github/workflows/docs.yaml +++ b/.github/workflows/docs.yaml @@ -42,9 +42,8 @@ jobs: name: Build Latest Docs runs-on: ubuntu-latest needs: [doc-build-type] - # `ci-arm-only` PR label short-circuits this job for branch-local arm - # iteration; see build.yaml for the full convention. 
- if: needs.doc-build-type.outputs.trigger-deploy != 'true' && !contains(github.event.pull_request.labels.*.name, 'ci-arm-only') + # run on non-deploy branches to build current version docs only + if: needs.doc-build-type.outputs.trigger-deploy != 'true' steps: - name: Checkout code @@ -74,7 +73,8 @@ jobs: name: Build Multi-Version Docs runs-on: ubuntu-latest needs: [doc-build-type] - if: needs.doc-build-type.outputs.trigger-deploy == 'true' && !contains(github.event.pull_request.labels.*.name, 'ci-arm-only') + # run on deploy branches to create multi-version docs + if: needs.doc-build-type.outputs.trigger-deploy == 'true' steps: - name: Checkout code diff --git a/.github/workflows/install-ci.yml b/.github/workflows/install-ci.yml index 5a53726b976c..2a2fee50c8ab 100644 --- a/.github/workflows/install-ci.yml +++ b/.github/workflows/install-ci.yml @@ -126,9 +126,7 @@ jobs: install-tests-x86: name: Installation Tests (x86) needs: [changes] - # `ci-arm-only` PR label short-circuits this job for branch-local arm - # iteration; see build.yaml for the full convention. 
- if: needs.changes.outputs.run_install_tests == 'true' && !contains(github.event.pull_request.labels.*.name, 'ci-arm-only') + if: needs.changes.outputs.run_install_tests == 'true' runs-on: [self-hosted, gpu] timeout-minutes: 90 steps: @@ -147,7 +145,7 @@ jobs: install-tests-arm: name: Installation Tests (ARM) needs: [changes] - if: needs.changes.outputs.run_install_tests == 'true' && !contains(github.event.pull_request.labels.*.name, 'ci-arm-only') + if: needs.changes.outputs.run_install_tests == 'true' runs-on: [self-hosted, aarch64] timeout-minutes: 90 steps: diff --git a/.github/workflows/license-check.yaml b/.github/workflows/license-check.yaml index 5403dcca73fe..0b296f9e74eb 100644 --- a/.github/workflows/license-check.yaml +++ b/.github/workflows/license-check.yaml @@ -16,9 +16,6 @@ concurrency: jobs: license-check: runs-on: ubuntu-24.04 - # `ci-arm-only` PR label short-circuits this job for branch-local arm - # iteration; see build.yaml for the full convention. - if: ${{ !contains(github.event.pull_request.labels.*.name, 'ci-arm-only') }} steps: - name: Checkout code diff --git a/tools/conftest.py b/tools/conftest.py index 6ff7784637fd..dacde06b1be3 100644 --- a/tools/conftest.py +++ b/tools/conftest.py @@ -815,28 +815,6 @@ def pytest_sessionstart(session): print(f"TEST_CUROBO_ONLY env var: '{os.environ.get('TEST_CUROBO_ONLY', 'NOT_SET')}'") print("=" * 50) - # When CI_MARKER is set, pre-scan the tree for files containing the marker - # token and add them as include_files so `_collect_test_files` does not - # silently drop them via TESTS_TO_SKIP. 
- if ci_marker: - marker_token = f"pytest.mark.{ci_marker}" - marker_include_files = set() - for source_dir in source_dirs: - for root, _, files in os.walk(source_dir): - for file in files: - if not (file.startswith("test_") and file.endswith(".py")): - continue - try: - with open(os.path.join(root, file)) as f: - if marker_token in f.read(): - marker_include_files.add(file) - except OSError as exc: - print(f"::warning::ci_marker pre-scan could not read {os.path.join(root, file)}: {exc}") - continue - if marker_include_files: - print(f"CI_MARKER={ci_marker}: marker-tagged files: {sorted(marker_include_files)}") - include_files = include_files | marker_include_files - # Get all test files in the source directories test_files = _collect_test_files( source_dirs, From 2965d2ee726b8e67fe4599bd7f6f21d8fe2797d8 Mon Sep 17 00:00:00 2001 From: jichuanh Date: Mon, 1 Jun 2026 19:23:57 +0000 Subject: [PATCH 48/55] docker-build: restore develop's input descriptions The five pre-existing inputs (image-tag, isaacsim-base-image, isaacsim-version, dockerfile-path, context-path) had their descriptions rewritten during the split out of ecr-build-push-pull, with no behavior change. Restore develop's original wording; keep descriptions only on the genuinely-new inputs (platform, cache-from, cache-to, deps-hash, evict-stale-cache). --- .github/actions/docker-build/action.yml | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) diff --git a/.github/actions/docker-build/action.yml b/.github/actions/docker-build/action.yml index bf88a512e399..1b961348a098 100644 --- a/.github/actions/docker-build/action.yml +++ b/.github/actions/docker-build/action.yml @@ -8,20 +8,20 @@ description: 'Builds a Docker image with IsaacSim and IsaacLab dependencies' inputs: image-tag: - description: 'Tag for the Docker image (e.g. my-image:latest).' 
+ description: 'Docker image tag to use' required: true isaacsim-base-image: - description: 'IsaacSim base image (passed as ISAACSIM_BASE_IMAGE_ARG build-arg).' + description: 'IsaacSim base image' required: true isaacsim-version: - description: 'IsaacSim version (passed as ISAACSIM_VERSION_ARG build-arg).' + description: 'IsaacSim version' required: true dockerfile-path: - description: 'Path to Dockerfile, relative to the repository root.' + description: 'Path to Dockerfile' default: 'docker/Dockerfile.base' required: false context-path: - description: 'Build context path passed to `docker buildx build`.' + description: 'Build context path' default: '.' required: false platform: From e54b7079b1e4063df576a5b3a2827c28b9b5a86e Mon Sep 17 00:00:00 2001 From: jichuanh Date: Mon, 1 Jun 2026 19:38:13 +0000 Subject: [PATCH 49/55] Drop dead arm_ci marker and orphaned changelog bullet MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit F2: test_differential_ik.py carried `pytestmark = pytest.mark.arm_ci`, but the file is in tools/test_settings.py TESTS_TO_SKIP ("Failing"), so _collect_test_files drops it before the ci_marker filter ever sees it — the marker was inert. A skip-listed test should not claim to run on any platform; remove the marker. (The other arm_ci-tagged tests are not in TESTS_TO_SKIP and are unaffected.) F3: the changelog fragment credited a pytetwild aarch64 exclusion, but that gate now lives on develop (via the packaging cleanup that removed setup.py) and is already documented by a separate fragment. This PR's diff no longer touches pytetwild gating, so the bullet described a change not present here. Drop it; keep the in-scope AppLauncher EXP_PATH fallback entry. 
--- source/isaaclab/changelog.d/jichuanh-arm-ci.rst | 8 -------- source/isaaclab/test/controllers/test_differential_ik.py | 2 -- 2 files changed, 10 deletions(-) diff --git a/source/isaaclab/changelog.d/jichuanh-arm-ci.rst b/source/isaaclab/changelog.d/jichuanh-arm-ci.rst index 04b19fdeb4bc..376f7f589170 100644 --- a/source/isaaclab/changelog.d/jichuanh-arm-ci.rst +++ b/source/isaaclab/changelog.d/jichuanh-arm-ci.rst @@ -9,11 +9,3 @@ Fixed ``KeyError: 'EXP_PATH'`` deep inside ``_resolve_experience_file``; now AppLauncher resolves the path from ``isaacsim.__file__`` and stores it back into the environment so subsequent code can rely on it. - -* Excluded ``pytetwild`` install on aarch64 platforms. The package has no - aarch64 wheel on PyPI and its source build fails (the ``geogram`` CMake dep - hardcodes ``-m64``). The single call site in :mod:`isaaclab.sim.schemas` - already raises a clear "install pytetwild manually or provide a - pre-tetrahedralized UsdGeom.TetMesh" message when the lazy import fails, so - aarch64 users keep everything except automatic volume-deformable - tetrahedralization. 
diff --git a/source/isaaclab/test/controllers/test_differential_ik.py b/source/isaaclab/test/controllers/test_differential_ik.py index 3bfc9c3a2543..2ba7af0ec028 100644 --- a/source/isaaclab/test/controllers/test_differential_ik.py +++ b/source/isaaclab/test/controllers/test_differential_ik.py @@ -15,8 +15,6 @@ import pytest import torch -pytestmark = pytest.mark.arm_ci - import isaaclab.sim as sim_utils from isaaclab import cloner from isaaclab.assets import Articulation From 7b13e52e93d75823a050a39e2d4811c5b06ae1fc Mon Sep 17 00:00:00 2001 From: jichuanh Date: Mon, 1 Jun 2026 19:44:08 +0000 Subject: [PATCH 50/55] Convert _lib shared scripts to composite actions MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit The two _lib helpers were a sourced bash script (setup-docker-config) and an exec'd bash script (compute-deps-hash) — two different calling conventions for what are both "shared step-logic." Unify them on the GHA-native pattern: each becomes a composite action with an inline `run:` block and a declared interface. - _lib/compute-deps-hash/action.yml: inputs (dockerfile-path, isaacsim-base-image, isaacsim-version) -> output `hash`. Both docker-build and ecr-build-push-pull `uses:` it, so the deps-hash schema lives in exactly one place (no drift between the local-store check and the registry check). - _lib/setup-docker-config/action.yml: no inputs; writes DOCKER_CONFIG to $GITHUB_ENV and logs into nvcr.io. Idempotent (a second invocation in the same job short-circuits), so ecr-build-push-pull delegating to docker-build doesn't redo it. Call sites change from `run: . _lib/x.sh` / `$(_lib/x.sh ...)` to `uses: ./.github/actions/_lib/x`. Dedup is unchanged: ecr computes the hash once via the composite and forwards it to docker-build through the deps-hash input; docker-build computes its own only when the input is empty (the arm-ci direct-call path). 
Delete the two .sh files — the composite action is now the single shared unit, so no separate script is needed. --- .github/actions/_lib/compute-deps-hash.sh | 50 -------------- .../actions/_lib/compute-deps-hash/action.yml | 69 +++++++++++++++++++ .github/actions/_lib/setup-docker-config.sh | 41 ----------- .../_lib/setup-docker-config/action.yml | 43 ++++++++++++ .github/actions/docker-build/action.yml | 28 ++++---- .../actions/ecr-build-push-pull/action.yml | 22 +++--- 6 files changed, 138 insertions(+), 115 deletions(-) delete mode 100755 .github/actions/_lib/compute-deps-hash.sh create mode 100644 .github/actions/_lib/compute-deps-hash/action.yml delete mode 100644 .github/actions/_lib/setup-docker-config.sh create mode 100644 .github/actions/_lib/setup-docker-config/action.yml diff --git a/.github/actions/_lib/compute-deps-hash.sh b/.github/actions/_lib/compute-deps-hash.sh deleted file mode 100755 index 855341f1a0ba..000000000000 --- a/.github/actions/_lib/compute-deps-hash.sh +++ /dev/null @@ -1,50 +0,0 @@ -#!/usr/bin/env bash -# Copyright (c) 2022-2026, The Isaac Lab Project Developers (https://github.com/isaac-sim/IsaacLab/blob/main/CONTRIBUTORS.md). -# All rights reserved. -# -# SPDX-License-Identifier: BSD-3-Clause - -# Shared deps-hash computation for the docker-build and ecr-build-push-pull -# composite actions. Both invoke this script so a registry-side cache hit and -# a local-store cache hit always agree on the same `deps-` tag. -# -# Usage: compute-deps-hash.sh -# Prints the 16-character deps-hash to stdout. Diagnostic output goes to stderr. -set -euo pipefail - -if [ "$#" -ne 3 ]; then - echo "compute-deps-hash: expected 3 args (dockerfile-path, isaacsim-base-image, isaacsim-version)" >&2 - exit 2 -fi - -dockerfile_path="$1" -isaacsim_base_image="$2" -isaacsim_version="$3" - -# Exact files/dirs whose full content is hashed. The Dockerfile is first. 
-deps_files=( - "${dockerfile_path}" - isaaclab.sh - environment.yml - source/isaaclab/isaaclab/cli -) -deps_manifest_pattern='(setup\.py|pyproject\.toml|setup\.cfg|extension\.toml|requirements[^/]*\.txt|uv\.lock)$' - -# Resolve the actual base image digest so a new push of a mutable tag -# (e.g. latest-develop) invalidates the deps cache automatically. -base_image_digest=$(docker buildx imagetools inspect \ - "${isaacsim_base_image}:${isaacsim_version}" \ - --format '{{json .Manifest.Digest}}' 2>/dev/null | tr -d '"' || true) -if [ -n "${base_image_digest}" ]; then - base_image_uniq_id="${isaacsim_base_image}:${isaacsim_version}:${base_image_digest}" -else - echo "🟠 Could not resolve base image digest, falling back to tag string" >&2 - base_image_uniq_id="${isaacsim_base_image}:${isaacsim_version}" -fi - -mapfile -t manifest_files < <(git ls-files | grep -E "${deps_manifest_pattern}" || true) -file_hash=$(git ls-files -s "${deps_files[@]}" "${manifest_files[@]}" 2>/dev/null \ - | sha256sum | cut -c1-16) -deps_hash=$(printf '%s %s' "${file_hash}" "${base_image_uniq_id}" | sha256sum | cut -c1-16) - -printf '%s\n' "${deps_hash}" diff --git a/.github/actions/_lib/compute-deps-hash/action.yml b/.github/actions/_lib/compute-deps-hash/action.yml new file mode 100644 index 000000000000..7df3607c4e54 --- /dev/null +++ b/.github/actions/_lib/compute-deps-hash/action.yml @@ -0,0 +1,69 @@ +# Copyright (c) 2022-2026, The Isaac Lab Project Developers (https://github.com/isaac-sim/IsaacLab/blob/main/CONTRIBUTORS.md). +# All rights reserved. +# +# SPDX-License-Identifier: BSD-3-Clause + +name: 'Compute deps hash' +description: > + Compute the deps-cache hash for the Isaac Lab Docker build. Shared by the + docker-build (local-store check) and ecr-build-push-pull (registry check) + actions so a local hit and a registry hit always agree on the same + `deps-` tag. Hashes the install-relevant files plus the resolved base + image digest. 
+ +inputs: + dockerfile-path: + description: 'Path to Dockerfile' + required: true + isaacsim-base-image: + description: 'IsaacSim base image' + required: true + isaacsim-version: + description: 'IsaacSim version' + required: true + +outputs: + hash: + description: '16-char deps-cache hash' + value: ${{ steps.compute.outputs.hash }} + +runs: + using: composite + steps: + - id: compute + shell: bash + env: + DOCKERFILE_PATH: ${{ inputs.dockerfile-path }} + ISAACSIM_BASE_IMAGE: ${{ inputs.isaacsim-base-image }} + ISAACSIM_VERSION: ${{ inputs.isaacsim-version }} + run: | + set -euo pipefail + + # Exact files/dirs whose full content is hashed. The Dockerfile is first. + deps_files=( + "${DOCKERFILE_PATH}" + isaaclab.sh + environment.yml + source/isaaclab/isaaclab/cli + ) + deps_manifest_pattern='(setup\.py|pyproject\.toml|setup\.cfg|extension\.toml|requirements[^/]*\.txt|uv\.lock)$' + + # Resolve the actual base image digest so a new push of a mutable tag + # (e.g. latest-develop) invalidates the deps cache automatically. 
+ base_image_digest=$(docker buildx imagetools inspect \ + "${ISAACSIM_BASE_IMAGE}:${ISAACSIM_VERSION}" \ + --format '{{json .Manifest.Digest}}' 2>/dev/null | tr -d '"' || true) + if [ -n "${base_image_digest}" ]; then + base_image_uniq_id="${ISAACSIM_BASE_IMAGE}:${ISAACSIM_VERSION}:${base_image_digest}" + else + echo "🟠 Could not resolve base image digest, falling back to tag string" + base_image_uniq_id="${ISAACSIM_BASE_IMAGE}:${ISAACSIM_VERSION}" + fi + + mapfile -t manifest_files < <(git ls-files | grep -E "${deps_manifest_pattern}" || true) + file_hash=$(git ls-files -s "${deps_files[@]}" "${manifest_files[@]}" 2>/dev/null \ + | sha256sum | cut -c1-16) + deps_hash=$(printf '%s %s' "${file_hash}" "${base_image_uniq_id}" | sha256sum | cut -c1-16) + + echo "🔵 Deps hash: ${deps_hash}" + echo "hash=${deps_hash}" >> "$GITHUB_OUTPUT" diff --git a/.github/actions/_lib/setup-docker-config.sh b/.github/actions/_lib/setup-docker-config.sh deleted file mode 100644 index 3a4290e4a5d7..000000000000 --- a/.github/actions/_lib/setup-docker-config.sh +++ /dev/null @@ -1,41 +0,0 @@ -# Copyright (c) 2022-2026, The Isaac Lab Project Developers (https://github.com/isaac-sim/IsaacLab/blob/main/CONTRIBUTORS.md). -# All rights reserved. -# -# SPDX-License-Identifier: BSD-3-Clause - -# Idempotent docker-config + nvcr.io login setup for the docker-build and -# ecr-build-push-pull composite actions. Both composites source this; the -# second invocation in a job is a no-op so callers don't need to coordinate. -# -# Source this script (do not exec) — it sets DOCKER_CONFIG in the caller's -# environment and writes it to $GITHUB_ENV so subsequent steps inherit it. -# Expects NGC_API_KEY in the environment (optional; warns when missing). - -# The runner's credential helper backend is broken ("not implemented") and -# causes docker login calls to fail unless we point DOCKER_CONFIG at a temp -# config with credsStore disabled. 
- -if [ -n "${DOCKER_CONFIG:-}" ] && [ -f "${DOCKER_CONFIG}/config.json" ]; then - echo "🟢 Docker config already set up at ${DOCKER_CONFIG}, skipping" >&2 - return 0 2>/dev/null || exit 0 -fi - -DOCKER_CONFIG_DIR=$(mktemp -d) -if [ -f "${HOME}/.docker/config.json" ]; then - python3 -c "import json; cfg=json.load(open('${HOME}/.docker/config.json')); cfg['credsStore']=''; cfg.pop('credHelpers',None); json.dump(cfg,open('${DOCKER_CONFIG_DIR}/config.json','w'))" -else - echo '{"credsStore":""}' > "${DOCKER_CONFIG_DIR}/config.json" -fi -export DOCKER_CONFIG="${DOCKER_CONFIG_DIR}" -if [ -n "${GITHUB_ENV:-}" ]; then - echo "DOCKER_CONFIG=${DOCKER_CONFIG_DIR}" >> "${GITHUB_ENV}" -fi - -if [ -n "${NGC_API_KEY:-}" ]; then - echo "🔵 Logging into nvcr.io..." >&2 - docker login -u '$oauthtoken' -p "${NGC_API_KEY}" nvcr.io -else - echo "🟠 NGC_API_KEY not set - skipping nvcr.io login (normal for fork PRs)" >&2 -fi - -unset DOCKER_CONFIG_DIR diff --git a/.github/actions/_lib/setup-docker-config/action.yml b/.github/actions/_lib/setup-docker-config/action.yml new file mode 100644 index 000000000000..09effa56eaa3 --- /dev/null +++ b/.github/actions/_lib/setup-docker-config/action.yml @@ -0,0 +1,43 @@ +# Copyright (c) 2022-2026, The Isaac Lab Project Developers (https://github.com/isaac-sim/IsaacLab/blob/main/CONTRIBUTORS.md). +# All rights reserved. +# +# SPDX-License-Identifier: BSD-3-Clause + +name: 'Setup docker config' +description: > + Point DOCKER_CONFIG at a temp config with the credential helper disabled and + log into nvcr.io. Shared by the docker-build and ecr-build-push-pull actions. + Idempotent: re-invoking it in the same job is a no-op, so callers (e.g. + ecr-build-push-pull delegating to docker-build) don't need to coordinate. + Reads NGC_API_KEY from the environment (optional; warns when missing). 
+ +runs: + using: composite + steps: + - shell: bash + run: | + # The runner's credential helper backend is broken ("not implemented") + # and causes docker login calls to fail unless we point DOCKER_CONFIG at + # a temp config with credsStore disabled. The value is written to + # $GITHUB_ENV so subsequent steps in the job inherit it; a second + # invocation sees it already set and short-circuits. + if [ -n "${DOCKER_CONFIG:-}" ] && [ -f "${DOCKER_CONFIG}/config.json" ]; then + echo "🟢 Docker config already set up at ${DOCKER_CONFIG}, skipping" + exit 0 + fi + + DOCKER_CONFIG_DIR=$(mktemp -d) + if [ -f "${HOME}/.docker/config.json" ]; then + python3 -c "import json; cfg=json.load(open('${HOME}/.docker/config.json')); cfg['credsStore']=''; cfg.pop('credHelpers',None); json.dump(cfg,open('${DOCKER_CONFIG_DIR}/config.json','w'))" + else + echo '{"credsStore":""}' > "${DOCKER_CONFIG_DIR}/config.json" + fi + export DOCKER_CONFIG="${DOCKER_CONFIG_DIR}" + echo "DOCKER_CONFIG=${DOCKER_CONFIG_DIR}" >> "$GITHUB_ENV" + + if [ -n "${NGC_API_KEY:-}" ]; then + echo "🔵 Logging into nvcr.io..." + docker login -u '$oauthtoken' -p "${NGC_API_KEY}" nvcr.io + else + echo "🟠 NGC_API_KEY not set - skipping nvcr.io login (normal for fork PRs)" + fi diff --git a/.github/actions/docker-build/action.yml b/.github/actions/docker-build/action.yml index 1b961348a098..cbf69590c815 100644 --- a/.github/actions/docker-build/action.yml +++ b/.github/actions/docker-build/action.yml @@ -44,9 +44,9 @@ inputs: deps-hash: description: > Pre-computed deps-hash to use for the local deps-tag check. When empty, - this action computes the hash itself via `_lib/compute-deps-hash.sh`. - Set by callers (e.g. `ecr-build-push-pull`) that already compute the hash - for a registry-side check, to avoid recomputing here. + this action computes the hash itself via the `_lib/compute-deps-hash` + action. Set by callers (e.g. 
`ecr-build-push-pull`) that already compute + the hash for a registry-side check, to avoid recomputing here. default: '' required: false evict-stale-cache: @@ -64,10 +64,7 @@ runs: ##### 1: Setup docker config + login to nvcr.io (optional) ##### - name: Setup docker config and login to nvcr.io - shell: bash - run: | - # shellcheck source=/dev/null - . .github/actions/_lib/setup-docker-config.sh + uses: ./.github/actions/_lib/setup-docker-config ##### 2: Host disk snapshot (pre) ##### @@ -113,18 +110,21 @@ runs: ##### 4: Local deps-tag short-circuit ##### + - name: Compute deps hash + id: deps-hash + if: steps.local.outputs.hit != 'true' && inputs.deps-hash == '' + uses: ./.github/actions/_lib/compute-deps-hash + with: + dockerfile-path: ${{ inputs.dockerfile-path }} + isaacsim-base-image: ${{ inputs.isaacsim-base-image }} + isaacsim-version: ${{ inputs.isaacsim-version }} + - name: Check deps-tag locally id: local-deps if: steps.local.outputs.hit != 'true' shell: bash run: | - DEPS_HASH="${{ inputs.deps-hash }}" - if [ -z "${DEPS_HASH}" ]; then - DEPS_HASH=$(.github/actions/_lib/compute-deps-hash.sh \ - "${{ inputs.dockerfile-path }}" \ - "${{ inputs.isaacsim-base-image }}" \ - "${{ inputs.isaacsim-version }}") - fi + DEPS_HASH="${{ inputs.deps-hash || steps.deps-hash.outputs.hash }}" LOCAL_DEPS_TAG="$(echo "${{ inputs.image-tag }}" | cut -d: -f1):deps-${DEPS_HASH}" echo "🔵 Local deps tag: ${LOCAL_DEPS_TAG}" diff --git a/.github/actions/ecr-build-push-pull/action.yml b/.github/actions/ecr-build-push-pull/action.yml index 193b16187391..d6fe0cb27987 100644 --- a/.github/actions/ecr-build-push-pull/action.yml +++ b/.github/actions/ecr-build-push-pull/action.yml @@ -50,10 +50,7 @@ runs: # (including ECR login in step 3) inherit it automatically. - name: Setup docker config and login to nvcr.io - shell: bash - run: | - # shellcheck source=/dev/null - . 
.github/actions/_lib/setup-docker-config.sh + uses: ./.github/actions/_lib/setup-docker-config ##### 2: Resolve ECR URL ##### @@ -178,16 +175,21 @@ runs: # Edit DEPS_FILES or DEPS_MANIFEST_PATTERN when install # inputs change (new packages, new manifests, etc.). + - name: Compute deps hash + id: deps-hash + if: steps.resolve-ecr.outputs.available == 'true' && steps.pull-exact.outputs.hit != 'true' + uses: ./.github/actions/_lib/compute-deps-hash + with: + dockerfile-path: ${{ inputs.dockerfile-path }} + isaacsim-base-image: ${{ inputs.isaacsim-base-image }} + isaacsim-version: ${{ inputs.isaacsim-version }} + - name: Check deps cache id: deps-cache if: steps.resolve-ecr.outputs.available == 'true' && steps.pull-exact.outputs.hit != 'true' shell: bash run: | - DEPS_HASH=$(.github/actions/_lib/compute-deps-hash.sh \ - "${{ inputs.dockerfile-path }}" \ - "${{ inputs.isaacsim-base-image }}" \ - "${{ inputs.isaacsim-version }}") - echo "deps_hash=${DEPS_HASH}" >> "$GITHUB_OUTPUT" + DEPS_HASH="${{ steps.deps-hash.outputs.hash }}" DEPS_ECR_IMAGE="${ECR_URL}:deps-${DEPS_HASH}" echo "🔵 Deps hash: ${DEPS_HASH}" echo "🔵 Checking if deps image ${DEPS_ECR_IMAGE} exists in ECR..." 
@@ -225,7 +227,7 @@ runs: dockerfile-path: ${{ inputs.dockerfile-path }} cache-from: ${{ steps.resolve-ecr.outputs.available == 'true' && format('type=registry,ref={0}', env.CACHE_IMAGE) || '' }} cache-to: ${{ steps.resolve-ecr.outputs.available == 'true' && format('type=registry,ref={0},mode=max', env.CACHE_IMAGE) || '' }} - deps-hash: ${{ steps.deps-cache.outputs.deps_hash }} + deps-hash: ${{ steps.deps-hash.outputs.hash }} - name: Tag built image with ECR-prefixed name if: > From 2db1482062e3705e9f21dfee45e40732f06c05bf Mon Sep 17 00:00:00 2001 From: jichuanh Date: Mon, 1 Jun 2026 20:20:54 +0000 Subject: [PATCH 51/55] arm-ci: opt into docker-build stale-cache eviction MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit The `evict-stale-cache` input on docker-build was inert — no caller set it, so the 14-day deps-tag eviction never ran. The Spark runner is long-lived self-hosted with no ECR, so its local deps-cache tags accumulate indefinitely; it's the legitimate consumer. Set evict-stale-cache: "true" on the arm-ci build so stale tags are pruned each run. --- .github/workflows/build.yaml | 3 +++ 1 file changed, 3 insertions(+) diff --git a/.github/workflows/build.yaml b/.github/workflows/build.yaml index 18a4a4f34dfd..9d58887d20f2 100644 --- a/.github/workflows/build.yaml +++ b/.github/workflows/build.yaml @@ -262,6 +262,9 @@ jobs: isaacsim-version: ${{ needs.config.outputs.isaacsim_image_tag }} dockerfile-path: docker/Dockerfile.base platform: linux/arm64 + # The Spark runner is long-lived self-hosted with no ECR, so its local + # deps-cache tags accumulate; evict ones older than 14 days each run. 
+ evict-stale-cache: "true" - name: Run arm_ci marker tests uses: ./.github/actions/run-tests From c54e5124147370d9d95faa8c354cdfc6e359708d Mon Sep 17 00:00:00 2001 From: jichuanh Date: Mon, 1 Jun 2026 20:36:25 +0000 Subject: [PATCH 52/55] Apply post-review fixes; scope test tags to arm_ci only Review fixes (F4-F6): - test_cartpole_training_smoke.py: repoint the perception case from the retired/deprecated Isaac-Cartpole-RGB-Camera-Direct-v0 to the canonical Isaac-Cartpole-Camera-Direct-v0 (defaults to rgb, has an rl_games entry point, not on the removal path); fix the docstring claim about camera envs registering "only" rl_games (they register rl_games + skrl, just no rsl_rl). - isaaclab changelog: add a bullet for the AssetConverterBase default USD dir moving from hardcoded /tmp to tempfile.gettempdir() (a user-facing cross-platform change that was undocumented). - isaaclab_tasks changelog: the package's delta here is test-only (arm_ci markers + aarch64 skip guards in rendering test helpers), so convert the fragment to .skip per AGENTS.md (no user-facing entry). Arm-only test tags: - test_scipy.py / test_torch.py carried [windows_ci, arm_ci]; this is the ARM workflow PR, so scope their tags to arm_ci only. windows_ci test tags belong to the Windows workflow PR. 
--- source/isaaclab/changelog.d/jichuanh-arm-ci.rst | 5 +++++ source/isaaclab/test/deps/test_scipy.py | 2 +- source/isaaclab/test/deps/test_torch.py | 2 +- source/isaaclab_tasks/changelog.d/jichuanh-arm-ci.rst | 9 --------- source/isaaclab_tasks/changelog.d/jichuanh-arm-ci.skip | 2 ++ .../isaaclab_tasks/test/test_cartpole_training_smoke.py | 5 +++-- 6 files changed, 12 insertions(+), 13 deletions(-) delete mode 100644 source/isaaclab_tasks/changelog.d/jichuanh-arm-ci.rst create mode 100644 source/isaaclab_tasks/changelog.d/jichuanh-arm-ci.skip diff --git a/source/isaaclab/changelog.d/jichuanh-arm-ci.rst b/source/isaaclab/changelog.d/jichuanh-arm-ci.rst index 376f7f589170..38548a771d35 100644 --- a/source/isaaclab/changelog.d/jichuanh-arm-ci.rst +++ b/source/isaaclab/changelog.d/jichuanh-arm-ci.rst @@ -9,3 +9,8 @@ Fixed ``KeyError: 'EXP_PATH'`` deep inside ``_resolve_experience_file``; now AppLauncher resolves the path from ``isaacsim.__file__`` and stores it back into the environment so subsequent code can rely on it. + +* Fixed :class:`~isaaclab.sim.converters.AssetConverterBase` hardcoding + ``/tmp`` for its default USD output directory. It now resolves the path under + ``tempfile.gettempdir()``, so it honors ``$TMPDIR`` on POSIX and works on + Windows (where the system temp dir is ``%TEMP%``). 
diff --git a/source/isaaclab/test/deps/test_scipy.py b/source/isaaclab/test/deps/test_scipy.py index f42e54c304e9..6b2d0dee6e81 100644 --- a/source/isaaclab/test/deps/test_scipy.py +++ b/source/isaaclab/test/deps/test_scipy.py @@ -13,7 +13,7 @@ import numpy as np import scipy.interpolate as interpolate -pytestmark = [pytest.mark.windows_ci, pytest.mark.arm_ci] +pytestmark = pytest.mark.arm_ci @pytest.mark.isaacsim_ci diff --git a/source/isaaclab/test/deps/test_torch.py b/source/isaaclab/test/deps/test_torch.py index e651987daa26..9d1bf39da64f 100644 --- a/source/isaaclab/test/deps/test_torch.py +++ b/source/isaaclab/test/deps/test_torch.py @@ -7,7 +7,7 @@ import torch import torch.utils.benchmark as benchmark -pytestmark = [pytest.mark.windows_ci, pytest.mark.arm_ci] +pytestmark = pytest.mark.arm_ci @pytest.mark.isaacsim_ci diff --git a/source/isaaclab_tasks/changelog.d/jichuanh-arm-ci.rst b/source/isaaclab_tasks/changelog.d/jichuanh-arm-ci.rst deleted file mode 100644 index 20dbfd59928b..000000000000 --- a/source/isaaclab_tasks/changelog.d/jichuanh-arm-ci.rst +++ /dev/null @@ -1,9 +0,0 @@ -Fixed -^^^^^ - -* Changed the kitless rendering tests' ``ov[ovrtx]`` and ``ov[ovphysx]`` autouse - guards to skip rather than fail on aarch64 when the optional dependency is - unavailable. Both wheels are published only for x86_64, so on aarch64 these - gates were turning unreachable parametrize cases into hard failures; x86 - environments without the dependency still see the original - "install with ``./isaaclab.sh -i 'ov[…]'``" failure with install guidance. diff --git a/source/isaaclab_tasks/changelog.d/jichuanh-arm-ci.skip b/source/isaaclab_tasks/changelog.d/jichuanh-arm-ci.skip new file mode 100644 index 000000000000..c312132ab201 --- /dev/null +++ b/source/isaaclab_tasks/changelog.d/jichuanh-arm-ci.skip @@ -0,0 +1,2 @@ +Test-only changes (arm_ci markers + aarch64 skip guards in rendering test +helpers); no user-facing changelog entry. 
diff --git a/source/isaaclab_tasks/test/test_cartpole_training_smoke.py b/source/isaaclab_tasks/test/test_cartpole_training_smoke.py index 71e541cc5545..d7c640323d7a 100644 --- a/source/isaaclab_tasks/test/test_cartpole_training_smoke.py +++ b/source/isaaclab_tasks/test/test_cartpole_training_smoke.py @@ -14,7 +14,8 @@ The state case uses rsl_rl (matches Isaac-Cartpole-Direct-v0's registered config entry); the perception case uses rl_games because the camera-variant -direct envs only register ``rl_games_cfg_entry_point``. +direct envs register ``rl_games_cfg_entry_point`` and ``skrl_cfg_entry_point`` +but no ``rsl_rl_cfg_entry_point``. """ from __future__ import annotations @@ -72,6 +73,6 @@ def test_train_cartpole_perception(): """RGB-camera cartpole trains for two rl_games PPO iterations without errors.""" _run_train( "scripts/reinforcement_learning/rl_games/train.py", - "Isaac-Cartpole-RGB-Camera-Direct-v0", + "Isaac-Cartpole-Camera-Direct-v0", extra_args=["--enable_cameras"], ) From 5cbba720a4cdfaf97ea47531806e59e4579e9da2 Mon Sep 17 00:00:00 2001 From: jichuanh Date: Mon, 1 Jun 2026 22:05:58 +0000 Subject: [PATCH 53/55] Move cartpole smoke test into test/core/ after tasks split The develop merge brought in #5891 (split tasks tests into core/contrib), which relocated every test file under test/core/ or test/contrib/ and left only shared helpers + conftest at the test-suite root. The new test_cartpole_training_smoke.py predated that split and was the lone test_*.py still at top-level. Cartpole is a core task, so move it under test/core/ to match the new layout, and bump its _REPO_ROOT path depth from parents[3] to parents[4] (matching the sibling core/ tests). 
--- .../test/{ => core}/test_cartpole_training_smoke.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) rename source/isaaclab_tasks/test/{ => core}/test_cartpole_training_smoke.py (98%) diff --git a/source/isaaclab_tasks/test/test_cartpole_training_smoke.py b/source/isaaclab_tasks/test/core/test_cartpole_training_smoke.py similarity index 98% rename from source/isaaclab_tasks/test/test_cartpole_training_smoke.py rename to source/isaaclab_tasks/test/core/test_cartpole_training_smoke.py index d7c640323d7a..331fab2280b3 100644 --- a/source/isaaclab_tasks/test/test_cartpole_training_smoke.py +++ b/source/isaaclab_tasks/test/core/test_cartpole_training_smoke.py @@ -27,7 +27,7 @@ pytestmark = pytest.mark.arm_ci -_REPO_ROOT = Path(__file__).resolve().parents[3] +_REPO_ROOT = Path(__file__).resolve().parents[4] def _run_train(train_script: str, task_name: str, extra_args: list[str] | None = None, timeout: int = 600) -> None: From df786d2f77b87dd1be096941e69bde6e79b9b4fd Mon Sep 17 00:00:00 2001 From: jichuanh Date: Mon, 1 Jun 2026 22:30:08 +0000 Subject: [PATCH 54/55] Raise cartpole perception smoke-test timeout for cold shader cache test_train_cartpole_perception timed out at the default 600s on a cold-cache Spark runner: the first camera-enabled run compiles shaders (~600s) before training even starts, which alone consumes the budget. It passed earlier only because that runner had a warm shader cache. Give the camera case a 1800s timeout (the state case keeps 600s). 
--- .../isaaclab_tasks/test/core/test_cartpole_training_smoke.py | 3 +++ 1 file changed, 3 insertions(+) diff --git a/source/isaaclab_tasks/test/core/test_cartpole_training_smoke.py b/source/isaaclab_tasks/test/core/test_cartpole_training_smoke.py index 331fab2280b3..c2b44a94827f 100644 --- a/source/isaaclab_tasks/test/core/test_cartpole_training_smoke.py +++ b/source/isaaclab_tasks/test/core/test_cartpole_training_smoke.py @@ -71,8 +71,11 @@ def test_train_cartpole_state(): def test_train_cartpole_perception(): """RGB-camera cartpole trains for two rl_games PPO iterations without errors.""" + # The first camera-enabled run on a cold cache compiles shaders (~600 s) + # before training starts, so allow well beyond the state-case budget. _run_train( "scripts/reinforcement_learning/rl_games/train.py", "Isaac-Cartpole-Camera-Direct-v0", extra_args=["--enable_cameras"], + timeout=1800, ) From 6f2bdec650ec2f14c19d2b00680143dd9a73929f Mon Sep 17 00:00:00 2001 From: jichuanh Date: Thu, 11 Jun 2026 19:18:43 +0000 Subject: [PATCH 55/55] Update cartpole smoke test to unversioned task IDs The develop merge removed the versioned Isaac-Cartpole-Direct-v0 and Isaac-Cartpole-Camera-Direct-v0 gym registrations in favor of the unversioned Isaac-Cartpole-Direct and Isaac-Cartpole-Camera-Direct ids. The smoke test still requested the -v0 names, so every run failed at gym registration with DeprecatedEnv before training started, breaking both the arm-ci and isaaclab_tasks jobs. Point the test at the current ids and correct the docstring: the camera variant now also registers an rsl_rl_cfg_entry_point, so the old rationale for choosing rl_games no longer held. 
--- .../test/core/test_cartpole_training_smoke.py | 11 +++++------ 1 file changed, 5 insertions(+), 6 deletions(-) diff --git a/source/isaaclab_tasks/test/core/test_cartpole_training_smoke.py b/source/isaaclab_tasks/test/core/test_cartpole_training_smoke.py index c2b44a94827f..4b0ff1c054cc 100644 --- a/source/isaaclab_tasks/test/core/test_cartpole_training_smoke.py +++ b/source/isaaclab_tasks/test/core/test_cartpole_training_smoke.py @@ -12,10 +12,9 @@ write) without the cost of a real training run, so the orchestrator can include them in every CI shape (Linux, ARM/Spark). -The state case uses rsl_rl (matches Isaac-Cartpole-Direct-v0's registered -config entry); the perception case uses rl_games because the camera-variant -direct envs register ``rl_games_cfg_entry_point`` and ``skrl_cfg_entry_point`` -but no ``rsl_rl_cfg_entry_point``. +The state case uses rsl_rl (matches Isaac-Cartpole-Direct's registered +``rsl_rl_cfg_entry_point``); the perception case uses rl_games against +Isaac-Cartpole-Camera-Direct's ``rl_games_cfg_entry_point``. """ from __future__ import annotations @@ -66,7 +65,7 @@ def _run_train(train_script: str, task_name: str, extra_args: list[str] | None = def test_train_cartpole_state(): """State-observation cartpole trains for two rsl_rl PPO iterations without errors.""" - _run_train("scripts/reinforcement_learning/rsl_rl/train.py", "Isaac-Cartpole-Direct-v0") + _run_train("scripts/reinforcement_learning/rsl_rl/train.py", "Isaac-Cartpole-Direct") def test_train_cartpole_perception(): @@ -75,7 +74,7 @@ def test_train_cartpole_perception(): # before training starts, so allow well beyond the state-case budget. _run_train( "scripts/reinforcement_learning/rl_games/train.py", - "Isaac-Cartpole-Camera-Direct-v0", + "Isaac-Cartpole-Camera-Direct", extra_args=["--enable_cameras"], timeout=1800, )