Remove dangling layers. #7938
Workflow file for this run
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
# Runner information:
# - OpenACC/OpenMP on NVIDIA runs on runners labeled `nvidiagpu`
# - OpenMP on AMD runs on runners labeled `amdgpu`
#
# Changes vs original:
# * Respect CUDA_VISIBLE_DEVICES for NVIDIA jobs by passing it AND restricting Docker with --gpus "device=…"
# * Tag images and container names with ${{ runner.name }} to avoid cross-runner races and maximize cache reuse
# * Remove unfiltered docker prune / global container deletes (we assume disk space is fine); the cleanup step only prunes images labeled with this run's ID
# * Add comments throughout
# Workflow identity, run de-duplication and workflow-level environment.
name: "CI-gpu"

# At most one active run per workflow/ref pair; starting a newer run
# cancels the one already in progress.
concurrency:
  group: "${{ github.workflow }}-${{ github.ref }}"
  cancel-in-progress: true

# Environment variables defined at workflow level.
env:
  OUTPUT_PATH: "${{ github.workspace }}"
  RESOURCE_GROUP: "CI-gpu"
# Triggers: pushes to main, pull requests targeting main, and manual
# dispatch from the Actions UI.
on:
  push:
    branches:
      - main
  pull_request:
    branches:
      - main
  workflow_dispatch:
    inputs:
      tags:
        description: "Run GPU tests"
jobs:
  build:
    # Each matrix entry is shown under its own name.
    name: ${{ matrix.name }}
    # Self-hosted runners only; the GPU vendor label comes from the matrix.
    runs-on:
      - self-hosted
      - ${{ matrix.runner_label }}
    strategy:
      # Let the other GPU job finish even if one of them fails.
      fail-fast: false
      matrix:
        name:
          - pytest-gpu-acc-nvidia
          - pytest-gpu-omp-amd
        # Single-element axis: the same example list is shared by every job.
        # The folded scalar collapses to one space-separated string.
        test_examples:
          - >-
            examples/seismic/tti/tti_example.py
            examples/seismic/acoustic/acoustic_example.py
            examples/seismic/viscoacoustic/viscoacoustic_example.py
            examples/seismic/viscoelastic/viscoelastic_example.py
            examples/seismic/elastic/elastic_example.py
        include:
          # -------------------- NVIDIA job --------------------
          - name: pytest-gpu-acc-nvidia
            runner_label: nvidiagpu
            base: "devitocodes/bases:nvidia-nvc"
            test_drive_cmd: "nvidia-smi"
            test_files: >-
              tests/test_adjoint.py
              tests/test_gpu_common.py
              tests/test_gpu_openacc.py
            # `docker run` options. CUDA_VISIBLE_DEVICES is forwarded into the
            # container and also used to limit Docker to that device; it is
            # expected to be set by the runner (systemd drop-in etc.), with
            # "all" as the shell-level default for --gpus.
            flags: >-
              --init
              --rm
              -t
              --name ${CONTAINER_BASENAME}
              --env CUDA_VISIBLE_DEVICES
              --gpus "device=${CUDA_VISIBLE_DEVICES:-all}"
          # -------------------- AMD job -----------------------
          - name: pytest-gpu-omp-amd
            runner_label: amdgpu
            base: "devitocodes/bases:amd"
            test_drive_cmd: "rocm-smi"
            test_files: >-
              tests/test_adjoint.py
              tests/test_gpu_common.py
              tests/test_gpu_openmp.py
            # `docker run` options: pass through the /dev/kfd and /dev/dri
            # device nodes and add the video/render groups.
            flags: >-
              --init
              --network=host
              --device=/dev/kfd
              --device=/dev/dri
              --ipc=host
              --group-add video
              --group-add "$(getent group render | cut -d: -f3)"
              --cap-add=SYS_PTRACE
              --security-opt seccomp=unconfined
              --rm
              -t
              --name ${CONTAINER_BASENAME}
    steps:
# Check out the repository; the later `docker build .` uses it as the
# build context.
- name: "Checkout devito"
  uses: actions/checkout@v4
# Publish DOCKER_IMAGE and CONTAINER_BASENAME to the following steps.
# Both embed the runner name so that different runners never share an
# image tag or a container name.
- name: Set per-runner tags
  run: |
    # Spaces are not valid in image tags or container names.
    runner_tag="${RUNNER_NAME// /_}"
    # Docker rejects repository names containing uppercase letters, so the
    # image tag additionally gets a lowercased suffix.
    image_suffix="$(printf '%s' "${runner_tag}" | tr '[:upper:]' '[:lower:]')"
    echo "DOCKER_IMAGE=${{ matrix.name }}-${image_suffix}" >> "$GITHUB_ENV"
    echo "CONTAINER_BASENAME=testrun-${{ matrix.name }}-${runner_tag}" >> "$GITHUB_ENV"
# Build the test image from docker/Dockerfile.devito on top of the
# vendor-specific base image selected by the matrix.
- name: Build docker image
  run: |
    # Per-runner buildx builder. Spaces in the runner name are replaced (as
    # is done for the image/container names) and the value is quoted so it
    # is always passed as a single argument.
    builder="${RUNNER_NAME// /_}"
    # Creation fails when the builder already exists; that is fine.
    docker buildx create --use --name "${builder}" || true
    # The ci-run label lets the cleanup step find what this run produced.
    docker build . \
      --builder "${builder}" \
      --label "ci-run=${GITHUB_RUN_ID}" \
      --rm --pull \
      --file docker/Dockerfile.devito \
      --tag "${DOCKER_IMAGE}" \
      --build-arg base="${{ matrix.base }}"
# Make the Codecov upload token available to the following steps.
- name: Export CODECOV token
  env:
    # Hand the secret to the shell as an environment variable instead of
    # pasting it into the script text, so quotes or `$` inside the value
    # cannot alter the command that runs.
    CODECOV_TOKEN: ${{ secrets.CODECOV_TOKEN }}
  run: echo "CODECOV_TOKEN=${CODECOV_TOKEN}" >> "$GITHUB_ENV"
# Smoke-test GPU access from inside the freshly built image.
- name: Probe gpu
  run: |
    # On NVIDIA runners that did not provide CUDA_VISIBLE_DEVICES, record
    # "all" for the steps that follow. Values appended to GITHUB_ENV only
    # become visible in later steps; the probe below relies on the
    # ":-all" default written into the matrix flags instead.
    # NOTE(review): "all" is subsequently forwarded into the container as
    # CUDA_VISIBLE_DEVICES - confirm the CUDA runtime accepts that value.
    if [[ "${{ matrix.runner_label }}" == "nvidiagpu" && -z "${CUDA_VISIBLE_DEVICES:-}" ]]; then
      echo "CUDA_VISIBLE_DEVICES=all" >> "$GITHUB_ENV"
    fi
    # Run a simple driver-probe command (nvidia-smi / rocm-smi)
    docker run ${{ matrix.flags }} "${DOCKER_IMAGE}" ${{ matrix.test_drive_cmd }}
# Run the GPU test files under coverage inside the test image.
- name: Test with pytest
  env:
    # Exported earlier in the job; needed inside the container for codecov
    CODECOV_TOKEN: ${{ env.CODECOV_TOKEN }}
  run: |
    # 1. Ask Codecov's helper script for the extra `docker run` arguments
    #    that forward the CI environment (GITHUB_SHA, etc.).
    #    --fail makes curl emit nothing on an HTTP error, so an error page
    #    is never handed to bash as if it were the script.
    #    NOTE(review): this executes code downloaded at run time.
    ci_env=$(bash <(curl -fsS https://codecov.io/env))

    # 2. NVIDIA only: require a GPU mask from the runner and enable
    #    synchronous launches plus OpenACC runtime tracing.
    nvidia_env_flags=()
    if [[ "${{ matrix.runner_label }}" == "nvidiagpu" ]]; then
      : "${CUDA_VISIBLE_DEVICES:?CUDA_VISIBLE_DEVICES not set on NVIDIA runner}"
      nvidia_env_flags=(
        -e CUDA_LAUNCH_BLOCKING=1
        -e NV_ACC_NOTIFY=3
        -e NV_ACC_DEBUG=1
      )
    fi

    # 3. Run the test suite using the matrix-defined flags.
    #    ${ci_env} is intentionally unquoted: it holds several arguments.
    docker run ${{ matrix.flags }} \
      ${ci_env} \
      -e CI=true \
      -e PYTHONFAULTHANDLER=1 \
      -e DEVITO_LOGGING=DEBUG \
      -e CODECOV_TOKEN \
      "${nvidia_env_flags[@]}" \
      "${DOCKER_IMAGE}" \
      pytest -vv -ra -l -s --full-trace --maxfail=1 \
        --cov --cov-config=.coveragerc --cov-report=xml \
        ${{ matrix.test_files }}
# Run the example scripts listed in the matrix through pytest, inside the
# test image. The folded scalar collapses to a single command line.
- name: Test examples
  run: >-
    docker run ${{ matrix.flags }} "${DOCKER_IMAGE}"
    pytest ${{ matrix.test_examples }}
# Same examples again, launched on two MPI ranks with DEVITO_MPI=1 set in
# the container. The folded scalar collapses to a single command line.
- name: Test examples with MPI
  run: >-
    docker run ${{ matrix.flags }} --env DEVITO_MPI=1 "${DOCKER_IMAGE}"
    mpiexec -n 2 pytest ${{ matrix.test_examples }}
# Runs even when an earlier step failed, so images do not pile up on the
# runner.
- name: Clean up test image
  if: always()
  run: |
    # Delete the image tagged for this job; a failure here is ignored.
    docker rmi -f "${DOCKER_IMAGE}" || true
    # Not a global prune: the filter restricts it to dangling images that
    # carry the ci-run label of this workflow run.
    docker image prune -f --filter "label=ci-run=${GITHUB_RUN_ID}"