Remove dangling layers. #7938

Workflow file for this run

.github/workflows/pytest-gpu.yml at adc267e

	# Runner information:
	# - OpenACC/OpenMP on NVIDIA runs on runners labeled `nvidiagpu`
	# - OpenMP on AMD runs on runners labeled `amdgpu`
	#
	# Changes vs original:
	# * Respect CUDA_VISIBLE_DEVICES for NVIDIA jobs by passing it AND restricting Docker with --gpus "device=…"
	# * Tag images and container names with ${{ runner.name }} to avoid cross-runner races and maximize cache reuse
	# * Drop global docker prune / container deletes; cleanup only removes the image and dangling layers built by this run
	# * Add comments throughout

	name: CI-gpu

	# A new run for the same workflow and ref cancels the one still in progress.
	concurrency:
	  group: ${{ github.workflow }}-${{ github.ref }}
	  cancel-in-progress: true

	# Workflow-level environment variables.
	env:
	  OUTPUT_PATH: ${{ github.workspace }}
	  RESOURCE_GROUP: CI-gpu

	on:
	  push:
	    branches: [main]
	  pull_request:
	    branches: [main]
	  # Manual trigger.
	  workflow_dispatch:
	    inputs:
	      tags:
	        description: 'Run GPU tests'

	jobs:
	  # One matrix job per GPU vendor: build the Devito image on the runner,
	  # probe the GPU from inside the container, run the test suite and the
	  # examples (serial and MPI), then remove what this run built.
	  build:
	    name: ${{ matrix.name }}
	    runs-on:
	      - self-hosted
	      - ${{ matrix.runner_label }}

	    strategy:
	      fail-fast: false
	      matrix:
	        name:
	          - pytest-gpu-acc-nvidia
	          - pytest-gpu-omp-amd
	        # Single entry: one space-separated list of example scripts that is
	        # handed to pytest as-is.
	        test_examples:
	          - >-
	            examples/seismic/tti/tti_example.py
	            examples/seismic/acoustic/acoustic_example.py
	            examples/seismic/viscoacoustic/viscoacoustic_example.py
	            examples/seismic/viscoelastic/viscoelastic_example.py
	            examples/seismic/elastic/elastic_example.py

	        include:
	          # -------------------- NVIDIA job --------------------
	          - name: pytest-gpu-acc-nvidia
	            test_files: "tests/test_adjoint.py tests/test_gpu_common.py tests/test_gpu_openacc.py"
	            base: "devitocodes/bases:nvidia-nvc"
	            runner_label: nvidiagpu
	            test_drive_cmd: "nvidia-smi"
	            # Respect CUDA_VISIBLE_DEVICES and also hard-limit Docker to that device.
	            # NOTE: CUDA_VISIBLE_DEVICES must be set by the runner (systemd drop-in etc.).
	            # Docker parses the --gpus value as CSV, so the device list is wrapped
	            # in literal quotes; otherwise a mask such as "0,1" is split at the comma.
	            flags: >-
	              --init --rm -t
	              --name ${CONTAINER_BASENAME}
	              --env CUDA_VISIBLE_DEVICES
	              --gpus "\"device=${CUDA_VISIBLE_DEVICES:-all}\""

	          # -------------------- AMD job -----------------------
	          - name: pytest-gpu-omp-amd
	            test_files: "tests/test_adjoint.py tests/test_gpu_common.py tests/test_gpu_openmp.py"
	            runner_label: amdgpu
	            base: "devitocodes/bases:amd"
	            test_drive_cmd: "rocm-smi"
	            # Unchanged, still passes through required /dev nodes etc.
	            flags: >-
	              --init --network=host
	              --device=/dev/kfd --device=/dev/dri
	              --ipc=host
	              --group-add video --group-add "$(getent group render | cut -d: -f3)"
	              --cap-add=SYS_PTRACE --security-opt seccomp=unconfined
	              --rm -t
	              --name ${CONTAINER_BASENAME}

	    steps:
	      - name: Checkout devito
	        uses: actions/checkout@v4

	      # Image and container names carry the matrix job name and the runner
	      # name (spaces replaced by underscores) and are exported to later steps.
	      - name: Set per-runner tags
	        run: |
	          echo "DOCKER_IMAGE=${{ matrix.name }}-${RUNNER_NAME// /_}" >> "$GITHUB_ENV"
	          echo "CONTAINER_BASENAME=testrun-${{ matrix.name }}-${RUNNER_NAME// /_}" >> "$GITHUB_ENV"

	      - name: Build docker image
	        run: |
	          # Use the same sanitisation as the image/container tags: an unquoted
	          # runner name containing spaces would be split into several arguments.
	          builder="${RUNNER_NAME// /_}"
	          # Best effort: presumably tolerates a builder left over from an
	          # earlier run on this runner.
	          docker buildx create --use --name "${builder}" || true
	          # The ci-run label lets the cleanup step prune only this run's layers.
	          docker build . \
	            --builder "${builder}" \
	            --label "ci-run=${GITHUB_RUN_ID}" \
	            --rm --pull \
	            --file docker/Dockerfile.devito \
	            --tag "${DOCKER_IMAGE}" \
	            --build-arg base="${{ matrix.base }}"

	      - name: Export CODECOV token
	        run: echo "CODECOV_TOKEN=${{ secrets.CODECOV_TOKEN }}" >> "$GITHUB_ENV"

	      - name: Probe gpu
	        run: |
	          # Make sure CUDA_VISIBLE_DEVICES is at least something on NVIDIA
	          # runners for the steps that follow. Values written to GITHUB_ENV
	          # are not visible in this step; the probe below relies on the
	          # ":-all" default inside the matrix flags instead.
	          # NOTE(review): "all" is not a CUDA device index - confirm the
	          # container copes with CUDA_VISIBLE_DEVICES=all in later steps.
	          if [[ "${{ matrix.runner_label }}" == "nvidiagpu" && -z "${CUDA_VISIBLE_DEVICES:-}" ]]; then
	            echo "CUDA_VISIBLE_DEVICES=all" >> "$GITHUB_ENV"
	          fi

	          # Run a simple driver-probe command (nvidia-smi / rocm-smi)
	          docker run ${{ matrix.flags }} "${DOCKER_IMAGE}" ${{ matrix.test_drive_cmd }}

	      - name: Test with pytest
	        env:
	          # Exported earlier in the job; needed inside the container for codecov
	          CODECOV_TOKEN: ${{ env.CODECOV_TOKEN }}
	        run: |
	          # 1. Add Codecov's environment variables (GITHUB_SHA, etc.)
	          ci_env=$(bash <(curl -s https://codecov.io/env))

	          # 2. Extra container environment for NVIDIA; empty for other runners.
	          NVIDIA_ENV_FLAGS=""
	          if [[ "${{ matrix.runner_label }}" == "nvidiagpu" ]]; then
	            # Abort the step if the runner did not provide a GPU mask.
	            : "${CUDA_VISIBLE_DEVICES:?CUDA_VISIBLE_DEVICES not set on NVIDIA runner}"
	            NVIDIA_ENV_FLAGS=" \
	              -e CUDA_LAUNCH_BLOCKING=1 \
	              -e NV_ACC_NOTIFY=3 \
	              -e NV_ACC_DEBUG=1"
	          fi

	          # 3. Run the test suite using the matrix-defined flags.
	          #    ci_env and NVIDIA_ENV_FLAGS are left unquoted on purpose so
	          #    that they expand into separate docker arguments.
	          docker run ${{ matrix.flags }} \
	            ${ci_env} \
	            -e CI=true \
	            -e PYTHONFAULTHANDLER=1 \
	            -e DEVITO_LOGGING=DEBUG \
	            -e CODECOV_TOKEN \
	            ${NVIDIA_ENV_FLAGS} \
	            "${DOCKER_IMAGE}" \
	            pytest -vv -ra -l -s --full-trace --maxfail=1 \
	            --cov --cov-config=.coveragerc --cov-report=xml \
	            ${{ matrix.test_files }}

	      - name: Test examples
	        run: |
	          docker run ${{ matrix.flags }} "${DOCKER_IMAGE}" pytest ${{ matrix.test_examples }}

	      - name: Test examples with MPI
	        run: |
	          docker run ${{ matrix.flags }} --env DEVITO_MPI=1 "${DOCKER_IMAGE}" \
	            mpiexec -n 2 pytest ${{ matrix.test_examples }}

	      # Runs even when an earlier step failed.
	      - name: Clean up test image
	        if: always()
	        run: |
	          # Remove only the image we just built
	          docker rmi -f "${DOCKER_IMAGE}" || true

	          # Instead of a global prune, remove only the dangling layers
	          # created by this run (label ci-run=$GITHUB_RUN_ID)
	          docker image prune -f --filter "label=ci-run=${GITHUB_RUN_ID}"

Provide feedback

Saved searches

Use saved searches to filter your results more quickly

Remove dangling layers. #7938

Workflow file

Remove dangling layers. #7938

Uh oh!

Workflow file for this run