7070 --init --network=host
7171 --device=/dev/kfd --device=/dev/dri
7272 --ipc=host
73- --group-add video --group-add $(getent group render | cut -d: -f3)
73+ --group-add video --group-add " $(getent group render | cut -d: -f3)"
7474 --cap-add=SYS_PTRACE --security-opt seccomp=unconfined
7575 --rm -t
7676 --name ${CONTAINER_BASENAME}
8181
8282 - name : Set per-runner tags  # writes runner-specific image/container names to $GITHUB_ENV
8383 run : |
84- echo "RUNNER_NAME=$RUNNER_NAME" >> $GITHUB_ENV
8584 echo "DOCKER_IMAGE=${{ matrix.name }}-$RUNNER_NAME" >> $GITHUB_ENV
8685 echo "CONTAINER_BASENAME=testrun-${{ matrix.name }}-$RUNNER_NAME" >> $GITHUB_ENV
8786
@@ -101,40 +100,43 @@ jobs:
101100
102101 - name : Probe gpu
103102 run : |
104- # Run a simple driver cmd first (nvidia-smi / rocm-smi)
103+ # If an NVIDIA runner has no CUDA_VISIBLE_DEVICES, write the fallback "all"
104+ # to $GITHUB_ENV. NOTE(review): that only reaches later steps, not the probe below.
105+ if [[ "${{ matrix.runner_label }}" == "nvidiagpu" && -z "${CUDA_VISIBLE_DEVICES:-}" ]]; then
106+ echo "CUDA_VISIBLE_DEVICES=all" >> $GITHUB_ENV
107+ fi
108+
109+ # Run the matrix-defined driver-probe command (nvidia-smi / rocm-smi) in the image
105110 docker run ${{ matrix.flags }} "${DOCKER_IMAGE}" ${{ matrix.test_drive_cmd }}
106111
107112 - name : Test with pytest
108113 env :
109114 # Exported earlier in the job; needed inside the container for codecov
110115 CODECOV_TOKEN : ${{ env.CODECOV_TOKEN }}
111116 run : |
112- # ------------------------------------------------------------------
113117 # 1. Add Codecov’s environment variables (GITHUB_SHA, etc.)
114- # ------------------------------------------------------------------
115118 ci_env=$(bash <(curl -s https://codecov.io/env))
116119
117- # ------------------------------------------------------------------
118- # 2. Make sure the runner really gave us a GPU mask
119- # ------------------------------------------------------------------
120- : "${CUDA_VISIBLE_DEVICES:?CUDA_VISIBLE_DEVICES not set}"
121-
122- # ------------------------------------------------------------------
123- # 3. Run the test suite inside the image we just built
124- # ------------------------------------------------------------------
125- docker run \
126- --init --rm -t \
127- --name "${CONTAINER_BASENAME}" \
128- --env CUDA_VISIBLE_DEVICES="${CUDA_VISIBLE_DEVICES}" \
129- --gpus "device=${CUDA_VISIBLE_DEVICES}" \
120+ # 2. For NVIDIA, make sure the runner gave us a GPU mask
121+ # Extra docker -e flags for NVIDIA runners; stays empty on all other runners
122+ NVIDIA_ENV_FLAGS=""
123+ # For NVIDIA, check for device mask AND set specific env vars
124+ if [[ "${{ matrix.runner_label }}" == "nvidiagpu" ]]; then
125+ : "${CUDA_VISIBLE_DEVICES:?CUDA_VISIBLE_DEVICES not set on NVIDIA runner}"
126+ NVIDIA_ENV_FLAGS=" \
127+ -e CUDA_LAUNCH_BLOCKING=1 \
128+ -e NV_ACC_NOTIFY=3 \
129+ -e NV_ACC_DEBUG=1"
130+ fi
131+
132+ # 3. Run the test suite using the matrix-defined flags
133+ docker run ${{ matrix.flags }} \
130134 ${ci_env} \
131135 -e CI=true \
132136 -e PYTHONFAULTHANDLER=1 \
133- -e CUDA_LAUNCH_BLOCKING=1 \
134- -e NV_ACC_NOTIFY=3 \
135- -e NV_ACC_DEBUG=1 \
136137 -e DEVITO_LOGGING=DEBUG \
137138 -e CODECOV_TOKEN \
139+ ${NVIDIA_ENV_FLAGS} \
138140 "${DOCKER_IMAGE}" \
139141 pytest -vv -ra -l -s --full-trace --maxfail=1 \
140142 --cov --cov-config=.coveragerc --cov-report=xml \
0 commit comments