Skip to content

Commit 91c0e4a

Browse files
committed
ci: bug fixes
1 parent 7f7456c commit 91c0e4a

1 file changed

Lines changed: 23 additions & 21 deletions

File tree

.github/workflows/pytest-gpu.yml

Lines changed: 23 additions & 21 deletions
Original file line numberDiff line numberDiff line change
@@ -70,7 +70,7 @@ jobs:
7070
--init --network=host
7171
--device=/dev/kfd --device=/dev/dri
7272
--ipc=host
73-
--group-add video --group-add $(getent group render | cut -d: -f3)
73+
--group-add video --group-add "$(getent group render | cut -d: -f3)"
7474
--cap-add=SYS_PTRACE --security-opt seccomp=unconfined
7575
--rm -t
7676
--name ${CONTAINER_BASENAME}
@@ -81,7 +81,6 @@ jobs:
8181

8282
- name: Set per-runner tags
8383
run: |
84-
echo "RUNNER_NAME=$RUNNER_NAME" >> $GITHUB_ENV
8584
echo "DOCKER_IMAGE=${{ matrix.name }}-$RUNNER_NAME" >> $GITHUB_ENV
8685
echo "CONTAINER_BASENAME=testrun-${{ matrix.name }}-$RUNNER_NAME" >> $GITHUB_ENV
8786
@@ -101,40 +100,43 @@ jobs:
101100

102101
- name: Probe gpu
103102
run: |
104-
# Run a simple driver cmd first (nvidia-smi / rocm-smi)
103+
# Make sure CUDA_VISIBLE_DEVICES is set to a non-empty value on NVIDIA
104+
# runners; fall back to "all". NOTE: a $GITHUB_ENV write only reaches later steps, not the probe below.
105+
if [[ "${{ matrix.runner_label }}" == "nvidiagpu" && -z "${CUDA_VISIBLE_DEVICES:-}" ]]; then
106+
echo "CUDA_VISIBLE_DEVICES=all" >> $GITHUB_ENV
107+
fi
108+
109+
# Run a simple driver-probe command (nvidia-smi / rocm-smi)
105110
docker run ${{ matrix.flags }} "${DOCKER_IMAGE}" ${{ matrix.test_drive_cmd }}
106111
107112
- name: Test with pytest
108113
env:
109114
# Exported earlier in the job; needed inside the container for codecov
110115
CODECOV_TOKEN: ${{ env.CODECOV_TOKEN }}
111116
run: |
112-
# ------------------------------------------------------------------
113117
# 1. Add Codecov’s environment variables (GITHUB_SHA, etc.)
114-
# ------------------------------------------------------------------
115118
ci_env=$(bash <(curl -s https://codecov.io/env))
116119
117-
# ------------------------------------------------------------------
118-
# 2. Make sure the runner really gave us a GPU mask
119-
# ------------------------------------------------------------------
120-
: "${CUDA_VISIBLE_DEVICES:?CUDA_VISIBLE_DEVICES not set}"
121-
122-
# ------------------------------------------------------------------
123-
# 3. Run the test suite inside the image we just built
124-
# ------------------------------------------------------------------
125-
docker run \
126-
--init --rm -t \
127-
--name "${CONTAINER_BASENAME}" \
128-
--env CUDA_VISIBLE_DEVICES="${CUDA_VISIBLE_DEVICES}" \
129-
--gpus "device=${CUDA_VISIBLE_DEVICES}" \
120+
# 2. For NVIDIA, make sure the runner gave us a GPU mask
121+
# Default: no extra flags (stays empty on non-NVIDIA runners)
122+
NVIDIA_ENV_FLAGS=""
123+
# On NVIDIA runners, require a device mask and add the NVIDIA-specific env vars
124+
if [[ "${{ matrix.runner_label }}" == "nvidiagpu" ]]; then
125+
: "${CUDA_VISIBLE_DEVICES:?CUDA_VISIBLE_DEVICES not set on NVIDIA runner}"
126+
NVIDIA_ENV_FLAGS=" \
127+
-e CUDA_LAUNCH_BLOCKING=1 \
128+
-e NV_ACC_NOTIFY=3 \
129+
-e NV_ACC_DEBUG=1"
130+
fi
131+
132+
# 3. Run the test suite using the matrix-defined flags
133+
docker run ${{ matrix.flags }} \
130134
${ci_env} \
131135
-e CI=true \
132136
-e PYTHONFAULTHANDLER=1 \
133-
-e CUDA_LAUNCH_BLOCKING=1 \
134-
-e NV_ACC_NOTIFY=3 \
135-
-e NV_ACC_DEBUG=1 \
136137
-e DEVITO_LOGGING=DEBUG \
137138
-e CODECOV_TOKEN \
139+
${NVIDIA_ENV_FLAGS} \
138140
"${DOCKER_IMAGE}" \
139141
pytest -vv -ra -l -s --full-trace --maxfail=1 \
140142
--cov --cov-config=.coveragerc --cov-report=xml \

0 commit comments

Comments
 (0)