-
Notifications
You must be signed in to change notification settings - Fork 256
162 lines (141 loc) · 5.94 KB
/
pytest-gpu.yml
File metadata and controls
162 lines (141 loc) · 5.94 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
# Runner information:
# - OpenACC/OpenMP on NVIDIA runs on runners labeled `nvidiagpu`
# - OpenMP on AMD runs on runners labeled `amdgpu`
#
# Changes vs original:
# * Respect CUDA_VISIBLE_DEVICES for NVIDIA jobs by passing it AND restricting Docker with --gpus "device=…"
# * Tag images and container names with ${{ runner.name }} to avoid cross-runner races and maximize cache reuse
# * Remove docker prune / global container deletes (we assume disk space is fine)
# * Add comments throughout
name: CI-gpu

# At most one run per ref; a newer push cancels the in-flight run.
concurrency:
  group: ${{ github.workflow }}-${{ github.ref }}
  cancel-in-progress: true

env:
  OUTPUT_PATH: ${{ github.workspace }}
  RESOURCE_GROUP: CI-gpu

# NOTE: `on` is a YAML 1.1 truthy key; GitHub's workflow parser handles it
# as-is (suppress yamllint `truthy` here rather than quoting it).
on:
  push:
    branches: [main]
  pull_request:
    branches: [main]
  workflow_dispatch:
    inputs:
      tags:
        description: 'Run GPU tests'

jobs:
  build:
    name: ${{ matrix.name }}
    # Self-hosted GPU runners, selected per-vendor via the matrix label.
    runs-on:
      - self-hosted
      - ${{ matrix.runner_label }}

    strategy:
      fail-fast: false
      matrix:
        name:
          - pytest-gpu-acc-nvidia
          - pytest-gpu-omp-amd
        test_examples: ["examples/seismic/tti/tti_example.py examples/seismic/acoustic/acoustic_example.py examples/seismic/viscoacoustic/viscoacoustic_example.py examples/seismic/viscoelastic/viscoelastic_example.py examples/seismic/elastic/elastic_example.py"]
        include:
          # -------------------- NVIDIA job --------------------
          - name: pytest-gpu-acc-nvidia
            test_files: "tests/test_adjoint.py tests/test_gpu_common.py tests/test_gpu_openacc.py"
            base: "devitocodes/bases:nvidia-nvc"
            runner_label: nvidiagpu
            test_drive_cmd: "nvidia-smi"
            # Respect CUDA_VISIBLE_DEVICES and also hard-limit Docker to that device.
            # NOTE: CUDA_VISIBLE_DEVICES must be set by the runner (systemd drop-in etc.).
            # ${CONTAINER_BASENAME} / ${CUDA_VISIBLE_DEVICES} are expanded by the
            # step's shell at `docker run` time, not by the workflow parser.
            flags: >-
              --init --rm -t
              --name ${CONTAINER_BASENAME}
              --env CUDA_VISIBLE_DEVICES
              --gpus "device=${CUDA_VISIBLE_DEVICES:-all}"
          # -------------------- AMD job -----------------------
          - name: pytest-gpu-omp-amd
            test_files: "tests/test_adjoint.py tests/test_gpu_common.py tests/test_gpu_openmp.py"
            runner_label: amdgpu
            base: "devitocodes/bases:amd"
            test_drive_cmd: "rocm-smi"
            # Unchanged, still passes through required /dev nodes etc.
            flags: >-
              --init --network=host
              --device=/dev/kfd --device=/dev/dri
              --ipc=host
              --group-add video --group-add "$(getent group render | cut -d: -f3)"
              --cap-add=SYS_PTRACE --security-opt seccomp=unconfined
              --rm -t
              --name ${CONTAINER_BASENAME}

    steps:
      - name: Checkout devito
        uses: actions/checkout@v4

      - name: Set per-runner tags
        run: |
          # Runner names may contain spaces; sanitize ONCE and reuse the result
          # everywhere (docker image tags, container names and buildx builder
          # names all reject spaces).
          SAFE_RUNNER_NAME=${RUNNER_NAME// /_}
          echo "SAFE_RUNNER_NAME=${SAFE_RUNNER_NAME}" >> "$GITHUB_ENV"
          echo "DOCKER_IMAGE=${{ matrix.name }}-${SAFE_RUNNER_NAME}" >> "$GITHUB_ENV"
          echo "CONTAINER_BASENAME=testrun-${{ matrix.name }}-${SAFE_RUNNER_NAME}" >> "$GITHUB_ENV"

      - name: Build docker image
        run: |
          # Use the sanitized name: the raw $RUNNER_NAME may contain spaces,
          # which `docker buildx create --name` rejects and which would also
          # word-split the unquoted expansion.
          docker buildx create --use --name "$SAFE_RUNNER_NAME" || true
          # Layers are labeled with the run id so cleanup can prune only what
          # this run created.
          docker build . \
            --builder "$SAFE_RUNNER_NAME" \
            --label ci-run=$GITHUB_RUN_ID \
            --rm --pull \
            --file docker/Dockerfile.devito \
            --tag "${DOCKER_IMAGE}" \
            --build-arg base="${{ matrix.base }}"

      - name: Export CODECOV token
        run: echo "CODECOV_TOKEN=${{ secrets.CODECOV_TOKEN }}" >> "$GITHUB_ENV"

      - name: Probe gpu
        run: |
          # Make sure CUDA_VISIBLE_DEVICES is at least *something* on NVIDIA
          # runners; fall back to "all" so the driver probe does not fail.
          # Export it for THIS step too: $GITHUB_ENV only takes effect in
          # subsequent steps, so without the export the probe below would not
          # see the fallback value.
          if [[ "${{ matrix.runner_label }}" == "nvidiagpu" && -z "${CUDA_VISIBLE_DEVICES:-}" ]]; then
            export CUDA_VISIBLE_DEVICES=all
            echo "CUDA_VISIBLE_DEVICES=all" >> "$GITHUB_ENV"
          fi
          # Run a simple driver-probe command (nvidia-smi / rocm-smi)
          docker run ${{ matrix.flags }} "${DOCKER_IMAGE}" ${{ matrix.test_drive_cmd }}

      - name: Test with pytest
        env:
          # Exported earlier in the job; needed inside the container for codecov
          CODECOV_TOKEN: ${{ env.CODECOV_TOKEN }}
        run: |
          # 1. Add Codecov's environment variables (GITHUB_SHA, etc.)
          ci_env=$(bash <(curl -s https://codecov.io/env))
          # 2. For NVIDIA, make sure the runner gave us a GPU mask
          # Default to empty
          NVIDIA_ENV_FLAGS=""
          # For NVIDIA, check for device mask AND set specific env vars
          # (the :? expansion aborts the step with the given message if the
          # mask is still unset/empty at this point)
          if [[ "${{ matrix.runner_label }}" == "nvidiagpu" ]]; then
            : "${CUDA_VISIBLE_DEVICES:?CUDA_VISIBLE_DEVICES not set on NVIDIA runner}"
            NVIDIA_ENV_FLAGS=" \
              -e CUDA_LAUNCH_BLOCKING=1 \
              -e NV_ACC_NOTIFY=3 \
              -e NV_ACC_DEBUG=1"
          fi
          # 3. Run the test suite using the matrix-defined flags
          docker run ${{ matrix.flags }} \
            ${ci_env} \
            -e CI=true \
            -e PYTHONFAULTHANDLER=1 \
            -e DEVITO_LOGGING=DEBUG \
            -e CODECOV_TOKEN \
            ${NVIDIA_ENV_FLAGS} \
            "${DOCKER_IMAGE}" \
            pytest -vv -ra -l -s --full-trace --maxfail=1 \
              --cov --cov-config=.coveragerc --cov-report=xml \
              ${{ matrix.test_files }}

      - name: Test examples
        run: |
          docker run ${{ matrix.flags }} "${DOCKER_IMAGE}" pytest ${{ matrix.test_examples }}

      - name: Test examples with MPI
        run: |
          docker run ${{ matrix.flags }} --env DEVITO_MPI=1 "${DOCKER_IMAGE}" \
            mpiexec -n 2 pytest ${{ matrix.test_examples }}

      - name: Clean up test image
        if: always()
        run: |
          # Remove only the image we just built
          docker rmi -f "${DOCKER_IMAGE}" || true
          # Prune only the layers this run created (those carrying the
          # ci-run=$GITHUB_RUN_ID label) instead of a global prune.
          docker image prune -f --filter "label=ci-run=$GITHUB_RUN_ID"