|
1 | | -# Runner information: |
| 1 | +# Workflow information: |
2 | 2 | # - OpenACC/OpenMP on NVIDIA runs on runners labeled `nvidiagpu` |
3 | 3 | # - OpenMP on AMD runs on runners labeled `amdgpu` |
4 | | -# |
5 | | -# Changes vs original: |
6 | | -# * Respect CUDA_VISIBLE_DEVICES for NVIDIA jobs by passing it AND restricting Docker with --gpus "device=…" |
7 | | -# * Tag images and container names with ${{ runner.name }} to avoid cross-runner races and maximize cache reuse |
8 | | -# * Remove docker prune / global container deletes (we assume disk space is fine) |
9 | | -# * Add comments throughout |
| 4 | +# - Respect CUDA_VISIBLE_DEVICES for NVIDIA jobs by passing it AND restricting Docker with --gpus "device=…" |
| 5 | +# - Tag images and container names to avoid cross-runner races and maximize cache reuse |
10 | 6 |
|
11 | 7 | name: CI-gpu |
12 | 8 |
|
@@ -37,143 +33,104 @@ jobs: |
37 | 33 | - self-hosted |
38 | 34 | - ${{ matrix.runner_label }} |
39 | 35 |
|
40 | | - outputs: |
41 | | - unique : ${{ steps.uniquetag.outputs.unique }} |
42 | | - |
43 | 36 | strategy: |
44 | 37 | fail-fast: false |
45 | 38 | matrix: |
46 | 39 | name: [pytest-gpu-acc-nvidia, pytest-gpu-omp-amd] |
47 | 40 | test_examples: ["examples/seismic/tti/tti_example.py examples/seismic/acoustic/acoustic_example.py examples/seismic/viscoacoustic/viscoacoustic_example.py examples/seismic/viscoelastic/viscoelastic_example.py examples/seismic/elastic/elastic_example.py"] |
48 | 41 |
|
49 | 42 | include: |
50 | | - # -------------------- NVIDIA job -------------------- |
51 | | - - name: pytest-gpu-acc-nvidia |
52 | | - test_files: "tests/test_adjoint.py tests/test_gpu_common.py tests/test_gpu_openacc.py tests/test_operator.py::TestEstimateMemory" |
53 | | - base: "devitocodes/bases:nvidia-nvc12" |
54 | | - runner_label: nvidiagpu |
55 | | - test_drive_cmd: "nvidia-smi" |
56 | | - # Respect CUDA_VISIBLE_DEVICES and also hard-limit Docker to that device. |
57 | | - # NOTE: CUDA_VISIBLE_DEVICES must be set by the runner (systemd drop-in etc.). |
58 | | - dockerflags: >- |
59 | | - --init --rm -t |
60 | | - --name ${CONTAINER_BASENAME} |
61 | | - --gpus "device=${CUDA_VISIBLE_DEVICES:-all}" |
62 | | -
|
63 | | - # -------------------- AMD job ----------------------- |
64 | | - - name: pytest-gpu-omp-amd |
65 | | - test_files: "tests/test_adjoint.py tests/test_gpu_common.py tests/test_gpu_openmp.py tests/test_operator.py::TestEstimateMemory" |
66 | | - runner_label: amdgpu |
67 | | - base: "devitocodes/bases:amd" |
68 | | - test_drive_cmd: "rocm-smi" |
69 | | - # Unchanged, still passes through required /dev nodes etc. |
70 | | - dockerflags: >- |
71 | | - --init --network=host |
72 | | - --device=/dev/kfd --device=/dev/dri |
73 | | - --ipc=host |
74 | | - --group-add video --group-add "$(getent group render | cut -d: -f3)" |
75 | | - --cap-add=SYS_PTRACE --security-opt seccomp=unconfined |
76 | | - --rm -t |
77 | | - --name ${CONTAINER_BASENAME} |
| 43 | + # -------------------- NVIDIA job -------------------- |
| 44 | + - name: pytest-gpu-acc-nvidia |
| 45 | + test_files: "tests/test_adjoint.py tests/test_gpu_common.py tests/test_gpu_openacc.py tests/test_operator.py::TestEstimateMemory" |
| 46 | + base: "devitocodes/bases:nvidia-nvc12" |
| 47 | + runner_label: nvidiagpu |
| 48 | + test_drive_cmd: "nvidia-smi" |
| 49 | + # Respect CUDA_VISIBLE_DEVICES and also hard-limit Docker to that device. |
| 50 | + # NOTE: CUDA_VISIBLE_DEVICES must be set by the runner (systemd drop-in etc.). |
| 51 | + dockerflags: --gpus "device=${CUDA_VISIBLE_DEVICES:-all}" |
| 52 | + |
| 53 | + # -------------------- AMD job ----------------------- |
| 54 | + - name: pytest-gpu-omp-amd |
| 55 | + test_files: "tests/test_adjoint.py tests/test_gpu_common.py tests/test_gpu_openmp.py tests/test_operator.py::TestEstimateMemory" |
| 56 | + runner_label: amdgpu |
| 57 | + base: "devitocodes/bases:amd" |
| 58 | + test_drive_cmd: "rocm-smi" |
| 59 | + # Pass through the required /dev nodes, groups, etc. to the container. |
| 60 | + dockerflags: >- |
| 61 | + --network=host |
| 62 | + --device=/dev/kfd --device=/dev/dri |
| 63 | + --ipc=host |
| 64 | + --group-add video --group-add "$(getent group render | cut -d: -f3)" |
| 65 | + --cap-add=SYS_PTRACE --security-opt seccomp=unconfined |
78 | 66 |
|
79 | 67 | steps: |
80 | | - - name: Checkout devito |
81 | | - uses: actions/checkout@v6 |
82 | | - |
83 | | - - name: Generate unique CI tag |
84 | | - id: uniquetag |
85 | | - run: | |
86 | | - UNIQUE=$(echo "${GITHUB_RUN_ID}_${GITHUB_RUN_ATTEMPT}" | cksum | cut -f 1 -d " ") |
87 | | - echo "Unique ID: ${UNIQUE}" |
88 | | - echo "unique=${UNIQUE}" >> "$GITHUB_OUTPUT" |
89 | | -
|
90 | | - - name: Set per-runner tags |
91 | | - env: |
92 | | - UNIQUE: ${{ steps.uniquetag.outputs.unique }} |
93 | | - run: | |
94 | | - echo "DOCKER_IMAGE=${{ matrix.name }}-${RUNNER_NAME// /_}-${UNIQUE}" >> "$GITHUB_ENV" |
95 | | - echo "CONTAINER_BASENAME=testrun-${{ matrix.name }}-${RUNNER_NAME// /_}-${{ github.sha }}" >> "$GITHUB_ENV" |
96 | | -
|
97 | | - - name: Ensure buildx builder |
98 | | - run: | |
99 | | - docker buildx inspect "${RUNNER_NAME// /_}" >/dev/null 2>&1 || \ |
100 | | - docker buildx create --name "${RUNNER_NAME// /_}" --driver docker-container |
101 | | - docker buildx use "${RUNNER_NAME// /_}" |
102 | | -
|
103 | | - - name: Build docker image |
104 | | - run: | |
105 | | - docker buildx build . \ |
106 | | - --builder "${RUNNER_NAME// /_}" \ |
107 | | - --load \ |
108 | | - --label ci-run="$GITHUB_RUN_ID" \ |
109 | | - --rm --pull \ |
110 | | - --file docker/Dockerfile.devito \ |
111 | | - --tag "${DOCKER_IMAGE}" \ |
112 | | - --build-arg base="${{ matrix.base }}" |
113 | | -
|
114 | | - - name: Export CODECOV token |
115 | | - run: echo "CODECOV_TOKEN=${{ secrets.CODECOV_TOKEN }}" >> "$GITHUB_ENV" |
116 | | - |
117 | | - - name: Probe gpu |
118 | | - run: | |
119 | | - # Make sure CUDA_VISIBLE_DEVICES is at least *something* on NVIDIA |
120 | | - # runners; fall back to "all" so the driver probe does not fail. |
121 | | - if [[ "${{ matrix.runner_label }}" == "nvidiagpu" && -z "${CUDA_VISIBLE_DEVICES:-}" ]]; then |
122 | | - echo "CUDA_VISIBLE_DEVICES=all" >> "$GITHUB_ENV" |
123 | | - fi |
124 | | -
|
125 | | - # Run a simple driver-probe command (nvidia-smi / rocm-smi) |
126 | | - docker rm -f "${CONTAINER_BASENAME}" 2>/dev/null || true |
127 | | - docker run ${{ matrix.dockerflags }} "${DOCKER_IMAGE}" ${{ matrix.test_drive_cmd }} |
128 | | -
|
129 | | - - name: Test with pytest |
130 | | - env: |
131 | | - # Exported earlier in the job; needed inside the container for codecov |
132 | | - CODECOV_TOKEN: ${{ env.CODECOV_TOKEN }} |
133 | | - run: | |
134 | | - # Add Codecov’s environment variables (GITHUB_SHA, etc.) |
135 | | - ci_env=$(bash <(curl -s https://codecov.io/env)) |
136 | | -
|
137 | | - # Run the test suite using the matrix-defined flags |
138 | | - docker run \ |
139 | | - ${{ matrix.dockerflags }} \ |
140 | | - "${ci_env}" \ |
141 | | - --env CI=true \ |
142 | | - --env PYTHONFAULTHANDLER=1 \ |
143 | | - --env DEVITO_LOGGING=DEBUG \ |
144 | | - --env CODECOV_TOKEN \ |
145 | | - "${DOCKER_IMAGE}" \ |
146 | | - pytest -vvv --capture=no --showlocals \ |
147 | | - --log-cli-level=DEBUG -o log_cli=true \ |
148 | | - --full-trace --durations=10 \ |
149 | | - --cov --cov-config=.coveragerc --cov-report=xml \ |
150 | | - ${{ matrix.test_files }} |
151 | | -
|
152 | | - - name: Test examples |
153 | | - run: | |
154 | | - docker run \ |
155 | | - ${{ matrix.dockerflags }} \ |
156 | | - "${DOCKER_IMAGE}" \ |
157 | | - pytest ${{ matrix.test_examples }} |
158 | | -
|
159 | | - - name: Test examples with MPI |
160 | | - run: | |
161 | | - docker run \ |
162 | | - ${{ matrix.dockerflags }} \ |
163 | | - --env DEVITO_MPI=1 \ |
164 | | - "${DOCKER_IMAGE}" \ |
165 | | - mpiexec -n 2 pytest ${{ matrix.test_examples }} |
166 | | -
|
167 | | - - name: Builder & image cleanup (keep 3 days of cache) |
168 | | - if: always() |
169 | | - run: | |
170 | | - # Remove only the test image we built |
171 | | - docker rmi -f "${DOCKER_IMAGE}" || true |
172 | | -
|
173 | | - # Classic image layers created in this job |
174 | | - docker image prune -f --filter label=ci-run="$GITHUB_RUN_ID" |
175 | | -
|
176 | | - # BuildKit cache: target the per-runner builder explicitly |
177 | | - docker builder prune --builder "${RUNNER_NAME// /_}" \ |
178 | | - -f \ |
179 | | - --filter "until=72h" |
| 68 | + - name: Checkout devito |
| 69 | + uses: actions/checkout@v6 |
| 70 | + |
| 71 | + - id: build |
| 72 | + name: Build docker image |
| 73 | + uses: ./.github/actions/docker-build |
| 74 | + with: |
| 75 | + file: docker/Dockerfile.devito |
| 76 | + tag: ${{ matrix.name }} |
| 77 | + base: ${{ matrix.base }} |
| 78 | + |
| | + # Run the matrix-defined driver probe (nvidia-smi / rocm-smi) in the freshly built image. |
| 79 | + - name: Probe GPU |
| 80 | + uses: ./.github/actions/docker-run |
| 81 | + with: |
| 82 | + uid: ${{ steps.build.outputs.unique }} |
| 83 | + tag: ${{ matrix.name }} |
| | + # Use the same docker flags as the test steps so the probe checks the same device setup the tests will run with. |
| | + args: ${{ matrix.dockerflags }} |
| 84 | + command: ${{ matrix.test_drive_cmd }} |
| 85 | + |
| 86 | + - name: Test with pytest |
| 87 | + uses: ./.github/actions/docker-run |
| 88 | + with: |
| 89 | + uid: ${{ steps.build.outputs.unique }} |
| 90 | + tag: ${{ matrix.name }} |
| 91 | + args: ${{ matrix.dockerflags }} |
| 92 | + env: | |
| 93 | + CI=true |
| 94 | + CODECOV_TOKEN=${{ secrets.CODECOV_TOKEN }} |
| 95 | + DEVITO_LOGGING=DEBUG |
| 96 | + PYTHONFAULTHANDLER=1 |
| 97 | + command: | |
| 98 | + pytest \ |
| 99 | + -vvv \ |
| 100 | + --capture=no \ |
| 101 | + --showlocals \ |
| 102 | + --log-cli-level=DEBUG \ |
| 103 | + -o log_cli=true \ |
| 104 | + --full-trace \ |
| 105 | + --durations=10 \ |
| 106 | + --cov \ |
| 107 | + --cov-config=.coveragerc \ |
| 108 | + --cov-report=xml \ |
| 109 | + ${{ matrix.test_files }} |
| 110 | +
|
| | + # Run the example scripts listed in matrix.test_examples through pytest. |
| 111 | + - name: Test examples |
| 112 | + uses: ./.github/actions/docker-run |
| 113 | + with: |
| 114 | + uid: ${{ steps.build.outputs.unique }} |
| 115 | + tag: ${{ matrix.name }} |
| 116 | + args: ${{ matrix.dockerflags }} |
| | + # No extra environment variables for this step; the empty value is spelled out explicitly. |
| 117 | + env: "" |
| 118 | + command: pytest ${{ matrix.test_examples }} |
| 119 | + |
| 120 | + - name: Test examples with MPI |
| 121 | + uses: ./.github/actions/docker-run |
| 122 | + with: |
| 123 | + uid: ${{ steps.build.outputs.unique }} |
| 124 | + tag: ${{ matrix.name }} |
| 125 | + args: ${{ matrix.dockerflags }} |
| 126 | + env: | |
| 127 | + DEVITO_LOGGING=DEBUG |
| 128 | + DEVITO_MPI=1 |
| 129 | + command: mpiexec -n 2 pytest ${{ matrix.test_examples }} |
| 130 | + |
| 131 | + - name: Cleanup docker image |
| 132 | + if: always() |
| 133 | + uses: ./.github/actions/docker-clean |
| 134 | + with: |
| 135 | + uid: ${{ steps.build.outputs.unique }} |
| 136 | + tag: ${{ matrix.name }} |
0 commit comments