-
Notifications
You must be signed in to change notification settings - Fork 256
162 lines (141 loc) · 5.94 KB
/
pytest-gpu.yml
File metadata and controls
162 lines (141 loc) · 5.94 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
# Runner information:
# - OpenACC/OpenMP on NVIDIA runs on runners labeled `nvidiagpu`
# - OpenMP on AMD runs on runners labeled `amdgpu`
#
# Changes vs original:
# * Respect CUDA_VISIBLE_DEVICES for NVIDIA jobs by passing it AND restricting Docker with --gpus "device=…"
# * Tag images and container names with ${{ runner.name }} to avoid cross-runner races and maximize cache reuse
# * Remove docker prune / global container deletes (we assume disk space is fine)
# * Add comments throughout
name: CI-gpu

# At most one run per ref; a newer push cancels the in-flight run.
concurrency:
  group: ${{ github.workflow }}-${{ github.ref }}
  cancel-in-progress: true

env:
  OUTPUT_PATH: ${{ github.workspace }}
  RESOURCE_GROUP: CI-gpu

# NOTE: `on` is a YAML 1.1 truthy key; GitHub's workflow parser handles it
# as-is (suppress yamllint `truthy` here rather than quoting it).
on:
  push:
    branches: [main]
  pull_request:
    branches: [main]
  workflow_dispatch:
    inputs:
      tags:
        description: 'Run GPU tests'

jobs:
  build:
    name: ${{ matrix.name }}
    # Self-hosted GPU runners, selected per-vendor via the matrix label.
    runs-on:
      - self-hosted
      - ${{ matrix.runner_label }}

    strategy:
      fail-fast: false
      matrix:
        name:
          - pytest-gpu-acc-nvidia
          - pytest-gpu-omp-amd
        test_examples: ["examples/seismic/tti/tti_example.py examples/seismic/acoustic/acoustic_example.py examples/seismic/viscoacoustic/viscoacoustic_example.py examples/seismic/viscoelastic/viscoelastic_example.py examples/seismic/elastic/elastic_example.py"]
        include:
          # -------------------- NVIDIA job --------------------
          - name: pytest-gpu-acc-nvidia
            test_files: "tests/test_adjoint.py tests/test_gpu_common.py tests/test_gpu_openacc.py"
            base: "devitocodes/bases:nvidia-nvc"
            runner_label: nvidiagpu
            test_drive_cmd: "nvidia-smi"
            # Respect CUDA_VISIBLE_DEVICES and also hard-limit Docker to that device.
            # NOTE: CUDA_VISIBLE_DEVICES must be set by the runner (systemd drop-in etc.).
            # ${CONTAINER_BASENAME} / ${CUDA_VISIBLE_DEVICES} are expanded by the
            # step's shell at `docker run` time, not by the workflow parser.
            flags: >-
              --init --rm -t
              --name ${CONTAINER_BASENAME}
              --env CUDA_VISIBLE_DEVICES
              --gpus "device=${CUDA_VISIBLE_DEVICES:-all}"
          # -------------------- AMD job -----------------------
          - name: pytest-gpu-omp-amd
            test_files: "tests/test_adjoint.py tests/test_gpu_common.py tests/test_gpu_openmp.py"
            runner_label: amdgpu
            base: "devitocodes/bases:amd"
            test_drive_cmd: "rocm-smi"
            # Unchanged, still passes through required /dev nodes etc.
            flags: >-
              --init --network=host
              --device=/dev/kfd --device=/dev/dri
              --ipc=host
              --group-add video --group-add "$(getent group render | cut -d: -f3)"
              --cap-add=SYS_PTRACE --security-opt seccomp=unconfined
              --rm -t
              --name ${CONTAINER_BASENAME}

    steps:
      - name: Checkout devito
        uses: actions/checkout@v4

      - name: Set per-runner tags
        run: |
          # Runner names may contain spaces; sanitize ONCE and reuse the result
          # everywhere (docker image tags, container names and buildx builder
          # names all reject spaces).
          SAFE_RUNNER_NAME=${RUNNER_NAME// /_}
          echo "SAFE_RUNNER_NAME=${SAFE_RUNNER_NAME}" >> "$GITHUB_ENV"
          echo "DOCKER_IMAGE=${{ matrix.name }}-${SAFE_RUNNER_NAME}" >> "$GITHUB_ENV"
          echo "CONTAINER_BASENAME=testrun-${{ matrix.name }}-${SAFE_RUNNER_NAME}" >> "$GITHUB_ENV"

      - name: Build docker image
        run: |
          # Use the sanitized name: the raw $RUNNER_NAME may contain spaces,
          # which `docker buildx create --name` rejects and which would also
          # word-split the unquoted expansion.
          docker buildx create --use --name "$SAFE_RUNNER_NAME" || true
          # Layers are labeled with the run id so cleanup can prune only what
          # this run created.
          docker build . \
            --builder "$SAFE_RUNNER_NAME" \
            --label ci-run=$GITHUB_RUN_ID \
            --rm --pull \
            --file docker/Dockerfile.devito \
            --tag "${DOCKER_IMAGE}" \
            --build-arg base="${{ matrix.base }}"

      - name: Export CODECOV token
        run: echo "CODECOV_TOKEN=${{ secrets.CODECOV_TOKEN }}" >> "$GITHUB_ENV"

      - name: Probe gpu
        run: |
          # Make sure CUDA_VISIBLE_DEVICES is at least *something* on NVIDIA
          # runners; fall back to "all" so the driver probe does not fail.
          # Export it for THIS step too: $GITHUB_ENV only takes effect in
          # subsequent steps, so without the export the probe below would not
          # see the fallback value.
          if [[ "${{ matrix.runner_label }}" == "nvidiagpu" && -z "${CUDA_VISIBLE_DEVICES:-}" ]]; then
            export CUDA_VISIBLE_DEVICES=all
            echo "CUDA_VISIBLE_DEVICES=all" >> "$GITHUB_ENV"
          fi
          # Run a simple driver-probe command (nvidia-smi / rocm-smi)
          docker run ${{ matrix.flags }} "${DOCKER_IMAGE}" ${{ matrix.test_drive_cmd }}

      - name: Test with pytest
        env:
          # Exported earlier in the job; needed inside the container for codecov
          CODECOV_TOKEN: ${{ env.CODECOV_TOKEN }}
        run: |
          # 1. Add Codecov's environment variables (GITHUB_SHA, etc.)
          ci_env=$(bash <(curl -s https://codecov.io/env))
          # 2. For NVIDIA, make sure the runner gave us a GPU mask
          # Default to empty
          NVIDIA_ENV_FLAGS=""
          # For NVIDIA, check for device mask AND set specific env vars
          # (the :? expansion aborts the step with the given message if the
          # mask is still unset/empty at this point)
          if [[ "${{ matrix.runner_label }}" == "nvidiagpu" ]]; then
            : "${CUDA_VISIBLE_DEVICES:?CUDA_VISIBLE_DEVICES not set on NVIDIA runner}"
            NVIDIA_ENV_FLAGS=" \
              -e CUDA_LAUNCH_BLOCKING=1 \
              -e NV_ACC_NOTIFY=3 \
              -e NV_ACC_DEBUG=1"
          fi
          # 3. Run the test suite using the matrix-defined flags
          docker run ${{ matrix.flags }} \
            ${ci_env} \
            -e CI=true \
            -e PYTHONFAULTHANDLER=1 \
            -e DEVITO_LOGGING=DEBUG \
            -e CODECOV_TOKEN \
            ${NVIDIA_ENV_FLAGS} \
            "${DOCKER_IMAGE}" \
            pytest -vv -ra -l -s --full-trace --maxfail=1 \
              --cov --cov-config=.coveragerc --cov-report=xml \
              ${{ matrix.test_files }}

      - name: Test examples
        run: |
          docker run ${{ matrix.flags }} "${DOCKER_IMAGE}" pytest ${{ matrix.test_examples }}

      - name: Test examples with MPI
        run: |
          docker run ${{ matrix.flags }} --env DEVITO_MPI=1 "${DOCKER_IMAGE}" \
            mpiexec -n 2 pytest ${{ matrix.test_examples }}

      - name: Clean up test image
        if: always()
        run: |
          # Remove only the image we just built
          docker rmi -f "${DOCKER_IMAGE}" || true
          # Prune only the layers this run created (those carrying the
          # ci-run=$GITHUB_RUN_ID label) instead of a global prune.
          docker image prune -f --filter "label=ci-run=$GITHUB_RUN_ID"