Skip to content

Commit 639d77e

Browse files
committed
debugging
1 parent adc267e commit 639d77e

1 file changed

Lines changed: 55 additions & 42 deletions

File tree

.github/workflows/pytest-gpu.yml

Lines changed: 55 additions & 42 deletions
Original file line numberDiff line numberDiff line change
@@ -15,18 +15,17 @@ concurrency:
1515
cancel-in-progress: true
1616

1717
env:
18-
OUTPUT_PATH: ${{ github.workspace }}
1918
RESOURCE_GROUP: CI-gpu
2019

2120
on:
2221
push:
23-
branches: [ main ]
22+
branches: [main]
2423
pull_request:
25-
branches: [ main ]
24+
branches: [main]
2625
workflow_dispatch:
2726
inputs:
2827
tags:
29-
description: 'Run GPU tests'
28+
description: "Run GPU tests"
3029

3130
jobs:
3231
build:
@@ -38,42 +37,44 @@ jobs:
3837
strategy:
3938
fail-fast: false
4039
matrix:
41-
name: [
42-
pytest-gpu-acc-nvidia,
43-
pytest-gpu-omp-amd
44-
]
45-
test_examples: ["examples/seismic/tti/tti_example.py examples/seismic/acoustic/acoustic_example.py examples/seismic/viscoacoustic/viscoacoustic_example.py examples/seismic/viscoelastic/viscoelastic_example.py examples/seismic/elastic/elastic_example.py"]
40+
name: [pytest-gpu-acc-nvidia, pytest-gpu-omp-amd]
41+
test_examples:
42+
- examples/seismic/tti/tti_example.py
43+
- examples/seismic/acoustic/acoustic_example.py
44+
- examples/seismic/viscoacoustic/viscoacoustic_example.py
45+
- examples/seismic/viscoelastic/viscoelastic_example.py
46+
- examples/seismic/elastic/elastic_example.py
4647

4748
include:
48-
# -------------------- NVIDIA job --------------------
49-
- name: pytest-gpu-acc-nvidia
50-
test_files: "tests/test_adjoint.py tests/test_gpu_common.py tests/test_gpu_openacc.py"
51-
base: "devitocodes/bases:nvidia-nvc"
52-
runner_label: nvidiagpu
53-
test_drive_cmd: "nvidia-smi"
54-
# Respect CUDA_VISIBLE_DEVICES and also hard-limit Docker to that device.
55-
# NOTE: CUDA_VISIBLE_DEVICES must be set by the runner (systemd drop-in etc.).
56-
flags: >-
57-
--init --rm -t
58-
--name ${CONTAINER_BASENAME}
59-
--env CUDA_VISIBLE_DEVICES
60-
--gpus "device=${CUDA_VISIBLE_DEVICES:-all}"
61-
62-
# -------------------- AMD job -----------------------
63-
- name: pytest-gpu-omp-amd
64-
test_files: "tests/test_adjoint.py tests/test_gpu_common.py tests/test_gpu_openmp.py"
65-
runner_label: amdgpu
66-
base: "devitocodes/bases:amd"
67-
test_drive_cmd: "rocm-smi"
68-
# Unchanged, still passes through required /dev nodes etc.
69-
flags: >-
70-
--init --network=host
71-
--device=/dev/kfd --device=/dev/dri
72-
--ipc=host
73-
--group-add video --group-add "$(getent group render | cut -d: -f3)"
74-
--cap-add=SYS_PTRACE --security-opt seccomp=unconfined
75-
--rm -t
76-
--name ${CONTAINER_BASENAME}
49+
# -------------------- NVIDIA job --------------------
50+
- name: pytest-gpu-acc-nvidia
51+
test_files: "tests/test_adjoint.py tests/test_gpu_common.py tests/test_gpu_openacc.py"
52+
base: "devitocodes/bases:nvidia-nvc"
53+
runner_label: nvidiagpu
54+
test_drive_cmd: "nvidia-smi"
55+
# Respect CUDA_VISIBLE_DEVICES and also hard-limit Docker to that device.
56+
# NOTE: CUDA_VISIBLE_DEVICES must be set by the runner (systemd drop-in etc.).
57+
flags: >-
58+
--init --rm -t
59+
--name ${CONTAINER_BASENAME}
60+
--env CUDA_VISIBLE_DEVICES
61+
--gpus "device=${CUDA_VISIBLE_DEVICES:-all}"
62+
63+
# -------------------- AMD job -----------------------
64+
- name: pytest-gpu-omp-amd
65+
test_files: "tests/test_adjoint.py tests/test_gpu_common.py tests/test_gpu_openmp.py"
66+
runner_label: amdgpu
67+
base: "devitocodes/bases:amd"
68+
test_drive_cmd: "rocm-smi"
69+
# Unchanged, still passes through required /dev nodes etc.
70+
flags: >-
71+
--init --network=host
72+
--device=/dev/kfd --device=/dev/dri
73+
--ipc=host
74+
--group-add video --group-add "$(getent group render | cut -d: -f3)"
75+
--cap-add=SYS_PTRACE --security-opt seccomp=unconfined
76+
--rm -t
77+
--name ${CONTAINER_BASENAME}
7778
7879
steps:
7980
- name: Checkout devito
@@ -84,11 +85,16 @@ jobs:
8485
echo "DOCKER_IMAGE=${{ matrix.name }}-${RUNNER_NAME// /_}" >> $GITHUB_ENV
8586
echo "CONTAINER_BASENAME=testrun-${{ matrix.name }}-${RUNNER_NAME// /_}" >> $GITHUB_ENV
8687
88+
- name: Ensure buildx builder
89+
run: |
90+
docker buildx inspect "$RUNNER_NAME" >/dev/null 2>&1 || \
91+
docker buildx create --name "$RUNNER_NAME" --driver docker-container
92+
docker buildx use "$RUNNER_NAME"
93+
8794
- name: Build docker image
8895
run: |
89-
docker buildx create --use --name $RUNNER_NAME || true
90-
docker build . \
91-
--builder $RUNNER_NAME \
96+
docker buildx build . \
97+
--builder "$RUNNER_NAME" \
9298
--label ci-run=$GITHUB_RUN_ID \
9399
--rm --pull \
94100
--file docker/Dockerfile.devito \
@@ -138,7 +144,9 @@ jobs:
138144
-e CODECOV_TOKEN \
139145
${NVIDIA_ENV_FLAGS} \
140146
"${DOCKER_IMAGE}" \
141-
pytest -vv -ra -l -s --full-trace --maxfail=1 \
147+
pytest -vvv --capture=no --showlocals \
148+
--log-cli-level=DEBUG -o log_cli=true \
149+
--full-trace --durations=10 \
142150
--cov --cov-config=.coveragerc --cov-report=xml \
143151
${{ matrix.test_files }}
144152
@@ -160,3 +168,8 @@ jobs:
160168
# Guarded prune: remove only the image layers created by this run
161169
# (label ci-run=$GITHUB_RUN_ID)
162170
docker image prune -f --filter label=ci-run=$GITHUB_RUN_ID
171+
172+
# Drop build cache older than 7 days (168h) from this runner's own builder.
# The builder is selected with --builder; "name=" is not a valid prune
# filter (valid: until, id, parent, type, description, inuse, shared,
# private), so the previous form was rejected instead of pruning.
docker buildx prune --builder "$RUNNER_NAME" -f --filter "until=168h"
173+
174+
# May be overkill - commented out for now but left here as a reminder
175+
# docker buildx rm $RUNNER_NAME || true

0 commit comments

Comments
 (0)