# Attempt to use bash in step #616
# Note: this file was flagged as containing bidirectional Unicode text that may
# be interpreted or compiled differently than it appears. To review, open the
# file in an editor that reveals hidden Unicode characters.
name: Run unit tests on GPUs

# Run on pushes that touch the library, its tests, the test/style scripts,
# the examples, the dev requirements or this workflow itself; the workflow can
# also be started manually.
on:
  push:
    paths:
      - "ignite/**"
      - "tests/ignite/**"
      - "tests/run_gpu_tests.sh"
      - "tests/run_code_style.sh"
      - "examples/**.py"
      - "requirements-dev.txt"
      - ".github/workflows/gpu-tests.yml"
  workflow_dispatch:
# One active run per group; a newer run in the same group cancels the one
# still in progress.
concurrency:
  # <workflow_name>-<branch_name>-<true || commit_sha (if branch is protected)>
  group: gpu-tests-${{ github.ref_name }}-${{ !(github.ref_protected) || github.sha }}
  cancel-in-progress: true
# Cherry-picked from https://github.com/pytorch/test-infra/blob/main/.github/workflows/linux_job_v2.yml
jobs:
  # Runs the GPU unit tests and the cifar10 example inside a long-lived docker
  # container named "pthd", once per PyTorch channel in the matrix.
  gpu-tests:
    strategy:
      matrix:
        pytorch-channel: [pytorch, pytorch-nightly]
      # Let the other channel finish even if one of them fails.
      fail-fast: false
    env:
      DOCKER_IMAGE: "pytorch/almalinux-builder:cuda12.4"
      REPOSITORY: ${{ github.repository }}
      PR_NUMBER: ${{ github.event.pull_request.number }}
    runs-on: linux.8xlarge.nvidia.gpu
    timeout-minutes: 85

    steps:
      - name: Clean workspace
        run: |
          echo "::group::Cleanup debug output"
          sudo rm -rfv "${GITHUB_WORKSPACE}"
          mkdir -p "${GITHUB_WORKSPACE}"
          echo "::endgroup::"

      - name: Checkout repository (pytorch/test-infra)
        uses: actions/checkout@v4
        with:
          # Provides the local actions referenced below as ./test-infra/.github/actions/*
          repository: pytorch/test-infra
          path: test-infra

      - name: Setup Linux
        uses: ./test-infra/.github/actions/setup-linux

      - name: Pull docker image
        uses: ./test-infra/.github/actions/pull-docker-image
        with:
          docker-image: ${{ env.DOCKER_IMAGE }}

      - name: Checkout repository (${{ github.repository }})
        uses: actions/checkout@v4
        with:
          # Support the use case where we need to checkout someone's fork
          repository: ${{ github.repository }}
          ref: ${{ github.ref }}
          path: ${{ github.repository }}
          fetch-depth: 1

      # Starts the detached container "pthd" with the checkout mounted at
      # /work; every later step runs its commands through `docker exec`.
      - name: Start Pytorch container
        working-directory: ${{ github.repository }}
        run: |
          docker run --name pthd --gpus=all --rm \
            --cap-add=SYS_PTRACE \
            --detach \
            --ipc=host \
            --security-opt seccomp=unconfined \
            --shm-size=2g \
            --tty \
            --ulimit stack=10485760:83886080 \
            -v "${PWD}:/work" \
            -w /work \
            "${DOCKER_IMAGE}"

          # Sanity-check the container: GPU visibility, mounted sources, tooling.
          script=$(cat << EOF
          set -xe
          nvidia-smi
          ls -alh
          conda --version
          python --version
          EOF
          )
          docker exec -t pthd /bin/bash -c "${script}"

      - name: Install PyTorch and dependencies
        continue-on-error: false
        run: |
          script=$(cat << EOF
          set -xe

          # Install PyTorch
          if [ "${{ matrix.pytorch-channel }}" == "pytorch" ]; then
            pip install --upgrade torch torchvision --index-url https://download.pytorch.org/whl/cu124
          else
            pip install --upgrade --pre torch torchvision --index-url https://download.pytorch.org/whl/nightly/cu124
          fi

          # Fails the step (non-zero exit) when CUDA is not available.
          python -c "import torch; print(torch.__version__, ', CUDA is available: ', torch.cuda.is_available()); exit(not torch.cuda.is_available())"
          pip list

          # Install dependencies
          pip install -r requirements-dev.txt
          pip install -e .
          EOF
          )
          docker exec -t pthd /bin/bash -c "${script}"

      # NOTE(review): interactive debugging aid that runs on every trigger;
      # presumably it holds the job until the session ends — confirm, and
      # remove or gate it behind an `if:` once debugging is done.
      # `shell` is only valid on `run` steps, so it is not set here.
      - name: Debug with tmate
        uses: mxschmitt/action-tmate@v3
        with:
          sudo: false

      # Retries use USE_LAST_FAILED=1 inside the container.
      # NOTE(review): the action's version tag was garbled in the reviewed
      # copy; v2.9.0 is an assumption — confirm before merging.
      - name: Run GPU Unit Tests
        continue-on-error: false
        uses: nick-fields/retry@v2.9.0
        with:
          max_attempts: 5
          timeout_minutes: 45
          shell: bash
          command: docker exec -t pthd /bin/bash -xec 'bash tests/run_gpu_tests.sh 2'
          new_command_on_retry: docker exec -e USE_LAST_FAILED=1 -t pthd /bin/bash -xec 'bash tests/run_gpu_tests.sh 2'

      - name: Upload coverage to Codecov
        uses: codecov/codecov-action@v3
        with:
          # The repository is checked out under ${{ github.repository }} and
          # mounted into the container, so this is where the upload looks.
          file: ${{ github.repository }}/coverage.xml
          flags: gpu-2
          fail_ci_if_error: false

      - name: Run examples in container
        continue-on-error: false
        run: |
          script=$(cat << EOF
          set -xe

          # Install additional example dependencies
          pip install fire

          # Check training on cifar10, run without backend
          ## initial run
          CI=1 python examples/cifar10/main.py run --checkpoint_every=200 --stop_iteration=500
          ## resume
          CI=1 python examples/cifar10/main.py run --checkpoint_every=200 --num_epochs=7 --resume-from=/tmp/output-cifar10/resnet18_backend-None-1_stop-on-500/training_checkpoint_400.pt

          # Check training on cifar10, run with NCCL backend using torchrun
          ## initial run
          CI=1 torchrun --nproc_per_node=2 examples/cifar10/main.py run --backend=nccl --checkpoint_every=200 --stop_iteration=500
          ## resume
          CI=1 torchrun --nproc_per_node=2 examples/cifar10/main.py run --backend=nccl --checkpoint_every=200 --num_epochs=7 --resume-from=/tmp/output-cifar10/resnet18_backend-nccl-2_stop-on-500/training_checkpoint_400.pt

          # Check training on cifar10, run with NCCL backend using spawn
          ## initial run
          CI=1 python -u examples/cifar10/main.py run --backend=nccl --nproc_per_node=2 --checkpoint_every=200 --stop_iteration=500
          ## resume
          CI=1 python -u examples/cifar10/main.py run --backend=nccl --nproc_per_node=2 --checkpoint_every=200 --num_epochs=7 --resume-from=/tmp/output-cifar10/resnet18_backend-nccl-2_stop-on-500/training_checkpoint_400.pt
          EOF
          )
          docker exec -t pthd /bin/bash -c "${script}"

      # Always runs, even when an earlier step failed.
      - name: Teardown Linux
        if: ${{ always() }}
        uses: ./test-infra/.github/actions/teardown-linux