From bf58c35f5ceeaffd257ebc3f2397cceb3ce903e8 Mon Sep 17 00:00:00 2001 From: Kaustubh Tangsali <71059996+ktangsali@users.noreply.github.com> Date: Mon, 7 Aug 2023 10:50:32 -0700 Subject: [PATCH] Hot fix NCCL CUDA Graphs bug (#48) * add time delay as a temporary workaround * control delay via env variable * reduce default delay * Update CHANGELOG.md --- CHANGELOG.md | 1 + modulus/sym/trainer.py | 4 ++++ 2 files changed, 5 insertions(+) diff --git a/CHANGELOG.md b/CHANGELOG.md index b80f76fa..cadc35b1 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -41,6 +41,7 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0 - Arch `from_config` bug for literal params. - Fixed fused SiLU activation test. - Update `np.bool` to `np.bool_`. +- Added a workaround fix for the CUDA graphs error in multi-node runs ### Security diff --git a/modulus/sym/trainer.py b/modulus/sym/trainer.py index 2aa4d5ea..9010c5f4 100644 --- a/modulus/sym/trainer.py +++ b/modulus/sym/trainer.py @@ -737,6 +737,10 @@ def _cuda_graph_training_step(self, step: int): self.g = torch.cuda.CUDAGraph() self.global_optimizer_model.zero_grad(set_to_none=True) + # TODO: temporary workaround till this issue is fixed: + # https://github.com/pytorch/pytorch/pull/104487#issuecomment-1638665876 + delay = os.environ.get("MODULUS_CUDA_GRAPH_CAPTURE_DELAY", "10") + time.sleep(int(delay)) with torch.cuda.graph(self.g): # compute gradients self.loss_static, self.losses_static = self.compute_gradients(