diff --git a/CHANGELOG.md b/CHANGELOG.md index b80f76fa..cadc35b1 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -41,6 +41,7 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0 - Arch `from_config` bug for literal params. - Fixed fused SiLU activation test. - Update `np.bool` to `np.bool_`. +- Added a workaround fix for the CUDA graphs error in multi-node runs. ### Security diff --git a/modulus/sym/trainer.py b/modulus/sym/trainer.py index 2aa4d5ea..9010c5f4 100644 --- a/modulus/sym/trainer.py +++ b/modulus/sym/trainer.py @@ -737,6 +737,10 @@ def _cuda_graph_training_step(self, step: int): self.g = torch.cuda.CUDAGraph() self.global_optimizer_model.zero_grad(set_to_none=True) + # TODO: temporary workaround until this issue is fixed: + # https://github.com/pytorch/pytorch/pull/104487#issuecomment-1638665876 + delay = os.environ.get("MODULUS_CUDA_GRAPH_CAPTURE_DELAY", "10") + time.sleep(int(delay)) with torch.cuda.graph(self.g): # compute gradients self.loss_static, self.losses_static = self.compute_gradients(