From 85733e9623cec3526b8f4870aadefdbc696931a8 Mon Sep 17 00:00:00 2001 From: Simon Byrne Date: Fri, 15 Nov 2024 09:48:07 -0800 Subject: [PATCH 1/2] Fix NCCL_ASYNC_ERROR_HANDLING deprecation warning It looks like the patch from https://github.com/pytorch/pytorch/pull/114077 landed in torch 2.2.0. Fixes #568. --- modulus/distributed/manager.py | 6 +++++- 1 file changed, 5 insertions(+), 1 deletion(-) diff --git a/modulus/distributed/manager.py b/modulus/distributed/manager.py index 61bc2687e9..a266fdd0c5 100644 --- a/modulus/distributed/manager.py +++ b/modulus/distributed/manager.py @@ -332,7 +332,11 @@ def initialize(): addr = os.getenv("MASTER_ADDR", "localhost") port = os.getenv("MASTER_PORT", "12355") # https://pytorch.org/docs/master/notes/cuda.html#id5 - os.environ["NCCL_ASYNC_ERROR_HANDLING"] = "0" + # the variable was renamed to TORCH_NCCL_ASYNC_ERROR_HANDLING in torch 2.2 + if torch.__version__ < (2, 2): + os.environ["NCCL_ASYNC_ERROR_HANDLING"] = "0" + else: + os.environ["TORCH_NCCL_ASYNC_ERROR_HANDLING"] = "0" initialization_method = os.getenv("MODULUS_DISTRIBUTED_INITIALIZATION_METHOD") if initialization_method is None: try: From 7166174c4651757a465c464720a4e379a7b30d42 Mon Sep 17 00:00:00 2001 From: Simon Byrne Date: Tue, 19 Nov 2024 13:34:52 -0800 Subject: [PATCH 2/2] Update CHANGELOG.md --- CHANGELOG.md | 2 ++ 1 file changed, 2 insertions(+) diff --git a/CHANGELOG.md b/CHANGELOG.md index 93824e625a..c25e069eaa 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -36,6 +36,8 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0 ### Fixed +- Fixed NCCL_ASYNC_ERROR_HANDLING deprecation warning + ### Security ### Dependencies