From f32302c3d27bdac65c5ed21e2c44c2ca4ca654c5 Mon Sep 17 00:00:00 2001
From: Sangkug Lym <slym@nvidia.com>
Date: Tue, 28 Jan 2025 23:06:35 -0800
Subject: [PATCH 1/2] Add docs on env vars

Signed-off-by: Sangkug Lym <slym@nvidia.com>
---
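Reviewer note: the defaults documented in this patch remain overridable per
job. slurm_executor merges any user-supplied variables on top of them with
`env_vars |= custom_env_vars` (dict union, Python 3.9+), so user values win
on key collisions. A minimal sketch of those merge semantics, with a
hypothetical override (only the merge expression is taken from the diff):

    env_vars = {
        "TRANSFORMERS_OFFLINE": "1",  # Disable online downloads from HuggingFace
        "NCCL_NVLS_ENABLE": "0",  # Disable NVLink SHARP to save memory
    }
    custom_env_vars = {"NCCL_NVLS_ENABLE": "1"}  # illustrative user override
    env_vars |= custom_env_vars  # later operand wins on key collisions
    assert env_vars["NCCL_NVLS_ENABLE"] == "1"
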
 examples/llm/pretrain/default_executor.py      | 16 ++++++----------
 scripts/llm/pretraining.py                     | 18 +++++++-----------
 scripts/performance/utils.py                   | 14 ++++++--------
 tests/collections/llm/hf/peft_nemorun.py       |  6 ++----
 tests/collections/llm/hf/pretrain_nemorun.py   |  8 +++-----
 tests/collections/llm/hf/sft_nemorun.py        |  6 ++----
 tests/collections/llm/hf/sft_nemorun_fsdp2.py  |  8 +++-----
 .../llm/llama/nemo2-sft-peft/nemo2-peft.ipynb  |  4 ----
 .../llm/llama/nemo2-sft-peft/nemo2-sft.ipynb   |  4 ----
 9 files changed, 29 insertions(+), 55 deletions(-)

diff --git a/examples/llm/pretrain/default_executor.py b/examples/llm/pretrain/default_executor.py
index 6ebc874b2e39..89b2db21ed08 100644
--- a/examples/llm/pretrain/default_executor.py
+++ b/examples/llm/pretrain/default_executor.py
@@ -21,11 +21,9 @@
 
 def local_executor_torchrun(devices: int = 2) -> run.LocalExecutor:
     env_vars = {
-        "TRANSFORMERS_OFFLINE": "1",
-        "TORCH_NCCL_AVOID_RECORD_STREAMS": "1",
-        "NCCL_NVLS_ENABLE": "0",
-        "NVTE_DP_AMAX_REDUCE_INTERVAL": "0",
-        "NVTE_ASYNC_AMAX_REDUCTION": "1",
+        "TRANSFORMERS_OFFLINE": "1",            # Enable online downloads from HuggingFace
+        "TORCH_NCCL_AVOID_RECORD_STREAMS": "1", # Disable caching NCCL communication buffer memory
+        "NCCL_NVLS_ENABLE": "0",                # Disable NVLink SHARP to save memory
     }
 
     executor = run.LocalExecutor(ntasks_per_node=devices, launcher="torchrun", env_vars=env_vars)
@@ -57,11 +55,9 @@ def slurm_executor(
         mounts.extend(custom_mounts)
 
     env_vars = {
-        "TRANSFORMERS_OFFLINE": "1",
-        "TORCH_NCCL_AVOID_RECORD_STREAMS": "1",
-        "NCCL_NVLS_ENABLE": "0",
-        "NVTE_DP_AMAX_REDUCE_INTERVAL": "0",
-        "NVTE_ASYNC_AMAX_REDUCTION": "1",
+        "TRANSFORMERS_OFFLINE": "1",            # Enable online downloads from HuggingFace
+        "TORCH_NCCL_AVOID_RECORD_STREAMS": "1", # Disable caching NCCL communication buffer memory
+        "NCCL_NVLS_ENABLE": "0",                # Disable NVLink SHARP to save memory
     }
     if custom_env_vars:
         env_vars |= custom_env_vars
diff --git a/scripts/llm/pretraining.py b/scripts/llm/pretraining.py
index 3b1a2f140b4c..60954edeb270 100644
--- a/scripts/llm/pretraining.py
+++ b/scripts/llm/pretraining.py
@@ -81,11 +81,9 @@ def slurm_executor(
         mounts.extend(custom_mounts)
 
     env_vars = {
-        "TRANSFORMERS_OFFLINE": "1",
-        "TORCH_NCCL_AVOID_RECORD_STREAMS": "1",
-        "NCCL_NVLS_ENABLE": "0",
-        "NVTE_DP_AMAX_REDUCE_INTERVAL": "0",
-        "NVTE_ASYNC_AMAX_REDUCTION": "1",
+        "TRANSFORMERS_OFFLINE": "1",            # Enable online downloads from HuggingFace
+        "TORCH_NCCL_AVOID_RECORD_STREAMS": "1", # Disable caching NCCL communication buffer memory
+        "NCCL_NVLS_ENABLE": "0",                # Disable NVLink SHARP to save memory
     }
     if custom_env_vars:
         env_vars |= custom_env_vars
@@ -118,12 +116,10 @@ def slurm_executor(
 
 def local_executor_torchrun(nodes: int = 1, devices: int = 2) -> run.LocalExecutor:
     env_vars = {
-        "TRANSFORMERS_OFFLINE": "1",
-        "TORCH_NCCL_AVOID_RECORD_STREAMS": "1",
-        "NCCL_NVLS_ENABLE": "0",
-        "NVTE_DP_AMAX_REDUCE_INTERVAL": "0",
-        "NVTE_ASYNC_AMAX_REDUCTION": "1",
-        "NVTE_FUSED_ATTN": "0",
+        "TRANSFORMERS_OFFLINE": "1",            # Enable online downloads from HuggingFace
+        "TORCH_NCCL_AVOID_RECORD_STREAMS": "1", # Disable caching NCCL communication buffer memory
+        "NCCL_NVLS_ENABLE": "0",                # Disable NVLink SHARP to save memory
+        "NVTE_FUSED_ATTN": "0",                 # Disable cuDNN fused attention
     }
 
     executor = run.LocalExecutor(ntasks_per_node=devices, launcher="torchrun", env_vars=env_vars)
diff --git a/scripts/performance/utils.py b/scripts/performance/utils.py
index ba3c07cd0954..197092a8fb3d 100644
--- a/scripts/performance/utils.py
+++ b/scripts/performance/utils.py
@@ -59,14 +59,12 @@ def slurm_executor(
         sys.exit(1)
 
     env_vars = {
-        "TRANSFORMERS_OFFLINE": "1",
-        "TOKENIZERS_PARALLELISM": "False",
-        "NCCL_NVLS_ENABLE": "0",
-        "NVTE_DP_AMAX_REDUCE_INTERVAL": "0",
-        "NVTE_ASYNC_AMAX_REDUCTION": "1",
-        "NVTE_FUSED_ATTN": "1",
-        "NVTE_FLASH_ATTN": "1",
-        "NEMO_LOG_MEMORY_USAGE": "1",
+        "TRANSFORMERS_OFFLINE": "1",        # Enable online downloads from HuggingFace
+        "TOKENIZERS_PARALLELISM": "False",  # Restrict warning message prints
+        "NCCL_NVLS_ENABLE": "0",            # Disable NVLink SHARP to save memory
+        "NVTE_FLASH_ATTN": "1",             # Enable Flash Attention, which is needed to enable cuDNN fused attention
+        "NVTE_FUSED_ATTN": "1",             # Enable cuDNN fused attention
+        "NEMO_LOG_MEMORY_USAGE": "1",       # Print memory allocation
         "NEMORUN_HOME": log_dir,
     }
     mounts = []
diff --git a/tests/collections/llm/hf/peft_nemorun.py b/tests/collections/llm/hf/peft_nemorun.py
index 3a135b2346be..debbbf3b9c3b 100644
--- a/tests/collections/llm/hf/peft_nemorun.py
+++ b/tests/collections/llm/hf/peft_nemorun.py
@@ -24,10 +24,8 @@
 def local_executor_torchrun(nodes: int = 1, devices: int = 2) -> run.LocalExecutor:
     # Env vars for jobs are configured here
     env_vars = {
-        "TORCH_NCCL_AVOID_RECORD_STREAMS": "1",
-        "NCCL_NVLS_ENABLE": "0",
-        "NVTE_DP_AMAX_REDUCE_INTERVAL": "0",
-        "NVTE_ASYNC_AMAX_REDUCTION": "1",
+        "TORCH_NCCL_AVOID_RECORD_STREAMS": "1", # Disable caching NCCL communication buffer memory
+        "NCCL_NVLS_ENABLE": "0",                # Disable NVLink SHARP to save memory
     }
 
     executor = run.LocalExecutor(ntasks_per_node=devices, launcher="torchrun", env_vars=env_vars)
diff --git a/tests/collections/llm/hf/pretrain_nemorun.py b/tests/collections/llm/hf/pretrain_nemorun.py
index 331a0652e21a..a68b5f17446d 100644
--- a/tests/collections/llm/hf/pretrain_nemorun.py
+++ b/tests/collections/llm/hf/pretrain_nemorun.py
@@ -25,11 +25,9 @@
 def local_executor_torchrun(nodes: int = 1, devices: int = 2) -> run.LocalExecutor:
     # Env vars for jobs are configured here
     env_vars = {
-        "TORCH_NCCL_AVOID_RECORD_STREAMS": "1",
-        "NCCL_NVLS_ENABLE": "0",
-        "NVTE_DP_AMAX_REDUCE_INTERVAL": "0",
-        "NVTE_ASYNC_AMAX_REDUCTION": "1",
-        "NVTE_FUSED_ATTN": "0",
+        "TORCH_NCCL_AVOID_RECORD_STREAMS": "1", # Disable caching NCCL communication buffer memory
+        "NCCL_NVLS_ENABLE": "0",                # Disable NVLink SHARP to save memory
+        "NVTE_FUSED_ATTN": "0",                 # Disable cuDNN fused attention
     }
 
     executor = run.LocalExecutor(ntasks_per_node=devices, launcher="torchrun", env_vars=env_vars)
diff --git a/tests/collections/llm/hf/sft_nemorun.py b/tests/collections/llm/hf/sft_nemorun.py
index b559c04f6cbd..bca653c6de64 100644
--- a/tests/collections/llm/hf/sft_nemorun.py
+++ b/tests/collections/llm/hf/sft_nemorun.py
@@ -25,10 +25,8 @@
 def local_executor_torchrun(nodes: int = 1, devices: int = 2) -> run.LocalExecutor:
     # Env vars for jobs are configured here
     env_vars = {
-        "TORCH_NCCL_AVOID_RECORD_STREAMS": "1",
-        "NCCL_NVLS_ENABLE": "0",
-        "NVTE_DP_AMAX_REDUCE_INTERVAL": "0",
-        "NVTE_ASYNC_AMAX_REDUCTION": "1",
+        "TORCH_NCCL_AVOID_RECORD_STREAMS": "1", # Disable caching NCCL communication buffer memory
+        "NCCL_NVLS_ENABLE": "0",                # Disable NVLink SHARP to save memory
     }
 
     executor = run.LocalExecutor(ntasks_per_node=devices, launcher="torchrun", env_vars=env_vars)
diff --git a/tests/collections/llm/hf/sft_nemorun_fsdp2.py b/tests/collections/llm/hf/sft_nemorun_fsdp2.py
index 53dd863cb185..81d7ac8550ba 100644
--- a/tests/collections/llm/hf/sft_nemorun_fsdp2.py
+++ b/tests/collections/llm/hf/sft_nemorun_fsdp2.py
@@ -27,11 +27,9 @@
 def local_executor_torchrun(nodes: int = 1, devices: int = 2) -> run.LocalExecutor:
     # Env vars for jobs are configured here
     env_vars = {
-        "TORCH_NCCL_AVOID_RECORD_STREAMS": "1",
-        "NCCL_NVLS_ENABLE": "0",
-        "NVTE_DP_AMAX_REDUCE_INTERVAL": "0",
-        "NVTE_ASYNC_AMAX_REDUCTION": "1",
-        "NVTE_FUSED_ATTN": "0",
+        "TORCH_NCCL_AVOID_RECORD_STREAMS": "1", # Disable caching NCCL communication buffer memory
+        "NCCL_NVLS_ENABLE": "0",                # Disable NVLink SHARP to save memory
+        "NVTE_FUSED_ATTN": "0",                 # Disable cuDNN attention
     }
 
     executor = run.LocalExecutor(ntasks_per_node=devices, launcher="torchrun", env_vars=env_vars)
diff --git a/tutorials/llm/llama/nemo2-sft-peft/nemo2-peft.ipynb b/tutorials/llm/llama/nemo2-sft-peft/nemo2-peft.ipynb
index c983b277e72a..54a571b9bc45 100644
--- a/tutorials/llm/llama/nemo2-sft-peft/nemo2-peft.ipynb
+++ b/tutorials/llm/llama/nemo2-sft-peft/nemo2-peft.ipynb
@@ -339,8 +339,6 @@
     "    env_vars = {\n",
     "        \"TORCH_NCCL_AVOID_RECORD_STREAMS\": \"1\",\n",
     "        \"NCCL_NVLS_ENABLE\": \"0\",\n",
-    "        \"NVTE_DP_AMAX_REDUCE_INTERVAL\": \"0\",\n",
-    "        \"NVTE_ASYNC_AMAX_REDUCTION\": \"1\",\n",
     "    }\n",
     "\n",
     "    executor = run.LocalExecutor(ntasks_per_node=devices, launcher=\"torchrun\", env_vars=env_vars)\n",
@@ -454,8 +452,6 @@
     "    env_vars = {\n",
     "        \"TORCH_NCCL_AVOID_RECORD_STREAMS\": \"1\",\n",
     "        \"NCCL_NVLS_ENABLE\": \"0\",\n",
-    "        \"NVTE_DP_AMAX_REDUCE_INTERVAL\": \"0\",\n",
-    "        \"NVTE_ASYNC_AMAX_REDUCTION\": \"1\",\n",
     "    }\n",
     "\n",
     "    executor = run.LocalExecutor(ntasks_per_node=devices, launcher=\"torchrun\", env_vars=env_vars)\n",
diff --git a/tutorials/llm/llama/nemo2-sft-peft/nemo2-sft.ipynb b/tutorials/llm/llama/nemo2-sft-peft/nemo2-sft.ipynb
index 0bb4367d50e9..7dbd4b904ad9 100644
--- a/tutorials/llm/llama/nemo2-sft-peft/nemo2-sft.ipynb
+++ b/tutorials/llm/llama/nemo2-sft-peft/nemo2-sft.ipynb
@@ -480,8 +480,6 @@
     "    env_vars = {\n",
     "        \"TORCH_NCCL_AVOID_RECORD_STREAMS\": \"1\",\n",
     "        \"NCCL_NVLS_ENABLE\": \"0\",\n",
-    "        \"NVTE_DP_AMAX_REDUCE_INTERVAL\": \"0\",\n",
-    "        \"NVTE_ASYNC_AMAX_REDUCTION\": \"1\",\n",
     "    }\n",
     "\n",
     "    executor = run.LocalExecutor(ntasks_per_node=devices, launcher=\"torchrun\", env_vars=env_vars)\n",
@@ -562,8 +560,6 @@
     "    env_vars = {\n",
     "        \"TORCH_NCCL_AVOID_RECORD_STREAMS\": \"1\",\n",
     "        \"NCCL_NVLS_ENABLE\": \"0\",\n",
-    "        \"NVTE_DP_AMAX_REDUCE_INTERVAL\": \"0\",\n",
-    "        \"NVTE_ASYNC_AMAX_REDUCTION\": \"1\",\n",
     "    }\n",
     "\n",
     "    executor = run.LocalExecutor(ntasks_per_node=devices, launcher=\"torchrun\", env_vars=env_vars)\n",

From 9d1cdf08429a0de06d88abb2b29492db19277593 Mon Sep 17 00:00:00 2001
From: erhoo82 <erhoo82@users.noreply.github.com>
Date: Wed, 5 Feb 2025 03:12:32 +0000
Subject: [PATCH 2/2] Apply isort and black reformatting

Signed-off-by: erhoo82 <erhoo82@users.noreply.github.com>
---
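Reviewer note: this commit is whitespace-only. black normalizes inline
comments to exactly two spaces before the "#", so the hand-aligned comment
columns from the previous commit collapse, e.g.:

    nvls = "0"                # before: comments padded to a shared column
    nvls = "0"  # after: black's two-space inline-comment spacing
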
 examples/llm/pretrain/default_executor.py     | 12 ++++++------
 scripts/llm/pretraining.py                    | 14 +++++++-------
 scripts/performance/utils.py                  | 10 +++++-----
 tests/collections/llm/hf/peft_nemorun.py      |  4 ++--
 tests/collections/llm/hf/pretrain_nemorun.py  |  6 +++---
 tests/collections/llm/hf/sft_nemorun.py       |  4 ++--
 tests/collections/llm/hf/sft_nemorun_fsdp2.py |  6 +++---
 7 files changed, 28 insertions(+), 28 deletions(-)

diff --git a/examples/llm/pretrain/default_executor.py b/examples/llm/pretrain/default_executor.py
index 89b2db21ed08..ac7a80f25b77 100644
--- a/examples/llm/pretrain/default_executor.py
+++ b/examples/llm/pretrain/default_executor.py
@@ -21,9 +21,9 @@
 
 def local_executor_torchrun(devices: int = 2) -> run.LocalExecutor:
     env_vars = {
-        "TRANSFORMERS_OFFLINE": "1",            # Enable online downloads from HuggingFace
-        "TORCH_NCCL_AVOID_RECORD_STREAMS": "1", # Disable caching NCCL communication buffer memory
-        "NCCL_NVLS_ENABLE": "0",                # Disable NVLink SHARP to save memory
+        "TRANSFORMERS_OFFLINE": "1",  # Enable online downloads from HuggingFace
+        "TORCH_NCCL_AVOID_RECORD_STREAMS": "1",  # Disable caching NCCL communication buffer memory
+        "NCCL_NVLS_ENABLE": "0",  # Disable NVLink SHARP to save memory
     }
 
     executor = run.LocalExecutor(ntasks_per_node=devices, launcher="torchrun", env_vars=env_vars)
@@ -55,9 +55,9 @@ def slurm_executor(
         mounts.extend(custom_mounts)
 
     env_vars = {
-        "TRANSFORMERS_OFFLINE": "1",            # Enable online downloads from HuggingFace
-        "TORCH_NCCL_AVOID_RECORD_STREAMS": "1", # Disable caching NCCL communication buffer memory
-        "NCCL_NVLS_ENABLE": "0",                # Disable NVLink SHARP to save memory
+        "TRANSFORMERS_OFFLINE": "1",  # Enable online downloads from HuggingFace
+        "TORCH_NCCL_AVOID_RECORD_STREAMS": "1",  # Disable caching NCCL communication buffer memory
+        "NCCL_NVLS_ENABLE": "0",  # Disable NVLink SHARP to save memory
     }
     if custom_env_vars:
         env_vars |= custom_env_vars
diff --git a/scripts/llm/pretraining.py b/scripts/llm/pretraining.py
index 60954edeb270..9a8a55140405 100644
--- a/scripts/llm/pretraining.py
+++ b/scripts/llm/pretraining.py
@@ -81,9 +81,9 @@ def slurm_executor(
         mounts.extend(custom_mounts)
 
     env_vars = {
-        "TRANSFORMERS_OFFLINE": "1",            # Enable online downloads from HuggingFace
-        "TORCH_NCCL_AVOID_RECORD_STREAMS": "1", # Disable caching NCCL communication buffer memory
-        "NCCL_NVLS_ENABLE": "0",                # Disable NVLink SHARP to save memory
+        "TRANSFORMERS_OFFLINE": "1",  # Enable online downloads from HuggingFace
+        "TORCH_NCCL_AVOID_RECORD_STREAMS": "1",  # Disable caching NCCL communication buffer memory
+        "NCCL_NVLS_ENABLE": "0",  # Disable NVLink SHARP to save memory
     }
     if custom_env_vars:
         env_vars |= custom_env_vars
@@ -116,10 +116,10 @@ def slurm_executor(
 
 def local_executor_torchrun(nodes: int = 1, devices: int = 2) -> run.LocalExecutor:
     env_vars = {
-        "TRANSFORMERS_OFFLINE": "1",            # Enable online downloads from HuggingFace
-        "TORCH_NCCL_AVOID_RECORD_STREAMS": "1", # Disable caching NCCL communication buffer memory
-        "NCCL_NVLS_ENABLE": "0",                # Disable NVLink SHARP to save memory
-        "NVTE_FUSED_ATTN": "0",                 # Disable cuDNN fused attention
+        "TRANSFORMERS_OFFLINE": "1",  # Enable online downloads from HuggingFace
+        "TORCH_NCCL_AVOID_RECORD_STREAMS": "1",  # Disable caching NCCL communication buffer memory
+        "NCCL_NVLS_ENABLE": "0",  # Disable NVLink SHARP to save memory
+        "NVTE_FUSED_ATTN": "0",  # Disable cuDNN fused attention
     }
 
     executor = run.LocalExecutor(ntasks_per_node=devices, launcher="torchrun", env_vars=env_vars)
diff --git a/scripts/performance/utils.py b/scripts/performance/utils.py
index 197092a8fb3d..585aedb768dd 100644
--- a/scripts/performance/utils.py
+++ b/scripts/performance/utils.py
@@ -59,12 +59,12 @@ def slurm_executor(
         sys.exit(1)
 
     env_vars = {
-        "TRANSFORMERS_OFFLINE": "1",        # Enable online downloads from HuggingFace
+        "TRANSFORMERS_OFFLINE": "1",  # Enable online downloads from HuggingFace
         "TOKENIZERS_PARALLELISM": "False",  # Restrict warning message prints
-        "NCCL_NVLS_ENABLE": "0",            # Disable NVLink SHARP to save memory
-        "NVTE_FLASH_ATTN": "1",             # Enable Flash Attention, which is needed to enable cuDNN fused attention
-        "NVTE_FUSED_ATTN": "1",             # Enable cuDNN fused attention
-        "NEMO_LOG_MEMORY_USAGE": "1",       # Print memory allocation
+        "NCCL_NVLS_ENABLE": "0",  # Disable NVLink SHARP to save memory
+        "NVTE_FLASH_ATTN": "1",  # Enable Flash Attention, which is needed to enable cuDNN fused attention
+        "NVTE_FUSED_ATTN": "1",  # Enable cuDNN fused attention
+        "NEMO_LOG_MEMORY_USAGE": "1",  # Print memory allocation
         "NEMORUN_HOME": log_dir,
     }
     mounts = []
diff --git a/tests/collections/llm/hf/peft_nemorun.py b/tests/collections/llm/hf/peft_nemorun.py
index debbbf3b9c3b..21b4f7b0bd70 100644
--- a/tests/collections/llm/hf/peft_nemorun.py
+++ b/tests/collections/llm/hf/peft_nemorun.py
@@ -24,8 +24,8 @@
 def local_executor_torchrun(nodes: int = 1, devices: int = 2) -> run.LocalExecutor:
     # Env vars for jobs are configured here
     env_vars = {
-        "TORCH_NCCL_AVOID_RECORD_STREAMS": "1", # Disable caching NCCL communication buffer memory
-        "NCCL_NVLS_ENABLE": "0",                # Disable NVLink SHARP to save memory
+        "TORCH_NCCL_AVOID_RECORD_STREAMS": "1",  # Disable caching NCCL communication buffer memory
+        "NCCL_NVLS_ENABLE": "0",  # Disable NVLink SHARP to save memory
     }
 
     executor = run.LocalExecutor(ntasks_per_node=devices, launcher="torchrun", env_vars=env_vars)
diff --git a/tests/collections/llm/hf/pretrain_nemorun.py b/tests/collections/llm/hf/pretrain_nemorun.py
index a68b5f17446d..a6b9b12f564f 100644
--- a/tests/collections/llm/hf/pretrain_nemorun.py
+++ b/tests/collections/llm/hf/pretrain_nemorun.py
@@ -25,9 +25,9 @@
 def local_executor_torchrun(nodes: int = 1, devices: int = 2) -> run.LocalExecutor:
     # Env vars for jobs are configured here
     env_vars = {
-        "TORCH_NCCL_AVOID_RECORD_STREAMS": "1", # Disable caching NCCL communication buffer memory
-        "NCCL_NVLS_ENABLE": "0",                # Disable NVLink SHARP to save memory
-        "NVTE_FUSED_ATTN": "0",                 # Disable cuDNN fused attention
+        "TORCH_NCCL_AVOID_RECORD_STREAMS": "1",  # Disable caching NCCL communication buffer memory
+        "NCCL_NVLS_ENABLE": "0",  # Disable NVLink SHARP to save memory
+        "NVTE_FUSED_ATTN": "0",  # Disable cuDNN fused attention
     }
 
     executor = run.LocalExecutor(ntasks_per_node=devices, launcher="torchrun", env_vars=env_vars)
diff --git a/tests/collections/llm/hf/sft_nemorun.py b/tests/collections/llm/hf/sft_nemorun.py
index bca653c6de64..e4a0242bcdb2 100644
--- a/tests/collections/llm/hf/sft_nemorun.py
+++ b/tests/collections/llm/hf/sft_nemorun.py
@@ -25,8 +25,8 @@
 def local_executor_torchrun(nodes: int = 1, devices: int = 2) -> run.LocalExecutor:
     # Env vars for jobs are configured here
     env_vars = {
-        "TORCH_NCCL_AVOID_RECORD_STREAMS": "1", # Disable caching NCCL communication buffer memory
-        "NCCL_NVLS_ENABLE": "0",                # Disable NVLink SHARP to save memory
+        "TORCH_NCCL_AVOID_RECORD_STREAMS": "1",  # Disable caching NCCL communication buffer memory
+        "NCCL_NVLS_ENABLE": "0",  # Disable NVLink SHARP to save memory
     }
 
     executor = run.LocalExecutor(ntasks_per_node=devices, launcher="torchrun", env_vars=env_vars)
diff --git a/tests/collections/llm/hf/sft_nemorun_fsdp2.py b/tests/collections/llm/hf/sft_nemorun_fsdp2.py
index 81d7ac8550ba..8e169527446a 100644
--- a/tests/collections/llm/hf/sft_nemorun_fsdp2.py
+++ b/tests/collections/llm/hf/sft_nemorun_fsdp2.py
@@ -27,9 +27,9 @@
 def local_executor_torchrun(nodes: int = 1, devices: int = 2) -> run.LocalExecutor:
     # Env vars for jobs are configured here
     env_vars = {
-        "TORCH_NCCL_AVOID_RECORD_STREAMS": "1", # Disable caching NCCL communication buffer memory
-        "NCCL_NVLS_ENABLE": "0",                # Disable NVLink SHARP to save memory
-        "NVTE_FUSED_ATTN": "0",                 # Disable cuDNN attention
+        "TORCH_NCCL_AVOID_RECORD_STREAMS": "1",  # Disable caching NCCL communication buffer memory
+        "NCCL_NVLS_ENABLE": "0",  # Disable NVLink SHARP to save memory
+        "NVTE_FUSED_ATTN": "0",  # Disable cuDNN attention
     }
 
     executor = run.LocalExecutor(ntasks_per_node=devices, launcher="torchrun", env_vars=env_vars)
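
Postscript (illustrative, not part of either patch): a quick way to confirm
from inside a launched job that the executor exported the documented
variables. The variable names come from the diffs above; everything else is
assumed.

    import os

    for var in ("TRANSFORMERS_OFFLINE", "TORCH_NCCL_AVOID_RECORD_STREAMS", "NCCL_NVLS_ENABLE"):
        print(var, "=", os.environ.get(var, "<unset>"))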