Skip to content
Closed
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
15 changes: 8 additions & 7 deletions examples/evaluation/utils/executors.py
Original file line number Diff line number Diff line change
Expand Up @@ -99,11 +99,9 @@ def kuberay_executor(
"TRANSFORMERS_OFFLINE": "1",
"HF_HOME": "/nemo-workspace/pagaray/hf_cache",
"RAY_enable_infeasible_task_early_exit": "true",
"NCCL_IB_DISABLE": "1",
"NCCL_IB_HCA": "^openib", # Ignore OpenIB devices
"NCCL_NET": "Socket",
"NCCL_NET_GDR_LEVEL": "0",
"FI_PROVIDER": "tcp",
"NCCL_NET": "tcpxo",
"NCCL_SOCKET_IFNAME": "eth1,eth2,eth3,eth4,eth5,eth6,eth7,eth8",
"NCCL_FASTRAK_CTRL_DEV": "eth0",
}
if custom_env_vars:
env_vars.update(custom_env_vars)
Expand Down Expand Up @@ -132,8 +130,11 @@ def kuberay_executor(
spec_kwargs={
"schedulerName": "runai-scheduler",
"image_pull_secrets": ["dockerregistry-dockerregistry-pagaray-ngc"],
"dnsConfig": {"options": [{"name": "ndots", "value": "1"}, {"name": "single-request-reopen"}]},
}, # e.g. Run:ai
volume_mounts=[{"name": "workspace", "mountPath": dgxc_pvc_mount_path}],
volume_mounts=[
{"name": "workspace", "mountPath": dgxc_pvc_mount_path},
],
volumes=[
{
"name": "workspace",
Expand All @@ -145,7 +146,7 @@ def kuberay_executor(
"securityContext": {
"allowPrivilegeEscalation": False,
"runAsUser": 0,
},
}
},
)

Expand Down
22 changes: 11 additions & 11 deletions scripts/performance/setup_experiment.py
Original file line number Diff line number Diff line change
Expand Up @@ -386,17 +386,17 @@ def main(
)
)

if use_recipes and dgxc_cluster is not None:
plugins.append(
FaultTolerancePlugin(
enable_ft_package=True,
calc_ft_timeouts=True,
num_in_job_restarts=10,
num_job_retries_on_failure=10,
initial_rank_heartbeat_timeout=1800,
rank_heartbeat_timeout=300,
)
)
# if use_recipes and dgxc_cluster is not None:
# plugins.append(
# FaultTolerancePlugin(
# enable_ft_package=True,
# calc_ft_timeouts=True,
# num_in_job_restarts=10,
# num_job_retries_on_failure=10,
# initial_rank_heartbeat_timeout=1800,
# rank_heartbeat_timeout=300,
# )
# )
Comment on lines +389 to +399
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

⚠️ Potential issue | 🟠 Major

Remove the commented FT block or restore it behind a real flag.

As written, this disables FaultTolerancePlugin but leaves the import unused, which is already failing Ruff in CI. If the intent is to turn FT off for this path, delete the dead block and drop the import from Lines 49-53 instead of parking it here.

✂️ Suggested cleanup
-    # if use_recipes and dgxc_cluster is not None:
-    #     plugins.append(
-    #         FaultTolerancePlugin(
-    #             enable_ft_package=True,
-    #             calc_ft_timeouts=True,
-    #             num_in_job_restarts=10,
-    #             num_job_retries_on_failure=10,
-    #             initial_rank_heartbeat_timeout=1800,
-    #             rank_heartbeat_timeout=300,
-    #         )
-    #     )

Also remove FaultTolerancePlugin from the imports at Lines 49-53.

As per coding guidelines, "If code is commented out, include a comment describing its usage and why it is commented out; otherwise remove it as debug code before merging."

📝 Committable suggestion

‼️ IMPORTANT
Carefully review the code before committing. Ensure that it accurately replaces the highlighted code, contains no missing lines, and has no issues with indentation. Thoroughly test & benchmark the code to ensure it meets the requirements.

Suggested change
# if use_recipes and dgxc_cluster is not None:
# plugins.append(
# FaultTolerancePlugin(
# enable_ft_package=True,
# calc_ft_timeouts=True,
# num_in_job_restarts=10,
# num_job_retries_on_failure=10,
# initial_rank_heartbeat_timeout=1800,
# rank_heartbeat_timeout=300,
# )
# )
🤖 Prompt for AI Agents
Verify each finding against the current code and only fix it if needed.

In `@scripts/performance/setup_experiment.py` around lines 389 - 399, the
commented-out FaultTolerancePlugin block (referencing use_recipes, dgxc_cluster,
and plugins) is dead code and leaves the FaultTolerancePlugin import unused,
which fails Ruff in CI. Fix it in one of two ways: (a) delete the commented-out
block and also remove the now-unused FaultTolerancePlugin import from the module
imports; or (b) restore the block behind a proper conditional (e.g., a
configurable enable_fault_tolerance flag), keeping the plugin instantiation
inside the active branch so the import is used again.


nemorun_script = run.Script(
path=str(run_script_path),
Expand Down
Loading