Skip to content
Merged
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
13 changes: 9 additions & 4 deletions nemo_run/core/execution/dgxcloud.py
Original file line number Diff line number Diff line change
Expand Up @@ -3,20 +3,20 @@
import logging
import os
import subprocess
import tempfile
import time
from dataclasses import dataclass, field
from enum import Enum
from pathlib import Path
import time
import tempfile
from typing import Any, Optional, Type

import requests
from invoke.context import Context

from nemo_run.config import get_nemorun_home
from nemo_run.core.execution.base import Executor, ExecutorMacros
from nemo_run.core.packaging.base import Packager
from nemo_run.core.packaging.git import GitArchivePackager
from nemo_run.config import get_nemorun_home

logger = logging.getLogger(__name__)

Expand Down Expand Up @@ -170,6 +170,8 @@ def move_data(self, token: str, project_id: str, cluster_id: str, sleep: float =
workload_id = resp_json["workloadId"]
status = DGXCloudState(resp_json["actualPhase"])

logger.info(f"Successfully created data movement workload {workload_id} on DGXCloud")

while status in [
DGXCloudState.PENDING,
DGXCloudState.CREATING,
Expand All @@ -178,9 +180,12 @@ def move_data(self, token: str, project_id: str, cluster_id: str, sleep: float =
]:
time.sleep(sleep)
status = self.status(workload_id)
logger.debug(
f"Polling data movement workload {workload_id}'s status. Current status is: {status}"
)

if status is not DGXCloudState.COMPLETED:
raise RuntimeError("Failed to move data to PVC")
raise RuntimeError(f"Failed to move data to PVC. Workload status is {status}")

resp = self.delete_workload(token, workload_id)
if resp.status_code >= 200 and resp.status_code < 300:
Expand Down
Loading