From 98a07b7b1a9bc3d8f818bec83ceef9fe5d03aaf3 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?oliver=20k=C3=B6nig?= Date: Thu, 20 Nov 2025 13:54:28 +0000 Subject: [PATCH 1/2] fix: Retry polling token MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Signed-off-by: oliver könig --- nemo_run/core/execution/dgxcloud.py | 26 +++++++++++++++++--------- 1 file changed, 17 insertions(+), 9 deletions(-) diff --git a/nemo_run/core/execution/dgxcloud.py b/nemo_run/core/execution/dgxcloud.py index 69a7e29c..a8405ac2 100644 --- a/nemo_run/core/execution/dgxcloud.py +++ b/nemo_run/core/execution/dgxcloud.py @@ -90,15 +90,23 @@ def get_auth_token(self) -> Optional[str]: "appSecret": self.app_secret, } - response = requests.post(url, json=payload, headers=self._default_headers()) - response_text = response.text.strip() - auth_token = json.loads(response_text).get("accessToken", None) # [1] - if not auth_token: - logger.error("Failed to retrieve auth token; response was: %s", response_text) - return None - - logger.debug("Retrieved auth token from %s", url) - return auth_token + n_attempts = 0 + while n_attempts < 3: + try: + response = requests.post(url, json=payload, headers=self._default_headers()) + response_text = response.text.strip() + auth_token = json.loads(response_text).get("accessToken", None) # [1] + if auth_token: + return auth_token + + raise ValueError(f"Failed to retrieve auth token; response was: {response_text}") + + except Exception as e: + logger.error("Failed to retrieve auth token; error was: %s", e) + time.sleep(10) + n_attempts += 1 + + raise ValueError("Failed to retrieve auth token after 3 attempts.") def get_project_and_cluster_id(self, token: str) -> tuple[Optional[str], Optional[str]]: url = f"{self.base_url}/org-unit/projects" From 079563520fea46f89740353d28424fc6f736ff4d Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?oliver=20k=C3=B6nig?= Date: Thu, 20 Nov 2025 14:04:35 +0000 Subject: [PATCH 2/2] keep original behavior MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Signed-off-by: oliver könig --- nemo_run/core/execution/dgxcloud.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/nemo_run/core/execution/dgxcloud.py b/nemo_run/core/execution/dgxcloud.py index a8405ac2..2cacb1a1 100644 --- a/nemo_run/core/execution/dgxcloud.py +++ b/nemo_run/core/execution/dgxcloud.py @@ -106,7 +106,8 @@ def get_auth_token(self) -> Optional[str]: time.sleep(10) n_attempts += 1 - raise ValueError("Failed to retrieve auth token after 3 attempts.") + logger.error("Failed to retrieve auth token after 3 attempts.") + return None def get_project_and_cluster_id(self, token: str) -> tuple[Optional[str], Optional[str]]: url = f"{self.base_url}/org-unit/projects"