Commit 4347dfc

fix(chore): Addressed more review comments
Signed-off-by: Christian Pinto <[email protected]>
1 parent 57a1d64 commit 4347dfc

File tree

3 files changed: +81 -52 lines changed


plugins/actuators/vllm_performance/ado_actuators/vllm_performance/deployment_management.py

Lines changed: 13 additions & 10 deletions
@@ -46,15 +46,15 @@ async def wait(self, request_id: str, k8s_name: str, model: str) -> None:
                 )
             )
             await waiter.model_downloaded_event.wait()
-            # If after we got awaken the model is not among the downloaded models, it means that
+            # If after we got awoken the model is not among the downloaded models, it means that
             # something has gone wrong, such as the deployment we were waiting for has failed.
             # If I am the first to wake up, let me add myself as the deployment to be waited for and stop waiting.
             if (
                 model not in self.model_already_downloaded
                 and not self.maybe_add_deployment(k8s_name=k8s_name, model=model)
             ):
                 # If I am not the first to wake up, I get the new waiter object and continue waiting
-                waiter = self.deployments_to_wait_for.get(model, None)
+                waiter = self.deployments_to_wait_for.get(model)
                 continue

             console.put.remote(
@@ -67,11 +67,14 @@ async def wait(self, request_id: str, k8s_name: str, model: str) -> None:
             break

     def signal(self, k8s_name: str, model: str, error: bool = False) -> None:
-        if model in self.deployments_to_wait_for:
-            waiter = self.deployments_to_wait_for.pop(model)
-            assert (
-                waiter.k8s_name == k8s_name
-            ), f"This environment deployment ({k8s_name}) shouldn't have been created because it is conflicting with deployment {waiter.k8s_name}"
-            if not error:
-                self.model_already_downloaded.add(model)
-            waiter.model_downloaded_event.set()
+        if model not in self.deployments_to_wait_for:
+            return
+
+        waiter = self.deployments_to_wait_for.pop(model)
+        if waiter.k8s_name != k8s_name:
+            raise ValueError(
+                f"This environment deployment ({k8s_name}) shouldn't have been created because it is conflicting with deployment {waiter.k8s_name}"
+            )
+        if not error:
+            self.model_already_downloaded.add(model)
+        waiter.model_downloaded_event.set()
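
For context, wait() and signal() implement a small conflict manager: concurrent measurements that need the same model block on an asyncio.Event, and after a failed deployment the first waiter to wake re-registers itself as the deployment the others should wait for. A minimal, self-contained sketch of the pattern follows. It is an illustration, not the plugin's code: the Waiter dataclass and the return convention of maybe_add_deployment (True when the caller successfully registered itself) are assumptions inferred from how the diff calls them, and the Ray actor plumbing, request ids, and console messages are omitted.

import asyncio
from dataclasses import dataclass, field


@dataclass
class Waiter:
    k8s_name: str
    model_downloaded_event: asyncio.Event = field(default_factory=asyncio.Event)


class DeploymentConflictManager:
    def __init__(self) -> None:
        self.deployments_to_wait_for: dict[str, Waiter] = {}
        self.model_already_downloaded: set[str] = set()

    def maybe_add_deployment(self, k8s_name: str, model: str) -> bool:
        # Assumed semantics: register ourselves as the deployment others
        # should wait for; True means we registered, False means another
        # deployment for this model already holds the slot.
        if model in self.deployments_to_wait_for:
            return False
        self.deployments_to_wait_for[model] = Waiter(k8s_name=k8s_name)
        return True

    async def wait(self, k8s_name: str, model: str) -> None:
        waiter = self.deployments_to_wait_for[model]
        while True:
            await waiter.model_downloaded_event.wait()
            if (
                model not in self.model_already_downloaded
                and not self.maybe_add_deployment(k8s_name=k8s_name, model=model)
            ):
                # Someone else woke first and re-registered: pick up the new
                # waiter and keep waiting on its event.
                waiter = self.deployments_to_wait_for.get(model)
                continue
            # Either the model downloaded successfully, or we are now the
            # registered deployment and should proceed to deploy it ourselves.
            break

    def signal(self, k8s_name: str, model: str, error: bool = False) -> None:
        if model not in self.deployments_to_wait_for:
            return
        waiter = self.deployments_to_wait_for.pop(model)
        if waiter.k8s_name != k8s_name:
            raise ValueError(f"conflicting deployment {waiter.k8s_name}")
        if not error:
            self.model_already_downloaded.add(model)
        waiter.model_downloaded_event.set()

Swapping the assert in signal() for a ValueError is a meaningful part of this change: asserts are stripped under python -O, while the ValueError always surfaces a conflicting k8s_name.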

plugins/actuators/vllm_performance/ado_actuators/vllm_performance/env_manager.py

Lines changed: 50 additions & 23 deletions
@@ -3,13 +3,13 @@

 import asyncio
 import logging
-import time
 from enum import Enum

 import ray
 from ado_actuators.vllm_performance.deployment_management import (
     DeploymentConflictManager,
 )
+from ado_actuators.vllm_performance.k8s import K8sEnvironmentCreationError
 from ado_actuators.vllm_performance.k8s.manage_components import (
     ComponentsManager,
 )
@@ -105,18 +105,18 @@ def __init__(
             pvc_template=pvc_template,
         )

-    def _wipe_deployment(self, identifier: str) -> None:
+    def _delete_environment_k8s_resources(self, k8s_name: str) -> None:
         """
         Deletes a deployment. Intended to be used for cleanup or error recovery
         param: identifier: the deployment identifier
         """
         try:
-            self.manager.delete_service(k8s_name=identifier)
+            self.manager.delete_service(k8s_name=k8s_name)
         except ApiException as e:
             if e.reason != "Not Found":
                 raise e
         try:
-            self.manager.delete_deployment(k8s_name=identifier)
+            self.manager.delete_deployment(k8s_name=k8s_name)
         except ApiException as e:
             if e.reason != "Not Found":
                 raise e
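
The renamed helper keeps its idempotent behavior: a service or deployment that is already gone ("Not Found") is treated as successfully deleted, and only other API errors propagate. A minimal sketch of that pattern against the official kubernetes Python client (the function name, namespace, and API object here are illustrative; the plugin routes the calls through its own ComponentsManager):

from kubernetes import client
from kubernetes.client.rest import ApiException


def delete_deployment_if_exists(apps: client.AppsV1Api, name: str, namespace: str) -> None:
    # Treat "Not Found" as success so cleanup and retries stay safe.
    try:
        apps.delete_namespaced_deployment(name=name, namespace=namespace)
    except ApiException as e:
        if e.reason != "Not Found":
            raise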
@@ -138,9 +138,6 @@ def get_environment(self, model: str, definition: str) -> Environment | None:
         :param increment_usage: increment usage flag
         :return: environment state
         """
-        print(
-            f"getting environment for model {model}, currently {self.active_environments} deployments"
-        )

         # check if there's an existing free environment satisfying the request
         env = self.get_matching_free_environment(definition)
@@ -153,24 +150,54 @@ def get_environment(self, model: str, definition: str) -> Environment | None:
                 f"There are already {self.max_concurrent} actively in use, and I can't create a new one"
             )
             return None
-        # There are unused environments, let's evict one
-
-        # Gets the oldest env in the list
-        environment_to_evict = self.free_environments[0]
-        try:
-            self.manager.delete_service(k8s_name=environment_to_evict.k8s_name)
-            self.manager.delete_deployment(
-                k8s_name=environment_to_evict.k8s_name
-            )
+
+        # There are unused environments, let's try to evict one
+        environment_evicted = False
+        eviction_index = 0
+        # Continue looping until we find one environment that can be successfully evicted or we have gone through them all
+        while not environment_evicted and eviction_index < len(
+            self.free_environments
+        ):
+            environment_to_evict = self.free_environments[eviction_index]
+            try:
+                # _delete_environment_k8s_resources will not raise an error if, for whatever reason, the service
+                # or the deployment we are trying to delete does not exist anymore, and we assume
+                # the deployment was properly deleted.
+                self._delete_environment_k8s_resources(
+                    k8s_name=environment_to_evict.k8s_name
+                )
+            except ApiException as e:
+                # If we can't delete this environment we try with the next one, but we do not
+                # delete the current env from the free list. This is to avoid spawning more pods than the maximum configured
+                # in the case the failing ones are still running.
+                # Since the current eviction candidate environment will stay in the free ones, some other measurement might
+                # try to evict again and perhaps succeed (e.g., connection restored to the cluster).
+                logger.critical(
+                    f"Error deleting deployment or service {environment_to_evict.k8s_name}: {e}"
+                )
+                eviction_index += 1
+                continue

             logger.info(
                 f"deleted environment {environment_to_evict.k8s_name}. "
                 f"Active environments {self.active_environments}"
             )
-        except ApiException as e:
-            logger.error(f"Error deleting deployment or service {e}")
-        # If all the Kubernetes resources got deleted, let's remove this environment from our records
-        self.free_environments.pop(0)
-        time.sleep(3)
+            environment_evicted = True
+
+        if environment_evicted:
+            # successfully deleted an environment
+            self.free_environments.pop(eviction_index)
+        elif len(self.in_use_environments) > 0:
+            # all the free ones failed to delete, but one or more are in use and
+            # might make room for waiting measurements. In this case we just behave as if there
+            # are no free available environments and we wait.
+            return None
+        else:
+            # None of the free environments could be evicted due to errors and none are in use.
+            # To avoid a deadlock of the operation we fail the measurement
+            raise K8sEnvironmentCreationError(
+                "All free environments failed deleting and none are currently in use."
+            )

         # We either made space or we had enough space already
         env = Environment(model=model, configuration=definition)
@@ -211,7 +238,7 @@ def done_creating(self, identifier: str) -> None:

     def cleanup_failed_deployment(self, identifier: str) -> None:
         env = self.in_use_environments[identifier]
-        self._wipe_deployment(identifier=identifier)
+        self._delete_environment_k8s_resources(k8s_name=identifier)
         self.done_using(identifier=identifier, reclaim_on_completion=True)
         self.deployment_conflict_manager.signal(
             k8s_name=identifier, model=env.model, error=True
@@ -259,7 +286,7 @@ def cleanup(self) -> None:
         logger.info("Cleaning environments")
         all_envs = list(self.in_use_environments.values()) + self.free_environments
         for env in all_envs:
-            self._wipe_deployment(identifier=env.k8s_name)
+            self._delete_environment_k8s_resources(k8s_name=env.k8s_name)

         # We only delete the PVC if it was created by this actuator
         if self.manager.pvc_created:
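
Stripped of logging and Ray details, the new eviction logic walks the free list until one environment's Kubernetes resources actually delete, and only then removes it from the bookkeeping; if nothing can be evicted it either waits on the in-use environments or fails outright. A condensed sketch of that decision tree (the evict_one helper, its arguments, and the locally defined error class are stand-ins for the plugin's own types):

from kubernetes.client.rest import ApiException


class K8sEnvironmentCreationError(Exception):
    """Stand-in for the exception imported from the k8s package."""


def evict_one(free_environments, in_use_environments, delete_resources) -> bool:
    # Returns True if an environment was evicted, False if the caller should
    # wait for an in-use environment to free up, and raises otherwise.
    for index, env in enumerate(free_environments):
        try:
            delete_resources(env.k8s_name)  # idempotent, as sketched above
        except ApiException:
            # Keep the environment in the free list: its pods may still be
            # running, and dropping it now could exceed the pod budget.
            continue
        free_environments.pop(index)
        return True
    if in_use_environments:
        # In-use environments may make room later; behave as "no space yet".
        return False
    # Nothing evictable and nothing in use: fail instead of deadlocking.
    raise K8sEnvironmentCreationError(
        "All free environments failed deleting and none are currently in use."
    )

Unlike the old code, which always popped index 0 and slept for three seconds regardless of whether the delete succeeded, a failed delete now leaves the environment in the free list, so the configured pod maximum cannot be exceeded silently.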

plugins/actuators/vllm_performance/ado_actuators/vllm_performance/experiment_executor.py

Lines changed: 18 additions & 19 deletions
@@ -16,6 +16,10 @@
     EnvironmentManager,
     EnvironmentState,
 )
+from ado_actuators.vllm_performance.k8s import (
+    K8sConnectionError,
+    K8sEnvironmentCreationError,
+)
 from ado_actuators.vllm_performance.k8s.create_environment import (
     create_test_environment,
 )
@@ -41,14 +45,6 @@
 logger = logging.getLogger(__name__)


-class K8EnvironmentCreationError(Exception):
-    """Error raised when K8 environment cannot be created for some reason"""
-
-
-class K8ConnectionError(Exception):
-    """Error raised when there is an issue connecting to K8s or a service its hosting"""
-
-
 def _build_entity_env(values: dict[str, str]) -> str:
     """
     This is the list of entity parameters that define the environment:
@@ -109,7 +105,7 @@ def _create_environment(
     :param timeout: timeout
     :return: kubernetes environment name

-    :raises K8EnvironmentCreationError if there was an issue
+    :raises K8sEnvironmentCreationError if there was an issue
     - If the creation step fails after three attempts
     - If after creation the environment was not in ready state after timeout seconds (1200 default)

@@ -135,9 +131,12 @@ def _create_environment(
     )
     while True:

-        env: Environment = ray.get(
-            env_manager.get_environment.remote(model=model, definition=definition)
-        )
+        try:
+            env: Environment = ray.get(
+                env_manager.get_environment.remote(model=model, definition=definition)
+            )
+        except Exception as e:
+            raise e
         if env is not None:
             console.put.remote(
                 message=RichConsoleSpinnerMessage(
@@ -255,7 +254,7 @@ def _create_environment(
             )
         )

-        raise K8EnvironmentCreationError(
+        raise K8sEnvironmentCreationError(
             f"Failed to create test environment {env.k8s_name}: {error}"
         )

@@ -315,12 +314,12 @@ def _connect_to_vllm_server(
         time.sleep(5)
         # Check if there is a returncode- if there is it means port-forward exited
         if pf.returncode:
-            raise K8ConnectionError(
+            raise K8sConnectionError(
                 f"failed to start port forward to service {k8s_name} - port-forward command exited for unknown reason. Check logs."
             )
     except Exception as e:
         logger.warning(f"failed to start port forward to service {k8s_name} - {e}")
-        raise K8ConnectionError(
+        raise K8sConnectionError(
             f"failed to start port forward to service {k8s_name} - {e}"
         )

@@ -379,7 +378,7 @@ def run_resource_and_workload_experiment(

         logger.info(f"Creating K8s environment for {entity.identifier}")

-        # Will raise an K8EnvironmentCreationError if the environment could not be created
+        # Will raise a K8sEnvironmentCreationError if the environment could not be created
         k8s_name, definition = _create_environment(
             values=values,
             actuator=actuator_parameters,
@@ -388,7 +387,7 @@ def run_resource_and_workload_experiment(
             request_id=request.requestid,
         )

-        # Will raise an K8ConnectionError if a port-forward was required
+        # Will raise a K8sConnectionError if a port-forward was required
         # but could not be created
         current_port += 1
         base_url, port_forward = _connect_to_vllm_server(
@@ -428,8 +427,8 @@ def run_resource_and_workload_experiment(
         )

     except (
-        K8EnvironmentCreationError,
-        K8ConnectionError,
+        K8sEnvironmentCreationError,
+        K8sConnectionError,
         VLLMBenchmarkError,
     ) as error:
         logger.error(f"Error running tests for entity {entity.identifier}: {error}")
