33
44import asyncio
55import logging
6- import time
76from enum import Enum
87
98import ray
109from ado_actuators .vllm_performance .deployment_management import (
1110 DeploymentConflictManager ,
1211)
12+ from ado_actuators .vllm_performance .k8s import K8sEnvironmentCreationError
1313from ado_actuators .vllm_performance .k8s .manage_components import (
1414 ComponentsManager ,
1515)
@@ -105,18 +105,18 @@ def __init__(
105105 pvc_template = pvc_template ,
106106 )
107107
108- def _wipe_deployment (self , identifier : str ) -> None :
108+ def _delete_environment_k8s_resources (self , k8s_name : str ) -> None :
109109 """
110110 Deletes a deployment. Intended to be used for cleanup or error recovery
111111 param: identifier: the deployment identifier
112112 """
113113 try :
114- self .manager .delete_service (k8s_name = identifier )
114+ self .manager .delete_service (k8s_name = k8s_name )
115115 except ApiException as e :
116116 if e .reason != "Not Found" :
117117 raise e
118118 try :
119- self .manager .delete_deployment (k8s_name = identifier )
119+ self .manager .delete_deployment (k8s_name = k8s_name )
120120 except ApiException as e :
121121 if e .reason != "Not Found" :
122122 raise e
@@ -138,9 +138,6 @@ def get_environment(self, model: str, definition: str) -> Environment | None:
138138 :param increment_usage: increment usage flag
139139 :return: environment state
140140 """
141- print (
142- f"getting environment for model { model } , currently { self .active_environments } deployments"
143- )
144141
145142 # check if there's an existing free environment satisfying the request
146143 env = self .get_matching_free_environment (definition )
@@ -153,24 +150,54 @@ def get_environment(self, model: str, definition: str) -> Environment | None:
153150 f"There are already { self .max_concurrent } actively in use, and I can't create a new one"
154151 )
155152 return None
156- # There are unused environments, let's evict one
157-
158- # Gets the oldest env in the list
159- environment_to_evict = self .free_environments [0 ]
160- try :
161- self .manager .delete_service (k8s_name = environment_to_evict .k8s_name )
162- self .manager .delete_deployment (
163- k8s_name = environment_to_evict .k8s_name
164- )
153+
154+ # There are unused environments, let's try to evict one
155+ environment_evicted = False
156+ eviction_index = 0
157+ # Continue looping until we find one environment that can be successfully evicted or we have gone through them all
158+ while not environment_evicted and eviction_index < len (
159+ self .free_environments
160+ ):
161+ environment_to_evict = self .free_environments [eviction_index ]
162+ try :
163+                # _delete_environment_k8s_resources will not raise an error if, for whatever reason, the service
164+                # or the deployment we are trying to delete no longer exists; in that case we assume
165+                # the deployment was properly deleted.
166+ self ._delete_environment_k8s_resources (
167+ k8s_name = environment_to_evict .k8s_name
168+ )
169+ except ApiException as e :
170+ # If we can't delete this environment we try with the next one, but we do not
171+ # delete the current env from the free list. This is to avoid spawning more pods than the maximum configured
172+ # in the case the failing ones are still running.
173+ # Since the current eviction candidate environment will stay in the free ones, some other measurement might
174+ # try to evict again and perhaps succeed (e.g., connection restored to the cluster).
175+ logger .critical (
176+ f"Error deleting deployment or service { environment_to_evict .k8s_name } : { e } "
177+ )
178+ eviction_index += 1
179+ continue
180+
165181 logger .info (
166182 f"deleted environment { environment_to_evict .k8s_name } . "
167183 f"Active environments { self .active_environments } "
168184 )
169- except ApiException as e :
170- logger .error (f"Error deleting deployment or service { e } " )
171- # If all the Kubernetes resources got deleted, let's remove this environment from our records
172- self .free_environments .pop (0 )
173- time .sleep (3 )
185+ environment_evicted = True
186+
187+ if environment_evicted :
188+ # successfully deleted an environment
189+ self .free_environments .pop (eviction_index )
190+ elif len (self .in_use_environments ) > 0 :
191+                # all the free environments failed to be deleted, but one or more are in use and
192+                # might make room for waiting measurements. In this case we just behave as if there
193+                # are no free environments available and we wait.
194+ return None
195+ else :
196+ # None of the free environments could be evicted due to errors and none are in use
197+ # To avoid a deadlock of the operation we fail the measurement
198+ raise K8sEnvironmentCreationError (
199+ "All free environments failed deleting and none are currently in use."
200+ )
174201
175202 # We either made space or we had enough space already
176203 env = Environment (model = model , configuration = definition )
@@ -211,7 +238,7 @@ def done_creating(self, identifier: str) -> None:
211238
212239 def cleanup_failed_deployment (self , identifier : str ) -> None :
213240 env = self .in_use_environments [identifier ]
214- self ._wipe_deployment ( identifier = identifier )
241+ self ._delete_environment_k8s_resources ( k8s_name = identifier )
215242 self .done_using (identifier = identifier , reclaim_on_completion = True )
216243 self .deployment_conflict_manager .signal (
217244 k8s_name = identifier , model = env .model , error = True
@@ -259,7 +286,7 @@ def cleanup(self) -> None:
259286 logger .info ("Cleaning environments" )
260287 all_envs = list (self .in_use_environments .values ()) + self .free_environments
261288 for env in all_envs :
262- self ._wipe_deployment ( identifier = env .k8s_name )
289+ self ._delete_environment_k8s_resources ( k8s_name = env .k8s_name )
263290
264291 # We only delete the PVC if it was created by this actuator
265292 if self .manager .pvc_created :
0 commit comments