From 9490c78db7c18aeed7e29c58642afef640ff0395 Mon Sep 17 00:00:00 2001
From: liyonghua0910
Date: Mon, 29 Sep 2025 20:17:37 +0800
Subject: [PATCH 1/2] [fix] fix requests & block metrics

---
 .../cache_manager/prefix_cache_manager.py |  1 +
 .../engine/sched/resource_manager_v1.py   | 19 +++++++++----------
 2 files changed, 10 insertions(+), 10 deletions(-)

diff --git a/fastdeploy/cache_manager/prefix_cache_manager.py b/fastdeploy/cache_manager/prefix_cache_manager.py
index 5c3c4a63b5..9ca550b1eb 100644
--- a/fastdeploy/cache_manager/prefix_cache_manager.py
+++ b/fastdeploy/cache_manager/prefix_cache_manager.py
@@ -115,6 +115,7 @@ def __init__(
 
         main_process_metrics.max_gpu_block_num.set(self.num_gpu_blocks)
         main_process_metrics.available_gpu_block_num.set(self.num_gpu_blocks)
+        main_process_metrics.free_gpu_block_num.set(self.num_gpu_blocks)
         main_process_metrics.available_gpu_resource.set(1.0)
 
     @property
diff --git a/fastdeploy/engine/sched/resource_manager_v1.py b/fastdeploy/engine/sched/resource_manager_v1.py
index fc57df0f3c..451653139b 100644
--- a/fastdeploy/engine/sched/resource_manager_v1.py
+++ b/fastdeploy/engine/sched/resource_manager_v1.py
@@ -123,8 +123,6 @@ def _trigger_preempt(self, request, num_new_blocks, preempted_reqs, scheduled_re
                 llm_logger.info(f"Preemption is triggered! Preempted request id: {preempted_req.request_id}")
                 preempted_reqs.append(preempted_req)
                 scheduled_reqs.append(self._prepare_preempt_task(preempted_req))
-                main_process_metrics.num_requests_waiting.inc(1)
-                main_process_metrics.num_requests_running.dec(1)
                 if preempted_req == request:
                     # No more request to preempt.
                     can_schedule = False
@@ -381,8 +379,6 @@ def schedule(self):
                         request, self.config.cache_config.block_size, request.num_computed_tokens
                     )
                     request.status = RequestStatus.RUNNING
-                    main_process_metrics.num_requests_waiting.dec(1)
-                    main_process_metrics.num_requests_running.inc(1)
                     allocated_position = self.get_available_position()
                     request.idx = allocated_position
                     self.tasks_list[allocated_position] = request
@@ -426,8 +422,6 @@ def schedule(self):
                         request, self.config.cache_config.block_size, request.num_computed_tokens
                     )
                     request.status = RequestStatus.RUNNING
-                    main_process_metrics.num_requests_waiting.dec(1)
-                    main_process_metrics.num_requests_running.inc(1)
                 else:
                     if self.config.cache_config.enable_prefix_caching:
                         self._free_blocks(request)
@@ -435,11 +429,16 @@ def schedule(self):
             else:
                 llm_logger.error("Unknown request status type")
         if scheduled_reqs:
-            task_used_block_num = sum([len(task.block_tables) if task else 0 for task in self.tasks_list])
-            main_process_metrics.available_gpu_block_num.set(self.total_block_number() - task_used_block_num)
-            main_process_metrics.batch_size.set(self.max_num_seqs - self.available_batch())
-            main_process_metrics.gpu_cache_usage_perc.set(self.get_gpu_cache_usage_perc())
             llm_logger.debug(f"schedued_reqs: {scheduled_reqs}")
+
+        num_tasks = sum([1 if task else 0 for task in self.tasks_list])
+        num_blocks_used_by_tasks = sum([len(task.block_tables) if task else 0 for task in self.tasks_list])
+        main_process_metrics.available_gpu_block_num.set(self.total_block_number() - num_blocks_used_by_tasks)
+        main_process_metrics.batch_size.set(self.max_num_seqs - self.available_batch())
+        main_process_metrics.gpu_cache_usage_perc.set(self.get_gpu_cache_usage_perc())
+        main_process_metrics.num_requests_running.set(len(self.running))
+        main_process_metrics.num_requests_waiting.set(num_tasks - len(self.running))
+
         return scheduled_reqs
 
     def get_available_position(self) -> int:
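[Editor's note] Patch 1 changes the accounting model for the request gauges: instead of pairing inc()/dec() calls on every state transition (which drifts permanently if any transition path, such as preemption, misses its counterpart), schedule() now recomputes both gauges from scheduler state on each pass. Below is a minimal sketch of this derived-from-state pattern, assuming prometheus_client-style Gauge semantics; the stand-in data structures are illustrative, not FastDeploy's:

    from prometheus_client import Gauge

    # Illustrative gauges; FastDeploy registers the real ones via its MetricsManager.
    num_requests_running = Gauge("num_requests_running", "Requests currently scheduled")
    num_requests_waiting = Gauge("num_requests_waiting", "Requests admitted but not yet scheduled")

    # Stand-ins for self.tasks_list (fixed-size slot table, None = free slot)
    # and self.running (requests currently holding a slot).
    tasks_list = ["req-0", "req-1", None, "req-2"]
    running = ["req-0", "req-1"]

    # Derive both gauges from the single source of truth, as schedule() now does:
    num_tasks = sum(1 if task else 0 for task in tasks_list)
    num_requests_running.set(len(running))
    num_requests_waiting.set(num_tasks - len(running))

The advantage of set() here is idempotence: a missed update on one pass is corrected on the next, whereas a missed inc()/dec() skews the gauge forever.
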
From d49f7ce177e23fb6594d25b86f6c8132f214faf5 Mon Sep 17 00:00:00 2001
From: liyonghua0910
Date: Tue, 30 Sep 2025 10:57:28 +0800
Subject: [PATCH 2/2] [chore] rename variables

---
 fastdeploy/cache_manager/prefix_cache_manager.py | 1 +
 fastdeploy/engine/resource_manager.py            | 4 ++--
 fastdeploy/engine/sched/resource_manager_v1.py   | 1 +
 fastdeploy/metrics/metrics.py                    | 2 +-
 fastdeploy/output/token_processor.py             | 7 +++++--
 5 files changed, 10 insertions(+), 5 deletions(-)

diff --git a/fastdeploy/cache_manager/prefix_cache_manager.py b/fastdeploy/cache_manager/prefix_cache_manager.py
index 9ca550b1eb..a8a45a655b 100644
--- a/fastdeploy/cache_manager/prefix_cache_manager.py
+++ b/fastdeploy/cache_manager/prefix_cache_manager.py
@@ -275,6 +275,7 @@ def update_cache_config(self, cache_config):
 
         main_process_metrics.max_gpu_block_num.set(self.num_gpu_blocks)
         main_process_metrics.available_gpu_block_num.set(self.num_gpu_blocks)
+        main_process_metrics.free_gpu_block_num.set(self.num_gpu_blocks)
         main_process_metrics.available_gpu_resource.set(1.0)
 
     def can_allocate_gpu_blocks(self, num_blocks: int):
diff --git a/fastdeploy/engine/resource_manager.py b/fastdeploy/engine/resource_manager.py
index ef6190fc1b..dc0f5a5016 100644
--- a/fastdeploy/engine/resource_manager.py
+++ b/fastdeploy/engine/resource_manager.py
@@ -311,8 +311,8 @@ def allocate_resources_for_new_tasks(self, tasks):
                 break
 
         # record batch size here
-        task_used_block_num = sum([len(task.block_tables) if task else 0 for task in self.tasks_list])
-        main_process_metrics.available_gpu_block_num.set(self.total_block_number() - task_used_block_num)
+        num_blocks_used_by_tasks = sum([len(task.block_tables) if task else 0 for task in self.tasks_list])
+        main_process_metrics.available_gpu_block_num.set(self.total_block_number() - num_blocks_used_by_tasks)
         main_process_metrics.batch_size.set(self.max_num_seqs - self.available_batch())
         main_process_metrics.gpu_cache_usage_perc.set(self.get_gpu_cache_usage_perc())
 
diff --git a/fastdeploy/engine/sched/resource_manager_v1.py b/fastdeploy/engine/sched/resource_manager_v1.py
index 451653139b..9d32bfa978 100644
--- a/fastdeploy/engine/sched/resource_manager_v1.py
+++ b/fastdeploy/engine/sched/resource_manager_v1.py
@@ -431,6 +431,7 @@ def schedule(self):
         if scheduled_reqs:
             llm_logger.debug(f"schedued_reqs: {scheduled_reqs}")
 
+        # Update metrics
         num_tasks = sum([1 if task else 0 for task in self.tasks_list])
         num_blocks_used_by_tasks = sum([len(task.block_tables) if task else 0 for task in self.tasks_list])
         main_process_metrics.available_gpu_block_num.set(self.total_block_number() - num_blocks_used_by_tasks)
diff --git a/fastdeploy/metrics/metrics.py b/fastdeploy/metrics/metrics.py
index ca8b6b3919..c1982e0a87 100644
--- a/fastdeploy/metrics/metrics.py
+++ b/fastdeploy/metrics/metrics.py
@@ -311,7 +311,7 @@ class MetricsManager:
         "available_gpu_block_num": {
             "type": Gauge,
             "name": "fastdeploy:available_gpu_block_num",
-            "description": "Number of available gpu blocks in cache, including prefix caching blocks that are not officially released",
+            "description": "Number of available gpu blocks in cache, including blocks in LRU list",
             "kwargs": {},
         },
         "free_gpu_block_num": {
diff --git a/fastdeploy/output/token_processor.py b/fastdeploy/output/token_processor.py
index 0851dee638..6dde9c3ccf 100644
--- a/fastdeploy/output/token_processor.py
+++ b/fastdeploy/output/token_processor.py
@@ -248,9 +248,12 @@ def _recycle_resources(self, task_id, index, task, result=None, is_prefill=False
         self.resource_manager.tasks_list[index] = None
         self.resource_manager._recycle_block_tables(task)
 
-        task_used_block_num = sum([len(task.block_tables) if task else 0 for task in self.resource_manager.tasks_list])
+        # Update block metrics
+        num_blocks_used_by_tasks = sum(
+            [len(task.block_tables) if task else 0 for task in self.resource_manager.tasks_list]
+        )
         main_process_metrics.available_gpu_block_num.set(
-            self.resource_manager.total_block_number() - task_used_block_num
+            self.resource_manager.total_block_number() - num_blocks_used_by_tasks
         )
         main_process_metrics.batch_size.set(
             self.resource_manager.max_num_seqs - self.resource_manager.available_batch()
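
[Editor's note] Patch 2 is a rename-and-comment pass: task_used_block_num becomes num_blocks_used_by_tasks at every site that computes the same sum. The block-accounting invariant both patches maintain is sketched below with illustrative types; Task and the helper are assumptions for this example, not the FastDeploy API:

    from dataclasses import dataclass, field
    from typing import List, Optional

    @dataclass
    class Task:
        # Each live task pins one KV-cache block per entry in its block table.
        block_tables: List[int] = field(default_factory=list)

    def available_gpu_blocks(total_blocks: int, tasks_list: List[Optional[Task]]) -> int:
        # Mirrors: total_block_number() - num_blocks_used_by_tasks
        num_blocks_used_by_tasks = sum(len(t.block_tables) if t else 0 for t in tasks_list)
        return total_blocks - num_blocks_used_by_tasks

    # 100 total blocks, two live tasks holding 3 + 1 blocks, one free slot:
    assert available_gpu_blocks(100, [Task([1, 2, 3]), None, Task([4])]) == 96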