[Bugfix] Fix automatic prefix caching args and add log info (vllm-project#3608)

gty111 · web-flow · commit e67c295b0c81 · 2024-03-25T05:35:22.000-07:00
diff --git a/vllm/core/block_manager.py b/vllm/core/block_manager.py
@@ -9,6 +9,9 @@
 from vllm.sequence import Sequence, SequenceGroup, SequenceStatus
 from vllm.utils import Device
 from vllm.core.evictor import Evictor, EvictionPolicy, make_evictor
+from vllm.logger import init_logger
+
+logger = init_logger(__name__)
 
 
 class BlockAllocatorBase(ABC):
@@ -241,11 +244,13 @@ def __init__(
         self.watermark_blocks = int(watermark * num_gpu_blocks)
 
         if self.enable_caching:
+            logger.info("enable automatic prefix caching")
             self.gpu_allocator = CachedBlockAllocator(Device.GPU, block_size,
                                                       num_gpu_blocks)
             self.cpu_allocator = CachedBlockAllocator(Device.CPU, block_size,
                                                       num_cpu_blocks)
         else:
+            logger.info("disable automatic prefix caching")
             self.gpu_allocator = UncachedBlockAllocator(
                 Device.GPU, block_size, num_gpu_blocks)
             self.cpu_allocator = UncachedBlockAllocator(
diff --git a/vllm/engine/arg_utils.py b/vllm/engine/arg_utils.py
@@ -337,7 +337,8 @@ def create_engine_configs(
         cache_config = CacheConfig(self.block_size,
                                    self.gpu_memory_utilization,
                                    self.swap_space, self.kv_cache_dtype,
-                                   model_config.get_sliding_window())
+                                   model_config.get_sliding_window(),
+                                   self.enable_prefix_caching)
         parallel_config = ParallelConfig(
             self.pipeline_parallel_size, self.tensor_parallel_size,
             self.worker_use_ray, self.max_parallel_loading_workers,