Skip to content

Commit 160d423

Browse files
indrajit96 and KrishnanPrash
authored and committed
feat: Add security flag to MM flow in vllm (#4556)
Co-authored-by: KrishnanPrash <[email protected]>
1 parent 5f8e2c0 commit 160d423

File tree

8 files changed

+84
-16
lines changed

8 files changed

+84
-16
lines changed

components/src/dynamo/vllm/args.py

Lines changed: 10 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -62,6 +62,7 @@ class Config:
6262
multimodal_encode_worker: bool = False
6363
multimodal_worker: bool = False
6464
multimodal_decode_worker: bool = False
65+
enable_multimodal: bool = False
6566
multimodal_encode_prefill_worker: bool = False
6667
mm_prompt_template: str = "USER: <image>\n<prompt> ASSISTANT:"
6768
# dump config to file
@@ -161,6 +162,11 @@ def parse_args() -> Config:
161162
action="store_true",
162163
help="Run as unified encode+prefill+decode worker for models requiring integrated image encoding (e.g., Llama 4)",
163164
)
165+
parser.add_argument(
166+
"--enable-multimodal",
167+
action="store_true",
168+
help="Enable multimodal processing. If not set, none of the multimodal components can be used.",
169+
)
164170
parser.add_argument(
165171
"--mm-prompt-template",
166172
type=str,
@@ -218,6 +224,9 @@ def parse_args() -> Config:
218224
"Use only one of --multimodal-processor, --multimodal-encode-worker, --multimodal-worker, --multimodal-decode-worker, or --multimodal-encode-prefill-worker"
219225
)
220226

227+
if mm_flags == 1 and not args.enable_multimodal:
228+
raise ValueError("Use --enable-multimodal to enable multimodal processing")
229+
221230
# Set component and endpoint based on worker type
222231
if args.multimodal_processor:
223232
config.component = "processor"
@@ -256,6 +265,7 @@ def parse_args() -> Config:
256265
config.multimodal_worker = args.multimodal_worker
257266
config.multimodal_decode_worker = args.multimodal_decode_worker
258267
config.multimodal_encode_prefill_worker = args.multimodal_encode_prefill_worker
268+
config.enable_multimodal = args.enable_multimodal
259269
config.mm_prompt_template = args.mm_prompt_template
260270
config.store_kv = args.store_kv
261271

components/src/dynamo/vllm/handlers.py

Lines changed: 46 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -65,14 +65,25 @@ class BaseWorkerHandler(ABC):
6565
Request handler for the generate and clear_kv_blocks endpoints.
6666
"""
6767

68-
def __init__(self, runtime, component, engine, default_sampling_params):
68+
def __init__(
69+
self,
70+
runtime,
71+
component,
72+
engine,
73+
default_sampling_params,
74+
model_max_len: int | None = None,
75+
enable_multimodal: bool = False,
76+
):
6977
self.runtime = runtime
7078
self.component = component
7179
self.engine_client = engine
7280
self.default_sampling_params = default_sampling_params
7381
self.kv_publishers: list[ZmqKvEventPublisher] | None = None
7482
self.engine_monitor = VllmEngineMonitor(runtime, engine)
7583
self.image_loader = ImageLoader()
84+
self.temp_dirs: list[tempfile.TemporaryDirectory] = []
85+
self.model_max_len = model_max_len
86+
self.enable_multimodal = enable_multimodal
7687

7788
@abstractmethod
7889
async def generate(self, request, context) -> AsyncGenerator[dict, None]:
@@ -128,6 +139,13 @@ async def _extract_multimodal_data(
128139
if "multi_modal_data" not in request or request["multi_modal_data"] is None:
129140
return None
130141

142+
# Security check: reject multimodal data if not explicitly enabled
143+
if not self.enable_multimodal:
144+
raise ValueError(
145+
"Received multimodal data but multimodal processing is not enabled. "
146+
"Use --enable-multimodal flag to enable multimodal processing."
147+
)
148+
131149
mm_map = request["multi_modal_data"]
132150
vllm_mm_data = {}
133151

@@ -212,8 +230,17 @@ def __init__(
212230
component,
213231
engine,
214232
default_sampling_params,
233+
model_max_len: int | None = None,
234+
enable_multimodal: bool = False,
215235
):
216-
super().__init__(runtime, component, engine, default_sampling_params)
236+
super().__init__(
237+
runtime,
238+
component,
239+
engine,
240+
default_sampling_params,
241+
model_max_len,
242+
enable_multimodal,
243+
)
217244

218245
async def generate(self, request, context):
219246
# Use context ID for request tracking and correlation
@@ -259,8 +286,23 @@ async def generate(self, request, context):
259286

260287

261288
class PrefillWorkerHandler(BaseWorkerHandler):
262-
def __init__(self, runtime, component, engine, default_sampling_params):
263-
super().__init__(runtime, component, engine, default_sampling_params)
289+
def __init__(
290+
self,
291+
runtime,
292+
component,
293+
engine,
294+
default_sampling_params,
295+
model_max_len: int | None = None,
296+
enable_multimodal: bool = False,
297+
):
298+
super().__init__(
299+
runtime,
300+
component,
301+
engine,
302+
default_sampling_params,
303+
model_max_len,
304+
enable_multimodal,
305+
)
264306

265307
async def generate(self, request, context):
266308
# Use context ID for request tracking and correlation with decode phase

components/src/dynamo/vllm/multimodal_handlers/worker_handler.py

Lines changed: 14 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -38,7 +38,13 @@ def __init__(
3838
)
3939

4040
# Call BaseWorkerHandler.__init__ with proper parameters
41-
super().__init__(runtime, component, engine_client, default_sampling_params)
41+
super().__init__(
42+
runtime,
43+
component,
44+
engine_client,
45+
default_sampling_params,
46+
enable_multimodal=config.enable_multimodal,
47+
)
4248

4349
self.config = config
4450
self.enable_disagg = config.is_prefill_worker
@@ -98,7 +104,13 @@ def __init__(
98104
)
99105

100106
# Call BaseWorkerHandler.__init__ with proper parameters
101-
super().__init__(runtime, component, engine_client, default_sampling_params)
107+
super().__init__(
108+
runtime,
109+
component,
110+
engine_client,
111+
default_sampling_params,
112+
enable_multimodal=config.enable_multimodal,
113+
)
102114

103115
self.config = config
104116
self.decode_worker_client = decode_worker_client

docs/backends/vllm/multimodal.md

Lines changed: 4 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -22,6 +22,10 @@ Dynamo supports multimodal models with vLLM v1. In general, multimodal models ca
2222
> [!WARNING]
2323
> **LLaVA Model Limitation**: Do not use LLaVA models (e.g., `llava-hf/llava-1.5-7b-hf`) with the standard aggregated serving setup, as they contain keywords that Dynamo cannot yet parse. LLaVA models can still be used with the EPD (Encode-Prefill-Decode) setup described below.
2424
25+
> [!IMPORTANT]
26+
> **Security Requirement**: All multimodal workers require the `--enable-multimodal` flag to be explicitly set at startup. This is a security feature to prevent unintended processing of multimodal data from untrusted sources. Workers will fail at startup if multimodal flags (e.g., `--multimodal-worker`, `--multimodal-processor`) are used without `--enable-multimodal`.
27+
This flag is analogous to `--enable-mm-embeds` in `vllm serve`, but also extends it to all multimodal content (URL, embeddings, base64).
28+
2529
# Multimodal EPD Deployment Examples
2630

2731
This section provides example workflows and reference implementations for deploying a multimodal model using Dynamo and vLLM v1 with EPD(Encode-Prefill-Decode) pipeline.

examples/backends/vllm/launch/agg_multimodal.sh

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -53,7 +53,7 @@ fi
5353
# --enforce-eager: Quick deployment (remove for production)
5454
# --connector none: No KV transfer needed for aggregated serving
5555
DYN_SYSTEM_PORT=8081 \
56-
python -m dynamo.vllm --model $MODEL_NAME --enforce-eager --connector none $EXTRA_ARGS
56+
python -m dynamo.vllm --enable-multimodal --model $MODEL_NAME --enforce-eager --connector none $EXTRA_ARGS
5757

5858
# Wait for all background processes to complete
5959
wait

examples/backends/vllm/launch/agg_multimodal_epd.sh

Lines changed: 3 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -73,11 +73,11 @@ if [[ "$MODEL_NAME" == "Qwen/Qwen2.5-VL-7B-Instruct" ]]; then
7373
fi
7474

7575
# Start processor (Python-based preprocessing, handles prompt templating)
76-
python -m dynamo.vllm --multimodal-processor --model $MODEL_NAME --mm-prompt-template "$PROMPT_TEMPLATE" &
76+
python -m dynamo.vllm --multimodal-processor --enable-multimodal --model $MODEL_NAME --mm-prompt-template "$PROMPT_TEMPLATE" &
7777

7878
# run E/P/D workers
79-
CUDA_VISIBLE_DEVICES=0 python -m dynamo.vllm --multimodal-encode-worker --model $MODEL_NAME &
80-
CUDA_VISIBLE_DEVICES=1 python -m dynamo.vllm --multimodal-worker --model $MODEL_NAME $EXTRA_ARGS &
79+
CUDA_VISIBLE_DEVICES=0 python -m dynamo.vllm --multimodal-encode-worker --enable-multimodal --model $MODEL_NAME &
80+
CUDA_VISIBLE_DEVICES=1 python -m dynamo.vllm --multimodal-worker --enable-multimodal --model $MODEL_NAME $EXTRA_ARGS &
8181

8282
# Wait for all background processes to complete
8383
wait

examples/backends/vllm/launch/agg_multimodal_llama.sh

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -11,10 +11,10 @@ MODEL_NAME="meta-llama/Llama-4-Maverick-17B-128E-Instruct-FP8"
1111
python -m dynamo.frontend --http-port=8000 &
1212

1313
# run processor
14-
python -m dynamo.vllm --multimodal-processor --model $MODEL_NAME --mm-prompt-template "<|image|>\n<prompt>" &
14+
python -m dynamo.vllm --multimodal-processor --enable-multimodal --model $MODEL_NAME --mm-prompt-template "<|image|>\n<prompt>" &
1515
# Llama 4 doesn't support image embedding input, so use encode+prefill worker
1616
# that handles image encoding inline
17-
python -m dynamo.vllm --multimodal-encode-prefill-worker --model $MODEL_NAME --tensor-parallel-size=8 --max-model-len=208960 --gpu-memory-utilization 0.80 &
17+
python -m dynamo.vllm --multimodal-encode-prefill-worker --enable-multimodal --model $MODEL_NAME --tensor-parallel-size=8 --max-model-len=208960 --gpu-memory-utilization 0.80 &
1818

1919
# Wait for all background processes to complete
2020
wait

examples/backends/vllm/launch/disagg_multimodal_epd.sh

Lines changed: 4 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -76,7 +76,7 @@ python -m dynamo.frontend --http-port=8000 &
7676

7777
# Start processor
7878
echo "Starting processor..."
79-
python -m dynamo.vllm --multimodal-processor --model $MODEL_NAME --mm-prompt-template "$PROMPT_TEMPLATE" &
79+
python -m dynamo.vllm --multimodal-processor --enable-multimodal --model $MODEL_NAME --mm-prompt-template "$PROMPT_TEMPLATE" &
8080

8181
# Configure GPU memory optimization for specific models
8282
EXTRA_ARGS=""
@@ -86,17 +86,17 @@ fi
8686

8787
# Start encode worker
8888
echo "Starting encode worker on GPU 1..."
89-
VLLM_NIXL_SIDE_CHANNEL_PORT=20097 CUDA_VISIBLE_DEVICES=1 python -m dynamo.vllm --multimodal-encode-worker --model $MODEL_NAME $EXTRA_ARGS --kv-events-config '{"publisher":"zmq","topic":"kv-events","endpoint":"tcp://*:20080"}' &
89+
VLLM_NIXL_SIDE_CHANNEL_PORT=20097 CUDA_VISIBLE_DEVICES=1 python -m dynamo.vllm --multimodal-encode-worker --enable-multimodal --model $MODEL_NAME $EXTRA_ARGS --kv-events-config '{"publisher":"zmq","topic":"kv-events","endpoint":"tcp://*:20080"}' &
9090

9191
# Start prefill worker
9292
echo "Starting prefill worker on GPU 2..."
9393
VLLM_NIXL_SIDE_CHANNEL_PORT=20098 \
94-
CUDA_VISIBLE_DEVICES=2 python -m dynamo.vllm --multimodal-worker --is-prefill-worker --model $MODEL_NAME $EXTRA_ARGS --kv-events-config '{"publisher":"zmq","topic":"kv-events","endpoint":"tcp://*:20081"}' &
94+
CUDA_VISIBLE_DEVICES=2 python -m dynamo.vllm --multimodal-worker --is-prefill-worker --enable-multimodal --model $MODEL_NAME $EXTRA_ARGS --kv-events-config '{"publisher":"zmq","topic":"kv-events","endpoint":"tcp://*:20081"}' &
9595

9696
# Start decode worker
9797
echo "Starting decode worker on GPU 3..."
9898
VLLM_NIXL_SIDE_CHANNEL_PORT=20099 \
99-
CUDA_VISIBLE_DEVICES=3 python -m dynamo.vllm --multimodal-decode-worker --model $MODEL_NAME $EXTRA_ARGS --kv-events-config '{"publisher":"zmq","topic":"kv-events","endpoint":"tcp://*:20082"}' &
99+
CUDA_VISIBLE_DEVICES=3 python -m dynamo.vllm --multimodal-decode-worker --enable-multimodal --model $MODEL_NAME $EXTRA_ARGS --kv-events-config '{"publisher":"zmq","topic":"kv-events","endpoint":"tcp://*:20082"}' &
100100

101101
echo "=================================================="
102102
echo "All components started. Waiting for initialization..."

0 commit comments

Comments
 (0)