feat: add SGLang and vLLM passthrough metrics on Dynamo backend worker #3539
@@ -0,0 +1,99 @@
# SGLang Prometheus Metrics

**📚 Official Documentation**: [SGLang Production Metrics](https://docs.sglang.ai/references/production_metrics.html)

This document describes how SGLang Prometheus metrics are exposed in Dynamo.

## Overview

When running SGLang through Dynamo, SGLang engine metrics are automatically passed through and exposed on Dynamo's `/metrics` endpoint (default port 8081). This allows you to access both SGLang engine metrics (prefixed with `sglang:`) and Dynamo runtime metrics (prefixed with `dynamo_*`) from a single worker backend endpoint.

For the complete and authoritative list of all SGLang metrics, always refer to the official documentation linked above.

Dynamo runtime metrics are documented in [docs/guides/metrics.md](../../../docs/guides/metrics.md).

## Metric Reference

The official documentation includes:
- Complete metric definitions with HELP and TYPE descriptions
- Example metric output in Prometheus exposition format
- Counter, Gauge, and Histogram metrics
- Metric labels (e.g., `model_name`, `engine_type`, `tp_rank`, `pp_rank`)
- A setup guide for Prometheus + Grafana monitoring
- Troubleshooting tips and configuration examples

## Metric Categories

SGLang provides metrics in the following categories (all prefixed with `sglang:`):
- Throughput metrics
- Resource usage
- Latency metrics
- Disaggregation metrics (when enabled)

**Note:** Specific metrics are subject to change between SGLang versions. Always refer to the [official documentation](https://docs.sglang.ai/references/production_metrics.html) or inspect the `/metrics` endpoint for your SGLang version.

## Enabling Metrics in Dynamo

SGLang metrics are automatically exposed when running SGLang through Dynamo with metrics enabled.

## Inspecting Metrics

To see the actual metrics available in your SGLang version:

### 1. Launch SGLang with Metrics Enabled

```bash
# Set environment variables
export DYN_SYSTEM_ENABLED=true
export DYN_SYSTEM_PORT=8081

# Start SGLang worker with metrics enabled
python -m dynamo.sglang --model <model_name> --enable-metrics

# Wait for the engine to initialize
```

Metrics will be available at: `http://localhost:8081/metrics`

### 2. Fetch Metrics via curl

```bash
curl http://localhost:8081/metrics | grep "^sglang:"
```

### 3. Example Output

**Note:** The specific metrics shown below are examples and may vary depending on your SGLang version. Always inspect your actual `/metrics` endpoint for the current list.

```
# HELP sglang:prompt_tokens_total Number of prefill tokens processed.
# TYPE sglang:prompt_tokens_total counter
sglang:prompt_tokens_total{model_name="meta-llama/Llama-3.1-8B-Instruct"} 8128902.0
# HELP sglang:generation_tokens_total Number of generation tokens processed.
# TYPE sglang:generation_tokens_total counter
sglang:generation_tokens_total{model_name="meta-llama/Llama-3.1-8B-Instruct"} 7557572.0
# HELP sglang:cache_hit_rate The cache hit rate
# TYPE sglang:cache_hit_rate gauge
sglang:cache_hit_rate{model_name="meta-llama/Llama-3.1-8B-Instruct"} 0.0075
```
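As a rough illustration, sample lines in the output above can be picked apart with just the standard library. This is a simplified sketch that handles only plain sample lines (not a full exposition-format parser; the `EXPFMT` string is copied from the example above):

```python
import re

# Sample scrape output, taken from the example above
EXPFMT = """\
# HELP sglang:prompt_tokens_total Number of prefill tokens processed.
# TYPE sglang:prompt_tokens_total counter
sglang:prompt_tokens_total{model_name="meta-llama/Llama-3.1-8B-Instruct"} 8128902.0
# HELP sglang:generation_tokens_total Number of generation tokens processed.
# TYPE sglang:generation_tokens_total counter
sglang:generation_tokens_total{model_name="meta-llama/Llama-3.1-8B-Instruct"} 7557572.0
"""

def parse_samples(text: str) -> dict[str, float]:
    """Map metric name -> value, ignoring HELP/TYPE comments and labels."""
    samples = {}
    for line in text.splitlines():
        if not line or line.startswith("#"):
            continue
        # metric_name{labels} value  (labels optional)
        m = re.match(r'^([a-zA-Z_:][a-zA-Z0-9_:]*)(?:\{[^}]*\})?\s+(\S+)$', line)
        if m:
            samples[m.group(1)] = float(m.group(2))
    return samples

print(parse_samples(EXPFMT)["sglang:prompt_tokens_total"])  # 8128902.0
```

For anything beyond quick inspection, prefer a real exposition-format parser or Prometheus itself.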

## Implementation Details

- SGLang uses multiprocess metrics collection via `prometheus_client.multiprocess.MultiProcessCollector`
- Metrics are filtered by the `sglang:` prefix before being exposed
- The integration uses Dynamo's `register_engine_metrics_callback()` function
- Metrics appear after SGLang engine initialization completes
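The multiprocess collection mentioned above can be sketched roughly as follows. This is a minimal standalone sketch, not the Dynamo integration itself; it assumes `prometheus_client` is installed and that `PROMETHEUS_MULTIPROC_DIR` points at a writable directory before any metrics are created:

```python
import os
import tempfile

# PROMETHEUS_MULTIPROC_DIR must be set before processes create metrics;
# each process writes its values to memory-mapped files in this directory.
os.environ.setdefault("PROMETHEUS_MULTIPROC_DIR", tempfile.mkdtemp())

from prometheus_client import CollectorRegistry, generate_latest, multiprocess

# A dedicated registry whose collector aggregates the per-process files
registry = CollectorRegistry()
multiprocess.MultiProcessCollector(registry)

# Render whatever has been recorded so far in Prometheus text exposition format
print(generate_latest(registry).decode("utf-8"))
```

With a freshly created directory the output is empty; once worker processes record metrics, a scrape of this registry aggregates them all.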

## See Also

### SGLang Metrics
- [Official SGLang Production Metrics](https://docs.sglang.ai/references/production_metrics.html)
- [SGLang GitHub - Metrics Collector](https://github.com/sgl-project/sglang/blob/main/python/sglang/srt/metrics/collector.py)

### Dynamo Metrics
- **Dynamo Metrics Guide**: See `docs/guides/metrics.md` for complete documentation on Dynamo runtime metrics
- **Dynamo Runtime Metrics**: Metrics prefixed with `dynamo_*` for runtime, components, endpoints, and namespaces
  - Implementation: `lib/runtime/src/metrics.rs` (Rust runtime metrics)
  - Metric names: `lib/runtime/src/metrics/prometheus_names.rs` (metric name constants)
  - Available at the same `/metrics` endpoint alongside SGLang metrics
- **Integration Code**: `components/src/dynamo/common/utils/prometheus.py` - Prometheus utilities and callback registration
@@ -0,0 +1,104 @@
# vLLM Prometheus Metrics

**📚 Official Documentation**: [vLLM Metrics Design](https://docs.vllm.ai/en/latest/design/metrics.html)

This document describes how vLLM Prometheus metrics are exposed in Dynamo.

## Overview

When running vLLM through Dynamo, vLLM engine metrics are automatically passed through and exposed on Dynamo's `/metrics` endpoint (default port 8081). This allows you to access both vLLM engine metrics (prefixed with `vllm:`) and Dynamo runtime metrics (prefixed with `dynamo_*`) from a single worker backend endpoint.

For the complete and authoritative list of all vLLM metrics, always refer to the official documentation linked above.

Dynamo runtime metrics are documented in [docs/guides/metrics.md](../../../docs/guides/metrics.md).

## Metric Reference

The official documentation includes:
- Complete metric definitions with detailed explanations
- Counter, Gauge, and Histogram metrics
- Metric labels (e.g., `model_name`, `finished_reason`, `scheduling_event`)
- Design rationale and implementation details
- Information about the v1 metrics migration
- Future work and deprecated metrics

## Metric Categories

vLLM provides metrics in the following categories (all prefixed with `vllm:`):
- Request metrics
- Performance metrics
- Resource usage
- Scheduler metrics
- Disaggregation metrics (when enabled)

**Note:** Specific metrics are subject to change between vLLM versions. Always refer to the [official documentation](https://docs.vllm.ai/en/latest/design/metrics.html) or inspect the `/metrics` endpoint for your vLLM version.

## Enabling Metrics in Dynamo

vLLM metrics are automatically exposed when running vLLM through Dynamo with metrics enabled.

## Inspecting Metrics

To see the actual metrics available in your vLLM version:

### 1. Launch vLLM with Metrics Enabled
```bash
# Set environment variables
export DYN_SYSTEM_ENABLED=true
export DYN_SYSTEM_PORT=8081

# Start vLLM worker (metrics enabled by default via --disable-log-stats=false)
python -m dynamo.vllm --model <model_name>

# Wait for the engine to initialize
```

> **Review discussion** on `DYN_SYSTEM_ENABLED` / `DYN_SYSTEM_PORT`:
>
> **Q:** Maybe a dumb question, but why do I need to set two flags? Why can't I just set `DYN_SYSTEM_PORT` and assume that if it's set, then enabled is true? If we can infer that, it feels simpler and more intuitive than pasting two flags every time. Said another way: is there a time you would set a port and not set `enabled=true`?
>
> **A:** Not a dumb question, and you bring up a good point. There's usually a universal answer to all these questions: "... because of dumb historic reasons" 😂 Early on, we had a default (see `lib/runtime/src/config.rs`, which defaults to 9090), but weren't sure whether to turn it on by default, so it was always left off (with `DYN_SYSTEM_ENABLED=false` by default). Throughout the examples we then asked users to switch it on. At some point, people wanted to keep changing port numbers (k8s YAML, etc.) because 9090 conflicted with something else, so we started adding examples that override 9090 to 8081 (at the time, 8080 was the frontend). I think now is a GREAT time to clean all this up. @nnshah1 @mohammedabdulwahhab @biswapanda, what do you think? Can we just use the port number to turn this on/off? It would clean up quite a bit. Hopefully it won't be too troublesome in your k8s YAML files.
>
> **Reply:** I think so; separate PR though.

Metrics will be available at: `http://localhost:8081/metrics`

### 2. Fetch Metrics via curl

```bash
curl http://localhost:8081/metrics | grep "^vllm:"
```

### 3. Example Output

**Note:** The specific metrics shown below are examples and may vary depending on your vLLM version. Always inspect your actual `/metrics` endpoint for the current list.

```
# HELP vllm:request_success_total Number of successfully finished requests.
# TYPE vllm:request_success_total counter
vllm:request_success_total{finished_reason="length",model_name="meta-llama/Llama-3.1-8B"} 15.0
vllm:request_success_total{finished_reason="stop",model_name="meta-llama/Llama-3.1-8B"} 150.0
# HELP vllm:time_to_first_token_seconds Histogram of time to first token in seconds.
# TYPE vllm:time_to_first_token_seconds histogram
vllm:time_to_first_token_seconds_bucket{le="0.001",model_name="meta-llama/Llama-3.1-8B"} 0.0
vllm:time_to_first_token_seconds_bucket{le="0.005",model_name="meta-llama/Llama-3.1-8B"} 5.0
vllm:time_to_first_token_seconds_count{model_name="meta-llama/Llama-3.1-8B"} 165.0
vllm:time_to_first_token_seconds_sum{model_name="meta-llama/Llama-3.1-8B"} 89.38
```
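Histogram metrics like the one above are exposed as cumulative buckets plus a running `_sum` and `_count`, so a lifetime average can be derived directly from the numbers shown:

```python
# Values from the example scrape above
ttft_sum = 89.38     # vllm:time_to_first_token_seconds_sum
ttft_count = 165.0   # vllm:time_to_first_token_seconds_count

# Average time to first token over the lifetime of the process
mean_ttft = ttft_sum / ttft_count
print(f"{mean_ttft:.3f} s")  # 0.542 s
```

In Prometheus itself you would normally compute a windowed average instead, e.g. `rate(vllm:time_to_first_token_seconds_sum[5m]) / rate(vllm:time_to_first_token_seconds_count[5m])`, since the lifetime mean hides recent changes.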

## Implementation Details

- vLLM v1 uses multiprocess metrics collection via `prometheus_client.multiprocess`
- `PROMETHEUS_MULTIPROC_DIR`: vLLM sets this environment variable to a temporary directory where multiprocess metrics are stored as memory-mapped files. Each worker process writes its metrics to separate files in this directory, which are aggregated when `/metrics` is scraped.
- Metrics are filtered by the `vllm:` prefix before being exposed
- The integration uses Dynamo's `register_engine_metrics_callback()` function
- Metrics appear after vLLM engine initialization completes
- vLLM v1 metrics are different from v0 - see the [official documentation](https://docs.vllm.ai/en/latest/design/metrics.html) for migration details
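If you want the scraped text as structured data rather than raw lines, `prometheus_client` ships a parser for the text exposition format. A small sketch, assuming `prometheus_client` is installed (the `SCRAPE` string is an illustrative sample, not real worker output):

```python
from prometheus_client.parser import text_string_to_metric_families

SCRAPE = """\
# HELP vllm:request_success_total Number of successfully finished requests.
# TYPE vllm:request_success_total counter
vllm:request_success_total{finished_reason="stop",model_name="meta-llama/Llama-3.1-8B"} 150.0
"""

for family in text_string_to_metric_families(SCRAPE):
    for sample in family.samples:
        # Each sample carries its name, label dict, and value
        print(sample.name, sample.labels["finished_reason"], sample.value)
```

The same loop works on the full output of a `/metrics` scrape, which avoids hand-rolling regexes over the exposition format.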

## See Also

### vLLM Metrics
- [Official vLLM Metrics Design Documentation](https://docs.vllm.ai/en/latest/design/metrics.html)
- [vLLM Production Metrics User Guide](https://docs.vllm.ai/en/latest/user/production_metrics.html)
- [vLLM GitHub - Metrics Implementation](https://github.com/vllm-project/vllm/tree/main/vllm/engine/metrics)

### Dynamo Metrics
- **Dynamo Metrics Guide**: See `docs/guides/metrics.md` for complete documentation on Dynamo runtime metrics
- **Dynamo Runtime Metrics**: Metrics prefixed with `dynamo_*` for runtime, components, endpoints, and namespaces
  - Implementation: `lib/runtime/src/metrics.rs` (Rust runtime metrics)
  - Metric names: `lib/runtime/src/metrics/prometheus_names.rs` (metric name constants)
  - Available at the same `/metrics` endpoint alongside vLLM metrics
- **Integration Code**: `components/src/dynamo/common/utils/prometheus.py` - Prometheus utilities and callback registration
@@ -0,0 +1,16 @@
```python
# SPDX-FileCopyrightText: Copyright (c) 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
# SPDX-License-Identifier: Apache-2.0

"""
Dynamo Common Utils Module

This module contains shared utility functions used across multiple
Dynamo backends and components.

Submodules:
- prometheus: Prometheus metrics collection and logging utilities
"""

from dynamo.common.utils import prometheus

__all__ = ["prometheus"]
```
@@ -0,0 +1,129 @@
```python
# SPDX-FileCopyrightText: Copyright (c) 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
# SPDX-License-Identifier: Apache-2.0

"""
Prometheus metrics utilities for Dynamo components.

This module provides shared functionality for collecting and exposing Prometheus metrics
from backend engines (SGLang, vLLM, etc.) via Dynamo's metrics endpoint.

Note: Engine metrics take time to appear after engine initialization,
while Dynamo runtime metrics are available immediately after component creation.
"""

import logging
import re
from typing import TYPE_CHECKING, Optional

from prometheus_client import generate_latest

from dynamo._core import Endpoint

# Import CollectorRegistry only for type hints to avoid importing prometheus_client at module load time.
# prometheus_client must be imported AFTER set_prometheus_multiproc_dir() is called.
# See main.py worker() function for a detailed explanation.
if TYPE_CHECKING:
    from prometheus_client import CollectorRegistry


def register_engine_metrics_callback(
    endpoint: Endpoint,
    registry: "CollectorRegistry",
    metric_prefix: str,
    engine_name: str,
) -> None:
    """
    Register a callback to expose engine Prometheus metrics via Dynamo's metrics endpoint.

    This registers a callback that is invoked when /metrics is scraped, passing through
    engine-specific metrics alongside Dynamo runtime metrics.

    Args:
        endpoint: Dynamo endpoint object with metrics.register_prometheus_expfmt_callback()
        registry: Prometheus registry to collect from (e.g., REGISTRY or a CollectorRegistry)
        metric_prefix: Prefix to filter metrics (e.g., "vllm:" or "sglang:")
        engine_name: Name of the engine for logging (e.g., "vLLM" or "SGLang")

    Example:
        from prometheus_client import REGISTRY
        register_engine_metrics_callback(
            generate_endpoint, REGISTRY, "vllm:", "vLLM"
        )
    """

    def get_expfmt() -> str:
        """Callback to return engine Prometheus metrics in exposition format."""
        return get_prometheus_expfmt(registry, metric_prefix_filter=metric_prefix)

    endpoint.metrics.register_prometheus_expfmt_callback(get_expfmt)


def get_prometheus_expfmt(
    registry,
    metric_prefix_filter: Optional[str] = None,
) -> str:
    """
    Get Prometheus metrics from a registry formatted as text using the standard text encoder.

    Collects all metrics from the registry and returns them in Prometheus text exposition
    format, optionally filtered by prefix.

    Prometheus exposition format consists of:
    - Comment lines starting with # (HELP and TYPE declarations)
    - Metric lines with format: metric_name{label="value"} metric_value timestamp

    Example output format:
        # HELP vllm:request_success_total Number of successful requests
        # TYPE vllm:request_success_total counter
        vllm:request_success_total{model="llama2",endpoint="generate"} 150.0
        # HELP vllm:time_to_first_token_seconds Time to first token
        # TYPE vllm:time_to_first_token_seconds histogram
        vllm:time_to_first_token_seconds_bucket{model="llama2",le="0.01"} 10.0
        vllm:time_to_first_token_seconds_bucket{model="llama2",le="0.1"} 45.0
        vllm:time_to_first_token_seconds_count{model="llama2"} 50.0
        vllm:time_to_first_token_seconds_sum{model="llama2"} 2.5

    Args:
        registry: Prometheus registry to collect from.
            Pass a CollectorRegistry with MultiProcessCollector for SGLang.
            Pass REGISTRY for vLLM single-process mode.
        metric_prefix_filter: Optional prefix to filter displayed metrics (e.g., "vllm:").
            If None, returns all metrics. (default: None)

    Returns:
        Formatted metrics text in Prometheus exposition format. Returns an empty string on error.

    Example:
        from prometheus_client import REGISTRY
        metrics_text = get_prometheus_expfmt(REGISTRY)
        print(metrics_text)

        # With filter
        vllm_metrics = get_prometheus_expfmt(REGISTRY, metric_prefix_filter="vllm:")
    """
    try:
        # Generate metrics in Prometheus text format
        metrics_text = generate_latest(registry).decode("utf-8")

        if metric_prefix_filter:
            # Filter lines: keep metric lines starting with the prefix and their HELP/TYPE comments
            escaped_prefix = re.escape(metric_prefix_filter)
            pattern = rf"^(?:{escaped_prefix}|# (?:HELP|TYPE) {escaped_prefix})"
            filtered_lines = [
                line for line in metrics_text.split("\n") if re.match(pattern, line)
            ]
            result = "\n".join(filtered_lines)
            # Ensure the result ends with a newline
            if result and not result.endswith("\n"):
                result += "\n"
            return result

        # Ensure metrics_text ends with a newline
        if metrics_text and not metrics_text.endswith("\n"):
            metrics_text += "\n"
        return metrics_text

    except Exception as e:
        logging.error(f"Error getting metrics: {e}")
        return ""
```
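The prefix filter implemented above can be exercised in isolation. This standalone sketch reproduces the same regex against a hand-written exposition snippet (the metric names in `SAMPLE` are illustrative):

```python
import re

SAMPLE = """\
# HELP vllm:request_success_total Number of successful requests
# TYPE vllm:request_success_total counter
vllm:request_success_total{model="llama"} 150.0
# HELP dynamo_component_requests_total Dynamo runtime requests
# TYPE dynamo_component_requests_total counter
dynamo_component_requests_total 42.0
"""

def filter_expfmt(text: str, prefix: str) -> str:
    """Keep metric lines starting with `prefix`, plus their HELP/TYPE comments."""
    escaped = re.escape(prefix)
    pattern = rf"^(?:{escaped}|# (?:HELP|TYPE) {escaped})"
    kept = [line for line in text.split("\n") if re.match(pattern, line)]
    return "\n".join(kept) + "\n" if kept else ""

print(filter_expfmt(SAMPLE, "vllm:"))
# Only the vllm: lines survive; the dynamo_* family is dropped.
```

Filtering on whole-line prefixes keeps HELP/TYPE comments attached to their samples, which is what makes the output a valid exposition-format fragment.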
> **Review discussion** on metric prefix conventions:
>
> **Q:** Is there a reason we went with `vllm:` and `dynamo_` rather than the same convention?
>
> **A:** Ignorance 😂 I didn't know vLLM and SGLang already emit `vllm:something` and `sglang:something`, but if I did, I would have done the same. I don't mind updating everything, but that'll have to be a whole different PR. @nnshah1 @grahamking, what do you guys think?
>
> **Reply:** Yeah, don't let it block this PR. But if they both do it that way, that's what I would prefer.
>
> **Reply:** Agree, not for this PR. I actually find `vllm:`/`sglang:` strange, because the official Prometheus naming convention is a `prefix_` (underscore, not colon): https://prometheus.io/docs/practices/naming/#metric-names. But if we think it makes more sense in this scenario to look uniform with the frameworks, I wouldn't be opposed. What does TRTLLM do?
>
> **Reply:** +1 to alec's point.
>
> **Reply:** TRTLLM is my next TODO; I will find out.