17 changes: 16 additions & 1 deletion Cargo.lock

165 changes: 165 additions & 0 deletions docs/guides/rate_limiting.md
@@ -0,0 +1,165 @@
# Rate Limiting Guide

## Overview

The Dynamo LLM service includes an intelligent rate limiter that monitors service performance metrics and automatically throttles requests when quality degrades. Unlike traditional rate limiters that count requests, this system focuses on preserving user experience by monitoring two latency signals:

- **Time to First Token (TTFT)** - How long users wait for the first output token
- **Inter-Token Latency (ITL)** - The delay between subsequent output tokens

## How It Works

### Time-Weighted Exponential Moving Average

The rate limiter uses a sophisticated time-weighted exponential moving average (EMA) algorithm:

```text
average = sum(value * weight) / sum(weight)
weight = exp(-age / time_constant_secs)
```


This means:
- Recent samples have higher influence on the average
- Old samples decay exponentially over time
- The system "recovers" during idle periods as old samples fade
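
As an illustration of the formula above, here is a minimal Rust sketch of a time-weighted EMA. It is one possible reading of the formula, not the crate's actual implementation, and the type and method names are hypothetical:

```rust
use std::time::Instant;

/// Time-weighted EMA sketch: each sample is weighted by
/// exp(-age / time_constant_secs), so recent samples dominate and
/// the weight of old samples decays smoothly over time.
struct TimeWeightedEma {
    time_constant_secs: f64,
    weighted_sum: f64, // running sum(value * weight)
    weight_sum: f64,   // running sum(weight)
    last_update: Instant,
}

impl TimeWeightedEma {
    fn new(time_constant_secs: f64) -> Self {
        Self {
            time_constant_secs,
            weighted_sum: 0.0,
            weight_sum: 0.0,
            last_update: Instant::now(),
        }
    }

    /// Decay both running sums by the time elapsed since the last update;
    /// this is equivalent to re-weighting every past sample by exp(-age / tau).
    fn decay(&mut self, now: Instant) {
        let age = now.duration_since(self.last_update).as_secs_f64();
        let factor = (-age / self.time_constant_secs).exp();
        self.weighted_sum *= factor;
        self.weight_sum *= factor;
        self.last_update = now;
    }

    /// Record a new sample (e.g. an observed TTFT or ITL, in seconds).
    fn record(&mut self, value: f64) {
        self.decay(Instant::now());
        self.weighted_sum += value; // a fresh sample has age 0, so weight 1.0
        self.weight_sum += 1.0;
    }

    /// Current decayed average, or None if nothing has been recorded yet.
    fn average(&mut self) -> Option<f64> {
        self.decay(Instant::now());
        (self.weight_sum > 0.0).then(|| self.weighted_sum / self.weight_sum)
    }
}
```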

### Decision Logic

For each incoming request, the system:
1. Computes current decayed EMA for TTFT and ITL
2. Compares against configured thresholds
3. Rejects request if either threshold is exceeded
4. Logs detailed metrics for observability
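
A minimal sketch of that per-request check, reusing the `TimeWeightedEma` type sketched above (hypothetical names; the service's real types and logging differ):

```rust
/// Decide whether to reject an incoming request based on the current
/// decayed TTFT and ITL averages. Thresholds are in seconds.
fn should_reject(
    ttft_ema: &mut TimeWeightedEma,
    itl_ema: &mut TimeWeightedEma,
    ttft_threshold_secs: f64,
    itl_threshold_secs: f64,
) -> bool {
    // 1. Compute the current decayed averages (treat "no samples yet" as healthy).
    let ttft = ttft_ema.average().unwrap_or(0.0);
    let itl = itl_ema.average().unwrap_or(0.0);

    // 2-3. Reject if either metric exceeds its threshold.
    let reject = ttft > ttft_threshold_secs || itl > itl_threshold_secs;

    // 4. Log for observability (assumes the `tracing` crate).
    if reject {
        tracing::warn!(ttft, itl, "rate limit exceeded");
    }
    reject
}
```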

## Configuration

### Environment Variables

```bash
# Enable rate limiting
export DYN_RATE_LIMITER_ENABLED=true

# TTFT threshold in milliseconds (default: 1000ms = 1s)
export DYN_RATE_LIMITER_TTFT_THRESHOLD_MS=1500

# ITL threshold in milliseconds (default: 10ms)
export DYN_RATE_LIMITER_ITL_THRESHOLD_MS=15

# Time constant for EMA decay (default: 30s)
export DYN_RATE_LIMITER_TIME_CONSTANT_SECS=60

# Enable per-model vs global limits (default: false)
export DYN_RATE_LIMITER_PER_MODEL_LIMITS=true
```

### Command Line Arguments

```bash
dynamo-http \
--enable-rate-limiting \
--ttft-threshold-ms 1500 \
--itl-threshold-ms 15 \
--time-constant-secs 60 \
--per-model-limits
```

### Programmatic Configuration

```rust
use dynamo_llm::http::service::rate_limiter::RateLimiterConfig;

let config = RateLimiterConfig::new(
    1.5,   // TTFT threshold (seconds)
    0.015, // ITL threshold (seconds)
    60.0,  // Time constant (seconds)
    true,  // Per-model limits
)?;

let http_service = HttpService::builder()
    .rate_limiter_config(config)
    .build()?;
```
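
Alternatively, the config can be assembled through its builder, which is how the `dynamo-run` flags in this change construct it; unset fields keep the builder's defaults, and a failed build falls back to the default configuration. A sketch using only the builder methods visible in `flags.rs`:

```rust
use dynamo_llm::http::service::rate_limiter::RateLimiterConfig;

// Builder-style construction; any field left unset keeps the builder's default.
let config = RateLimiterConfig::builder()
    .ttft_threshold_secs(1.5)
    .itl_threshold_secs(0.015)
    .time_constant_secs(60.0)
    .per_model_limits(true)
    .build()
    .unwrap_or_default();
```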

## Monitoring

### Prometheus Metrics

The rate limiter exposes several Prometheus metrics:

**Requests rejected by rate limiter:**

```text
nv_llm_http_service_rate_limit_requests_total{model, endpoint, request_type, status}
```

**Current TTFT metrics:**

```text
nv_llm_http_service_time_to_first_token_seconds{model}
```

**Current ITL metrics:**

```text
nv_llm_http_service_inter_token_latency_seconds{model}
```

### Log Messages

When requests are rejected, detailed log messages are emitted:

```text
WARN Rate limit exceeded for model deepseek-ai/DeepSeek-R1: RateLimiterMetrics {
TTFT: TimeWeightedDiagnostics { decayed_time_weighted_average: 2.450, time_constant_secs: 30.0, last_weighted_sum: 1.245, duration_since_last_update: 0.125 },
ITL: TimeWeightedDiagnostics { decayed_time_weighted_average: 0.025, time_constant_secs: 30.0, last_weighted_sum: 1.245, duration_since_last_update: 0.125 }
}
```
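
In this example the decayed TTFT average (2.450 s) and the decayed ITL average (0.025 s, i.e. 25 ms) both exceed the default thresholds of 1 s and 10 ms, so the request is rejected.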


## Tuning Guidelines

### Time Constant
- **Shorter (10-30s)**: Faster reaction to load changes, more sensitive
- **Longer (60-120s)**: Smoother operation, less reactive to spikes

### TTFT Threshold
- **Conservative (500-1000ms)**: Maintains very responsive feel
- **Moderate (1000-2000ms)**: Balances throughput with responsiveness
- **Aggressive (2000ms+)**: Prioritizes throughput over latency

### ITL Threshold
- **Conservative (5-10ms)**: Ensures smooth streaming experience
- **Moderate (10-20ms)**: Allows some latency for higher throughput
- **Aggressive (20ms+)**: Accepts choppier streaming for max throughput

### Per-Model vs Global
- **Per-Model**: Better for multi-tenant scenarios with different SLAs
- **Global**: Simpler for single-tenant or uniform SLA scenarios
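
Conceptually, per-model limits track one EMA pair per model, while global limits share a single pair across all models. A hypothetical illustration (not the service's real types), reusing the `TimeWeightedEma` sketch from above:

```rust
use std::collections::HashMap;

// Global mode: one shared TTFT/ITL pair, so a single slow model throttles all traffic.
// Per-model mode: pairs keyed by model name, so a slow model only throttles itself.
enum LatencyTracking {
    Global {
        ttft: TimeWeightedEma,
        itl: TimeWeightedEma,
    },
    PerModel(HashMap<String, (TimeWeightedEma, TimeWeightedEma)>),
}
```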

## Best Practices

1. **Start Conservative**: Begin with lower thresholds and increase based on user feedback
2. **Monitor Closely**: Watch both rate limit counters and user-facing metrics
3. **Load Test**: Validate behavior under realistic load patterns
4. **Document SLAs**: Clearly communicate expected performance to users
5. **Alert on Rejections**: Set up alerts when rejection rates exceed acceptable levels

## Troubleshooting

### High Rejection Rates
- Check if system is genuinely overloaded
- Consider increasing thresholds temporarily
- Scale backend resources
- Investigate specific models causing issues

### No Rejections During Overload
- Verify rate limiter is enabled
- Check threshold configuration
- Ensure metrics are being recorded properly
- Review time constant settings

### Inconsistent Behavior
- Check if per-model limits are configured correctly
- Review metric collection for gaps
- Validate system clock stability
43 changes: 43 additions & 0 deletions launch/dynamo-run/src/flags.rs
@@ -18,6 +18,7 @@ use std::path::PathBuf;

use clap::ValueEnum;
use dynamo_llm::entrypoint::RouterConfig;
use dynamo_llm::http::service::rate_limiter::RateLimiterConfig;
use dynamo_llm::kv_router::KvRouterConfig;
use dynamo_llm::local_model::LocalModel;
use dynamo_llm::mocker::protocols::MockEngineArgs;
@@ -171,6 +172,26 @@ pub struct Flags {
/// These are the command line arguments to the python engine when using `pystr` or `pytok`.
#[arg(index = 2, last = true, hide = true, allow_hyphen_values = true)]
pub last: Vec<String>,

/// Enable the rate limiter for the OpenAI HTTP service.
#[arg(long)]
pub enable_rate_limiter: Option<bool>,

/// Time to first token threshold in seconds, for the OpenAI HTTP service rate limiter.
#[arg(long)]
pub rate_limiter_ttft_threshold_secs: Option<f64>,

/// Inter-token latency threshold in seconds, for the OpenAI HTTP service rate limiter.
#[arg(long)]
pub rate_limiter_itl_threshold_secs: Option<f64>,

/// Time constant for the time-weighted EMA, for the OpenAI HTTP service rate limiter.
#[arg(long)]
pub rate_limiter_time_constant_secs: Option<f64>,

/// Whether to use per-model limits, for the OpenAI HTTP service rate limiter.
#[arg(long)]
pub rate_limiter_per_model_limits: Option<bool>,
}

impl Flags {
@@ -240,6 +261,28 @@ impl Flags {
)
}

pub fn rate_limiter_config(&self) -> RateLimiterConfig {
if self.enable_rate_limiter.is_none() {
return RateLimiterConfig::empty();
}

let mut builder = RateLimiterConfig::builder();
if let Some(ttft_threshold_secs) = self.rate_limiter_ttft_threshold_secs {
builder = builder.ttft_threshold_secs(ttft_threshold_secs);
}
if let Some(itl_threshold_secs) = self.rate_limiter_itl_threshold_secs {
builder = builder.itl_threshold_secs(itl_threshold_secs);
}
if let Some(time_constant_secs) = self.rate_limiter_time_constant_secs {
builder = builder.time_constant_secs(time_constant_secs);
}
if let Some(per_model_limits) = self.rate_limiter_per_model_limits {
builder = builder.per_model_limits(per_model_limits);
}

builder.build().unwrap_or_default()
}

/// Load extra engine arguments from a JSON file
/// Returns a HashMap of parameter names to values
pub fn load_extra_engine_args(
3 changes: 2 additions & 1 deletion launch/dynamo-run/src/lib.rs
@@ -46,7 +46,8 @@ pub async fn run(
.http_port(Some(flags.http_port))
.router_config(Some(flags.router_config()))
.request_template(flags.request_template.clone())
.migration_limit(flags.migration_limit);
.migration_limit(flags.migration_limit)
.rate_limiter_config(flags.rate_limiter_config());

// If `in=dyn` we want the trtllm/sglang/vllm subprocess to listen on that endpoint.
// If not, then the endpoint isn't exposed so we let LocalModel invent one.
17 changes: 16 additions & 1 deletion lib/bindings/python/Cargo.lock

41 changes: 38 additions & 3 deletions lib/bindings/python/rust/http.rs
@@ -36,9 +36,17 @@ pub struct HttpService {
#[pymethods]
impl HttpService {
#[new]
#[pyo3(signature = (port=None))]
pub fn new(port: Option<u16>) -> PyResult<Self> {
let builder = service_v2::HttpService::builder().port(port.unwrap_or(8080));
#[pyo3(signature = (port=None, rate_limiter_config=None))]
pub fn new(
port: Option<u16>,
rate_limiter_config: Option<RateLimiterConfig>,
) -> PyResult<Self> {
let mut builder = service_v2::HttpService::builder().port(port.unwrap_or(8080));

if let Some(rate_limiter_config) = rate_limiter_config {
builder = builder.rate_limiter_config(rate_limiter_config.inner);
}

let inner = builder.build().map_err(to_pyerr)?;
Ok(Self { inner })
}
@@ -184,3 +192,30 @@
}
}
}

#[pyclass]
#[derive(Clone)]
pub struct RateLimiterConfig {
inner: dynamo_llm::http::service::rate_limiter::RateLimiterConfig,
}

#[pymethods]
impl RateLimiterConfig {
#[new]
pub fn new(
ttft_threshold_secs: f64,
itl_threshold_secs: f64,
time_constant_secs: f64,
per_model_limits: bool,
) -> PyResult<Self> {
let inner = dynamo_llm::http::service::rate_limiter::RateLimiterConfig::new(
ttft_threshold_secs,
itl_threshold_secs,
time_constant_secs,
per_model_limits,
)
.map_err(to_pyerr)?;

Ok(Self { inner })
}
}
1 change: 1 addition & 0 deletions lib/bindings/python/rust/lib.rs
@@ -100,6 +100,7 @@ fn _core(m: &Bound<'_, PyModule>) -> PyResult<()> {
m.add_class::<http::HttpService>()?;
m.add_class::<http::HttpError>()?;
m.add_class::<http::HttpAsyncEngine>()?;
m.add_class::<http::RateLimiterConfig>()?;
m.add_class::<EtcdKvCache>()?;
m.add_class::<ModelType>()?;
m.add_class::<llm::kv::ForwardPassMetrics>()?;