Allow the sidecar to sample from a list of prefill host ports (#404)

smarterclayton · web-flow · commit f712dfacb595 · 2025-11-20T08:25:54.000Z
In some benchmarking and test environments, dynamic prefill selection
may be difficult, and random selection among a set of hosts is
sufficient.

Add a new `--enable-prefiller-sampling` flag that instructs the
sidecar to select a random prefill host from the provided list
instead of the first one. Make the behavior opt-in to prevent
users from accidentally depending on the new behavior, and
keep the existing default behavior (first header value) consistent.

E.g.:

curl -H 'x-prefiller-host-port: server1:8000' -H 'x-prefiller-host-port: server2:8000'

With sampling enabled, the sidecar will randomly choose one of the two values.

Signed-off-by: Clayton Coleman <smarterclayton@gmail.com>
diff --git a/cmd/pd-sidecar/main.go b/cmd/pd-sidecar/main.go
@@ -20,6 +20,7 @@ import (
 	"flag"
 	"net/url"
 	"os"
+	"strconv"
 	"strings"
 
 	"k8s.io/klog/v2"
@@ -55,6 +56,7 @@ func main() {
 	enableSSRFProtection := flag.Bool("enable-ssrf-protection", false, "enable SSRF protection using InferencePool allowlisting")
 	inferencePoolNamespace := flag.String("inference-pool-namespace", os.Getenv("INFERENCE_POOL_NAMESPACE"), "the Kubernetes namespace to watch for InferencePool resources (defaults to INFERENCE_POOL_NAMESPACE env var)")
 	inferencePoolName := flag.String("inference-pool-name", os.Getenv("INFERENCE_POOL_NAME"), "the specific InferencePool name to watch (defaults to INFERENCE_POOL_NAME env var)")
+	enablePrefillerSampling := flag.Bool("enable-prefiller-sampling", func() bool { b, _ := strconv.ParseBool(os.Getenv("ENABLE_PREFILLER_SAMPLING")); return b }(), "if true, the target prefill instance will be selected randomly from among the provided prefill host values")
 
 	klog.InitFlags(nil)
 	flag.Parse()
@@ -127,6 +129,7 @@ func main() {
 		PrefillerInsecureSkipVerify: *prefillerInsecureSkipVerify,
 		DecoderInsecureSkipVerify:   *decoderInsecureSkipVerify,
 		DataParallelSize:            *vLLMDataParallelSize,
+		EnablePrefillerSampling:     *enablePrefillerSampling,
 	}
 
 	// Create SSRF protection validator
diff --git a/pkg/sidecar/proxy/chat_completions.go b/pkg/sidecar/proxy/chat_completions.go
@@ -18,6 +18,7 @@ package proxy
 
 import (
 	"net/http"
+	"strings"
 
 	"github.com/llm-d/llm-d-inference-scheduler/pkg/common"
 )
@@ -31,9 +32,29 @@ var (
 )
 
 func (s *Server) chatCompletionsHandler(w http.ResponseWriter, r *http.Request) {
-	prefillPodHostPort := r.Header.Get(common.PrefillPodHeader)
+	var prefillHostPorts []string
+	prefillHostPorts = r.Header.Values(common.PrefillPodHeader)
 
-	if prefillPodHostPort == "" {
+	// https://datatracker.ietf.org/doc/html/rfc7230#section-3.2.2 specifies proxies
+	// may combine multiple header values with a comma. Accept either one host per
+	// header line OR one line with multiple header values.
+	if len(prefillHostPorts) == 1 {
+		prefillHostPorts = strings.Split(prefillHostPorts[0], ",")
+	}
+
+	numHosts := len(prefillHostPorts)
+	var prefillHostPort string
+	if numHosts > 0 {
+		if s.config.EnablePrefillerSampling {
+			// Sample a host value from the list
+			prefillHostPort = strings.TrimSpace(prefillHostPorts[s.prefillSamplerFn(numHosts)])
+		} else if numHosts > 0 {
+			// Select only the first header value, consistent with previous behavior
+			prefillHostPort = strings.TrimSpace(prefillHostPorts[0])
+		}
+	}
+
+	if len(prefillHostPort) == 0 {
 		s.logger.V(4).Info("skip disaggregated prefill")
 
 		if s.forwardDataParallel && !s.dataParallelHandler(w, r) {
@@ -43,16 +64,16 @@ func (s *Server) chatCompletionsHandler(w http.ResponseWriter, r *http.Request)
 	}
 
 	// SSRF Protection: Check if the prefill target is allowed
-	if !s.allowlistValidator.IsAllowed(prefillPodHostPort) {
+	if !s.allowlistValidator.IsAllowed(prefillHostPort) {
 		s.logger.Error(nil, "SSRF protection: prefill target not in allowlist",
-			"target", prefillPodHostPort,
+			"target", prefillHostPort,
 			"clientIP", r.RemoteAddr,
 			"userAgent", r.Header.Get("User-Agent"),
 			"requestPath", r.URL.Path)
 		http.Error(w, "Forbidden: prefill target not allowed by SSRF protection", http.StatusForbidden)
 		return
 	}
 
-	s.logger.V(4).Info("SSRF protection: prefill target allowed", "target", prefillPodHostPort)
-	s.runConnectorProtocol(w, r, prefillPodHostPort)
+	s.logger.V(4).Info("SSRF protection: prefill target allowed", "target", prefillHostPort)
+	s.runConnectorProtocol(w, r, prefillHostPort)
 }
diff --git a/pkg/sidecar/proxy/chat_completions_test.go b/pkg/sidecar/proxy/chat_completions_test.go
@@ -0,0 +1,158 @@
+/*
+Copyright 2025 The llm-d Authors.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+*/
+
+package proxy
+
+import (
+	"fmt"
+	"net/http"
+	"net/http/httptest"
+	"testing"
+
+	"github.com/llm-d/llm-d-inference-scheduler/pkg/common"
+)
+
+func TestServer_chatCompletionsHandler(t *testing.T) {
+	tests := []struct {
+		name     string
+		sampling bool
+		r        *http.Request
+
+		expectedCode             int
+		expectedPrefillHostPorts []string
+		expectedPassthrough      bool
+	}{
+		{
+			name: "passthrough by default",
+			r:    &http.Request{},
+
+			expectedPassthrough: true,
+		},
+		{
+			name: "passthrough with no header value",
+			r:    &http.Request{Header: http.Header{http.CanonicalHeaderKey(common.PrefillPodHeader): []string{}}},
+
+			expectedPassthrough: true,
+		},
+		{
+			name: "default prefill to one header value",
+			r:    &http.Request{Header: http.Header{http.CanonicalHeaderKey(common.PrefillPodHeader): []string{"a"}}},
+
+			expectedCode:             200,
+			expectedPrefillHostPorts: []string{"a"},
+		},
+		{
+			name: "default prefill to first header value",
+			r:    &http.Request{Header: http.Header{http.CanonicalHeaderKey(common.PrefillPodHeader): []string{"a,b"}}},
+
+			expectedCode:             200,
+			expectedPrefillHostPorts: []string{"a"},
+		},
+		{
+			name:     "sample from comma delimited header",
+			r:        &http.Request{Header: http.Header{http.CanonicalHeaderKey(common.PrefillPodHeader): []string{"a,b"}}},
+			sampling: true,
+
+			expectedCode:             200,
+			expectedPrefillHostPorts: []string{"a", "b"},
+		},
+		{
+			name:     "sample from comma delimited header with whitespace",
+			r:        &http.Request{Header: http.Header{http.CanonicalHeaderKey(common.PrefillPodHeader): []string{" a, b"}}},
+			sampling: true,
+
+			expectedCode:             200,
+			expectedPrefillHostPorts: []string{"a", "b"},
+		},
+		{
+			name:     "sample from duplicate values",
+			r:        &http.Request{Header: http.Header{http.CanonicalHeaderKey(common.PrefillPodHeader): []string{"a,a"}}},
+			sampling: true,
+
+			expectedCode:             200,
+			expectedPrefillHostPorts: []string{"a"},
+		},
+		{
+			name:     "sample from multiple header values",
+			r:        &http.Request{Header: http.Header{http.CanonicalHeaderKey(common.PrefillPodHeader): []string{"a", "b"}}},
+			sampling: true,
+
+			expectedCode:             200,
+			expectedPrefillHostPorts: []string{"a", "b"},
+		},
+		{
+			name:     "sample from empty header value",
+			r:        &http.Request{Header: http.Header{http.CanonicalHeaderKey(common.PrefillPodHeader): []string{""}}},
+			sampling: true,
+
+			expectedPassthrough: true,
+		},
+		{
+			name:     "sample from multiple empty header values",
+			r:        &http.Request{Header: http.Header{http.CanonicalHeaderKey(common.PrefillPodHeader): []string{"", ""}}},
+			sampling: true,
+
+			expectedPassthrough: true,
+		},
+	}
+	for _, tt := range tests {
+		maxAttempts := len(tt.expectedPrefillHostPorts) + 1
+
+		for i := 0; i < maxAttempts; i++ {
+			t.Run(fmt.Sprintf("%s_%d", tt.name, i), func(t *testing.T) {
+				s := NewProxy("8000", nil, Config{EnablePrefillerSampling: tt.sampling})
+				s.allowlistValidator = &AllowlistValidator{}
+				// return a predictable sequence of values
+				s.prefillSamplerFn = func(n int) int { return i % n }
+				// verify the hostPort value
+				var hostPort string
+				s.runConnectorProtocol = func(_ http.ResponseWriter, _ *http.Request, selectedHostPort string) { hostPort = selectedHostPort }
+				var passthrough bool
+				s.decoderProxy = http.HandlerFunc(func(_ http.ResponseWriter, _ *http.Request) {
+					passthrough = true
+				})
+				s.dataParallelProxies = make(map[string]http.Handler)
+				recorder := httptest.NewRecorder()
+				recorder.Code = 0
+				s.chatCompletionsHandler(recorder, tt.r)
+
+				resp := recorder.Result()
+				if passthrough {
+					if !tt.expectedPassthrough {
+						t.Errorf("unexpected passthrough to decode")
+					}
+					if recorder.Code != 0 || recorder.Body.Len() > 0 || len(resp.Header) > 0 {
+						t.Errorf("unexpected write to recorder during passthrough: %#v %#v", recorder, resp)
+					}
+					if len(hostPort) > 0 {
+						t.Errorf("unexpected hostPort set")
+					}
+				} else {
+					if tt.expectedPassthrough {
+						t.Fatal("unexpected handled request")
+					}
+					if resp.StatusCode != tt.expectedCode {
+						t.Errorf("unexpected code: %d", resp.StatusCode)
+					}
+					expected, actual := tt.expectedPrefillHostPorts[i%len(tt.expectedPrefillHostPorts)], hostPort
+					if expected != actual {
+						t.Errorf("expected=%s actual=%s", expected, actual)
+					}
+				}
+			})
+		}
+	}
+}
diff --git a/pkg/sidecar/proxy/proxy.go b/pkg/sidecar/proxy/proxy.go
@@ -19,6 +19,7 @@ package proxy
 import (
 	"context"
 	"crypto/tls"
+	"math/rand"
 	"net"
 	"net/http"
 	"net/http/httputil"
@@ -77,6 +78,10 @@ type Config struct {
 
 	// DataParallelSize is the value passed to the vLLM server's --DATA_PARALLEL-SIZE command line argument
 	DataParallelSize int
+
+	// EnablePrefillerSampling configures the proxy to randomly choose from the set
+	// of provided prefill hosts instead of always using the first one.
+	EnablePrefillerSampling bool
 }
 
 type protocolRunner func(http.ResponseWriter, *http.Request, string)
@@ -92,10 +97,12 @@ type Server struct {
 	runConnectorProtocol protocolRunner // the handler for running the protocol
 	prefillerURLPrefix   string
 
-	decoderProxy        *httputil.ReverseProxy            // decoder proxy handler
-	prefillerProxies    *lru.Cache[string, http.Handler]  // cached prefiller proxy handlers
-	dataParallelProxies map[string]*httputil.ReverseProxy // Proxies to other vLLM servers
-	forwardDataParallel bool                              // Use special Data Parallel work around
+	decoderProxy        http.Handler                     // decoder proxy handler
+	prefillerProxies    *lru.Cache[string, http.Handler] // cached prefiller proxy handlers
+	dataParallelProxies map[string]http.Handler          // Proxies to other vLLM servers
+	forwardDataParallel bool                             // Use special Data Parallel work around
+
+	prefillSamplerFn func(n int) int // allow test override
 
 	config Config
 }
@@ -110,8 +117,9 @@ func NewProxy(port string, decodeURL *url.URL, config Config) *Server {
 		prefillerProxies:    cache,
 		prefillerURLPrefix:  "http://",
 		config:              config,
-		dataParallelProxies: map[string]*httputil.ReverseProxy{},
+		dataParallelProxies: map[string]http.Handler{},
 		forwardDataParallel: true,
+		prefillSamplerFn:    rand.Intn,
 	}
 	switch config.Connector {
 	case ConnectorLMCache: