Skip to content

Commit 2125f03

Browse files
committed: rc2
Signed-off-by: Pierangelo Di Pilato <[email protected]>
1 parent 0e55b1e commit 2125f03

File tree

3 files changed

+37
-63
lines changed

3 files changed

+37
-63
lines changed

go.mod

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -10,7 +10,7 @@ require (
1010
github.com/google/uuid v1.6.0
1111
github.com/hashicorp/golang-lru/v2 v2.0.7
1212
github.com/jellydator/ttlcache/v3 v3.4.0
13-
github.com/llm-d/llm-d-kv-cache-manager v0.4.0-rc1
13+
github.com/llm-d/llm-d-kv-cache-manager v0.4.0-rc2
1414
github.com/onsi/ginkgo/v2 v2.27.2
1515
github.com/onsi/gomega v1.38.2
1616
github.com/openai/openai-go v1.12.0

go.sum

Lines changed: 2 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -181,12 +181,8 @@ github.com/kr/text v0.2.0 h1:5Nx0Ya0ZqY2ygV366QzturHI13Jq95ApcVaJBhpS+AY=
181181
github.com/kr/text v0.2.0/go.mod h1:eLer722TekiGuMkidMxC/pM04lWEeraHUUmBw8l2grE=
182182
github.com/kylelemons/godebug v1.1.0 h1:RPNrshWIDI6G2gRW9EHilWtl7Z6Sb1BR0xunSBf0SNc=
183183
github.com/kylelemons/godebug v1.1.0/go.mod h1:9/0rRGxNHcop5bhtWyNeEfOS8JIWk580+fNqagV/RAw=
184-
github.com/llm-d/llm-d-kv-cache-manager v0.3.2 h1:omSTXtuII3ol37CaoI9h+2VxE0m8EoeVOor+CkQh99I=
185-
github.com/llm-d/llm-d-kv-cache-manager v0.3.2/go.mod h1:q6u7LnzMxNcHHb5/LRdHNNeZzzGMSENFSP1NGfsJEmA=
186-
github.com/llm-d/llm-d-kv-cache-manager v0.3.3-0.20251119172839-f8bb3049d991 h1:zGC/uDL4ytR4idUKd4iP/Doto0HNdxuJtgR6mn9w2Ro=
187-
github.com/llm-d/llm-d-kv-cache-manager v0.3.3-0.20251119172839-f8bb3049d991/go.mod h1:oEmDhEjW1pEoOSlEFy8CKoMc7ixQmSKEbhLt9CoH/a0=
188-
github.com/llm-d/llm-d-kv-cache-manager v0.4.0-rc1 h1:gWkZ9yp7sU5j1vbNB7eO95lxbvgJV+qd/60LnPfNk9w=
189-
github.com/llm-d/llm-d-kv-cache-manager v0.4.0-rc1/go.mod h1:oEmDhEjW1pEoOSlEFy8CKoMc7ixQmSKEbhLt9CoH/a0=
184+
github.com/llm-d/llm-d-kv-cache-manager v0.4.0-rc2 h1:l2Sm8W6SRg4TAme4RsndwZ++5+4aQvDI4vnf8TKrhww=
185+
github.com/llm-d/llm-d-kv-cache-manager v0.4.0-rc2/go.mod h1:ZlK7MCuz5D/weLeHyNKEmVF/eJZDyYn3XyRowTihq9o=
190186
github.com/mailru/easyjson v0.9.0 h1:PrnmzHw7262yW8sTBwxi1PdJA3Iw/EKBa8psRf7d9a4=
191187
github.com/mailru/easyjson v0.9.0/go.mod h1:1+xMtQp2MRNVL/V1bOzuP3aP8VNwRW55fQUto+XFtTU=
192188
github.com/maruel/natural v1.1.1 h1:Hja7XhhmvEFhcByqDoHz9QZbkWey+COd9xWfCfn1ioo=

pkg/plugins/scorer/precise_prefix_cache.go

Lines changed: 34 additions & 56 deletions
Original file line numberDiff line numberDiff line change
@@ -102,9 +102,8 @@ func New(ctx context.Context, config PrecisePrefixCachePluginConfig) (*PrecisePr
102102
}
103103

104104
return &PrecisePrefixCacheScorer{
105-
typedName: plugins.TypedName{Type: PrecisePrefixCachePluginType},
106-
kvCacheIndexer: kvCacheIndexer,
107-
chatTemplateRenderer: chatTemplateRenderer,
105+
typedName: plugins.TypedName{Type: PrecisePrefixCachePluginType},
106+
kvCacheIndexer: kvCacheIndexer,
108107
}, nil
109108
}
110109

@@ -114,9 +113,8 @@ func New(ctx context.Context, config PrecisePrefixCachePluginConfig) (*PrecisePr
114113
// state, and the `kvevents.Pool` to subscribe to KV-cache events
115114
// to keep the internal KV-cache index state up-to-date.
116115
type PrecisePrefixCacheScorer struct {
117-
typedName plugins.TypedName
118-
kvCacheIndexer *kvcache.Indexer
119-
chatTemplateRenderer *preprocessing.ChatTemplatingProcessor
116+
typedName plugins.TypedName
117+
kvCacheIndexer *kvcache.Indexer
120118
}
121119

122120
// TypedName returns the typed name of the plugin.
@@ -134,26 +132,20 @@ func (s *PrecisePrefixCacheScorer) WithName(name string) *PrecisePrefixCacheScor
134132
// The returned scores are normalized to a range of 0-1.
135133
func (s *PrecisePrefixCacheScorer) Score(ctx context.Context, _ *types.CycleState, request *types.LLMRequest, pods []types.Pod) map[types.Pod]float64 {
136134
logger := log.FromContext(ctx).WithName(s.typedName.String())
135+
debugLogger := logger.V(logutil.DEBUG)
137136

138137
if request == nil {
139-
logger.V(logutil.DEBUG).Info("Request is nil, skipping scoring")
138+
debugLogger.Info("Request is nil, skipping scoring")
140139
return nil
141140
}
142141

143-
// Extract the flattened prompt from the request
144-
prompt, err := s.extractPrompt(ctx, request)
142+
// Extract the flattened scores from the request
143+
scores, err := s.getScores(ctx, request)
145144
if err != nil {
146-
logger.Error(err, "Failed to extract prompt from request")
145+
logger.Error(err, "Failed to extract scores from request")
147146
return nil
148147
}
149-
150-
scores, err := s.kvCacheIndexer.GetPodScores(ctx, prompt, request.TargetModel, nil)
151-
if err != nil {
152-
logger.Error(err, "Failed to get pod scores")
153-
return nil
154-
}
155-
156-
logger.V(logutil.DEBUG).Info("Got pod scores", "scores", scores)
148+
debugLogger.Info("Got pod scores", "scores", scores)
157149

158150
podToKey := func(pod types.Pod) (string, bool) {
159151
metricsPod := pod.GetPod()
@@ -170,20 +162,23 @@ func (s *PrecisePrefixCacheScorer) Score(ctx context.Context, _ *types.CycleStat
170162
// extractPrompt extracts the flattened prompt from the request.
171163
// For chat completions, it renders the messages using the model's chat template.
172164
// For regular completions, it uses the prompt directly.
173-
func (s *PrecisePrefixCacheScorer) extractPrompt(ctx context.Context, request *types.LLMRequest) (string, error) {
174-
traceLogger := log.FromContext(ctx).V(logutil.TRACE).WithName(s.typedName.String())
165+
func (s *PrecisePrefixCacheScorer) getScores(ctx context.Context, request *types.LLMRequest) (map[string]float64, error) {
166+
logger := log.FromContext(ctx).WithName(s.typedName.String())
167+
debugLogger := logger.V(logutil.DEBUG)
168+
traceLogger := logger.V(logutil.TRACE)
169+
170+
debugLogger.Info("Getting scores",
171+
"target_model", request.TargetModel,
172+
"has_chat_completions", request.Body != nil && request.Body.ChatCompletions != nil,
173+
"has_completions", request.Body != nil && request.Body.Completions != nil)
175174

176175
// The upstream parser guarantees exactly one body is populated, but we defensively prioritize chat completions.
177176
// If an unexpected dual payload slips through (parser regression/new client), log it and use chat semantics.
178177
if request.Body != nil && request.Body.ChatCompletions != nil {
179178
if request.Body.Completions != nil {
180179
traceLogger.Info("Both chat/completions and completions present; defaulting to chat/completions")
181180
}
182-
traceLogger.Info("Processing chat completion request",
183-
"messages_count", len(request.Body.ChatCompletions.Messages),
184-
"target_model", request.TargetModel)
185181

186-
// Create render request
187182
renderReq := &preprocessing.RenderJinjaTemplateRequest{
188183
Conversations: make([]preprocessing.ChatMessage, 0),
189184
Tools: request.Body.ChatCompletions.Tools,
@@ -203,47 +198,30 @@ func (s *PrecisePrefixCacheScorer) extractPrompt(ctx context.Context, request *t
203198
})
204199
}
205200

206-
// Fetch the chat template from the model
207-
fetchReq := preprocessing.FetchChatTemplateRequest{
208-
Model: request.TargetModel,
209-
}
210-
211-
chatTemplate, chatTemplateKWArgs, err := s.chatTemplateRenderer.FetchChatTemplate(ctx, fetchReq)
212-
if err != nil {
213-
return "", fmt.Errorf("failed to fetch chat template: %w", err)
214-
}
215-
216-
traceLogger.Info("Chat template fetched",
217-
"model", request.TargetModel,
218-
"templateLength", len(chatTemplate),
219-
"hasKwargs", len(chatTemplateKWArgs) > 0)
220-
221-
// Set the fetched template in the render request
222-
renderReq.ChatTemplate = chatTemplate
223-
renderReq.ChatTemplateKWArgs = chatTemplateKWArgs
201+
traceLogger.Info("Processing chat completion request",
202+
"messages_count", len(renderReq.Conversations),
203+
"tools_count", len(renderReq.Tools),
204+
"documents_count", len(renderReq.Documents),
205+
"target_model", request.TargetModel)
224206

225-
// Render the template to get flattened prompt
226-
resp, err := s.chatTemplateRenderer.RenderChatTemplate(ctx, renderReq)
207+
scores, err := s.kvCacheIndexer.GetPodScores(ctx, renderReq, "", request.TargetModel, nil)
227208
if err != nil {
228-
return "", fmt.Errorf("failed to render chat template: %w", err)
209+
return nil, fmt.Errorf("failed to get pod scores for chat/completions: %w", err)
229210
}
230-
231-
if len(resp.RenderedChats) == 0 {
232-
return "", errors.New("no rendered chat returned from template rendering")
233-
}
234-
235-
prompt := resp.RenderedChats[0]
236-
traceLogger.Info("Chat template rendered successfully",
237-
"promptLength", len(prompt))
238-
return prompt, nil
211+
return scores, nil
239212
}
240213

241214
// For regular completions, use the prompt directly
242215
if request.Body != nil && request.Body.Completions != nil {
243216
prompt := request.Body.Completions.Prompt
244217
traceLogger.Info("Using completion prompt directly", "promptLength", len(prompt))
245-
return prompt, nil
218+
219+
scores, err := s.kvCacheIndexer.GetPodScores(ctx, nil, prompt, request.TargetModel, nil)
220+
if err != nil {
221+
return nil, fmt.Errorf("failed to get pod scores for completions: %w", err)
222+
}
223+
return scores, nil
246224
}
247225

248-
return "", errors.New("no valid prompt found in request")
226+
return nil, errors.New("no valid input found in request")
249227
}

0 commit comments

Comments (0)