Skip to content

Commit 2125f03

Browse files
committed: rc2
Signed-off-by: Pierangelo Di Pilato <[email protected]>
1 parent 0e55b1e commit 2125f03

File tree

3 files changed

+37
-63
lines changed

3 files changed

+37
-63
lines changed

go.mod

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -10,7 +10,7 @@ require (
1010
github.com/google/uuid v1.6.0
1111
github.com/hashicorp/golang-lru/v2 v2.0.7
1212
github.com/jellydator/ttlcache/v3 v3.4.0
13-
github.com/llm-d/llm-d-kv-cache-manager v0.4.0-rc1
13+
github.com/llm-d/llm-d-kv-cache-manager v0.4.0-rc2
1414
github.com/onsi/ginkgo/v2 v2.27.2
1515
github.com/onsi/gomega v1.38.2
1616
github.com/openai/openai-go v1.12.0

go.sum

Lines changed: 2 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -181,12 +181,8 @@ github.com/kr/text v0.2.0 h1:5Nx0Ya0ZqY2ygV366QzturHI13Jq95ApcVaJBhpS+AY=
181181
github.com/kr/text v0.2.0/go.mod h1:eLer722TekiGuMkidMxC/pM04lWEeraHUUmBw8l2grE=
182182
github.com/kylelemons/godebug v1.1.0 h1:RPNrshWIDI6G2gRW9EHilWtl7Z6Sb1BR0xunSBf0SNc=
183183
github.com/kylelemons/godebug v1.1.0/go.mod h1:9/0rRGxNHcop5bhtWyNeEfOS8JIWk580+fNqagV/RAw=
184-
github.com/llm-d/llm-d-kv-cache-manager v0.3.2 h1:omSTXtuII3ol37CaoI9h+2VxE0m8EoeVOor+CkQh99I=
185-
github.com/llm-d/llm-d-kv-cache-manager v0.3.2/go.mod h1:q6u7LnzMxNcHHb5/LRdHNNeZzzGMSENFSP1NGfsJEmA=
186-
github.com/llm-d/llm-d-kv-cache-manager v0.3.3-0.20251119172839-f8bb3049d991 h1:zGC/uDL4ytR4idUKd4iP/Doto0HNdxuJtgR6mn9w2Ro=
187-
github.com/llm-d/llm-d-kv-cache-manager v0.3.3-0.20251119172839-f8bb3049d991/go.mod h1:oEmDhEjW1pEoOSlEFy8CKoMc7ixQmSKEbhLt9CoH/a0=
188-
github.com/llm-d/llm-d-kv-cache-manager v0.4.0-rc1 h1:gWkZ9yp7sU5j1vbNB7eO95lxbvgJV+qd/60LnPfNk9w=
189-
github.com/llm-d/llm-d-kv-cache-manager v0.4.0-rc1/go.mod h1:oEmDhEjW1pEoOSlEFy8CKoMc7ixQmSKEbhLt9CoH/a0=
184+
github.com/llm-d/llm-d-kv-cache-manager v0.4.0-rc2 h1:l2Sm8W6SRg4TAme4RsndwZ++5+4aQvDI4vnf8TKrhww=
185+
github.com/llm-d/llm-d-kv-cache-manager v0.4.0-rc2/go.mod h1:ZlK7MCuz5D/weLeHyNKEmVF/eJZDyYn3XyRowTihq9o=
190186
github.com/mailru/easyjson v0.9.0 h1:PrnmzHw7262yW8sTBwxi1PdJA3Iw/EKBa8psRf7d9a4=
191187
github.com/mailru/easyjson v0.9.0/go.mod h1:1+xMtQp2MRNVL/V1bOzuP3aP8VNwRW55fQUto+XFtTU=
192188
github.com/maruel/natural v1.1.1 h1:Hja7XhhmvEFhcByqDoHz9QZbkWey+COd9xWfCfn1ioo=

pkg/plugins/scorer/precise_prefix_cache.go

Lines changed: 34 additions & 56 deletions
Original file line numberDiff line numberDiff line change
@@ -102,9 +102,8 @@ func New(ctx context.Context, config PrecisePrefixCachePluginConfig) (*PrecisePr
102102
}
103103

104104
return &PrecisePrefixCacheScorer{
105-
typedName: plugins.TypedName{Type: PrecisePrefixCachePluginType},
106-
kvCacheIndexer: kvCacheIndexer,
107-
chatTemplateRenderer: chatTemplateRenderer,
105+
typedName: plugins.TypedName{Type: PrecisePrefixCachePluginType},
106+
kvCacheIndexer: kvCacheIndexer,
108107
}, nil
109108
}
110109

@@ -114,9 +113,8 @@ func New(ctx context.Context, config PrecisePrefixCachePluginConfig) (*PrecisePr
114113
// state, and the `kvevents.Pool` to subscribe to KV-cache events
115114
// to keep the internal KV-cache index state up-to-date.
116115
type PrecisePrefixCacheScorer struct {
117-
typedName plugins.TypedName
118-
kvCacheIndexer *kvcache.Indexer
119-
chatTemplateRenderer *preprocessing.ChatTemplatingProcessor
116+
typedName plugins.TypedName
117+
kvCacheIndexer *kvcache.Indexer
120118
}
121119

122120
// TypedName returns the typed name of the plugin.
@@ -134,26 +132,20 @@ func (s *PrecisePrefixCacheScorer) WithName(name string) *PrecisePrefixCacheScor
134132
// The returned scores are normalized to a range of 0-1.
135133
func (s *PrecisePrefixCacheScorer) Score(ctx context.Context, _ *types.CycleState, request *types.LLMRequest, pods []types.Pod) map[types.Pod]float64 {
136134
logger := log.FromContext(ctx).WithName(s.typedName.String())
135+
debugLogger := logger.V(logutil.DEBUG)
137136

138137
if request == nil {
139-
logger.V(logutil.DEBUG).Info("Request is nil, skipping scoring")
138+
debugLogger.Info("Request is nil, skipping scoring")
140139
return nil
141140
}
142141

143-
// Extract the flattened prompt from the request
144-
prompt, err := s.extractPrompt(ctx, request)
142+
// Extract the flattened scores from the request
143+
scores, err := s.getScores(ctx, request)
145144
if err != nil {
146-
logger.Error(err, "Failed to extract prompt from request")
145+
logger.Error(err, "Failed to extract scores from request")
147146
return nil
148147
}
149-
150-
scores, err := s.kvCacheIndexer.GetPodScores(ctx, prompt, request.TargetModel, nil)
151-
if err != nil {
152-
logger.Error(err, "Failed to get pod scores")
153-
return nil
154-
}
155-
156-
logger.V(logutil.DEBUG).Info("Got pod scores", "scores", scores)
148+
debugLogger.Info("Got pod scores", "scores", scores)
157149

158150
podToKey := func(pod types.Pod) (string, bool) {
159151
metricsPod := pod.GetPod()
@@ -170,20 +162,23 @@ func (s *PrecisePrefixCacheScorer) Score(ctx context.Context, _ *types.CycleStat
170162
// extractPrompt extracts the flattened prompt from the request.
171163
// For chat completions, it renders the messages using the model's chat template.
172164
// For regular completions, it uses the prompt directly.
173-
func (s *PrecisePrefixCacheScorer) extractPrompt(ctx context.Context, request *types.LLMRequest) (string, error) {
174-
traceLogger := log.FromContext(ctx).V(logutil.TRACE).WithName(s.typedName.String())
165+
func (s *PrecisePrefixCacheScorer) getScores(ctx context.Context, request *types.LLMRequest) (map[string]float64, error) {
166+
logger := log.FromContext(ctx).WithName(s.typedName.String())
167+
debugLogger := logger.V(logutil.DEBUG)
168+
traceLogger := logger.V(logutil.TRACE)
169+
170+
debugLogger.Info("Getting scores",
171+
"target_model", request.TargetModel,
172+
"has_chat_completions", request.Body != nil && request.Body.ChatCompletions != nil,
173+
"has_completions", request.Body != nil && request.Body.Completions != nil)
175174

176175
// The upstream parser guarantees exactly one body is populated, but we defensively prioritize chat completions.
177176
// If an unexpected dual payload slips through (parser regression/new client), log it and use chat semantics.
178177
if request.Body != nil && request.Body.ChatCompletions != nil {
179178
if request.Body.Completions != nil {
180179
traceLogger.Info("Both chat/completions and completions present; defaulting to chat/completions")
181180
}
182-
traceLogger.Info("Processing chat completion request",
183-
"messages_count", len(request.Body.ChatCompletions.Messages),
184-
"target_model", request.TargetModel)
185181

186-
// Create render request
187182
renderReq := &preprocessing.RenderJinjaTemplateRequest{
188183
Conversations: make([]preprocessing.ChatMessage, 0),
189184
Tools: request.Body.ChatCompletions.Tools,
@@ -203,47 +198,30 @@ func (s *PrecisePrefixCacheScorer) extractPrompt(ctx context.Context, request *t
203198
})
204199
}
205200

206-
// Fetch the chat template from the model
207-
fetchReq := preprocessing.FetchChatTemplateRequest{
208-
Model: request.TargetModel,
209-
}
210-
211-
chatTemplate, chatTemplateKWArgs, err := s.chatTemplateRenderer.FetchChatTemplate(ctx, fetchReq)
212-
if err != nil {
213-
return "", fmt.Errorf("failed to fetch chat template: %w", err)
214-
}
215-
216-
traceLogger.Info("Chat template fetched",
217-
"model", request.TargetModel,
218-
"templateLength", len(chatTemplate),
219-
"hasKwargs", len(chatTemplateKWArgs) > 0)
220-
221-
// Set the fetched template in the render request
222-
renderReq.ChatTemplate = chatTemplate
223-
renderReq.ChatTemplateKWArgs = chatTemplateKWArgs
201+
traceLogger.Info("Processing chat completion request",
202+
"messages_count", len(renderReq.Conversations),
203+
"tools_count", len(renderReq.Tools),
204+
"documents_count", len(renderReq.Documents),
205+
"target_model", request.TargetModel)
224206

225-
// Render the template to get flattened prompt
226-
resp, err := s.chatTemplateRenderer.RenderChatTemplate(ctx, renderReq)
207+
scores, err := s.kvCacheIndexer.GetPodScores(ctx, renderReq, "", request.TargetModel, nil)
227208
if err != nil {
228-
return "", fmt.Errorf("failed to render chat template: %w", err)
209+
return nil, fmt.Errorf("failed to get pod scores for chat/completions: %w", err)
229210
}
230-
231-
if len(resp.RenderedChats) == 0 {
232-
return "", errors.New("no rendered chat returned from template rendering")
233-
}
234-
235-
prompt := resp.RenderedChats[0]
236-
traceLogger.Info("Chat template rendered successfully",
237-
"promptLength", len(prompt))
238-
return prompt, nil
211+
return scores, nil
239212
}
240213

241214
// For regular completions, use the prompt directly
242215
if request.Body != nil && request.Body.Completions != nil {
243216
prompt := request.Body.Completions.Prompt
244217
traceLogger.Info("Using completion prompt directly", "promptLength", len(prompt))
245-
return prompt, nil
218+
219+
scores, err := s.kvCacheIndexer.GetPodScores(ctx, nil, prompt, request.TargetModel, nil)
220+
if err != nil {
221+
return nil, fmt.Errorf("failed to get pod scores for completions: %w", err)
222+
}
223+
return scores, nil
246224
}
247225

248-
return "", errors.New("no valid prompt found in request")
226+
return nil, errors.New("no valid input found in request")
249227
}

0 commit comments

Comments (0)