diff --git a/pyproject.toml b/pyproject.toml
index d645899ad6..80778f3007 100644
--- a/pyproject.toml
+++ b/pyproject.toml
@@ -558,6 +558,7 @@ module = [
     "psycopg2",
     "psycopg2.extras",
     "psycopg2.extensions",
+    "sentence_transformers",
     "torchtune.*",
     "fairscale.*",
     "torchvision.*",
diff --git a/src/llama_stack/providers/inline/eval/builtin/eval.py b/src/llama_stack/providers/inline/eval/builtin/eval.py
index 5d3f6db53e..270c054930 100644
--- a/src/llama_stack/providers/inline/eval/builtin/eval.py
+++ b/src/llama_stack/providers/inline/eval/builtin/eval.py
@@ -4,6 +4,7 @@
 # This source code is licensed under the terms described in the LICENSE file in
 # the root directory of this source tree.
 import json
+from collections.abc import AsyncIterator
 from typing import Any
 
 from tqdm import tqdm
@@ -33,6 +34,7 @@
     RunEvalRequest,
     ScoreRequest,
     Scoring,
+    SystemMessage,
 )
 
 from .config import BuiltinEvalConfig
@@ -63,9 +65,9 @@ def __init__(
         self.responses_api = responses_api
 
         # TODO: assume sync job, will need jobs API for async scheduling
-        self.jobs = {}
+        self.jobs: dict[str, EvaluateResponse] = {}
 
-        self.benchmarks = {}
+        self.benchmarks: dict[str, Benchmark] = {}
 
     async def initialize(self) -> None:
         self.kvstore = await kvstore_impl(self.config.kvstore)
@@ -74,8 +76,8 @@ async def initialize(self) -> None:
         end_key = f"{EVAL_TASKS_PREFIX}\xff"
         stored_benchmarks = await self.kvstore.values_in_range(start_key, end_key)
 
-        for benchmark in stored_benchmarks:
-            benchmark = Benchmark.model_validate_json(benchmark)
+        for benchmark_json in stored_benchmarks:
+            benchmark = Benchmark.model_validate_json(benchmark_json)
             self.benchmarks[benchmark.identifier] = benchmark
 
     async def shutdown(self) -> None: ...
@@ -132,7 +134,7 @@ async def _run_model_generation(
     ) -> list[dict[str, Any]]:
         candidate = request.benchmark_config.eval_candidate
         assert candidate.sampling_params.max_tokens is not None, "SamplingParams.max_tokens must be provided"
-        sampling_params = {"max_tokens": candidate.sampling_params.max_tokens}
+        sampling_params: dict[str, Any] = {"max_tokens": candidate.sampling_params.max_tokens}
 
         generations = []
         for x in tqdm(input_rows):
@@ -147,6 +149,7 @@ async def _run_model_generation(
                     **sampling_params,
                 )
                 response = await self.inference_api.openai_completion(params)
+                assert not isinstance(response, AsyncIterator), "Streaming not supported in eval"
                 generations.append({ColumnName.generated_answer.value: response.choices[0].text})
             elif ColumnName.chat_completion_input.value in x:
                 chat_completion_input_json = json.loads(x[ColumnName.chat_completion_input.value])
@@ -154,7 +157,7 @@
                     OpenAIUserMessageParam(**x) for x in chat_completion_input_json if x["role"] == "user"
                 ]
 
-                messages = []
+                messages: list[SystemMessage | OpenAISystemMessageParam | OpenAIUserMessageParam] = []
                 if candidate.system_message:
                     messages.append(candidate.system_message)
 
@@ -163,11 +166,14 @@
                 messages += input_messages
                 params = OpenAIChatCompletionRequestWithExtraBody(
                     model=candidate.model,
-                    messages=messages,
+                    messages=messages,  # type: ignore[arg-type]
                     **sampling_params,
                 )
                 response = await self.inference_api.openai_chat_completion(params)
-                generations.append({ColumnName.generated_answer.value: response.choices[0].message.content})
+                assert not isinstance(response, AsyncIterator), "Streaming not supported in eval"
+                content = response.choices[0].message.content
+                assert content is not None, "Expected content in chat response"
+                generations.append({ColumnName.generated_answer.value: content})
             else:
                 raise ValueError("Invalid input row")
 
diff --git a/src/llama_stack/providers/inline/inference/sentence_transformers/sentence_transformers.py b/src/llama_stack/providers/inline/inference/sentence_transformers/sentence_transformers.py
index c03bdcfffc..1d3092a2e0 100644
--- a/src/llama_stack/providers/inline/inference/sentence_transformers/sentence_transformers.py
+++ b/src/llama_stack/providers/inline/inference/sentence_transformers/sentence_transformers.py
@@ -17,7 +17,9 @@
     ModelType,
     OpenAIChatCompletion,
     OpenAIChatCompletionChunk,
+    OpenAIChatCompletionChunkWithReasoning,
     OpenAIChatCompletionRequestWithExtraBody,
+    OpenAIChatCompletionWithReasoning,
     OpenAICompletion,
     OpenAICompletionRequestWithExtraBody,
 )
@@ -39,7 +41,9 @@ class SentenceTransformersInferenceImpl(
     def __init__(self, config: SentenceTransformersInferenceConfig) -> None:
         self.config = config
 
-    async def openai_chat_completions_with_reasoning(self, params: OpenAIChatCompletionRequestWithExtraBody) -> None:
+    async def openai_chat_completions_with_reasoning(
+        self, params: OpenAIChatCompletionRequestWithExtraBody
+    ) -> OpenAIChatCompletionWithReasoning | AsyncIterator[OpenAIChatCompletionChunkWithReasoning]:
         raise NotImplementedError("SentenceTransformers provider does not support reasoning in chat completions")
 
     async def initialize(self) -> None:
diff --git a/src/llama_stack/providers/inline/inference/transformers/transformers.py b/src/llama_stack/providers/inline/inference/transformers/transformers.py
index 9d624ce65d..b248e6d7cf 100644
--- a/src/llama_stack/providers/inline/inference/transformers/transformers.py
+++ b/src/llama_stack/providers/inline/inference/transformers/transformers.py
@@ -21,9 +21,11 @@
     ModelType,
     OpenAIChatCompletion,
     OpenAIChatCompletionChunk,
+    OpenAIChatCompletionChunkWithReasoning,
     OpenAIChatCompletionContentPartImageParam,
     OpenAIChatCompletionContentPartTextParam,
     OpenAIChatCompletionRequestWithExtraBody,
+    OpenAIChatCompletionWithReasoning,
     OpenAICompletion,
     OpenAICompletionRequestWithExtraBody,
     OpenAIEmbeddingsRequestWithExtraBody,
@@ -56,7 +58,9 @@ class TransformersInferenceImpl(
     def __init__(self, config: TransformersInferenceConfig) -> None:
         self.config = config
 
-    async def openai_chat_completions_with_reasoning(self, params: OpenAIChatCompletionRequestWithExtraBody) -> None:
+    async def openai_chat_completions_with_reasoning(
+        self, params: OpenAIChatCompletionRequestWithExtraBody
+    ) -> OpenAIChatCompletionWithReasoning | AsyncIterator[OpenAIChatCompletionChunkWithReasoning]:
         raise NotImplementedError("Transformers provider does not support reasoning in chat completions")
 
     async def initialize(self) -> None:
diff --git a/src/llama_stack/providers/utils/inference/embedding_mixin.py b/src/llama_stack/providers/utils/inference/embedding_mixin.py
index ec1a1e04fd..2aa1e8f431 100644
--- a/src/llama_stack/providers/utils/inference/embedding_mixin.py
+++ b/src/llama_stack/providers/utils/inference/embedding_mixin.py
@@ -8,7 +8,7 @@
 import base64
 import platform
 import struct
-from typing import TYPE_CHECKING
+from typing import TYPE_CHECKING, Any
 
 from llama_stack.log import get_logger
 
@@ -36,7 +36,8 @@
 class SentenceTransformerEmbeddingMixin:
     """Mixin providing OpenAI-compatible embeddings via sentence-transformers models."""
 
-    model_store: ModelStore
+    config: Any
+    model_store: ModelStore | None
 
     async def openai_embeddings(
         self,