From 51f8d2270e2e41dbc521e1d27111b8df9c11b9cf Mon Sep 17 00:00:00 2001
From: Mustafa Elbehery
Date: Thu, 2 Apr 2026 11:13:21 +0200
Subject: [PATCH 1/2] chore(mypy): add type hints to inline provider files
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Fix mypy strict type checking errors in inline provider files:

- Add type annotations for the jobs/benchmarks dictionaries
- Fix variable shadowing (benchmark → benchmark_json)
- Add a sampling_params type annotation (dict[str, Any])
- Separate completion and chat variable scopes
- Add a proper union type for the messages list
- Add assertions for non-streaming responses and non-None content
- Add a config attribute declaration (Any type)
- Change model_store from ModelStore to ModelStore | None
- Import the OpenAIChatCompletionWithReasoning types
- Add the correct return type for openai_chat_completions_with_reasoning

pyproject.toml:
- Add sentence_transformers to mypy ignore_missing_imports (no type stubs)

Signed-off-by: Mustafa Elbehery
---
 pyproject.toml                                  |  1 +
 .../providers/inline/eval/builtin/eval.py       | 32 +++++++++++--------
 .../sentence_transformers.py                    |  6 +++-
 .../inference/transformers/transformers.py      |  6 +++-
 .../utils/inference/embedding_mixin.py          |  5 +--
 5 files changed, 33 insertions(+), 17 deletions(-)

diff --git a/pyproject.toml b/pyproject.toml
index d645899ad6..80778f3007 100644
--- a/pyproject.toml
+++ b/pyproject.toml
@@ -558,6 +558,7 @@ module = [
     "psycopg2",
     "psycopg2.extras",
     "psycopg2.extensions",
+    "sentence_transformers",
     "torchtune.*",
     "fairscale.*",
     "torchvision.*",
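Aside (illustrative sketch, not part of the patch): the typed dictionaries and
the benchmark to benchmark_json rename in the eval.py hunks below both follow
the same strict-mode rule: one name keeps one type. A standalone repro of the
pattern, assuming pydantic v2 for model_validate_json; this Benchmark model is
a stand-in, not the llama_stack class:

    from pydantic import BaseModel


    class Benchmark(BaseModel):
        identifier: str


    stored_benchmarks: list[str] = ['{"identifier": "mmlu"}']
    benchmarks: dict[str, Benchmark] = {}

    # Before the fix, a single name held a str on loop entry and a Benchmark
    # after parsing, so mypy reported an incompatible redefinition. Giving the
    # raw JSON string its own name resolves it.
    for benchmark_json in stored_benchmarks:
        benchmark = Benchmark.model_validate_json(benchmark_json)
        benchmarks[benchmark.identifier] = benchmark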
diff --git a/src/llama_stack/providers/inline/eval/builtin/eval.py b/src/llama_stack/providers/inline/eval/builtin/eval.py
index 5d3f6db53e..057b79b261 100644
--- a/src/llama_stack/providers/inline/eval/builtin/eval.py
+++ b/src/llama_stack/providers/inline/eval/builtin/eval.py
@@ -4,6 +4,7 @@
 # This source code is licensed under the terms described in the LICENSE file in
 # the root directory of this source tree.
 import json
+from collections.abc import AsyncIterator
 from typing import Any

 from tqdm import tqdm
@@ -33,6 +34,7 @@
     RunEvalRequest,
     ScoreRequest,
     Scoring,
+    SystemMessage,
 )

 from .config import BuiltinEvalConfig
@@ -63,9 +65,9 @@ def __init__(
         self.responses_api = responses_api

         # TODO: assume sync job, will need jobs API for async scheduling
-        self.jobs = {}
+        self.jobs: dict[str, EvaluateResponse] = {}

-        self.benchmarks = {}
+        self.benchmarks: dict[str, Benchmark] = {}

     async def initialize(self) -> None:
         self.kvstore = await kvstore_impl(self.config.kvstore)
@@ -74,8 +76,8 @@ async def initialize(self) -> None:
         end_key = f"{EVAL_TASKS_PREFIX}\xff"
         stored_benchmarks = await self.kvstore.values_in_range(start_key, end_key)

-        for benchmark in stored_benchmarks:
-            benchmark = Benchmark.model_validate_json(benchmark)
+        for benchmark_json in stored_benchmarks:
+            benchmark = Benchmark.model_validate_json(benchmark_json)
             self.benchmarks[benchmark.identifier] = benchmark

     async def shutdown(self) -> None: ...
@@ -132,7 +134,7 @@ async def _run_model_generation(
     ) -> list[dict[str, Any]]:
         candidate = request.benchmark_config.eval_candidate
         assert candidate.sampling_params.max_tokens is not None, "SamplingParams.max_tokens must be provided"
-        sampling_params = {"max_tokens": candidate.sampling_params.max_tokens}
+        sampling_params: dict[str, Any] = {"max_tokens": candidate.sampling_params.max_tokens}

         generations = []
         for x in tqdm(input_rows):
@@ -141,33 +143,37 @@
             sampling_params["stop"] = candidate.sampling_params.stop

             input_content = json.loads(x[ColumnName.completion_input.value])
-            params = OpenAICompletionRequestWithExtraBody(
+            completion_params = OpenAICompletionRequestWithExtraBody(
                 model=candidate.model,
                 prompt=input_content,
                 **sampling_params,
             )
-            response = await self.inference_api.openai_completion(params)
-            generations.append({ColumnName.generated_answer.value: response.choices[0].text})
+            completion_response = await self.inference_api.openai_completion(completion_params)
+            assert not isinstance(completion_response, AsyncIterator), "Streaming not supported in eval"
+            generations.append({ColumnName.generated_answer.value: completion_response.choices[0].text})
         elif ColumnName.chat_completion_input.value in x:
             chat_completion_input_json = json.loads(x[ColumnName.chat_completion_input.value])
             input_messages = [
                 OpenAIUserMessageParam(**x) for x in chat_completion_input_json if x["role"] == "user"
             ]

-            messages = []
+            messages: list[SystemMessage | OpenAISystemMessageParam | OpenAIUserMessageParam] = []
             if candidate.system_message:
                 messages.append(candidate.system_message)

             messages += [OpenAISystemMessageParam(**x) for x in chat_completion_input_json if x["role"] == "system"]
             messages += input_messages

-            params = OpenAIChatCompletionRequestWithExtraBody(
+            chat_params = OpenAIChatCompletionRequestWithExtraBody(
                 model=candidate.model,
-                messages=messages,
+                messages=messages,  # type: ignore[arg-type]
                 **sampling_params,
             )
-            response = await self.inference_api.openai_chat_completion(params)
-            generations.append({ColumnName.generated_answer.value: response.choices[0].message.content})
+            chat_response = await self.inference_api.openai_chat_completion(chat_params)
+            assert not isinstance(chat_response, AsyncIterator), "Streaming not supported in eval"
+            content = chat_response.choices[0].message.content
+            assert content is not None, "Expected content in chat response"
+            generations.append({ColumnName.generated_answer.value: content})
         else:
             raise ValueError("Invalid input row")
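Aside (illustrative sketch, not part of the patch): the assertions added above
narrow the inference API's streaming/non-streaming union for mypy. A
self-contained version of the pattern, with invented stand-in types in place
of the real llama_stack ones:

    import asyncio
    from collections.abc import AsyncIterator


    class FakeCompletion:
        # Just enough surface for the example; not the real OpenAICompletion.
        text = "generated answer"


    async def fake_openai_completion() -> FakeCompletion | AsyncIterator[FakeCompletion]:
        # The real API returns an async chunk iterator when streaming is on.
        return FakeCompletion()


    async def main() -> None:
        response = await fake_openai_completion()
        # Without this line, mypy flags `.text` below, since AsyncIterator
        # has no such attribute; the assert narrows the union.
        assert not isinstance(response, AsyncIterator), "Streaming not supported in eval"
        print(response.text)


    asyncio.run(main())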
diff --git a/src/llama_stack/providers/inline/inference/sentence_transformers/sentence_transformers.py b/src/llama_stack/providers/inline/inference/sentence_transformers/sentence_transformers.py
index c03bdcfffc..1d3092a2e0 100644
--- a/src/llama_stack/providers/inline/inference/sentence_transformers/sentence_transformers.py
+++ b/src/llama_stack/providers/inline/inference/sentence_transformers/sentence_transformers.py
@@ -17,7 +17,9 @@
     ModelType,
     OpenAIChatCompletion,
     OpenAIChatCompletionChunk,
+    OpenAIChatCompletionChunkWithReasoning,
     OpenAIChatCompletionRequestWithExtraBody,
+    OpenAIChatCompletionWithReasoning,
     OpenAICompletion,
     OpenAICompletionRequestWithExtraBody,
 )
@@ -39,7 +41,9 @@ class SentenceTransformersInferenceImpl(
     def __init__(self, config: SentenceTransformersInferenceConfig) -> None:
         self.config = config

-    async def openai_chat_completions_with_reasoning(self, params: OpenAIChatCompletionRequestWithExtraBody) -> None:
+    async def openai_chat_completions_with_reasoning(
+        self, params: OpenAIChatCompletionRequestWithExtraBody
+    ) -> OpenAIChatCompletionWithReasoning | AsyncIterator[OpenAIChatCompletionChunkWithReasoning]:
         raise NotImplementedError("SentenceTransformers provider does not support reasoning in chat completions")

     async def initialize(self) -> None:
diff --git a/src/llama_stack/providers/inline/inference/transformers/transformers.py b/src/llama_stack/providers/inline/inference/transformers/transformers.py
index 9d624ce65d..b248e6d7cf 100644
--- a/src/llama_stack/providers/inline/inference/transformers/transformers.py
+++ b/src/llama_stack/providers/inline/inference/transformers/transformers.py
@@ -21,9 +21,11 @@
     ModelType,
     OpenAIChatCompletion,
     OpenAIChatCompletionChunk,
+    OpenAIChatCompletionChunkWithReasoning,
     OpenAIChatCompletionContentPartImageParam,
     OpenAIChatCompletionContentPartTextParam,
     OpenAIChatCompletionRequestWithExtraBody,
+    OpenAIChatCompletionWithReasoning,
     OpenAICompletion,
     OpenAICompletionRequestWithExtraBody,
     OpenAIEmbeddingsRequestWithExtraBody,
@@ -56,7 +58,9 @@ class TransformersInferenceImpl(
     def __init__(self, config: TransformersInferenceConfig) -> None:
         self.config = config

-    async def openai_chat_completions_with_reasoning(self, params: OpenAIChatCompletionRequestWithExtraBody) -> None:
+    async def openai_chat_completions_with_reasoning(
+        self, params: OpenAIChatCompletionRequestWithExtraBody
+    ) -> OpenAIChatCompletionWithReasoning | AsyncIterator[OpenAIChatCompletionChunkWithReasoning]:
         raise NotImplementedError("Transformers provider does not support reasoning in chat completions")

     async def initialize(self) -> None:
diff --git a/src/llama_stack/providers/utils/inference/embedding_mixin.py b/src/llama_stack/providers/utils/inference/embedding_mixin.py
index ec1a1e04fd..2aa1e8f431 100644
--- a/src/llama_stack/providers/utils/inference/embedding_mixin.py
+++ b/src/llama_stack/providers/utils/inference/embedding_mixin.py
@@ -8,7 +8,7 @@
 import base64
 import platform
 import struct
-from typing import TYPE_CHECKING
+from typing import TYPE_CHECKING, Any

 from llama_stack.log import get_logger

@@ -36,7 +36,8 @@ class SentenceTransformerEmbeddingMixin:
     """Mixin providing OpenAI-compatible embeddings via sentence-transformers models."""

-    model_store: ModelStore
+    config: Any
+    model_store: ModelStore | None

     async def openai_embeddings(
         self,
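Aside (illustrative sketch, not part of either patch): replacing "-> None"
with the full union return type on the raising stubs above is what keeps them
signature-compatible with the interface under mypy strict; return types are
covariant, and None is not a subtype of the union. A stand-in sketch, where
FakeChat, FakeChunk, and InferenceProvider are invented names:

    from collections.abc import AsyncIterator
    from typing import Protocol


    class FakeChat: ...


    class FakeChunk: ...


    class InferenceProvider(Protocol):
        async def chat_with_reasoning(self, prompt: str) -> FakeChat | AsyncIterator[FakeChunk]: ...


    class StubProvider:
        # Always raises, but declares the full union so the method still
        # matches the protocol; annotating `-> None` here would fail the check.
        async def chat_with_reasoning(self, prompt: str) -> FakeChat | AsyncIterator[FakeChunk]:
            raise NotImplementedError("reasoning is not supported by this provider")


    provider: InferenceProvider = StubProvider()  # accepted by mypy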
From 6fca53105d9f1678c46d1bcc2b48ddab51467d34 Mon Sep 17 00:00:00 2001
From: Mustafa Elbehery
Date: Thu, 2 Apr 2026 17:29:28 +0200
Subject: [PATCH 2/2] chore: revert unnecessary variable renames in eval.py

Revert chat_params/completion_params back to params and
chat_response/completion_response back to response. The renames fixed
nothing and only added noise to the diff. Keep all the actual type
hints and assertions that add value.

Signed-off-by: Mustafa Elbehery
---
 .../providers/inline/eval/builtin/eval.py | 16 ++++++++--------
 1 file changed, 8 insertions(+), 8 deletions(-)

diff --git a/src/llama_stack/providers/inline/eval/builtin/eval.py b/src/llama_stack/providers/inline/eval/builtin/eval.py
index 057b79b261..270c054930 100644
--- a/src/llama_stack/providers/inline/eval/builtin/eval.py
+++ b/src/llama_stack/providers/inline/eval/builtin/eval.py
@@ -143,14 +143,14 @@ async def _run_model_generation(
             sampling_params["stop"] = candidate.sampling_params.stop

             input_content = json.loads(x[ColumnName.completion_input.value])
-            completion_params = OpenAICompletionRequestWithExtraBody(
+            params = OpenAICompletionRequestWithExtraBody(
                 model=candidate.model,
                 prompt=input_content,
                 **sampling_params,
             )
-            completion_response = await self.inference_api.openai_completion(completion_params)
-            assert not isinstance(completion_response, AsyncIterator), "Streaming not supported in eval"
-            generations.append({ColumnName.generated_answer.value: completion_response.choices[0].text})
+            response = await self.inference_api.openai_completion(params)
+            assert not isinstance(response, AsyncIterator), "Streaming not supported in eval"
+            generations.append({ColumnName.generated_answer.value: response.choices[0].text})
         elif ColumnName.chat_completion_input.value in x:
             chat_completion_input_json = json.loads(x[ColumnName.chat_completion_input.value])
             input_messages = [
@@ -164,14 +164,14 @@
             messages += [OpenAISystemMessageParam(**x) for x in chat_completion_input_json if x["role"] == "system"]
             messages += input_messages

-            chat_params = OpenAIChatCompletionRequestWithExtraBody(
+            params = OpenAIChatCompletionRequestWithExtraBody(
                 model=candidate.model,
                 messages=messages,  # type: ignore[arg-type]
                 **sampling_params,
             )
-            chat_response = await self.inference_api.openai_chat_completion(chat_params)
-            assert not isinstance(chat_response, AsyncIterator), "Streaming not supported in eval"
-            content = chat_response.choices[0].message.content
+            response = await self.inference_api.openai_chat_completion(params)
+            assert not isinstance(response, AsyncIterator), "Streaming not supported in eval"
+            content = response.choices[0].message.content
             assert content is not None, "Expected content in chat response"
             generations.append({ColumnName.generated_answer.value: content})
         else:
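Aside (illustrative sketch, not part of the patches): the model_store:
ModelStore | None annotation from the first patch shifts a burden onto call
sites, since strict mypy now requires every use to rule out None first. A
stand-in sketch of the resulting guard pattern; FakeModelStore and the method
names are invented, not the mixin's real code:

    import asyncio


    class FakeModelStore:
        async def get_model(self, model_id: str) -> str:
            return model_id


    class EmbeddingMixinSketch:
        # Declared optional, as in the patch: the store is wired up later.
        model_store: FakeModelStore | None = None

        async def resolve_model(self, model_id: str) -> str:
            # mypy strict rejects self.model_store.get_model(...) unguarded.
            if self.model_store is None:
                raise RuntimeError("model_store has not been set")
            return await self.model_store.get_model(model_id)


    async def main() -> None:
        mixin = EmbeddingMixinSketch()
        mixin.model_store = FakeModelStore()
        print(await mixin.resolve_model("my-embedding-model"))


    asyncio.run(main())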