Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
39 changes: 1 addition & 38 deletions pyproject.toml
Original file line number Diff line number Diff line change
Expand Up @@ -315,7 +315,7 @@ exclude = [
# All files now have type annotations! 🎉
#
# ============================================================================
# Section 2: Files that need strict typing issues fixed (131 files)
# Section 2: Files that need strict typing issues fixed (104 files)
# ============================================================================
# These files have some type hints but fail strict type checking due to
# incomplete annotations, Any usage, or other strict mode violations.
Expand All @@ -335,43 +335,6 @@ exclude = [
"^src/llama_stack/core/testing_context\\.py$",
"^src/llama_stack/core/utils/exec\\.py$",
"^src/llama_stack/core/utils/serialize\\.py$",
# CLI files (8 files)
"^src/llama_stack/cli/stack/_list_deps\\.py$",
"^src/llama_stack/cli/stack/list_apis\\.py$",
"^src/llama_stack/cli/stack/list_deps\\.py$",
"^src/llama_stack/cli/stack/list_providers\\.py$",
"^src/llama_stack/cli/stack/run\\.py$",
"^src/llama_stack/cli/stack/utils\\.py$",
"^src/llama_stack/cli/subcommand\\.py$",
"^src/llama_stack/cli/utils\\.py$",
# Providers - Inline (27 files)
"^src/llama_stack/providers/inline/batches/reference/__init__\\.py$",
"^src/llama_stack/providers/inline/batches/reference/batches\\.py$",
"^src/llama_stack/providers/inline/eval/builtin/__init__\\.py$",
"^src/llama_stack/providers/inline/eval/builtin/eval\\.py$",
"^src/llama_stack/providers/inline/file_processor/pypdf/__init__\\.py$",
"^src/llama_stack/providers/inline/file_processor/pypdf/adapter\\.py$",
"^src/llama_stack/providers/inline/file_processor/pypdf/pypdf\\.py$",
"^src/llama_stack/providers/inline/files/localfs/__init__\\.py$",
"^src/llama_stack/providers/inline/inference/sentence_transformers/__init__\\.py$",
"^src/llama_stack/providers/inline/inference/sentence_transformers/config\\.py$",
"^src/llama_stack/providers/inline/inference/sentence_transformers/sentence_transformers\\.py$",
"^src/llama_stack/providers/inline/inference/transformers/__init__\\.py$",
"^src/llama_stack/providers/inline/inference/transformers/config\\.py$",
"^src/llama_stack/providers/inline/inference/transformers/transformers\\.py$",
"^src/llama_stack/providers/inline/safety/prompt_guard/__init__\\.py$",
"^src/llama_stack/providers/inline/safety/prompt_guard/config\\.py$",
"^src/llama_stack/providers/inline/safety/prompt_guard/prompt_guard\\.py$",
"^src/llama_stack/providers/inline/tool_runtime/file_search/__init__\\.py$",
"^src/llama_stack/providers/inline/tool_runtime/file_search/context_retriever\\.py$",
"^src/llama_stack/providers/inline/tool_runtime/file_search/file_search\\.py$",
"^src/llama_stack/providers/inline/vector_io/chroma/__init__\\.py$",
"^src/llama_stack/providers/inline/vector_io/faiss/__init__\\.py$",
"^src/llama_stack/providers/inline/vector_io/faiss/faiss\\.py$",
"^src/llama_stack/providers/inline/vector_io/milvus/__init__\\.py$",
"^src/llama_stack/providers/inline/vector_io/qdrant/__init__\\.py$",
"^src/llama_stack/providers/inline/vector_io/sqlite_vec/__init__\\.py$",
"^src/llama_stack/providers/inline/vector_io/sqlite_vec/sqlite_vec\\.py$",
# Providers - Remote (43 files)
"^src/llama_stack/providers/remote/eval/nvidia/__init__\\.py$",
"^src/llama_stack/providers/remote/eval/nvidia/config\\.py$",
Expand Down
32 changes: 19 additions & 13 deletions src/llama_stack/providers/inline/eval/builtin/eval.py
Original file line number Diff line number Diff line change
Expand Up @@ -4,6 +4,7 @@
# This source code is licensed under the terms described in the LICENSE file in
# the root directory of this source tree.
import json
from collections.abc import AsyncIterator
from typing import Any

from tqdm import tqdm
Expand Down Expand Up @@ -33,6 +34,7 @@
RunEvalRequest,
ScoreRequest,
Scoring,
SystemMessage,
)

from .config import BuiltinEvalConfig
Expand Down Expand Up @@ -63,9 +65,9 @@ def __init__(
self.responses_api = responses_api

# TODO: assume sync job, will need jobs API for async scheduling
self.jobs = {}
self.jobs: dict[str, EvaluateResponse] = {}

self.benchmarks = {}
self.benchmarks: dict[str, Benchmark] = {}

async def initialize(self) -> None:
self.kvstore = await kvstore_impl(self.config.kvstore)
Expand All @@ -74,8 +76,8 @@ async def initialize(self) -> None:
end_key = f"{EVAL_TASKS_PREFIX}\xff"
stored_benchmarks = await self.kvstore.values_in_range(start_key, end_key)

for benchmark in stored_benchmarks:
benchmark = Benchmark.model_validate_json(benchmark)
for benchmark_json in stored_benchmarks:
benchmark = Benchmark.model_validate_json(benchmark_json)
self.benchmarks[benchmark.identifier] = benchmark

# No-op shutdown: the visible body is just `...`, so no resources are released here.
async def shutdown(self) -> None: ...
Expand Down Expand Up @@ -132,7 +134,7 @@ async def _run_model_generation(
) -> list[dict[str, Any]]:
candidate = request.benchmark_config.eval_candidate
assert candidate.sampling_params.max_tokens is not None, "SamplingParams.max_tokens must be provided"
sampling_params = {"max_tokens": candidate.sampling_params.max_tokens}
sampling_params: dict[str, Any] = {"max_tokens": candidate.sampling_params.max_tokens}

generations = []
for x in tqdm(input_rows):
Expand All @@ -141,33 +143,37 @@ async def _run_model_generation(
sampling_params["stop"] = candidate.sampling_params.stop

input_content = json.loads(x[ColumnName.completion_input.value])
params = OpenAICompletionRequestWithExtraBody(
completion_params = OpenAICompletionRequestWithExtraBody(
model=candidate.model,
prompt=input_content,
**sampling_params,
)
response = await self.inference_api.openai_completion(params)
generations.append({ColumnName.generated_answer.value: response.choices[0].text})
completion_response = await self.inference_api.openai_completion(completion_params)
assert not isinstance(completion_response, AsyncIterator), "Streaming not supported in eval"
generations.append({ColumnName.generated_answer.value: completion_response.choices[0].text})
elif ColumnName.chat_completion_input.value in x:
chat_completion_input_json = json.loads(x[ColumnName.chat_completion_input.value])
input_messages = [
OpenAIUserMessageParam(**x) for x in chat_completion_input_json if x["role"] == "user"
]

messages = []
messages: list[SystemMessage | OpenAISystemMessageParam | OpenAIUserMessageParam] = []
if candidate.system_message:
messages.append(candidate.system_message)

messages += [OpenAISystemMessageParam(**x) for x in chat_completion_input_json if x["role"] == "system"]

messages += input_messages
params = OpenAIChatCompletionRequestWithExtraBody(
chat_params = OpenAIChatCompletionRequestWithExtraBody(
model=candidate.model,
messages=messages,
messages=messages, # type: ignore[arg-type]
**sampling_params,
)
response = await self.inference_api.openai_chat_completion(params)
generations.append({ColumnName.generated_answer.value: response.choices[0].message.content})
chat_response = await self.inference_api.openai_chat_completion(chat_params)
assert not isinstance(chat_response, AsyncIterator), "Streaming not supported in eval"
content = chat_response.choices[0].message.content
assert content is not None, "Expected content in chat response"
generations.append({ColumnName.generated_answer.value: content})
else:
raise ValueError("Invalid input row")

Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -17,7 +17,9 @@
ModelType,
OpenAIChatCompletion,
OpenAIChatCompletionChunk,
OpenAIChatCompletionChunkWithReasoning,
OpenAIChatCompletionRequestWithExtraBody,
OpenAIChatCompletionWithReasoning,
OpenAICompletion,
OpenAICompletionRequestWithExtraBody,
)
Expand All @@ -39,7 +41,9 @@ class SentenceTransformersInferenceImpl(
def __init__(self, config: SentenceTransformersInferenceConfig) -> None:
    # Store the provider configuration; NOTE(review): async setup appears to
    # happen later in initialize() — confirm against the full class.
    self.config = config

async def openai_chat_completions_with_reasoning(self, params: OpenAIChatCompletionRequestWithExtraBody) -> None:
async def openai_chat_completions_with_reasoning(
    self, params: OpenAIChatCompletionRequestWithExtraBody
) -> OpenAIChatCompletionWithReasoning | AsyncIterator[OpenAIChatCompletionChunkWithReasoning]:
    """Reasoning-enabled chat completions are not supported by this provider.

    Always raises NotImplementedError. The return annotation presumably mirrors
    the inference interface contract declared elsewhere — not visible here.

    :raises NotImplementedError: unconditionally.
    """
    raise NotImplementedError("SentenceTransformers provider does not support reasoning in chat completions")

async def initialize(self) -> None:
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -21,9 +21,11 @@
ModelType,
OpenAIChatCompletion,
OpenAIChatCompletionChunk,
OpenAIChatCompletionChunkWithReasoning,
OpenAIChatCompletionContentPartImageParam,
OpenAIChatCompletionContentPartTextParam,
OpenAIChatCompletionRequestWithExtraBody,
OpenAIChatCompletionWithReasoning,
OpenAICompletion,
OpenAICompletionRequestWithExtraBody,
OpenAIEmbeddingsRequestWithExtraBody,
Expand Down Expand Up @@ -56,7 +58,9 @@ class TransformersInferenceImpl(
def __init__(self, config: TransformersInferenceConfig) -> None:
    # Store the provider configuration; NOTE(review): async setup appears to
    # happen later in initialize() — confirm against the full class.
    self.config = config

async def openai_chat_completions_with_reasoning(self, params: OpenAIChatCompletionRequestWithExtraBody) -> None:
async def openai_chat_completions_with_reasoning(
    self, params: OpenAIChatCompletionRequestWithExtraBody
) -> OpenAIChatCompletionWithReasoning | AsyncIterator[OpenAIChatCompletionChunkWithReasoning]:
    """Reasoning-enabled chat completions are not supported by this provider.

    Always raises NotImplementedError. The return annotation presumably mirrors
    the inference interface contract declared elsewhere — not visible here.

    :raises NotImplementedError: unconditionally.
    """
    raise NotImplementedError("Transformers provider does not support reasoning in chat completions")

async def initialize(self) -> None:
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -36,7 +36,7 @@
class SentenceTransformerEmbeddingMixin:
"""Mixin providing OpenAI-compatible embeddings via sentence-transformers models."""

model_store: ModelStore
model_store: ModelStore | None

async def openai_embeddings(
self,
Expand Down
Loading